
    cCi%                        S SK JrJr  S SKrS SKJr  S SKJr  SSKJrJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/  \R`                  " \15      r2 " S S\(5      r3 " S S\"5      r4\Rj                  " \ " 5       5      \Rj                  " S5      :  a   " S S\Rl                  5      r7O \" S5       " S S\Rp                  5      5       r7 " S S \#5      r9 " S! S"\)5      r: " S# S$\-5      r; " S% S&\$5      r< " S' S(\&5      r= " S) S*\'5      r> " S+ S,\%5      r?/ S-Qr@g).    )CallableOptionalN)version)nn   )CacheDynamicCache)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)deprecate_kwarg)check_model_inputs)get_torch_version   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLPLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward)MistralModel   )Qwen2Configc                   (   ^  \ rS rSrU 4S jrSrU =r$ )Qwen2MLP'   c                 >  > [         TU ]  U5        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l        [        R                  " U R
                  U R                  SS9U l        g )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_projselfconfig	__class__s     a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/qwen2/modular_qwen2.pyr,   Qwen2MLP.__init__(   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWX    )r2   r0   r1   )__name__
__module____qualname____firstlineno__r,   __static_attributes____classcell__r6   s   @r7   r&   r&   '   s    Y Yr9   r&   c                   6  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9  SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\   S\\R                     S\\   S\
\R                  \\R                     4   4S jj5       rSrU =r$ )Qwen2Attention/   r5   	layer_idxc                 \  > [         TU ]  X5        [        R                  " UR                  UR
                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l	        [        R                  " UR                  UR                  U R                  -  SS9U l
        [        R                  " UR
                  U R                  -  UR                  SS9U l        UR                  U   S:X  a  UR                  U l        g S U l        g )NTr)   Fsliding_attention)r+   r,   r   r-   r.   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projlayer_typessliding_windowr4   r5   rD   r6   s      r7   r,   Qwen2Attention.__init__0   s    +ii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijii 2 2F4N4NQUQ^Q^4^eijii : :T]] JFL^L^ejk7=7I7I)7TXk7kf33qur9   past_key_valuepast_key_valuesz4.58)new_namer   hidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                 J   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                  U R                   U R"                  S.UD6u  nnUR$                  " / UQSP76 R'                  5       nU R)                  U5      nUU4$ )Nr#   r   )sincosrX   eagerg        )dropoutscalingrO   )shaperH   rI   view	transposerK   rL   r    updaterD   r!   r5   _attn_implementationr   trainingattention_dropoutra   rO   reshape
contiguousrM   )r4   rU   rV   rW   rS   rX   rY   input_shapehidden_shapequery_states
key_statesvalue_statesr^   r]   cache_kwargsattention_interfaceattn_outputattn_weightss                     r7   forwardQwen2Attention.forward8   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((r9   )rK   rM   rI   rO   rL   )NN)r:   r;   r<   r=   r$   intr,   r   torchTensortupler   r   
LongTensorr   r   rt   r>   r?   r@   s   @r7   rB   rB   /   s    v{ vs v %0A6R ,059*)||*) #5<<#=>*) !.	*)
 "%*) !!1!12*) -.*) 
u||Xell33	4*) S*)r9   rB   z2.3.0c                   8   ^  \ rS rSrSS\SS4U 4S jjjrSrU =r$ )Qwen2RMSNormh   epsrZ   Nc                 "   > [         TU ]  XSS9  g )NT)normalized_shaper~   elementwise_affine)r+   r,   r4   r.   r~   r6   s      r7   r,   Qwen2RMSNorm.__init__i   s    GkW[\r9    gư>)r:   r;   r<   r=   floatr,   r>   r?   r@   s   @r7   r|   r|   h   s    	]U 	]d 	] 	]r9   r|   RMSNormc                   x   ^  \ rS rSrS
S\SS4U 4S jjjrS\R                  S\R                  4S jrS r	S	r
U =r$ )r|   n   r~   rZ   Nc                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Qwen2RMSNorm is equivalent to T5LayerNorm
N)r+   r,   r   	Parameterrw   onesweightvariance_epsilonr   s      r7   r,   r   p   s/     G,,uzz+'>?DK$'!r9   rU   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   r\   T)keepdim)	dtypetorw   float32powmeanrsqrtr   r   )r4   rU   input_dtypevariances       r7   rt   Qwen2RMSNorm.forwardx   sw    '--K),,U]];M$((+00T0BH)EKKCXCX8X,YYM;;!1!1+!>>>r9   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)ry   r   rb   r   )r4   s    r7   
extra_reprQwen2RMSNorm.extra_repr   s*    DKK--./vd6K6K5LMMr9   )r   r   r   )r:   r;   r<   r=   r   r,   rw   rx   rt   r   r>   r?   r@   s   @r7   r|   r|   n   sB    	(U 	(d 	( 	(	? 	?%,, 	?	N 	Nr9   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Qwen2DecoderLayer   r5   rD   c                 H   > [         TU ]  XS9  UR                  U   U l        g )N)r5   rD   )r+   r,   rN   attention_typerP   s      r7   r,   Qwen2DecoderLayer.__init__   s&    <$00;r9   )r   )	r:   r;   r<   r=   r$   rv   r,   r>   r?   r@   s   @r7   r   r      s    <{ <s < <r9   r   c                       \ rS rSrSrg)Qwen2PreTrainedModel   r   Nr:   r;   r<   r=   r>   r   r9   r7   r   r          r9   r   c                   "  ^  \ rS rSrS\4U 4S jjr\" 5       \       SS\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\	R                     S\\   S\4S jj5       5       rSrU =r$ )
Qwen2Model   r5   c                 `   > [         TU ]  U5        SU R                  R                  ;   U l        g )NrF   )r+   r,   r5   rN   has_sliding_layersr3   s     r7   r,   Qwen2Model.__init__   s'     "59P9P"Pr9   	input_idsrW   position_idsrS   inputs_embeds	use_cacherX   rY   rZ   c                    US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc  [        U R                  S9nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d?  U R                  UUUUUS.nS[        S0 UD60n
U R                  (       a  [        S0 UD6U
S'   UnU R                  X5      nU R                   S U R                  R"                    H  nU" U4XR$                     UUUUUS	.UD6nM!     U R'                  U5      n[)        UU(       a  US
9$ S S
9$ )Nz:You must specify exactly one of input_ids or inputs_embeds)r5   r   r#   )device)r5   input_embedsrW   rX   rS   r   full_attentionrF   )rW   r   rS   r   rX   rV   )last_hidden_staterS   r   )
ValueErrorembed_tokensr	   r5   get_seq_lengthrw   arangerb   r   	unsqueeze
isinstancedictr   r   r   
rotary_emblayersnum_hidden_layersr   normr   )r4   r   rW   r   rS   r   r   rX   rY   past_seen_tokenscausal_mask_mappingmask_kwargsrU   rV   decoder_layers                  r7   rt   Qwen2Model.forward   s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K !"4"C{"C# &&;\;k_j;k#$78% #oomJ![[)H4;;+H+HIM)	23O3OP) /#-$7	 	M J 		-0&+/8O
 	
>B
 	
r9   )r   )NNNNNNN)r:   r;   r<   r=   r$   r,   r   r   r   rw   rz   rx   r   FloatTensorboolr   r   r   rt   r>   r?   r@   s   @r7   r   r      s    Q{ Q  151537+/59$(59E
E,,-E
 !.E
 u//0	E

 "%E
   1 12E
 D>E
 !!1!12E
 +,E
 
!E
  E
r9   r   c                       \ rS rSrSrg)Qwen2ForCausalLM   r   Nr   r   r9   r7   r   r      r   r9   r   c                       \ rS rSrSrg)Qwen2ForSequenceClassification   r   Nr   r   r9   r7   r   r      r   r9   r   c                       \ rS rSrSrg)Qwen2ForTokenClassification   r   Nr   r   r9   r7   r   r      r   r9   r   c                       \ rS rSrSrg)Qwen2ForQuestionAnswering   r   Nr   r   r9   r7   r   r      r   r9   r   )r   r   r   r|   r   r   r   )Atypingr   r   rw   	packagingr   r   cache_utilsr   r	   integrationsr
   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   utils.import_utilsr   llama.modeling_llamar   r   r   r   r   r   r   r   r    r!   mistral.modeling_mistralr"   configuration_qwen2r$   
get_loggerr:   loggerr&   rB   parser   r|   Moduler   r   r   r   r   r   r   __all__r   r9   r7   <module>r      s:   %    . 7 R B 6 & @ @ 0 / 3   4 , 
		H	%Yx Y4)^ 4)n =="$%w)??]rzz ] !+Nryy N ,N(<) <	/ 	L
 L
^	' 		%C 		"= 		 9 	r9   