
    cCi4                     j   S SK JrJrJr  S SKrS SKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.  SSK/J0r0J1r1  \)Rd                  " \35      r4 " S S\Rj                  5      r6 " S S\Rj                  5      r7 " S S\Rj                  5      r8S r9SMS jr:S\Rv                  S\<S\Rv                  4S jr=   SNS\Rj                  S \Rv                  S!\Rv                  S"\Rv                  S#\\Rv                     S$\>S%\\>   S&\\>   S\?\Rv                  \Rv                  4   4S' jjr@ " S( S)\Rj                  5      rA " S* S+\Rj                  5      rB " S, S-\5      rC " S. S/\C5      rD " S0 S1\Rj                  5      rE " S2 S3\Rj                  5      rF " S4 S5\Rj                  5      rG\' " S6 S7\"5      5       rHS#\\Rv                     S\4S8 jrIS9\<S\4S: jrJS;\\R                     S\Rv                  S<\\<   S\Rv                  4S= jrL " S> S?\H5      rM " S@ SA\M5      rN\' " SB SC\H5      5       rO\' " SD SE\H5      5       rP " SF SG\H\5      rQ\' " SH SI\H5      5       rR\' " SJ SK\H5      5       rS/ SLQrTg)O    )CallableOptionalUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )T5GemmaConfigT5GemmaModuleConfigc                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
T5GemmaRMSNorm5   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r)   nn	Parametertorchzerosweight)selfr(   r)   	__class__s      f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr-   T5GemmaRMSNorm.__init__6   s,    ll5;;s#34    c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )N   T)keepdim)r0   rsqrtpowmeanr)   )r3   xs     r5   _normT5GemmaRMSNorm._norm;   s4    5;;quuQx}}R}>IJJJr7   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )Ng      ?)r@   floatr2   type_as)r3   r?   outputs      r5   forwardT5GemmaRMSNorm.forward>   sC    AGGI& 3!2!2!445~~a  r7   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler2   shaper)   r3   s    r5   
extra_reprT5GemmaRMSNorm.extra_reprE   s'    ))*+6$((<<r7   )r)   r2   )gư>)__name__
__module____qualname____firstlineno__intrC   r-   r@   rF   rL   __static_attributes____classcell__r4   s   @r5   r&   r&   5   s0    5C 5e 5 5
K!= =r7   r&   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLPI   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        [
        R                  " UR                  5      U l        g )NFbias)r,   r-   confighidden_sizeintermediate_sizer.   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr3   r\   r4   s     r5   r-   T5GemmaMLP.__init__J   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56r7   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ r+   )rd   r`   ra   rg   rb   )r3   r?   hidden_statesrb   s       r5   rF   T5GemmaMLP.forwardU   sH    DNN1$56aH]3NN=1	r7   )rd   r\   rb   rg   r`   r]   r^   ra   )rN   rO   rP   rQ   r-   rF   rS   rT   rU   s   @r5   rW   rW   I   s    	7 r7   rW   c                      ^  \ rS rSr% \R
                  \S'   SU 4S jjr\R                  " 5       \	S 5       5       r
SrU =r$ )T5GemmaRotaryEmbedding\   inv_freqc                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultrp   F)
persistent)r,   r-   hasattr
isinstancerr   dictgetrs   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   r   rope_init_fnattention_scalingregister_bufferrp   original_inv_freq)r3   r\   devicerp   r4   s       r5   r-   T5GemmaRotaryEmbedding.__init___   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r7   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r:   r"   mpscpuF)device_typeenabledr9   r(   dtype)rp   rC   expandrJ   tor   rx   rt   strr0   autocast	transposecatcosr   sinr   )
r3   r?   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r5   rF   T5GemmaRotaryEmbedding.forwardp   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r\   r|   r   r}   r~   rs   r+   )rN   rO   rP   rQ   r0   Tensor__annotations__r-   no_gradr   rF   rS   rT   rU   s   @r5   rn   rn   \   s4    ll/" ]]_<  <r7   rn   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr:   r9   r   )rJ   r0   r   )r?   x1x2s      r5   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr7   rk   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rJ   r   reshape)rk   r   batchnum_key_value_headsslenhead_dims         r5   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr7   modulequerykeyvalueattention_maskrg   scalingsoftcapc                    Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb"  US S 2S S 2S S 2S U	R                  S   24   nX-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R!                  5       nX4$ )	N      r9   r   r:   )r(   r   )ptrainingr"   )r   r   num_key_value_groupsr0   matmulr   tanhrJ   r.   
functionalsoftmaxfloat32r   r   rg   r   
contiguous)r   r   r   r   r   rg   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r5   eager_attention_forwardr      s/    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r7   c                   \  ^  \ rS rSrSrS\S\4U 4S jjr\" SSSS	9  SS
\	R                  S\\	R                  \	R                  4   S\\	R                     S\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jj5       rSrU =r$ )T5GemmaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperr\   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        UR                  U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR                  U R                  -  UR
                  UR"                  S9U l        U R                  R,                  U l        UR.                  U   S:X  a  UR0                  U l        g S U l        g )Nr   r   rZ   sliding_attention)r,   r-   r\   r   getattrr]   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr.   r_   attention_biasq_projk_projv_projo_projattn_logit_softcappinglayer_typessliding_windowr3   r\   r   r4   s      r5   r-   T5GemmaSelfAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>**ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7=7I7I)7TXk7kf33qur7   past_key_valuepast_key_values4.58new_nameversionrk   position_embeddingsr   cache_positionr   r   c                 `   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       a  U R                  OSU R                   U R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ Nr:   r"   r9   )r   r   r   eager        rg   r   r   r   rJ   r   r   viewr   r   r   r   updater   r   r\   _attn_implementationr   r   r   r   r   r   r   r   r   r3   rk   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r5   rF   T5GemmaSelfAttention.forward       $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7%
 /3mmD**LL..//%
 %
!\ "));;;;FFHkk+.L((r7   r   r   r\   r   r   r   r   r   r   r   r   r   r   NN)rN   rO   rP   rQ   __doc__r$   rR   r-   r   r0   r   rI   r   r   
LongTensorr   r   rF   rS   rT   rU   s   @r5   r   r      s    Gv2 vs v4 %0A6R ,059+)||+) #5<<#=>+) !.	+)
 "%+) !!1!12+) -.+) 
u||Xell3XeELL>Q5RR	S+) S+)r7   r   c                   $  ^  \ rS rSrSrS\S\4U 4S jjr\" SSSS	9 SS
\	R                  S\\	R                     S\\	R                     S\\   S\\   S\\	R                  \\	R                     \\\	R                        4   4S jj5       rSrU =r$ )T5GemmaCrossAttentioni  r   r\   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R,                  U l        UR$                  c  [/        S5      eg )Nr   r   FrZ   zBCross-attention needs cross_attention_hidden_size to be specified.)r,   r-   r\   r   r   r]   r   r   r   r   r   r   r   r   r.   r_   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   s      r5   r-   T5GemmaCrossAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
 ii&&68J8JQWQfQf
 '+kk&H&H#--5abb 6r7   r   r   r   r   rk   r   encoder_hidden_statesr   r   c                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         nU R"                  R$                  S:w  a  [&        U R"                  R$                     nU" U UUUU4U R(                  (       a  U R*                  OSU R,                  S U R.                  S.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )	Nz5Encoder hidden state is required for cross attention.r:   r"   r9   Tr   r   r   )r  rJ   r   r   r   r   
is_updatedrz   r   cross_attention_cacher   r   r   layerskeysvaluesr   r\   r   r   r   r   r   r   r   r   r   )r3   rk   r   r  r   r   r   r   r   r  curr_past_key_valueencoder_input_shapeencoder_hidden_shaper   r   r   r   r   s                     r5   rF   T5GemmaCrossAttention.forward<  sC    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ"1"G"G"*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+>+E+Ej`d`n`n+o(
=A**4>>:,33DNNCHHJ.55dnnELLL(?;;++w6"9$++:Z:Z"[$7%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r7   )r   r   r\   r   r   r   r   r   r   r   r   r   r+   )rN   rO   rP   rQ   r   r$   rR   r-   r   r0   r   r   r   r   r   rI   rF   rS   rT   rU   s   @r5   r  r    s    Gc2 cs c8 %0A6R ,03)||3) !.3)  (5	3)
 "%3) -.3) 
u||Xell3XeELL>Q5RR	S3) S3)r7   r  c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\	\R                  4   4
S
 jjrSrU =r$ )T5GemmaEncoderLayeris  zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)r\   r   r)   )r,   r-   r]   r\   r   r   attention_typer   	self_attnr&   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrW   mlppre_feedforward_layernormpost_feedforward_layernormr.   re   rf   rg   r   s      r5   r-   T5GemmaEncoderLayer.__init__v  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r7   rk   r   r   r   r   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)rk   r   r   r   r    )r  r  r  rg   r  r  r  )r3   rk   r   r   r   r   residual_s           r5   rF   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r7   )r  r\   rg   r]   r   r  r  r  r  r  r  r   )rN   rO   rP   rQ   r   rR   r-   r0   r   rI   r   r   FloatTensorrF   rS   rT   rU   s   @r5   r  r  s  s    7# 70 2637|| #5<<#=> !.	
 u//0 
u  !	" r7   r  c                   v  ^  \ rS rSrSrS\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\R                     S\\   S\\   S\\R                     S\\R                     S\\R                     S\R                  4S jj5       rSrU =r$ )T5GemmaDecoderLayeri  z2Decoder sub-layer: an extra cross-attention layer.r   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        g r  )	r,   r-   r  
cross_attnr&   r]   r  pre_cross_attn_layernormpost_cross_attn_layernormr   s      r5   r-   T5GemmaDecoderLayer.__init__  sS    +/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r7   r   r   r   r   rk   r   r   r   	use_cacher   r  encoder_attention_maskr   c
                    UnU R                  U5      nU R                  " SUUUUUb  UR                  OS UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  " SUUU	UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)rk   r   r   r   r   r,  r   )rk   r  r   r   r,  r   )r  r  self_attention_cacher  rg   r)  r(  r*  r  r  r  )r3   rk   r   r   r   r   r,  r   r  r-  r   r!  r"  s                r5   rF   T5GemmaDecoderLayer.forward  s0    !44]C>> 	
' 3)%DSD_O@@ei)	
 	
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r7   )r(  r*  r)  )NNNFNNN)rN   rO   rP   rQ   r   rR   r-   r   r0   r   rI   r   r   r
   boolr$  rF   rS   rT   rU   s   @r5   r&  r&    s   <e# e %0A6R
 26379=$)598<9=.||. #5<<#=>. !.	.
 u//0. ""56. D>. !!1!12.  (5. !) 6. 
		. S.r7   r&  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadi  z-Head for sentence-level classification tasks.r]   
num_labelsclassifier_dropout_ratec                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)r   )r,   r-   r.   re   rg   r_   out_proj)r3   r]   r4  r5  r4   s       r5   r-   "T5GemmaClassificationHead.__init__  s/    zz$;<		+:r7   rk   r   c                 J    U R                  U5      nU R                  U5      nU$ r+   rg   r7  )r3   rk   s     r5   rF   !T5GemmaClassificationHead.forward  s$    ]3m4r7   r:  )r   )rN   rO   rP   rQ   r   rR   rC   r-   r0   r   rF   rS   rT   rU   s   @r5   r3  r3    sF    7;C ;S ;SX ; ;
U\\ ell  r7   r3  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.r]   
vocab_sizer[   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )NrZ   )r,   r-   r.   r_   r7  )r3   r]   r>  r[   r4   s       r5   r-   T5GemmaLMHead.__init__  s     		+Er7   rk   r   c                 (    U R                  U5      nU$ r+   r7  )r3   rk   logitss      r5   rF   T5GemmaLMHead.forward  s    }-r7   rB  )F)rN   rO   rP   rQ   r   rR   r1  r-   r0   r   rF   rS   rT   rU   s   @r5   r=  r=    sJ    8FC FS F F FU\\ ell  r7   r=  c                   \  ^  \ rS rSrSrS\S\4U 4S jjr\" SSSS	9  SS
\	R                  S\\	R                  \	R                  4   S\\	R                     S\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jj5       rSrU =r$ )T5GemmaAttentioni  r   r\   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R*                  U l        UR,                  U   S:X  a  UR.                  U l        g S U l        g )Nr   r   TrZ   r   )r,   r-   r\   r   r   r]   r   r   r   r   r   r   r   r   r.   r_   r   r   r   r   r   r   r   r   r   s      r5   r-   T5GemmaAttention.__init__  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7=7I7I)7TXk7kf33qur7   r   r   r   r   rk   r   r   r   r   r   c                 `   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       a  U R                  OSU R                   U R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ r   r   r   s                     r5   rF   T5GemmaAttention.forward  r   r7   r   r   )rN   rO   rP   rQ   r   r#   rR   r-   r   r0   r   rI   r   r   r   r   r   rF   rS   rT   rU   s   @r5   rF  rF    s    Gv} v v2 %0A6R ,059+)||+) #5<<#=>+) !.	+)
 "%+) !!1!12+) -.+) 
u||Xell3XeELL>Q5RR	S+) S+)r7   rF  c                   n   ^  \ rS rSr% \\S'   SrSrSS/rS/r	Sr
SrSrSrSr\\S.rU 4S	 jrS
 rSrU =r$ )T5GemmaPreTrainedModeliG  r\   modelTr  r&  r   )rk   
attentionsc                 z  > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  [        UR                  S5      (       aG  UR                  R                  b/  UR                  R                  R                  R                  5         g g g [	        U[        5      (       as  U R                  R                  (       dW  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  g g SUR                   R"                  ;   a%  UR                  R                  R                  5         g g )Nr   r   r   )r>   stdr[   RMSNorm)r,   _init_weightsr\   initializer_rangerx   r3  r7  r2   rJ   datanormal_rw   r[   zero_r=  tie_word_embeddingsr4   rN   )r3   r   rP  scaler4   s       r5   rR  $T5GemmaPreTrainedModel._init_weightsY  sJ   f%kk++f788OO**003t;EOO""''//Sck/Jv//FOO4H4H4T$$))//1 5U/..;;22..44Q74?&&++33#+3N 3 &**333MM$$& 4r7   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r:   r"   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r\   decoderbos_token_idpad_token_idr  	new_zerosrJ   clonemasked_fill_)r3   	input_idsdecoder_start_token_idr]  shifted_input_idss        r5   _shift_right#T5GemmaPreTrainedModel._shift_rightj  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r7   r   )rN   rO   rP   rQ   r#   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr&  rF  _can_record_outputsrR  rd  rS   rT   rU   s   @r5   rL  rL  G  sd    &*#.0EF#4"5N!"&,&
'"! !r7   rL  c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z,
This creates bidirectional attention mask.
	batch_idxhead_idxq_idxkv_idxr   c                    > Tc#  [         R                  " S[         R                  S9$ TX4   R                  [         R                  5      $ )Nr   r   )r0   onesr1  r   )rq  rr  rs  rt  r   s       r5   
inner_mask/bidirectional_mask_function.<locals>.inner_mask  s;    !::b

33i/033EJJ??r7   rR   r1  )r   rw  s   ` r5   bidirectional_mask_functionrz    s9    
@c @S @ @c @d @
 r7   r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z@
This creates bidirectional attention mask with sliding window.
rq  rr  rs  rt  r   c                 $   > UT-
  U:  X2T-   :  -  $ r+   r   )rq  rr  rs  rt  r   s       r5   rw  >sliding_window_bidirectional_mask_function.<locals>.inner_mask  s     &/F^=S4STTr7   ry  )r   rw  s   ` r5   *sliding_window_bidirectional_mask_functionr~    s9    
Uc US U Uc Ud U r7   	token_idsr]  c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r"   r   r   )r  r   r   r0   longrv  rJ   )r  rk   r]  r   s       r5   make_default_2d_attention_maskr    s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r7   c                      ^  \ rS rSr\\S.rU 4S jr\" 5           SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\\   S	\4S
 jj5       rSrU =r$ )T5GemmaEncoderi  )rN  rk   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        [        US9U l        SU l        [
        R                  " [!        UR"                  5       Vs/ s H  n[%        X5      PM     sn5      U l        [
        R(                  " UR*                  5      U l        U R/                  5         g s  snf )Nr  r\   F)r,   r-   r]  padding_idxr>  r.   	Embeddingr]   embed_tokensr&   r  normrn   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr  r
  re   rf   rg   	post_initr   s      r5   r-   T5GemmaEncoder.__init__  s     !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	0?&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"56 	 fs   D$ra  r   r   inputs_embedsr   r   c           	         US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      n[        R                  " SUR
                  S   UR                  S9nUc  UR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       db  U R                  UUUS US.n[        S0 UDS[        U5      0D6[        S0 UD[!        U R                  R"                  5      [        U5      S.D6S	.nUn	U R%                  X5      n
[        R&                  " U R                  R(                  S
-  U	R*                  S9nX-  n	U R-                  U	5      n	U R.                  S U R                  R0                    H  nU" U	U
X|R2                     U40 UD6n	M     U R5                  U	5      n	U R-                  U	5      n	[7        U	S9$ )N:You must specify exactly one of input_ids or inputs_embedsr   r   r"   r   r\   input_embedsr   r   r   r   or_mask_function)r  and_mask_functionfull_attentionr         ?r   )last_hidden_stater   )r  popr  r0   arangerJ   r   r   r  r\   r]  rx   ry   r   rz  r   r~  r   r  tensorr]   r   rg   r
  r  r  r  r   )r3   ra  r   r   r  r   r   self_attn_mask_mappingmask_kwargsrk   r   
normalizerlayer_modules                r5   rF   T5GemmaEncoder.forward  s    -t";<YZZ 	

$d+  --i8Ma)<)<Q)?H\H\])33A6L!;IVZVaVaVnVnoNNB0DII++ -"0"0#' ,K #5 #!#%@%P# &G &!&%OPTP[P[PjPj%k&A.&Q&
&" &"oomJ\\$++"9"93">mFYFYZ
%2]3 KK(G$++*G*GHL(#&'B'BC	
 M I 		-0]3+
 	
r7   )rg   r  r  r
  r  r  r  r>  NNNN)rN   rO   rP   rQ   r   r  ro  r-   r!   r   r0   r   r   r$  r   r   r   rF   rS   rT   rU   s   @r5   r  r    s    *,
$  15153759A
E,,-A
 !.A
 u//0	A

   1 12A
 +,A
 
A
 A
r7   r  c                   p  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
" 5                SS\\R                     S\\R                     S\\R                     S	\\   S
\\R                      S\\   S\\R                     S\\R                     S\\R                     S\\   S\4S jj5       rSrU =r$ )T5GemmaDecoderi  r"   )index)rN  cross_attentionsrk   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        U R                  5         g s  snf r+   )	r,   r-   r.   r  r  r  r&  r
  r  r   s      r5   r-   T5GemmaDecoder.__init__  sW     mmEJ6KcKcEdeEd	 3Ede
 	 fs   A*ra  r   r   r   r  r,  r   r  r-  r   r   c
                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d8  U(       a1  Uc.  [        [	        U R
                  S9[	        U R
                  S95      nUcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d9  U R
                  UUUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U	=n[        5      (       d-  U R
                  UU	US S S.nS	[#        S0 UDS
['        U	5      0D60nUnU R)                  X5      n[        R*                  " U R
                  R,                  S-  UR.                  S9nUU-  nU R1                  U5      nU R2                  S U R
                  R4                    H$  nU" UUUUR6                     UUUUUUS	   4	0 U
D6nM&     U R9                  U5      nU R1                  U5      n[;        UUS9$ )Nr  z0`encoder_hidden_states` must be given in decoderr  r   r"   r  r  r  r  r  r  r   )r  r   r   )r  r  r   r
   r	   r\   get_seq_lengthr0   r  rJ   r   r   r  r]  rx   ry   r/  r   r   rz  r  r  r]   r   rg   r
  r  r  r  r   )r3   ra  r   r   r   r  r,  r   r  r-  r   past_seen_tokensr  r  cross_attn_mask_mappingrk   r   r  r  s                      r5   rF   T5GemmaDecoder.forward  s    -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`hlhshsTtuO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L!o&=;IVZVaVaVnVnoNNB0DII++ -"0"0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR++ 5"8"0#' $K !"4 #!#%@AW%X#'# &"oomJ\\$++"9"93">mFYFYZ
%
2]3 KK(G$++*G*GHL(#&|'B'BC%'(89 M I 		-0]38++
 	
r7   )r
  )	NNNNNNNNN)rN   rO   rP   rQ   r    r   r  r&  ro  r-   r!   r   r0   r   r   r
   r$  r1  r   r   r   rF   rS   rT   rU   s   @r5   r  r    s-   $%9C*+@J,  1515379=59$(598<9=Z
E,,-Z
 !.Z
 u//0	Z

 ""56Z
   1 12Z
 D>Z
 !!1!12Z
  (5Z
 !) 6Z
 +,Z
 
3Z
 Z
r7   r  c                     ^  \ rS rSrS\4U 4S jjrS rS rS r\	\
            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\   S\\R$                     S\\R$                     S\\   S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModelix  r\   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r,   r-   is_encoder_decoderr  r  encoderr  r[  r  rh   s     r5   r-   T5GemmaModel.__init__z  sO     ((uvv%fnn5%fnn5r7   c                     U R                   $ r+   r  rK   s    r5   get_encoderT5GemmaModel.get_encoder  s    ||r7   c                 6    U R                   R                  5       $ r+   r  get_input_embeddingsrK   s    r5   r  !T5GemmaModel.get_input_embeddings      ||0022r7   c                 8    U R                   R                  U5      $ r+   r  set_input_embeddingsr3   new_embeddingss     r5   r  !T5GemmaModel.set_input_embeddings      ||00@@r7   ra  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr,  r   r   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUUS.	UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
ra  r   r   r  )	ra  r   r   r  r   r  r-  r,  r   output_hidden_statesF)r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsr   )	r  r  r[  r   r   rz   rk   rN  r  )r3   ra  r   r   r  r  r  r  r   r  r  r,  r   r   r  decoder_outputss                   r5   rF   T5GemmaModel.forward  s    . ""ll #-)+	
 O !0 A A,, 
'1-/+"7#1)
 
 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r7   )r[  r  )NNNNNNNNNNNN)rN   rO   rP   rQ   r#   r-   r  r  r  r   r   r   r0   r   r$  
BoolTensorr   r
   r   r1  r   r   r   rF   rS   rT   rU   s   @r5   r  r  x  s_   	} 	3A  156:378<=A;?599=048<$(598
E,,-8
 !!2!238
 u//0	8

 $E$4$458
 !))9)9 :8
 'u'7'788
 "/28
 ""568
  -8
  (58
 D>8
 !!1!128
 +,8
 
8
  8
r7   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  r\   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r,   r-   r  r  r  r  r  rh   s     r5   r-   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r7   c                 6    U R                   R                  5       $ r+   r  rK   s    r5   r  (T5GemmaEncoderModel.get_input_embeddings  r  r7   c                 8    U R                   R                  U5      $ r+   r  r  s     r5   r  (T5GemmaEncoderModel.set_input_embeddings  r  r7   ra  r   r   r  r   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nr  r   r  )r3   ra  r   r   r  r   r  s          r5   rF   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r7   r  r  )rN   rO   rP   rQ   r#   r-   r  r  r   r   r   r0   r   r$  r   r   r   r   rF   rS   rT   rU   s   @r5   r  r    s    } 3A  156:3704E,,- !!2!23 u//0	
  - +, 
  r7   r  c            %       l  ^  \ rS rSrSS/rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
S rS rS r\\              S"S\\R$                     S\\R&                     S\\R$                     S\\R$                     S\\R(                     S\\R$                     S\\   S\\   S\\R&                     S\\R&                     S\\R$                     S\\   S\\R$                     S\\\R4                  4   S\\   S\\\R&                     \4   4 S jj5       5       rS\R4                  4S  jr S!r!U =r"$ )#T5GemmaForConditionalGenerationi  z!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_reprk   rC  r\   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)r  r,   r-   r  rM  r[  r>  r=  r]   lm_head	loss_typer  rh   s     r5   r-   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r7   c                 $    XR                   l        g r+   r  r7  r  s     r5   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings   s     .r7   c                 .    U R                   R                  $ r+   r  rK   s    r5   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings  s    ||$$$r7   c                     U R                   R                  (       aC  U R                  U R                  R                  U R                  5       R                  5       5        g g r+   )r\   rW  _tie_or_clone_weightsr  r7  get_decoderr  rK   s    r5   _tie_weights,T5GemmaForConditionalGeneration._tie_weights  s@    ;;**&&t||'<'<d>N>N>P>e>e>gh +r7   c                 .    U R                   R                  $ r+   )rM  r  rK   s    r5   r  +T5GemmaForConditionalGeneration.get_encoder      zz!!!r7   c                 .    U R                   R                  $ r+   )rM  r[  rK   s    r5   r  +T5GemmaForConditionalGeneration.get_decoder  r  r7   ra  r   r   r  r  r  r  r   r  r  labelsr,  r   logits_to_keepr   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
UUS.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)ra  r   r   r  r  r  r  r   r  r  r,  r   )	lossrC  r   r  r  r  r  r  r  r   )rd  rM  r  rx   rR   slicer  r  r\   final_logit_softcappingr0   r   loss_functionr>  r   r   r  r  r  r  r  r  )r3   ra  r   r   r  r  r  r  r   r  r  r  r,  r   r  r   r  rk   slice_indicesrC  decoder_configr  s                         r5   rF   'T5GemmaForConditionalGeneration.forward  ss   < "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7)/
 /
  (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r7   c                 $    U R                  U5      $ r+   )rd  )r3   r  s     r5   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels^  s      ((r7   )r  r  rM  r>  )NNNNNNNNNNNNNr   )#rN   rO   rP   rQ   _tied_weights_keys_tp_plan_pp_planr#   r-   r  r  r  r  r  r   r   r   r0   r   r$  r  r   r
   r1  r   rR   r   r   r   rI   r   rF   r  rS   rT   rU   s   @r5   r  r    s   =?XY"M2H"o%6
$CDH	} 	/%i
""  156:378<=A;?599=59=A-1$(5934I
E,,-I
 !!2!23I
 u//0	I

 $E$4$45I
 !))9)9 :I
 'u'7'78I
 "/2I
 ""56I
   1 12I
  ((9(9:I
 ))*I
 D>I
 !!1!12I
 c5<</0I
  +,!I
" 
uU&&'8	9#I
  I
V)ELL ) )r7   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationib  r\   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
Nr5  皙?r  r,   r-   r4  r  rM  r  r  r]   r[  r   r3  scorer  r3   r\   r  r]   classifier_dropoutr4   s        r5   r-   )T5GemmaForSequenceClassification.__init__d  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r7   c                 6    U R                   R                  5       $ r+   rM  r  rK   s    r5   r  5T5GemmaForSequenceClassification.get_input_embeddings{      zz..00r7   c                 :    U R                   R                  U5        g r+   rM  r  r3   r   s     r5   r  5T5GemmaForSequenceClassification.set_input_embeddings~      

''.r7   ra  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   r  r  r  r  r  r  r,  r   r   r  r   r"   z=Cannot handle batch sizes > 1 if no padding token is defined.r:   r  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )rC  r  pooled_logitsr\   r  rC  rk   rN  )r\   r  NotImplementedErrorr4   rN   r  rd  rM  r  r  r  rk   rN  r  rJ   r]  r   r   r0   int32r  argmaxclamploggerwarning_oncer  r   )r3   ra  r   r   r  r  r  r  r  r  r  r   outputsr  rk   rN  rC  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr%  r  s                          r5   rF   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r7   rM  r4  r  r+   
NNNNNNNNNN)rN   rO   rP   rQ   r#   r   r1  r-   r  r  r   r   r0   r   r   r   r$  r   r   r   rF   rS   rT   rU   s   @r5   r  r  b  sS   } (4.  .1/  1515378<9=;?5959=A-1i
E,,-i
 !.i
 u//0	i

 $E$4$45i
 !) 6i
 'u'7'78i
 "/2i
   1 12i
  ((9(9:i
 ))*i
 +,i
 
"i
  i
r7   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi  r\   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
Nr5  r  r  r  s        r5   r-   &T5GemmaForTokenClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r7   c                 6    U R                   R                  5       $ r+   r  rK   s    r5   r  2T5GemmaForTokenClassification.get_input_embeddings	  r  r7   c                 :    U R                   R                  U5        g r+   r  r  s     r5   r  2T5GemmaForTokenClassification.set_input_embeddings  r  r7   ra  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r   r!  Fr"  r#  r&  )r\   r  r'  r4   rN   r  rd  rM  r  r  r  rk   rN  r  r  r   )r3   ra  r   r   r  r  r  r  r  r  r  r   r-  r  rk   rN  rC  r  s                     r5   rF   %T5GemmaForTokenClassification.forward  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r7   r3  r+   r4  )rN   rO   rP   rQ   r#   r   r1  r-   r  r  r   r   r0   r   r   r   r$  r   r   r   rF   rS   rT   rU   s   @r5   r6  r6    sS   } (4.  01/  1515378<9=;?5959=A-1N
E,,-N
 !.N
 u//0	N

 $E$4$45N
 !) 6N
 'u'7'78N
 "/2N
   1 12N
  ((9(9:N
 ))*N
 +,N
 
N
  N
r7   r6  )r  r  r  rL  r  r6  )Nr"   )r   NN)Utypingr   r   r   r0   torch.nnr.   activationsr   cache_utilsr   r	   r
   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr    r!   configuration_t5gemmar#   r$   
get_loggerrN   r+  Moduler&   rW   rn   r   r   r   rR   r   rC   rI   r   r   r  r  r&  r3  r=  rF  rL  rz  r~  r   r  r  r  r  r  r  r  r6  __all__r   r7   r5   <module>rR     s  , - ,   ! C C ) R B 9  L F & R R 0 ? E 
		H	%=RYY =( &!<RYY !<H(6	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %FI)299 I)XS)BII S)l14 1h8- 8v		 	BII 	H)ryy H)V :!_ :! :!z
0F 
8 
s x (()<< 3- \\	"Z
+ Z
zj
^ j
Z O
) O
 O
d !0 ! !Ho)&<o o)d I
'= I
 I
X o
$: o
 o
dr7   