
    cCi~                        S SK JrJrJrJr  S SKrS SKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$  SSK%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0J1r1J2r2J3r3J4r4  Sr5\$Rl                  " \75      r8 " S S\+5      r9 " S S\5      r: " S S\05      r; " S S\.5      r< " S S\15      r= " S S\-5      r> " S S \-5      r?S!\\R                     S"\4S# jrAS$\BS"\4S% jrC " S& S'\5      rD " S( S)\D5      rE " S* S+\R                  5      rG " S, S-\R                  5      rH\" " S. S/\/5      5       rIS0\\R                     S1\R                  S2\\B   S"\R                  4S3 jrK " S4 S5\I5      rL " S6 S7\L5      rM\" " S8 S9\I5      5       rN\" " S: S;\I5      5       rO " S< S=\I\5      rP\" " S> S?\I5      5       rQ\" " S@ SA\I5      5       rR/ SBQrSg)C    )AnyCallableOptionalUnionN   )CacheDynamicCacheEncoderDecoderCache)PretrainedConfig)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )Gemma2Config)Gemma2Attention	Gemma2MLPGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forwardz google/t5gemma-2b-2b-prefixlm-itc                       \ rS rSrSrg)T5GemmaModuleConfig?    N__name__
__module____qualname____firstlineno____static_attributes__r,       e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/t5gemma/modular_t5gemma.pyr*   r*   ?       r3   r*   c                   f  ^  \ rS rSrSrSrS/r0 SS_SS_SS_S	S
_SS_SS_SS
_SS_SS_SS_SS
_SS_SS_SS_SS
_SS_SS_SS
0ErS/S/4SS/S/4S/S/4S/S/4SS/S/4S/S/4S.r        S)S\	\
\\\\4   4      S\	\
\\\\4   4      S \S!\S"\S#\S$\S%\4U 4S& jjjrU 4S' jrS(rU =r$ )*T5GemmaConfigC   aP  
This is the configuration class to store the configuration of a [`T5GemmaModel`]. It is used to instantiate an T5Gemma
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to a hypothetical balanced Gemma2 encoder-decoder model.
e.g. [google/t5gemma-2b-2b-prefixlm-it](https://huggingface.co/google/t5gemma-2b-2b-prefixlm-it)
```python
>>> from transformers import T5GemmaConfig, T5GemmaModel
>>> t5gemma_config = T5GemmaConfig.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
>>> model = T5GemmaModel(t5gemma_config)
```
Configuration objects inherit from [PretrainedConfig] and can be used to control the model outputs. Read the
documentation from [PretrainedConfig] for more information.
Args:
    encoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
        Configuration for the encoder.
    decoder (`Union[T5GemmaModuleConfig, dict]`, optional, *optional*):
        Configuration for the decoder.
    is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
        Whether the model is used as an encoder/decoder or not.
    dropout_rate (`float`, *optional*, defaults to 0.0):
        The ratio for all dropout layers (following T5).
    classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
        The dropout ratio for classifier (following T5).
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for attention.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether tie input and output embeddings.
    vocab_size (`int`, *optional*, defaults to 256000):
        Vocabulary size of the T5Gemma model (the same as Gemma 2).
    kwargs (additional keyword arguments, optional, *optional*):
        Will be passed to the PretrainedConfig base class.
t5gemmapast_key_valuesz!encoder.layers.*.self_attn.q_projcolwisez!encoder.layers.*.self_attn.k_projz!encoder.layers.*.self_attn.v_projz!encoder.layers.*.self_attn.o_projrowwisezencoder.layers.*.mlp.gate_projzencoder.layers.*.mlp.up_projzencoder.layers.*.mlp.down_projz!decoder.layers.*.self_attn.q_projz!decoder.layers.*.self_attn.k_projz!decoder.layers.*.self_attn.v_projz!decoder.layers.*.self_attn.o_projz"decoder.layers.*.cross_attn.q_projz"decoder.layers.*.cross_attn.k_projz"decoder.layers.*.cross_attn.v_projz"decoder.layers.*.cross_attn.o_projzdecoder.layers.*.mlp.gate_projzdecoder.layers.*.mlp.up_projzdecoder.layers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)zencoder.embed_tokenszencoder.layerszencoder.normzdecoder.embed_tokenszdecoder.layerszdecoder.normencoderdecoderis_encoder_decoderdropout_rateclassifier_dropout_rateattention_dropouttie_word_embeddings
vocab_sizec	                 p  > [        U[        5      (       a  [        S0 UD6nO6Uc  [        5       nO([        U[        5      (       d   [        U5       S35       e[        U[        5      (       a  [        S0 UD6nO.Uc  UnO([        U[        5      (       d   [        U5       S35       e[        S0 UR	                  5       D6n[        S0 UR	                  5       D6nSUl        XAl        Xal        Xl        SUl        SUl	        XBl        Xbl        UR                  Ul        X l        S H  n
X;  d  M
  [        X*5      X'   M     [        TU ]<  " S0 U	D6  X0l        U	R#                  SUR                  5      U l	        U	R#                  SUR$                  5      U l        X@l        X`l        XPl        Xpl        Xl        g )Nz is not supported.FT)bos_token_idpad_token_ideos_token_id	use_cacheinitializer_ranger,   )
isinstancedictr*   typeto_dict
is_decoderrD   rF   rA   rM   hidden_sizecross_attention_hidden_sizerB   getattrsuper__init__rC   getrN   rE   rG   rH   )selfrA   rB   rC   rD   rE   rF   rG   rH   kwargsspecial_token_key	__class__s              r4   rX   T5GemmaConfig.__init__   s    gt$$)4G4G_)+Gg':;;aWN`=aa;gt$$)4G4G_Gg':;;aWN`=aa;%:(9:%:(9:"+$5!! +$5!.5.A.A+!Q .,3G,O) "R 	"6""4K1B1BC!',?AZAZ![(!2'>$#6  %r3   c                    > / SQnX;   a,  [        U R                  X5        [        U R                  X5        [        TU ]  X5        g )N)output_hidden_statesoutput_attentions_attn_implementationrD   rF   rH   )setattrrA   rB   rW   __setattr__)rZ   keyvalueshared_attr_with_submodulesr]   s       r4   rd   T5GemmaConfig.__setattr__   s<    '
# -DLL#-DLL#-C'r3   )
rF   rE   rB   rD   rA   rN   rC   rG   rM   rH   )NNT        ri   ri   Ti  )r.   r/   r0   r1   __doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planr   r   r*   rP   r   boolfloatintrX   rd   r2   __classcell__r]   s   @r4   r7   r7   C   s   B J#4"5+Y 	,Y 	,Y	
 	,Y 	)) 	'	 	)) 	,Y 	,Y 	,Y 	,Y 	-i 	-i  	-i!" 	-i#$ 	))%& 	'	'( 	)))0 #.0A B+-=>@QR)*_,=>"-0A B+-=>@QR)*_,=>	 IMHL#'!),#&$( 8%% 3T#s(^ CDE8% % 3T#s(^ CDE8% !	8%
 8% "'8% !8% "8% 8% 8%t( (r3   r7   c                       \ rS rSrSrg)T5GemmaRMSNorm   r,   Nr-   r,   r3   r4   ru   ru      r5   r3   ru   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLP   c                 n   > [         TU ]  U5        [        R                  " UR                  5      U l        g N)rW   rX   nnDropoutrD   dropoutrZ   configr]   s     r4   rX   T5GemmaMLP.__init__   s&     zz&"5"56r3   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ r{   )act_fn	gate_projup_projr~   	down_proj)rZ   xr?   r   s       r4   forwardT5GemmaMLP.forward   sH    DNN1$56aH]3NN=1	r3   )r~   )r.   r/   r0   r1   rX   r   r2   rr   rs   s   @r4   rx   rx      s    7 r3   rx   c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )T5GemmaRotaryEmbedding   c                 $   > [         TU ]  X5        g r{   )rW   rX   )rZ   r   devicer]   s      r4   rX   T5GemmaRotaryEmbedding.__init__   s    (r3   r,   r{   )r.   r/   r0   r1   rX   r2   rr   rs   s   @r4   r   r      s    ) )r3   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )T5GemmaSelfAttention   r   	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g r{   )rW   rX   rS   	is_causalrZ   r   r   r]   s      r4   rX   T5GemmaSelfAttention.__init__   s    +**r3   )r   )	r.   r/   r0   r1   r*   rq   rX   r2   rr   rs   s   @r4   r   r      s    +2 +s + +r3   r   c                      ^  \ rS rSrS\S\4U 4S jjr\" SSSS9 SS	\R                  S
\
\R                     S\
\R                     S\
\   S\\   S\\R                  \
\R                     \
\\R                        4   4S jj5       rSrU =r$ )T5GemmaCrossAttention   r   r   c                 ~  > [         TU ]  X5        U ?SU l        UR                  c  [        S5      e[        R                  " UR                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  UR                  U R                  -  UR                  S9U l        g )NFzBCross-attention needs cross_attention_hidden_size to be specified.bias)rW   rX   sliding_windowr   rU   
ValueErrorr|   Linearnum_key_value_headshead_dimattention_biask_projv_projr   s      r4   rX   T5GemmaCrossAttention.__init__   s    +--5abbii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
r3   past_key_valuer:   4.58new_nameversionr?   r@   encoder_hidden_statesr[   returnc                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         nU R"                  R$                  S:w  a  [&        U R"                  R$                     nU" U UUUU4U R(                  (       a  U R*                  OSU R,                  S U R.                  S.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )	Nz5Encoder hidden state is required for cross attention.   r   Teagerri   )r~   scalingr   softcap)r   shaper   q_projview	transpose
is_updatedrY   r   cross_attention_cacher   r   updatelayerskeysvaluesr(   r   rb   r   trainingrF   r   attn_logit_softcappingreshape
contiguouso_proj)rZ   r?   r@   r   r:   r[   input_shapehidden_shapequery_statesr   curr_past_key_valueencoder_input_shapeencoder_hidden_shape
key_statesvalue_statesattention_interfaceattn_outputattn_weightss                     r4   r   T5GemmaCrossAttention.forward   sC    !(TUU#))#2.88b8$--8{{=166|DNNqRST&(3377GJ"1"G"G"*"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL*+>+E+Ej`d`n`n+o(
=A**4>>:,33DNNCHHJ.55dnnELLL(?;;++w6"9$++:Z:Z"[$7%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r3   )r   r   r   r{   )r.   r/   r0   r1   r*   rq   rX   r   torchTensorr   r   r   r   tupler   r2   rr   rs   s   @r4   r   r      s    
2 
s 
 %0A6R ,03)||3) !.3)  (5	3)
 "%3) -.3) 
u||Xell3XeELL>Q5RR	S3) S3)r3   r   r@   r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z,
This creates bidirectional attention mask.
	batch_idxhead_idxq_idxkv_idxr   c                    > Tc#  [         R                  " S[         R                  S9$ TX4   R                  [         R                  5      $ )Nr,   dtype)r   onesro   to)r   r   r   r   r@   s       r4   
inner_mask/bidirectional_mask_function.<locals>.inner_mask:  s;    !::b

33i/033EJJ??r3   rq   ro   )r@   r   s   ` r4   bidirectional_mask_functionr   5  s9    
@c @S @ @c @d @
 r3   r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z@
This creates bidirectional attention mask with sliding window.
r   r   r   r   r   c                 $   > UT-
  U:  X2T-   :  -  $ r{   r,   )r   r   r   r   r   s       r4   r   >sliding_window_bidirectional_mask_function.<locals>.inner_maskG  s     &/F^=S4STTr3   r   )r   r   s   ` r4   *sliding_window_bidirectional_mask_functionr   B  s9    
Uc US U Uc Ud U r3   c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\	\R                  4   4
S
 jjrSrU =r$ )T5GemmaEncoderLayeriM  zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)r   r   eps)rW   rX   rT   r   r   layer_typesattention_typer   	self_attnru   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrx   mlppre_feedforward_layernormpost_feedforward_layernormr|   r}   rD   r~   r   s      r4   rX   T5GemmaEncoderLayer.__init__P  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r3   r?   position_embeddingsr@   position_idsr   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)r?   r   r@   r   r:   r,   )r   r   r   r~   r   r   r   )rZ   r?   r   r@   r   r[   residual_s           r4   r   T5GemmaEncoderLayer.forwardd  s     !44]C>> 
' 3)% 
 
 55mD <<#>> 66}E/77F <<#>>r3   )r   r   r~   rT   r   r   r   r   r   r   r   )NN)r.   r/   r0   r1   rj   rq   rX   r   r   r   r   
LongTensorFloatTensorr   r2   rr   rs   s   @r4   r   r   M  s    7# 70 2637|| #5<<#=> !.	
 u//0 
u  !	" r3   r   c                   v  ^  \ rS rSrSrS\4U 4S jjr\" SSSS9       SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\R                     S\\   S\\   S\\R                     S\\R                     S\\R                     S\R                  4S jj5       rSrU =r$ )T5GemmaDecoderLayeri  z2Decoder sub-layer: an extra cross-attention layer.r   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        g r   )	rW   rX   r   
cross_attnru   rT   r   pre_cross_attn_layernormpost_cross_attn_layernormr   s      r4   rX   T5GemmaDecoderLayer.__init__  sS    +/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r3   r   r:   r   r   r?   r   r@   r   rM   cache_positionr   encoder_attention_maskr   c
                    UnU R                  U5      nU R                  " SUUUUUb  UR                  OS UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  " SUUU	UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)r?   r   r@   r   r:   rM   r  )r?   r   r@   r:   rM   r,   )r   r   self_attention_cacher   r~   r   r   r  r   r   r   )rZ   r?   r   r@   r   r:   rM   r  r   r  r[   r   r   s                r4   r   T5GemmaDecoderLayer.forward  s0    !44]C>> 	
' 3)%DSD_O@@ei)	
 	
 55mD <<#>> 55mD?? 
'"71+
 
 66}E <<#>> 66}E/77F <<#>>r3   )r   r  r   )NNNFNNN)r.   r/   r0   r1   rj   rq   rX   r   r   r   r   r   r   r
   ro   r   r   r2   rr   rs   s   @r4   r   r     s   <e# e %0A6R
 26379=$)598<9=.||. #5<<#=>. !.	.
 u//0. ""56. D>. !!1!12.  (5. !) 6. 
		. S.r3   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadi  z-Head for sentence-level classification tasks.rT   
num_labelsrE   c                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)p)rW   rX   r|   r}   r~   r   out_proj)rZ   rT   r
  rE   r]   s       r4   rX   "T5GemmaClassificationHead.__init__  s/    zz$;<		+:r3   r?   r   c                 J    U R                  U5      nU R                  U5      nU$ r{   r~   r  )rZ   r?   s     r4   r   !T5GemmaClassificationHead.forward  s$    ]3m4r3   r  )ri   )r.   r/   r0   r1   rj   rq   rp   rX   r   r   r   r2   rr   rs   s   @r4   r	  r	    sF    7;C ;S ;SX ; ;
U\\ ell  r3   r	  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.rT   rH   r   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )Nr   )rW   rX   r|   r   r  )rZ   rT   rH   r   r]   s       r4   rX   T5GemmaLMHead.__init__  s     		+Er3   r?   r   c                 (    U R                  U5      nU$ r{   r  )rZ   r?   logitss      r4   r   T5GemmaLMHead.forward  s    }-r3   r  )F)r.   r/   r0   r1   rj   rq   ro   rX   r   r   r   r2   rr   rs   s   @r4   r  r    sJ    8FC FS F F FU\\ ell  r3   r  c                   <    \ rS rSr% \\S'   SrSrSS/rS r	S r
S	rg
)T5GemmaPreTrainedModeli  r   modelTr   r   c                    [         R                  " X5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  [        UR                  S5      (       aG  UR                  R                  b/  UR                  R                  R                  R                  5         g g g [	        U[        5      (       as  U R                  R                  (       dW  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  g g SUR                   R"                  ;   a%  UR                  R                  R                  5         g g )Nr   g      ri   )meanstdr   RMSNorm)r   _init_weightsr   rN   rO   r	  r  weightr   datanormal_hasattrr   zero_r  rG   r]   r.   )rZ   moduler  scales       r4   r!  $T5GemmaPreTrainedModel._init_weights  sJ   %%d3kk++f788OO**003t;EOO""''//Sck/Jv//FOO4H4H4T$$))//1 5U/..;;22..44Q74?&&++33#+3N 3 &**333MM$$& 4r3   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r   r   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r   rB   rJ   rK   r   	new_zerosr   clonemasked_fill_)rZ   r=   decoder_start_token_idrK   shifted_input_idss        r4   _shift_right#T5GemmaPreTrainedModel._shift_right  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r3   r,   N)r.   r/   r0   r1   r7   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr!  r0  r2   r,   r3   r4   r  r    s*    &*#.0EF'"!r3   r  	token_idsr?   rK   c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r   r   r   )r   r   r   r   longr   r   )r6  r?   rK   r@   s       r4   make_default_2d_attention_maskr:  	  s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r3   c                      ^  \ rS rSr\\S.rU 4S jr\" 5           SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\\   S	\4S
 jj5       rSrU =r$ )T5GemmaEncoderi  )
attentionsr?   c           	      R  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        [        US9U l        SU l        [
        R                  " [!        UR"                  5       Vs/ s H  n[%        X5      PM     sn5      U l        [
        R(                  " UR*                  5      U l        U R/                  5         g s  snf )Nr   r   F)rW   rX   rK   padding_idxrH   r|   	EmbeddingrT   embed_tokensru   r   normr   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr   r   r}   rD   r~   	post_initr   s      r4   rX   T5GemmaEncoder.__init__   s     !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	0?&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"56 	 fs   D$r=   r@   r   r>   r[   r   c           	         US L US L-  (       a  [        S5      eUR                  SS 5        Uc  U R                  U5      n[        R                  " SUR
                  S   UR                  S9nUc  UR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       db  U R                  UUUS US.n[        S0 UDS[        U5      0D6[        S0 UD[!        U R                  R"                  5      [        U5      S.D6S	.nUn	U R%                  X5      n
[        R&                  " U R                  R(                  S
-  U	R*                  S9nX-  n	U R-                  U	5      n	U R.                  S U R                  R0                    H  nU" U	U
X|R2                     U40 UD6n	M     U R5                  U	5      n	U R-                  U	5      n	[7        U	S9$ )N:You must specify exactly one of input_ids or inputs_embedsr:   r   r   r   r   input_embedsr@   r  r:   r   or_mask_function)rP  and_mask_functionfull_attentionsliding_attention      ?r   )last_hidden_stater,   )r   poprB  r   aranger   r   	unsqueezer:  r   rK   rO   rP   r&   r   r'   r   r   rD  tensorrT   r   r~   r   rH  r   rC  r   )rZ   r=   r@   r   r>   r[   r  self_attn_mask_mappingmask_kwargsr?   r   
normalizerlayer_modules                r4   r   T5GemmaEncoder.forward2  s    -t";<YZZ 	

$d+  --i8Ma)<)<Q)?H\H\])33A6L!;IVZVaVaVnVnoNNB0DII++ -"0"0#' ,K #5 #!#%@%P# &G &!&%OPTP[P[PjPj%k&A.&Q&
&" &"oomJ\\$++"9"93">mFYFYZ
%2]3 KK(G$++*G*GHL(#&'B'BC	
 M I 		-0]3+
 	
r3   )r~   rB  rE  r   rC  r@  rD  rH   NNNN)r.   r/   r0   r1   r   r   _can_record_outputsrX   r   r   r   r   r   r   r   r   r   r   r2   rr   rs   s   @r4   r<  r<    s    *,
$  15153759A
E,,-A
 !.A
 u//0	A

   1 12A
 +,A
 
A
 A
r3   r<  c                   p  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
" 5                SS\\R                     S\\R                     S\\R                     S	\\   S
\\R                      S\\   S\\R                     S\\R                     S\\R                     S\\   S\4S jj5       rSrU =r$ )T5GemmaDecoderiw  r   )index)r=  cross_attentionsr?   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        X5      PM     sn5      U l        U R                  5         g s  snf r{   )	rW   rX   r|   rF  rG  rH  r   r   rI  r   s      r4   rX   T5GemmaDecoder.__init__~  sW     mmEJ6KcKcEdeEd	 3Ede
 	 fs   A*r=   r@   r   r:   r>   rM   r  r   r  r[   r   c
                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d8  U(       a1  Uc.  [        [	        U R
                  S9[	        U R
                  S95      nUcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nUc#  Uc   [        XU R
                  R                  5      n[        U=n[        5      (       d9  U R
                  UUUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U	=n[        5      (       d-  U R
                  UU	US S S.nS	[#        S0 UDS
['        U	5      0D60nUnU R)                  X5      n[        R*                  " U R
                  R,                  S-  UR.                  S9nUU-  nU R1                  U5      nU R2                  S U R
                  R4                    H$  nU" UUUUR6                     UUUUUUS	   4	0 U
D6nM&     U R9                  U5      nU R1                  U5      n[;        UUS9$ )NrL  z0`encoder_hidden_states` must be given in decoderr?  r   r   rM  rN  rR  rS  rP  rU  r   )rV  r:   r,   )r   rB  r   r
   r	   r   get_seq_lengthr   rX  r   r   rY  r:  rK   rO   rP   r  r&   r'   r   rD  rZ  rT   r   r~   r   rH  r   rC  r   )rZ   r=   r@   r   r:   r>   rM   r  r   r  r[   past_seen_tokensr[  r\  cross_attn_mask_mappingr?   r   r]  r^  s                      r4   r   T5GemmaDecoder.forward  s    -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`hlhshsTtuO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L!o&=;IVZVaVaVnVnoNNB0DII++ -"0"0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR++ 5"8"0#' $K !"4 #!#%@AW%X#'# &"oomJ\\$++"9"93">mFYFYZ
%
2]3 KK(G$++*G*GHL(#&|'B'BC%'(89 M I 		-0]38++
 	
r3   )r   )	NNNNNNNNN)r.   r/   r0   r1   r   r   r   r   ra  rX   r   r   r   r   r   r
   r   ro   r   r   r   r   r2   rr   rs   s   @r4   rc  rc  w  s-   $%9C*+@J,  1515379=59$(598<9=Z
E,,-Z
 !.Z
 u//0	Z

 ""56Z
   1 12Z
 D>Z
 !!1!12Z
  (5Z
 !) 6Z
 +,Z
 
3Z
 Z
r3   rc  c                     ^  \ rS rSrS\4U 4S jjrS rS rS r\	\
            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\   S\\R$                     S\\R$                     S\\   S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModeli  r   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	rW   rX   rC   r   r<  rA   rc  rB   rI  r   s     r4   rX   T5GemmaModel.__init__  sO     ((uvv%fnn5%fnn5r3   c                     U R                   $ r{   rA   rZ   s    r4   get_encoderT5GemmaModel.get_encoder  s    ||r3   c                 6    U R                   R                  5       $ r{   rA   get_input_embeddingsrs  s    r4   rx  !T5GemmaModel.get_input_embeddings      ||0022r3   c                 8    U R                   R                  U5      $ r{   rA   set_input_embeddingsrZ   new_embeddingss     r4   r}  !T5GemmaModel.set_input_embeddings      ||00@@r3   r=   r@   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr:   r>   decoder_inputs_embedsrM   r  r[   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUUS.	UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
r=   r@   r   r>   )	r=   r@   r   r>   r:   r   r  rM   r  r`   F)rV  r:   decoder_hidden_statesdecoder_attentionsre  encoder_last_hidden_stater   encoder_attentionsr,   )	rA   rV  rB   r   r:   rY   r?   r=  re  )rZ   r=   r@   r   r  r  r  r  r:   r>   r  rM   r  r[   r   decoder_outputss                   r4   r   T5GemmaModel.forward  s    . ""ll #-)+	
 O !0 A A,, 
'1-/+"7#1)
 
 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r3   )rB   rA   )NNNNNNNNNNNN)r.   r/   r0   r1   r7   rX   rt  rx  r}  r   r   r   r   r   r   
BoolTensorr   r
   r   ro   r   r   r   r   r2   rr   rs   s   @r4   rn  rn    s_   	} 	3A  156:378<=A;?599=048<$(598
E,,-8
 !!2!238
 u//0	8

 $E$4$458
 !))9)9 :8
 'u'7'788
 "/28
 ""568
  -8
  (58
 D>8
 !!1!128
 +,8
 
8
  8
r3   rn  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli7  r   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)rW   rX   rC   r   r<  rA   rI  r   s     r4   rX   T5GemmaEncoderModel.__init__9  s?     $$pqq%fnn5r3   c                 6    U R                   R                  5       $ r{   rw  rs  s    r4   rx  (T5GemmaEncoderModel.get_input_embeddingsB  rz  r3   c                 8    U R                   R                  U5      $ r{   r|  r~  s     r4   r}  (T5GemmaEncoderModel.set_input_embeddingsE  r  r3   r=   r@   r   r>   r[   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nr  r,   rr  )rZ   r=   r@   r   r>   r[   r  s          r4   r   T5GemmaEncoderModel.forwardH  s5     ,, 
)%'	

 
 r3   rr  r`  )r.   r/   r0   r1   r7   rX   rx  r}  r   r   r   r   r   r   r   r   r   r   r   r2   rr   rs   s   @r4   r  r  7  s    } 3A  156:3704E,,- !!2!23 u//0	
  - +, 
  r3   r  c            %       l  ^  \ rS rSrSS/rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
S rS rS r\\              S"S\\R$                     S\\R&                     S\\R$                     S\\R$                     S\\R(                     S\\R$                     S\\   S\\   S\\R&                     S\\R&                     S\\R$                     S\\   S\\R$                     S\\\R4                  4   S\\   S\\\R&                     \4   4 S jj5       5       rS\R4                  4S  jr S!r!U =r"$ )#T5GemmaForConditionalGenerationi\  z!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_repr?   r  r   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)rC   rW   rX   rn  r  rB   rH   r  rT   lm_head	loss_typerI  r   s     r4   rX   (T5GemmaForConditionalGeneration.__init__a  sb    $(! !&)
 ..33$V^^%?%?Q&r3   c                 $    XR                   l        g r{   r  r  r~  s     r4   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddingsl  s     .r3   c                 .    U R                   R                  $ r{   r  rs  s    r4   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddingso  s    ||$$$r3   c                     U R                   R                  (       aC  U R                  U R                  R                  U R                  5       R                  5       5        g g r{   )r   rG   _tie_or_clone_weightsr  r  get_decoderrx  rs  s    r4   _tie_weights,T5GemmaForConditionalGeneration._tie_weightsr  s@    ;;**&&t||'<'<d>N>N>P>e>e>gh +r3   c                 .    U R                   R                  $ r{   )r  rA   rs  s    r4   rt  +T5GemmaForConditionalGeneration.get_encoderw      zz!!!r3   c                 .    U R                   R                  $ r{   )r  rB   rs  s    r4   r  +T5GemmaForConditionalGeneration.get_decoderz  r  r3   r=   r@   r   r  r  r  r  r:   r>   r  labelsrM   r  logits_to_keepr[   r   c                    Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
UUS.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[        R                  " U5      nUUR                  -  nSnUb  U R                  " UXR                  40 UD6n[        UUUR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
N)r=   r@   r   r  r  r  r  r:   r>   r  rM   r  )	lossr  r:   r  r  re  r  r   r  r,   )r0  r  rV  rO   rq   slicer  r  r   final_logit_softcappingr   tanhloss_functionrH   r   r:   r  r  re  r  r   r  )rZ   r=   r@   r   r  r  r  r  r:   r>   r  r  rM   r  r  r[   r  r?   slice_indicesr  decoder_configr  s                         r4   r   'T5GemmaForConditionalGeneration.forward}  ss   < "3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7)/
 /
  (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r3   c                 $    U R                  U5      $ r{   )r0  )rZ   r  s     r4   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r3   )r  r  r  rH   )NNNNNNNNNNNNNr   )#r.   r/   r0   r1   _tied_weights_keys_tp_plan_pp_planr7   rX   r  r  r  rt  r  r   r   r   r   r   r   r  r   r
   ro   r   rq   r   r   r   r   r   r   r  r2   rr   rs   s   @r4   r  r  \  s   =?XY"M2H"o%6
$CDH	} 	/%i
""  156:378<=A;?599=59=A-1$(5934I
E,,-I
 !!2!23I
 u//0	I

 $E$4$45I
 !))9)9 :I
 'u'7'78I
 "/2I
 ""56I
   1 12I
  ((9(9:I
 ))*I
 D>I
 !!1!12I
 c5<</0I
  +,!I
" 
uU&&'8	9#I
  I
V)ELL ) )r3   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationi  r   rC   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
NrE   皙?rC   rW   rX   r
  rn  r  r  rA   rT   rB   rV   r	  scorerI  rZ   r   rC   rT   classifier_dropoutr]   s        r4   rX   )T5GemmaForSequenceClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r3   c                 6    U R                   R                  5       $ r{   r  rx  rs  s    r4   rx  5T5GemmaForSequenceClassification.get_input_embeddings      zz..00r3   c                 :    U R                   R                  U5        g r{   r  r}  rZ   rf   s     r4   r}  5T5GemmaForSequenceClassification.set_input_embeddings      

''.r3   r=   r@   r   r  r  r  r  r>   r  r  r[   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r@   r   r  r  r  r  r>   r  rM   r@   r   r>   r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r8  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rM  )r  r  pooled_logitsr   r  r  r?   r=  )r   rC   NotImplementedErrorr]   r.   r   r0  r  rV  r  r  r?   r=  r  r   rK   r   r   r   int32rX  argmaxclamploggerwarning_oncer  r   )rZ   r=   r@   r   r  r  r  r  r>   r  r  r[   outputsrV  r?   r=  r  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr  r  s                          r4   r   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r3   r  r
  r  r{   
NNNNNNNNNN)r.   r/   r0   r1   r7   r   ro   rX   rx  r}  r   r   r   r   r   r   r   r   r   r   r   r2   rr   rs   s   @r4   r  r    sS   } (4.  .1/  1515378<9=;?5959=A-1i
E,,-i
 !.i
 u//0	i

 $E$4$45i
 !) 6i
 'u'7'78i
 "/2i
   1 12i
  ((9(9:i
 ))*i
 +,i
 
"i
  i
r3   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi[  r   rC   c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
NrE   r  r  r  s        r4   rX   &T5GemmaForTokenClassification.__init__]  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r3   c                 6    U R                   R                  5       $ r{   r  rs  s    r4   rx  2T5GemmaForTokenClassification.get_input_embeddingsu  r  r3   c                 :    U R                   R                  U5        g r{   r  r  s     r4   r}  2T5GemmaForTokenClassification.set_input_embeddingsx  r  r3   r=   r@   r   r  r  r  r  r>   r  r  r[   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r  r  Fr  r  r  )r   rC   r  r]   r.   r   r0  r  rV  r  r  r?   r=  r  r  r   )rZ   r=   r@   r   r  r  r  r  r>   r  r  r[   r  rV  r?   r=  r  r  s                     r4   r   %T5GemmaForTokenClassification.forward{  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r3   r  r{   r  )r.   r/   r0   r1   r7   r   ro   rX   rx  r}  r   r   r   r   r   r   r   r   r   r   r   r2   rr   rs   s   @r4   r  r  [  sS   } (4.  01/  1515378<9=;?5959=A-1N
E,,-N
 !.N
 u//0	N

 $E$4$45N
 !) 6N
 'u'7'78N
 "/2N
   1 12N
  ((9(9:N
 ))*N
 +,N
 
N
  N
r3   r  )r7   r*   r  rn  r  r  r  r  )Ttypingr   r   r   r   r   torch.nnr|   cache_utilsr   r	   r
   configuration_utilsr   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   gemma2.configuration_gemma2r    gemma2.modeling_gemma2r!   r"   r#   r$   r%   r&   r'   r(   _CHECKPOINT_FOR_DOC
get_loggerr.   r  r*   r7   ru   rx   r   r   r   r   r   rq   r   r   r   Moduler	  r  r  r   r:  r<  rc  rn  r  r  r  r  __all__r,   r3   r4   <module>r     sO    2 1   C C 3 ) B 9  G &  1 ? 6	 	 	 9  
		H	%	, 	L($ L(^	] 		 	)2 )
+? +D)O D)N
0F 
8 
s x 14 1h8- 8v		 	BII 	 /!2 /! /!d(()<< 3- \\	"Z
+ Z
zj
^ j
Z O
) O
 O
d !0 ! !Ho)&<o o)d I
'= I
 I
X o
$: o
 o
d	r3   