
    cCiO                     f   S SK r S SKJr  S SKJr  S SKJrJr  S SKrS SK	J
r
  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJr  SSKJrJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0  SSK1J2r2  SSK3J4r4  SSK5J6r6J7r7  \.Rp                  " \95      r:\\," SS9 " S S\5      5       5       r;\\," SS9 " S S\*5      5       5       r< " S S \
Rz                  5      r> " S! S"\
R~                  5      r@ " S# S$\
R~                  5      rA " S% S&\
R~                  5      rBS' rCSRS( jrDS)\R                  S*\FS+\R                  4S, jrG   SSS-\
R~                  S.\R                  S/\R                  S0\R                  S1\\R                     S2\HS3\\H   S4\\H   S+\I\R                  \R                  4   4S5 jjrJ " S6 S7\
R~                  5      rK " S8 S9\5      rL\, " S: S;\&5      5       rMS<\FS+\\F\F\F\F/\N4   4S= jrO\, " S> S?\M5      5       rP\, " S@ SA\M\5      5       rQ " SB SC\
R~                  5      rRSD\\R                     SE\\R                     SF\FS+\\   4SG jrS\," SHS9 " SI SJ\M5      5       rT\," SHS9 " SK SL\M\5      5       rU " SM SN\M5      rV " SO SP\\M5      rW/ SQQrXg)T    N)Callable)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCache)PretrainedConfig)GenerationMixin)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    )custom_introc                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)Gemma3ModelOutputWithPast3   a  
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r+   r   torchFloatTensor__annotations____static_attributes__r,       d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/gemma3/modeling_gemma3.pyr)   r)   3   s     8<%"3"34;r6   r)   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Gemma3CausalLMOutputWithPastC   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr+   r,   )r-   r.   r/   r0   r1   r;   r   r2   r3   r4   r<   r=   r	   r>   tupler?   r+   r5   r,   r6   r7   r9   r9   C   s     )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju001297;%"3"34;r6   r9   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbeddinga   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 p   > [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )NrG   F
persistent)super__init__register_bufferr2   tensor)selfrD   rE   rF   rG   	__class__s        r7   rL   &Gemma3TextScaledWordEmbedding.__init__f   s1    D]ELL,ERWXr6   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ N)rK   forwardrG   toweightdtype)rO   rR   rP   s     r7   rU   %Gemma3TextScaledWordEmbedding.forwardj   s2    wy)D,<,<,?,?@Q@Q,RRRr6   r,   )      ?)r-   r.   r/   r0   r1   intfloatrL   r2   TensorrU   r5   __classcell__rP   s   @r7   rB   rB   a   sM    Ys Y3 YS Y_d Y YS S Sr6   rB   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	Gemma3MLPn   configc                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFbias)rK   rL   rc   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrO   rc   rP   s     r7   rL   Gemma3MLP.__init__o   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r6   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rT   )rn   rp   rl   rm   )rO   xrn   s      r7   rU   Gemma3MLP.forwardy   s6    NN4;;t~~a/@#ADLLQRO#ST	r6   )rp   rc   rn   rl   rh   ri   rm   )	r-   r.   r/   r0   r&   rL   rU   r5   r^   r_   s   @r7   ra   ra   n   s    7/ 7 r6   ra   c                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
Gemma3RMSNorm~   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g rT   )rK   rL   rz   rj   	Parameterr2   zerosrW   )rO   ry   rz   rP   s      r7   rL   Gemma3RMSNorm.__init__   s,    ll5;;s#34r6   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )Nr"   T)keepdim)r2   rsqrtpowmeanrz   )rO   rt   s     r7   _normGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr6   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )NrZ   )r   r\   rW   type_as)rO   rt   outputs      r7   rU   Gemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r6   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r@   rW   shaperz   rO   s    r7   
extra_reprGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r6   )rz   rW   )gư>)r-   r.   r/   r0   r[   r\   rL   r   rU   r   r5   r^   r_   s   @r7   rw   rw   ~   s0    5C 5e 5 5
K!= =r6   rw   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )Gemma3RotaryEmbedding   inv_freqrc   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultr   FrI   )rK   rL   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrc   r   rope_init_fnattention_scalingrM   r   original_inv_freq)rO   rc   devicer   rP   s       r7   rL   Gemma3RotaryEmbedding.__init__   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r6   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r   r$   mpscpuF)device_typeenabledr"   ry   rX   )r   r\   expandr   rV   r   r   r   strr2   autocast	transposecatcosr   sinrX   )
rO   rt   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r7   rU   Gemma3RotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   rc   r   r   r   r   r   rT   )r-   r.   r/   r0   r2   r]   r4   r&   rL   no_gradr   rU   r5   r^   r_   s   @r7   r   r      sA    ll// / /" ]]_<  <r6   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r"   r   )r   r2   r   )rt   x1x2s      r7   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r6   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr6   r>   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r$   N)r   r   reshape)r>   r   batchnum_key_value_headsslenhead_dims         r7   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr6   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                    Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb"  US S 2S S 2S S 2S U	R                  S   24   nX-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R!                  5       nX4$ )	N      r"   r   r   )ry   rX   )ptrainingr$   )r   r   num_key_value_groupsr2   matmulr   tanhr   rj   
functionalsoftmaxfloat32rV   rX   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r7   eager_attention_forwardr      s/    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r6   c                   >  ^  \ rS rSrSrS\S\4U 4S jjr\" SSSS	9  SS
\	R                  S\	R                  S\\	R                     S\\   S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jj5       rSrU =r$ )Gemma3Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paperrc   	layer_idxc                   > [         TU ]  5         UR                  U   S:H  U l        Xl        X l        [        USUR                  UR                  -  5      U l	        UR                  UR                  -  U l        UR                  S-  U l        U R                  R                  U l        U R                  R                  (       + U l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  UR                  U R                  -  UR&                  S9U l        ["        R$                  " UR                  U R                  -  UR                  UR&                  S9U l        U R                  R0                  U l        U R                  (       a  UR2                  OS U l        [5        UR                  UR6                  S9U l        [5        UR                  UR6                  S9U l        g )Nsliding_attentionr   r   rf   )ry   rz   )rK   rL   layer_types
is_slidingrc   r   getattrrh   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalrj   rk   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_windowrw   rms_norm_epsq_normk_normrO   rc   r   rP   s      r7   rL   Gemma3Attention.__init__
  s    ,,Y7;NN"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>![[DDDii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;f33D#V=P=PQ#V=P=PQr6   past_key_valuer=   4.58new_nameversionr>   position_embeddingsr   cache_positionr   r   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                   (       a  U R"                  OSU R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nr   r$   r"   )r   r   r	  eager        )r   r   r   )r   r   r   viewr   r   r   r   r   r   updater   r   rc   _attn_implementationr   r   r   r   r   r   r   r   )rO   r>   r  r   r=   r	  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r7   rU   Gemma3Attention.forward'  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r6   )r   r   rc   r   r   r   r   r   r   r   r   r   r   r   r   r   )NN)r-   r.   r/   r0   r1   r&   r[   rL   r    r2   r]   r   r	   
LongTensorr   r   r@   rU   r5   r^   r_   s   @r7   r   r     s    GR/ RC R: %0A6R ,059-)||-) #\\-) !.	-)
 "%-) !!1!12-) -.-) 
u||Xell3XeELL>Q5RR	S-) S-)r6   r   c                   ~  ^  \ rS rSrS\S\4U 4S jjr\" SSSS9      SS	\R                  S
\R                  S\R                  S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S\\R                  \
\\R                  \R                  4      4   4S jj5       rSrU =r$ )Gemma3DecoderLayeriX  rc   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        UR
                  U   U l        [        XS9U l        [        U5      U l
        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )N)rc   r   rz   )rK   rL   rc   rh   r   r   attention_typer   	self_attnra   mlprw   r   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      r7   rL   Gemma3DecoderLayer.__init__Y  s    !--"$00;(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r6   r  r=   r  r  r>   position_embeddings_globalposition_embeddings_localr   r   output_attentions	use_cacher	  r   c
                 `   UnU R                  U5      nU R                  R                  (       a  UnOUnU R                  " SUUUUUUUU	S.U
D6u  pU R                  U5      nX-   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )N)r>   r  r   r   r=   r%  r&  r	  r,   )r  r  r   r  r   r  r!  )rO   r>   r#  r$  r   r   r=   r%  r&  r	  r   residualr  self_attn_weightsoutputss                  r7   rU   Gemma3DecoderLayer.forwardf  s     !,,]; >>$$";"<+/>> 
,
' 3)%+/)
,
 
,
( 55mD 0 66}E/77F 0 "++Gr6   )
r  rc   rh   r  r   r  r  r!  r   r  )NNNFFN)r-   r.   r/   r0   r&   r[   rL   r    r2   r]   r   r  r	   boolr@   r3   rU   r5   r^   r_   s   @r7   r  r  X  s   c/ cC c %0A6R 2637+/,1$)590||0 %*LL0 $)<<	0
 !.0 u//00 "%0 $D>0 D>0 !!1!120 
u  (51B1BEDUDU1U+V"WW	X0 S0r6   r  c                   h   ^  \ rS rSr% \\S'   SrSr/ SQrS/r	Sr
SrSrSrSr\\S.rU 4S jrS	rU =r$ )
Gemma3PreTrainedModeli  rc    T)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr=   )r>   r?   c                   > [         TU ]  U5        [        U[        5      (       a%  UR                  R
                  R                  5         g SUR                  R                  ;   a%  UR                  R
                  R                  5         g g )NRMSNorm)
rK   _init_weightsr   Gemma3MultiModalProjectormm_input_projection_weightdatazero_rP   r-   rW   )rO   r   rP   s     r7   r5  #Gemma3PreTrainedModel._init_weights  se    f%f788--2288:&**333MM$$& 4r6   r,   )r-   r.   r/   r0   r%   r4   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsr5  r5   r^   r_   s   @r7   r.  r.    s]    &*# $5"5N!"&+%
' 'r6   r.  r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z9
Enables a bidirectional mask within the sliding window.
	batch_idxhead_idxq_idxkv_idxr   c                 $   > [        X#-
  5      T:  $ )zA token can attend to any other token if their absolute distance is within
the (exclusive) sliding window size (distance < sliding_window).)abs)rF  rG  rH  rI  r   s       r7   
inner_mask1_bidirectional_window_overlay.<locals>.inner_mask  s     5>"^33r6   r[   r,  )r   rL  s   ` r7   _bidirectional_window_overlayrO    s3    
4c 4S 4 4c 4d 4
 r6   c                   F  ^  \ rS rSr% \\S'   S\4U 4S jjr\" 5       \         SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\
R                     S	\	\   S
\	\   S\	\   S\	\
R                     S\\   S\4S jj5       5       rSrU =r$ )Gemma3TextModeli  rc   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        US9U l        SU l        [*        R,                  " U5      nUR.                  Ul        SS0Ul        [%        US9U l        U R7                  5         g s  snf )N      ?)rG   r  rc   Fr   r   )rK   rL   pad_token_idrF   
vocab_sizerB   rh   rc   embed_tokensrj   
ModuleListrangenum_hidden_layersr  layersrw   r   normr   
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetar   rotary_emb_local	post_initr  s      r7   rL   Gemma3TextModel.__init__  s    !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdDcy2Dcd
 "&"4"4&:M:MN	/v>&+# v&"77*I6 5V D 	 es    ErR   r   r   r=   inputs_embedsr&  r%  output_hidden_statesr	  r   r   c
                 Z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a'  Uc$  U R                  (       d  [        U R                   S9nU	cD  Ub  UR                  5       OSn[        R                  " UXR                  S   -   UR                  S9n	Uc  U	R!                  S5      n[#        U=n[$        5      (       d}  U R                   UUU	UUS.nUR'                  5       nU R                   R(                  (       a(  S	 US
'   [+        U R                   R,                  5      US
'   [/        S0 UD6[1        S0 UD6S.nUnU R3                  X5      nU R5                  X5      nU(       a  SOS nU(       a  SOS nU R6                  S U R                   R8                    HF  nU(       a  UU4-  nU" U4UUUUR:                     UUUUU	S.U
D6nUS   nU(       d  M=  UUS   4-  nMH     U R=                  U5      nU(       a  UU4-  n[?        UUUUS9$ )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FrT  r   r$   r   rc   input_embedsr   r	  r=   r   c                  H    [         R                  " S[         R                  S9$ )NTr   )r2   rN   r,  )argss    r7   <lambda>)Gemma3TextModel.forward.<locals>.<lambda>"  s    TY^YcYc@dr6   or_mask_functionfull_attentionr   r,   )r#  r$  r   r   r=   r%  r&  r	  )last_hidden_stater=   r>   r?   ) rc   r%  rg  r&  
ValueErrorr^  r   loggerwarning_oncerW  r
   get_seq_lengthr2   aranger   r   r   r   r   r_  r   rO  r   r   r   r]  rc  r[  rZ  r  r\  r   )rO   rR   r   r   r=   rf  r&  r%  rg  r	  r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr>   r#  r$  all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                         r7   rU   Gemma3TextModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*$++>O!CRC^==?de"\\  #6#6q#99$++N )33A6L ?-FF ++ -"0"0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & &*__]%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HIM#!m%55!)+E*C2=3O3OP) /"3#- M *!,M  =#3"55) J, 		-0-!11&+++%	
 	
r6   )rW  r^  r[  r\  rF   r]  rc  rV  	NNNNNNNNN)r-   r.   r/   r0   r&   r4   rL   r!   r   r   r2   r  r]   r	   r3   r,  r   r   r   rU   r5   r^   r_   s   @r7   rQ  rQ    s   / 4  151537+/59$(,0/359o
E,,-o
 !.o
 u//0	o

 "%o
   1 12o
 D>o
 $D>o
 'tno
 !!1!12o
 +,o
 
!o
  o
r6   rQ  c                     ^  \ rS rSr% S/rSS0rSS/S/40r\\S'   Sr	S\4U 4S	 jjr
\\           SS
\\R                     S\\R                      S\\R                     S\\   S\\R$                     S\\R                     S\\   S\\   S\\   S\\R                     S\\\R                   4   S\4S jj5       5       rSrU =r$ )Gemma3ForCausalLMiY  lm_head.weightlm_headcolwise_repr>   r<   rc   language_modelc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g re   )
rK   rL   rQ  modelrV  rj   rk   rh   r  rd  rq   s     r7   rL   Gemma3ForCausalLM.__init__a  sU     $V,
 ++yy!3!3V5F5FUS 	r6   rR   r   r   r=   rf  labelsr&  r%  rg  r	  logits_to_keepr   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                   R                  bH  UU R                   R                  -  n[        R                  " U5      nUU R                   R                  -  nSnUb  U R                  " UX`R                  40 UD6n[        UUUR                  UR                   UR"                  S9$ )a"  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3ForCausalLM

>>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```N)	rR   r   r   r=   rf  r&  r%  rg  r	  r;   r<   r=   r>   r?   r,   )rc   r%  rg  r  rt  r   r[   slicer  final_logit_softcappingr2   r   loss_functionrV  r   r=   r>   r?   )rO   rR   r   r   r=   rf  r  r&  r%  rg  r	  r  r   r*  r>   slice_indicesr<   r;   s                     r7   rU   Gemma3ForCausalLM.forwardj  sT   F 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%%ffooPPD%#33!//))
 	
r6   )r  r  rV  )NNNNNNNNNNr   )r-   r.   r/   r0   _tied_weights_keys_tp_plan_pp_planr&   r4   r;  rL   r   r   r   r2   r  r]   r	   r3   r,  r   r[   r   rU   r5   r^   r_   s   @r7   r  r  Y  sd   *+=)H_-z:;H(/   151537+/59-1$(,0/35934F
E,,-F
 !.F
 u//0	F

 "%F
   1 12F
 ))*F
 D>F
 $D>F
 'tnF
 !!1!12F
 c5<</0F
 
 F
  F
r6   r  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )r6  i  rc   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr  rS  )kernel_sizestride)rK   rL   rj   r|   r2   r}   vision_configrh   text_configr7  rw   layer_norm_epsmm_soft_emb_normr[   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrq   s     r7   rL   "Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r6   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr$   r"   )r   r   r   r  r   r  flattenr  r2   r   r7  r   )	rO   r  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r7   rU   !Gemma3MultiModalProjector.forward  s    $2$8$8!
z"0":":1a"@"9"A"AD$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??r6   )r  r  r7  r  r  r  )r-   r.   r/   r0   r%   rL   r2   r]   rU   r5   r^   r_   s   @r7   r6  r6    s)    \| \ @ell @ @r6   r6  token_type_idsimage_group_idstokens_per_imagec           
      `   ^ ^ T c  gS[         S[         S[         S[         S[        4
UU 4S jjnU$ )z
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
NrF  rG  rH  rI  r   c                 D  > [         R                  " UT
R                  S   :  US5      nT
X4   n[         R                  " UT
R                  S   :  US5      nT	X4   n[         R                  " UT	R                  S   :  US5      nT
X4   S:H  US:H  -  nT	X4   U:H  nXx-  $ )Nr$   r   r   )r2   wherer   )rF  rG  rH  rI  safe_idxtoken_type_ids_at_kv_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockr  r  s            r7   rL  0token_type_ids_mask_function.<locals>.inner_mask  s     ;;v(<(<Q(??K#1)2E#F #(;;v8L8LQ8O/OQikl#m $3I4G$H!$)KK9N9Nq9Q0QSlnp$q!()9:a?D\`aDab*9+;<@YY 00r6   rN  )r  r  r  rL  s   ``  r7   token_type_ids_mask_functionr    sC     1c 1S 1 1c 1d 1 1" r6   zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c            !       L  ^  \ rS rSrSS0rSrS\4U 4S jjrS rS r	S	 r
S
 rS\R                  S\R                  4S jrS\R                  S\R                   S\R                   4S jr\\             SS\\R                     S\\R                      S\\R                     S\\R                     S\\   S\\R                     S\\R                     S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )Gemma3Modeli  zlanguage_model.modelr  Frc   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        [        U5      U l        UR                  R                  U l	        [        R                  " UR                  S9nX l
        U R                  R                  b  U R                  R                  OSU l        U R                  5         g )NrT  r   )rK   rL   r#   from_configr  vision_towerr6  multi_modal_projectorr  rV  r  rc   rU  rd  )rO   rc   r  rP   s      r7   rL   Gemma3Model.__init__  s     %119M9MN%>v%F" ,,77"..f6H6HI,8<8P8P8\DKK44bdr6   c                 6    U R                   R                  5       $ rT   )r  get_input_embeddingsr   s    r7   r   Gemma3Model.get_input_embeddings  s    ""7799r6   c                 :    U R                   R                  U5        g rT   )r  set_input_embeddingsrO   r   s     r7   r   Gemma3Model.set_input_embeddings  s    007r6   c                     Xl         g rT   r  rO   decoders     r7   set_decoderGemma3Model.set_decoder  s    %r6   c                     U R                   $ rT   r  r   s    r7   get_decoderGemma3Model.get_decoder  s    """r6   pixel_valuesr   c                 Z    U R                  US9R                  nU R                  U5      nU$ )a]  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r  )r  rt  r  )rO   r  r  image_featuress       r7   get_image_featuresGemma3Model.get_image_features  s3     ***EWW33NCr6   rR   rf  r  c           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)rX   r   r   r   r$   z6Image features and image tokens do not match: tokens: z, features )r  r2   rN   rc   image_token_idlongr   allsumr   	expand_asrV   r   numelru  )rO   rR   rf  r  special_image_maskn_image_tokensn_image_featuress          r7   get_placeholder_mask Gemma3Model.get_placeholder_mask*  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r6   r   r   r=   r  r	  r  r&  r%  rg  return_dictc                    USL USL-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUbR  U R                  R
                  U R                  :  a.  XR                  R
                  :H  nUR                  5       nSUU'   OUnUc  U R                  5       " U5      nUcE  Ub  UR                  5       OSn[        R                  " UUUR                  S   -   UR                  S9nUbY  U R                  U5      nUR                  UR                  UR                   5      nU R#                  XUS9nUR%                  UU5      n['        U=n[(        5      (       GdZ  U R                  R+                  5       UUUUUS.nU
(       + =(       d'    USL =(       d    UR,                  (       + =(       d    USLnUb  U(       a  US:H  R                  UR                  5      nU[.        R0                  R3                  USSS	9SS2SS
24   ) -  n[        R4                  " UR7                  5       SS9S-
  n[        R8                  " UU[        R:                  " US
UR                  S95      n[=        UR                  UR                  5      UU R                  R>                  5      US'   [A        S0 UD6[C        S0 UD6S.nU RD                  " SUUUUU
UUSUS.	UD6n[G        URH                  U
(       a  URJ                  OSURL                  URN                  Ub  WS9$ SS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```Nri  r   r$   rj  )rf  r  rk  r$   r   r   r   r   rq  rr  T)	r   r   r=   rf  r&  r%  rg  r  r	  )rt  r=   r>   r?   r+   r,   )(ru  rc   r%  rg  use_return_dictr  rV  cloner  rx  r2   ry  r   r   r  rV   rX   r  masked_scatterr   r   get_text_configis_initializedrj   r   padcumsumr[   r  	full_liker  r  r   r   r  r)   rt  r=   r>   r?   )rO   rR   r  r   r   r=   r  r	  rf  r  r&  r%  rg  r  	lm_kwargsr  llm_input_idsrz  r  r{  r|  
is_prefillis_imagenew_image_startr  r*  s                             r7   rU   Gemma3Model.forwardB  s   \ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  T[[%?%?4??%R!*kk.H.H!H%OO-M01M,-%M  557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-FF ++557 -"0"0#2 ,K  ,"d*,&555,  t+	  )j +a/33N4I4IJ"*bmm.?.?&XY.?.Z[\^a_a^a[a.b-b"b"',,/B/B/D!"Lq"P"'++ou~rZbZiZi/j# 3O"%%n&;&;<ot{{OnOn3./ #5"C{"C%F%U%U#
 %% 
.%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r6   )r  r  rU  r  rV  )NNNNNNNNNNNNN)r-   r.   r/   r0   _checkpoint_conversion_mappingaccepts_loss_kwargsr%   rL   r  r  r  r  r2   r]   r  r  r3   r  r   r   r   r	   r,  r   r@   r)   rU   r5   r^   r_   s   @r7   r  r    s    '=>N%O"
| 
:8&#u||  "))":?:K:K"]b]n]n"0  15481537+/595959-1$(,0/3&*L
E,,-L
 u001L
 !.	L

 u//0L
 "%L
 !!1!12L
 !!1!12L
   1 12L
 ))*L
 D>L
 $D>L
 'tnL
 d^L
  
u//	0!L
  L
r6   r  c            "         ^  \ rS rSrSSSSS.rS/rSrS	\4U 4S
 jjrS r	S r
S rS rS r\S 5       r\S 5       r\S 5       r\              S'S\\R*                     S\\R,                     S\\R.                     S\\R*                     S\\   S\\R*                     S\\R*                     S\\R,                     S\\R*                     S\\   S\\   S\\   S\\   S \\\R.                  4   S!\\\4   4S" jj5       r          S(U 4S# jjr\  S)S	\!S$\R.                  S\\R.                     S\R.                  S\\   S\\R.                     S\\R.                     S!\"4S% jj5       r#S&r$U =r%$ )*Gemma3ForConditionalGenerationi  model.language_modelmodel.vision_towermodel.multi_modal_projectorr  )^language_model.model^vision_tower^multi_modal_projectorz^language_model.lm_headr  Frc   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g re   )rK   rL   r  r  rj   rk   r  rh   rV  r  rd  rq   s     r7   rL   'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr6   c                 6    U R                   R                  5       $ rT   r  r  r   s    r7   r  3Gemma3ForConditionalGeneration.get_input_embeddings      zz..00r6   c                 :    U R                   R                  U5        g rT   r  r  r  s     r7   r  3Gemma3ForConditionalGeneration.set_input_embeddings      

''.r6   c                 :    U R                   R                  U5        g rT   )r  r  r  s     r7   r  *Gemma3ForConditionalGeneration.set_decoder  s    

w'r6   c                 6    U R                   R                  5       $ rT   )r  r  r   s    r7   r  *Gemma3ForConditionalGeneration.get_decoder  s    zz%%''r6   c                 8    U R                   R                  U5      $ rT   )r  r  )rO   r  s     r7   r  1Gemma3ForConditionalGeneration.get_image_features  s    zz,,\::r6   c                 .    U R                   R                  $ rT   )r  r  r   s    r7   r  -Gemma3ForConditionalGeneration.language_model  s    zz(((r6   c                 .    U R                   R                  $ rT   )r  r  r   s    r7   r  +Gemma3ForConditionalGeneration.vision_tower  s    zz&&&r6   c                 .    U R                   R                  $ rT   )r  r  r   s    r7   r  4Gemma3ForConditionalGeneration.multi_modal_projector  s    zz///r6   rR   r  r   r   r=   r  r	  rf  r  r&  r%  rg  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUU
U	UUUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	GbQ  UR                  5       nUSSS2SS24   nU	SSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR!                  SU R                   R"                  R$                  5      nUR!                  S5      R                  UR                  5      nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  UR.                  S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
N)rR   r  r  r   r   r=   rf  r&  r  r%  rg  r  r	  r   .r   r$   )r;   r<   r=   r>   r?   r+   r,   )rc   r%  rg  r  r  r   r[   r  r  r\   r   rV   r   r   rj   CrossEntropyLossr  r  rV  r9   r=   r>   r?   r+   )rO   rR   r  r   r   r=   r  r	  rf  r  r&  r%  rg  r  r  r  r*  r>   r  r<   r;   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsr   s                               r7   rU   &Gemma3ForConditionalGeneration.forward  s~   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%))%+'/!5#)
 
"  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r6   c                 V   > [         TU ]  " U4UUUUUU	U
US.UD6nUS   S:X  a  XmS'   U$ )N)r=   rf  r   r   r	  r&  r  r  r   r  )rK   prepare_inputs_for_generation)rO   rR   r=   rf  r	  r   r  r   r  r&  r  r  r   model_inputsrP   s                 r7   r%  <Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s[      w<
+')%)))
 
 !!+7(r6   rl  c                    U R                  5       UUUUUS.nUb  UR                  S   S:w  a  US:H  R                  UR                  5      n	U	[        R
                  R                  U	SSS9S S 2S S24   ) -  n
[        R                  " U
R                  5       SS9S-
  n[        R                  " X[        R                  " US5      5      n[        UR                  UR                  5      XR                  5      US'   [        S	0 UD6$ )
Nrk  r$   r  r   r  r   r   rq  r,   )r  r   rV   r   rj   r   r  r2   r  r[   r  r  r  r  r   )rc   rl  r   r	  r=   r   r  r   r|  r  r  r  s               r7   r   8Gemma3ForConditionalGeneration.create_masks_for_generate  s
    ,,.(,,.(
 %,*<*<Q*?1*D
 '!+//0E0EFH&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(U__UcegEhiO.J!!."7"78/KeKe/K*+ )7;77r6   )r  r  )NNNNNNNNNNNNNr   )
NNNNNNNTNNrT   )&r-   r.   r/   r0   r  r  r  r%   rL   r  r  r  r  r  propertyr  r  r  r   r   r2   r  r3   r]   r	   r,  r   r[   r@   r9   rU   r%  staticmethodr   r   r   r5   r^   r_   s   @r7   r  r    s    "8-"?#,	&" ++  | 1/((; ) ) ' ' 0 0  15481537+/595959-1$(,0/3&*34|
E,,-|
 u001|
 !.	|

 u//0|
 "%|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B "H  26!8 !8ll!8 !.!8 	!8
 "%!8 u||,!8 !.!8 
!8 !8r6   r  c                   h  ^  \ rS rSrSSSS.rU 4S jrS rS r\\	         SS	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\R                     S\
\R                     S\
\R                     S\
\   S\\   S\4S jj5       5       rSrU =r$ )Gemma3ForSequenceClassificationi  r  r   r  )r  r  r  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  U R                  SS9U l	        U R                  5         g re   )rK   rL   
num_labelsr  r  rj   rk   r  rh   scorerd  rq   s     r7   rL   (Gemma3ForSequenceClassification.__init__  sZ      ++ (
YYv11==tUZ[
 	r6   c                 6    U R                   R                  5       $ rT   r  r   s    r7   r  4Gemma3ForSequenceClassification.get_input_embeddings  r
  r6   c                 :    U R                   R                  U5        g rT   r  r  s     r7   r  4Gemma3ForSequenceClassification.set_input_embeddings  r  r6   rR   r  r   r   r=   rf  r  r  r&  r   r   c
                    U R                   " U4UUUUUUU	S.U
D6nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  R                  c  US:w  a  [        S5      eU R                  R
                  R                  c  SnOUb  XR                  R
                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                   R"                   S35        U[        R                  " XR                  S	9U4   nSnUb  U R%                  XUU R                  S
9n['        UUUR(                  UR*                  UR,                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)r   r  r   r=   rf  r  r&  Nr   r$   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   rX   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rj  )r<   r  pooled_logitsrc   r  )r  rt  r0  r   rc   r  rU  ru  rV   r   r2   int32ry  argmaxrv  rw  rP   r-   r  r   r=   r>   r?   )rO   rR   r  r   r   r=   rf  r  r  r&  r   transformer_outputsr>   r<   r  last_non_pad_tokennon_pad_masktoken_indicesr7  r;   s                       r7   rU   'Gemma3ForSequenceClassification.forward  s   , #jj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%%VR_hlhshs%tD/ /??-;;*55
 	
r6   )r  r/  r0  r  )r-   r.   r/   r0   r  rL   r  r  r   r   r   r2   r  r3   r]   r	   r,  r   r   r   rU   r5   r^   r_   s   @r7   r-  r-    s.   !7-"?&"1/  15481537+/5959-1$(C
E,,-C
 u001C
 !.	C

 u//0C
 "%C
   1 12C
 !!1!12C
 ))*C
 D>C
 +,C
 
*C
  C
r6   r-  c                   $    \ rS rSr% Sr\\S'   Srg)#Gemma3TextForSequenceClassificationi,  z
Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
It uses the generic sequence classification implementation for efficiency and consistency.
rc   r,   N)r-   r.   r/   r0   r1   r&   r4   r5   r,   r6   r7   r@  r@  ,  s    
 r6   r@  )r.  rQ  r  r  r  r-  r@  )Nr$   )r  NN)Yr_  collections.abcr   dataclassesr   typingr   r   r2   torch.nnrj   activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr    utils.genericr!   autor#   configuration_gemma3r%   r&   
get_loggerr-   rv  r)   r9   	EmbeddingrB   Modulera   rw   r   r   r   r]   r[   r   r\   r@   r   r   r  r.  r,  rO  rQ  r  r6  r  r  r  r-  r@  __all__r,   r6   r7   <module>rY     sC  ,  $ ! "   ! . 3 ) m m B [ q q K F & _ _ 0 /  @ 
		H	% 
< 7 < < 
<; < <0
SBLL 
S		  =BII =(!<BII !<H(6	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %FN)bii N)b?3 ?D 'O ' '>
# 
(CcSVCWY]C]:^ 
 N
+ N
 N
b X
- X
 X
v!@		 !@HU\\*ell+  h	B 
Q
' Q

Q
h 
s8%:O s8
s8l[
&; [
|*JLa r6   