
    cCi                        S SK r S SKJr  S SKJrJrJr  S SKrS SKJ	r	  S SK
J	s  Jr  S SKJr  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  SSKJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/J0r0  SSK1J2r2  SSK3J4r4  SSK5J6r6J7r7  \0Rp                  " \95      r: " S S\	Rv                  5      r< " S S\	Rv                  5      r= " S S\R                  Rv                  5      r> " S S\	Rv                  5      r? " S S\	R                  5      rA\" S 5       " S! S \	Rv                  5      5       rB " S" S#\	Rv                  5      rCS$\R                  S%\R                  S&\R                  S'\E\R                  \R                  4   4S( jrFS)\R                  S*\GS'\R                  4S+ jrH S^S,\	Rv                  S-\R                  S.\R                  S/\R                  S0\\R                     S1\IS2\I4S3 jjrJ S^S,\	Rv                  S-\R                  S.\R                  S/\R                  S0\\R                     S1\IS2\I4S4 jjrK " S5 S6\	Rv                  5      rL " S7 S8\5      rM\. " S9 S:\)5      5       rN\. " S; S<\N5      5       rO " S= S>\N\5      rP\\." S?S@9 " SA SB\#5      5       5       rQ " SC SD\R                  Rv                  5      rR " SE SF\	Rv                  5      rSSG rT " SH SI\	Rv                  5      rUSJ\R                  S-\R                  4SK jrVS-\R                  S.\R                  SJ\R                  S'\E\R                  \R                  4   4SL jrW " SM SN\	Rv                  5      rX " SO SP\	Rv                  5      rY " SQ SR\5      rZ " SS ST\	Rv                  5      r[ " SU SV\	Rv                  5      r\ " SW SX\	Rv                  5      r] " SY SZ\N5      r^ " S[ S\\N\5      r_/ S]Qr`g)_    N)	dataclass)CallableOptionalUnion)Llama4VisionConfig   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_maskcreate_chunked_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)check_model_inputs   )Llama4ConfigLlama4TextConfigc                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Llama4TextExperts.   configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        U R                  U l        [        R                  " [        R                  " U R                  U R
                  SU R                  -  5      5      U l        [        R                  " [        R                  " U R                  U R                  U R
                  45      5      U l        [        UR                     U l        g N   )super__init__num_local_expertsnum_expertsintermediate_sizehidden_size
expert_dimnn	Parametertorchemptygate_up_proj	down_projr	   
hidden_actact_fnselfr'   	__class__s     d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/llama4/modeling_llama4.pyr,   Llama4TextExperts.__init__/   s    !33!'!9!9!--00LLT5E5EtGWGWYZ]a]l]lYl)mnekk43C3CT__VZVfVf2g&hiV../    hidden_statesreturnc                 n   UR                  U R                  R                  S   SU R                  5      n[        R
                  " XR                  5      nUR                  SSS9u  p4[        R
                  " X@R                  U5      -  U R                  5      nUR                  SU R                  5      nU$ )a  
This should really not be run on a single machine, as we are reaching compute bound:
- the inputs are expected to be "sorted" per expert already.
- the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

Args:
    hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
    selected_experts (torch.Tensor): (batch_size * token_num, top_k)
    routing_weights (torch.Tensor): (batch_size * token_num, top_k)
Returns:
    torch.Tensor
r   r*   dim)	viewr6   shaper0   r4   bmmchunkr9   r7   )r;   r@   gate_upgateupnext_statess         r=   forwardLlama4TextExperts.forward9   s     &**4+<+<+B+B1+Er4K[K[\))M+<+<====+iikk$&7!7$..I!&&r4+;+;<r?   )r9   r7   r1   r6   r0   r/   r.   )__name__
__module____qualname____firstlineno__r#   r,   r4   TensorrN   __static_attributes____classcell__r<   s   @r=   r%   r%   .   s0    0/ 0U\\ ell  r?   r%   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Llama4TextMLPO   c                 X  > [         TU ]  5         Uc  UR                  nXl        [        R
                  " UR                  USS9U l        [        R
                  " UR                  USS9U l        [        R
                  " X!R                  SS9U l	        [        UR                     U l        g NFbias)r+   r,   r/   r'   r2   Linearr0   	gate_projup_projr7   r	   r8   activation_fn)r;   r'   r/   r<   s      r=   r,   Llama4TextMLP.__init__P   s    $ & 8 86#5#57HuUyy!3!35FUS#46H6HuU#F$5$56r?   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      $ N)rb   r`   ra   r7   )r;   xr7   s      r=   rN   Llama4TextMLP.forward\   s7    &&t~~a'89DLLOK	~~i((r?   )rb   r'   r7   r`   ra   re   rP   rQ   rR   rS   r,   rN   rU   rV   rW   s   @r=   rY   rY   O   s    
7) )r?   rY   c                   F   ^  \ rS rSrSS\4U 4S jjjrS rS rS rSr	U =r
$ )	Llama4TextL2Norma   epsc                 .   > [         TU ]  5         Xl        g re   )r+   r,   rl   )r;   rl   r<   s     r=   r,   Llama4TextL2Norm.__init__b   s    r?   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ Nr*   rC   T)keepdimr4   rsqrtpowmeanrl   r;   rf   s     r=   _normLlama4TextL2Norm._normf   4    5;;quuQx}}R}>IJJJr?   c                 ^    U R                  UR                  5       5      R                  U5      $ re   )rw   floattype_asrv   s     r=   rN   Llama4TextL2Norm.forwardi   s"    zz!'')$,,Q//r?   c                      SU R                    3$ )Nzeps=rl   r;   s    r=   
extra_reprLlama4TextL2Norm.extra_reprl   s    dhhZ  r?   r   )gư>)rP   rQ   rR   rS   r{   r,   rw   rN   r   rU   rV   rW   s   @r=   rj   rj   a   s)    E  K0! !r?   rj   c                   >   ^  \ rS rSrSU 4S jjrS rS rS rSrU =r	$ )Llama4TextRMSNormp   c                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g)z,
Llama4RMSNorm is equivalent to T5LayerNorm
N)r+   r,   rl   r2   r3   r4   onesweight)r;   r0   rl   r<   s      r=   r,   Llama4TextRMSNorm.__init__q   s.     	ll5::k#:;r?   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ rp   rr   rv   s     r=   rw   Llama4TextRMSNorm._normy   ry   r?   c                 z    U R                  UR                  5       5      R                  U5      nX R                  -  $ re   )rw   r{   r|   r   )r;   rf   outputs      r=   rN   Llama4TextRMSNorm.forward|   s.    AGGI&..q1##r?   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rG   rl   r   s    r=   r   Llama4TextRMSNorm.extra_repr   s'    ))*+6$((<<r?   )rl   r   )gh㈵>)
rP   rQ   rR   rS   r,   rw   rN   r   rU   rV   rW   s   @r=   r   r   p   s    <K$= =r?   r   c                   4   ^  \ rS rSrU 4S jrU 4S jrSrU =r$ )Llama4Router   c                    > [         TU ]  UR                  UR                  SS9  UR                  U l        UR
                  U l        g r\   )r+   r,   r0   r-   r.   num_experts_per_toktop_kr:   s     r=   r,   Llama4Router.__init__   s>    ++V-E-EER!33//
r?   c                 j  > [         TU ]  U5      n[        R                  " X R                  SS9u  p4[        R
                  " U[        S5      5      R                  SXC5      n[        R                  R                  R                  UR                  5       5      R                  UR                  5      nXR4$ )Nr!   rD   z-inf)r+   rN   r4   topkr   	full_liker{   scatter_r2   
functionalsigmoidtodtype)r;   r@   router_logitsrouter_top_valuerouter_indicesrouter_scoresr<   s         r=   rN   Llama4Router.forward   s    6+0::mZZUV+W(uV}ENNqR`s++33M4G4G4IJMMmNaNab++r?   )r.   r   rh   rW   s   @r=   r   r      s    0
, ,r?   r   Llama4TextMoec                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r      c                    > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        U5      U l	        [        U5      U l        [        U5      U l        g re   )r+   r,   r   r   r0   
hidden_dimr-   r.   r%   expertsr   routerrY   shared_expertr:   s     r=   r,   Llama4TextMoe.__init__   s[    //
 ,,!33(0"6**62r?   c                    UR                  SU R                  5      nU R                  U5      u  p#UR                  UR                  S   S5      nXBR                  SS5      R                  SS5      -  nU R                  U5      nU R                  U5      nUR                  UR                  UR                  S   SUR                  S   5      R                  SS95        Xc4$ )NrC   r!   r   rD   )
reshaper   r   repeatrG   	transposer   r   add_sum)r;   r@   r   r   	routed_in
routed_outouts          r=   rN   Llama4TextMoe.forward   s    %--b$//B'+{{='A$!(()<)<Q)?C	 7 71 = E Eb! LL	\\),
  /##M$7$7$:B
@P@PQS@TUYY^_Y`a!!r?   )r   r   r.   r   r   r   rh   rW   s   @r=   r   r      s    3" "r?   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )Llama4TextRotaryEmbedding   inv_freqr'   c                 X  > [         TU ]  5         UR                  b  SOSU l        UR                  U l        UR                  U l        Xl        [        U R                     U l	        U R                  U R                  U5      u  o0l
        U R                  SUSS9  U R                  U l        g )Nllama3defaultr   F)
persistent)r+   r,   rope_scaling	rope_typemax_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr'   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r;   r'   devicer   r<   s       r=   r,   "Llama4TextRotaryEmbedding.__init__   s    %+%8%8%D)"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r?   c                    U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       n[	        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  UR
                  5      U-  R                  SS5      n[        R                  " [        R                  " U5      U5      nXpR                  -  nS S S 5        U$ ! , (       d  f       W$ = f)	Nr   rC   r!   mpscpuF)device_typeenabledr*   )r   r{   expandrG   
isinstancer   typestrr4   autocastr   r   polar	ones_liker   )r;   rf   position_idsinv_freq_expandedposition_ids_expandedr   freqs	freqs_ciss           r=   rN   !Llama4TextRotaryEmbedding.forward   s    !MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&))!((36KKVVWXZ[\EEOOE$:EBI!$:$::I D
  DC
 s   A(D==
E)r   r'   r   r   r   r   r   re   )rP   rQ   rR   rS   r4   rT   __annotations__r#   r,   no_gradr   rN   rU   rV   rW   s   @r=   r   r      sA    ll// / / ]]_
  
r?   r   xqxkr   rA   c           	      *   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[         R
                  " X2S S 2S S 2S S S 24   -  5      R                  S5      n[         R
                  " XBS S 2S S 2S S S 24   -  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrC   r*   r   )r4   view_as_complexr{   r   rG   view_as_realflattenr|   )r   r   r   xq_xk_xq_outxk_outs          r=   apply_rotary_embr      s    
 


 2 2 IBHHSbM I2 Iq I
JC



 2 2 IBHHSbM I2 Iq I
JC1dA&> >?GGJF1dA&> >?GGJF>>"v~~b111r?   r@   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)rG   r   r   )r@   r   batchnum_key_value_headsslenhead_dims         r=   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr?   modulequerykeyvalueattention_maskscalingdropoutc                    [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr*   r   rC   rD   ptrainingr!   )r   num_key_value_groupsr4   matmulr   rG   r2   r   softmaxr   r   
contiguousr   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r=   eager_attention_forwardr	     s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2(>L==((6??([L,,|:K''1-88:K$$r?   c                 
   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U R
                  S-  -  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R                  R                  U
SS9n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )	Nr*   r         r   rC   rD   r   r!   )r   r   r4   r   r   r   rG   r2   r   r   r   r   r  r  s                r=   vision_eager_attention_forwardr     s     3 ; ;<JU$?$?@L<<';';Aq'ABV__VZEZZL!$Q1.D
0@0@0D.D%DE#1==((2(>L==((6??([L,,|:K''1-88:K$$r?   c                   X  ^  \ rS rSrSrS\4U 4S jjr\" SSSS9  SS	\R                  S
\
\R                  \R                  4   S\\R                     S\\   S\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jj5       rSrU =r$ )Llama4TextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr'   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  U l        UR                  UR                  -  U l	        UR                  U l        U R                  S-  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        SU l        UR                   U   U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR
                  UR                  U R                  -  UR(                  S9U l        [$        R&                  " UR                  U R                  -  UR
                  UR(                  S9U l        U R                  R2                  (       a-  U R"                  (       a  [5        UR6                  5      U l        g g g )Nr   r  Tr]   )r+   r,   r'   	layer_idxgetattrr0   num_attention_headsr   r   r   r   
attn_scalefloor_scaleattn_temperature_tuningattention_dropout	is_causalno_rope_layersuse_roper2   r_   attention_biasq_projk_projv_projo_projuse_qk_normrj   rms_norm_epsqk_normr;   r'   r  r<   s      r=   r,   Llama4TextAttention.__init__  s   "
F4F4F&JdJd4de#)#=#= $*$>$>&B\B\$\!#)#=#= }}d* ++!--'-'E'E$!'!9!9--i8ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 ;;""t}}+F,?,?@DL (5"r?   past_key_valuepast_key_values4.58new_nameversionr@   position_embeddingsr   cache_positionr  rA   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      n	U R	                  U5      R                  " / UQSPU R                  P76 n
U R                  U5      R                  U5      R                  SS5      nU R                  (       a'  [        XUR                  U	R                  5      5      u  p[        U S5      (       a"  U R                  U	5      n	U R                  U
5      n
U R                  (       a  U R                  (       d  [        R                  " [        R                   " UR#                  5       S-   U R$                  -  5      5      U R&                  -  S-   nUR                  SUS   SS45      R)                  / UQSPSP75      nX-  R                  U	R*                  5      n	U	R                  SS5      n	U
R                  SS5      n
Ub#  SU0nUR-                  XU R.                  U5      u  p[0        nU R2                  R4                  S:w  a  [6        U R2                  R4                     nU" U U	U
UU4U R8                  (       d  SOU R:                  U R<                  S	.UD6u  nnUR>                  " / UQSP76 RA                  5       nU RC                  U5      nUU4$ )
NrC   r!   r*   r!        ?r+  eager        )r   r   )"rG   r   r  rF   r  r  r   r  r   r   r   hasattrr!  r  r4   log1pfloorr{   r  r  r   r   updater  r	  r'   _attn_implementationr   r   r  r   r   r  r  )r;   r@   r*  r   r%  r+  r  input_shapehidden_shapequery_statesr  r  attn_scalescache_kwargsattention_interfacer  r  s                    r=   rN   Llama4TextAttention.forward8  s    $))#2.88b8$--8{{=166|D[[/44UkU2Ut}}U
{{=166|DNNqRST=='7*=*@*@ATAT*U($L 4##<<5Lj1J ''EKK)=)=)?#)EIYIY(YZ[^b^m^mmpss  &**A{21+EFMMNbP[Nb]^Nb`aNbcK(6::<;M;MNL#--a3))!Q/
&,n=L'6'='=jX\XfXfht'u$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r?   )r  r  r  r'   r  r   r  r  r  r  r   r   r  r  r!  r   r  r  NN)rP   rQ   rR   rS   __doc__r#   r,   r   r4   rT   r   r   r
   
LongTensorr   r   rN   rU   rV   rW   s   @r=   r  r    s    GA/ A< %0A6R ,0599)||9) #5<<#=>9) !.	9)
 "%9) !!1!129) -.9) 
u||Xell3XeELL>Q5RR	S9) S9)r?   r  c                   ~  ^  \ rS rSrU 4S jr\" SSSS9      SS\R                  S\\R                     S	\\R                     S\\
   S
\\   S\\R                     S\\\R                  \R                  4      S\\   S\\R                  \\\R                  \R                  4      4   4S jj5       rSrU =r$ )Llama4TextDecoderLayeriu  c                   > [         TU ]  5         UR                  U l        X l        UR                  U   U l        [        X5      U l        X!R                  ;   U l	        U R                  (       a  [        U5      U l        O[        XR                  S9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r/   r   )r+   r,   r0   r  layer_typesattention_typer  	self_attn
moe_layersis_moe_layerr   feed_forwardrY   intermediate_size_mlpr   r   input_layernormpost_attention_layernormr"  s      r=   r,   Llama4TextDecoderLayer.__init__v  s    !--"$00;,V?%):):: -f 5D -fHdHd eD01C1CI\I\](9&:L:LRXReRe(f%r?   r$  r%  r&  r'  r@   r   r   	use_cacher+  r*  r  rA   c           
         Un	U R                  U5      nU R                  " SUUUUUUS.UD6u  pX-   nUn	U R                  U5      nU R                  U5      nU R                  (       a  Uu  pXR                  U	R                  5      -   nU$ )N)r@   r*  r   r%  rL  r+   )rI  rD  rJ  rG  rF  rF   rG   )r;   r@   r   r   r%  rL  r+  r*  r  residualattention_states_s               r=   rN   Llama4TextDecoderLayer.forward  s     !,,]; #nn 
' 3)+)
 
 !3 !55mD))-8,M #5#5hnn#EEr?   )rC  rG  r0   rI  rF  r  rJ  rD  )NNNFNN)rP   rQ   rR   rS   r,   r   r4   rT   r   r>  r
   boolr   r   r   FloatTensorrN   rU   rV   rW   s   @r=   r@  r@  u  s   g %0A6R 2637+/$)59KO"||" !." u//0	"
 "%" D>" !!1!12" &eELL%,,,F&GH" -." 
u  (51B1BEDUDU1U+V"WW	X" S"r?   r@  c                   D    \ rS rSr% \\S'   SrS/rSrSr	Sr
SrSrS rSrg)	Llama4PreTrainedModeli  r'   Tr%  Fc                 |   [        U R                  S5      (       a  U R                  R                  OU R                  R                  R                  n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [	        U[         5      (       a&  UR                  R                  R                  S5        g [	        U["        5      (       aI  UR$                  R                  R                  SUS9  UR&                  R                  R                  SUS9  g [	        U[(        5      (       a[  UR*                  R                  R                  UR,                  S9  UR.                  R                  R                  UR,                  S9  g g )Ninitializer_ranger/  )ru   stdr-  )rY  )r0  r'   rX  text_configr   r2   r_   r   datanormal_r^   zero_	Embeddingpadding_idx	LayerNormfill_r   r%   r6   r7   Llama4VisionModelclass_embeddingscalepositional_embedding_vlm)r;   r   rY  s      r=   _init_weights#Llama4PreTrainedModel._init_weights  s    t{{$788 KK))((:: 	
 fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$ 122MM$$S) 122$$,,#3,?!!))s)< 122""''//FLL/A++0088V\\8J 3r?   rN  N)rP   rQ   rR   rS   r"   r   supports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrf  rU   rN  r?   r=   rV  rV    s:    &*##4"5 N!"&Kr?   rV  c                   X  ^  \ rS rSr% S/rSr\\S'   \\	\
S.rS\4U 4S jjr\\" 5       \       SS\\R$                     S\\R&                     S	\\R$                     S
\\   S\\R*                     S\\   S\\R$                     S\\   S\\\4   4S jj5       5       5       rSrU =r$ )Llama4TextModeli  r@  modelr'   )
attentionsr@   r   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   r'   F)r+   r,   pad_token_idr_  
vocab_sizer2   r^  r0   embed_tokens
ModuleListrangenum_hidden_layersr@  layersr   r   normr   
rotary_embgradient_checkpointing	post_initr"  s      r=   r,   Llama4TextModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammHMfNfNfHghHg9#F6Hgh
 &f&8&8f>Q>QR	36B&+# 	 is   C?	input_idsr   r   r%  inputs_embedsrL  r+  r  rA   c                 &   US L US L-  (       a  [        S5      eUc>  U R                  UR                  U R                  R                  R                  5      5      nU(       a  Uc  [        U R                  S9nUcD  Ub  UR                  5       OSn	[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U=n
[        5      (       d*  U R                  UUUUUS.n[        S
0 UD6[        S
0 UD6S.n
UnU R!                  X5      nU R"                  S U R                  R$                    H  nU" U4XR&                     UUUUUS.UD6nM!     U R)                  U5      n[+        UU(       a  US	9$ S S	9$ )N:You must specify exactly one of input_ids or inputs_embedsrt  r   r!   )r   )r'   input_embedsr   r+  r%  r   )full_attentionchunked_attention)r   r   r%  rL  r+  r*  )last_hidden_stater%  rN  )
ValueErrorrw  r   r   r   r   r'   get_seq_lengthr4   arangerG   	unsqueezer   dictr   r   r}  r{  rz  rC  r|  r   )r;   r  r   r   r%  r  rL  r+  r  past_seen_tokenscausal_mask_mappingmask_kwargsr@   freq_cisdecoder_layers                  r=   rN   Llama4TextModel.forward  s    -t";<YZZ  --ill4;L;L;S;S;Z;Z.[\M0*$++>O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L ?-FF ++ -"0"0#2 ,K #5"C{"C%?%N+%N#
 & ??=?![[)H4;;+H+HIM)	23O3OP) /#-$,	 	M J 		-0&+/8O
 	
>B
 	
r?   )rw  r~  r{  r|  r_  r}  rv  )NNNNNNN)rP   rQ   rR   rS   _no_split_modulesbase_model_prefixr#   r   r  r@  r   _can_record_outputsr,   r   r    r   r   r4   r>  rT   r
   rT  rS  r   r   r   r   r   rN   rU   rV   rW   s   @r=   rp  rp    s%   12)/&/    151537+/59$(59C
E,,-C
 !.C
 u//0	C

 "%C
   1 12C
 D>C
 !!1!12C
 +,C
 
u--	.C
   C
r?   rp  c                     ^  \ rS rSr% S/rSrS/rSS0r\\	S'   S\4U 4S jjr
\\         SS	\\R                     S
\\R                      S\\R                     S\\\\\R(                     4      S\\R(                     S\\R                     S\\   S\\R                     S\\\R                   4   S\\   S\\\4   4S jj5       5       rSrU =r$ )Llama4ForCausalLMi6  r@  language_modelzlm_head.weightlm_headcolwise_repr'   c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r\   )
r+   r,   rp  rq  rv  r2   r_   r0   r  r  r:   s     r=   r,   Llama4ForCausalLM.__init__=  sU     $V,
 ++yy!3!3V5F5FUS 	r?   r  r   r   r%  r  labelsrL  r+  logits_to_keepr  rA   c
                 p   U R                   " SUUUUUUUS.U
D6nUS   n[        U	[        5      (       a  [        U	* S5      OU	nU R	                  USS2USS24   5      nSnUb)  U R
                  " SXU R                  R                  S.U
D6n[        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Llama4ForCausalLM

>>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r   r   r%  r  rL  r+  r   N)logitsr  rv  )lossr  r%  r@   rr  rN  )rq  r   intslicer  loss_functionr'   rv  r   r%  r@   rr  )r;   r  r   r   r%  r  r  rL  r+  r  r  outputsr@   slice_indicesr  r  s                   r=   rN   Llama4ForCausalLM.forwardF  s    J ** 	
)%+')	
 	
  
8B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r?   )r  rq  rv  )	NNNNNNNNr   )rP   rQ   rR   rS   r  r  _tied_weights_keys_tp_planr#   r   r,   r   r   r   r4   r>  rT   r   r
   listrT  rS  r  r   r   r   r   rN   rU   rV   rW   s   @r=   r  r  6  sZ   12(*+=)H/   151537KO59-1$(5934<
E,,-<
 !.<
 u//0	<

 "%tE4E4E/F(F"GH<
   1 12<
 ))*<
 D><
 !!1!12<
 c5<</0<
 +,<
 
u,,	-<
  <
r?   r  zQ
    Base class for Llava causal language model (or autoregressive) outputs.
    )custom_introc                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Llama4CausalLMOutputWithPasti  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nr  r  r%  r@   rr  image_hidden_statesrN  )rP   rQ   rR   rS   r=  r  r   r4   rT  r   r  r%  r
   r@   r   rr  r  rU   rN  r?   r=   r  r    s     )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju001297;%"3"34;r?   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionMLP2i  c                 x  > [         TU ]  5         UR                  U l        UR                  U l        [        R
                  " U R                  UR                  SS9U l        [        R
                  " UR                  UR                  SS9U l	        [        R                  " 5       U l        UR                  U l        g r\   )r+   r,   r0   r/   r2   r_   projector_input_dimfc1projector_output_dimfc2GELUrb   projector_dropoutr   r:   s     r=   r,   Llama4VisionMLP2.__init__  s    !--!'!9!999T33V5O5OV[\99V88&:U:U\abWWY//r?   c                     U R                  U5      nU R                  U5      n[        R                  " XR                  U R                  S9nU R                  U R                  U5      5      $ )Nr   )r  rb   Fr   r   r  r;   r@   s     r=   rN   Llama4VisionMLP2.forward  sR    /**=9		-<<$--X!!$((="9::r?   )rb   r   r  r  r0   r/   rh   rW   s   @r=   r  r    s    0; ;r?   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4MultiModalProjectori  c                    > [         TU ]  5         [        R                  " UR                  R
                  UR                  R                  SS9U l        g r\   )	r+   r,   r2   r_   vision_configvision_output_dimrZ  r0   linear_1r:   s     r=   r,   "Llama4MultiModalProjector.__init__  s?    		  22**
r?   c                 (    U R                  U5      nU$ re   r  )r;   image_featuresr@   s      r=   rN   !Llama4MultiModalProjector.forward  s    n5r?   r  rh   rW   s   @r=   r  r    s    
 r?   r  c           
      8   U R                   u  p#n[        [        R                  " U5      5      nU R	                  X%US5      n U R                  5       u  p&ptU R	                  X&[        Xq-  5      [        XA-  5      5      nUR                  SSSS5      R                  5       nUR	                  U[        Xa-  5      [        Xq-  5      [        XAS-  -  5      5      nUR                  SSSS5      R                  5       nUR	                  USUR                   S   5      n	U	$ )NrC   r   r*   r!   r   )rG   r  mathsqrtrF   sizepermuter  )
input_tensorshuffle_ratio
batch_sizenum_patcheschannels
patch_sizeheightwidthreshaped_tensoroutput_tensors
             r=   pixel_shuffler    s   (4(:(:%JXTYY{+,J$$ZZLL*6*;*;*='J"''
C@U<VX[\d\tXuvO%--aAq9DDFO%**C./U5J1KSQYlm]mQnMoO &--aAq9DDFO#((R9N9Nr9RSMr?   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionPixelShuffleMLPi  c                    > [         TU ]  5         UR                  U l        [        UR                  U R                  S-  -  5      U l        UR                  U l        [        U5      U l	        g r)   )
r+   r,   pixel_shuffle_ratior  r  	inner_dimr  
output_dimr  mlpr:   s     r=   r,   $Llama4VisionPixelShuffleMLP.__init__  sX    #)#=#= V77D<T<TVW<WXY 55#F+r?   encoded_patchesrA   c                 N    [        XR                  5      nU R                  U5      $ re   )r  r  r  )r;   r  s     r=   rN   #Llama4VisionPixelShuffleMLP.forward  s!    '9Q9QRxx((r?   )r  r  r  r  
rP   rQ   rR   rS   r,   r4   rT   rN   rU   rV   rW   s   @r=   r  r    s(    ,)u|| ) ) )r?   r  freqs_cic                     UR                   n[        UR                  5       VVs/ s H  u  p4US:X  d  X2S-
  :X  a  UOSPM     nnnU R                  " U6 $ s  snnf )Nr!   )ndim	enumeraterG   rF   )r  r   r  idrG   s         r=   reshape_for_broadcastr    sT    ::D=Fu{{=ST=STQ!q&AMQq0=SET==%   Us   Ac                 >   [         R                  " U R                  5       R                  " / U R                  S S QSPSP76 5      n[         R                  " UR                  5       R                  " / UR                  S S QSPSP76 5      n[        X#S9nUR                  UR                  5      n[         R                  " X2-  5      R                  S5      n[         R                  " XB-  5      R                  S5      nUR                  U 5      UR                  U5      4$ )NrC   r*   )r  r   r   )r4   r   r{   r   rG   r  r   r   r   r   r|   )r   r   r  query_key_	query_outkey_outs          r=   vision_apply_rotary_embr    s    
 ""5;;=#8#8#R%++cr:J#RB#RPQ#RSF  !4!4!Lciin!Lb!L!!LMD$hEH{{6==)H""6#45==a@I  199!<GU#W__S%999r?   c                     ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\\R                     S\\	   S\
\   S	\\R                  \\R                     \\\R                        4   4S
 jjrSrU =r$ )Llama4VisionAttentioni  r'   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  UR
                  -  U l        SU l        UR                  U l	        U R                  S-  U l
        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )Nr!   r  Tr]   )r+   r,   r'   r0   	embed_dimr  	num_headsr   r   r  r   r2   r_   r  r  r  r  r:   s     r=   r,   Llama4VisionAttention.__init__  s   ++33**f.H.HH$%!!'!9!9}}d*ii0NUYZii0NUYZii0NUYZii >UYZr?   r@   r  r   r%  r  rA   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      nU R	                  U5      R                  U5      n	U R                  U5      R                  U5      n
[        XUS9u  pUR                  SS5      nU	R                  SS5      n	U
R                  SS5      n
[        nU R                  R                  S;  a  [        U R                  R                     nU" U UU	U
S 4U R                  (       d  SOU R                  S SS.UD6u  pUR                  " / UQSP76 R                  5       nU R!                  U5      nX4$ )	NrC   r  r!   r*   )r.  flex_attentionr/  F)r   r   r  )rG   r   r  rF   r  r  r  r   r  r'   r4  r   r   r  r   r  r  )r;   r@   r  r   r%  r  r5  r6  r7  r  r  r:  r  r  s                 r=   rN   Llama4VisionAttention.forward
  sk    $))#2.88b8$--8{{=166|D[[/44\B
{{=166|D#:<^f#g #--a3))!Q/
#--a3(F;;++3NN"9$++:Z:Z"[$7
%
  $}}C$2H2H
%
 
%
! "));;;;FFHkk+.((r?   )r  r'   r  r   r  r  r   r  r  r   r  r<  )rP   rQ   rR   rS   r   r,   r4   rT   r   r
   r   r   r   rN   rU   rV   rW   s   @r=   r  r    s    [1 [& 26+/()||() ,,() !.	()
 "%() -.() 
u||Xell3XeELL>Q5RR	S() ()r?   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4VisionMLPi5  c                   > [         TU ]  5         Xl        [        R                  " 5       U l        [        R                  " UR                  UR                  SS9U l	        [        R                  " UR                  UR                  SS9U l
        g )NTr]   )r+   r,   r'   r2   r  rb   r_   r0   r/   r  r  r:   s     r=   r,   Llama4VisionMLP.__init__6  sc    WWY99V//1I1IPTU99V55v7I7IPTUr?   r@   rA   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ re   )r  rb   r  r  s     r=   rN   Llama4VisionMLP.forward=  s4    /**=9/r?   )rb   r'   r  r  r  rW   s   @r=   r  r  5  s)    VU\\ ell  r?   r  c            
          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\R                  S\\R                     S\\	   4S jjr
S	rU =r$ )Llama4VisionEncoderLayeriD  r'   c                   > [         TU ]  5         UR                  U l        [        U5      U l        [        U5      U l        [        R                  " UR                  5      U l	        [        R                  " UR                  5      U l
        g re   )r+   r,   r0   r  rD  r  r  r2   r`  rI  rJ  r:   s     r=   r,   !Llama4VisionEncoderLayer.__init__E  sb    !--.v6"6*!||F,>,>?(*V5G5G(H%r?   hidden_stater  r   output_attentionsc                     UnU R                  U5      nU R                  UUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )N)r  r   )rI  rD  rJ  r  )r;   r  r  r   r  rO  r  r  s           r=   rN    Llama4VisionEncoderLayer.forwardO  s      ++L9%)^^) &4 &
"
  .  44\Bxx-./&Gr?   )r0   rI  r  rJ  rD  r<  )rP   rQ   rR   rS   r   r,   r4   rT   r   rS  rN   rU   rV   rW   s   @r=   r  r  D  s_    I1 I 26,0ll ,, !.	
 $D> r?   r  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S\	\R                     S\	\
   S	\	\
   S
\	\
   S\\\4   4S jjrSrU =r$ )Llama4VisionEncoderip  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Llama4VisionEncoderLayer`].

Args:
    config: Llama4VisionConfig
r'   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        Xl        g s  snf )NF)
r+   r,   r'   r2   rx  ry  rz  r  r{  r~  )r;   r'   rQ  r<   s      r=   r,   Llama4VisionEncoder.__init__y  sY    mmuU[UmUmOn$oOn!%=f%EOn$op&+# %ps   A,r@   r  r   r  output_hidden_statesreturn_dictrA   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU R                   H,  n	U(       a  Xq4-   nU	" UUUUS9n
U(       a  XS   4-   nU
S   nM.     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrN  )r  r   r  r  r!   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fre   rN  .0vs     r=   	<genexpr>.Llama4VisionEncoder.forward.<locals>.<genexpr>  s     e$Sq$S   	r  r@   rr  )r'   r  r  use_return_dictr{  r   r   )r;   r@   r  r   r  r  r  encoder_statesall_attentionsencoder_layerlayer_outputss              r=   rN   Llama4VisionEncoder.forward  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d![[M#!/2B!B)*-"3!	M !!/3C2E!E)!,M )   +.>>Ne]N$Seee+Vd
 	
r?   )r'   r~  r{  NNNN)rP   rQ   rR   rS   r=  r   r,   r4   rT   r   rS  r   r   r   rN   rU   rV   rW   s   @r=   r  r  p  s    1  26,0/3&*?
||?
 ,,?
 !.	?

 $D>?
 'tn?
 d^?
 
uo%	&?
 ?
r?   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Llama4UnfoldConvolutioni  c                 8  > [         TU ]  5         UR                  n[        U[        5      (       a  X"4n[
        R                  R                  X!R                  S9U l        [        R                  " UR                  US   -  US   -  UR                  SS9U l        g )N)kernel_sizestrider   r!   Fr]   )r+   r,   r  r   r  r4   r2   Unfoldunfoldr_   num_channelsr0   linear)r;   r'   r#  r<   s      r=   r,    Llama4UnfoldConvolution.__init__  s    ''k3''&4Khhoo+FWFWoXii+a.0;q>A
r?   r@   rA   c                 p    U R                  U5      nUR                  SSS5      nU R                  U5      nU$ )Nr   r*   r!   )r&  r  r(  r  s     r=   rN   Llama4UnfoldConvolution.forward  s8    M2%--aA6M2r?   )r(  r&  r  rW   s   @r=   r!  r!    s(    

U\\ ell  r?   r!  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Llama4VisionRotaryEmbeddingi  c                   > [         TU ]  5         UR                  UR                  -  n[        R
                  " US-  [        R                  S9R                  US-  S5      n[        R                  " X3S S /SS9nSUS'   X2-  nX2-  nUR                  UR                  -  S-  nSUR                  [        R
                  " SUS5      S US-   R                  5       U-  -  -  nUS-   S	   US S S S 24   -  R                  SS
S9nUS-   S	   US S S S 24   -  R                  SS
S9n	[        R                  " X/S
S9R                  5       R                  5       SS S S24   n
U
R                  UR                  S
SS5      S:  S5      n
[        R                   " [        R"                  " [        R$                  " U
5      [        R&                  " U
5      /S
S95      nXl        g )Nr*   )r   r!   r   rD   r   )rC   rC   r-  ).NrC   .)r+   r,   
image_sizer  r4   r  int32r   catr0   r  
rope_thetar{   repeat_interleaver  masked_fillr   stackcossinr  )r;   r'   idximg_idxfrequencies_xfrequencies_yfreq_dim	rope_freqfreqs_xfreqs_yr   r  r<   s               r=   r,   $Llama4VisionRotaryEmbedding.__init__  s   6#4#44,,sAvU[[9AA#q&!L))Wbqk2:%%)C)CCqH6,,a11MN_QY]^Q^1`1f1f1hks1stu	!A%y1IdD!m4LL__`agi_j!A%y1IdD!m4LL__`agi_j		7,"5;;=HHJ3PSRSPS8T!!'//"a";a"?C((eii6F		RWHX5Y_a)bc r?   c                 L    U R                   R                  UR                  5      $ re   )r  r   r   r  s     r=   rN   #Llama4VisionRotaryEmbedding.forward  s    }} 4 455r?   r  rh   rW   s   @r=   r-  r-    s    !"6 6r?   r-  c                      ^  \ rS rSr% SrS/r\\S'   S\4U 4S jjrS r	    SS\
R                  S\\
R                     S	\\   S
\\   S\\   S\\\\
R                  S4   4   4S jjrSrU =r$ )rb  i  vision_modelr  r'   c                 ~  > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        U R                  U R                  -  S-  S-   U l        UR                  S-  U l        [        U5      U l	        [        R                  " U R                  [        R                  " U R                  5      -  5      U l        [        R                  " U R                  [        R                  " U R                  U R                  5      -  5      U l        [!        U5      U l        [        R$                  " U R                  5      U l        [        R$                  " U R                  5      U l        [+        U5      U l        [/        U5      U l        U R3                  5         g )Nr*   r!   r  )r+   r,   r/  r  r0   r'  r  rd  r!  patch_embeddingr2   r3   r4   randnrc  re  r-  rotary_embeddingr`  layernorm_prelayernorm_postr  rq  r  vision_adapterr  r:   s     r=   r,   Llama4VisionModel.__init__  sA     ++ ++!--"// OOt>1DqH''-
6v>!||DJJTEUEU9V,VW(*TZZ%++dN^N^`d`p`pBq5q(r% ;F C  \\$*:*:; ll4+;+;< )0
9&Ar?   c                     U R                   $ )zW
This function is used to fetch the first embedding layer to activate grads on inputs.
)rF  r   s    r=   get_input_embeddings&Llama4VisionModel.get_input_embeddings
  s     ###r?   pixel_valuesr   r  r  r  rA   .c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR                  u  pgpSn
SnU R                  U5      nUR                  u  pnUR                  Xj-  U-  X5      nU R                  R                  UR                  S   SUR                  S   5      n[        R                  " UU/SS9nUS-  nUR                  Xj-  XU5      nU R                  R                  UR                  UR                  S9nUU-   nU R                  U5      nUR!                  USU5      nU R#                  U5      nU R%                  USUUUS9nUR&                  nU R)                  U5      nUSS2SS2SS24   nU R+                  U5      nU(       a  UR,                  OSnU(       a  US   nOSnU(       d  [/        S	 UUU4 5       5      $ [1        UUUS
9$ )aN  

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, MllamaVisionModel

>>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
>>> model = MllamaVisionModel.from_pretrained(checkpoint)
>>> processor = AutoProcessor.from_pretrained(checkpoint)

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")

>>> output = model(**inputs)

>>> print(output.last_hidden_state.shape)
torch.Size([1, 1, 4, 1025, 7680])
```
Nr!   r   rC   rD   r   r   )r   r  r  r  r*   c              3   .   #    U  H  oc  M  Uv   M     g 7fre   rN  r  s     r=   r  ,Llama4VisionModel.forward.<locals>.<genexpr>i  s     _$Mq$Mr  r  )r'   r  r  r  rG   rF  r   rc  r   r4   r1  re  r   r   r   rI  rF   rH  rq  r  rJ  rK  r@   r   r   )r;   rP  r   r  r  r  batch_size_times_num_tilesr'  r  r  num_concurrent_media
num_chunksr  rQ  r  r   rc  positional_embeddingr  r   r@   rr  s                         r=   rN   Llama4VisionModel.forward  sB   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] COBTBT?"& 
++L9%1%7%7"
 $++&=
JK
 ..55l6H6H6KQP\PbPbcePfgyy,!@aHq $++&=zXb
  $<<??lFXFXamatat?u#&::)),7#(()CRT((6!5/  
 //**<8#AssAI. **<80D,,$JJ_\=*$M___*'!
 	
r?   )rc  r0   r/  rJ  rI  rq  r'  r  rF  r  re  rH  rd  rK  r  )rP   rQ   rR   rS   r  r  r   r   r,   rN  r4   rT   r   rS  r   r   r   rN   rU   rV   rW   s   @r=   rb  rb    s    &341 2$ 26,0/3&*_
ll_
 !._
 $D>	_

 'tn_
 d^_
 
ellC&7 88	9_
 _
r?   rb  c            '         ^  \ rS rSr% SS/r0 rSr\\S'   S\4U 4S jjr	S r
S rS	 rS
 rS rS rS\R"                  S\4S jrS\R(                  S\R"                  S\R"                  4S jr\\" SSS9               S&S\\R(                     S\\R"                     S\\R2                     S\\R(                     S\\   S\\R"                     S\\\\\   4      S\\   S\\R(                     S\\   S\\   S\\   S\\   S\\R(                     S \\\R2                  4   S!\\    S"\\!\"4   4"S# jj5       5       r#      S'S$ jr$S%r%U =r&$ )(Llama4ForConditionalGenerationir  r@  r   r'   c                 j  > [         TU ]  U5        [        UR                  5      U l        [        U5      U l        [        UR                  5      U l	        UR                  R                  U l
        U R                  R                  b  U R                  R                  OSU l        U R                  5         g )NrC   )r+   r,   rb  r  rD  r  multi_modal_projectorr  rZ  r  rv  r'   ru  r  r:   s     r=   r,   'Llama4ForConditionalGeneration.__init__x  s     -f.B.BC%>v%F"/0B0BC ,,778<8P8P8\DKK44bdr?   c                 6    U R                   R                  5       $ re   )r  rN  r   s    r=   rN  3Llama4ForConditionalGeneration.get_input_embeddings  s    ""7799r?   c                 :    U R                   R                  U5        g re   )r  set_input_embeddings)r;   r   s     r=   rc  3Llama4ForConditionalGeneration.set_input_embeddings  s    007r?   c                 6    U R                   R                  5       $ re   )r  get_output_embeddingsr   s    r=   rf  4Llama4ForConditionalGeneration.get_output_embeddings  s    ""88::r?   c                 :    U R                   R                  U5        g re   )r  set_output_embeddings)r;   new_embeddingss     r=   ri  4Llama4ForConditionalGeneration.set_output_embeddings  s    11.Ar?   c                 :    U R                   R                  U5        g re   )r  set_decoder)r;   decoders     r=   rm  *Llama4ForConditionalGeneration.set_decoder  s    ''0r?   c                 6    U R                   R                  5       $ re   )r  get_decoderr   s    r=   rq  *Llama4ForConditionalGeneration.get_decoder  s    ""..00r?   rP  vision_feature_select_strategyc                     US;  a  [        SU R                   35      eUR                  5        VVs0 s H  u  pEUc  M
  XE_M     nnnU R                  " U4SS0UD6nUR                  nU$ s  snnf )a  
Obtains image last hidden states from the vision tower and apply al projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_select_strategy (`str`):
        The feature selection strategy used to select the vision feature from the vision backbone.
        Can be one of `"default"` or `"full"`
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)r   fullz$Unexpected select feature strategy: r  F)r  rs  itemsrD  r  )r;   rP  rs  r  kr  image_outputsr  s           r=   get_image_features1Llama4ForConditionalGeneration.get_image_features  s{    $ *1DDCDDgDgChijj#)<<>C>41Q$!$>C)),]U]V\]$66 Ds
   	A+A+r  r  r  c           	      &   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nX$   R                  5       UR                  5       :w  a  [        SU SUR                  S    35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
rR  rC   z6Image features and image tokens do not match: tokens: z, features r   )rN  r4   tensorr'   image_token_idlongr   allr   r  	expand_asr   numelr  rG   )r;   r  r  r  special_image_maskn_image_tokenss         r=   get_placeholder_mask3Llama4ForConditionalGeneration.get_placeholder_mask  s     !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno,2248L8L8NNHHXXcdrdxdxyzd{c|}  "!r?   vision_feature_layerr&  )r)  r   r   r%  r  rL  r  r  r  r+  r  r  rA   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  R
                  nUSL USL-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUb  U R                  UUS9nUR                  SUR                  S5      5      nU R                  U5      R                  UR                  UR                  5      nU R                  XUS9nUR!                  UU5      nU R"                  " SUUUUU
UUUUUS.
UD6nUS   nSnU	Gb>  Ub  USS2UR$                  S	   S	-
  * S24   R                  UR                  5      nUS
SS2SS24   UR                  UR                  5      S:g     R'                  5       nU	S
S	S24   UR                  U	R                  5      S:g     R'                  5       nO1US
SS2SS24   R'                  5       nU	S
S	S24   R'                  5       n[(        R*                  " 5       nU" UR                  SUR                  S5      5      UR                  S5      R                  UR                  5      5      nU(       d  U4US	S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  UR2                  Ub  WS9$ SS9$ )a   
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, LlavaForConditionalGeneration

>>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
>>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

>>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
```Nr  zdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)rP  rs  rC   )r  r  )
r   r   r%  r  rL  r  r  r  r+  r  r   r!   .)r  r  r%  r@   rr  r  rN  )r'   r  r  r  r  rs  r  rN  ry  rF   r  r^  r   r   r   r  masked_scatterr  rG   r  r2   CrossEntropyLossr  r%  r@   rr  )r;   r  rP  r   r   r%  r  r  rs  r  rL  r  r  r  r+  r  r  r  vision_flatprojected_vision_flatr  r  r  r  shift_attention_maskshift_logitsshift_labelsloss_fctr   s                                r=   rN   &Llama4ForConditionalGeneration.forward  sg   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] .9 +**II 	' -t";<YZZ#(Av    557	BM#!44)/M 5 N
 )--b.2E2Eb2IJK$($>$>{$K$N$N$$m&9&9%! "&!:!:G\ "; " *889KMbcM%% 
)%+'/!5#))
 
 ) (6a6<<?Q;N9O9Q6Q'R'U'UV\VcVc'd$%c3B3k23G3J3J6==3Y]^3^_jjl%c12g/C/F/Fv}}/UYZ/Z[ffh%c3B3k2==?%c12g99;**,H!!"l&7&7&;<l>O>OPR>S>V>VWcWjWj>kD Y,F'+'7D7V#CVC+#33!//))2>2J
 	
 QU
 	
r?   c           	      f    U R                   R                  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)r%  r  r   r+  r  r   rP  )r  prepare_inputs_for_generation)
r;   r  r%  r  rP  r   r+  r  r  model_inputss
             r=   r  <Llama4ForConditionalGeneration.prepare_inputs_for_generationJ  sZ     **HH
+')))
 
 !! ,8(r?   )r  r^  ru  rD  rv  )NNNNNNNNNNNNNNr   )NNNNNN)'rP   rQ   rR   rS   r  r  r  r"   r   r,   rN  rc  rf  ri  rm  rq  r4   rT  r   ry  r>  r  r   r   r   rT   r
   r   r  r  rS  r   r   r   r  rN   r  rU   rV   rW   s   @r=   r[  r[  r  sX   13MNH	| 	:8;B11'' ),2"))":?:K:K"]b]n]n". +V< 15481537+/59@D8<-1$(,0/3&*5934!A
E,,-A
 u001A
 !.	A

 u//0A
 "%A
   1 12A
 'uS$s)^'<=A
 )1A
 ))*A
 D>A
 $D>A
 'tnA
 d^A
 !!1!12A
  c5<</0!A
" +,#A
$ 
u22	3%A
 = A
L  r?   r[  )rV  rp  rb  r  r[  )r/  )ar  dataclassesr   typingr   r   r   r4   torch.nnr2   torch.nn.functionalr   r  /transformers.models.llama4.configuration_llama4r   activationsr	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr    configuration_llama4r"   r#   
get_loggerrP   loggerModuler%   rY   rj   r   r_   r   r   r   rT   r   r   r  r   r{   r	  r  r  r@  rV  rp  r  r  r  r  r  r  r  r  r  r  r  r  r!  r-  rb  r[  __all__rN  r?   r=   <module>r     s     ! , ,     N ! . ) 7 K B 9 m m K F & R R 0 / @ 
		H	%		 B)BII )$!uxx !=		 =(,299 , _-"BII " ."*		 B	2	2	2 ||	2 5<<%&		2	UU\\ 	U# 	U%,, 	U( %II%<<% 
% <<	%
 U\\*% % %D %II%<<% 
% <<	%
 U\\*% % %4[)")) [)|37 3l #KO #K #KL `
+ `
 `
FN
- N
b 
<; < <0;uxx ;"		 (
)")) 
)!ELL ! !:<<:	: ll: 5<<%&	:8)BII 8)vbii )9 )XO
")) O
dbii (6")) 6,C
- C
Lt%:O tnr?   