
    cCiQ                        S r SSKJr  SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  \R:                  " \5      rS r  " S S\RB                  5      r"S r#S-S jr$ S.S\RB                  S\RJ                  S\RJ                  S\RJ                  S\\RJ                     S\&S\&4S jjr' " S S\RB                  5      r( " S  S!\RB                  5      r) " S" S#\RB                  5      r* " S$ S%\5      r+ " S& S'\RB                  5      r,\ " S( S)\5      5       r-S* r.\ " S+ S,\-5      5       r/S,S)/r0g)/zPyTorch Pixtral model.    )Callable)OptionalUnionN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc                    / nU  H  nUR                   SS  u  pE[        R                  " [        R                  " U5      [        R                  " U5      SS9n[        R                  " USS9R                  SS5      R                  SS5      u  pxXq-  U-   n	UR                  U	S S 2S4   5        M     [        R                  " U5      $ )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr1   %   s    I"BC(~~ell62ELL4GRVWTr2::2qAGG2N 6)QT# # 99Y    c                      ^  \ rS rSr% Sr\R                  \S'   SU 4S jjr\R                  " 5       \
S 5       5       rSrU =r$ )PixtralRotaryEmbedding0   a  
The key with pixtral embedding is just that you have a frequency for each pixel positions.
If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
is given by indexing the pre_computed frequency on the width and height.

What you output is of dimension (batch, height * width, dim) with dim the embed dim.

This simply means that for each image hidden state, you are going to add
a corresponding positional embedding, based on its index in the grid.
inv_freqc                 ~  > [         T
U ]  5         SU l        UR                  U l        UR
                  U l        UR                  UR                  -  nSU R                  [        R                  " SU R                  S5      R                  5       U R                  -  -  -  n[        R                  " X4R                  S9n[        R                  " X4R                  S9n[        R                  " XTS S S2   5      R                  5       n[        R                  " XdSS S2   5      R                  5       n[        R                  " US S 2S S S 24   R                  SUS5      US S S 2S S 24   R                  USS5      /SS9R!                  SU R                  S-  5      n	U R#                  S	[        R                  " X4SS9S
S9  g )Ndefault      ?r   r   )devicer   r   r   r6   F)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r    floatr:   outerr%   repeatr"   register_buffer)selfconfigr:   max_patches_per_sidefreqshwfreqs_hfreqs_wr6   	__class__s             r0   r=   PixtralRotaryEmbedding.__init__>   ss   "??%%	%00F4E4EEtyyU\\!TXXq%A%G%G%IDHH%TUVLL-llCLL-llC++ass,224++aqt!t-335994
#**1.BAFa
#**+?AF 
 '"dhh!m
$ 	 	ZH3GR)P]bcr2   c                    U R                   U   n[        UR                  R                  [        5      (       a0  UR                  R                  S:w  a  UR                  R                  OSn[
        R                  " USS9   UnUR                  5       nUR                  5       nS S S 5        WR                  UR                  S9WR                  UR                  S94$ ! , (       d  f       N@= f)NmpscpuF)device_typeenabled)dtype)r6   
isinstancer:   typestrr   autocastcossintorW   )rH   xposition_idsrK   rU   embr\   r]   s           r0   forwardPixtralRotaryEmbedding.forwardW   s     l+'1!((--'E'E!((--[`J`ahhmmfk^^UCC'')C'')C D
 vvAGGv$cff177f&;;; DCs    #C
C+)rA   r   r>   N)__name__
__module____qualname____firstlineno____doc__r   Tensor__annotations__r=   no_gradr   rb   __static_attributes____classcell__rP   s   @r0   r4   r4   0   s<    	 lld2 ]]_	<  	<r2   r4   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r%   )r_   x1x2s      r0   rotate_halfrs   f   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezers   )qkr\   r]   r`   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embr{   m   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr   r   )r   rW   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32r^   rW   r   r   
contiguous)
r|   r}   r~   r   r   r   r   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r2   c                     ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\	\R                  \R                  4      S\\
   S\\   S	\	\R                  \\R                     4   4S
 jjrSrU =r$ )PixtralAttention   zA
Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        SU l        U R                  S-  U l	        SU l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        g )NFg      ࿩bias)r<   r=   rI   hidden_size	embed_dimnum_attention_heads	num_headsr?   	is_causalr   attention_dropoutr   r   Lineark_projv_projq_projo_projrH   rI   rP   s     r0   r=   PixtralAttention.__init__   s    ++33$..8}}d*//iiUKiiUKiiUKiiUKr2   hidden_statesr   position_embeddingsoutput_attentionsr   returnc                    UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgU R
                  U R                  5      R                  SS5      n	U
R	                  XgU R
                  U R                  5      R                  SS5      n
UR	                  XgU R
                  U R                  5      R                  SS5      nUu  p[        XXSS9u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU R                  R                  S:X  a   US   R                  UR                  SS	9US'   U" U U	U
UU4U R                  (       d  S
OU R                   U R"                  S.UD6u  nnUR%                  XgS5      R'                  5       nU R)                  U5      nU(       d  SnUU4$ )z#Input shape: Batch x Time x Channelr   r   r   )rx   eagerflash_attention_2r`   T)non_blocking        )r   r   r   N)sizer   r   r   viewr   r?   r   r{   r   rI   _attn_implementationr   r^   r:   r   r   r   r"   r   r   )rH   r   r   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesr\   r]   attention_interfacer   r   s                    r0   rb   PixtralAttention.forward   s    "/!3!3!5
Q{{=1[[/
{{=1#((dnndmm\ffghjkl__Z$..$--Xbbcdfgh
#((dnndmm\ffghjkl&#7RUjk#l (?;;++w6"9$++:Z:Z"[ ;;++/BB%+N%;%>%>}?S?Sbf%>%gF>"$7	%
  $}}C$,,LL	%
 	%
!\ "))*rBMMOkk+. LL((r2   )rI   r   r   r?   r   r   r   r   r   r   r   )NNF)re   rf   rg   rh   ri   r=   r   rj   r   tupleboolr   r	   rb   rm   rn   ro   s   @r0   r   r      s    L* 26KO,1/)||/) !./) &eELL%,,,F&GH	/)
 $D>/) -./) 
u||Xell33	4/) /)r2   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
PixtralMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g )NFr   )r<   r=   rI   r   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   s     r0   r=   PixtralMLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rd   )r   r   r   r   )rH   r_   r   s      r0   rb   PixtralMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r   rI   r   r   r   r   r   )re   rf   rg   rh   r=   rb   rm   rn   ro   s   @r0   r   r      s    0 r2   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )PixtralRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
PixtralRMSNorm is equivalent to T5LayerNorm
N)r<   r=   r   	Parameterr   onesweightvariance_epsilon)rH   r   epsrP   s      r0   r=   PixtralRMSNorm.__init__   s/     	ll5::k#:; #r2   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   r   T)keepdim)	rW   r^   r   r   powmeanrsqrtr   r   )rH   r   input_dtypevariances       r0   rb   PixtralRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r   r   r   rH   s    r0   
extra_reprPixtralRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr2   )r   r   )gư>)	re   rf   rg   rh   r=   rb   r   rm   rn   ro   s   @r0   r   r      s    $;J Jr2   r   c                      ^  \ rS rSrU 4S jr  SS\R                  S\R                  S\\\R                  \R                  4      S\\	   S\
\   S\\R                     4S	 jjrS
rU =r$ )PixtralAttentionLayeri  c                    > [         TU ]  5         [        UR                  SS9U l        [        U5      U l        [        U5      U l        [        UR                  SS9U l	        g )Nh㈵>r   )
r<   r=   r   r   attention_normr   feed_forwardr   	attentionffn_normr   s     r0   r=   PixtralAttentionLayer.__init__  sP    ,V-?-?TJ&v.)&1&v'9'9tDr2   r   r   r   r   r   r   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pXa-   nUnU R                  U5      nU R                  U5      nXa-   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   r   r    )r   r   r   r   )	rH   r   r   r   r   r   residualr   outputss	            r0   rb   PixtralAttentionLayer.forward  s    $ !++M:&*nn '
') 3/	'

 '
# !0 m4))-8 0 "&Gr2   )r   r   r   r   )NN)re   rf   rg   rh   r=   r   rj   r   r   r   r   r	   FloatTensorrb   rm   rn   ro   s   @r0   r   r     s    E LP,0'||' ' &eELL%,,,F&GH	'
 $D>' -.' 
u  	!' 'r2   r   c                      ^  \ rS rSrU 4S jr     SS\\R                     S\\\R                  \R                  4      S\\	   S\\	   S\\	   S\
\   S	\\\4   4S
 jjrSrU =r$ )PixtralTransformeriA  c                   > [         TU ]  5         Xl        [        R                  R                  5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     SU l        g )NF)r<   r=   rI   r   r   
ModuleListlayersrangenum_hidden_layersr$   r   gradient_checkpointing)rH   rI   r   rP   s      r0   r=   PixtralTransformer.__init__B  s\    hh))+v//0AKK4V<= 1&+#r2   r   r   r   output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSn	Un
U R                   H2  nU(       a  X4-   nU" U
U4UUS.UD6nUS   n
U(       d  M*  XS   4-   n	M4     U(       a  X4-   nU(       d  [        S XU	4 5       5      $ [        XU	S9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embeddings which serve as input to the Transformer.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr   )r   r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frd   r   ).0vs     r0   	<genexpr>-PixtralTransformer.forward.<locals>.<genexpr>  s     e$Sq$Ss   	)last_hidden_stater   
attentions)rI   r   r   use_return_dictr   r   r   )rH   inputs_embedsr   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputss                r0   rb   PixtralTransformer.forwardJ  s
   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[M#!/2B!B) %8"3	
 M *!,M  !/3C2E!E )   +.>>Ne]N$Seee+Vd
 	
r2   )rI   r   r   )NNNNN)re   rf   rg   rh   r=   r   r   rj   r   r   r   r	   r   r   rb   rm   rn   ro   s   @r0   r   r   A  s    , 26KO,0/3&*?
 !.?
 &eELL%,,,F&GH	?

 $D>?
 'tn?
 d^?
 -.?
 
uo%	&?
 ?
r2   r   c                   X    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrS/rSr
SrSrSr	S rSrg	)
PixtralPreTrainedModeli  rI   modelpixel_valuesTr   c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g g )Nr   )r   stdr9   )rI   initializer_rangerX   r   r   Conv2dr   datanormal_r   zero_r   fill_)rH   r|   r  s      r0   _init_weights$PixtralPreTrainedModel._init_weights  s    kk++fryy"))455MM&&CS&9{{&  &&( '//MM$$S) 0r2   r   N)re   rf   rg   rh   r   rk   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr  rm   r   r2   r0   r  r    sU    $O&*#"&N01N"&*r2   r  c                    UR                   nUR                  nUR                  S   n[        R                  " U5      R
                  n[        R                  " XD4XRUS9n[        R                  " U 5      R                  S5      n[        R                  " S/U S S -   5      R                  S5      n[        X5       H  u  pSXiU
2X24'   M     US S S S 2S S 24   R                  UR                  S   SSS5      nU$ )Nr   )
fill_valuerW   r:   r   r   )rW   r:   r   r   finfominfulltensorcumsumzipexpand)r&   r!  rW   r:   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartends              r0   generate_block_attention_maskr,    s    LLE]]Fll1oGKK""E**g/EW]^KLL!23::2>MllA3):3B)?#?@GGKO/9
,-#Iuy() : dD!Q./66v||A2rRKr2   c                      ^  \ rS rSrSrU 4S jrS r\\    SS\	R                  S\\	R                     S\\   S\\   S	\\   S
\\   S\\\4   4S jj5       5       rSrU =r$ )PixtralVisionModeli  vision_encoderc                 n  > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  UR                  UR                  SS9U l        UR                  U l        [        UR                  SS9U l
        [        U5      U l        [        U5      U l        U R                  5         g )NF)in_channelsout_channelskernel_sizestrider   r   r   )r<   r=   rI   r   r  num_channelsr   rC   
patch_convr   ln_prer   transformerr4   patch_positional_embedding	post_initr   s     r0   r=   PixtralVisionModel.__init__  s     ))++++))$$
 !++$V%7%7TB-f5*@*H'r2   c                     U R                   $ rd   )r6  r   s    r0   get_input_embeddings'PixtralVisionModel.get_input_embeddings  s    r2   r	  image_sizesr   r   r   r   r   c           
      <   Uc  UR                   u  ppX4/U-  nU R                  U5      n[        X5       VVs/ s H1  u  pUSS US   U R                  -  2S US   U R                  -  24   PM3     nnn[        R
                  " U Vs/ s H  nUR                  S5      R                  PM      snSS9R                  S5      nU R                  U5      n[        XR                  R                  U R                  R                  -  S9nUUS'   U R                  UU5      nU R                  R                  S:X  a  S nO:[        U Vs/ s H"  nUR                   S   UR                   S	   -  PM$     snU5      nU R                   " U4UUUUS
S.UD6$ s  snnf s  snf s  snf )N.r   r   r   )r'   r`   r   r   r   T)r   r   r   r   r   )r   r6  r#  rC   r   r%   flattenTru   r7  r1   rI   rB   r9  r   r,  r8  )rH   r	  r?  r   r   r   argsr   r   r   r*   r+   patch_embedsembedr   r&   r   r`   r   r   s                       r0   rb   PixtralVisionModel.forward  s    +7+=+=(J6"?+j8K |4  #<=
= #5$q'T__457U$q'T__:T7UUV= 	 
 yy:K!L:KQ!))A,..:K!LRST^^_`a{{<0 0)?)?4;;CYCY)Y
 ".~"==lLY;;++/BB!N:4EF4Eqqwwr{*4EFN 
) 3!5/
 
 	
3
 "M  Gs   8F%F)F)rI   r7  r6  r9  rC   r8  )NNNN)re   rf   rg   rh   r  r=   r=  r   r   r   rj   r   r   r   r	   r   r   r   rb   rm   rn   ro   s   @r0   r.  r.    s    ("  /3/3,0&*1
ll1
 ell+1
 'tn	1

 $D>1
 d^1
 -.1
 
uo%	&1
  1
r2   r.  )Nr   )r   )1ri   collections.abcr   typingr   r   r   r   activationsr   modeling_flash_attention_utilsr	   modeling_layersr
   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerre   loggerr1   Moduler4   rs   r{   rj   rD   r   r   r   r   r   r   r  r,  r.  __all__r   r2   r0   <module>rV     so    $ "   ! B 9 / 6 F & > > 6 
		H	% 2<RYY 2<l(F %II%<<% 
% <<	%
 U\\*% % %.F)ryy F)T "JRYY J(/6 /dH
 H
V *_ * *2  J
/ J
 J
Z  !9
:r2   