
    cCi48                        S SK JrJr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  S	S
KJrJrJrJrJrJr  S	SKJr  SSKJr  \R0                  " \5      r " S S\5      r " S S\R8                  5      r " S S\R8                  5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r#/ SQr$g)    )OptionalUnionN)nn   )ACT2FN)Cache)Unpack)logging   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                       \ rS rSrSrg)Mistral3RMSNorm(    N__name__
__module____qualname____firstlineno____static_attributes__r       g/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   (       r   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
Mistral3PatchMerger,   z4
Learned merging of spatial_merge_size ** 2 patches
configc                   > [         TU ]  5         Xl        UR                  R                  nUR
                  U l        U R                  R                  R                  U l        [        R                  " X R
                  S-  -  USS9U l	        g )Nr   Fbias)
super__init__r%   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr%   r,   	__class__s      r    r*   Mistral3PatchMerger.__init__1   sn    **66"(";";++33>>YY{5L5La5O'OQ\chir   image_featuresimage_sizesreturnc                    U Vs/ s H&  o3S   U R                   -  US   U R                   -  4PM(     nnU VVs/ s H	  u  pEXE-  PM     nnnUR                  S   n/ n[        UR                  U5      5       H  u  pX)   u  pEU
R	                  XEU5      R                  SSS5      R                  S5      n[        R                  R                  R                  XR                  U R                  S9nUR	                  XpR                  S-  -  S5      R                  5       nUR                  U5        M     [        R                  " USS9nU R                  U5      nU$ s  snf s  snnf )Nr   r   r   )kernel_sizestridedim)r.   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr-   tappendcatr0   )r1   r4   r5   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r    forwardMistral3PatchMerger.forward:   s[   cn
cnU_]doo-z!}/OPcn 	 
 /::kdaAEk:  $)2>3G3GHX3Y)Z%K+DA%**13;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4( *[ ?:++N;)
 ;s
   -EE!)r%   r0   r.   r-   )r   r   r   r   __doc__r   r*   rC   TensorrS   r   __classcell__r2   s   @r    r#   r#   ,   sD    j~ jell  RWR^R^  r   r#   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Mistral3MultiModalProjectorR   r%   c                   > [         TU ]  5         [        UR                  R                  UR
                  R                  S9U l        [        U5      U l	        [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " UR                  R                  U-  UR
                  R                  UR                   S9U l        [$        UR&                     U l        [        R                  " UR
                  R                  UR
                  R                  UR                   S9U l        g )N)epsr   r'   )r)   r*   r   r+   r,   text_configrms_norm_epsnormr#   patch_merger
isinstancevision_feature_layerintlenr   r/   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r1   r%   num_feature_layersr2   s      r    r*   $Mistral3MultiModalProjector.__init__S   s    #F$8$8$D$D&J\J\JiJij	/7",V-H-H#"N"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r   r4   r5   c                     U R                  U5      nU R                  X5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ N)r`   ra   rg   ri   rj   )r1   r4   r5   hidden_statess       r    rS   #Mistral3MultiModalProjector.forwardc   sP    >2**>Gn5/m4r   )ri   rg   rj   r`   ra   )r   r   r   r   r   r*   rC   rV   rS   r   rW   rX   s   @r    rZ   rZ   R   s/    
~ 
 ell   r   rZ   c                       \ rS rSrSrg)Mistral3CausalLMOutputWithPastl   r   Nr   r   r   r    rr   rr   l   r!   r   rr   c                       \ rS rSrSrg)Mistral3ModelOutputWithPastp   r   Nr   r   r   r    ru   ru   p   r!   r   ru   c                       \ rS rSrSrg)Mistral3PreTrainedModelt   r   Nr   r   r   r    rx   rx   t   r!   r   rx   c            !          \ rS rSr SS\R
                  S\R                  S\\\	\
\	   4      4S jjr             SS\\R                     S\\R
                     S\\R                     S	\\R                     S
\\   S\\R
                     S\\\	\
\	   4      S\\   S\\   S\\   S\\   S\\R                     S\\R                     S\\   S\\\4   4S jjrSrg)Mistral3Modelx   Npixel_valuesr5   rc   c                    Ub  UOU R                   R                  nUR                  5        VVs0 s H  u  pVUc  M
  XV_M     nnnU R                  " U4USS.UD6n[	        U[
        5      (       a  UR                  U   nO3U V	s/ s H  oR                  U	   PM     n
n	[        R                  " U
SS9nU R                  UR                  S5      U5      nU R                  R                  U R                   R                  -  nU VVs/ s H  u  pX-  X-  -  PM     nnn[        R                  " UR                  S5      U5      nU$ s  snnf s  sn	f s  snnf )a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, list[int]]`, *optional*):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    image_sizes (`torch.Tensor`, *optional*):
        Tensor containing the image sizes as returned by the processor.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
T)r5   output_hidden_statesr8   r;   r   )r%   rc   itemsvision_towerrb   rd   ro   rC   rH   multi_modal_projectorsqueezer.   r-   r?   )r1   r}   r5   rc   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolr4   downsample_ratioheightwidthsplit_sizess                   r    get_image_features Mistral3Model.get_image_featuresy   sK   . %9$D $++JjJj 	 $*<<>C>41Q$!$>C)),uKfjuntu *C00%2%@%@AU%V"OcdOc)229=OcGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XXgrsgrVcV\2u7PQgrs^%;%;A%>L D e
 ts   	EEEE	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsr   return_dictcache_positionr   r6   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbl  U R                  UUUS9n[        R                  " USS9R                  UR                  UR                  5      nU R                  XUS9nUR                  UU5      nU R                  " S	UUUUUU	U
SUS.	UD6n[!        UR"                  UR$                  UR&                  UR(                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)r}   rc   r5   r   r;   )r   r4   T)	r   r   r   r   r   r   r   r   r   )last_hidden_stater   ro   
attentionsimage_hidden_statesr   )r%   r   r   use_return_dictrc   
ValueErrorget_input_embeddingsr   rC   rH   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelru   r   r   ro   r   )r1   r   r}   r   r   r   r   rc   r   r   r   r   r   r5   r   r4   special_image_maskoutputss                     r    rS   Mistral3Model.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ  557	BM#!44)%9' 5 N
 #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r   r   rn   )NNNNNNNNNNNNN)r   r   r   r   rC   FloatTensorrV   r   r   rd   listr   
LongTensorr   boolr	   r   tupleru   rS   r   r   r   r    r{   r{   x   s   
 AE	)'') \\) 'uS$s)^'<=	)Z 15481537+/59@D$(,0/3&*59.2?
E,,-?
 u001?
 !.	?

 u//0?
 "%?
   1 12?
 'uS$s)^'<=?
 D>?
 $D>?
 'tn?
 d^?
 !!1!12?
 ell+?
 +,?
  
u11	2!?
 ?
r   r{   c            #          \ rS rSr SS\R
                  S\R                  S\\\	\
\	   4      4S jjr              SS\\R                     S\\R
                     S\\R                     S	\\R                     S
\\   S\\R
                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\	\R                  4   S\\R                     S\\   S\\\4   4 S jjrSrg) Mistral3ForConditionalGeneration   Nr}   r5   rc   c                 B    U R                   R                  " SUUUS.UD6$ )N)r}   r5   rc   r   )modelr   )r1   r}   r5   rc   r   s        r    r   3Mistral3ForConditionalGeneration.get_image_features   s3     zz,, 
%#!5
 	
 	
r   r   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr   r6   c                 F   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUUU	U
SUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXpR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

>>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
>>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

>>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is the image?The image depicts two cats lying on a pink blanket."
```NT)r   r}   r   r   r   r   r   r   r   r   r   r5   r   )logitsr   
vocab_size)lossr   r   ro   r   r   r   )r%   r   r   r   r   rb   rd   slicelm_headloss_functionr^   r   rr   r   ro   r   r   )r1   r   r}   r   r   r   r   r   r   r   r   r   r   r   r5   r   r   ro   slice_indicesr   r   s                        r    rS   (Mistral3ForConditionalGeneration.forward   sM   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%)%+'/!5)#
 
   
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r   r   rn   )NNNNNNNNNNNNr   N)r   r   r   r   rC   r   rV   r   r   rd   r   r   r   r   r   r	   r   r   rr   rS   r   r   r   r    r   r      s   
 AE	
''
 \\
 'uS$s)^'<=	
  15481537+/59-1$(,0/3&*5934.2U
E,,-U
 u001U
 !.	U

 u//0U
 "%U
   1 12U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
 c5<</0U
 ell+U
  +,!U
" 
u44	5#U
 U
r   r   )r{   rx   r   )%typingr   r   rC   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler#   rZ   rr   ru   rx   r{   r   __all__r   r   r    <module>r      s     #   !   &   6 2 
		H	%	n 	#")) #L")) 4	%@ 		": 		2 	k
J k
\d
'D d
Nr   