
    cCiXC                        S SK r S SKJrJr  S SKrS SKJr  SSKJr  SSKJ	r	  SSK
Jr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'  S\RP                  S\)4S jr* " S S\!5      r+ " S S\ 5      r, " S S\5      r- " S S\5      r. " S S \$5      r/ " S! S"\5      r0 " S# S$\5      r1 " S% S&\#5      r2 " S' S(\Rf                  5      r4 " S) S*\Rj                  5      r6 " S+ S,\5      r7 " S- S.\75      r8 " S/ S0\5      r9\ " S1 S2\\	5      5       r:/ S3Qr;g)4    N)OptionalUnion)nn   )Cache)GenerationMixin)BaseModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r   r   y_softindexy_hardrets         a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmaxr/   %   sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJ    c                       \ rS rSrSrg)Ovis2ModelOutputWithPast/    N__name__
__module____qualname____firstlineno____static_attributes__r4   r0   r.   r2   r2   /       r0   r2   c                       \ rS rSrSrg)Ovis2CausalLMOutputWithPast3   r4   Nr5   r4   r0   r.   r=   r=   3   r;   r0   r=   c                       \ rS rSrSrg)Ovis2RMSNorm7   r4   Nr5   r4   r0   r.   r@   r@   7   r;   r0   r@   c                       \ rS rSrSrg)Ovis2VisionMLP;   r4   Nr5   r4   r0   r.   rC   rC   ;   r;   r0   rC   c                   p   ^  \ rS rSrS\4U 4S jjrS rS\R                  S\R                  4S jr
SrU =r$ )	Ovis2VisionEmbeddings?   configc                 n   > [         TU ]  U5        [        UR                  UR                  5      U l        g N)super__init__r@   hidden_sizerms_norm_epsrms_normselfrH   	__class__s     r.   rL   Ovis2VisionEmbeddings.__init__@   s*     $V%7%79L9LMr0   c                     [        S5      eNzNot needed for Ovis2)NotImplementedErrorrQ   s    r.   interpolate_pos_encoding.Ovis2VisionEmbeddings.interpolate_pos_encodingD   s    !"899r0   pixel_valuesreturnc                     U R                   R                  R                  nU R                  UR                  US95      nUR	                  S5      R                  SS5      nU R                  U5      nX@R                  U R                  5      -   nU$ )Ndtyper   r   )	patch_embeddingweightr^   toflatten	transposerO   position_embeddingposition_ids)rQ   rZ   target_dtypepatch_embeds
embeddingss        r.   forwardOvis2VisionEmbeddings.forwardG   s~    ++2288++LOO,O,OP!))!,66q!<
]]:.
"9"9$:K:K"LL
r0   )rO   )r6   r7   r8   r9   r   rL   rX   r%   FloatTensorTensorri   r:   __classcell__rR   s   @r.   rF   rF   ?   s9    N0 N:E$5$5 %,,  r0   rF   c                       \ rS rSrSrg)Ovis2VisionAttentionR   r4   Nr5   r4   r0   r.   rp   rp   R   r;   r0   rp   c                       \ rS rSrSrg)Ovis2VisionEncoderLayerV   r4   Nr5   r4   r0   r.   rs   rs   V   r;   r0   rs   c            	          ^  \ rS rSrS\4U 4S jjr\\ S	S\\	R                     S\\   S\4S jj5       5       rSrU =r$ )
Ovis2VisionEncoderZ   rH   c                    > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        g s  snf rJ   )rK   rL   r   
ModuleListrangenum_hidden_layersrs   layers)rQ   rH   _rR   s      r.   rL   Ovis2VisionEncoder.__init__[   sF     mmeTZTlTlNm$nNm%<V%DNm$no$ns   Aattention_maskkwargsr[   c                 P    UnU R                    H  nU" XB40 UD6nM     [        US9$ )Nlast_hidden_state)r|   r	   )rQ   inputs_embedsr   r   hidden_statesencoder_layers         r.   ri   Ovis2VisionEncoder.forward_   s3     &![[M)-R6RM ) ??r0   )r|   rJ   )r6   r7   r8   r9   r   rL   r   r   r   r%   rl   r   r   r	   ri   r:   rm   rn   s   @r.   rv   rv   Z   se    p0 p  26
@ !.
@ +,	
@
 

@  
@r0   rv   c                   h   ^  \ rS rSrS\4U 4S jjr\ SS\\R                     4S jj5       r
SrU =r$ )Ovis2VisionTransformern   rH   c                    > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        SU l        g )NF)rK   rL   rH   rF   rh   rv   encoderr@   rM   rN   rO   gradient_checkpointingrP   s     r.   rL   Ovis2VisionTransformer.__init__o   sM    /7)&1$V%7%79L9LM&+#r0   r   c                     U R                  U5      nU R                  " SUUS.UD6nUR                  nU R                  U5      n[	        US9$ )N)r   r   r   r4   )rh   r   r   rO   r	   )rQ   rZ   r   r   r   encoder_outputsr   s          r.   ri   Ovis2VisionTransformer.forwardw   s_     5+/<< ,
'),
 ,
 ,== MM*;<1BCCr0   )rH   rh   r   r   rO   rJ   )r6   r7   r8   r9   r   rL   r   r   r%   rl   ri   r:   rm   rn   s   @r.   r   r   n   s?    ,0 ,  26D !.D Dr0   r   c                   \   ^  \ rS rSrS\R
                  S\R
                  4U 4S jjrSrU =r$ )Ovis2VisualEmbeddingTable   visual_tokensr[   c                   > UR                   [        R                  [        R                  [        R                  [        R
                  [        R                  4;   a  [        TU ]!  U5      $ [        R                  " XR                  5      $ rJ   )r^   r%   int8int16int32int64longrK   ri   matmulr`   )rQ   r   rR   s     r.   ri   !Ovis2VisualEmbeddingTable.forward   sU    5::u{{EKKV[V`V`"aa7?=11||M;;77r0   r4   )	r6   r7   r8   r9   r%   rl   ri   r:   rm   rn   s   @r.   r   r      s#    8U\\ 8ell 8 8r0   r   c                   J    \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSrSrSrSrSrg)	Ovis2PreTrainedModel   rH   modelTrp   past_key_valuesr4   N)r6   r7   r8   r9   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr:   r4   r0   r.   r   r      sF    &*#/0"3 N!"&r0   r   c                      ^  \ rS rSr% \\S'   S\4U 4S jjrS\R                  S\	\R                  \R                  4   4S jrSrU =r$ )Ovis2VisionModel   rH   c                   > [         TU ]  U5        Xl        [        U5      U l        UR
                  U l        UR                  U l        [        R                  " UR                  UR                  -  UR                  -  U R                  U R
                  -
  SS9U l        [        R                  " U R                  U R
                  -
  5      U l        g NF)bias)rK   rL   rH   r   transformernum_visual_indicator_tokens
vocab_sizer   LinearrM   hidden_stridehead_linear	LayerNorm	head_normrP   s     r.   rL   Ovis2VisionModel.__init__   s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr0   rZ   r[   c           	         U R                   " U40 UD6nUS   nU R                  R                  S:  a  UR                  u  pVnU R                  R                  n[	        [
        R                  " U5      5      n	X-  U:w  a  [        S5      eXU-  -
  U-  n
[        R                  R                  USSSU
SU
4SS5      nX-  n	UR                  XYU-  XU-  X5      nUR                  SSSSSS5      nUR                  US	X-  U-  5      nU R                  U5      nU R                  U5      nU R                  R                  S
:X  a!  [        R                  R!                  US	SS9nU$ U R                  R                  S:X  a  [#        US	S9nU$ U R                  R                  S:X  a  [        R                  R%                  US	S9nW$ )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         gumbel_argmaxT)r   hard	st_argmaxr   r#   )r   rH   r   shapeintmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr/   r#   )rQ   rZ   r   outputsr   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer   
prob_tokens                r.   ri   Ovis2VisionModel.forward   s   ""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uF 1 9 9m3]mD[]j! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ  [[**k9%f"5J  [[**i7..v2.>Jr0   )rH   r   r   r   r   r   )r6   r7   r8   r9   r   r   rL   r%   rk   tuplerl   ri   r:   rm   rn   s   @r.   r   r      sK    Z0 Z!E$5$5 !E%,,X]XdXdJdDe ! !r0   r   c            !         ^  \ rS rSr0 rS\4U 4S jjrS\R                  S\R                  4S jr	\
\             SS\\R                     S\\R                     S\\R                     S	\\R                     S
\\   S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                  4   S\\\4   4S jj5       5       rSrU =r$ )
Ovis2Model   rH   c                 ~  > [         TU ]  U5        [        UR                  5      U l        [        UR                  R                  UR                  5      U l        UR                  R                  U l	        UR                  U l        UR                  U l
        [        R                  " UR                  5      U l        U ?g rJ   )rK   rL   r   vision_configvision_towerr   r   rM   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrP   s     r.   rL   Ovis2Model.__init__   s     ,V-A-AB'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K''33F4F4FG&r0   rZ   r[   c                     U R                  U5      nUR                  u  p4n[        R                  " X4U R                   R                  4UR
                  UR                  SUR                  S9n[        R                  " X&/SS9nU R                  U5      n[        R                  " U R                  U R                   R                  -
  U R                  [        R                  S9R                  UR                  5      nU R                  U5      nX(4$ )NF)r^   devicerequires_gradlayoutr   r   r]   )r   r   r%   zerosr   r^   r   r   catr   aranger   r   ra   )	rQ   rZ   image_features
batch_sizeimg_seq_lenr}   padding_tensorvisual_indicatorvisual_indicator_featuress	            r.   get_image_featuresOvis2Model.get_image_features   s     **<8%3%9%9"
d&7&7&S&ST &&!((!((
 N#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 %)$@$@AQ$R!88r0   	input_idsr   re   r   r   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionlogits_to_keepc                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eUc  U R	                  5       " U5      nUGb   U R                  US9u  nnU R                  UUUS9nUR                  UU5      n[        U R                  5       H  u  nnUcV  X`R	                  5       " [        R                  " U[        R                  UR                  S95      :H  nUR                  S5      nOUU:H  R                  UR                  5      nUR!                  5       (       d  M  UU   R#                  UU   5      R                  UR                  UR$                  5      UU'   M     U R&                  " S	UUUUUU	U
SUUS.
UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsrZ   )r   r   )r^   r   r   T)
r   re   r   r   r   r   r   r   r   r   )r   r   r   
attentionsimage_hidden_statesr4   )rH   r   r   r   get_input_embeddingsr   get_placeholder_maskmasked_scatter	enumerater   r%   tensorr   r   allra   any	expand_asr^   r   r2   r   r   r   r  )rQ   r   rZ   r   re   r   r   r   r   r   r   r   r   r   r   r   r   special_image_maskivisual_indicator_idmaskr   s                         r.   ri   Ovis2Model.forward   s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ  557	BM#8<8O8O]i8O8j5N5!%!:!:+- "; "
 *889K^\M*3D4S4S*T&&$(,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88::1!4"=#67M00-2E2EF "$' +U  %% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r0   )r   r   r   r   r   r   NNNNNNNNNNNNr   )r6   r7   r8   r9   _checkpoint_conversion_mappingr   rL   r%   rk   r   r   r   r   
LongTensorrl   r   boolr   r   r   r2   ri   r:   rm   rn   s   @r.   r   r      s   %'"	'{ 	'9''9 
		92  15481537+/59-1$(,0/3&*5934J
E,,-J
 u001J
 !.	J

 u//0J
 "%J
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 c5<</0J
  
u..	/!J
  J
r0   r   c            !         ^  \ rS rSr0 rS\4U 4S jjr\S 5       rS\	R                  4S jr\\             SS\\	R                     S\\	R                     S\\	R                      S	\\	R                     S
\\   S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\	R                   4   S\\\4   4S jj5       5       rSrU =r$ )Ovis2ForConditionalGenerationiL  rH   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  SS9U l        g r   )rK   rL   r   r   rM   r   lm_headrP   s     r.   rL   &Ovis2ForConditionalGeneration.__init__P  s0     yy!3!3V5F5FUSr0   c                     [        S5      erU   )AttributeErrorrW   s    r.   r   3Ovis2ForConditionalGeneration.multi_modal_projectorT  s    344r0   rZ   c                 4    U R                   R                  US9$ )Nr   )r   r   )rQ   rZ   s     r.   r   0Ovis2ForConditionalGeneration.get_image_featuresX  s    zz,,,,GGr0   r   r   re   r   r   r   r   r   r   r   r   r   r[   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
U R                  " SUUUUUUUU	U
SUS.UD6nUS   n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXpR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

>>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
>>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

>>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
>>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
"user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
```NT)r   rZ   r   re   r   r   r   r   r   r   r   r   )r   r   r   )lossr   r   r   r  r  r4   )rH   r   r   r   
isinstancer   slicer  loss_functionr   r   r=   r   r   r  r  )rQ   r   rZ   r   re   r   r   r   r   r   r   r   r   r   r   r   r   slice_indicesr   r  s                       r.   ri   %Ovis2ForConditionalGeneration.forward[  s4   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ** 
%)%+'/!5)
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r0   )r  r  )r6   r7   r8   r9   r  r   rL   propertyr   r%   rk   r   r   r   r   r  rl   r   r  r   r   r   r=   ri   r:   rm   rn   s   @r.   r  r  L  s   %'"T{ T 5 5Hu/@/@ H  15481537+/59-1$(,0/3&*5934R
E,,-R
 u001R
 !.	R

 u//0R
 "%R
   1 12R
 ))*R
 D>R
 $D>R
 'tnR
 d^R
 !!1!12R
 c5<</0R
  
u11	2!R
  R
r0   r  )r   r   r  )<r   typingr   r   r%   r   cache_utilsr   
generationr   modeling_outputsr	   modeling_utilsr
   processing_utilsr   utilsr   r   r   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r   rl   r   r/   r2   r=   r@   rC   rF   rp   rs   rv   Moduler   	Embeddingr   r   r   r   r  __all__r4   r0   r.   <module>r7     s/     "     ) / - & I I D  9 L j J ? C 	; 		"A 		< 		X 	2 &	> 		/ 	@ @(DRYY D<8 8'? '1+ 1hs
 s
l b
$A? b
 b
J Rr0   