
    cCi                         S SK r S SKJr  S SKJrJrJr  S SKrS SKJr  SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSK J!r!  SSK"J#r#J$r$  \\" SS9 " S S\5      5       5       r%\\" SS9 " S S\5      5       5       r&\" S5       " S S\RN                  5      5       r( " S S\RN                  5      r) " S S \RN                  5      r* SDS!\RN                  S"\RV                  S#\RV                  S$\RV                  S%\\RV                     S&\,S'\,4S( jjr- " S) S*\RN                  5      r. " S+ S,\RN                  5      r/ " S- S.\RN                  5      r0 " S/ S0\5      r1 " S1 S2\RN                  5      r2 " S3 S4\RN                  5      r3 " S5 S6\Rh                  5      r5 " S7 S8\5      r6S9\RV                  S:\74S; jr8 " S< S=\65      r9\" S>S9 " S? S@\65      5       r:\ " SA SB\6\5      5       r;/ SCQr<g)E    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple   )	AutoModel   )Ovis2ConfigOvis2VisionConfigzJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)Ovis2ModelOutputWithPast*   a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r    r   torchFloatTensor__annotations____static_attributes__r!       b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/ovis2/modeling_ovis2.pyr   r   *   s    	 8<%"3"34;r+   r   zQ
    Base class for Ovis2 causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Ovis2CausalLMOutputWithPast?   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr    r!   )r"   r#   r$   r%   r&   r0   r   r'   r(   r)   r1   r2   r
   r3   tupler4   r    r*   r!   r+   r,   r.   r.   ?   s     )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju001297;%"3"34;r+   r.   RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Ovis2RMSNorm]   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Ovis2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr'   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      r,   r<   Ovis2RMSNorm.__init___   s/     	ll5::k#:; #r+   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   Tkeepdim)	dtypetor'   float32powmeanrsqrtr@   r?   )rA   r3   input_dtypevariances       r,   forwardOvis2RMSNorm.forwardg   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r+   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r5   r?   shaper@   rA   s    r,   
extra_reprOvis2RMSNorm.extra_reprn   s*    ))*+6$2G2G1HIIr+   )r@   r?   )gư>)	r"   r#   r$   r%   r<   rR   rW   r*   __classcell__rD   s   @r,   r8   r8   ]   s    $;J Jr+   r8   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Ovis2VisionMLPr   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g Nbiasr;   r<   configrB   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnrA   rc   rD   s     r,   r<   Ovis2VisionMLP.__init__s       !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r+   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ Nri   rk   rg   rh   rA   xri   s      r,   rR   Ovis2VisionMLP.forward}   6    NN4;;t~~a/@#ADLLQRO#ST	r+   rk   rc   ri   rg   rB   rd   rh   r"   r#   r$   r%   r<   rR   r*   rY   rZ   s   @r,   r\   r\   r       0 r+   r\   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )Ovis2VisionEmbeddings   rc   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l
        U R
                  U R                  -  S-  U l        U R                  U l        [        R                  " U R                  U R                  5      U l        U R                  S[         R"                  " U R                  5      R%                  S5      SS9  ['        UR                  UR(                  5      U l        g )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r   rG   F)
persistent)r;   r<   rc   rB   	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr'   arangeexpandr8   rms_norm_epsrms_normrl   s     r,   r<   Ovis2VisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jop$V%7%79L9LMr+   pixel_valuesreturnc                     U R                   R                  R                  nU R                  UR                  US95      nUR	                  S5      R                  SS5      nU R                  U5      nX@R                  U R                  5      -   nU$ )NrJ   r   r   )	r   r?   rJ   rK   flatten	transposer   r   r   )rA   r   target_dtypepatch_embeds
embeddingss        r,   rR   Ovis2VisionEmbeddings.forward   s~    ++2288++LOO,O,OP!))!,66q!<
]]:.
"9"9$:K:K"LL
r+   )	rc   r   r   r   r   r   r   r   r   )r"   r#   r$   r%   r   r<   r'   r(   TensorrR   r*   rY   rZ   s   @r,   rz   rz      s4    N0 N*E$5$5 %,,  r+   rz   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrG   )dimrJ   )ptrainingr   r   )r'   matmulr   r   
functionalsoftmaxrL   rK   rJ   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r,   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r+   c            
          ^  \ rS rSrSrU 4S jr S	S\R                  S\\R                     S\	\R                  \\R                     4   4S jjr
SrU =r$ )
Ovis2VisionAttention   =Multi-headed attention from 'Attention Is All You Need' paperc                 h  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        g Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr`   r;   r<   rc   rB   r   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutr   	is_causalr   re   qkv_biask_projv_projq_projout_projrl   s     r,   r<   Ovis2VisionAttention.__init__   0   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr+   r3   r   r   c                 2   UR                   u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R	                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XEU5      R#                  5       nU R%                  U5      nX4$ z#Input shape: Batch x Time x Channelr   r   eager        )r   r   r   rU   r   r   r   viewr   r   r   r   rc   _attn_implementationr   r   r   r   r   reshaper   r   rA   r3   r   r   
batch_size
seq_lengthr   querieskeysvaluesattention_interfacer   r   s                r,   rR   Ovis2VisionAttention.forward   S    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0((r+   rc   r   r   r   r   r   r   r   r   r   r   rp   r"   r#   r$   r%   r&   r<   r'   r   r   r5   rR   r*   rY   rZ   s   @r,   r   r      [    GX, 26$)||$) !.$)
 
u||Xell33	4$) $)r+   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Ovis2MLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g r_   rb   rl   s     r,   r<   Ovis2MLP.__init__   rn   r+   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rp   rq   rr   s      r,   rR   Ovis2MLP.forward  ru   r+   rv   rw   rZ   s   @r,   r   r      rx   r+   r   c            
          ^  \ rS rSrSrU 4S jr S	S\R                  S\\R                     S\	\R                  \\R                     4   4S jjr
SrU =r$ )
Ovis2Attentioni  r   c                 h  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        g r   r   rl   s     r,   r<   Ovis2Attention.__init__
  r   r+   r3   r   r   c                 2   UR                   u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R	                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XEU5      R#                  5       nU R%                  U5      nX4$ r   r   r   s                r,   rR   Ovis2Attention.forward  r   r+   r   rp   r   rZ   s   @r,   r   r     r   r+   r   c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\\R                     S\	\
   S\R                  4S jjrS	rU =r$ )Ovis2VisionEncoderLayeriD  rc   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g rp   )r;   r<   r   	attentionr   ffnr8   rB   r   	rms_norm1	rms_norm2rl   s     r,   r<    Ovis2VisionEncoderLayer.__init__E  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr+   r3   r   r   r   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r3   r   r!   )r   r   r   r   )rA   r3   r   r   norm_hidden_statesr   _
mlp_outputs           r,   rR   Ovis2VisionEncoderLayer.forwardL  sa     "^^M:r6Hrkqr%3!^^M:XX01
%2r+   )r   r   r   r   rp   )r"   r#   r$   r%   r   r<   r'   r   r   r   r   rR   r*   rY   rZ   s   @r,   r   r   D  s^    O0 O 26|| !. +,	
 
 r+   r   c            	          ^  \ rS rSrSrS\4U 4S jjr\\ S
S\	\
R                     S\\   S\4S jj5       5       rS	rU =r$ )Ovis2VisionEncoderi]  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Ovis2VisionEncoderLayer`].

Args:
    config: Ovis2VisionConfig
rc   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r;   r<   rc   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rA   rc   r   rD   s      r,   r<   Ovis2VisionEncoder.__init__f  sT    mmeTZTlTlNm$nNm%<V%DNm$no&+# %os   A&r   r   r   c                 P    UnU R                    H  nU" XB40 UD6nM     [        US9$ )Nlast_hidden_state)r   r   )rA   inputs_embedsr   r   r3   encoder_layers         r,   rR   Ovis2VisionEncoder.forwardm  s3     &![[M)-R6RM ) ??r+   )rc   r   r   rp   )r"   r#   r$   r%   r&   r   r<   r   r   r   r'   r   r   r   r   rR   r*   rY   rZ   s   @r,   r   r   ]  sh    ,0 ,  26
@ !.
@ +,	
@
 

@  
@r+   r   c                   h   ^  \ rS rSrS\4U 4S jjr\ SS\\R                     4S jj5       r
SrU =r$ )Ovis2VisionTransformeri|  rc   c                    > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        SU l        g r   )r;   r<   rc   rz   r   r   encoderr8   rB   r   r   r   rl   s     r,   r<   Ovis2VisionTransformer.__init__}  sM    /7)&1$V%7%79L9LM&+#r+   r   c                     U R                  U5      nU R                  " SUUS.UD6nUR                  nU R                  U5      n[	        US9$ )N)r  r   r  r!   )r   r
  r  r   r   )rA   r   r   r   r3   encoder_outputsr  s          r,   rR   Ovis2VisionTransformer.forward  s_     5+/<< ,
'),
 ,
 ,== MM*;<1BCCr+   )rc   r   r
  r   r   rp   )r"   r#   r$   r%   r   r<   r   r   r'   r   rR   r*   rY   rZ   s   @r,   r  r  |  s?    ,0 ,  26D !.D Dr+   r  c                   \   ^  \ rS rSrS\R
                  S\R
                  4U 4S jjrSrU =r$ )Ovis2VisualEmbeddingTablei  visual_tokensr   c                   > UR                   [        R                  [        R                  [        R                  [        R
                  [        R                  4;   a  [        TU ]!  U5      $ [        R                  " XR                  5      $ rp   )rJ   r'   int8int16int32int64longr;   rR   r   r?   )rA   r  rD   s     r,   rR   !Ovis2VisualEmbeddingTable.forward  sU    5::u{{EKKV[V`V`"aa7?=11||M;;77r+   r!   )	r"   r#   r$   r%   r'   r   rR   r*   rY   rZ   s   @r,   r  r    s#    8U\\ 8ell 8 8r+   r  c                   J    \ rS rSr% \\S'   SrSrS/rSr	Sr
SrSrSrSrSrSrg)	Ovis2PreTrainedModeli  rc   modelTr   r2   r!   N)r"   r#   r$   r%   r   r)   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr*   r!   r+   r,   r  r    sF    &*#/0"3 N!"&r+   r  r1   r   c                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NTrH   r   )memory_formatg      ?)r   maxr'   
zeros_likelegacy_contiguous_formatscatter_detach)r1   r   y_softindexy_hardrets         r,   hard_softmaxr1    sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJr+   c                      ^  \ rS rSr% \\S'   S\4U 4S jjrS\R                  S\	\R                  \R                  4   4S jrSrU =r$ )Ovis2VisionModeli  rc   c                   > [         TU ]  U5        Xl        [        U5      U l        UR
                  U l        UR                  U l        [        R                  " UR                  UR                  -  UR                  -  U R                  U R
                  -
  SS9U l        [        R                  " U R                  U R
                  -
  5      U l        g NFr`   )r;   r<   rc   r  transformernum_visual_indicator_tokens
vocab_sizer   re   rB   hidden_stridehead_linear	LayerNorm	head_normrl   s     r,   r<   Ovis2VisionModel.__init__  s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr+   r   r   c           	         U R                   " U40 UD6nUS   nU R                  R                  S:  a  UR                  u  pVnU R                  R                  n[	        [
        R                  " U5      5      n	X-  U:w  a  [        S5      eXU-  -
  U-  n
[        R                  R                  USSSU
SU
4SS5      nX-  n	UR                  XYU-  XU-  X5      nUR                  SSSSSS5      nUR                  US	X-  U-  5      nU R                  U5      nU R                  U5      nU R                  R                  S
:X  a!  [        R                  R!                  US	SS9nU$ U R                  R                  S:X  a  [#        US	S9nU$ U R                  R                  S:X  a  [        R                  R%                  US	S9nW$ )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         rG   gumbel_argmaxT)r   hard	st_argmaxr   r   )r6  rc   r9  rU   intmathsqrtr   r   r   padr   permuter:  r<  tokenize_functiongumbel_softmaxr1  r   )rA   r   r   outputsr  
num_imagesseq_len
hidden_dimr9  sqrt_lpad_sizer1   
prob_tokens                r,   rR   Ovis2VisionModel.forward  s   ""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uF 1 9 9m3]mD[]j! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ  [[**k9%f"5J  [[**i7..v2.>Jr+   )rc   r:  r<  r7  r6  r8  )r"   r#   r$   r%   r   r)   r<   r'   r(   r5   r   rR   r*   rY   rZ   s   @r,   r3  r3    sK    Z0 Z!E$5$5 !E%,,X]XdXdJdDe ! !r+   r3  zu
    The Ovis2 model which consists of a vision backbone and a language model, without a language modeling head.
    c            !       H  ^  \ rS rSr0 rS\4U 4S jjrS rS rS r	S r
S\R                  S	\R                  4S
 jrS\R                  S\R                  S\R                  4S jr\\             SS\\R                     S\\R                     S\\R&                     S\\R                     S\\   S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R&                  4   S	\\\4   4S jj5       5       rSrU =r$ )
Ovis2Modeli  rc   c                   > [         TU ]  U5        [        UR                  5      U l        [
        R                  " UR                  5      U l        [        UR                  R                  UR                  5      U l        UR                  R                  U l        UR                  U l
        UR                  U l        U R                  5         g rp   )r;   r<   r3  vision_configvision_towerr   from_configtext_configlanguage_modelr  r8  rB   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_ids	post_initrl   s     r,   r<   Ovis2Model.__init__  s     ,V-A-AB'33F4F4FG'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K'r+   c                 6    U R                   R                  5       $ rp   )r\  get_input_embeddingsrV   s    r,   rc  Ovis2Model.get_input_embeddings  s    ""7799r+   c                 :    U R                   R                  U5        g rp   )r\  set_input_embeddingsrA   r   s     r,   rf  Ovis2Model.set_input_embeddings  s    007r+   c                     Xl         g rp   r\  rA   decoders     r,   set_decoderOvis2Model.set_decoder  s    %r+   c                     U R                   $ rp   rj  rV   s    r,   get_decoderOvis2Model.get_decoder
  s    """r+   r   r   c                     U R                  U5      nUR                  u  p4n[        R                  " X4U R                   R                  4UR
                  UR                  SUR                  S9n[        R                  " X&/SS9nU R                  U5      n[        R                  " U R                  U R                   R                  -
  U R                  [        R                  S9R                  UR                  5      nU R                  U5      nX(4$ )a@  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, list[int]]`, *optional*):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    vision_feature_select_strategy (`str`, *optional*):
        The feature selection strategy used to select the vision feature from the vision backbone.
        Can be one of `"default"` or `"full"`
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
F)rJ   devicerequires_gradlayoutr   rE  r   )rY  rU   r'   zerosr7  rJ   rs  ru  catr]  r   r^  r  rK   )	rA   r   image_featuresr   img_seq_lenr   padding_tensorvisual_indicatorvisual_indicator_featuress	            r,   get_image_featuresOvis2Model.get_image_features  s    ( **<8%3%9%9"
d&7&7&S&ST &&!((!((
 N#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 %)$@$@AQ$R!88r+   	input_idsr  rx  c           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
rJ   rs  rG   r   r   z6Image features and image tokens do not match: tokens: z, features )rc  r'   tensorrc   image_token_idr  rs  allsum	unsqueeze	expand_asrK   rU   numelr   )rA   r  r  rx  special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_maskOvis2Model.get_placeholder_mask6  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r+   r   r   r2   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionlogits_to_keepc                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eUc  U R	                  5       " U5      nUGb   U R                  US9u  nnU R                  UUUS9nUR                  UU5      n[        U R                  5       H  u  nnUcV  X`R	                  5       " [        R                  " U[        R                  UR                  S95      :H  nUR                  S5      nOUU:H  R                  UR                  5      nUR!                  5       (       d  M  UU   R#                  UU   5      R                  UR                  UR$                  5      UU'   M     U R&                  " S	UUUUUU	U
SUUS.
UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )r  rx  r  rG   T)
r   r   r2   r  r  r  r  r  r  r  )r  r2   r3   r4   r    r!   )rc   r  r  r   rc  r}  r  masked_scatter	enumerater_  r'   r  r  rs  r  rK   anyr  rJ   r\  r   r  r2   r3   r4   )rA   r  r   r   r   r2   r  r  r  r  r  r  r  r  r   rx  r|  r  ivisual_indicator_idmaskrM  s                         r,   rR   Ovis2Model.forwardN  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ  557	BM#8<8O8O]i8O8j5N5!%!:!:+- "; "
 *889K^\M*3D4S4S*T&&$(,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88::1!4"=#67M00-2E2EF "$' +U  %% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r+   )r\  rY  r]  r_  r^  r8  NNNNNNNNNNNNr   )r"   r#   r$   r%   _checkpoint_conversion_mappingr   r<   rc  rf  rm  rp  r'   r(   r}  
LongTensorr  r   r   r   r   r
   boolr   rF  r5   r   rR   r*   rY   rZ   s   @r,   rV  rV    s    &("	{ 	:8&#'9'''9 
		'9R"))":?:K:K"]b]n]n"0  15481537+/59-1$(,0/3&*5934J
E,,-J
 u001J
 !.	J

 u//0J
 "%J
   1 12J
 ))*J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 c5<</0J
  
u..	/!J
  J
r+   rV  c            !       R  ^  \ rS rSr0 rS/rS\4U 4S jjrS rS r	S\
R                  4S jrS	 rS
 rS\R                   4S jr\S 5       r\S 5       r\S 5       r\\             SS\\R2                     S\\R                      S\\R4                     S\\R2                     S\\   S\\R                      S\\R2                     S\\   S\\   S\\   S\\   S\\R2                     S\\\R4                  4   S\\\ 4   4S jj5       5       r!      S U 4S jjr"Sr#U =r$$ )!Ovis2ForConditionalGenerationi  zlm_head.weightrc   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g r5  )
r;   r<   rV  r  r   re   rB   r8  lm_headr`  rl   s     r,   r<   &Ovis2ForConditionalGeneration.__init__  sF     '
yy!3!3V5F5FUSr+   c                 6    U R                   R                  5       $ rp   )r  rc  rV   s    r,   rc  2Ovis2ForConditionalGeneration.get_input_embeddings  s    zz..00r+   c                 :    U R                   R                  U5        g rp   )r  rf  rg  s     r,   rf  2Ovis2ForConditionalGeneration.set_input_embeddings  s    

''.r+   r   c                     U R                   $ rp   )r  rV   s    r,   get_output_embeddings3Ovis2ForConditionalGeneration.get_output_embeddings  s    ||r+   c                 :    U R                   R                  U5        g rp   )r  rm  rk  s     r,   rm  )Ovis2ForConditionalGeneration.set_decoder  s    

w'r+   c                 6    U R                   R                  5       $ rp   )r  rp  rV   s    r,   rp  )Ovis2ForConditionalGeneration.get_decoder  s    zz%%''r+   r   c                 4    U R                   R                  US9$ )Nr  )r  r}  )rA   r   s     r,   r}  0Ovis2ForConditionalGeneration.get_image_features  s    zz,,,,GGr+   c                 .    U R                   R                  $ rp   )r  r\  rV   s    r,   r\  ,Ovis2ForConditionalGeneration.language_model  s    zz(((r+   c                 .    U R                   R                  $ rp   )r  rY  rV   s    r,   rY  *Ovis2ForConditionalGeneration.vision_tower  s    zz&&&r+   c                     [        S5      e)NzNot needed for Ovis2)AttributeErrorrV   s    r,   multi_modal_projector3Ovis2ForConditionalGeneration.multi_modal_projector  s    344r+   r  r   r   r2   r  r  r  r  r  r  r  r  c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
U R                  " SUUUUUUUU	U
SUS.UD6nUS   n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXpR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

>>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
>>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

>>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
>>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
"user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
```NT)r  r   r   r   r2   r  r  r  r  r  r  r   )r1   r  r8  )r0   r1   r2   r3   r4   r    r!   )rc   r  r  r  
isinstancerF  slicer  loss_functionr[  r8  r.   r2   r3   r4   r    )rA   r  r   r   r   r2   r  r  r  r  r  r  r  r  r   rM  r3   slice_indicesr1   r0   s                       r,   rR   %Ovis2ForConditionalGeneration.forward  s4   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ** 
%)%+'/!5)
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r+   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)r2   r  r   r  r  r   r   )r;   prepare_inputs_for_generation)rA   r  r2   r  r   r   r  r  r   model_inputsrD   s             r,   r  ;Ovis2ForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !! ,8(r+   )r  r  r  )NNNNNN)%r"   r#   r$   r%   r  _tied_weights_keysr   r<   rc  rf  r   Moduler  rm  rp  r'   r(   r}  propertyr\  rY  r  r   r   r   r  r   r
   r  r   rF  r5   r.   rR   r  r*   rY   rZ   s   @r,   r  r    s   %'"*+{ 1/ryy ((Hu/@/@ H ) ) ' ' 5 5  15481537+/59-1$(,0/3&*5934R
E,,-R
 u001R
 !.	R

 u//0R
 "%R
   1 12R
 ))*R
 D>R
 $D>R
 'tnR
 d^R
 !!1!12R
 c5<</0R
  
u11	2!R
  R
n  r+   r  )r  rV  r  )r   )=rG  dataclassesr   typingr   r   r   r'   r   activationsr	   cache_utilsr
   
generationr   integrationsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   autor   configuration_ovis2r   r   r   r.   r  r8   r\   rz   r   floatr   r   r   r   r   r   r  r   r  r  rF  r1  r3  rV  r  __all__r!   r+   r,   <module>r     s=  ,  ! , ,   !   ) 7 9 H F & V V  ? 
<6 < < 
<+ < <0 Y'J299 J (J(RYY  BII P %II%<<% 
% <<	%
 U\\*% % %.:)299 :)zryy  :)RYY :)z8 2@ @>DRYY D<8 8'? ' C 1+ 1h 
g
% g

g
T [$8/ [ [| Rr+   