
    cCi                     F   S SK rS SKJr  S SKJrJrJr  S SKrS SK	J
r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)J*r*  \" S5       " S S\
RV                  5      5       r, SDS\
RV                  S\RZ                  S\RZ                  S\RZ                  S\\RZ                     S\.S\.4S jjr/ " S S\
RV                  5      r0\\!" S S!9 " S" S#\5      5       5       r1 " S$ S%\
RV                  5      r2 " S& S'\
RV                  5      r3 " S( S)\
RV                  5      r4\
Rj                  \,S*.r6 " S+ S,\5      r7 " S- S.\
RV                  5      r8\! " S/ S0\5      5       r9\! " S1 S2\95      5       r:\! " S3 S4\5      5       r; " S5 S6\
RV                  5      r<\\!" S7S!9 " S8 S9\5      5       5       r=\!" S:S!9 " S; S<\;5      5       r>\\!" S=S!9 " S> S?\5      5       5       r?\!" S@S!9 " SA SB\;\5      5       r@/ SCQrAg)E    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple	torch_int)check_model_inputs   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )InternVLVisionRMSNorm,   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z4
InternVLVisionRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/internvl/modeling_internvl.pyr$   InternVLVisionRMSNorm.__init__.   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   T)keepdim)	dtypetor'   float32powmeanrsqrtr*   r)   )r+   hidden_statesinput_dtypevariances       r/   forwardInternVLVisionRMSNorm.forward6   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r1   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler)   shaper*   r+   s    r/   
extra_repr InternVLVisionRMSNorm.extra_repr=   s*    ))*+6$2G2G1HIIr1   )r*   r)   )gư>)	__name__
__module____qualname____firstlineno__r$   r>   rD   __static_attributes____classcell__r.   s   @r/   r    r    ,   s    $;J Jr1   r    modulequerykeyvalueattention_maskscalingdropoutc                    UnUn	[         R                  " XR                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R
                  R                  U
SS9n
[        R
                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r   r3   dim)ptrainingr   )
r'   matmul	transposerB   r%   
functionalsoftmaxrS   rY   
contiguous)rM   rN   rO   rP   rQ   rR   rS   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r/   eager_attention_forwardre   A   s     JL<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1 ==((2(>L==((6??([L,,|:K''1-88:K$$r1   c                      ^  \ rS rSrSrS\4U 4S jjr S
S\R                  S\	\R                     S\
\   4S jjrS	rU =r$ )InternVLVisionAttention\   z+Attention Class for InternVL Vision Encoderconfigc                 $  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  5      U l        US:  a  [        R*                  " U5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      U l        g [        R,                  " 5       U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r#   r$   ri   r,   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr%   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr    q_normk_norm)r+   ri   proj_dropoutqk_normr.   s       r/   r$    InternVLVisionAttention.__init___   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr1   r;   rQ   r_   c                    UR                  5       u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	U R	                  U5      nU R                  U5      nUR                  XEU R                  U R                  5      R                  SS5      nUR                  XEU R                  U R                  5      R                  SS5      nU	R                  XEU R                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	U4U R                  (       d  SOU R                   U R"                  SS.UD6u  pUR                  XEU R$                  5      nU R'                  U5      nU R)                  U5      nX4$ )Nr   r   eager        F)rS   rR   rv   )sizery   rz   r{   r   r   reshapero   rp   r[   viewre   ri   _attn_implementationr   rY   rs   rr   rm   r|   rt   )r+   r;   rQ   r_   
batch_sizeseq_len_query_statesr`   ra   attention_interfacerd   rb   outputs                 r/   r>   InternVLVisionAttention.forward{   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0##r1   )rs   ri   rm   rp   rv   r   rz   ro   rt   r|   r   ry   rr   r{   N)rF   rG   rH   rI   __doc__r   r$   r'   Tensorr   r   r   r>   rJ   rK   rL   s   @r/   rg   rg   \   sS    5Z3 Z> 26'$||'$ !.'$ +,	'$ '$r1   rg   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
 N)rF   rG   rH   rI   r   rJ   r   r1   r/   r   r      s    r1   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )InternVLVisionPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pTUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l        Xpl        [        R                  " XEX3S9U l
        g )Nr   r   )kernel_sizestride)r#   r$   
image_size
patch_sizenum_channelsr,   num_patchespatch_shaper%   Conv2d
projection)	r+   ri   r   r   r   r,   r   r   r.   s	           r/   r$   &InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir1   pixel_valuesreturnc                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR                   S   UR                   S   pUR	                  S5      R                  SS5      nXgU44$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )rB   r   rq   r   flattenr[   )	r+   r   r   r   heightwidth
embeddingspatch_heightpatch_widths	            r/   r>   %InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
&,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
+666r1   )r   r   r   r   r   r   )rF   rG   rH   rI   r   r$   r'   r   r>   rJ   rK   rL   s   @r/   r   r      s.    j7ELL 7U\\ 7 7r1   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\\R                     S\R                  4S jjrSrU =r$ )InternVLVisionEmbeddings   z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

ri   r   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )r#   r$   r%   r&   r'   zerosr,   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsr}   hidden_dropout_probrS   )r+   ri   r   r.   s      r/   r$   !InternVLVisionEmbeddings.__init__   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r1   r   r   r   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr3   r         ?r   r   bicubicF)r   modealign_cornersrV   )rB   r   r'   jit
is_tracingr   r   r   permuter%   r\   interpolater   cat)r+   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrW   
new_height	new_widthsqrt_num_positionss               r/   interpolate_pos_encoding1InternVLVisionEmbeddings.interpolate_pos_encoding   s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr1   r   bool_masked_posc                    UR                   u    p4nU R                  U5      u  nu  pxUR                  5       u  pnUbI  U R                  R	                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  R	                  U	SS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nXgU44$ )Nr3   r   rV   )rB   r   r   r   expand	unsqueezetype_asr   r'   r   r   r   rS   )r+   r   r   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss                 r/   r>    InternVLVisionEmbeddings.forward  s    
 +001e262G2G2U/
/\!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
+666r1   )r   rS   r   r   r   r   r   r   )rF   rG   rH   rI   r   r   r$   r'   r   intr   r   
BoolTensorr>   rJ   rK   rL   s   @r/   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7 7r1   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InternVLVisionMLPi8  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r#   r$   ri   r   
hidden_actactivation_fnr%   rw   r,   intermediate_sizefc1fc2r+   ri   r.   s     r/   r$   InternVLVisionMLP.__init__9  sb    #F$5$5699V//1I1IJ99V55v7I7IJr1   r;   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   )r+   r;   s     r/   r>   InternVLVisionMLP.forward@  s4    /**=9/r1   )r   ri   r   r   )
rF   rG   rH   rI   r$   r'   r   r>   rJ   rK   rL   s   @r/   r   r   8  s)    KU\\ ell  r1   r   )
layer_normrms_normc                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	\
\R                     \
\R                  \R                  4   4   4S jrS	rU =r$ )
InternVLVisionLayeriJ  z?This corresponds to the Block class in the timm implementation.ri   r   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                     " UR                  UR                  S9U l        [        UR                     " UR                  UR                  S9U l        UR                  n[        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R*                  " UR,                  5      U l        g )Nr   r-   T)requires_grad)r#   r$   chunk_size_feed_forwardseq_len_dimrg   	attentionr   mlpNORM2FN	norm_typer,   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer%   r&   r'   r(   lambda_1lambda_2r}   r   rS   )r+   ri   init_valuesr.   s      r/   r$   InternVLVisionLayer.__init__M  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r1   r;   c                    U R                  U R                  U5      5      u  p#U R                  U-  nX!-   nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  b  U R                  U-  nXA-   nU$ r   )r   r   r   r   r   rS   r   )r+   r;   attention_outputr   layer_outputs        r/   r>   InternVLVisionLayer.forward\  s     #nn!!-0
  ==+;; )8 ++M:xx-||L1==$==<7L $3r1   )	r   r   rS   r   r   r   r   r   r   )rF   rG   rH   rI   r   r   r$   r'   r   r   rA   r>   rJ   rK   rL   s   @r/   r   r   J  s_    I>3 > >|| 
uU\\"E%,,*D$EE	F r1   r   c                   d   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\\	\
4   4S jrSrU =r$ )	InternVLVisionEncoderix  ri   r   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r#   r$   ri   r%   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r+   ri   ir.   s      r/   r$   InternVLVisionEncoder.__init__y  sS    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A&r;   c                 J    U R                    H  nU" U5      nM     [        US9$ )N)last_hidden_state)r  r   )r+   r;   layer_modules      r/   r>   InternVLVisionEncoder.forward  s.     !JJL(7M ' +
 	
r1   )ri   r  r  )rF   rG   rH   rI   r   r$   r'   r   r   rA   r   r>   rJ   rK   rL   s   @r/   r  r  x  sA    ,3 , ,	
||	
 
uo%	&	
 	
r1   r  c                   `   ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSr\\S.rU 4S jrS	rU =r$ )
InternVLVisionPreTrainedModeli  ri   internvl_visionr   Tr   )r;   
attentionsc                 n  > [         TU ]  U5        [        U[        5      (       a  UR                  R
                  R                  5         UR                  b$  UR                  R
                  R                  5         UR                  b%  UR                  R
                  R                  5         gg[        U[        5      (       as  UR                  R
                  R                  U R                  R                  5        UR                  R
                  R                  U R                  R                  5        gg)zInitialize the weightsN)r#   _init_weightsr   r   r   datazero_r   r   r   r   fill_ri   r   r   )r+   rM   r.   s     r/   r  +InternVLVisionPreTrainedModel._init_weights  s    f%f677!!'')  ,!!&&,,.))5**//557 6 344OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r1   r   )rF   rG   rH   rI   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rg   _can_record_outputsr  rJ   rK   rL   s   @r/   r  r    sV      )$O&*#./N"& --
K Kr1   r  c                      ^  \ rS rSrS\SS4U 4S jjrS r\" SS9\ SS	\	R                  S
\\	R                     S\\\4   4S jj5       5       rSrU =r$ )InternVLVisionModeli  ri   r   Nc                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U R                  5         g )Nr   )r#   r$   ri   r   r   r  encoderuse_mean_poolingr%   r~   	LayerNormr,   r   	layernorm	post_initr   s     r/   r$   InternVLVisionModel.__init__  sm     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r1   c                 .    U R                   R                  $ r   )r   r   rC   s    r/   get_input_embeddings(InternVLVisionModel.get_input_embeddings  s    ///r1   F)tie_last_hidden_statesr   r   c                     U R                  XS9u  p4U R                  U5      nUS   nU R                  U5      n[        UUR                  UR
                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)r   r   )r  r;   r  )r   r*  r-  r   r;   r  )r+   r   r   embedding_outputr   encoder_outputssequence_outputs          r/   r>   InternVLVisionModel.forward  sa     #oolo\,,'78)!,..93-)77&11
 	
r1   )ri   r   r*  r-  r   )rF   rG   rH   rI   r   r$   r1  r   r   r'   r   r   r   r   rA   r   r>   rJ   rK   rL   s   @r/   r(  r(    sx    3  0 u5 7;
ll
 "%"2"23
 
u::	;	
  6
r1   r(  c                   @    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSrSrg)InternVLPreTrainedModeli  ri    Tpast_key_valuesr   N)rF   rG   rH   rI   r   r  r  r   _skip_keys_device_placementr#  r"  _can_compile_fullgraphr$  r%  rJ   r   r1   r/   r:  r:    s7    &*#"3N!"&r1   r:  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )InternVLMultiModalProjectori  ri   c                 0  > [         TU ]  5         [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  5      U l        [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  UR                  R
                  5      U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  5      U l        g )Nr   r   )r#   r$   r%   r,  vision_configr,   r   downsample_ratior   rw   text_configlinear_1r   projector_hidden_actactlinear_2r   s     r/   r$   $InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar1   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   rE  rG  rH  )r+   image_featuresr;   s      r/   r>   #InternVLMultiModalProjector.forward  s@    7m4/m4r1   )rG  r   rE  rH  )	rF   rG   rH   rI   r   r$   r>   rJ   rK   rL   s   @r/   r@  r@    s    b~ b r1   r@  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)InternVLModelOutputWithPasti  a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_statesr   )rF   rG   rH   rI   r   rO  r   r'   FloatTensorr  rJ   r   r1   r/   rN  rN    s    	 8<%"3"34;r1   rN  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c                   H  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS r	S	 r
  SS
\R                  S\\\\\   4      S\\   4S jjrS\R&                  S\R                  S\R                  4S jr\\         SS\\R&                     S
\\R                     S\\R.                     S\\R&                     S\\   S\\R                     S\\\\\   4      S\\   S\\R&                     S\\   S\\\4   4S jj5       5       rSS\R.                  S\4S jjrSr U =r!$ ) InternVLModeli  zlanguage_model.modellanguage_modelri   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g r   )r#   r$   r   from_configrB  vision_towerr@  multi_modal_projectorrD  rS  r.  r   s     r/   r$   InternVLModel.__init__  sY     %11&2F2FG%@%H"'33F4F4FGr1   c                 6    U R                   R                  5       $ r   )rS  r1  rC   s    r/   r1  "InternVLModel.get_input_embeddings  s    ""7799r1   c                 :    U R                   R                  U5        g r   )rS  set_input_embeddingsr+   rP   s     r/   r\  "InternVLModel.set_input_embeddings  s    007r1   c                     Xl         g r   rS  r+   decoders     r/   set_decoderInternVLModel.set_decoder!  s    %r1   c                     U R                   $ r   r`  rC   s    r/   get_decoderInternVLModel.get_decoder$  s    """r1   r   vision_feature_layervision_feature_select_strategyc                 n   Ub  UOU R                   R                  nUb  UOU R                   R                  nUR                  U R                  S9nU R                   R
                  nUS:X  a  U R                  US9R                  nOU R                  US9R                  U   nUS:X  a  USS2SS2SS24   nUR                  S   n[        US-  5      nUR                  S   n	UR                  XUS5      nU R                  XeS	9nUR                  U	SUR                  S   5      nU R                  U5      nU$ )
a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layer (`int` or `list[int]`):
        Layer index or list of layer indices to extract features from.
Returns:
    vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
N)r5   r3   )r   defaultr   r   r   )scale_factor)ri   rh  ri  r6   r5   rC  rV  r  vision_modelr;   rB   r   r   pixel_shufflerW  )
r+   r   rh  ri  r_   rC  vision_featureschannelsfeature_sizer   s
             r/   get_image_features InternVLModel.get_image_features'  sS   & %9$D $++JjJj 	
 .9 +;; 	'
 $TZZ8;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*LZ\] ,,_,\ *11*b/BWBWXZB[\ 44_Er1   	input_idsinputs_embedsrK  c           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r5   devicer3   r   r   z6Image features and image tokens do not match: tokens: z, features )r1  r'   tensorri   image_token_idlongrw  allsumr   	expand_asr6   rB   numelrq   )r+   rt  ru  rK  special_image_maskn_image_tokensn_image_featuress          r/   get_placeholder_mask"InternVLModel.get_placeholder_mask]  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r1   rQ   position_idsr<  cache_positionr_   r   c
           	      0   Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eUc  U R	                  5       " U5      nUbX  U R                  UUUS9nUR                  UR                  UR                  5      nU R                  XUS9nUR                  X5      nU R                  " SUUUUU	S.U
D6n[        UR                  UR                  UR                  UR                   Ub  WS9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   rh  ri  )ru  rK  )rQ   r  r<  ru  r  )r  r<  r;   r  rO  r   )ri   rh  ri  rq   r1  rr  r6   rw  r5   r  masked_scatterrS  rN  r  r<  r;   r  )r+   rt  r   rQ   r  r<  ru  rh  ri  r  r_   rK  r  outputss                 r/   r>   InternVLModel.forwardu  sb     %9$D $++JjJj 	
 .9 +;; 	' -t";<YZZ  557	BM#!44)%9/M 5 N
 ,..}/C/C]EXEXYN!%!:!:~ "; " *889K\M%% 
)%+')
 
 +%77#33!//))2>2J
 	

 QU
 	
r1   ro  rl  c           
         UR                  5       u  p4pVXR-  S:w  d  XB-  S:w  a  [        S5      eUR                  X4[        XR-  5      [        Xb-  5      5      nUR	                  SSSS5      R                  5       nUR                  U[        XR-  5      [        XB-  5      [        XbS-  -  5      5      nUR	                  SSSS5      R                  5       nU$ )a  Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rq   r   r   r   r^   )r+   ro  rl  r   r   r   rp  s          r/   rn  InternVLModel.pixel_shuffle  s     />.B.B.D+
6 A%)=)Bjkk *..s6#893x?V;W
 *11!Q1=HHJ *..F12C8L4MsS[mn_nSoOp

 *11!Q1=HHJr1   )rS  rW  rV  NN)	NNNNNNNNN)r   )"rF   rG   rH   rI   _checkpoint_conversion_mappingr   r$   r1  r\  rc  rf  r'   rP  r   r   r   liststrrr  
LongTensorr  r   r   r   r	   r   r   rA   rN  r>   floatrn  rJ   rK   rL   s   @r/   rR  rR    s    '=>N%O"~ :8&# AE8<	4''4 'uS$s)^'<=4 )1	4l"))":?:K:K"]b]n]n"0  15481537+/59@D8<597
E,,-7
 u0017
 !.	7

 u//07
 "%7
   1 127
 'uS$s)^'<=7
 )17
 !!1!127
 +,7
 
u11	27
  7
r!U\\ ! ! !r1   rR  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)InternVLCausalLMOutputWithPasti  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nlosslogitsr<  r;   r  rO  r   )rF   rG   rH   rI   r   r  r   r'   rP  r  r  r<  r	   r;   rA   r  rO  rJ   r   r1   r/   r  r    s     )-D(5$$
%,*.FHU&&'.'+OXe_+8<M8E%"3"345<59Ju001297;%"3"34;r1   r  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c            !         ^  \ rS rSrSSSSS.rS/rS\4U 4S	 jjrS
 rS r	S\
R                  4S jrS rS r  S$S\R                   S\\\\\   4      S\\   4S jjr\S 5       r\S 5       r\S 5       r\\            S%S\\R:                     S\\R                      S\\R<                     S\\R:                     S\\   S\\R                      S\\\\\   4      S\\   S\\R:                     S\\R:                     S\\\R<                  4   S\\R<                     S \ \!   S\\"\#4   4S! jj5       5       r$      S&U 4S" jjr%S#r&U =r'$ )' InternVLForConditionalGenerationi  zmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightri   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFrk   )r#   r$   rR  modelr%   rw   rD  r,   
vocab_sizer  r.  r   s     r/   r$   )InternVLForConditionalGeneration.__init__   sS     "6*
yy!3!3!?!?ASASA^A^ejkr1   c                 6    U R                   R                  5       $ r   )r  r1  rC   s    r/   r1  5InternVLForConditionalGeneration.get_input_embeddings  s    zz..00r1   c                 :    U R                   R                  U5        g r   )r  r\  r]  s     r/   r\  5InternVLForConditionalGeneration.set_input_embeddings	  s    

''.r1   r   c                     U R                   $ r   )r  rC   s    r/   get_output_embeddings6InternVLForConditionalGeneration.get_output_embeddings  s    ||r1   c                 :    U R                   R                  U5        g r   )r  rc  ra  s     r/   rc  ,InternVLForConditionalGeneration.set_decoder  s    

w'r1   c                 6    U R                   R                  5       $ r   )r  rf  rC   s    r/   rf  ,InternVLForConditionalGeneration.get_decoder  s    zz%%''r1   r   rh  ri  c                 B    U R                   R                  " SUUUS.UD6$ )Nr  r   )r  rr  )r+   r   rh  ri  r_   s        r/   rr  3InternVLForConditionalGeneration.get_image_features  s3     zz,, 
%!5+I
 	
 	
r1   c                 .    U R                   R                  $ r   )r  rS  rC   s    r/   rS  /InternVLForConditionalGeneration.language_model$  s    zz(((r1   c                 .    U R                   R                  $ r   )r  rV  rC   s    r/   rV  -InternVLForConditionalGeneration.vision_tower(  s    zz&&&r1   c                 .    U R                   R                  $ r   )r  rW  rC   s    r/   rW  6InternVLForConditionalGeneration.multi_modal_projector,  s    zz///r1   rt  rQ   r  r<  ru  labelsr  logits_to_keepimage_sizesr_   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUUU
US.
UD6nUS   n[	        U[
        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	b3  U R                  " SUXR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )as  
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```N)
rt  r   rQ   r  r<  ru  rh  ri  r  r  r   )r  r  r  )r  r  r<  r;   r  rO  r   )ri   rh  ri  r  r   r   slicer  loss_functionrD  r  r  r<  r;   r  rO  )r+   rt  r   rQ   r  r<  ru  rh  ri  r  r  r  r  r_   r  r;   slice_indicesr  r  s                      r/   r>   (InternVLForConditionalGeneration.forward0  s4   l %9$D $++JjJj 	
 .9 +;; 	' ** 
%)%+'!5+I)#
 
  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r1   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)r<  ru  rQ   r  r  r   r   )r#   prepare_inputs_for_generation)r+   rt  r<  ru  r   rQ   r  r  r_   model_inputsr.   s             r/   r  >InternVLForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !! ,8(r1   )r  r  r  )NNNNNNNNNNr   N)NNNNNN)(rF   rG   rH   rI   r  _tied_weights_keysr   r$   r1  r\  r%   Moduler  rc  rf  r'   rP  r   r   r   r  r  rr  propertyrS  rV  rW  r   r   r  r   r	   r   r   rA   r  r>   r  rJ   rK   rL   s   @r/   r  r    s_    "8-"?#,	&" ++~ 1/ryy (( AE8<	
''
 'uS$s)^'<=
 )1	
 ) ) ' ' 0 0  15481537+/59@D8<-15934.2\
E,,-\
 u001\
 !.	\

 u//0\
 "%\
   1 12\
 'uS$s)^'<=\
 )1\
 ))*\
 !!1!12\
 c5<</0\
 ell+\
 +,\
 
u44	5\
  \
B  r1   r  )r  r(  r:  rR  r  )r   )Bcollections.abcr   dataclassesr   typingr   r   r   r'   torch.nnr%   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   autor   configuration_internvlr   r   r  r    r   r  re   rg   r   r   r   r   r,  r   r   r  r  r(  r:  r@  rN  rR  r  r  __all__r   r1   r/   <module>r     s  .  ! , ,   !   ) 7 9 d d F & a a /  H Y'JBII J (J6 %II%<<% 
% <<	%
 U\\*% % %6F$bii F$R 
+E  !7BII !7L[7ryy [7|		  3H
I+4 +\
BII 
& KO K K< '
7 '
 '
T 'o ' '")) $ 
<"9 < < 
A+ A
AH 
<[ < <0 
u'> u
upr1   