
    cCie                        S SK rS SKJr  S SKJrJrJr  S SKrS SK	J
r
  SSKJr  SSKJr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJrJrJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.J/r/  \R`                  " \15      r2 S@S\
Rf                  S\Rh                  S\Rh                  S\Rh                  S\\Rh                     S\5S\54S jjr6 " S S\&5      r7 " S S\$5      r8\\" S S!9 " S" S#\5      5       5       r9 " S$ S%\
Rf                  5      r: " S& S'\
Rf                  5      r; " S( S)\"5      r<\
Rz                  \7S*.r> " S+ S,\5      r? " S- S.\
Rf                  5      r@\ " S/ S0\5      5       rA\ " S1 S2\A5      5       rB " S3 S4\,5      rCSrD " S5 S6\
Rf                  5      rE " S7 S8\+5      rF " S9 S:\*5      rG " S; S<\(5      rH " S= S>\)5      rI/ S?QrJg)A    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfigmodulequerykeyvalueattention_maskscalingdropoutc                    UnUn	[         R                  " XR                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R
                  R                  U
SS9n
[        R
                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r   dim)ptrainingr   )
torchmatmul	transposeshapenn
functionalsoftmaxr(   r/   
contiguous)r"   r#   r$   r%   r&   r'   r(   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                g/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr?   0   s     JL<<';';Aq'ABWLL!$Q1.D
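
# A rough usage sketch of the eager fallback above (illustrative values, not taken from the
# original module): for a (batch, num_heads, seq_len, head_dim) query it computes
# softmax(Q @ K^T * scaling) @ V and returns the output laid out as
# (batch, seq_len, num_heads, head_dim) together with the full attention map:
#
#     q = k = v = torch.randn(1, 16, 1025, 64)
#     out, weights = eager_attention_forward(nn.Module(), q, k, v, None, scaling=64**-0.5)
#     # out: (1, 1025, 16, 64); weights: (1, 16, 1025, 1025)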


class InternVLVisionRMSNorm(LlamaRMSNorm):
    pass


class InternVLVisionAttention(JanusVisionAttention):
    def __init__(self, config: InternVLVisionConfig):
        super().__init__(config)
        del self.num_key_value_groups
        self.is_causal = False

        qk_norm = config.use_qk_norm
        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_layer(attn_output)
        output = self.projection_dropout(output)
        return output, attn_weights


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`InternVLVisionModel`].
    """
)
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
    """


class InternVLVisionPatchEmbeddings(nn.Module):
    """
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
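
For example, with a 448x448 image and 14x14 patches (the defaults shipped with the
InternVL3 checkpoints), a (1, 3, 448, 448) `pixel_values` tensor becomes a
(1, 1024, hidden_size) sequence laid out on a 32x32 patch grid.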
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class InternVLVisionEmbeddings(nn.Module):
    """
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
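
For example, position embeddings trained for a 32x32 patch grid (1024 positions plus the
CLS token) can serve a 896x896 input with 14-pixel patches: the stored grid is bicubically
resized to the new 64x64 grid (4096 positions), and the CLS position embedding is
concatenated back unchanged.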
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(CLIPMLP):
    pass


NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(GradientCheckpointingLayer):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        # InternVL uses different layernorm implementations for different models
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # in InternVLVision, layernorm is applied before self-attention
        attention_output, _ = self.attention(self.layernorm_before(hidden_states))
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in InternVLVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output


class InternVLVisionEncoder(nn.Module):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(self, hidden_states: torch.Tensor) -> Union[tuple, BaseModelOutput]:
        for layer_module in self.layer:
            hidden_states = layer_module(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class InternVLVisionPreTrainedModel(PreTrainedModel):
    config: InternVLVisionConfig
    base_model_prefix = "internvl_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternVLVisionLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": InternVLVisionLayer,
        "attentions": InternVLVisionAttention,
    }

    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        if isinstance(module, InternVLVisionEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, InternVLVisionLayer):
            module.lambda_1.data.fill_(self.config.layer_scale_init_value)
            module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config
        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @check_model_inputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None, **kwargs
    ) -> InternVLVisionModelOutputWithPooling:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
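
Example (a minimal sketch with a randomly initialized model; a real checkpoint would
instead be loaded via `from_pretrained`, and the shapes below assume the default
config of 448x448 images, 14x14 patches and hidden size 1024):

```python
>>> import torch
>>> from transformers import InternVLVisionConfig, InternVLVisionModel

>>> config = InternVLVisionConfig()
>>> model = InternVLVisionModel(config).eval()
>>> with torch.no_grad():
...     outputs = model(torch.randn(1, 3, 448, 448))
>>> outputs.last_hidden_state.shape  # 1 CLS token + 32 * 32 patches
torch.Size([1, 1025, 1024])
```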
        """
        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(embedding_output)
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class InternVLPreTrainedModel(LlavaPreTrainedModel):
    pass


INTERNVL_INPUTS_DOCSTRING = None


class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2,
            config.text_config.hidden_size,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class InternVLModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class InternVLModel(LlavaModel):
    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
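
Example (illustrative shapes only; `model` stands for any `InternVLModel` instance):

```python
>>> feats = torch.randn(1, 32, 32, 1024)
>>> model.pixel_shuffle(feats, scale_factor=0.5).shape
torch.Size([1, 16, 16, 4096])
```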
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute dimensions to align downsampled axis correctly
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        **kwargs,
    ):
        r"""
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layer (`int` or `list[int]`):
        Layer index or list of layer indices to extract features from.
Returns:
    vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
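
With the defaults used by the InternVL3 checkpoints (448-pixel tiles, 14-pixel patches,
`downsample_ratio=0.5`), each tile yields a 32x32 grid of 1024 patch features once the CLS
token is dropped; pixel shuffling compacts this to 256 tokens with four times the channel
width, which the projector then maps to the language model's hidden size.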
        """
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        pixel_values = pixel_values.to(dtype=self.dtype)
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to final shape
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        vision_features = self.multi_modal_projector(vision_features)

        return vision_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InternVLModelOutputWithPast]:
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
    def forward(self, **super_kwargs: Unpack[TransformersKwargs]) -> InternVLCausalLMOutputWithPast:
        r"""
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "InternVLVisionPreTrainedModel",
    "InternVLVisionModel",
    "InternVLPreTrainedModel",
    "InternVLModel",
    "InternVLForConditionalGeneration",
]