
    cCiD                        S r SSKrSSKJr  SSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJrJrJ r   SSK!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+  \%RX                  " \-5      r.\\#" SS9 " S S\"5      5       5       r/ " S S\	R`                  5      r1 " S S\	R`                  5      r2 " S S\	R`                  5      r3S\20r4 " S S\	R`                  5      r5 " S  S!\	R`                  5      r6 " S" S#\	R`                  5      r7 " S$ S%\5      r8 " S& S'\	R`                  5      r9\# " S( S)\5      5       r: " S* S+\	R`                  5      r; " S, S-\	R`                  5      r< SJS.\	R`                  S/\Rz                  S0\Rz                  S1\Rz                  S2\\Rz                     S3\>S4\>4S5 jjr? " S6 S7\	R`                  5      r@ " S8 S9\5      rA " S: S;\	R`                  5      rB " S< S=\	R`                  5      rC\#" S>S9 " S? S@\:5      5       rD " SA SB\	R`                  5      rE\#" SCS9 " SD SE\:5      5       rF\#" SFS9 " SG SH\:\5      5       rG/ SIQrHg)KzPyTorch GIT model.    N)	dataclass)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplelogging	torch_int)deprecate_kwarg   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
GitVisionModelOutput4   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r%   r   torchFloatTensor__annotations__r&   r'   tupler(   __static_attributes__r)       ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/git/modeling_git.pyr#   r#   4   sr    
 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r4   r#   c                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\
S\R                  4
S	 jjrS
rU =r$ )GitEmbeddingsG   z;Construct the embeddings from word and position embeddings.c                 :  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        [        USS5      U l        U R#                  S[$        R&                  " UR                  5      R)                  S5      SS9  g )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr=   register_bufferr/   arangeexpandselfconfig	__class__s     r5   rE   GitEmbeddings.__init__J   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c  f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
r4   	input_idsr?   inputs_embedspast_key_values_lengthreturnc                 N   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XFU-   24   nUc  U R                  U5      nOUnU R                  S:X  a  U R	                  U5      nXx-  nU R                  U5      nU R                  U5      nU$ )NrA   r   r>   )sizer?   rJ   r=   rL   rM   rQ   )	rW   r[   r?   r\   r]   input_shape
seq_length
embeddingsrL   s	            r5   forwardGitEmbeddings.forwardY   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J'':5"&":":<"H-J^^J/
\\*-
r4   )rM   rQ   r=   rL   rJ   )NNNr   )r*   r+   r,   r-   r.   rE   r   r/   
LongTensorr0   intTensorrd   r3   __classcell__rY   s   @r5   r7   r7   G   sx    E
" 153759&'E,,- u//0   1 12	
 !$ 
 r4   r7   c                      ^  \ rS rSrSU 4S jjr\" SSSS9     SS\R                  S\\R                     S	\\R                     S\\
   S
\\   S\\   S\\R                     4S jj5       rSrU =r$ )GitSelfAttentionw   c                 ,  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eX0l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        UR                  R                  UR                  R                   -  S-  S	-   5      U l        UR$                  b  U =R"                  UR$                  -  sl        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R(                  " UR                  U R                  5      U l        [&        R0                  " UR2                  5      U l        U=(       d    [7        US
S5      U l        U R8                  S:X  d  U R8                  S:X  aH  UR:                  U l        [&        R<                  " SUR:                  -  S	-
  U R                  5      U l        g g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   r=   r>   relative_keyrelative_key_query) rD   rE   rH   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncerY   r*   rg   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyvaluerO   attention_probs_dropout_probrQ   rR   r=   rK   rF   distance_embeddingrW   rX   r=   rw   rY   s       r5   rE   GitSelfAttention.__init__x   s)    : ::a?PVXhHiHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr4   past_key_valuepast_key_values4.58new_nameversionr'   attention_mask	head_maskoutput_attentionspixel_values_presentr^   c           	      d   UR                   u  pxn	U R                  U5      R                  USU R                  U R                  5      R                  SS5      n
U(       a  U R                  OSnU R                  U5      R                  USU R                  U R                  5      R                  SS5      nU R                  U5      R                  USU R                  U R                  5      R                  SS5      nUb  UR                  US S 2S S 2US 2S S 24   US S 2S S 2US 2S S 24   U R                  5      u  p[        R                  " US S 2S S 2S U2S S 24   U/SS9n[        R                  " US S 2S S 2S U2S S 24   U/SS9n[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  Ga  U
R                   S   UR                   S   nnUbB  [        R                  " US-
  [        R                   UR"                  S	9R                  SS5      nO>[        R$                  " U[        R                   UR"                  S	9R                  SS5      n[        R$                  " U[        R                   UR"                  S	9R                  SS5      nUU-
  nU R'                  UU R(                  -   S-
  5      nUR+                  U
R,                  S
9nU R                  S:X  a  [        R.                  " SU
U5      nUU-   nOHU R                  S:X  a8  [        R.                  " SU
U5      n[        R.                  " SUU5      nUU-   U-   nU[0        R2                  " U R                  5      -  nUb  UU-   n[4        R6                  R9                  USS9nU R;                  U5      nUb  UU-  n[        R                  " UU5      nUR=                  SSSS5      R?                  5       nURA                  5       S S U RB                  4-   nUR                  U5      nUU4$ )NrA   r   rq   r   dimrr   rs   dtypedevicer   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   )"shaper   viewrt   rz   	transposer   r   r   updaterw   r/   catmatmulr=   tensorlongr   rT   r   rK   tor   einsummathsqrtr   
functionalsoftmaxrQ   permute
contiguousr`   r{   )rW   r'   r   r   r   r   r   
batch_sizerb   _query_layercutoff	key_layervalue_layerkey_layer_pastvalue_layer_pastattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                r5   rd   GitSelfAttention.forward   s    %2$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 -A((aHH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	
 &/>/E/E!Q*+[Avw9I-JDNN0,N 		9Q7F7A-=#>"OUVWI))[Aww1A%BDT$U[\]K !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L*!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r4   )r{   rz   r   rQ   r   r   rw   rK   rt   r=   r   r   NNNNNFF)r*   r+   r,   r-   rE   r   r/   rh   r   r0   r
   boolr2   rd   r3   ri   rj   s   @r5   rl   rl   w   s     uD %0A6R 7;15+/,1/4R.||R. !!2!23R. E--.	R.
 "%R. $D>R. 'tnR. 
u||	R. SR.r4   rl   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )GitSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr;   )rD   rE   r   r   rH   denserM   rN   rO   rP   rQ   rV   s     r5   rE   GitSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r4   r'   input_tensorr^   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rQ   rM   rW   r'   r   s      r5   rd   GitSelfOutput.forward   5    

=1]3}'CDr4   rM   r   rQ   
r*   r+   r,   r-   rE   r/   rh   rd   r3   ri   rj   s   @r5   r   r      6    >U\\  RWR^R^  r4   r   eagerc                      ^  \ rS rSrSU 4S jjrS r\" SSSS9     SS\R                  S	\	\R                     S
\	\R                     S\	\   S\	\   S\	\   S\\R                     4S jj5       rSrU =r$ )GitAttentioni  c                    > [         TU ]  5         [        UR                     " XUS9U l        [        U5      U l        [        5       U l        g )N)r=   rw   )	rD   rE   GIT_SELF_ATTENTION_CLASSES_attn_implementationrW   r   outputsetpruned_headsr   s       r5   rE   GitAttention.__init__  sB    .v/J/JKy
	 $F+Er4   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rW   rt   rz   r   r   r   r   r   r   r   r{   union)rW   headsindexs      r5   prune_headsGitAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r4   r   r   r   r   r'   r   r   r   r   r^   c                 Z    U R                  UUUUUU5      u  pxU R                  Xq5      n	X4$ r   )rW   r   )
rW   r'   r   r   r   r   r   attn_outputself_attn_weightsattention_outputs
             r5   rd   GitAttention.forward   sA     *. *
&  ;;{B22r4   )r   r   rW   r   r   )r*   r+   r,   r-   rE   r   r   r/   rh   r   r0   r
   r   r2   rd   r3   ri   rj   s   @r5   r   r     s    ";$ %0A6R 7;15+/,1/43||3 !!2!233 E--.	3
 "%3 $D>3 'tn3 
u||	3 S3r4   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitIntermediatei7  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rD   rE   r   r   rH   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrV   s     r5   rE   GitIntermediate.__init__8  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r4   r'   r^   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rW   r'   s     r5   rd   GitIntermediate.forward@  s&    

=100?r4   r   r   rj   s   @r5   r   r   7  s(    9U\\ ell  r4   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )	GitOutputiG  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rD   rE   r   r   r   rH   r   rM   rN   rO   rP   rQ   rV   s     r5   rE   GitOutput.__init__H  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r4   r'   r   r^   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r5   rd   GitOutput.forwardN  r   r4   r   r   rj   s   @r5   r   r   G  r   r4   r   c                      ^  \ rS rSrSU 4S jjr\" SSSS9     SS\R                  S\\R                     S	\\R                     S\\
   S
\\   S\\   S\\R                     4S jj5       rS rSrU =r$ )GitLayeriU  c                    > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        [        U5      U l        [        U5      U l	        g )Nr   )rw   )
rD   rE   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   )rW   rX   rw   rY   s      r5   rE   GitLayer.__init__V  sI    '-'E'E$%fB+F3'r4   r   r   r   r   r'   r   r   r   r   r^   c           	          U R                  UUUUUUS9u  px[        U R                  U R                  U R                  U5      n	X4$ )N)r   r   r   )r  r   feed_forward_chunkr  r  )
rW   r'   r   r   r   r   r   r   self_attention_weightslayer_outputs
             r5   rd   GitLayer.forward^  sa     48>>/+!5 4B 4
0 1##T%A%A4CSCSUe
 33r4   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )rW   r   intermediate_outputr	  s       r5   r  GitLayer.feed_forward_chunkw  s)    "//0@A{{#6Ir4   )r  r  r  r   r  r   r   )r*   r+   r,   r-   rE   r   r/   rh   r   r0   r
   r   r2   rd   r  r3   ri   rj   s   @r5   r   r   U  s    ( %0A6R 7;15+/,1/44||4 !!2!234 E--.	4
 "%4 $D>4 'tn4 
u||	4 S40 r4   r   c                   (  ^  \ rS rSrU 4S jr        SS\R                  S\\R                     S\\R                     S\\	\
\\\R                        4      S\\   S\\   S	\\   S
\\   S\\   S\	\\R                     \4   4S jjrSrU =r$ )
GitEncoderi}  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf NF)
rD   rE   rX   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rW   rX   irY   s      r5   rE   GitEncoder.__init__~  sR    ]]vG_G_A`#aA`AHV$7A`#ab
&+# $b   A&r'   r   r   r   	use_cacher   output_hidden_statesr   return_dictr^   c
           	         U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  Uc  [	        U R
                  S9nU(       a  SOS n
U(       a  SOS n[        U R                  5       H=  u  pU(       a  X4-   n
Ub  X<   OS nU" UUUUUU5      nUS   nU(       d  M5  XS   4-   nM?     U(       a  X4-   n
U	(       d  [        S UUU
U4 5       5      $ [        UUU
US9$ )	NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rX   r)   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r)   ).0vs     r5   	<genexpr>%GitEncoder.forward.<locals>.<genexpr>  s"      	A  s   	r&   r   r'   r(   )
r  trainingrx   ry   r   rX   	enumerater  r2   r   )rW   r'   r   r   r   r  r   r  r   r  all_hidden_statesall_self_attentionsr  layer_modulelayer_head_masklayer_outputss                   r5   rd   GitEncoder.forward  s"    &&4==##p "	0*$++>O"6BD$5b4(4OA#$58H$H!.7.CilO(!$M *!,M  &91=M<O&O##  5&   14D D 	 "#%'		 	 	 '+++*	
 	
r4   )rX   r  r  )NNNNFFFT)r*   r+   r,   r-   rE   r/   rh   r   r0   r   r
   r2   r   r   rd   r3   ri   rj   s   @r5   r  r  }  s    , 7;15SW$(,1/4/4&*>
||>
 !!2!23>
 E--.	>

 "%uU5;L;L5M/N(N"OP>
 D>>
 $D>>
 'tn>
 'tn>
 d^>
 
uU\\"$;;	<>
 >
r4   r  c                   .    \ rS rSr% \\S'   SrSrS rSr	g)GitPreTrainedModeli  rX   gitTc                    [        U[        5      (       a  [        R                  R	                  UR
                  SU R                  R                  S9  [        R                  R	                  UR                  R                  U R                  R                  S9  [        R                  R	                  UR                  R                  U R                  R                  S9  [        U[        R                  5      (       ak  UR                  R                  R	                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R	                  SU R                  R                  S9  UR                   b2  UR                  R                  UR                      R                  5         gg[        U[        R"                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R%                  S5        gg)zInitialize the weights        )meanstd)r2  Ng      ?)r   GitVisionEmbeddingsr   initnormal_class_embeddingrX   initializer_rangepatch_embeddingweightposition_embeddingr   databiaszero_rF   r:   rM   fill_)rW   modules     r5   _init_weights GitPreTrainedModel._init_weights  s   f122GGOOF22$++B_B_O`GGOOF2299t{{?\?\O]GGOOF55<<$++B_B_O`fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .r4   r)   N)
r*   r+   r,   r-   r   r1   base_model_prefixsupports_gradient_checkpointingr@  r3   r)   r4   r5   r-  r-    s    &*#*r4   r-  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )r3  i  rX   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestrider<  rq   r   r?   r@   rB   )rD   rE   rX   rH   	embed_dimr}   r~   r   	Parameterr/   randnr6  Conv2dnum_channelsr8  num_patchesnum_positionsrF   r:  rS   rT   rU   rV   s     r5   rE   GitVisionEmbeddings.__init__  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr4   rc   heightwidthr^   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   NrA   g      ?r   rq   bicubicF)r`   modealign_cornersr   )r   r:  r9  	unsqueezer/   jit
is_tracingr?   r~   r   reshaper   r   r   interpolater   r   )rW   rc   rR  rS  rO  r:  rP  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r5   interpolate_pos_encoding,GitVisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr4   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model ().r   rq   r   rA   r   )r   r}   rv   r8  r9  r   r   flattenr   r6  rU   r/   r   rb  r:  r?   )rW   rd  rb  r   r   rR  rS  target_dtypepatch_embedsclass_embedsrc   s              r5   rd   GitVisionEmbeddings.forward!  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr4   )	r6  rX   rJ  r}   rO  rP  r8  r~   r:  F)r*   r+   r,   r-   r    rE   r/   rh   rg   rb  r0   rd   r3   ri   rj   s   @r5   r3  r3    si    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r4   r3  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )GitVisionMLPi4  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rD   rE   rX   r	   r   activation_fnr   r   rH   r   fc1fc2rV   s     r5   rE   GitVisionMLP.__init__5  sb    #F$5$5699V//1I1IJ99V55v7I7IJr4   r'   r^   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rr  rq  rs  r   s     r5   rd   GitVisionMLP.forward<  s4    /**=9/r4   )rq  rX   rr  rs  r   rj   s   @r5   ro  ro  4  s)    KU\\ ell  r4   ro  r?  r   r   r   r   scalingrQ   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrA   r   )r   r   )pr$  r   rq   )r/   r   r   r   r   r   float32r   r   rQ   r$  r   )
r?  r   r   r   r   rw  rQ   kwargsattn_weightsr   s
             r5   eager_attention_forwardr}  D  s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r4   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )GitVisionAttentioni[  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rg  g      F)rD   rE   rX   rH   rJ  rt   	num_headshead_dimrv   scaleattention_dropoutrQ   	is_causalr   r   k_projv_projq_projout_projrV   s     r5   rE   GitVisionAttention.__init__^  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar4   r'   r   causal_attention_maskr   r^   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XVU5      R#                  5       nU R%                  U5      nU(       d  SnX4$ )z#Input shape: Batch x Time x Channelr   rq   flash_attention_2Nr   r0  )r  rw  rQ   )r   r  r  r  r   r  r  r   rX   r   r  r}  r   r  r$  rQ   r[  r   r  )rW   r'   r   r  r   r   rb   rJ  querieskeysvaluesattention_interfacer   r|  s                 r5   rd   GitVisionAttention.forwardr  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r4   )rX   rQ   rJ  r  r  r  r  r  r  r  r  )NNF)r*   r+   r,   r-   r.   rE   r/   rh   r   r   r2   rd   r3   ri   rj   s   @r5   r  r  [  s    GB. 268<,1/)||/) !./)  (5	/)
 $D>/) 
u||Xell33	4/) /)r4   r  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )GitVisionEncoderLayeri  rX   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rD   rE   rH   rJ  r  	self_attnr   rM   rN   layer_norm1ro  mlplayer_norm2rV   s     r5   rE   GitVisionEncoderLayer.__init__  sm    +++F3<<F<Q<QR'<<F<Q<QRr4   r'   r   r  r   r^   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r'   r   r  r   )r  r  r  r  )rW   r'   r   r  r   residualr|  outputss           r5   rd   GitVisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr4   )rJ  r  r  r  r  rm  )r*   r+   r,   r-   r    rE   r/   rh   r   r   r2   r0   rd   r3   ri   rj   s   @r5   r  r    sk    S S -2&||& &  %||	&
 $D>& 
u  	!& &r4   r  c                      ^  \ rS rSrSrS\4U 4S jjr\     SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\\4   4S jj5       rSrU =r$ )GitVisionEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`GitVisionEncoderLayer`].

Args:
    config: GitVisionConfig
rX   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
rD   rE   rX   r   r  r  r  r  layersr  )rW   rX   r   rY   s      r5   rE   GitVisionEncoder.__init__  sT    mmERXRjRjLk$lLkq%:6%BLk$lm&+# %mr  r   r  r   r  r  r^   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       H0  u  pU(       a  Xy4-   nU" U	UUUS9nUS   n	U(       d  M(  XS   4-   nM2     U(       a  Xy4-   n[        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr)   )r   r   r   r&   r'   r(   )rX   r   r  use_return_dictr%  r  r   )rW   r\   r   r  r   r  r  encoder_statesall_attentionsr'   idxencoder_layerr*  s                r5   rd   GitVisionEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B)%"3	M *!,M  !/3C2E!E #9  +.>>N+Vd
 	
r4   )rX   r  r  )NNNNN)r*   r+   r,   r-   r.   r    rE   r   r   r/   rh   r   r   r2   r   rd   r3   ri   rj   s   @r5   r  r    s    , ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r4   r  c                      ^  \ rS rSrS\4U 4S jjr\     SS\\R                     S\\
   S\\
   S\\
   S\\
   S	\\\4   4S
 jj5       rSrU =r$ )GitVisionTransformeri/  rX   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rD   rE   rX   rH   r3  rc   r   rM   rN   pre_layrnormr  encoderpost_layernorm)rW   rX   rJ  rY   s      r5   rE   GitVisionTransformer.__init__1  sd    &&	-f5LL8M8MN'/ ll9:O:OPr4   rd  r   r  rb  r  r^   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nU(       d	  U4USS  -   $ [        UUR                  UR                  S9$ )Nz You have to specify pixel_valuesrb  )r\   r   r  r  r   r   r  )rX   r   r  r  rv   rc   r  r  r  r   r'   r(   )	rW   rd  r   r  rb  r  r'   encoder_outputsr&   s	            r5   rd   GitVisionTransformer.forward;  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5#	 ' 
 ,A. //0AB%'/!"*===/)77&11
 	
r4   )rX   rc   r  r  r  NNNFN)r*   r+   r,   r-   r    rE   r   r   r/   r0   r   r   r2   r   rd   r3   ri   rj   s   @r5   r  r  /  s    Q Q  59,0/338&*&
u001&
 $D>&
 'tn	&

 #+4.&
 d^&
 
uo%	&&
 &
r4   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )GitVisionModelie  rX   rd  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rD   rE   r  vision_model	post_initrV   s     r5   rE   GitVisionModel.__init__o  s'     08r4   r^   c                 B    U R                   R                  R                  $ r   )r  rc   r8  rW   s    r5   get_input_embeddings#GitVisionModel.get_input_embeddingsu  s      ++;;;r4   r   r  rb  r  c                 ^    Ub  UOU R                   R                  nU R                  UUUUUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GitVisionModel

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = GitVisionModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```)rd  r   r  rb  r  )rX   r  r  )rW   rd  r   r  rb  r  s         r5   rd   GitVisionModel.forwardx  sA    8 &1%<k$++B]B]  %/!5%=# ! 
 	
r4   )r  r  )r*   r+   r,   r-   r    r1   main_input_namerE   r   Moduler  r   r   r/   r0   r   r   r2   r   rd   r3   ri   rj   s   @r5   r  r  e  s     $O <bii <  59,0/3).&*#
u001#
 $D>#
 'tn	#

 #'#
 d^#
 
uo%	&#
 #
r4   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )GitProjectioni  rX   c                 .  > [         TU ]  5         Xl        [        R                  " [        R
                  " UR                  R                  UR                  5      [        R                  " UR                  UR                  R                  S95      U l
        g r   )rD   rE   rX   r   
Sequentialr   r|   rH   rM   rN   visual_projectionrV   s     r5   rE   GitProjection.__init__  sd    !#IIf**668J8JKLL++1E1E1T1TU"
r4   rc   r^   c                 $    U R                  U5      $ r   )r  )rW   rc   s     r5   rd   GitProjection.forward  s    %%j11r4   )rX   r  )r*   r+   r,   r-   r   rE   r/   rh   rd   r3   ri   rj   s   @r5   r  r    s/    
y 
2%,, 25<< 2 2r4   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                      ^  \ rS rSrU 4S jrS rS rS rS\S\	R                  S\	R                  S	\	R                  4S
 jrSS jr\            SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\\\\	R(                     4      S\\   S\\   S\\   S\S\\   S	\\\	R                     \4   4S jj5       rSrU =r$ )GitModeli  c                 r  >^ [         TU ]  T5        TU l        [        T5      U l        [        TR                  5      U l        [        T5      U l	        [        T5      U l        TR                  b8  [        R                  " U4S j[        TR                  5       5       5      U l        U R#                  5         g )Nc              3      >#    U  HE  n[         R                  " [        R                  " S S TR                  R
                  5      5      v   MG     g7f)r   N)r   rK  r/   zerosr|   rH   )r  r   rX   s     r5   r!  $GitModel.__init__.<locals>.<genexpr>  s=      ;?A U[[Av/C/C/O/OPQQ?s   AA)rD   rE   rX   r7   rc   r  r|   image_encoderr  r  r  r  r   r   ParameterListr  img_temporal_embeddingr  rV   s    `r5   rE   GitModel.__init__  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r4   c                 .    U R                   R                  $ r   rc   rJ   r  s    r5   r  GitModel.get_input_embeddings  s    ...r4   c                 $    XR                   l        g r   r  )rW   r   s     r5   set_input_embeddingsGitModel.set_input_embeddings  s    */'r4   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  r  r   )rW   heads_to_pruner  r   s       r5   _prune_headsGitModel._prune_heads  s<    
 +002LELLu%//;;EB 3r4   r`   r   r   r^   c           	          [         R                  " [         R                  " XX2S9SS9nUR                  US:H  [	        S5      5      nU$ )Nr   r   r   )diagonal-inf)r/   triuonesmasked_fillfloat)rW   r`   r   r   masks        r5   _generate_future_maskGitModel._generate_future_mask  s=    zz%**TLWXY	5=9r4   c                 n   UR                   S   nUR                   S   nUR                  nUR                  n	[        R                  " Xw4XS9n
[        R
                  " XvU-   4[        S5      UR                  U	S9n[        R                  " Xg4U	UR                  S9nUS:  a?  [        R                  " UR                   S   UR                   S   U-   4U	UR                  S9n[        R                  " X4SS9n[        R                  " XR                  U	5      4SS9n[        R                  " X4SS9S S S 24   nUc2  [        R
                  " UR                   S   UR                   S   4SUS9nUR                  [        R                  :w  a  [        S	5      e[        R                  " XQR                  S
9n[        S5      UU'   UR                  UR                   S   Xv-   Xt-   U-   45      nUR                  5       nUS S 2S S 2S U24   nUS S 2S S S 24   nUU-   US S 2S S 2S U24'   US S 2S S S 2S S 24   nU$ )Nr   r  r  r   r   r   F)
fill_valuer   z1Memory key padding mask must be a boolean tensor.r   )r   r   r   r/   r  fullr  r   r   r   rv   
zeros_likerU   clone)rW   tgtmemorytgt_maskr]   memory_key_padding_masknum_tgt
num_memoryr   r   top_left	top_rightbottom_leftleftrightfull_attention_maskzero_negative_infinityorigin_leftr   s                      r5   create_attention_maskGitModel.create_attention_mask  s0   ))A,\\!_
		;;
7TJJ#99:&M::	
	 kk!??
 "A%{{"HNN1$58N$NOH yy(0a8		9kk%&89qA#ii1=dAgF"*&+jj&,,q/6<<PQ?1S`ent&u#"((EJJ6PQQ!&!1!12IQZQZ![:?-67188$**1-z/CZEhkrErs
 2779)!Q*;<'4
31<v1EAq+:+-. 2!T1a-@""r4   r[   r   r?   rd  r   r\   r   r  r   r  rb  r  c                 V   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[        S5      eUS   nSnUb5  [        U[        5      (       d  UR                  5       OUR                  5       nU R                  XPR                   R                  5      nSnUb  UR                  S:X  a  U R                  XKS9R                  nOUR                  S	:X  a  / n[!        UR"                  S   5       HM  nU R                  USS2USS2SS24   US9R                  nUU R$                  U   -  nUR'                  U5        MO     [(        R*                  " USS
9nO[        S5      eU R-                  U5      nU R/                  UUUUS9nUcG  [(        R0                  " UR"                  S   SUR"                  S   4UR2                  UR4                  S9nUR7                  UR                  S5      UR                  S5      -  SS5      n[(        R*                  " UU4SS
9nU R9                  UUR2                  UR4                  5      nU R;                  UUUUS9nUbk  [=        UUR2                  US   S9R?                  UR4                  5      nUS:  a  USS2SS2U* S2SS24   nO!USS2SS2US   * S2US   * S24==   U-  ss'   U RA                  UUUUUU	U
UUSLS9	nUS   nU(       d	  U4USS -   $ [C        UURD                  URF                  URH                  S9$ )a?  
Examples:

```python
>>> from transformers import AutoProcessor, AutoModel
>>> import requests
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
>>> model = AutoModel.from_pretrained("microsoft/git-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> text = "this is an image of two cats"

>>> inputs = processor(images=image, text=text, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```NzDYou cannot specify both input_ids and inputs_embeds at the same timerA   z5You have to specify either input_ids or inputs_embedsr   r      r     r   z#pixel_values must be of rank 4 or 5)r[   r?   r\   r]   rq   r   )r  r  r  r]   )tgt_len)r   r   r   r  r   r  r  r   r#  )%rX   r   r  r  r  rv   %warn_if_padding_and_no_attention_maskr`   r   r
   get_seq_lengthget_head_maskr  ndimr  r&   r  r   r  appendr/   r   r  rc   r  r   r   repeatr  r  r   r   r  r   r   r'   r(   )rW   r[   r   r?   rd  r   r\   r   r  r   r  rb  r  ra   rb   r]   projected_visual_featuresvisual_features	frame_idxvisual_features_frameembedding_outputr'   r  combined_attention_maskexpanded_attn_maskr  sequence_outputs                              r5   rd   GitModel.forward  s,   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU ^
 "#& "/599  ..0$335 # &&y++2O2OP	$(!#  A%"&"4"4  #5 ###   ""a'"$!&|'9'9!'<!=I,0,>,>$Q	1a%78Sk -? -'' * *T-H-H-SS)#**+@A "> #())O"C !!FGG(,(>(>(O%??%'#9	 + 
 %,(-!''*A/?/E/Ea/HI&,,'..)% %>$D$D!!!$(A(F(Fq(II1a%
!
 		#<>N"OUVW --j:J:P:PRbRiRij #'"<"< ,#9	 #= #
 % "< 0 6 6B"b!(()  &)%71?U>U>VXY8Y%Z"'1{1~o.?+a.AR(RSWiiS,,2+/!5#!-T!9 ' 

 *!,#%(;;;&-+;;)77&11	
 	
r4   )rX   rc   r  r  r  r  r   )NNNNNNNNNNFN)r*   r+   r,   r-   rE   r  r  r  rg   r/   r   r   rh   r  r  r   r   r   r
   listr0   r   r2   r   rd   r3   ri   rj   s   @r5   r  r    s   &/0C# ekk 5<< \a\h\h 0#d  -115/3/3,004KO$(,0/3).&*c
ELL)c
 !.c
 u||,	c

 u||,c
 ELL)c
  -c
 "%tE4E4E/F(F"GHc
 D>c
 $D>c
 'tnc
 #'c
 d^c
 
uU\\"$>>	?c
 c
r4   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                      ^  \ rS rSrS/rU 4S jrS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\\\\
R                     4      S\	\   S\	\   S\	\   S\S\	\   S\\\
R                     \4   4S jj5       r SS jrSrU =r$ )GitForCausalLMi  zoutput.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  5      U l        U R                  5         g r   )
rD   rE   r  r.  r   r   rH   rG   r   r  rV   s     r5   rE   GitForCausalLM.__init__  sF     F#ii 2 2F4E4EF 	r4   c                     U R                   $ r   r   r  s    r5   get_output_embeddings$GitForCausalLM.get_output_embeddings  s    {{r4   c                     Xl         g r   r  )rW   new_embeddingss     r5   set_output_embeddings$GitForCausalLM.set_output_embeddings  s    $r4   r[   r   r?   rd  r   r\   labelsr   r  r   r  rb  r  r^   c                    Ub  UOU R                   R                  nUb  Sn	U R                  UUUUUUUU	U
UUUS9nUS   nU R                  U5      nSnUb  U R                  R                  R
                  S   R                  R                  R                  nUSS2US2SS24   R                  5       nUSS2SS24   R                  5       nU R                  " UR                  SU R                   R                  5      UR                  S5      4SU R                   R                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

Examples:

Image captioning example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> import requests
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
>>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_caption)
two cats sleeping on a pink blanket next to remotes.
```

Visual question answering (VQA) example:

```python
>>> from transformers import AutoProcessor, AutoModelForCausalLM
>>> from huggingface_hub import hf_hub_download
>>> from PIL import Image

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

>>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
>>> image = Image.open(file_path).convert("RGB")

>>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

>>> question = "what does the front of the bus say at the top?"

>>> input_ids = processor(text=question, add_special_tokens=False).input_ids
>>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
>>> input_ids = torch.tensor(input_ids).unsqueeze(0)

>>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
>>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
['what does the front of the bus say at the top? special']
```

Video captioning example:

```python
>>> import av
>>> import numpy as np
>>> from PIL import Image
>>> from huggingface_hub import hf_hub_download
>>> from transformers import AutoProcessor, AutoModelForCausalLM

>>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

>>> # set seed for reproducibility
>>> np.random.seed(45)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # load video
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample frames
>>> num_frames = model.config.num_image_with_embedding
>>> indices = sample_frame_indices(
...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
... )
>>> frames = read_video_pyav(container, indices)

>>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

>>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

>>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
```
NF)r   r?   rd  r   r\   r   r  r   r  rb  r  r   rA   r   rG   )losslogitsr   r'   r(   )rX   r  r.  r   r  r  r  rW   r   r   loss_functionr   rG   r   r   r'   r(   )rW   r[   r   r?   rd  r   r\   r#  r   r  r   r  rb  r  r{  r  r  r&  r%  num_image_tokensshifted_logitsr   s                         r5   rd   GitForCausalLM.forward  s   j &1%<k$++B]B]I(()%%'+/!5%=#  
 "!*_-#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%%##B(>(>?B  ;;11 	D Y,F)-)9TGf$EvE%#33!//))
 	
r4   c                 H   UbC  UR                  5       nUR                  S   U:  a  UnOUR                  S   S-
  nUS S 2US 24   nUR                  nUc  UR                  U5      nUUUR                  S5      UUS.n	UR	                  5        H  u  pX;  d  M  XU
'   M     U	$ )Nr   rd  )r[   r   rd  r   r  )r  r   new_onesgetr  )rW   r[   r   r   r  r{  past_lengthremove_prefix_lengthra   model_inputsr   r   s               r5   prepare_inputs_for_generation,GitForCausalLM.prepare_inputs_for_generation  s     &)88:K q!K/'2$ (1q'9A'=$!!%9%:":;I  oo!&//<N #,"JJ~6."
 !,,.JC&$)S! ) r4   )r.  r   )NNNNNNNNNNNFN)NNN)r*   r+   r,   r-   _tied_weights_keysrE   r  r!  r   r   r/   rh   r   r
   r  r   r2   r   rd   r1  r3   ri   rj   s   @r5   r  r    s    **%  -115/3/3,004)-FJ$(,0/3).&*A
ELL)A
 !.A
 u||,	A

 u||,A
 ELL)A
  -A
 &A
 "%tELL/A(A"BCA
 D>A
 $D>A
 'tnA
 #'A
 d^A
  
uU\\"$::	;!A
 A
H OS$ $r4   r  )r  r  r-  r  )r0  )Ir.   r   dataclassesr   typingr   r   r   r/   r   activationsr	   cache_utilsr
   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   utils.deprecationr   configuration_gitr   r    
get_loggerr*   rx   r#   r  r7   rl   r   r   r   r   r   r   r  r-  r3  ro  rh   r  r}  r  r  r  r  r  r  r  r  __all__r)   r4   r5   <module>rC     s      ! , ,   ! . ) B 9  G l l  1 9 
		H	% 	?; 	? 	?-BII -`v.ryy v.tBII   
/3299 /3fbii  		 %) %PE
 E
P * * *6P")) Pf299 . %II%<<% 
% <<	%
 U\\*% % %.F) F)T/6 /fT
ryy T
n3
299 3
l 
2
' 2

2
j
2BII 
2 
~
! ~

~
B 
z' z
zz Qr4   