
    bCi                        S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK
Jr  SSKJr  SSKJr  SSKJrJrJrJr  SS	KJrJr  SS
KJrJrJr  SSKJrJrJrJ r J!r!J"r"  SSK#J$r$J%r%J&r&  \!RN                  " \(5      r)S\	RT                  S\	RT                  4S jr+S\	RT                  S\	RT                  4S jr,\\ " S S\5      5       5       r- " S S\R\                  5      r/ " S S\R\                  5      r0 " S S\R\                  5      r1S\00r2 " S S\R\                  5      r3 " S S\R\                  5      r4 " S  S!\R\                  5      r5 " S" S#\5      r6 " S$ S%\R\                  5      r7 " S& S'\R\                  5      r8 SJS(\R\                  S)\	RT                  S*\	RT                  S+\	RT                  S,\\	RT                     S-\9S.\94S/ jjr: " S0 S1\R\                  5      r; " S2 S3\R\                  5      r< " S4 S5\5      r= " S6 S7\R\                  5      r> " S8 S9\R\                  5      r?\ " S: S;\5      5       r@ " S< S=\R\                  5      rA " S> S?\@5      rB\" S@SA9 " SB SC\@5      5       rC " SD SE\@5      rD " SF SG\@5      rESKSH jrF/ SIQrGg)LzPyTorch AltCLIP model.    N)	dataclass)AnyCallableOptionalUnion   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)nn
functionalcross_entropytorcharangelenr"   )r   s    f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr*   +   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)r*   t)r,   caption_loss
image_losss      r)   	clip_lossr1   /   s*    #J/L!*,,.1J%,,r+   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)AltCLIPOutput5   aq  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPVisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r:   r;   N)getattrto_tuple).0kselfs     r)   	<genexpr>)AltCLIPOutput.to_tuple.<locals>.<genexpr>U   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysrB   s   `r)   r?   AltCLIPOutput.to_tupleT   s#     
YY[
 
 	
r+    )__name__
__module____qualname____firstlineno____doc__r5   r   r&   FloatTensor__annotations__r6   r7   r8   r9   r:   r   r;   rE   r   r?   __static_attributes__rI   r+   r)   r3   r3   5   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r+   r3   c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )AltRobertaEmbeddings\   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistenttoken_type_idsdtype)super__init__r#   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr>   rY   register_bufferr&   r'   expandzerosr[   sizelongrV   rB   config	__class__s     r)   rd   AltRobertaEmbeddings.__init__b   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r+   c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr]   r   r`   r   rb   r"   rZ   )"create_position_ids_from_input_idsrV   &create_position_ids_from_inputs_embedsrv   hasattrr`   rt   r&   ru   rw   r[   r"   ri   rm   rY   rk   rn   rr   )rB   	input_idsr`   r[   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrm   
embeddingsrk   s                r)   forwardAltRobertaEmbeddings.forward{   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r+   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr]   r   r}   r   )rv   r&   r'   rV   rw   r"   	unsqueezert   )rB   r   r   sequence_lengthr[   s        r)   r   ;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   )rn   rr   rV   rY   rk   rm   ri   )NNNNr   )
rJ   rK   rL   rM   rN   rd   r   r   rQ   __classcell__rz   s   @r)   rS   rS   \   s$    

4 rs&P= =r+   rS   c                      ^  \ rS rSrS
U 4S jjr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )AltRobertaSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aH  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        g g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rY   rZ   relative_keyrelative_key_query   r   )rc   rd   rg   num_attention_headsr   
ValueErrorintattention_head_sizeall_head_sizer#   Linearquerykeyvaluerp   attention_probs_dropout_probrr   r>   rY   rj   re   distance_embeddingrB   ry   rY   rz   s      r)   rd    AltRobertaSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr+   hidden_statesattention_mask	head_maskoutput_attentionsr   c                 H   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	[        R                  " XxR	                  SS5      5      n
U R                  S:X  d  U R                  S:X  GaL  UR                   S   UR                   S   p[        R                  " U[        R                  UR                  S9R                  SS5      n[        R                  " U[        R                  UR                  S9R                  SS5      nX-
  nU R                  XR                  -   S-
  5      nUR                  UR                   S9nU R                  S:X  a  [        R"                  " S	UU5      nU
U-   n
OHU R                  S:X  a8  [        R"                  " S	UU5      n[        R"                  " S
UU5      nU
U-   U-   n
U
[$        R&                  " U R                  5      -  n
Ub  X-   n
[(        R*                  R-                  U
SS9nU R/                  U5      nUb  UU-  n[        R                  " UU	5      nUR1                  SSSS5      R3                  5       nUR5                  5       S S U R6                  4-   nUR                  U5      nU(       a  UU4nU$ U4nU$ )Nr]   r   r   r   r   r}   ra   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r   )shaper   r   view	transposer   r   r&   matmulrY   r'   rw   r"   r   rj   torb   einsummathsqrtr#   r$   softmaxrr   permute
contiguousrv   r   )rB   r   r   r   r   r   hidden_shapequery_layer	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                           r)   r   AltRobertaSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*"\\,ejjQ^QeQefkklnpqrN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8FbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r+   )
r   r   r   rr   r   rj   r   rY   r   r   NNNF)rJ   rK   rL   rM   rd   r&   Tensorr   rO   boolrE   r   rQ   r   r   s   @r)   r   r      su    u6 7;15,1:||: !!2!23: E--.	:
 $D>: 
u||	: :r+   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrW   )rc   rd   r#   r   rg   densern   ro   rp   rq   rr   rx   s     r)   rd   AltRobertaSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r+   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rr   rn   rB   r   r   s      r)   r   AltRobertaSelfOutput.forward  5    

=1]3}'CDr+   rn   r   rr   
rJ   rK   rL   rM   rd   r&   r   r   rQ   r   r   s   @r)   r   r     6    >U\\  RWR^R^  r+   r   eagerc                      ^  \ rS rSrSU 4S jjrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )AltRobertaAttentioni  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )N)rY   )	rc   rd   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrB   r   outputsetpruned_headsr   s      r)   rd   AltRobertaAttention.__init__   s@    6v7R7RS
	 +62Er+   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )r(   r   rB   r   r   r   r   r   r   r   r   r   r   union)rB   headsindexs      r)   prune_headsAltRobertaAttention.prune_heads(  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   r   r   r   r   c                 f    U R                  UUUUS9nU R                  US   U5      nU4USS  -   nU$ N)r   r   r   r   r   )rB   r   )rB   r   r   r   r   self_outputsattention_outputr   s           r)   r   AltRobertaAttention.forward:  sS     yy)/	 ! 
  ;;|AF#%QR(88r+   )r   r   rB   r   r   )rJ   rK   rL   rM   rd   r   r&   r   r   rO   r   rE   r   rQ   r   r   s   @r)   r   r     sy    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	 r+   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaIntermediateiM  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rc   rd   r#   r   rg   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrx   s     r)   rd   AltRobertaIntermediate.__init__N  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r+   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rB   r   s     r)   r   AltRobertaIntermediate.forwardV  s&    

=100?r+   r   r   r   s   @r)   r   r   M  s(    9U\\ ell  r+   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaOutputi]  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rc   rd   r#   r   r   rg   r   rn   ro   rp   rq   rr   rx   s     r)   rd   AltRobertaOutput.__init__^  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r+   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r)   r   AltRobertaOutput.forwardd  r   r+   r   r   r   s   @r)   r  r  ]  r   r+   r  c                      ^  \ rS rSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	 rS
rU =r$ )AltRobertaLayeril  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g )Nr   )
rc   rd   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater  r   rx   s     r)   rd   AltRobertaLayer.__init__m  sI    '-'E'E$,V426:&v.r+   r   r   r   r   r   c                     U R                   " U4UUUS.UD6nUS   nUSS  n[        U R                  U R                  U R                  U5      n	U	4U-   nU$ r   )r  r   feed_forward_chunkr  r  )
rB   r   r   r   r   kwargsself_attention_outputsr   r   layer_outputs
             r)   r   AltRobertaLayer.forwardu  s     "&"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r+   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )rB   r   intermediate_outputr  s       r)   r  "AltRobertaLayer.feed_forward_chunk  s)    "//0@A{{#6Ir+   )r  r  r  r   r  r   )rJ   rK   rL   rM   rd   r&   r   r   rO   r   rE   r   r  rQ   r   r   s   @r)   r	  r	  l  sy    / 7;15,1|| !!2!23 E--.	
 $D> 
u||	2 r+   r	  c                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )AltRobertaEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rc   rd   ry   r#   
ModuleListrangenum_hidden_layersr	  layergradient_checkpointing)rB   ry   irz   s      r)   rd   AltRobertaEncoder.__init__  sR    ]]U6KcKcEd#eEdOF$;Ed#ef
&+# $f   A&r   r   r   r   output_hidden_statesreturn_dictr   c           	         U(       a  SOS nU(       a  SOS n	[        U R                  5       H=  u  pU(       a  X4-   nUb  X:   OS nU" SUUUUS.UD6nUS   nU(       d  M5  XS   4-   n	M?     U(       a  X4-   n[        UUU	S9$ )NrI   )r   r   r   r   r   r   last_hidden_stater   
attentions)	enumerater   r   )rB   r   r   r   r   r%  r&  r  all_hidden_statesall_self_attentionsr"  layer_modulelayer_head_masklayer_outputss                 r)   r   AltRobertaEncoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO( +-)"3	
 M *!,M  &91=M<O&O#!  5$   14D D++*
 	
r+   )ry   r!  r   )NNFFT)rJ   rK   rL   rM   rd   r   r&   r   r   rO   r   r   rE   r   r   rQ   r   r   s   @r)   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r+   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rc   rd   r#   r   rg   r   Tanh
activationrx   s     r)   rd   AltRobertaPooler.__init__  s9    YYv1163E3EF
'')r+   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r6  )rB   r   first_token_tensorpooled_outputs       r)   r   AltRobertaPooler.forward  s6     +1a40

#566r+   )r6  r   r   r   s   @r)   r3  r3    s(    $
U\\ ell  r+   r3  moduler   r   r   r   scalingrr   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr]   r   )r   rb   )ptrainingr   r   )r&   r   r   r#   r$   r   float32r   rb   rr   r@  r   )
r<  r   r   r   r   r=  rr   r  attn_weightsattn_outputs
             r)   eager_attention_forwardrD    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r+   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )AltCLIPAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)rc   rd   ry   rg   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrr   	is_causalr#   r   k_projv_projq_projout_projrx   s     r)   rd   AltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar+   r   r   causal_attention_maskr   r   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XVU5      R#                  5       nU R%                  U5      nU(       d  SnX4$ )z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr           )rO  r=  rr   )r   rR  rP  rQ  r   rK  rL  r   ry   r   rO  rD  r   rM  r@  rr   reshaper   rS  )rB   r   r   rU  r   
batch_sizer   rJ  queriesrF   valuesattention_interfacerC  rB  s                 r)   r   AltCLIPAttention.forward  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r+   )ry   rr   rJ  rL  rO  rP  rK  rS  rR  rM  rQ  r   )rJ   rK   rL   rM   rN   rd   r&   r   r   r   rE   r   rQ   r   r   s   @r)   rF  rF    s    GB. 268<,1/)||/) !./)  (5	/)
 $D>/) 
u||Xell33	4/) /)r+   rF  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
AltCLIPMLPi8  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rc   rd   ry   r	   r   activation_fnr#   r   rg   r   fc1fc2rx   s     r)   rd   AltCLIPMLP.__init__9  sb    #F$5$5699V//1I1IJ99V55v7I7IJr+   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rc  rb  rd  r   s     r)   r   AltCLIPMLP.forward@  s4    /**=9/r+   )rb  ry   rc  rd  r   r   s   @r)   r`  r`  8  s)    KU\\ ell  r+   r`  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )AltCLIPEncoderLayeriG  ry   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rc   rd   rg   rJ  rF  	self_attnr#   rn   ro   layer_norm1r`  mlplayer_norm2rx   s     r)   rd   AltCLIPEncoderLayer.__init__H  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr+   r   r   rU  r   r   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   rU  r   )rl  rk  rn  rm  )rB   r   r   rU  r   residualrB  r   s           r)   r   AltCLIPEncoderLayer.forwardP  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr+   )rJ  rl  rn  rm  rk  F)rJ   rK   rL   rM   r   rd   r&   r   r   r   rE   rO   r   rQ   r   r   s   @r)   ri  ri  G  sk    S} S -2&||& &  %||	&
 $D>& 
u  	!& &r+   ri  c                      ^  \ rS rSrSrS\4U 4S jjr\     SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\\4   4S jj5       rSrU =r$ )AltCLIPEncoderiy  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`AltCLIPEncoderLayer`].

Args:
    config: AltCLIPConfig
ry   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
rc   rd   ry   r#   r  r  r  ri  layersr!  )rB   ry   _rz   s      r)   rd   AltCLIPEncoder.__init__  sT    mm%PVPhPhJi$jJiQ%8%@Ji$jk&+# %kr$  r   rU  r   r%  r&  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       H0  u  pU(       a  Xy4-   nU" U	UUUS9nUS   n	U(       d  M(  XS   4-   nM2     U(       a  Xy4-   n[        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrI   )r   r   r   r(  )ry   r   r%  use_return_dictr+  rw  r   )rB   r   r   rU  r   r%  r&  encoder_statesall_attentionsr   idxencoder_layerr0  s                r)   r   AltCLIPEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B)%"3	M *!,M  !/3C2E!E #9  +.>>N+Vd
 	
r+   )ry   r!  rw  )NNNNN)rJ   rK   rL   rM   rN   r   rd   r   r   r&   r   r   r   rE   r   r   rQ   r   r   s   @r)   ru  ru  y  s    ,} ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r+   ru  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )AltCLIPVisionEmbeddingsi  ry   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebiasr   r   r[   r\   r^   )rc   rd   ry   rg   rJ  
image_size
patch_sizer#   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsre   position_embeddingrs   r'   rt   rx   s     r)   rd    AltCLIPVisionEmbeddings.__init__  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr+   r   heightwidthr   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr]   g      ?r   r   bicubicF)rv   modealign_cornersr   )r   r  weightr   r&   jit
is_tracingr[   r  r   rY  r   r#   r$   interpolater   cat)rB   r   r  r  r  r  r  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r)   interpolate_pos_encoding0AltCLIPVisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr+   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (rH  ra   r   r   r]   r   )r   r  r   r  r  rb   r   flattenr   r  rt   r&   r  r  r  r[   )rB   r  r  rZ  rx  r  r  target_dtypepatch_embedsclass_embedsr   s              r)   r   AltCLIPVisionEmbeddings.forward  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr+   )	r  ry   rJ  r  r  r  r  r  r  rs  )rJ   rK   rL   rM   r   rd   r&   r   r   r  rO   r   rQ   r   r   s   @r)   r  r    sj    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r+   r  c                   2    \ rS rSr% \\S'   SrSr/ rS r	Sr
g)AltCLIPPreTrainedModeli$  ry   altclipTc                 6   U R                   R                  n[        U[        5      (       a  U R                   R                  n[        R
                  R                  UR                  SUR                  S-  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  g[        U[        5      (       Ga   U R                   R                  nUR                  S-  SUR                   R                  -  S-  -  U-  nUR                  S-  U-  n[        R
                  R                  UR                  R                  US9  [        R
                  R                  UR                   R                  US9  [        R
                  R                  UR"                  R                  US9  [        R
                  R                  UR$                  R                  US9  g[        U[&        5      (       a  U R                   R                  nUR                   R(                  S-  SUR                   R                  -  S-  -  U-  nSUR                   R(                  -  S-  U-  n[        R
                  R                  UR*                  R                  US9  [        R
                  R                  UR,                  R                  US9  g[        U[.        5      (       a  [        R
                  R                  UR0                  R                  UR2                  S-  U R                   R                  -  S9  SUR0                  l        [        R
                  R                  UR6                  R                  UR8                  S-  U R                   R                  -  S9  SUR6                  l        g[        U[        R:                  5      (       aJ  UR<                  R>                  RA                  5         UR                  R>                  RC                  S5        g[        U[        RD                  5      (       ak  UR                  R>                  R                  SU R                   R                  S9  UR<                  b%  UR<                  R>                  RA                  5         gg[        U[        RF                  5      (       ax  UR                  R>                  R                  SU R                   R                  S9  URH                  b2  UR                  R>                  URH                     RA                  5         ggg)	zInitialize the weightsrX  rI  )meanstd)r  r   Tg      ?N)%ry   initializer_factorr   r  r#   initnormal_r  rJ  r  r  initializer_ranger  rF  r  rR  rP  rQ  rS  r`  rg   rc  rd  AltCLIPModeltext_projectiontext_embed_dim_is_hf_initializedvisual_projectionvision_embed_dimrn   r  datazero_fill_r   re   rV   )rB   r<  factorin_proj_stdout_proj_stdfc_stds         r)   _init_weights$AltCLIPPreTrainedModel._init_weights+  s   //f566[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 011[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE
++[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?--GGOO&&--))4/$++2P2PP   9=F""5GGOO((//++T1DKK4R4RR   ;?F$$7--KK""$MM$$S)		**MM&&CT[[5S5S&T{{&  &&( '--MM&&CT[[5S5S&T!!-""6#5#56<<> . .r+   rI   N)rJ   rK   rL   rM   r   rP   base_model_prefixsupports_gradient_checkpointing_no_split_moduler  rQ   rI   r+   r)   r  r  $  s    !&*#+?r+   r  c                      ^  \ rS rSrS\4U 4S jjr\\     SS\\	R                     S\\   S\\   S\\   S\\   S	\\\4   4S
 jj5       5       rSrU =r$ )AltCLIPVisionTransformeriY  ry   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rc   rd   ry   rg   r  r   r#   rn   ro   pre_layrnormru  encoderpost_layernorm)rB   ry   rJ  rz   s      r)   rd   !AltCLIPVisionTransformer.__init__Z  sd    &&	1&9LL8M8MN%f- ll9:O:OPr+   r  r   r%  r&  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUSS9nUS   nUS S 2SS S 24   n	U R                  U	5      n	[        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r  T)r   r   r%  r&  r   r)  pooler_outputr   r*  )ry   r   r%  r{  r   r   r  r  r  r   r   r*  )
rB   r  r   r%  r&  r  r   encoder_outputsr)  r:  s
             r)   r    AltCLIPVisionTransformer.forwardd  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5	 ' 
 ,A.)!Q'2++M:)/')77&11	
 	
r+   )ry   r   r  r  r  )NNNNF)rJ   rK   rL   rM   r   rd   r   r   r   r&   rO   r   r   rE   r   r   rQ   r   r   s   @r)   r  r  Y  s    Q2 Q  59,0/3&*38$
u001$
 $D>$
 'tn	$

 d^$
 #+4.$
 
u00	1$
  $
r+   r  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )AltCLIPVisionModeli  ry   r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rc   rd   r  vision_model	post_initrx   s     r)   rd   AltCLIPVisionModel.__init__  s'     4V<r+   r   c                 B    U R                   R                  R                  $ r   )r  r   r  rG   s    r)   get_input_embeddings'AltCLIPVisionModel.get_input_embeddings  s      ++;;;r+   r   r%  r  r&  c                 ^    Ub  UOU R                   R                  nU R                  UUUUUS9$ )aN  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPVisionModel

>>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r  r   r%  r  r&  )ry   r{  r  )rB   r  r   r%  r  r&  s         r)   r   AltCLIPVisionModel.forward  sA    : &1%<k$++B]B]  %/!5%=# ! 
 	
r+   )r  )NNNFN)rJ   rK   rL   rM   r   rP   main_input_namerd   r#   Moduler  r   r   r&   rO   r   r   rE   r   r   rQ   r   r   s   @r)   r  r    s    $O2 <bii <  59,0/3).&*$
u001$
 $D>$
 'tn	$

 #'$
 d^$
 
u00	1$
 $
r+   r  aE  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                   n  ^  \ rS rSr% \\S'   SU 4S jjrS rS rS r	\
         SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )AltRobertaModeli  ry   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rc   rd   ry   rS   r   r  r  r3  poolerr  )rB   ry   add_pooling_layerrz   s      r)   rd   AltRobertaModel.__init__  sL    
 	 .v6(02C&v. 	r+   c                 .    U R                   R                  $ r   r   ri   rG   s    r)   r  $AltRobertaModel.get_input_embeddings  s    ...r+   c                 $    XR                   l        g r   r  rB   r   s     r)   set_input_embeddings$AltRobertaModel.set_input_embeddings  s    */'r+   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r   r  r   )rB   heads_to_pruner   r   s       r)   _prune_headsAltRobertaModel._prune_heads  s<    
 +002LELLu%//;;EB 3r+   r   r   r`   r[   r   r   r   r%  r&  r   c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R!                  X*5      nU R#                  XPR                   R$                  5      nU R                  UUUUS9nU R'                  UUUUUSS	9nUS
   nU R(                  b  U R)                  U5      OS n[+        UUUR,                  UR.                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer]   z5You have to specify either input_ids or inputs_embedsr!   r`   r}   )r   r[   r`   r   T)r   r   r   r%  r&  r   r  )ry   r   r%  r{  r   %warn_if_padding_and_no_attention_maskrv   r"   r&   onesr   r   r`   rt   ru   rw   get_extended_attention_maskget_head_maskr  r  r  r   r   r*  )rB   r   r   r`   r[   r   r   r   r%  r&  r   rZ  r   r"   r   r   extended_attention_maskembedding_outputr  sequence_outputr:  s                        r)   r   AltRobertaModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r+   )ry   r   r  r  )T	NNNNNNNNN)rJ   rK   rL   rM   r   rP   rd   r  r  r  r   r   r&   r   r   r   rE   r   r   rQ   r   r   s   @r)   r  r    s     /0C  -11515/3,004,0/3&*G
ELL)G
 !.G
 !.	G

 u||,G
 ELL)G
  -G
 $D>G
 'tnG
 d^G
 
uU\\"$PP	QG
 G
r+   r  c                     ^  \ rS rSr% \\S'   U 4S jrS\R                  4S jr	S\R                  SS4S jrSS	\\   S\R                  4U 4S
 jjjr\\         SS\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )AltCLIPTextModeli9  ry   c                   > [         TU ]  U5        [        USS9U l        [        R
                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        U R                  5         g )NF)r  rW   )rc   rd   r  robertar#   r   rg   project_dimtransformationrn   ro   pre_LNr  rx   s     r)   rd   AltCLIPTextModel.__init__<  se     &vG ii(:(:F<N<NOll6#5#56;P;PQr+   r   c                 B    U R                   R                  R                  $ r   r  r   ri   rG   s    r)   r  %AltCLIPTextModel.get_input_embeddingsC  s    ||&&666r+   r   Nc                 8    XR                   R                  l        g r   r  r  s     r)   r  %AltCLIPTextModel.set_input_embeddingsF  s    27/r+   new_num_tokensc                 "   > [         TU ]  U5      $ r   )rc   resize_token_embeddings)rB   r  rz   s     r)   r  (AltCLIPTextModel.resize_token_embeddingsI  s    w.~>>r+   r   r   r`   r[   r   r   r   r&  r%  c
                    Ub  UOU R                   R                  nU R                  UUUUUUUU	SS9	n
U
S   nU R                  U5      nU R	                  U5      nUSS2S4   n[        UUU
R                  U
R                  S9$ )a  
Examples:

```python
>>> from transformers import AutoProcessor, AltCLIPTextModel

>>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> texts = ["it's a cat", "it's a dog"]

>>> inputs = processor(text=texts, padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```NT)	r   r   r`   r[   r   r   r   r%  r&  r   r  )ry   r{  r  r  r  r   r   r*  )rB   r   r   r`   r[   r   r   r   r&  r%  r   r  projection_stater  s                 r)   r   AltCLIPTextModel.forwardL  s    @ &1%<k$++B]B],,))%'/!5  

 "!* ++o6  ..?(A.6.'!//))	
 	
r+   )r  r  r  r   r  )rJ   rK   rL   rM   r   rP   rd   r#   r  r  re   r  r   r   r  r   r   r&   r   r   r   rE   r   r   rQ   r   r   s   @r)   r  r  9  s=   7bii 78",, 84 8?hsm ?r|| ? ?  -11515/3,004,0&*/3;
ELL);
 !.;
 !.	;

 u||,;
 ELL);
  -;
 $D>;
 d^;
 'tn;
 
u==	>;
  ;
r+   r  c                   V  ^  \ rS rSr% \\S'   S\4U 4S jjr\" 5       \   SS\	R                  S\\	R                     S\\	R                     S\\	R                     S\	R                  4
S	 jj5       5       r\" 5       \ SS
\	R                  S\S\	R                  4S jj5       5       r\          SS\\	R                      S
\\	R                     S\\	R                     S\\	R                      S\\	R                     S\\   S\\   S\\   S\S\\   S\\\4   4S jj5       rSrU =r$ )r  i  ry   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  Ul	        UR                  U l
        UR                  U l        UR                  U l        [        U5      U l        [#        U5      U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R.                  " [0        R2                  " U R4                  R6                  5      5      U l        U R;                  5         g )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)r  )rc   rd   r   vision_configr   	TypeErrortypetext_configr   r   projection_dimr  r  rg   r  r  
text_modelr  r  r#   r   r  r  r  r&   tensorry   logit_scale_init_valuelogit_scaler  )rB   ry   r  r  rz   s       r)   rd   AltCLIPModel.__init__  sk    &..0CDD--./q2  &,,.?@@++,-Q0 
 ((,,-3-H-H*$33)55 - 9 9*;74]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r+   r   r   r[   r`   r   c                 d    U R                  UUUUS9nUR                  nU R                  U5      nU$ )af  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`AltCLIPTextModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```)r   r   r[   r`   )r  r  r  )rB   r   r   r[   r`   text_outputsr:  text_featuress           r)   get_text_featuresAltCLIPModel.get_text_features  sF    6 )%)	 ' 
 %22,,];r+   r  r  c                 `    U R                  UUS9nUR                  nU R                  U5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AltCLIPModel
>>> from transformers.image_utils import load_image

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```)r  r  )r  r  r  )rB   r  r  vision_outputsr:  image_featuress         r)   get_image_featuresAltCLIPModel.get_image_features  sB    : **%%= + 
 '44//>r+   return_lossr   r%  r&  c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
U R	                  UUUUUUU
S9nU R                  UUUU	U
S9nUS   nU R                  U5      nUS   nU R                  U5      nXR                  SSSS9-  nXR                  SSSS9-  nU R                  R                  5       n[        R                  " XR                  5       5      U-  nUR                  nSnU(       a  [        U5      nU
(       d  UUXX4nUb  U4U-   $ U$ [!        UUUUUUUS	9$ )
a0  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```N)r   r   r`   r[   r   r%  r&  r  r   r   r]   T)r?  r   keepdim)r5   r6   r7   r8   r9   r:   r;   )ry   r   r%  r{  r  r  r  r  normr!  expr&   r   r.   Tr1   r3   )rB   r   r  r   r[   r`   r-  r   r%  r  r&  r$  r)  r9   r8   r!  r7   r6   r5   r   s                       r)   r   AltCLIPModel.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%/!5# ' 
 **%/!5%=# + 
 &a(--l;"1o**;7 $&7&7!T&7&RR!$4$4qb$$4$OO &&**,,,{NN4DES*,,_-D&T`qF)-)9TGf$EvE-+#%* .
 	
r+   )r!  r  r  r  r  r  r  r  )NNNrs  )
NNNNNNNNFN)rJ   rK   rL   rM   r   rP   rd   r   r   r&   r   r   rO   r&  r   r+  
LongTensorr   rE   r3   r   rQ   r   r   s   @r)   r  r    s   } B %& 26/315"<<" !." u||,	"
 !." 
		"  '"H %& */"''" #'" 
			"  '"H  1548153715&*,0/3).&*[
E,,-[
 u001[
 !.	[

 u//0[
 !.[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 [
r+   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )ner   r&   cumsumtype_asrw   )r   rV   r   maskincremental_indicess        r)   r~   r~   \  sW     <<$((*D <<!4<<TBE[[_cc##%33r+   )r  r  r  r  )rX  )r   )HrN   r   dataclassesr   typingr   r   r   r   r&   torch.nnr#   activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   configuration_altclipr   r   r   
get_loggerrJ   loggerr   r*   r1   r3   r  rS   r   r   r   r   r   r  r	  r  r3  floatrD  rF  r`  ri  ru  r  r  r  r  r  r  r  r~   __all__rI   r+   r)   <module>rI     s     ! 1 1   ! 9  G l l w w X X 
		H	%
`U\\ `ell `-%,, -5<< -  
K  
   
HV=299 V=rSbii Sn299  $& "
*")) *\RYY  ryy %0 %R.
		 .
dryy . %II%<<% 
% <<	%
 U\\*% % %.F)ryy F)T /4 /dT
RYY T
pPbii Pf 1?_ 1? 1?h1
ryy 1
h2
/ 2
j k
, k
k
\P
- P
fL
) L
`4  _r+   