
    bCiYq                        S SK r S SKJr  S SKJrJrJr  S SKrS SKJ	s  J
r  S SKJ	r	  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJr  SSKJrJrJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)  \\ " S S\5      5       5       r*\" S5       " S S\	RV                  5      5       r, " S S\	RV                  5      r- " S S\	RV                  5      r. " S S\	RV                  5      r/ S<S\	RV                  S\R`                  S\R`                  S \R`                  S!\\R`                     S"\1S#\14S$ jjr2 " S% S&\	RV                  5      r3 " S' S(\5      r4 " S) S*\	RV                  5      r5 " S+ S,\	RV                  5      r6\ " S- S.\5      5       r7\" S/S09 " S1 S2\75      5       r8\" S3S09 " S4 S5\75      5       r9S6\R`                  S7\R`                  4S8 jr:\ " S9 S:\75      5       r;/ S;Qr<g)=    N)	dataclass)AnyCallableOptional)nn   )ACT2FN)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplefilter_out_non_signature_kwargs)deprecate_kwarg)check_model_inputs   )Aimv2ConfigAimv2TextConfigAimv2VisionConfigc                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)Aimv2Output,   ai  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2VisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r%   r&   N)getattrto_tuple).0kselfs     b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>'Aimv2Output.to_tuple.<locals>.<genexpr>K   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysr.   s   `r/   r+   Aimv2Output.to_tupleJ   s#     
YY[
 
 	
     )__name__
__module____qualname____firstlineno____doc__r    r   torchFloatTensor__annotations__r!   r"   r#   r$   r%   r   r&   r2   r   r+   __static_attributes__r7   r6   r/   r   r   ,   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r6   r   RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Aimv2RMSNormQ   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Aimv2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr=   onesweightvariance_epsilon)r.   hidden_sizeeps	__class__s      r/   rG   Aimv2RMSNorm.__init__S   s/     	ll5::k#:; #r6   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor=   float32powmeanrsqrtrK   rJ   )r.   hidden_statesinput_dtypevariances       r/   forwardAimv2RMSNorm.forward[   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r6   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r2   rJ   shaperK   r4   s    r/   
extra_reprAimv2RMSNorm.extra_reprb   s*    ))*+6$2G2G1HIIr6   )rK   rJ   )gư>)	r8   r9   r:   r;   rG   r]   ra   r@   __classcell__rN   s   @r/   rC   rC   Q   s    $;J Jr6   rC   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Aimv2MLPf   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nbias)rF   rG   configrL   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr.   rk   rN   s     r/   rG   Aimv2MLP.__init__g   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r6   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ N)rq   rs   ro   rp   )r.   xrq   s      r/   r]   Aimv2MLP.forwardq   s6    NN4;;t~~a/@#ADLLQRO#ST	r6   )rs   rk   rq   ro   rL   rl   rp   )r8   r9   r:   r;   rG   r]   r@   rc   rd   s   @r/   rf   rf   f   s    0 r6   rf   c                      ^  \ rS rSrS\4U 4S jjr\SSS\R                  4S\R                  4S jj5       r
S	\R                  S\R                  4S
 jrSrU =r$ )Aimv2VisionEmbeddingsv   rk   c                 B  > [         TU ]  5         Xl        UR                  U l        [        R
                  " UR                  UR                  UR                  UR                  S9U l        [        UR                  UR                  5      U l        UR                  UR                  -  S-  nU R                  R                  (       d%  [        R                  " X!R                  5      U l        U R!                  S["        R$                  " U5      R'                  S5      SS9  g )N)kernel_sizestriderQ   position_idsr   rR   F
persistent)rF   rG   rk   
patch_sizer   Conv2dnum_channelsrL   patch_embedrC   rms_norm_epsrms_norm
image_size	is_native	Embeddingposition_embeddingregister_bufferr=   arangeexpand)r.   rk   num_patchesrN   s      r/   rG   Aimv2VisionEmbeddings.__init__w   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir6      g     @cpur'   c                    [         R                  " [        U5      XTS9n[         R                  " [        U 5      XTS9n[         R                  " XgSS9u  pvUS-  n[         R                  " XUS9U-  n	SX9-  -  n	UR	                  5       S   U	S S S 24   -  n
UR	                  5       S   U	S S S 24   -  n[         R
                  " U
R                  5       U
R                  5       UR                  5       UR                  5       /SS9S S S 2S S 24   $ )	NrT   devicexy)indexing   g      ?).Nr   dim)r=   r   intmeshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   rT   grid_wgrid_hpos_dimomegaout_hout_ws               r/   "build_2d_sincos_position_embedding8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s     c%jEc&kFFq.W&AGK{)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr6   pixel_valuesc                    UR                  5       u    p#nU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R
                  R                  (       aT  U R                  X0R                  -  X@R                  -  U R
                  R                  UR                  UR                  S9nOU R                  U R                  5      nXV-   nU$ )NrQ   r   )r   r   rT   )sizer   r   	transposer   rk   r   r   r   rL   r   rT   r   r   )r.   r   _r   r   rZ   	pos_embeds          r/   r]   Aimv2VisionEmbeddings.forward   s    *//11e((6>>qAKKAqQm4;;  ??//)(++11$++#)) @ I //0A0ABI%1r6   )rk   r   r   r   r   )r8   r9   r:   r;   r   rG   staticmethodr=   rV   Tensorr   r]   r@   rc   rd   s   @r/   r{   r{   v   sb    j0 j !$'%u}}e	e e ELL U\\  r6   r{   c            	          ^  \ rS rSrS\4U 4S jjr   S
S\\R                     S\\R                     S\\R                     S\R                  4S jjrS	rU =r$ )Aimv2TextEmbeddings   rk   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nr   r   Fr   )rF   rG   rL   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r=   r   r   )r.   rk   r   rN   s      r/   rG   Aimv2TextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r6   	input_idsr   inputs_embedsr'   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )NrR   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r`   r   rJ   
ValueErrorr   r   )r.   r   r   r   
seq_lengthmax_position_embeddingposition_embeddings
embeddingss           r/   r]   Aimv2TextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r6   )r   r   NNN)r8   r9   r:   r;   r   rG   r   r=   
LongTensorr>   r   r]   r@   rc   rd   s   @r/   r   r      so    

 

 153759	E,,- u//0   1 12	
 
 r6   r   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrR   r   )r   rT   )ptrainingr   rQ   )r=   matmulr   r   
functionalsoftmaxrV   rU   rT   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r/   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r6   c            
          ^  \ rS rSrSrU 4S jr S	S\R                  S\\R                     S\	\R                  \\R                     4   4S jjr
SrU =r$ )
Aimv2Attention   z=Multi-headed attention from 'Attention Is All You Need' paperc                 h  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fri   )rF   rG   rk   rL   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   rm   qkv_biask_projv_projq_projout_projrt   s     r/   rG   Aimv2Attention.__init__   s0   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr6   rZ   r   r'   c                 2   UR                   u  pEnU R                  U5      nU R                  U5      nU R                  U5      n	UR	                  XEU R
                  U R                  5      R                  SS5      nUR	                  XEU R
                  U R                  5      R                  SS5      nU	R	                  XEU R
                  U R                  5      R                  SS5      n	[        n
U R                  R                  S:w  a  [        U R                  R                     n
U
" U UUU	UU R                  U R                  U R                  (       d  SOU R                  S9u  pUR!                  XEU5      R#                  5       nU R%                  U5      nX4$ )z#Input shape: Batch x Time x Channelr   rQ   eager        )r   r   r   )r`   r   r   r   viewr   r   r   r   rk   _attn_implementationr   r   r   r   r   reshaper   r   )r.   rZ   r   r   
batch_sizer   r   queriesr3   valuesattention_interfacer   r   s                r/   r]   Aimv2Attention.forward   sS    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0((r6   )rk   r   r   r   r   r   r   r   r   r   r   rw   )r8   r9   r:   r;   r<   rG   r=   r   r   r2   r]   r@   rc   rd   s   @r/   r   r      s[    GX, 26$)||$) !.$)
 
u||Xell33	4$) $)r6   r   c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\\R                     S\	\
   S\R                  4S jjrS	rU =r$ )Aimv2EncoderLayeri&  rk   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g rw   )rF   rG   r   	attentionrf   ffnrC   rL   r   	rms_norm1	rms_norm2rt   s     r/   rG   Aimv2EncoderLayer.__init__'  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr6   rZ   r   r   r'   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)rZ   r   r7   )r   r   r   r   )r.   rZ   r   r   norm_hidden_statesr   r   
mlp_outputs           r/   r]   Aimv2EncoderLayer.forward.  sa     "^^M:r6Hrkqr%3!^^M:XX01
%2r6   )r   r   r   r   rw   )r8   r9   r:   r;   r   rG   r=   r   r   r   r   r]   r@   rc   rd   s   @r/   r   r   &  s^    O0 O 26|| !. +,	
 
 r6   r   c                   z   ^  \ rS rSrSrS\4U 4S jjr\ S
S\\	R                     S\\   S\4S jj5       rS	rU =r$ )Aimv2Encoderi?  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Aimv2EncoderLayer`].

Args:
    config: Aimv2Config
rk   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rF   rG   rk   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r.   rk   r   rN   s      r/   rG   Aimv2Encoder.__init__H  sS    mmfNfNfHg$hHg1%6v%>Hg$hi&+# %is   A&r   r   r'   c                 R    UnU R                    H  nU" UU40 UD6nM     [        US9$ )N)last_hidden_state)r  r   )r.   r   r   r   rZ   encoder_layers         r/   r]   Aimv2Encoder.forwardO  s>     &![[M) M ) ??r6   )rk   r  r  rw   )r8   r9   r:   r;   r<   r   rG   r   r   r=   r   r   r   r   r]   r@   rc   rd   s   @r/   r  r  ?  s_    ,{ ,  26@ !.@ +,	@
 
@ @r6   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2AttentionPoolingHeadia  rk   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " [        R                  " SSU R                  5      5      U l        [
        R                  " U R                  U R                  SS9U l        g )Nri   r   T)rF   rG   rL   r   r   r   rm   r   r   r   rH   r=   zeros	cls_tokenoutput_projrt   s     r/   rG   "Aimv2AttentionPoolingHead.__init__b  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr6   rZ   r'   c                    UR                   u  p#nU R                  R                  USS5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nUR	                  USU R
                  X@R
                  -  5      nUR                  SSSS5      nUR                  SSSS5      nUR                  SSSS5      n[        R                  " XU5      n	U	R                  SS5      R	                  USU5      n	U	R                  SS9n	U R                  U	5      n
U
$ )NrR   r   r   rQ   r   r   )r`   r  r   r   r   r   r   permuteFscaled_dot_product_attentionr   rX   r  )r.   rZ   r   seq_len
hidden_dimr  r   r   r   r   outputs              r/   r]   !Aimv2AttentionPoolingHead.forwardm  s8   *7*=*='
ZNN))*b"=	kk-(00dnnV`drdrVrsM*22:XbftftXtu!!*a~~A]^kk!Q1%aAq)aAq)44UG!++Aq199*aT!&&1&-!!+.r6   )r  rL   r   r   r  r   )r8   r9   r:   r;   r   rG   r=   r   r]   r@   rc   rd   s   @r/   r  r  a  s2    	T0 	TU\\ ell  r6   r  c                   T   ^  \ rS rSr% Sr\\S'   SrSr/ SQr	Sr
SrSrU 4S jrSrU =r$ )	Aimv2PreTrainedModeli  z
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. The model is only intended for inference and doesn't support finetuning.
rk   aimv2T)r   r  r{   r   c                   > [         TU ]  U5        [        US5      (       ad  [        UR                  [
        R                  5      (       a:  UR                  R                  R                  [        R                  " S5      5        g g [        U[        5      (       a9  UR                  R                  R                  SU R                  R                  S9  g g )Nlogit_scaleg$I$I,@r   )rX   std)rF   _init_weightshasattr
isinstancer%  r   rH   datafill_mathlogr  r  normal_rk   initializer_range)r.   r   rN   s     r/   r'  "Aimv2PreTrainedModel._init_weights  s    f%6=))&,,bll;;""''--dhhx.@A < 9::!!))s8U8U)V ;r6   r7   )r8   r9   r:   r;   r<   r   r?   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr'  r@   rc   rd   s   @r/   r"  r"    sC    
 &*# NW Wr6   r"  zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc            
          ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jr\" SS	S
9\" SS9\ SS\\R&                     S\\   S\4S jj5       5       5       rSrU =r$ )Aimv2VisionModeli  rk   r   rZ   
attentionsc                 >  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  (       a  [        U5      U l        U R                  5         g rw   )rF   rG   rk   r{   r   r  encoderrC   rL   r   r   use_headr  head	post_initrt   s     r/   rG   Aimv2VisionModel.__init__  so     /7#F+$V%7%79L9LM==1&9DIr6   r'   c                 .    U R                   R                  $ rw   )r   r   r4   s    r/   get_input_embeddings%Aimv2VisionModel.get_input_embeddings  s    ***r6   r   zv4.58.0)versionFtie_last_hidden_statesr   c                     U R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      nU R                  (       a  U R                  U5      OSn[        UUS9$ )ar  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```r   Nr  pooler_outputr7   )r   r=  r  r   r>  r?  r   )r.   r   r   r   rZ   encoder_outputsr  rJ  s           r/   r]   Aimv2VisionModel.forward  sx    : 5+/<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r6   )rk   r   r=  r?  r   r>  rw   )r8   r9   r:   r;   r   r?   main_input_namer   r   _can_record_outputsrG   r   ModulerC  r   r   r   r   r=   r   r   r   r   r]   r@   rc   rd   s   @r/   r9  r9    s     $O*$
0 +bii + %y9u5 26)
 !.)
 +,	)

 
$)
  6 :)
r6   r9  zJ
    The text model from AIMv2 without any head or projection on top.
    c            	          ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
R                  4S jrS r\" S	S
9\ SS\\R$                     S\\   S\4S jj5       5       rSrU =r$ )Aimv2TextModeli  r   r:  rk   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  5         g rw   )rF   rG   rk   r   r   r  r=  rC   rL   r   r   eos_token_idr@  rt   s     r/   rG   Aimv2TextModel.__init__  s_     -f5#F+$V%7%79L9LM"//r6   r'   c                 .    U R                   R                  $ rw   r   r   r4   s    r/   rC  #Aimv2TextModel.get_input_embeddings  s    ...r6   c                 $    XR                   l        g rw   rV  )r.   r   s     r/   set_input_embeddings#Aimv2TextModel.set_input_embeddings  s    */'r6   FrF  r   r   c           	         U R                  U5      nUR                  u  pVn[        R                  " U[        R                  UR
                  S9nUR                  S5      R                  US5      n	Ub  [        U R                  UU	UUS S9nU R                  " S	UUS.UD6n
U
R                  nU R                  U5      nU[        R                  " UR                  S   UR
                  S9UR                  [        R                  UR
                  S9U R                  :H  R                  5       R!                  SS94   n[#        UUS9$ )
Nr   r   rR   )rk   input_embedsr   r   cache_positionpast_key_values)r   r   )r   r   rI  r7   )r   r`   r=   r   longr   	unsqueezer   r   rk   r=  r  r   rU   r   rS  argmaxr   )r.   r   r   r   rZ   r   r  r   r]  r   rK  r  pooled_outputs                r/   r]   Aimv2TextModel.forward  sJ    	2!.!4!4
QgUZZH\H\]%//299*bI%/{{*)-- $N ,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r6   )rk   r   r=  rS  r   rw   )r8   r9   r:   r;   rM  r   r   rN  r   rG   r   rO  rC  rY  r   r   r   r=   r   r   r   r   r]   r@   rc   rd   s   @r/   rQ  rQ    s     "O +$
	 	/bii /0 u5 26'
 !.'
 +,	'

 
$'
  6'
r6   rQ  tensorr'   c                     [         R                  " U S5      n[         R                  " USSS9n[         R                  " US5      nU$ )z
This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
rQ   rR   T)r   rS   g      ?)r=   rW   sum)rd  square_tensor
sum_tensornormed_tensors       r/   _get_vector_normrj  7  s<    
 IIfa(M=b$?JIIj#.Mr6   c                     ^  \ rS rSr% \\S'   / SQrSrS\4U 4S jjr\	" 5       \
  SS\R                  S\\R                     S\\R                     S	\R                  4S
 jj5       5       r\	" 5       \
 SS\R                  S\S	\R                  4S jj5       5       r\
\   SS\\R&                     S\\R                     S\\R                     S\\   S	\4
S jj5       5       rSrU =r$ )
Aimv2ModeliB  rk   )r   r   r{   Tc                   > [         TU ]  U5        UR                  U l        UR                  R                  U l        UR                  R                  U l        [        R                  UR                  5      U l
        [        R                  UR                  5      U l        [        R                  " U R
                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        [.        R0                  " UR2                  5      U l        U R7                  5         g )NFri   )rF   rG   projection_dimvision_configrL   vision_embed_dimtext_configtext_embed_dimr9  _from_configvision_modelrQ  
text_modelr   rm   visual_projectiontext_projectionrH   r=   rd  rk   logit_scale_init_valuer%  r,  r-  max_logit_scalemax_log_logit_scaler@  rt   s     r/   rG   Aimv2Model.__init__H  s     $33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r6   r   r   r   r'   c                 b    U R                  UUUS9nUR                  nU R                  U5      nU$ )a~  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`Aimv2TextModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, Aimv2Model

>>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```)r   r   r   )ru  rJ  rw  )r.   r   r   r   text_outputsrb  text_featuress          r/   get_text_featuresAimv2Model.get_text_featuresZ  sD    6 48??)% 4C 4

 %22,,];r6   r   interpolate_pos_encodingc                 `    U R                  UUS9nUR                  nU R                  U5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`Aimv2VisionModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, Aimv2Model
>>> from transformers.image_utils import load_image

>>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```)r   r  )rt  rJ  rv  )r.   r   r  vision_outputsrb  image_featuress         r/   get_image_featuresAimv2Model.get_image_features  sC    < 6:5F5F%%= 6G 6
 '44//>r6   r   c           	          U R                   " SSU0UD6nU R                  " SUUS.UD6nUR                  nU R                  U5      nUR                  nU R	                  U5      nU[        U5      -  nU[        U5      -  nU R                  R                  SU R                  5      R                  5       R                  UR                  5      n	X-  UR                  5       -  n
U
R                  5       n[        UU
UUUUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   )r   r   r   )r!   r"   r#   r$   r%   r&   r7   )rt  ru  rJ  rv  rw  rj  r%  clamprz  exprU   r   tr   )r.   r   r   r   r   r  r}  r$   r#   r%  r"   r!   s               r/   r]   Aimv2Model.forward  s   > 6:5F5F 6
%6
6

 48?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r6   )	r%  rz  rn  rr  ru  rw  rp  rt  rv  )NN)Fr   )r8   r9   r:   r;   r   r?   r3  r5  rG   r   r   r=   r   r   r>   r  boolr  r   r   r   r   r   r]   r@   rc   rd   s   @r/   rl  rl  B  sT   ]{ $ %& 26/3	!<<! !.! u||,	!
 
		!  '!F %& */#''# #'# 
			#  '#J  154815	=
E,,-=
 u001=
 !.	=

 +,=
 
=
  =
r6   rl  )r9  rl  r"  rQ  )r   )=r,  dataclassesr   typingr   r   r   r=   torch.nn.functionalr   r   r  activationsr	   integrationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.deprecationr   utils.genericr   configuration_aimv2r   r   r   r   rO  rC   rf   r{   r   r   floatr   r   r   r  r  r"  r9  rQ  rj  rl  __all__r7   r6   r/   <module>r     s  .  ! * *     ! 7 / 9 K F & w w 0 / P P  
+  
   
F Y'J299 J (J(ryy  1BII 1h%")) %^ %II%<<% 
% <<	%
 U\\*% % %.:)RYY :)z2 2@299 @D		 D W? W W8 
E
+ E

E
P 
B
) B

B
JU\\ ell  b
% b
 b
J Wr6   