
    bCin                        S r SSKrSSKJr  SSKrSSKJs  Jr  SSKJr  SSK	J
r
  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJr  SSKJr  SSKJrJrJr  SSK J!r!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(J)r)J*r*   " S S\&5      r+ " S S\%5      r, " S S\$5      r- " S S\*5      r. " S S\"5      r/ " S S\!5      r0 " S S \Rb                  5      r2 " S! S"\5      r3 " S# S$\(5      r4 " S% S&\5      r5 " S' S(\)5      r6 " S) S*\Rb                  5      r7\ " S+ S,\5      5       r8\" S-S.9 " S/ S0\85      5       r9\" S1S.9 " S2 S3\85      5       r:\ " S4 S5\5      5       r;/ S6Qr<g)7z%Pytorch implementation of AIMv2 Model    N)Optional)nn   )create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)deprecate_kwarg)check_model_inputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputc                      ^  \ rS rSrSr               SS\S\S\S\S\S\S	\S
\S\S\S\S\S\S\S\4U 4S jjjr	Sr
U =r$ )Aimv2VisionConfig+   a  
This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.
    use_head (`str`, *optional*, defaults to `True`):
        Whether to use Attention Pooling Head or Not.
    is_native (`str`, *optional*, defaults to `False`):
        Whether to use ckpt trained for image native resolution or not.
Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_bias
hidden_actinitializer_rangeuse_head	is_nativec                    > [         TU ]  " SUUUUUUUUU
S.	UD6  Xl        Xl        Xl        Xl        Xl        Xl        Xl        U ?	g )N)	r    r!   r"   r#   r+   r$   r%   r&   r)    )
super__init__r-   r,   r(   r*   r)   r'   r.   layer_norm_eps)selfr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   kwargs	__class__s                    a/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/aimv2/modular_aimv2.pyr2   Aimv2VisionConfig.__init__d   sj    & 	 	
#// 3!%!!	
 	
 !!2!2  ("    )r(   r,   r.   r*   r)   r'   r-   )i   i         r         h㈵>        FFsilu{Gz?TF)__name__
__module____qualname____firstlineno____doc__intfloatboolstrr2   __static_attributes____classcell__r6   s   @r7   r   r   +   s    6t  !%!##$"#& #'!( (  (  	( 
 !(  (  (  (  (  !(  (  (  (  !(  (   !(  ( r9   r   c                      ^  \ rS rSrSr               SS\S\S\S\S\S\S	\S
\S\S\S\	\   S\	\   S\S\S\4U 4S jjjr
SrU =r$ )Aimv2TextConfig   a   
This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
        the `inputs_ids` passed when calling [`Aimv2Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.

vocab_sizer    r!   r"   r#   r'   r(   r)   r*   r+   pad_token_idbos_token_ideos_token_idmax_position_embeddingsr,   c                    > [         TU ]  " SUUUUUU
UUUUS.
UD6  Xl        Xpl        Xl        Xl        X`l        U ?U ?U ?	U ?
g )N)
rQ   r    r!   r"   r#   r+   rU   rR   rS   rT   r0   )r1   r2   r,   r(   r*   r)   r'   rS   rR   projection_sizer3   )r4   rQ   r    r!   r"   r#   r'   r(   r)   r*   r+   rR   rS   rT   rU   r,   r5   r6   s                    r7   r2   Aimv2TextConfig.__init__   su    & 	 	
!#// 3!$;%%%	
 	
 "3!2  ( r9   )r(   r,   r*   r)   r'   )i   i   i         r>   r?   FFr@   NNi  M   rA   )rB   rC   rD   rE   rF   rG   rH   rI   rJ   r   r2   rK   rL   rM   s   @r7   rO   rO      s    +^  !%!##$"#& &*&*!')"&!* *  *  	* 
 *  !*  *  !*  *  *  *  sm*  sm*  *  "%*    !*  * r9   rO   c                   2   ^  \ rS rSrSr SU 4S jjrSrU =r$ )Aimv2Config   a  
[`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing a AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```c                 R   > [         TU ]  " X40 UD6  X0l        X@l        SU l        U ?g )Ng      Y@)r1   r2   projection_dimlogit_scale_init_valuemax_logit_scaleinitializer_factor)r4   text_configvision_configr`   ra   r5   r6   s         r7   r2   Aimv2Config.__init__  s1     	>v>,&<#$#r9   )ra   rb   r`   )NNi   g/L
F@)rB   rC   rD   rE   rF   r2   rK   rL   rM   s   @r7   r]   r]      s    +\ `f$ $r9   r]   c                       \ rS rSrSrg)Aimv2Outputi#  r0   NrB   rC   rD   rE   rK   r0   r9   r7   rh   rh   #      r9   rh   c                       \ rS rSrSrg)Aimv2RMSNormi'  r0   Nri   r0   r9   r7   rl   rl   '  rj   r9   rl   c                       \ rS rSrSrg)Aimv2MLPi+  r0   Nri   r0   r9   r7   rn   rn   +  rj   r9   rn   c                      ^  \ rS rSrS\4U 4S jjr\SSS\R                  4S\R                  4S jj5       r
S	\R                  S\R                  4S
 jrSrU =r$ )Aimv2VisionEmbeddingsi/  configc                 B  > [         TU ]  5         Xl        UR                  U l        [        R
                  " UR                  UR                  UR                  UR                  S9U l        [        UR                  UR                  5      U l        UR                  UR                  -  S-  nU R                  R                  (       d%  [        R                  " X!R                  5      U l        U R!                  S["        R$                  " U5      R'                  S5      SS9  g )N)kernel_sizestrider   position_ids)   F)
persistent)r1   r2   rq   r&   r   Conv2dr$   r    patch_embedrl   r'   rms_normr%   r.   	Embeddingposition_embeddingregister_buffertorcharangeexpand)r4   rq   num_patchesr6   s      r7   r2   Aimv2VisionEmbeddings.__init__0  s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir9      g     @cpureturnc                    [         R                  " [        U5      XTS9n[         R                  " [        U 5      XTS9n[         R                  " XgSS9u  pvUS-  n[         R                  " XUS9U-  n	SX9-  -  n	UR	                  5       S   U	S S S 24   -  n
UR	                  5       S   U	S S S 24   -  n[         R
                  " U
R                  5       U
R                  5       UR                  5       UR                  5       /SS9S S S 2S S 24   $ )	Ndtypedevicexy)indexing   g      ?).Nrv   dim)r   r   rG   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               r7   "build_2d_sincos_position_embedding8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding>  s     c%jEc&kFFq.W&AGK{)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr9   pixel_valuesc                    UR                  5       u    p#nU R                  U5      R                  S5      R                  SS5      nU R	                  U5      nU R
                  R                  (       aT  U R                  X0R                  -  X@R                  -  U R
                  R                  UR                  UR                  S9nOU R                  U R                  5      nXV-   nU$ )Nr   rv   )r   r   r   )sizerz   r   	transposer{   rq   r.   r   r&   r    r   r   r}   ru   )r4   r   _r   r   hidden_states	pos_embeds          r7   forwardAimv2VisionEmbeddings.forwardO  s    *//11e((6>>qAKKAqQm4;;  ??//)(++11$++#)) @ I //0A0ABI%1r9   )rq   rz   r&   r}   r{   )rB   rC   rD   rE   r   r2   staticmethodr   float32Tensorr   r   rK   rL   rM   s   @r7   rp   rp   /  sb    j0 j !$'%u}}e	e e ELL U\\  r9   rp   c                       \ rS rSrSrg)Aimv2TextEmbeddingsic  r0   Nri   r0   r9   r7   r   r   c  rj   r9   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )Aimv2Attentionig  c                   > [         TU ]  U5        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l        [        R                  " U R                  U R                  UR
                  S9U l	        g )Nbias)
r1   r2   r   Linearr   r)   k_projv_projq_projout_projr4   rq   r6   s     r7   r2   Aimv2Attention.__init__h  s     iiV__UiiV__UiiV__U		$..$..vWr9   )r   r   r   r   )rB   rC   rD   rE   r2   rK   rL   rM   s   @r7   r   r   g  s    X Xr9   r   c            	          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\\R                     S\	\
   S\R                  4S jjrS	rU =r$ )Aimv2EncoderLayerip  rq   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        UR                  UR                  5      U l	        [        UR                  UR                  5      U l
        g N)r1   r2   r   	attentionrn   ffnrl   r    r'   	rms_norm1	rms_norm2r   s     r7   r2   Aimv2EncoderLayer.__init__q  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr9   r   attention_maskr5   r   c                     U R                  U5      nU R                  " SXBS.UD6u  pVX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r0   )r   r   r   r   )r4   r   r   r5   norm_hidden_statesattn_outputr   
mlp_outputs           r7   r   Aimv2EncoderLayer.forwardx  sa     "^^M:r6Hrkqr%3!^^M:XX01
%2r9   )r   r   r   r   r   )rB   rC   rD   rE   r   r2   r   r   r   r   r   r   rK   rL   rM   s   @r7   r   r   p  s^    O0 O 26|| !. +,	
 
 r9   r   c                       \ rS rSrSrg)Aimv2Encoderi  r0   Nri   r0   r9   r7   r   r     rj   r9   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Aimv2AttentionPoolingHeadi  rq   c                   > [         TU ]  5         UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " [        R                  " SSU R                  5      5      U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   rv   T)r1   r2   r    r#   	num_headsr   r   r)   r   r   	Parameterr   zeros	cls_tokenoutput_projr   s     r7   r2   "Aimv2AttentionPoolingHead.__init__  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr9   r   r   c                    UR                   u  p#nU R                  R                  USS5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nU R                  U5      R	                  X#U R
                  X@R
                  -  5      nUR	                  USU R
                  X@R
                  -  5      nUR                  SSSS5      nUR                  SSSS5      nUR                  SSSS5      n[        R                  " XU5      n	U	R                  SS5      R	                  USU5      n	U	R                  SS9n	U R                  U	5      n
U
$ )Nrw   rv   r   r   r   r   )shaper   r   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )r4   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              r7   r   !Aimv2AttentionPoolingHead.forward  s8   *7*=*='
ZNN))*b"=	kk-(00dnnV`drdrVrsM*22:XbftftXtu!!*a~~A]^kk!Q1%aAq)aAq)44UG!++Aq199*aT!&&1&-!!+.r9   )r   r    r   r   r   r   )rB   rC   rD   rE   r   r2   r   r   r   rK   rL   rM   s   @r7   r   r     s2    	T0 	TU\\ ell  r9   r   c                   T   ^  \ rS rSr% Sr\\S'   SrSr/ SQr	Sr
SrSrU 4S jrSrU =r$ )	Aimv2PreTrainedModeli  z
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models. The model is only intended for inference and doesn't support finetuning.
rq   aimv2T)r   r   rp   r   c                   > [         TU ]  U5        [        US5      (       ad  [        UR                  [
        R                  5      (       a:  UR                  R                  R                  [        R                  " S5      5        g g [        U[        5      (       a9  UR                  R                  R                  SU R                  R                  S9  g g )Nlogit_scaleg$I$I,@r?   )r   std)r1   _init_weightshasattr
isinstancer   r   r   datafill_mathlogr   r   normal_rq   r,   )r4   moduler6   s     r7   r   "Aimv2PreTrainedModel._init_weights  s    f%6=))&,,bll;;""''--dhhx.@A < 9::!!))s8U8U)V ;r9   r0   )rB   rC   rD   rE   rF   r]   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr   rK   rL   rM   s   @r7   r   r     sC    
 &*# NW Wr9   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc            
          ^  \ rS rSr% \\S'   Sr\\S.r	S\4U 4S jjr
S\R                  4S jr\" SS	S
9\" SS9\ SS\\R&                     S\\   S\4S jj5       5       5       rSrU =r$ )Aimv2VisionModeli  rq   r   r   
attentionsc                 >  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  (       a  [        U5      U l        U R                  5         g r   )r1   r2   rq   rp   
embeddingsr   encoderrl   r    r'   r{   r-   r   head	post_initr   s     r7   r2   Aimv2VisionModel.__init__  so     /7#F+$V%7%79L9LM==1&9DIr9   r   c                 .    U R                   R                  $ r   )r  rz   r4   s    r7   get_input_embeddings%Aimv2VisionModel.get_input_embeddings  s    ***r9   r   zv4.58.0)versionFtie_last_hidden_statesr5   c                     U R                  U5      nU R                  " SSU0UD6nUR                  nU R                  U5      nU R                  (       a  U R                  U5      OSn[        UUS9$ )ar  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Siglip2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```inputs_embedsNlast_hidden_statepooler_outputr0   )r  r  r  r{   r-   r  r	   )r4   r   r   r5   r   encoder_outputsr  r  s           r7   r   Aimv2VisionModel.forward  sx    : 5+/<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r9   )rq   r  r  r  r{   r-   r   )rB   rC   rD   rE   r   r   main_input_namer   r   _can_record_outputsr2   r   Moduler  r   r   r   r   r   r   r   r   r	   r   rK   rL   rM   s   @r7   r   r     s     $O*$
0 +bii + %y9u5 26)
 !.)
 +,	)

 
$)
  6 :)
r9   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            	          ^  \ rS rSrSr\\S.rS\4U 4S jjr	S\
R                  4S jrS r\" S	S
9\ SS\\R$                     S\\   S\4S jj5       5       rSrU =r$ )Aimv2TextModeli  	input_idsr   rq   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  5      U l
        UR                  U l        U R                  5         g r   )r1   r2   rq   r   r  r   r  rl   r    r'   r{   rT   r  r   s     r7   r2   Aimv2TextModel.__init__&  s_     -f5#F+$V%7%79L9LM"//r9   r   c                 .    U R                   R                  $ r   r  token_embeddingr  s    r7   r  #Aimv2TextModel.get_input_embeddings1  s    ...r9   c                 $    XR                   l        g r   r  )r4   r   s     r7   set_input_embeddings#Aimv2TextModel.set_input_embeddings4  s    */'r9   Fr  r   r5   c           	         U R                  U5      nUR                  u  pVn[        R                  " U[        R                  UR
                  S9nUR                  S5      R                  US5      n	Ub  [        U R                  UU	UUS S9nU R                  " S	UUS.UD6n
U
R                  nU R                  U5      nU[        R                  " UR                  S   UR
                  S9UR                  [        R                  UR
                  S9U R                  :H  R                  5       R!                  SS94   n[#        UUS9$ )
Nr   r   rw   )rq   input_embedsru   r   cache_positionpast_key_values)r  r   )r   r   r  r0   )r  r   r   r   longr   	unsqueezer   r   rq   r  r  r{   torG   rT   argmaxr	   )r4   r  r   r5   r   r   r   r   r%  ru   r  r  pooled_outputs                r7   r   Aimv2TextModel.forward7  sJ    	2!.!4!4
QgUZZH\H\]%//299*bI%/{{*)-- $N ,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r9   )rq   r  r  rT   r{   r   )rB   rC   rD   rE   r  r   r   r  rO   r2   r   r  r  r!  r   r   r   r   r   r   r   r	   r   rK   rL   rM   s   @r7   r  r    s     "O +$
	 	/bii /0 u5 26'
 !.'
 +,	'

 
$'
  6'
r9   r  c                       \ rS rSrSrS\4S jr\\   SS\	\
R                     S\	\
R                     S\	\
R                     S	\\   S
\4
S jj5       5       rSrg)
Aimv2Modelic  Trq   c                    [         R                  " X5        UR                  U l        UR                  R                  U l        UR                  R                  U l        [        R                  UR                  5      U l
        [        R                  UR                  5      U l        [        R                  " U R
                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R"                  " [$        R&                  " U R(                  R*                  5      5      U l        [.        R0                  " UR2                  5      U l        U R7                  5         g )NFr   )r
   r2   r`   re   r    vision_embed_dimrd   text_embed_dimr   _from_configvision_modelr  
text_modelr   r   visual_projectiontext_projectionr   r   tensorrq   ra   r   r   r   rb   max_log_logit_scaler  )r4   rq   s     r7   r2   Aimv2Model.__init__g  s      .$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r9   Nr  r   r   r5   r   c           	          U R                   " SSU0UD6nU R                  " SUUS.UD6nUR                  nU R                  U5      nUR                  nU R	                  U5      nU[        U5      -  nU[        U5      -  nU R                  R                  SU R                  5      R                  5       R                  UR                  5      n	X-  UR                  5       -  n
U
R                  5       n[        UU
UUUUS9$ )a  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```r   )r  r   r?   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr0   )r3  r4  r  r5  r6  r   r   clampr8  expr)  r   trh   )r4   r  r   r   r5   vision_outputstext_outputsr>  r=  r   r<  r;  s               r7   r   Aimv2Model.forwardy  s   > 6:5F5F 6
%6
6

 48?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r9   )	r   r8  r`   r1  r4  r6  r0  r3  r5  )NNN)rB   rC   rD   rE   r   r]   r2   r   r   r   r   
LongTensorFloatTensorr   r   r   rh   r   rK   r0   r9   r7   r.  r.  c  s    { $  154815	=
E,,-=
 u001=
 !.	=

 +,=
 
=
  =
r9   r.  )r]   r   rO   r   r.  r   r  )=rF   r   typingr   r   torch.nn.functionalr   
functionalr   masking_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r   rO   r]   rh   rl   rn   r  rp   r   r   r   r   r   r   r   r  r.  __all__r0   r9   r7   <module>rY     s    ,       / 9 K - & 
 1 / P P 9 \ \ Q Qa * a HX & X v6$, 6$r	, 		< 		x 	1BII 1h	, 	X_ X2 2	= 			 D W? W W8 
E
+ E

E
P 
B
) B

B
J T
 T
 T
nr9   