
    cCi                        S r SSKrSSKJr  SSKJrJrJr  SSK	r
SSKrSSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJr  SSKJ r J!r!J"r"  \RF                  " \$5      r%S\RL                  S\RL                  4S jr'S\RL                  S\RL                  4S jr(S\RL                  S\)4S jr*SDS\RL                  S\+S\,S\)S\RL                  4
S jjr-SES jr.S r/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S S \R`                  5      r3\\ " S! S"\5      5       5       r4 " S# S$\R`                  5      r5 " S% S&\R`                  5      r6 " S' S(\R`                  5      r7 " S) S*\R`                  5      r8 " S+ S,\R`                  5      r9 " S- S.\95      r: " S/ S0\R`                  5      r; " S1 S2\5      r<\ " S3 S4\5      5       r= " S5 S6\R`                  5      r> " S7 S8\R`                  5      r? " S9 S:\R`                  5      r@ " S; S<\=5      rA " S= S>\R`                  5      rB " S? S@\=5      rC\ " SA SB\=5      5       rD/ SCQrEg)FzPyTorch GroupViT model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputauto_docstringfilter_out_non_signature_kwargslogging	torch_int   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_lossr$   '   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)r$   t)r&   caption_loss
image_losss      r#   groupvit_lossr+   ,   s*    #J/L!*,,.1J%,,r%   dimc                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NTkeepdimr   memory_format      ?)softmaxmaxr    
zeros_likelegacy_contiguous_formatscatter_detach)r   r,   y_softindexy_hardrets         r#   hard_softmaxr=   2   sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJr%   tauhardc           	      ,   [         R                  R                  R                  [         R                  " SU R
                  U R                  S9[         R                  " SU R
                  U R                  S95      nUR                  U R                  5      nX-   U-  nUR                  U5      nU(       a]  UR                  USS9S   n[         R                  " U [         R                  S9R                  X7S5      nXR                  5       -
  U-   n	U	$ Un	U	$ )N        )r   dtyper2   Tr.   r   r0   )r    distributionsgumbelGumbeltensorr   rB   sampleshaper3   r4   r5   r6   r7   r8   )
r   r>   r?   r,   gumbel_distgumbelsr9   r:   r;   r<   s
             r#   gumbel_softmaxrK   <   s    %%,,33SfllCSfllCK   .G3&G__S!F

3
-a0!!&8V8VW``admpq}}&/ J Jr%   c                    X-  U R                   S   -  S-  nX:  a4  [        [        R                  " X$-  5      5      nU R                   S   U-  nO3[        [        R                  " X-  5      5      nU R                   S   U-  nU R                   S   nU R                   S   nU R	                  XxXe5      n [
        R                  R                  XU4SUS9n U $ )a  
Args:
    attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
    height (`int`): height of the output attention map
    width (`int`): width of the output attention map
    align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

Returns:
    `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
         ?r   r   bilinearsizemodealign_corners)rH   intnproundreshaper   r   interpolate)	
attentionsheightwidthrS   scale
feat_widthfeat_height
batch_sizegroupss	            r#   resize_attention_mapra   R   s     ^z//22s:E~%-01
 &&q)Z7"((6>23%%a(K7
!!!$Ja F##JPJ**%z + J r%   c           	      V   / n[         R                  " 5          SnU  Hj  nUR                  SSS5      R                  5       nUc  UnOX4-  n[	        UR                  SSS5      R                  5       /UQ76 nUR                  U5        Ml     SSS5        US   nU$ ! , (       d  f       N= f)a  
Args:
    attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
    hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
    `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
Nr   rM   r   )r    no_gradpermute
contiguousra   append)rY   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupings          r#   get_grouping_from_attentionsrn   p   s     I	$J#++Aq!4??AJ&","1">/0G0G1a0P0[0[0]i`hiL\* % 
 r]N! 
s   A3B
B(c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )GroupViTCrossAttentionLayer   configc                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        g Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfrr   	__class__s     r#   rx   $GroupViTCrossAttentionLayer.__init__   sb    %f-	\\&"4"4&:O:OP
v&f&8&8f>S>STr%   c                     UnX0R                  XS9S   -   nX0R                  U R                  U5      5      -   nU R                  U5      nU$ )N)encoder_hidden_statesr   rz   r   r~   r   )r   querykeyxs       r#   forward#GroupViTCrossAttentionLayer.forward   sK    		%	;A>>A''NN1r%   r   )	__name__
__module____qualname____firstlineno__r   rx   r   __static_attributes____classcell__r   s   @r#   rp   rp      s    U3 U r%   rp   c                   @   ^  \ rS rSrS\4U 4S jjrSS jrS rSrU =r	$ )GroupViTAssignAttention   rr   c                   > [         TU ]  5         UR                  S-  U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l	        UR                  U l
        g )N      )rw   rx   r|   r\   r   Linearq_projk_projv_projproj
assign_epsr   s     r#   rx    GroupViTAssignAttention.__init__   s    ''-
ii 2 2F4F4FGii 2 2F4F4FGii 2 2F4F4FGIIf00&2D2DE	 ++r%   c                     U(       a  U R                   (       a  [        USUS9nU$ U(       a  [        USS9nU$ [        R                  R                  USS9nU$ )N)r,   r?   r,   )trainingrK   r=   r   r   r3   )r   rz   rD   r?   s       r#   get_attn GroupViTAssignAttention.get_attn   sX    dmm!$BT:D  #Db1  }},,Tr,:r%   c                 `   UnU R                  U5      nU R                  U5      nU R                  U5      nXR                  SS5      -  U R                  -  nU R                  U5      nU R                  USSS9nXUR                  SSS9U R                  -   -  nXS-  nU R                  U5      nXv4$ )Nr   rc   F)rD   r?   Tr,   r/   )	r   r   r   	transposer\   r   sumr   r   )r   r   r   valueraw_attnrz   	soft_attnouts           r#   r   GroupViTAssignAttention.forward   s    E" kk# E" MM"b11TZZ?}}X&MM(5uME	xxBx5GHliin~r%   )r   r   r   r   r\   r   )TT)
r   r   r   r   r   rx   r   r   r   r   r   s   @r#   r   r      s    ,3 ,	 r%   r   c                   <   ^  \ rS rSrS\4U 4S jjrS rS rSrU =r	$ )GroupViTTokenAssign   rr   c                 V  > [         TU ]  5         X0l        [        R                  " UR
                  UR                  S9U l        [        UR                  [        R                  R                  5      (       a  UR                  OUR                  UR                  4nU Vs/ s H  n[        XQR
                  -  5      PM     snu  pg[        XXc5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR
                  UR                  S9U l        [%        U5      U l        [)        U5      U l        [        R                  " UR
                  UR                  S9U l        [/        XR
                  XqR
                  5      U l        g s  snf rt   )rw   rx   num_output_groupr   r{   r|   r}   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterablerT   GroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrp   pre_assign_attnr   assign
norm_new_xr   mlp_channels)	r   rr   num_group_tokenr   r   r   
tokens_dimchannels_dimr   s	           r#   rx   GroupViTTokenAssign.__init__   sD    0<<(:(:@U@UV &11;??3K3KLL ##))6+B+BC 	
 JZ#ZIYAC,>,>(>$?IY#Z 
)&:` "V-?-?VEZEZ [ll6#5#56;P;PQ:6B-f5,,v'9'9v?T?TU'0B0BLRdRde $[s   !F&c                 J    U R                  U5      nU R                  U5      nU$ )z
Args:
    group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

Returns:
    projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
)r   r   )r   group_tokensprojected_group_tokenss      r#   project_group_token'GroupViTTokenAssign.project_group_token   s+     "&!=!%!6!67M!N%%r%   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  X15      nU R	                  X15      u  pEXC-  nX@R                  U R                  U5      5      -   nXE4$ )z
Args:
    image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
    group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
)r   r   r   r   r   r   r   )r   image_tokensr   r   new_image_tokens	attentions         r#   r   GroupViTTokenAssign.forward   s     ''5{{<0!%!9!9,!G!%!5!56L![&*kk2H&W#2+.?.?P`@a.bb**r%   )	r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   rx   r   r   r   r   r   s   @r#   r   r      s!    f3 f*&+ +r%   r   c                   :   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\	S
'   Sr\\	S'   S\\   4S jrSrg)GroupViTModelOutputi  a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
    Classification scores for each pixel.

    <Tip warning={true}>

    The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
    to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
    original image size as post-processing. You should always check your logits shape and resize as needed.

    </Tip>
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of
    [`GroupViTTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The image embeddings obtained by applying the projection layer to the pooled output of
    [`GroupViTVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`GroupViTTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`GroupViTVisionModel`].
Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r   r   N)getattrto_tuple).0kr   s     r#   	<genexpr>/GroupViTModelOutput.to_tuple.<locals>.<genexpr>0  s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysr   s   `r#   r   GroupViTModelOutput.to_tuple/  s#     
YY[
 
 	
r%    )r   r   r   r   __doc__r   r   r    FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r%   r#   r   r     s    > )-D(5$$
%,48hu001837OXe//077;%"3"34;/3K%++,304L(5,,-448186:3:
%* 
r%   r   c            	          ^  \ rS rSrSr    SS\S\\\\\4   4   S\S\4U 4S jjjrSS\	R                  S	\S
\	R                  4S jjrSrU =r$ )GroupViTPatchEmbeddingsi6  z
Image to Patch Embedding.

image_size
patch_sizenum_channels	embed_dimc                 `  > [         TU ]  5         [        U[        R                  R
                  5      (       a  UOX4n[        U[        R                  R
                  5      (       a  UOX"4nUS   US   -  US   US   -  -  nXl        X l        XPl        [        R                  " X4X"S9U l        g )Nr   r   )kernel_sizestride)rw   rx   r   r   r   r   r   r   num_patchesr   Conv2d
projection)r   r   r   r   r   r   r   s         r#   rx    GroupViTPatchEmbeddings.__init__;  s     	#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$&))Lgr%   pixel_valuesinterpolate_pos_encodingr   c                 >   UR                   u  p4pVU(       dV  XPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   r   zInput image size (*z) doesn't match model ().rM   )rH   r   
ValueErrorr   flattenr   )r   r   r   r_   r   rZ   r[   r   s           r#   r   GroupViTPatchEmbeddings.forwardL  s    2>2D2D/
&'++u8J/J (% 9+,Adooa.@-AE  OOL)11!4>>q!Dr%   )r   r   r   r   )      r   i   F)r   r   r   r   r   rT   r   r   rx   r    Tensorboolr   r   r   r   s   @r#   r   r   6  s     24hh #uS#X./h 	h
 h h"	ELL 	D 	]b]i]i 	 	r%   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )GroupViTVisionEmbeddingsiX  rr   c                   > [         TU ]  5         [        UR                  UR                  UR
                  UR                  S9U l        U R                  R                  n[        R                  " [        R                  " SX!R                  5      5      U l        [        R                  " UR                  5      U l        [        R                   " UR                  UR"                  S9U l        UR                  U l        Xl        g )N)r   r   r   r   r   ru   )rw   rx   r   r   r   r   r|   patch_embeddingsr   r   	Parameterr    zerosposition_embeddingsDropoutdropoutr{   r}   	layernormrr   )r   rr   r   r   s      r#   rx   !GroupViTVisionEmbeddings.__init__Y  s     7((((,,((	!
 ++77#%<<A{L^L^0_#` zz&..1f&8&8f>S>ST ++r%   
embeddingsrZ   r[   r   c                 ,   UR                   S   nU R                  R                   S   n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  nUR                   S   nX R
                  -  nX0R
                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   rc   rN   r   r   rM   bicubicFrP   )rH   r  r    jit
is_tracingr   r   rW   re   r   r   rX   view)r   r  rZ   r[   r   num_positionspatch_pos_embedr,   
new_height	new_widthsqrt_num_positionss              r#   r   1GroupViTVisionEmbeddings.interpolate_pos_encodingi  s    !&&q)0066q9 yy##%%+*F6?+++22r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr%   r   r   c                     UR                   u  p4pVU R                  XS9nU R                  U5      nUR                  5       u  p8n	U(       a  XpR	                  XuU5      -   nOXpR
                  -   nU R                  U5      nU$ )N)r   )rH   r  r  rQ   r   r  r  )
r   r   r   r_   r   rZ   r[   r  seq_len_s
             r#   r    GroupViTVisionEmbeddings.forward  s    2>2D2D/
&**<*k
^^J/
!+!2
Q $#&C&CJX]&^^J#&>&>>J\\*-
r%   )rr   r  r  r  r   r  r  )r   r   r   r   r   rx   r    r  rT   r   r  r   r   r   r   s   @r#   r  r  X  sh    3  $5<< $ $UX $]b]i]i $LELL D ]b]i]i  r%   r  c            	          ^  \ rS rSrS\4U 4S jjr   S
S\\R                     S\\R                     S\\R                     S\R                  4S jjrS	rU =r$ )GroupViTTextEmbeddingsi  rr   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nposition_ids)r   rc   F)
persistent)rw   rx   r|   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr    r!   expandr   rr   r   r   s      r#   rx   GroupViTTextEmbeddings.__init__  s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r%   	input_idsr#  inputs_embedsr   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nrc   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rH   r)  weightr   r#  r'  )r   r.  r#  r/  
seq_lengthmax_position_embeddingr  r  s           r#   r   GroupViTTextEmbeddings.forward  s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r%   )r)  r'  NNN)r   r   r   r   r   rx   r   r    
LongTensorr   r  r   r   r   r   s   @r#   r!  r!    sp    

1 

 153759	E,,- u//0   1 12	
 
 r%   r!  c            
       &  ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr\S	 5       r	S
 r
SS\R                  S\\R                     S\R                  4S jjr  SS\R                  S\\R                     S\\   S\\R"                     4S jjrSrU =r$ )GroupViTStagei  zMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rr   depthnum_prev_group_tokenr   r   c           	      j  > [         TU ]  5         X l        X@l        US:  a;  [        R
                  " [        R                  " SXAR                  5      5      U l	        OS U l	        [        R                  " [        U5       Vs/ s H  n[        U5      PM     sn5      U l        US:  a  [        UUUS9U l        OS U l        US:  ab  US:  a\  [        R                   " [        R"                  " UR                  UR$                  S9['        XUR                  S-  U5      5      U l        g S U l        g s  snf )Nr   r   )rr   r   r   ru   rM   )rw   rx   r9  r   r   r	  r    r
  r|   group_token
ModuleListrangeGroupViTEncoderLayerlayersr   
downsample
Sequentialr{   r}   r   group_projector)r   rr   r9  r:  r   r   r  r   s          r#   rx   GroupViTStage.__init__  s     	
.Q!||EKK?L^L^,_`D#Dmm5QV<$X<a%9&%A<$XYQ1 /!1DO #DO!#!(;#%==V//V5J5JK v?Q?QUV?VXgh$D 
 $(D # %Ys   D0c                     U R                   S L$ N)r<  r   s    r#   with_group_tokenGroupViTStage.with_group_token  s    t++r%   c                     U R                   (       a,  US S 2S U R                  * 24   US S 2U R                  * S 24   4$ US 4$ rF  )rG  r   )r   r   s     r#   split_xGroupViTStage.split_x  sN      Q/4/////0!A8L8L7L7N4N2OOOd7Nr%   r   r<  r   c                 8    Uc  U$ [         R                  " X/SS9$ )Nr   r   )r    cat)r   r   r<  s      r#   concat_xGroupViTStage.concat_x  s!    Hyy!)q11r%   hidden_statesprev_group_tokenoutput_attentionsc                    U R                   (       aM  U R                  R                  UR                  S5      SS5      nU R                  b  X@R	                  U5      -   nOSnUnU R                  XT5      nU R                   H  nU" USSS9nUS   nM     U R                  U5      u  pTSn	U R                  b  U R                  XT5      u  pYXT4n
U(       a  X4-   n
U
$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the grouping tensors of Grouping block.
r   rc   N)attention_maskcausal_attention_mask)	rG  r<  r+  rQ   rC  rN  r@  rJ  rA  )r   rP  rQ  rR  r<  r   cat_xlayer	layer_outr   outputss              r#   r   GroupViTStage.forward  s       **11-2D2DQ2GRPK##/),@,@AQ,RRKa-[[EeDPTUIaLE ! e,	??&??1:LA",Gr%   )r9  rA  rC  r<  r@  r   rF  NF)r   r   r   r   r   r   rT   rx   propertyrG  rJ  r    r  r   rN  r  r   r   r   r   r   r   s   @r#   r8  r8    s    W ($ (  ( "	 (
  (  (D , ,2%,, 2Xell5K 2W\WcWc 2 48,1	'||' #5<<0' $D>	'
 
u  	!' 'r%   r8  c            
          ^  \ rS rSr   SS\S\\   S\\   S\\   4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )r   i)  rr   r|   intermediate_sizeoutput_sizec                   > [         TU ]  5         Xl        [        UR                     U l        Ub  UOUR                  nUb  UOUR                  nUb  UOUn[        R                  " X#5      U l
        [        R                  " X45      U l        g rF  )rw   rx   rr   r	   
hidden_actactivation_fnr|   r^  r   r   fc1fc2)r   rr   r|   r^  r_  r   s        r#   rx   GroupViTMLP.__init__*  s|     	#F$5$56%0%<k&BTBT1B1N-TZTlTl%0%<k+99[<99.<r%   rP  r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rF  )rc  rb  rd  )r   rP  s     r#   r   GroupViTMLP.forward:  s4    /**=9/r%   )rb  rr   rc  rd  r5  )r   r   r   r   r   r   rT   rx   r    r  r   r   r   r   s   @r#   r   r   )  sj     &*+/%)=$= c]= $C=	=
 c]= = U\\ ell  r%   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )r   iA  c                 f   > [         TU ]  UR                  SS5      5      nUR                  SS5      $ Nr   rM   )rw   r   r   )r   r   r   s     r#   r   GroupViTMixerMLP.forwardB  s-    GOAKK1-.{{1a  r%   r   )r   r   r   r   r   r   r   r   s   @r#   r   r   A  s    ! !r%   r   c                   F  ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	    SS\R                  S	\
\R                     S
\
\R                     S\
\R                     S\
\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )ry   iG  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )rw   rx   rr   r|   r   num_attention_heads	num_headshead_dimr   r\   attention_dropoutr  r   r   r   r   r   out_projr   s     r#   rx   GroupViTAttention.__init__J  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar%   rF   r  bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ rj  )r  ro  rp  r   rf   )r   rF   r  rt  s       r#   _shapeGroupViTAttention._shape]  s5    {{3GQQRSUVWbbddr%   rP  rT  rU  r   rR  r   c                    UR                  5       u  pgnUSLn	U R                  U5      U R                  -  n
U	(       aE  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nODU R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nX`R                  -  SU R                  4nU R                  XU5      R                  " U6 n
UR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       X`R                  -  X~4:w  a-  [        SX`R                  -  X~4 SUR                  5        35      eUbv  UR                  5       USX~4:w  a"  [        SUSX~4 SUR                  5        35      eUR                  X`R                  X~5      U-   nUR                  X`R                  -  X~5      nUbv  UR                  5       USX~4:w  a"  [        SUSX~4 SUR                  5        35      eUR                  X`R                  X~5      U-   nUR                  X`R                  -  X~5      n[        R                  R                  USS9nU(       a;  UR                  X`R                  X~5      nUR                  X`R                  -  X~5      nOSn[        R                  R!                  XR                   U R"                  S	9n[        R                  " UU5      nUR                  5       X`R                  -  XpR                  4:w  a5  [        S
X`R                  XpR                  4 SUR                  5        35      eUR                  X`R                  XpR                  5      nUR                  SS5      nUR%                  XgU5      nU R'                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNrc   r   rM   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rQ   r   r\   rv  r   r   ro  rp  r  r    bmmr   r   r   r   r3   r  r   rW   rr  )r   rP  rT  rU  r   rR  rt  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                      r#   r   GroupViTAttention.forward`  s    #0"4"4"6i2$> {{=1DJJ>T[[1F%GSQJ;;t{{3H'I2sSLT[[%?SIJ;;t{{='A2sKLNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(*  !,$))+Q/II 7a8R7S T-22457  (,,S..'SVkkL',,S>>-A7TL%""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK0111r%   )
rr   r  r   rp  r   ro  rr  r   r\   r   )NNNF)r   r   r   r   r   rx   r    r  rT   rv  r   r   r  r   r   r   r   r   s   @r#   ry   ry   G  s    GB&eU\\ eC ec e 268<=A,1R2||R2 !.R2  (5	R2
  ((9(9:R2 $D>R2 
u||Xell3XeELL>Q5RR	SR2 R2r%   ry   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )r?  i  rr   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g rt   )rw   rx   r|   r   ry   	self_attnr   r{   r}   layer_norm1r   r   layer_norm2r   s     r#   rx   GroupViTEncoderLayer.__init__  sm    ++*62<<F<Q<QRv&<<F<Q<QRr%   rP  rT  rU  rR  r   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rP  rT  rU  rR  )r  r  r  r   )r   rP  rT  rU  rR  residualr  rY  s           r#   r   GroupViTEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr%   )r   r  r  r   r  r  )r   r   r   r   r   rx   r    r  r   r  r   r   r   r   r   r   s   @r#   r?  r?    sk    S~ S -2&||& &  %||	&
 $D>& 
u  	!& &r%   r?  c                   .    \ rS rSr% \\S'   SrSrS rSr	g)GroupViTPreTrainedModeli  rr   groupvitTc                 b   U R                   R                  n[        U[        R                  [        R
                  45      (       aV  UR                  R                  R                  SUS9  UR                  b$  UR                  R                  R                  5         Oh[        U[        R                  5      (       aI  UR                  R                  R                  5         UR                  R                  R                  S5        U R                   R                  n[        U[        5      (       ac  UR                  R                  R                  R                  SUS-  S9  UR                   R                  R                  R                  SUS-  S9  g[        U["        5      (       Ga   U R                   R                  nUR$                  S-  SUR                   R&                  -  S-  -  U-  nUR$                  S-  U-  n[        R(                  R                  UR*                  R                  US9  [        R(                  R                  UR,                  R                  US9  [        R(                  R                  UR.                  R                  US9  [        R(                  R                  UR0                  R                  US9  g[        U[2        5      (       a  U R                   R                  nUR                   R4                  S-  SUR                   R&                  -  S-  -  U-  nSUR                   R4                  -  S-  U-  n[        R(                  R                  UR6                  R                  US9  [        R(                  R                  UR8                  R                  US9  gg)	zInitialize the weightsrA   )meanstdNr2   g{Gz?r   rM   )r  )rr   initializer_ranger   r   r   r   r1  datanormal_biaszero_r{   fill_initializer_factorr!  r'  r)  ry   r   num_hidden_layersinitr   r   r   rr  r   r|   rc  rd  )r   module
init_rangefactorin_proj_stdout_proj_stdfc_stds          r#   _init_weights%GroupViTPreTrainedModel._init_weights  s    [[22
fryy"))455 MM&&CZ&@{{&  &&(--KK""$MM$$S)//f455""))..66CVd]6S%%,,1199sQU9V 122[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE,,[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? -r%   r   N)
r   r   r   r   r   r   base_model_prefixsupports_gradient_checkpointingr  r   r   r%   r#   r  r    s    "&*#@r%   r  c                      ^  \ rS rSrS\SS4U 4S jjr   SS\R                  S\\	   S\\	   S	\\	   S\
\\4   4
S
 jjrSrU =r$ )GroupViTVisionEncoderi  rr   r   Nc                 j  > [         TU ]  5         Xl        [        R                  " [        [        UR                  5      5       Vs/ s HO  n[        UUR                  U   UR                  U   UR                  U   US:  a  UR                  US-
     OSS9PMQ     sn5      U l        SU l        g s  snf )Nr   r   )rr   r9  r   r   r:  F)rw   rx   rr   r   r=  r>  r"   depthsr8  num_group_tokensnum_output_groupsstagesgradient_checkpointing)r   rr   ir   s      r#   rx   GroupViTVisionEncoder.__init__  s    mm s6==12	 3A ! --*$*$;$;A$>%+%=%=a%@LMPQE)A)A!a%)HWX 3	
 ',#	s   AB0rP  output_hidden_statesrR  return_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOS nU(       a  SOS nS n[	        U R
                  5       H=  u  pU(       a  XQ4-   nU	" XU5      n
U
S   nU
S   nU(       d  M-  U
S   c  M5  XjS   4-   nM?     U(       a  XQ4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )Nr   r   r   rM   c              3   .   #    U  H  oc  M  Uv   M     g 7frF  r   r   vs     r#   r   0GroupViTVisionEncoder.forward.<locals>.<genexpr>D  s     g$Uq$U   	last_hidden_staterP  rY   )rr   rR  r  use_return_dict	enumerater  r   r   )r   rP  r  rR  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r#   r   GroupViTVisionEncoder.forward"  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"6BD/T!$++.HA#$58H$H!!-?PQM)!,M(+L  ]1%5%A -q1A0C C /   14D Dg]}$Uggg+Yf
 	
r%   )rr   r  r  r5  )r   r   r   r   r   rx   r    r  r   r  r   r   r   r   r   r   r   s   @r#   r  r    sv    ,3 , ,( 04,0&*%
||%
 'tn%
 $D>	%

 d^%
 
uo%	&%
 %
r%   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )GroupViTTextEncoderiJ  z
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
[`GroupViTEncoderLayer`].

Args:
    config: GroupViTTextConfig
rr   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r[  )
rw   rx   rr   r   r=  r>  r  r?  r@  r  )r   rr   r  r   s      r#   rx   GroupViTTextEncoder.__init__S  sT    mm5QWQiQiKj$kKja%9&%AKj$kl&+# %ls   A&rT  rU  rR  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       H0  u  pU(       a  Xy4-   nU" U	UUUS9nUS   n	U(       d  M(  XS   4-   nM2     U(       a  Xy4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr   )rR  r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frF  r   r  s     r#   r   .GroupViTTextEncoder.forward.<locals>.<genexpr>  s     e$Sq$Sr  r  )rr   rR  r  r  r  r@  r   r   )r   r/  rT  rU  rR  r  r  encoder_statesall_attentionsrP  idxencoder_layerr  s                r#   r   GroupViTTextEncoder.forwardY  s   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B)%"3	M *!,M  !/3C2E!E #9  +.>>Ne]N$Seee+Vd
 	
r%   )rr   r  r@  )NNNNN)r   r   r   r   r   r   rx   r   r    r  r  r   r   r   r   r   r   r   s   @r#   r  r  J  s    ,1 , 268<,0/3&*F
 !.F
  (5	F

 $D>F
 'tnF
 d^F
 
uo%	&F
 F
r%   r  c                      ^  \ rS rSrS\4U 4S jjr\      SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jj5       rSrU =r$ )GroupViTTextTransformeri  rr   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        UR                  U l        g rt   )rw   rx   rr   r|   r!  r  r  encoderr   r{   r}   final_layer_normeos_token_idr,  s      r#   rx    GroupViTTextTransformer.__init__  s]    &&	08*62 "Y<Q<Q R #//r%   r.  rT  r#  rR  r  r  r   c           	      0   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUR                  5       nUR                  SUS   5      nU R                  XS9n[        XxR                  UR                  S9n	Ub  [        X(R                  5      nU R                  UUU	UUUS9n
U
S   nU R                  U5      nU R                  S:X  ae  U[        R                   " UR"                  S   UR                  S9UR%                  [        R&                  UR                  S9R)                  SS	94   nOU[        R                   " UR"                  S   UR                  S9UR%                  [        R&                  UR                  S9U R                  :H  R'                  5       R)                  SS	94   nU(       d	  X4U
S
S  -   $ [+        UUU
R,                  U
R.                  S9$ )NzYou have to specify input_idsrc   )r.  r#  r   )r/  rT  rU  rR  r  r  r   rM   )rB   r   r   r   r  pooler_outputrP  rY   )rr   rR  r  r  r   rQ   r  r  r
   rB   r   r   r  r  r  r    r!   rH   torT   argmaxr   rP  rY   )r   r.  rT  r#  rR  r  r  input_shaperP  rU  encoder_outputsr  pooled_outputs                r#   r   GroupViTTextTransformer.forward  s&    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	)W !A,,]5I5I!

 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %58KKK)/')77&11	
 	
r%   )rr   r  r  r  r  NNNNNN)r   r   r   r   r   rx   r   r   r    r  r  r   r   r   r   r   r   r   s   @r#   r  r    s    	01 	0  -115/3,0/3&*L
ELL)L
 !.L
 u||,	L

 $D>L
 'tnL
 d^L
 
u00	1L
 L
r%   r  c                     ^  \ rS rSr% \\S'   S\4U 4S jjrS\R                  4S jr	S r
\      SS\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )GroupViTTextModeli  rr   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rF  )rw   rx   r  
text_model	post_initr   s     r#   rx   GroupViTTextModel.__init__  s&     1&9r%   r   c                 B    U R                   R                  R                  $ rF  r  r  r'  r   s    r#   get_input_embeddings&GroupViTTextModel.get_input_embeddings  s    ))999r%   c                 8    XR                   R                  l        g rF  r  )r   r   s     r#   set_input_embeddings&GroupViTTextModel.set_input_embeddings
  s    5:""2r%   r.  rT  r#  rR  r  r  c           	      *    U R                  UUUUUUS9$ )a  
Examples:

```python
>>> from transformers import CLIPTokenizer, GroupViTTextModel

>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```r.  rT  r#  rR  r  r  r  )r   r.  rT  r#  rR  r  r  s          r#   r   GroupViTTextModel.forward  s,    2 )%/!5#  
 	
r%   r  r  )r   r   r   r   r   r   rx   r   Moduler  r  r   r   r    r  r  r   r   r   r   r   r   r   s   @r#   r  r    s    1 :bii :;  -115/3,0/3&*
ELL)
 !.
 u||,	

 $D>
 'tn
 d^
 
u00	1
 
r%   r  c                      ^  \ rS rSrS\4U 4S jjr\    SS\\R                     S\\
   S\\
   S\\
   S\\\4   4
S	 jj5       rS
rU =r$ )GroupViTVisionTransformeri0  rr   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        g rt   )rw   rx   rr   r|   r  r  r  r  r   r{   r}   r  r,  s      r#   rx   "GroupViTVisionTransformer.__init__1  sL    &&	26:,V4i5J5JKr%   r   r  rR  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nUR                  SS9nU(       d	  Xx4USS  -   $ [        UUUR                  UR                  S9$ )Nz You have to specify pixel_values)rP  r  rR  r  r   r   r   r  )rr   rR  r  r  r   r  r  r  r  r   rP  rY   )	r   r   r  rR  r  rP  r  r  r  s	            r#   r   !GroupViTVisionTransformer.forward:  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@5,,'!5/#	 ' 
 ,A. !NN+<=)..1.5%58KKK)/')77&11	
 	
r%   )rr   r  r  r  NNNN)r   r   r   r   r   rx   r   r   r    r   r  r   r   r   r   r   r   r   s   @r#   r  r  0  s    L3 L  59/3,0&*'
u001'
 'tn'
 $D>	'

 d^'
 
u00	1'
 '
r%   r  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\4S jr	\
    SS\\R                     S\\   S\\   S	\\   S\\\4   4
S
 jj5       rSrU =r$ )GroupViTVisionModelie  rr   r   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rF  )rw   rx   r  vision_modelr  r   s     r#   rx   GroupViTVisionModel.__init__i  s'     5f=r%   r   c                 B    U R                   R                  R                  $ rF  )r  r  r  r   s    r#   r  (GroupViTVisionModel.get_input_embeddingso  s      ++<<<r%   rR  r  r  c                 &    U R                  UUUUS9$ )ah  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTVisionModel

>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r   rR  r  r  r  )r   r   rR  r  r  s        r#   r   GroupViTVisionModel.forwardr  s(    8   %/!5#	 ! 
 	
r%   r  r  )r   r   r   r   r   r   main_input_namerx   r   r  r   r   r    r   r  r   r   r   r   r   r   r   s   @r#   r  r  e  s      $O3 =&= =  59,0/3&* 
u001 
 $D> 
 'tn	 

 d^ 
 
u00	1 
  
r%   r  c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\" 5       \  SS\	R                  S\\	R                     S\\	R                     S\	R                  4S jj5       5       r\" 5       \S	\	R                  S\	R                  4S
 j5       5       r\         SS\\	R                     S	\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )GroupViTModeli  rr   c                 >  > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l
        UR                  U l        UR                  U l        [        U5      U l        [!        U5      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R2                  " [4        R6                  " U R8                  R:                  5      5      U l        U R?                  5         g )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)r  )inplace) rw   rx   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimr|   text_embed_dimvision_embed_dimr  r  r  r  r   rB  r   BatchNorm1dReLUvisual_projectiontext_projectionr	  r    rF   rr   logit_scale_init_valuelogit_scaler  )r   rr   r  r  r   s       r#   rx   GroupViTModel.__init__  s    &,,.@AA++,-Q0 
 &..0DEE--./q2 
 ((,,$33+1+M+M()55 - 9 91+>5mD!#IId++T-M-MTXYNN4;;<GGD!IId668K8KRVW	"
  "}}IId))4+K+KRVWNN4;;<GGD!IId668K8KRVW	 
 <<T[[5W5W(XY 	r%   r.  rT  r#  r   c                 ^    U R                  UUUS9nU R                  UR                  5      nU$ )a|  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`GroupViTTextModel`].

Examples:

```python
>>> import torch
>>> from transformers import CLIPTokenizer, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```)r.  rT  r#  )r  r  r  )r   r.  rT  r#  text_outputstext_featuress         r#   get_text_featuresGroupViTModel.get_text_features  s?    4 48??)% 4C 4

 ,,\-G-GHr%   r   c                 ^    U R                  U5      nU R                  UR                  5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`GroupViTVisionModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, GroupViTModel
>>> from transformers.image_utils import load_image

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")

>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```)r  r  r  )r   r   vision_outputsimage_featuress       r#   get_image_features GroupViTModel.get_image_features  s0    4 6:5F5F|5T//0L0LMr%   return_lossrR  r  output_segmentationr  c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SnUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  UUUU	S9n
U R                  UUUUUU	S9nU
S   nU R                  U5      nUS   nU R                  U5      nXR                  SSS9-  nXR                  SSS9-  nU R                  R                  5       n[        R                  " XR                  5       5      U-  nUR                  5       nSnU(       Gaf  U
S   nU R                  UR                  SUR                   S   5      5      nU(       a  U
S	   nOU
S
   n[#        UUR                   S
S 5      nUUR                  SSS9-  n[        R                  " UUR                  5       5      U-  nUR                  UR                   S   SUR                   S   5      R%                  SS
S5      nUR                  UR                   S   UR                   S   S5      n[        R                  " UU5      U-  nUR                  UR                   S   UR                   S   UR                   S
   UR                   S	   5      nSnU(       a  ['        U5      nU	(       d  Ub
  UUUUUUU
4nOUXXU
4nUb  U4U-   $ U$ [)        UUUUUUUU
S9$ )a  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.
output_segmentation (`bool`, *optional*):
    Whether or not to return the segmentation logits.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NTr  r  r   rc   r   r   r   rM   )r   r   r   r   r   r   r   r   )rr   rR  r%  r  r  r  r  r  r  normr  expr    matmulr(   rW   rH   rn   re   r+   r   )r   r.  r   rT  r#  r$  rR  r  r%  r  r   r  r   r   r  r   r   
seg_logitsimage_group_embedsrY   groupinglogits_per_image_groupflatten_groupingr   outputs                            r#   r   GroupViTModel.forward  sc   R 2C1N-TXT_T_TqTq#6#BHgHg 	  $$8$D $++JjJj 	 &1%<k$++B]B]**%/!5#	 + 
 )%/!5# ' 
 &a(--l;"1o**;7 $&7&7B&7&MM!$4$4T$4$JJ &&**,,,{NN4DES*,,.
 "0!2!%!7!78J8R8RSUWiWoWoprWs8t!u#+A.
+A.
3J@R@RSTSU@VWH "46H6M6MRT^b6M6c!c%*\\2Dkmmo%VYd%d"%;%C%C""1%r;+<+<Q+?&gaA #
  (//q0A8>>RSCTVXY &<>NOR]]J#++  #Z%5%5a%8(..:KX^^\]M^J  1D%$#  " +O,ftu)-)9TGf$EvE"-+ *#%* .	
 		
r%   )	r  r  r  r  r  r  r  r  r  )NN)	NNNNNNNNN)r   r   r   r   r   r   rx   r   r   r    r  r   r   r  r"  r6  r  r   r   r   r   r   r   r   s   @r#   r  r    s   )~ )V %& 26/3	<< !. u||,	
 
		  '@ %&u|| @Q@Q   '8  15481537&*,0/3.2&*N
E,,-N
 u001N
 !.	N

 u//0N
 d^N
 $D>N
 'tnN
 &d^N
 d^N
 
u))	*N
 N
r%   r  )r  r  r  r  )r   Frc   r  )Fr   collections.abcr   dataclassesr   typingr   r   r   numpyrU   r    r   activationsr	   modeling_attn_mask_utilsr
   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   configuration_groupvitr   r   r   
get_loggerr   loggerr  r$   r+   rT   r=   floatr  rK   ra   rn   r  rp   r   r   r   r   r  r!  r8  r   r   ry   r?  r  r  r  r  r  r  r  r  __all__r   r%   r#   <module>r@     s`     ! ' '    ! d 9 K - e e \ \ 
		H	%
`U\\ `ell `
-ell -u|| - C 5<< e t RU _d_k_k ,<:"))  -bii -`4+")) 4+n -
+ -
  -
`bii DGryy GV%RYY %P[BII [|")) 0!{ !k2		 k2^/5 /d $@o $@ $@N7
BII 7
tU
")) U
pY
bii Y
x/
/ /
d2
		 2
j.
1 .
b }
+ }
 }
@ cr%   