
    bCiL                       S r SSKrSSKJr  SSKJr  SSKJrJr  SSK	r	SSK	J
r
  SSKJr  SS	KJrJr  SS
KJrJrJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+  \$RX                  " \-5      r.Sr/\\#" SS9 " S S\5      5       5       r0\\#" SS9 " S S\5      5       5       r1 " S S\
Rd                  5      r3 " S S\
Rd                  5      r4 " S S \
Rd                  5      r5 " S! S"\
Rd                  5      r6 " S# S$\
Rd                  5      r7 " S% S&\
Rd                  5      r8 " S' S(\
Rd                  5      r9 " S) S*\
Rd                  5      r: " S+ S,\
Rd                  5      r; " S- S.\
Rd                  5      r<S/\<0r= " S0 S1\
Rd                  5      r> " S2 S3\
Rd                  5      r? " S4 S5\5      r@ " S6 S7\
Rd                  5      rA " S8 S9\
Rd                  5      rBSWS: jrC\# " S; S<\5      5       rD " S= S>\D5      rE\#" S?S9 " S@ SA\D5      5       rF\#" SBS9 " SC SD\D5      5       rG " SE SF\
Rd                  5      rH " SG SH\
Rd                  5      rI " SI SJ\
Rd                  5      rJ\#" SKS9 " SL SM\D5      5       rK\#" SNS9 " SO SP\D5      5       rL " SQ SR\
Rd                  5      rM\#" SSS9 " ST SU\D5      5       rN/ SVQrOg)XzPyTorch BridgeTower Model    N)OrderedDict)	dataclass)OptionalUnion)nn)CrossEntropyLoss   )ACT2FNQuickGELUActivation)CacheDynamicCacheEncoderDecoderCache)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int)deprecate_kwarg   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerz.
    Output type of [`BridgeTowerModel`].
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
BridgeTowerModelOutput0   a  
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
    Sequence of hidden-states at the text output of the last layer of the model.
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
    Sequence of hidden-states at the image output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
    Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
    token), respectively, after further processing through layers used for auxiliary pretraining tasks.
Ntext_featuresimage_featurespooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   r(   r)   tupler*   __static_attributes__r+       n/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr$   r$   0   s|     26M8E--.526NHU../615M8E--.58<M8E%"3"345<59Ju00129r6   r$   z>
    Output type of ['BridgeTowerForContrastiveLearning']
    c                   P   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Srg)BridgeTowerContrastiveOutputH   a  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Image-text contrastive loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
Nlosslogitstext_embedsimage_embedscross_embedsr)   r*   r+   )r,   r-   r.   r/   r0   r;   r   r1   r2   r3   r<   r=   r4   r>   r?   r)   r*   r5   r+   r6   r7   r9   r9   H   s      )-D(5$$
%,*.FHU&&'.6:K% 1 123:7;L(5!2!234;7;L(5!2!234;8<M8E%"3"345<59Ju00129r6   r9   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSS\R                  S\\R                     4S jjr	Sr
U =r$ )	BridgeTowerResidualAttentionh   c                 h  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " [        S[        R                  " UR                  UR                  S-  5      4S[        5       4S[        R                  " UR                  S-  UR                  5      4/5      5      U l        [        R                  " UR                  UR                  S9U l        S U l        g )N@   epsc_fc   geluc_proj)super__init__r   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r7   rL   %BridgeTowerResidualAttention.__init__i   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r6   hidden_stateattention_maskc           	         Ub(  UR                  [        R                  UR                  S9nU R                  b.  U R                  R                  UR
                  UR                  S9OS U l        U R                  UUUSU R                  US9S   $ )NdtypedeviceF)need_weightsrW   key_padding_maskr   )tor1   boolrb   rW   ra   rO   )rY   r]   r^   s      r7   	attention&BridgeTowerResidualAttention.attentionz   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r6   c                     XR                  U R                  U5      U5      -   nU R                  U5      nU R                  R	                  5        H  nU" U5      nM     X1-   nU$ N)rg   rR   rV   rU   values)rY   r]   r^   residual_statelayers        r7   forward$BridgeTowerResidualAttention.forward   sZ    %tyy7NP^(__yy0XX__&E .L '%4r6   )rO   rW   rR   rV   rU   rj   )r,   r-   r.   r/   rL   r1   Tensorrg   r   rn   r5   __classcell__r[   s   @r7   rA   rA   h   sH    "ell ELL "ELL (5<<BX  r6   rA   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\\R                     4S jjrSr	U =r
$ )BridgeTowerTransformer   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aL  [
        R                  " [        U R                  S-
  5       Vs/ s H  n[        U5      PM     sn5      U l	        OH[
        R                  " [        U R                  5       Vs/ s H  n[        U5      PM     sn5      U l	        UR                  U l
        g s  snf s  snf )Nr   )rK   rL   rN   num_hidden_layersremove_last_layerr   
ModuleListrangerA   	resblocksstop_gradientrY   rZ   _r[   s      r7   rL   BridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a?`!-f5?`aDN  ]]?DTE[E[?\]?\!-f5?\]DN $11 b ^s   -C)6C.r]   r^   c                     / nU R                    HN  nU" X5      nU R                  (       a!  UR                  UR                  5       5        M=  UR                  U5        MP     U$ rj   )r{   r|   appenddetach)rY   r]   r^   r)   blocks        r7   rn   BridgeTowerTransformer.forward   sU    ^^E >L!!$$\%8%8%:;$$\2 $ r6   )rN   rw   r{   r|   rj   )r,   r-   r.   r/   rL   r1   rp   r   rn   r5   rq   rr   s   @r7   rt   rt      s-    2ELL (5<<BX  r6   rt   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )BridgeTowerVisionEmbeddings   rZ   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)rK   rL   rZ   rN   	embed_dim
image_size
patch_sizer   	Parameterr1   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrX   s     r7   rL   $BridgeTowerVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr6   
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr         ?r	   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer1   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolateviewcat)rY   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r7   interpolate_pos_encoding4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr6   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (z).ra   r   r   r   r   )r   r   
ValueErrorr   r   ra   re   flatten	transposer   r   r1   r   r   r   r   )rY   r   r   
batch_sizer~   r   r   target_dtypepatch_embedsclass_embedsr   s              r7   rn   #BridgeTowerVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr6   )	r   rZ   r   r   r   r   r   r   r   F)r,   r-   r.   r/   r    rL   r1   rp   intr   r2   rn   r5   rq   rr   s   @r7   r   r      sj    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r6   r   c                      ^  \ rS rSrU 4S jr S
S\R                  S\4S jjr S
S\R                  S\4S jjr	S\R                  4S jr
S	rU =r$ )BridgeTowerVisionTransformeri  c           
      6  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        UR                  U l        UR                  (       dg  [        R                  " [        UR                  5       Vs/ s H,  n[        R
                  " UR                  UR                  S9PM.     sn5      U l        g g s  snf NrE   )rK   rL   r   r   r   rP   rN   rQ   ln_prert   transformerln_postshare_layernormry   rz   rw   ln_separater}   s      r7   rL   %BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvVuQRf00f6K6KLVuv D &vs   3Dr   r   c                    U R                  X5      nU R                  U5      nUR                  SSS5      nU R                  XB5      n[        R
                  " USS9nUR                  SSSS5      nU R                  (       a  U R                  U5      nU$ / n[        X@R                  5       H  u  pFU" U5      nUR                  U5        M      [        R
                  " USS9nU$ )Nr   r   r   r   r	   )r   r   r   r   r1   stackr   r   zipr   r   )rY   r   r^   r   r)   hidden_states_stacklns          r7   rn   $BridgeTowerVisionTransformer.forward  s     OM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I! "= 1#**=9 &J "KK(;CMr6   c                 l    U R                  XS9nU R                  U5      nUR                  SSS5      nU$ )Nr   r   r   r   )r   r   r   )rY   r   r   r)   s       r7   forward_pre(BridgeTowerVisionTransformer.forward_pre+  s<    
 hM2%--aA6r6   r]   c                 N    UR                  SSS5      nU R                  U5      nU$ )Nr   r   r   )r   r   )rY   r]   visual_output_posts      r7   forward_post)BridgeTowerVisionTransformer.forward_post6  s-    )11!Q:!\\*<=!!r6   )r   r   r   r   r   r   r   )r,   r-   r.   r/   rL   r1   rp   rf   rn   r   r   r5   rq   rr   s   @r7   r   r     s]    " */	ll #'	< */	ll	 #'	" " "r6   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerLinkToweri<  c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  S;   a  UR                  S:X  a0  [        R
                  " [        R                  " S5      5      U l        O?UR                  S:X  a/  [        R
                  " [        R                  " S5      5      U l	        [        R                  " U R                  UR                  S9U l
        g [        SUR                   S35      e)	N)add
scaled_addr   r         ?r   r   rE   link_tower_type  is not implemented)rK   rL   link_tower_typerN   r   r   r1   tensorscaled_factorbetarP   rQ   NotImplementedErrorrX   s     r7   rL   BridgeTowerLinkTower.__init__=  s    %55!--!!%II%%5%'\\%,,s2C%D"''=8LLc):;	\\$*:*:@U@UVDN%(89O9O8PPc&deer6   c                 Z   U R                   S:X  a  U R                  X-   5      $ U R                   S:X  a   U R                  XR                  -  U-   5      $ U R                   S:X  a0  U R                  USU R                  -
  -  X R                  -  -   5      $ [	        SU R                    S35      e)Nr   r   r   r   r   r   )r   rP   r   r   r   )rY   r)   cross_modal_hidden_statesr^   s       r7   rn   BridgeTowerLinkTower.forwardJ  s    5(>>-"KLL!!\1>>-2D2D"DG`"`aa!!]2>>-1tyy="AD]`i`iDi"ijj%(89M9M8NNa&bccr6   )rP   r   rN   r   r   r,   r-   r.   r/   rL   rn   r5   rq   rr   s   @r7   r   r   <  s    fd dr6   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BridgeTowerSelfOutputiV  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r   )rK   rL   r   rT   rN   denserP   rQ   Dropouthidden_dropout_probdropoutrX   s     r7   rL   BridgeTowerSelfOutput.__init__W  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r6   r)   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rj   r   r  rP   rY   r)   r  s      r7   rn   BridgeTowerSelfOutput.forward]  5    

=1]3}'CDr6   rP   r   r  
r,   r-   r.   r/   rL   r1   rp   rn   r5   rq   rr   s   @r7   r   r   V  6    >U\\  RWR^R^  r6   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BridgeTowerIntermediateie  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rj   )rK   rL   r   rT   rN   intermediate_sizer   
isinstance
hidden_actstrr
   intermediate_act_fnrX   s     r7   rL    BridgeTowerIntermediate.__init__f  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r6   r)   r   c                 J    U R                  U5      nU R                  U5      nU$ rj   r   r  rY   r)   s     r7   rn   BridgeTowerIntermediate.forwardn  s&    

=100?r6   r  r
  rr   s   @r7   r  r  e  s(    9U\\ ell  r6   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BridgeTowerOutputiu  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rK   rL   r   rT   r  rN   r   rP   rQ   r   r   r  rX   s     r7   rL   BridgeTowerOutput.__init__v  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r6   r)   r  r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rj   r  r  s      r7   rn   BridgeTowerOutput.forward|  r  r6   r	  r
  rr   s   @r7   r  r  u  r  r6   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BridgeTowerPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rj   )rK   rL   r   rT   rN   r   Tanh
activationrX   s     r7   rL   BridgeTowerPooler.__init__  s9    YYv1163E3EF
'')r6   r)   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r#  )rY   r)   first_token_tensorpooled_outputs       r7   rn   BridgeTowerPooler.forward  s6     +1a40

#566r6   )r#  r   r
  rr   s   @r7   r   r     s(    $
U\\ ell  r6   r   c                     ^  \ rS rSrSU 4S jjr\" SSSS9      SS\R                  S\\R                     S	\\R                     S
\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rSrU =r$ )BridgeTowerSelfAttentioni  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        X0l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()position_embedding_typeabsoluterelative_keyrelative_key_queryr   r   )rK   rL   rN   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   rT   querykeyvaluer   attention_probs_dropout_probr  getattrr.  max_position_embeddingsr   distance_embedding
is_decoder	layer_idxrY   rZ   r.  r>  r[   s       r7   rL   !BridgeTowerSelfAttention.__init__  s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"r6   past_key_valuepast_key_values4.58new_nameversionr)   r^   	head_maskencoder_hidden_statesoutput_attentionscache_positionr   c                 	   UR                   u  pn
U R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nSnUS LnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nUR                  USU R                  U R                  5      R                  SS5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUbc  U(       d  UOS nWR%                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nU R*                  S:X  d  U R*                  S	:X  Ga  UR                   S   UR                   S   nnUbB  [&        R,                  " US-
  [&        R.                  UR0                  S
9R                  SS5      nO>[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      n[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      nUU-
  nU R5                  UU R6                  -   S-
  5      nUR9                  UR:                  S9nU R*                  S:X  a  [&        R<                  " SUU5      nUU-   nOHU R*                  S	:X  a8  [&        R<                  " SUU5      n[&        R<                  " SUU5      nUU-   U-   nU[>        R@                  " U R                  5      -  nUb  UU-   n[B        RD                  RG                  USS9nU RI                  U5      nUb  UU-  n[&        R(                  " UU5      nURK                  SSSS5      RM                  5       nURO                  5       S S U RP                  4-   nUR                  U5      nUU4$ )Nr   r   r   FrJ  Tr0  r1  r`   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   r   r	   ))r   r6  r   r2  r4  r   r  r   
is_updatedgetr>  cross_attention_cacheself_attention_cachelayerskeysrk   r7  r8  updater1   matmulr.  r   longrb   r   r<  r;  re   ra   einsummathsqrtr   r   softmaxr  r   
contiguousr   r5  )rY   r)   r^   rG  rH  rB  rI  rJ  r   
seq_lengthr~   query_layerrM  is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  r7   rn    BridgeTowerSelfAttention.forward  sa    %2$7$7!
jj/!&&z2t7O7OQUQiQijttq
 
2$>&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  *7It)<)C)C{DNN=M~<^*&	; &*_FY*Z*ZAEO..t~~> !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L*!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r6   )r5  r4  r<  r  r=  r7  r>  r;  r2  r.  r6  r8  NNNNNNFN)r,   r-   r.   r/   rL   r   r1   rp   r   r2   r   rf   r4   rn   r5   rq   rr   s   @r7   r*  r*    s    #6 %0A6R 7;15=A+/,115e.||e. !!2!23e. E--.	e.
  ((9(9:e. "%e. $D>e. !.e. 
u||	e. Se.r6   r*  eagerc                   $  ^  \ rS rSrSU 4S jjrS r\" SSSS9      SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\   S\	\   S\	\R                     S\\R                     4S jj5       rSrU =r$ )BridgeTowerAttentioni  c                    > [         TU ]  5         [        UR                     " UUUS9U l        [        U5      U l        [        5       U l        g )Nr.  r>  )	rK   rL   #BRIDGE_TOWER_SELF_ATTENTION_CLASSES_attn_implementationrY   r   outputsetpruned_headsr?  s       r7   rL   BridgeTowerAttention.__init__   sF    78S8ST$;
	
 ,F3Er6   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   rY   r2  r4  r{  r   r6  r7  r8  ry  r   r5  union)rY   headsindexs      r7   prune_heads BridgeTowerAttention.prune_heads*  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r6   rA  rB  rC  rD  r)   r^   rG  rH  rI  rJ  r   c           
      l    U R                  UUUUUUUS9nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr^   rG  rH  rB  rI  rJ  r   r   )rY   ry  )rY   r)   r^   rG  rH  rB  rI  rJ  self_outputsattention_outputoutputss              r7   rn   BridgeTowerAttention.forward<  s\     yy)"7+/) ! 
  ;;|AF#%QR(88r6   )ry  r{  rY   rp  rq  )r,   r-   r.   r/   rL   r  r   r1   rp   r   r2   r   rf   r4   rn   r5   rq   rr   s   @r7   rt  rt    s    ";$ %0A6R 7;15=A+/,115|| !!2!23 E--.	
  ((9(9: "% $D> !. 
u||	 Sr6   rt  c                   ^   ^  \ rS rSrS
U 4S jjr\" SSSS9      SS j5       rS rS	rU =r	$ )BridgeTowerBertCrossLayeriU  c                   > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        UR                  U l        UR                  U l        [	        XS9U l        [        U5      U l
        [        U5      U l        g )Nr   r>  )rK   rL   chunk_size_feed_forwardseq_len_dimrt  rg   r=  add_cross_attentioncrossattentionr  intermediater  ry  rY   rZ   r>  r[   s      r7   rL   "BridgeTowerBertCrossLayer.__init__V  sq    '-'E'E$-fJ ++#)#=#= 26O3F;'/r6   rA  rB  rC  rD  c	           
          U R                  UUS US S9n	U	S   n
U	SS  nU R                  U
UUUUUUS9nUS   n
XSS  -   n[        U R                  U R                  U R
                  U
5      nU4U-   nU$ )N)r^   rG  rI  rB  r   r   r  )rg   r  r   feed_forward_chunkr  r  )rY   r)   rH  r^   rG  encoder_attention_maskrB  rI  rJ  self_attention_outputsr  r  cross_attention_outputslayer_outputs                 r7   rn   !BridgeTowerBertCrossLayer.forwarda  s     "&)/  "0 "
 2!4 ),"&"5"51"7+/) #6 #
 315AB770##T%A%A4CSCSUe
  /G+r6   c                 J    U R                  U5      nU R                  X!5      nU$ rj   r  ry  rY   r  intermediate_outputr  s       r7   r  ,BridgeTowerBertCrossLayer.feed_forward_chunk  )    "//0@A{{#6Ir6   r  rg   r  r  r  r=  ry  r  rj   rq  )
r,   r-   r.   r/   rL   r   rn   r  r5   rq   rr   s   @r7   r  r  U  sG    	0 %0A6R
 #+ S+Z r6   r  c                   D  ^  \ rS rSrSU 4S jjr\" SSSS9       SS\R                  S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rS rSrU =r$ )BridgeTowerTextLayeri  c                 r  > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        USUS9U l	        [        U5      U l        [        U5      U l        g )Nr   r  z> should be used as a decoder model if cross attention is addedr/  rv  )rK   rL   r  r  rt  rg   r=  r  r   r  r  r  r  ry  r  s      r7   rL   BridgeTowerTextLayer.__init__  s    '-'E'E$-fJ ++#)#=#= ##?? D6)g!hii"6vWamv"wD3F;'/r6   rA  rB  rC  rD  r)   r^   rG  rH  r  rI  rJ  r   c	           
      z   U R                  UUUUUUS9n	U	S   n
U R                  (       a  U	SS nOU	SS  nU R                  (       aD  UbA  [        U S5      (       d  [        SU  S35      eU R	                  U
UUUUUUS9nUS   n
XSS -   n[        U R                  U R                  U R                  U
5      nU4U-   $ )	N)r^   rG  rI  rB  rJ  r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r  )	rg   r=  r3  r   r  r   r  r  r  )rY   r)   r^   rG  rH  r  rB  rI  rJ  r  r  r  r  r  s                 r7   rn   BridgeTowerTextLayer.forward  s    "&)/+) "0 "
 2!4 ??,Qr2G,QR0G??4@4!122 =dV DD D 
 '+&9&9 5#&; /"3- ': '#  7q9" ==G0##T%A%A4CSCSUe
 ((r6   c                 J    U R                  U5      nU R                  X!5      nU$ rj   r  r  s       r7   r  'BridgeTowerTextLayer.feed_forward_chunk  r  r6   r  rj   )NNNNNFN)r,   r-   r.   r/   rL   r   r1   rp   r   r2   r   rf   r4   rn   r  r5   rq   rr   s   @r7   r  r    s    0 %0A6R 7;15=A>B+/,1152)||2) !!2!232) E--.	2)
  ((9(9:2) !)):): ;2) "%2) $D>2) !.2) 
u||	2) S2)h r6   r  c                   V  ^  \ rS rSrSU 4S jjr          SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\
   S
\\
   S\\
   S\\
   S\\R                     S\\\R                     \4   4S jjrSrU =r$ )BridgeTowerTextEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l	        g s  snf )Nr  F)
rK   rL   rZ   r   ry   rz   rw   r  rm   gradient_checkpointing)rY   rZ   r>  ir[   s       r7   rL   BridgeTowerTextEncoder.__init__  sX    ]]@EfF^F^@_`@_1!&6@_`

 ',# as   A$r)   r^   rG  rH  r  rB  	use_cacherI  output_hidden_statesreturn_dictrJ  r   c                    U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       aL  U R                   R                  (       a1  Uc.  [        [        U R                   S9[        U R                   S95      nU(       a[  U R                   R                  (       a@  [        U[        5      (       a+  [        R                  S5        [        R                  " U5      n[        U R                  5       He  u  nnU	(       a  X4-   nUb  X?   OS nU" UUUUUUUUS9nUS   nU(       d  M6  UUS   4-   nU R                   R                  (       d  M\  UUS	   4-   nMg     U	(       a  X4-   nU
(       d  [        S
 UUUUU4 5       5      $ [        UUUUUS9$ )Nr+   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rZ   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r  rB  rI  rJ  r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frj   r+   .0vs     r7   	<genexpr>1BridgeTowerTextEncoder.forward.<locals>.<genexpr>(  s"      
A     	)last_hidden_staterB  r)   r*   cross_attentions)rZ   r  r  trainingloggerwarning_oncer=  r   r   r  r4   from_legacy_cache	enumeraterm   r   )rY   r)   r^   rG  rH  r  rB  r  rI  r  r  rJ  all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                      r7   rn   BridgeTowerTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	//O4K1,dkk2RT`hlhshsTtuO//JPU4V4V\
 2CCOTO(4OA|#$58H$H!.7.CilO(%'= /"3-	M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U(+  5.   14D D 
 "#%'(
 
 
 9+++*1
 	
r6   )rZ   r  rm   rj   )
NNNNNNFFTN)r,   r-   r.   r/   rL   r1   rp   r   r2   r   rf   r   r4   r   rn   r5   rq   rr   s   @r7   r  r    s   , 7;15=A>B+/$(,1/4&*15P
||P
 !!2!23P
 E--.	P

  ((9(9:P
 !)):): ;P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 !.P
 
uU\\"$MM	NP
 P
r6   r  c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )BridgeTowerTextEmbeddingsi=  zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxrE   r.  r/  r   r   Fr   token_type_idsr   )rK   rL   r   r   
vocab_sizerN   pad_token_idword_embeddingsr;  position_embeddingstype_vocab_sizetoken_type_embeddingsrP   rQ   r   r   r  r:  r.  r   r1   r   r   zerosr   r   rU  r  rX   s     r7   rL   "BridgeTowerTextEmbeddings.__init__C  si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r6   c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr   r   r  r   r`   r/  )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   r3  r  r   r1   r  rU  r   rb   r  r  r.  r  rP   r  )rY   	input_idsr  r   inputs_embedspast_key_values_lengthinput_shaper[  buffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s                r7   rn   !BridgeTowerTextEmbeddings.forward\  sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r6   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr   r   r`   r   )r   r1   r   r  rU  rb   r   r   )rY   r  r  sequence_lengthr   s        r7   r  @BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embeds  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r6   )rP   r  r  r.  r  r  r  )NNNNr   )
r,   r-   r.   r/   r0   rL   rn   r  r5   rq   rr   s   @r7   r  r  =  s$    

4 rs&P= =r6   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )ner   r1   cumsumtype_asrU  )r  r  r  maskincremental_indicess        r7   r  r    sW     <<$((*D <<!4<<TBE[[_cc##%33r6   c                   V    \ rS rSr% \\S'   SrSrSS/rSr	S\
R                  4S	 jrS
rg)BridgeTowerPreTrainedModeli  rZ   bridgetowerFr*  rA   rB  modulec                 z   U R                   R                  n[        U[        5      (       Ga  U R                   R                  S-  SU R                   R
                  -  S-  -  nU R                   R                  S-  nSU R                   R                  -  S-  nUR                  R                   GH   n[        R                  R                  UR                  R                  XB-  S9  UR                  R                  R                  R                  5         [        R                  R                  UR                  R                   R"                  X2-  S9  [        R                  R                  UR$                  R&                  R"                  XR-  S9  [        R                  R                  UR$                  R(                  R"                  X2-  S9  GM#     [        R                  R                  UR*                  R,                  XB-  S9  [        R                  R                  UR*                  R.                  R"                  XB-  S9  GO[        U[        R0                  [        R2                  [        R4                  45      (       a(  UR"                  R                  R                  SSU-  S9  O[        U[        R6                  5      (       aJ  UR8                  R                  R                  5         UR"                  R                  R;                  S5        ON[        U[<        5      (       a9  UR>                  R                  R;                  U R                   R@                  5        [        U[        R0                  [B        45      (       a3  UR8                  b%  UR8                  R                  R                  5         g g g )Ng      r   )stdg        g?)meanr  r   )"rZ   initializer_factorr  r   rN   rw   r   r{   r   initnormal_rO   in_proj_weightin_proj_biasdatazero_out_projr   rU   rG   rJ   r   r   r   rT   r   r   rP   r   fill_!BridgeTowerForContrastiveLearninglogit_scalelogit_scale_init_valueBridgeTowerMLMHead)rY   r  r  proj_stdattn_stdfc_stdr   s          r7   _init_weights(BridgeTowerPreTrainedModel._init_weights  sz   kk,,f:;;//51t{{?\?\;\ae:efH{{..4H$++111d:F++55

 9 9x~N

'',,224

 3 3 : :O		 5 56<H		 0 0 7 7X^L 6 GGOOF--==8>ORGGOOF--@@GGX^O\BIIr|| DEEMM&&CTCZ&@--KK""$MM$$S) ABB##))$++*L*LMfryy*<=>>6;;CZKK""$ D[>r6   r+   N)r,   r-   r.   r/   r   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr   Moduler  r5   r+   r6   r7   r  r    s6    %&+#35ST"3%BII %r6   r  c                   N   ^  \ rS rSr% \\S'   U 4S jr\S 5       rSS jr	Sr
U =r$ )BridgeTowerVisionModeli  rZ   c                 D   > [         TU ]  U5        [        U5      U l        g rj   )rK   rL   r   visualrX   s     r7   rL   BridgeTowerVisionModel.__init__  s     26:r6   c                 j    U R                   R                  R                  R                  R                  $ rj   )r  r   r   r   ra   rY   s    r7   ra   BridgeTowerVisionModel.dtype  s$    {{%%55<<BBBr6   c                 X    U R                  UR                  U R                  5      X#5      $ rj   )r  typera   )rY   image
image_maskr   s       r7   rn   BridgeTowerVisionModel.forward  s     {{5::djj1:XXr6   )r  )NF)r,   r-   r.   r/   r    r3   rL   propertyra   rn   r5   rq   rr   s   @r7   r	  r	    s0    ##; C CY Yr6   r	  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c            "         ^  \ rS rSr% \\S'   SU 4S jjrS rS rS r	\
              SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\R                     S\\\R                     \4   4S jj5       rSrU =r$ )BridgeTowerTextModeli  rZ   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rK   rL   rZ   r  r   r  encoderr   pooler	post_init)rY   rZ   add_pooling_layerr[   s      r7   rL   BridgeTowerTextModel.__init__  sL    
 	 3F;-f53D'/$ 	r6   c                 .    U R                   R                  $ rj   r   r  r  s    r7   get_input_embeddings)BridgeTowerTextModel.get_input_embeddings  s    ...r6   c                 $    XR                   l        g rj   r  rY   r8  s     r7   set_input_embeddings)BridgeTowerTextModel.set_input_embeddings  s    */'r6   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rm   rg   r  )rY   heads_to_prunerm   r  s       r7   _prune_heads!BridgeTowerTextModel._prune_heads   s<    
 +002LELLu%//;;EB 3r6   r  r^   r  r   rG  r  rH  r  rB  r  rI  r  r  rJ  r   c                 
   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nSnU	b:  [        U	[        5      (       d  U	S   S   R                  S   OU	R                  5       nUc  [        R                  " UUU-   4US9nUcs  [!        U R"                  S5      (       a4  U R"                  R$                  S S 2S U24   nUR'                  UU5      nUnO$[        R(                  " U[        R*                  US	9nU R-                  X/5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R/                  U5      nOS nU R1                  XPR                   R2                  5      nU R#                  UUUUUS
9nU R5                  UUUUUU	U
UUUUS9nUS   nU R6                  b  U R7                  U5      OS nU(       d
  UU4USS  -   $ [9        UUUR:                  UR<                  UR>                  UR@                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   rL  rb   r  r`   )r  r   r  r  r  )
r^   rG  rH  r  rB  r  rI  r  r  rJ  r   )r  r(   rB  r)   r*   r  )!rZ   rI  r  use_return_dictr=  r  r   %warn_if_padding_and_no_attention_maskr   rb   r  r   r   get_seq_lengthr1   onesr3  r   r  r   r  rU  get_extended_attention_maskinvert_attention_maskget_head_maskrw   r  r  r   rB  r)   r*   r  ) rY   r  r^   r  r   rG  r  rH  r  rB  r  rI  r  r  rJ  r  r   r[  rb   r  r  r  extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr~   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr'  s                                    r7   rn   BridgeTowerTextModel.forward  s3   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"& "/599  "1%++B/$335 # !"ZZ*jCY6Y)ZdjkN!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r6   )rZ   r   r  r  )T)NNNNNNNNNNNNNN)r,   r-   r.   r/   r   r3   rL   r   r$  r)  r   r   r1   rp   r   rf   r   r4   r   rn   r5   rq   rr   s   @r7   r  r    s~    "! /0C  -11515/3,0048<9=+/$(,0/3&*15s
ELL)s
 !.s
 !.	s

 u||,s
 ELL)s
  -s
  (5s
 !) 6s
 "%s
 D>s
 $D>s
 'tns
 d^s
 !.s
  
uU\\"$PP	Q!s
 s
r6   r  zv
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    c            "         ^  \ rS rSrU 4S jrS rS r\              SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\S\\\	R                      \4   4S jj5       rS rSrU =r$ )BridgeTowerModeli  c           
      
	  > [         TU ]  U5        Xl        UR                  nUR                  nUR
                  (       aa  [        R                  " UR                  UR                  5      U l	        [        R                  " UR                  UR                  5      U l
        O[        R                  " [        UR                  5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l	        [        R                  " [        UR                  5       Vs/ s H.  n[        R                  " UR                  UR                  5      PM0     sn5      U l
        [        R                  " SUR                  5      U l        [!        U5      U l        [%        U5      U l        UR(                  (       d  UR*                  (       a  U R"                  R,                  R.                   H  nU R"                  R,                  R0                  R2                  R4                  UR2                  l        U R"                  R,                  R0                  R6                  R4                  UR6                  l        M     [        R                  " [        UR                  5       Vs/ s H  n[9        X6S9PM     sn5      U l        [        R                  " [        UR                  5       Vs/ s H  n[9        X6S9PM     sn5      U l        [?        U5      U l         [?        U5      U l!        [        RD                  " UR                  URF                  S9U l$        [        RD                  " UR                  URF                  S9U l%        URL                  (       a!  [O        U5      U l(        [O        U5      U l)        O[        R                  " [        UR                  S-
  5       Vs/ s H  n[O        U5      PM     sn5      U l(        [        R                  " [        UR                  S-
  5       Vs/ s H  n[O        U5      PM     sn5      U l)        U RU                  5         g s  snf s  snf s  snf s  snf s  snf s  snf )Nr   r  rE   r   )+rK   rL   rZ   vision_configtext_config$share_cross_modal_transformer_layersr   rT   rN   cross_modal_text_transformcross_modal_image_transformry   rz   rw   r   r  r	  vision_modelr  
text_modelr   "init_layernorm_from_vision_encoderr  cross_modal_ln_separater   r   r  r   r  cross_modal_image_layerscross_modal_text_layersr   cross_modal_image_poolercross_modal_text_poolerrP   rQ   cross_modal_text_layernormcross_modal_image_layernormshare_link_tower_layersr   cross_modal_text_link_towercross_modal_image_link_towerr  )rY   rZ   r@  rA  r~   r   r  r[   s          r7   rL   BridgeTowerModel.__init__  sW    ,,((66.0ii8O8OQWQcQc.dD+/1yy9R9RTZTfTf/gD,.0mmQVW]WoWoQpqQpA;22F4F4FGQpq/D+ 02}}SXY_YqYqSrsSra=44f6H6HISrs0D, &(\\!V5G5G%H"2=A.{;,,1Z1Z''..FF!%!2!2!9!9!A!A!H!H!M!M		#0077??DDII G )+JOPVPhPhJijJiQ&{@Jij)
% (*}}JOPVPhPhJijJiQ&{@Jij(
$
 ):&(A%'8'@$ +-,,v7I7IvOdOd*e'+-<<8J8JPVPePe+f())/CF/KD,0DV0LD-/1}}7<V=U=UXY=Y7Z[7Z!%f-7Z[0D, 137<V=U=UXY=Y7Z[7Z!%f-7Z[1D- 	W r t k k  \ \s$   5Q'05Q,-Q13Q6*Q;5R c                 6    U R                   R                  5       $ rj   )rF  r   r  s    r7   r   %BridgeTowerModel.get_input_embeddings  s    3355r6   c                 :    U R                   R                  U5        g rj   )rF  r$  r#  s     r7   r$  %BridgeTowerModel.set_input_embeddings  s    ,,U3r6   r  r^   r  r   
pixel_maskrG  r  r>   image_token_type_idxrI  r  r  labelsr   r   c           	      .   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU(       a  SOSnU(       a  SOSnU(       a  SOSnU(       a  SOSnU
(       a  SOSnUb  Uc  [        S5      eUb  UOU R                   R                  nU	(       a  U	OSn	UR                  5       nU R                  R                  US9nU(       a  UU4-  nUc.  [        R                  " U[        R                  UR                  S9nU R                  R                  UU5      R                  UR                  5      n[        U R                  R                  R                   5      U R                   R"                  -
  S-   nU R                  R                  R                   SU  H  nU" UU5      S   nU(       d  M  UU4-  nM      UcH  U R$                  R&                  R)                  UR+                  U R$                  R,                  5      US9nOUR/                  SSS	5      nU(       a  UU4-  nU R$                  R&                  R0                  R2                  SU  H  nU" U5      nU(       d  M  UU4-  nM     U R$                  R&                  R5                  UR+                  U R$                  R,                  5      5      nU R7                  U5      nU R9                  [        R:                  " S[        R                  UR                  S95      R=                  U5      nU R?                  UU-   5      nU RA                  U5      nU R9                  [        RB                  " S
U	[        R                  UR                  S95      R=                  U5      nUU-   nU RE                  U5      n[        R                  " UR                  S5      UR                  S5      4[        R                  UR                  S9nU R                  R                  XUR                  5       5      R                  UR                  5      nU RF                  S   " UUUUU
S9n U S   n!U RH                  S   " UUUUU
S9n"U"S   n#U(       a  UU!U#44-  nU
(       a  UU S   U"S   44-  nSn$[K        U[        U R                  R                  R                   5      5       GHx  n%U R                  R                  R                   U%   " UU5      S   nU R$                  R&                  R0                  R2                  U%   " U5      R+                  U R$                  R,                  5      nU RA                  U R$                  R&                  R5                  U5      5      U-   nU RL                  U$   n&U RN                  U$   n'U&" U R7                  U5      U-   U!U5      n(U'" UU#U5      n)U RF                  U$S-      " U(U)UUU
S9n U S   n!U RH                  U$S-      " U)U(UUU
S9n"U"S   n#U$S-  n$U(       a  UU4-  nUU4-  nUU!U#44-  nU
(       d  GMj  UU S   U"S   44-  nGM{     U!U#n+n*U RQ                  U*U+5      n,U(       a  UUU4nU(       d  [S        S U*U+U,UU4 5       5      $ [U        U*U+U,UUS9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
image_token_type_idx (`int`, *optional*):
    - The token type ids for images.
output_hidden_states (`bool`, *optional*):
    If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and
    cross-modal components respectively. i.e. `(hidden_states_text, hidden_states_image,
    hidden_states_cross_modal)` where each element is a list of the hidden states of the corresponding
    modality. `hidden_states_txt/img` are a list of tensors corresponding to unimodal hidden states and
    `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states` and
    `cross_modal_image_hidden_states` of each brdige layer.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels are currently not supported.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerModel
>>> from PIL import Image
>>> import requests

>>> # prepare image and text
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "hello world"
>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
>>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

>>> inputs = processor(image, text, return_tensors="pt")
>>> outputs = model(**inputs)
>>> outputs.keys()
odict_keys(['text_features', 'image_features', 'pooler_output'])
```Nr+   zYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r   )r  r`   r   r   r   r   )r^   r  rI  c              3   0   #    U  H  nUc  M  Uv   M     g 7frj   r+   r  s     r7   r  +BridgeTowerModel.forward.<locals>.<genexpr>  s      nA nr  )r&   r'   r(   r)   r*   )+rZ   rI  r  r   r-  r   rF  r   r1   r0  rU  rb   r1  re   r~  r  rm   rw   rE  r  r   r  ra   r   r   r{   r   rC  r  r  	expand_asrM  rD  fullrN  rJ  rI  rz   rP  rQ  get_cls_featuresr4   r$   )-rY   r  r^   r  r   rW  rG  r  r>   rX  rI  r  r  rY  r   all_hidden_states_textall_hidden_states_imageall_hidden_states_crossr  r  r  r=   extend_text_maskssplit_indexrm   r   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexr  text_link_towerimage_link_towercross_text_features_cross_image_features_r&   r'   cls_featuress-                                                r7   rn   BridgeTowerModel.forward  s   j 2C1N-TXT_T_TqTq$8$D $++JjJj 	 (<(<"$(<"$"6BD$5b4$):%k  &1%<k$++B]B]7K3QRnn&oo0090E"{n4"!"ZZ5::iN^N^_N OOGGXcdgg

 $//117784;;;X;XX[\\ __,,22<K@E->?BK##&;.8&	 A ,,33??!!$"3"3"9"9:Um @ L
 (//1a8L#6# &&--99CCL[QE .L##'L?:' R
  $0077DD\EVEVW[WhWhWnWnEop  ::;G%)%?%?KKI4D4DE&

)$
% 	#  ::;KNh;hi#??@TU&*&@&@JJt1IL\L\]'

)(
) 	$  46QQ <<=QRZZ##A&(9(>(>q(AB**##


 "__HHUdUdUfgjj
 "99!<,#5/
 13";;A>-#4/
  315#)<>R(S'UU#%7%:<OPQ<R$S#UU {C(?(?(E(E$FGA//1177:;HYZ[\]K,,33??II!L\Z__!!''L 001B1B1I1I1V1VWc1de-. !
 #>>?OPO#@@AQR $3//<?YY#!$ 
 %55IK_as$t! "&!=!=>NQR>R!S$%0'9"3" #5Q"7"&"?"?@PST@T"U%$1'8"3# $7q#9 !#&;.8&'L?:''-@BV,W+YY'  #);A)>@STU@V(W'YY#a Hf )<=Q~,,]NK!79PRi j 'GXZmn   &')&+*
 	
r6   c                 r    U R                  U5      nU R                  U5      n[        R                  " X4/SS9$ )Nr   r   )rL  rK  r1   r   )rY   r&   r'   cls_features_textcls_features_images        r7   r`  !BridgeTowerModel.get_cls_features  s9     88G!::>Jyy+@bIIr6   )rZ   rN  rI  rQ  rK  rD  rM  rJ  rP  rL  rC  rF  r  rE  )NNNNNNNNNNNNNF)r,   r-   r.   r/   rL   r   r$  r   r   r1   
LongTensorr2   r   rf   r   r4   rp   r$   rn   r`  r5   rq   rr   s   @r7   r>  r>    s   6p64  156:594815155948.2,0/3&*-1).j
E,,-j
 !!2!23j
 !!1!12	j

 u001j
 U--.j
 E--.j
   1 12j
 u001j
 'smj
 $D>j
 'tnj
 d^j
 ))*j
 #'j
  
uU\\"$::	;!j
 j
XJ Jr6   r>  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"BridgeTowerPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rK   rL   r   rT   rN   r   r  r  r  r
   transform_act_fnrP   rQ   rX   s     r7   rL   +BridgeTowerPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr6   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rj   )r   r  rP   r  s     r7   rn   *BridgeTowerPredictionHeadTransform.forward  s4    

=1--m<}5r6   )rP   r   r  r   rr   s   @r7   r}  r}    s    U r6   r}  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )r  i  c                 n  > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  R                  SS9U l
        [
        R                  " [        R                  " UR                  R                  5      5      U l        Ub  X R                  l        g g )NF)r   )rK   rL   rZ   r}  	transformr   rT   rN   rA  r  decoderr   r1   r  r   r   )rY   rZ   r   r[   s      r7   rL   BridgeTowerMLMHead.__init__  s    ;FCyy!3!3V5G5G5R5RY^_LLV-?-?-J-J!KL	"(LL r6   c                 d    U R                  U5      nU R                  U5      U R                  -   nU$ rj   )r  r  r   )rY   x	mlm_scores      r7   rn   BridgeTowerMLMHead.forward  s-    NN1%	LL+dii7	r6   )r   rZ   r  r  rj   r   rr   s   @r7   r  r    s    ) r6   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerITMHeadi  c                 Z   > [         TU ]  5         [        R                  " US5      U l        g Nr   rK   rL   r   rT   fc)rY   rN   r[   s     r7   rL   BridgeTowerITMHead.__init__  s     ))K+r6   c                 (    U R                  U5      nU$ rj   r  )rY   r  	itm_scores      r7   rn   BridgeTowerITMHead.forward  s    GGAJ	r6   r  r   rr   s   @r7   r  r    s    , r6   r  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                     ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\
R                     S\\\\
R                     4   4S jj5       rSrU =r$ )BridgeTowerForMaskedLMi  zmlm_score.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rj   )rK   rL   r>  r  r  r  r  rX   s     r7   rL   BridgeTowerForMaskedLM.__init__  s5     +F3+F3 	r6   c                 .    U R                   R                  $ rj   r  r  r  s    r7   get_output_embeddings,BridgeTowerForMaskedLM.get_output_embeddings  s    ~~%%%r6   c                 $    XR                   l        g rj   r  )rY   new_embeddingss     r7   set_output_embeddings,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r6   r  r^   r  r   rW  rG  r  r>   rI  r  r  rY  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
US9nU R                  U(       a  UR                  OUS   5      nSnUbk  [        5       nUR                  UR                  5      nU" UR                  SU R                   R                  R                  5      UR                  S5      5      nU(       d  [        U5      nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> text = "a <mask> looking out of the window"

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # prepare inputs
>>> encoding = processor(image, text, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**encoding)

>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

>>> print(results)
.a cat looking out of the window.
```N
r^   r  r   rW  rG  r  r>   rI  r  r  r   r   r;   r<   r)   r*   )rZ   r-  r  r  r&   r   re   rb   r   rA  r  r4   r   r)   r*   )rY   r  r^   r  r   rW  rG  r  r>   rI  r  r  rY  r  
mlm_logitsmasked_lm_lossloss_fctry  s                     r7   rn   BridgeTowerForMaskedLM.forward  s   d &1%<k$++B]B]""))%!'%/!5# # 
 ^^[G$9$9gVWjY
')HYYz001F%joob$++:Q:Q:\:\&]_e_j_jkm_noN:&F3A3M^%.YSYY!//))	
 	
r6   )r  r  NNNNNNNNNNNN)r,   r-   r.   r/   _tied_weights_keysrL   r  r  r   r   r1   r{  r2   rf   r   r   r4   rn   r5   rq   rr   s   @r7   r  r    sj    55&0  156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
~uU%6%677	8Q
 Q
r6   r  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\R                     S\\\\R                     4   4S jj5       rSrU =r$ )#BridgeTowerForImageAndTextRetrievaliN  c                    > [         TU ]  U5        [        U5      U l        [	        UR
                  S-  5      U l        U R                  5         g r  )rK   rL   r>  r  r  rN   r  r  rX   s     r7   rL   ,BridgeTowerForImageAndTextRetrieval.__init__U  s@     +F3+F,>,>,BC 	r6   r  r^   r  r   rW  rG  r  r>   rI  r  r  rY  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
US9nU(       a  UR                  OUS   nU R	                  U5      nSnUb-  [        5       nUR                  UR                  5      nU" X5      nU(       d  [        U5      nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )aM  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
    Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
    The pairs with 0 will be skipped for calculation.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
>>> import requests
>>> from PIL import Image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # forward pass
>>> scores = dict()
>>> for text in texts:
...     # prepare inputs
...     encoding = processor(image, text, return_tensors="pt")
...     outputs = model(**encoding)
...     scores[text] = outputs.logits[0, 1].item()
```Nr  r   r  )rZ   r-  r  r(   r  r   re   rb   r4   r   r)   r*   )rY   r  r^   r  r   rW  rG  r  r>   rI  r  r  rY  r  r(   r<   itm_lossr  ry  s                      r7   rn   +BridgeTowerForImageAndTextRetrieval.forward_  s    \ &1%<k$++B]B]""))%!'%/!5# # 
 2=--'!*.')HYYv}}-F/H6]F-5-AXK&(MvM'!//))	
 	
r6   )r  r  r  )r,   r-   r.   r/   rL   r   r   r1   r{  r2   rf   r   r   r4   rn   r5   rq   rr   s   @r7   r  r  N  sV     156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
'u/@/@)AA	BQ
 Q
r6   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerContrastiveHeadi  c                 X   > [         TU ]  5         [        R                  " X5      U l        g rj   r  )rY   rN   
embed_sizer[   s      r7   rL   #BridgeTowerContrastiveHead.__init__  s    ))K4r6   c                 (    U R                  U5      nU$ rj   r  )rY   r  s     r7   rn   "BridgeTowerContrastiveHead.forward  s    GGAJr6   r  r   rr   s   @r7   r  r    s    5 r6   r  zl
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )r  i  c                   > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  S-  UR                  5      U l	        [        R                  " [        R                  " U R                  R                  5      5      U l        U R#                  5         g r  )rK   rL   r>  r  r  rN   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr   r   r1   r   rZ   r  r  r  rX   s     r7   rL   *BridgeTowerForContrastiveLearning.__init__  s     +F378J8JFLjLjk89K9KVMkMkl$>v?Q?QTU?UW]WuWu$v!<<T[[5W5W(XYr6   r  r^   r  r   rW  rG  r  r>   rI  r  r  return_lossr   c                 >   Ub  UOU R                   R                  nU R                  UUUUUUUUU	SUS9nU(       a  UR                  OUS   nU(       a  UR                  OUS   u  nnnUS   nUS   nU R                  R
                  R                  R                  U5      nU R                  R                  [        R                  " SS[        R                  U R                  R                  R                  R                  S	95      R                  U5      nU R                  R                  U5      U-   n[         R"                  R%                  U R'                  USS2S
SS24   5      SSS9n[         R"                  R%                  U R)                  USS2S
SS24   5      SSS9R+                  UR                  S9n[         R"                  R%                  U R-                  U5      SSS9R+                  UR                  S9n[        R.                  " UUU/SS9nU R0                  R3                  5       R+                  UR                  S9n[        R4                  " UUR7                  5       5      U-  n[        R4                  " UUR7                  5       5      U-  n[        R4                  " UUR7                  5       5      U-  nSnU(       a  [        R8                  " [;        U5      UR                  S9n[         R"                  R=                  UU5      n[         R"                  R=                  UU5      n[         R"                  R=                  UU5      nUU-   U-   S-  nU(       d  UUUU4USS -   n Ub  U4U -   $ U $ [?        UUUUUUR                  UR@                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
>>> import requests
>>> from PIL import Image
>>> import torch

>>> image_urls = [
...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
... ]
>>> texts = ["two dogs in a car", "two cats sleeping on a couch"]
>>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
>>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

>>> inputs = processor(images, texts, padding=True, return_tensors="pt")
>>> loss = model(**inputs, return_loss=True).loss

>>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
>>> loss_swapped = model(**inputs, return_loss=True).loss

>>> print("Loss", round(loss.item(), 4))
Loss 0.0019

>>> print("Loss with swapped images", round(loss_swapped.item(), 4))
Loss with swapped images 2.126
```NTr  r   r	   r   r[  r   r`   r   )r   pr,  rL  r   g      @)r;   r<   r=   r>   r?   r)   r*   )!rZ   r-  r  r(   r)   rE  r  r   r  r1   r_  rU  r   rb   r^  rD  r   r   	normalizer  r  re   r  r   r  exprT  tr   r~  cross_entropyr9   r*   )!rY   r  r^   r  r   rW  rG  r  r>   rI  r  r  r  r  r(   hidden_states_txthidden_states_imghidden_states_cross_modalr=   rf  ri  r?   r<   r  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossrY  text_to_image_losstext_to_cross_lossimage_to_cross_lossry  s!                                    r7   rn   )BridgeTowerForContrastiveLearning.forward  sz   j &1%<k$++B]B]""))%!'%/!%# # 
 2=--'!*%0G!!gaj 	H,.G (+(,#//<<CCPPQ]^&*&6&6&L&LJJtQejj9I9I9_9_9f9f9m9mn'

)(
) 	$ ''CCDXY\ww mm--d.@.@QPQSTWAU.V\^bc-d}}..t/B/B<PQSTVWPWCX/Y_aef.gjj%% k 
 }}..t/H/H/W]_cd.ehh%% i 
 k<FBO&&**,//{7I7I/J$||K9IJ[X$||K9IJ[X %\<>>;K L{ Z\\#f+fmmDF!#!<!<=QSY!Z!#!<!<=QSY!Z"$--"="=>SU["\*-??BUUY\\Hk<FQRQSTF-5-AXK&(MvM+#%%!//))
 	
r6   )r  r  r  r  r  )NNNNNNNNNTNN)r,   r-   r.   r/   rL   r   r   r1   r{  r2   rf   r   r9   r4   rn   r5   rq   rr   s   @r7   r  r    sO     156:594815155948,0/3&*&*x
E,,-x
 !!2!23x
 !!1!12	x

 u001x
 U--.x
 E--.x
   1 12x
 u001x
 $D>x
 'tnx
 d^x
 d^x
 
+U53D3D-EE	Fx
 x
r6   r  )r  r  r  r>  r  )r   )Pr0   rW  collectionsr   dataclassesr   typingr   r   r1   r   torch.nnr   activationsr
   r   cache_utilsr   r   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   utils.deprecationr   configuration_bridgetowerr   r   r    
get_loggerr,   r  _TOKENIZER_FOR_DOCr$   r9   r  rA   rt   r   r   r   r   r  r  r   r*  rw  rt  r  r  r  r  r  r  r	  r  r>  r}  r  r  r  r  r  r  __all__r+   r6   r7   <module>r     s      # ! "   % 6 C C 9  . l l 7 7 0 h h 
		H	%'  
:[ : :$ 
:; : :4)299 )XRYY 6P")) Pf7"299 7"td299 d4BII bii  		 		  B.ryy B.L %' #3299 3l=		 =@G5 GVY
RYY Y
zV=		 V=t4  % % %DY7 Y U
5 U
U
p 
oJ1 oJ
oJf	 "    
d
7 d

d
N ]
*D ]
]
@  
G
(B G

G
Tr6   