
    bCi	                     (   S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK	J
r
  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJrJr  SSKJrJrJr  SSKJrJrJrJrJ r   SSK!J"r"J#r#J$r$  \ RJ                  " \&5      r'\\" SS9 " S S\5      5       5       r(\\" SS9 " S S\5      5       5       r)\\ " S S\5      5       5       r*S\	RV                  S\	RV                  4S jr,S\	RV                  S\	RV                  4S jr-S\$S\.4S jr/SYS \\.\04   S!\14S" jjr2 " S# S$\
Rf                  5      r4 " S% S&\
Rj                  5      r6 " S' S(\
Rf                  5      r7 " S) S*\
Rf                  5      r8 " S+ S,\
Rf                  5      r9 " S- S.\
Rf                  5      r: " S/ S0\
Rf                  5      r; " S1 S2\
Rf                  5      r< " S3 S4\
Rf                  5      r=  SZS5\
Rf                  S6\	RV                  S7\	RV                  S8\	RV                  S9\\	RV                     S:\>S;\>S<\\	RV                     4S= jjr? " S> S?\
Rf                  5      r@ " S@ SA\
Rf                  5      rA " SB SC\
Rf                  5      rB " SD SE\
Rf                  5      rC " SF SG\
Rf                  5      rD " SH SI\5      rE " SJ SK\
Rf                  5      rF " SL SM\
Rf                  5      rG\ " SN SO\5      5       rH\" SPS9 " SQ SR\H5      5       rI\" SSS9 " ST SU\H5      5       rJ\ " SV SW\H5      5       rK/ SXQrLg)[zPyTorch ALIGN model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Srg)AlignVisionModelOutput)   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_statehidden_states )__name__
__module____qualname____firstlineno____doc__r"   r   torchFloatTensor__annotations__r#   r$   tuple__static_attributes__r%       b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/align/modeling_align.pyr    r    )   sN    
 15L(5,,-459x 1 1298<M8E%"3"345<r0   r    ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	AlignTextModelOutput:   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsr#   r$   
attentionsr%   )r&   r'   r(   r)   r*   r5   r   r+   r,   r-   r#   r$   r.   r6   r/   r%   r0   r1   r3   r3   :   sh    
 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r0   r3   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)AlignOutputL   a.  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The output of [`AlignVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AlignTextModel`].
vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
    The output of the [`AlignVisionModel`].
Nlosslogits_per_imagelogits_per_textr5   r"   text_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r=   r>   N)getattrto_tuple).0kselfs     r1   	<genexpr>'AlignOutput.to_tuple.<locals>.<genexpr>k   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)r.   keysrF   s   `r1   rC   AlignOutput.to_tuplej   s#     
YY[
 
 	
r0   r%   )r&   r'   r(   r)   r*   r:   r   r+   r,   r-   r;   r<   r5   r"   r=   r   r>   r   r.   r   rC   r/   r%   r0   r1   r8   r8   L   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-44818DHAH
%* 
r0   r8   logitsr?   c                     [         R                  R                  U [        R                  " [        U 5      U R                  S9SS9$ )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr+   arangelenrO   )rL   s    r1   contrastive_lossrU   s   s5    ==&&vu||CKPVP]P]/^ps&ttr0   
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)rU   t)rV   caption_loss
image_losss      r1   
align_lossr[   w   s*    #J/L!*,,.1J%,,r0   confignum_channelsc                     U R                   nXR                  -  n[        U[        XS-  -   5      U-  U-  5      nUSU-  :  a  X2-  n[        U5      $ )z4
Round number of filters based on depth multiplier.
   g?)depth_divisorwidth_coefficientmaxint)r\   r]   divisornew_dims       r1   round_filtersrf   ~   s`     ""G,,,L'3|k9:gEOPG |##w<r0   kernel_sizeadjustc                     [        U [        5      (       a  X 4n U S   S-  U S   S-  4nU(       a  US   S-
  US   US   S-
  US   4$ US   US   US   US   4$ )a.  
Utility function to get the tuple padding value for the depthwise convolution.

Args:
    kernel_size (`int` or `tuple`):
        Kernel size of the convolution layers.
    adjust (`bool`, *optional*, defaults to `True`):
        Adjusts padding value to apply to right and bottom sides of the input.
r   r_   r   )
isinstancerc   )rg   rh   corrects      r1   correct_padrl      s~     +s##"01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r0   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	AlignVisionEmbeddings   zD
A module that corresponds to the stem module of the original work.
r\   c           	      |  > [         TU ]  5         [        US5      U l        [        R
                  " SS9U l        [        R                  " UR                  U R                  SSSSS9U l	        [        R                  " U R                  UR                  UR                  S	9U l        [        UR                     U l        g )
N    )r   r   r   r   paddingr	   r_   validFrg   striders   bias)epsmomentum)super__init__rf   out_dimr   	ZeroPad2drs   Conv2dr]   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationrF   r\   	__class__s     r1   r{   AlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r0   pixel_valuesr?   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)rs   r   r   r   )rF   r   featuress      r1   forwardAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r0   )r   r   r   r|   rs   )r&   r'   r(   r)   r*   r   r{   r+   Tensorr   r/   __classcell__r   s   @r1   rn   rn      s5    	40 	4ELL U\\  r0   rn   c                   :   ^  \ rS rSr       SU 4S jjrSrU =r$ )AlignVisionDepthwiseConv2d   c	                 8   > X-  n	[         T
U ]  UU	UUUUUUUS9	  g )N)	in_channelsout_channelsrg   rv   rs   dilationgroupsrw   padding_mode)rz   r{   )rF   r   depth_multiplierrg   rv   rs   r   rw   r   r   r   s             r1   r{   #AlignVisionDepthwiseConv2d.__init__   s:     #5#%#% 	 
	
r0   r%   )r   r	   r   r   r   Tzeros)r&   r'   r(   r)   r{   r/   r   r   s   @r1   r   r      s$     
 
r0   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jrSrU =r$ )AlignVisionExpansionLayer   zW
This corresponds to the expansion phase of each block in the original implementation.
r\   in_dimr|   rv   c                    > [         TU ]  5         [        R                  " UUSSSS9U l        [        R
                  " X1R                  S9U l        [        UR                     U l
        g )Nr   sameFr   r   rg   rs   rw   )num_featuresrx   )rz   r{   r   r~   expand_convr   r   	expand_bnr
   r   
expand_act)rF   r\   r   r|   rv   r   s        r1   r{   "AlignVisionExpansionLayer.__init__   sX    99 
 WBWBWX !2!23r0   r$   r?   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rF   r$   s     r1   r   !AlignVisionExpansionLayer.forward   s4    ((7}56r0   )r   r   r   )r&   r'   r(   r)   r*   r   rc   r{   r+   r,   r   r   r/   r   r   s   @r1   r   r      sM    
40 
4# 
4 
4UX 
4U%6%6 5<<  r0   r   c            
       ~   ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjrS	\	R                  S
\	R                  4S jrSrU =r$ )AlignVisionDepthwiseLayer   zc
This corresponds to the depthwise convolution phase of each block in the original implementation.
r\   r   rv   rg   adjust_paddingc                 F  > [         TU ]  5         X0l        U R                  S:X  a  SOSn[        XES9n[        R
                  " US9U l        [        X$X6SS9U l        [        R                  " X!R                  UR                  S9U l        [        UR                     U l        g )	Nr_   rt   r   )rh   rr   Fru   r   rx   ry   )rz   r{   rv   rl   r   r}   depthwise_conv_padr   depthwise_convr   r   r   depthwise_normr
   r   depthwise_act)	rF   r\   r   rv   rg   r   conv_padrs   r   s	           r1   r{   "AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7FkA"$,,w"?8FSX
 !nn%:%:VE_E_
 $F$5$56r0   r$   r?   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ )Nr_   )rv   r   r   r   r   r   s     r1   r   !AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r0   )r   r   r   r   rv   r&   r'   r(   r)   r*   r   rc   boolr{   r+   r,   r   r   r/   r   r   s   @r1   r   r      s_    7!7 7 	7
 7 7,	U%6%6 	5<< 	 	r0   r   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )AlignVisionSqueezeExciteLayeri  zd
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
r\   r   
expand_dimexpandc                   > [         TU ]  5         U(       a  UOUU l        [        S[	        X!R
                  -  5      5      U l        [        R                  " SS9U l	        [        R                  " U R                  U R                  SSS9U l        [        R                  " U R                  U R                  SSS9U l        [        UR                     U l        [        R                   " 5       U l        g )Nr   )output_sizer   )r   r   rg   rs   )rz   r{   dimrb   rc   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezer~   reducer   r
   r   
act_reduceSigmoid
act_expand)rF   r\   r   r   r   r   s        r1   r{   &AlignVisionSqueezeExciteLayer.__init__   s    !':V!S*H*H!HIJ++:ii	
 ii	
 !!2!23**,r0   r$   r?   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      n[
        R                  " X!5      nU$ r   )r   r   r   r   r   r+   mul)rF   r$   inputss      r1   r   %AlignVisionSqueezeExciteLayer.forward5  sa    ]3M26M26		&8r0   )r   r   r   r   r   r   r   )Fr   r   s   @r1   r   r     sR    '0 '# '3 'X\ ' '*
U%6%6 
5<< 
 
r0   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\4U 4S	 jjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionFinalBlockLayeriB  zS
This corresponds to the final phase of each block in the original implementation.
r\   r   r|   rv   	drop_rateid_skipc                   > [         TU ]  5         US:H  =(       a    U(       + U l        [        R                  " UUSSSS9U l        [        R                  " X1R                  UR                  S9U l	        [        R                  " US9U l        g )Nr   r   Fr   r   )p)rz   r{   apply_dropoutr   r~   project_convr   r   r   
project_bnDropoutdropout)rF   r\   r   r|   rv   r   r   r   s          r1   r{   #AlignVisionFinalBlockLayer.__init__G  sx     	#q[8[II 
 .. &;&;fF`F`
 zzI.r0   
embeddingsr$   r?   c                     U R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX!-   nU$ r   )r   r   r   r   )rF   r   r$   s      r1   r   "AlignVisionFinalBlockLayer.forwardX  sE    ))-86 LL7M)6Mr0   )r   r   r   r   r&   r'   r(   r)   r*   r   rc   floatr   r{   r+   r,   r   r   r/   r   r   s   @r1   r   r   B  so    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf  r0   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\4U 4S jjr	S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionBlockic  a1  
This corresponds to the block module of original the EfficientNet vision encoder implementation.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
    in_dim (`int`):
        Number of input channels.
    out_dim (`int`):
        Number of output channels.
    stride (`int`):
        Stride size to be used in convolution layers.
    expand_ratio (`int`):
        Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
    kernel_size (`int`):
        Kernel size for the depthwise convolution layer.
    drop_rate (`float`):
        Dropout rate to be used in the final phase of each block.
    id_skip (`bool`):
        Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
        of each block. Set to `True` for the first block of each stage.
    adjust_padding (`bool`):
        Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
        operation, set to `True` for inputs with odd input sizes.
r\   r   r|   rv   expand_ratiorg   r   r   r   c
           	      f  > [         TU ]  5         XPl        U R                  S:g  U l        X%-  n
U R                  (       a  [	        XXS9U l        [        UU R                  (       a  U
OUUUU	S9U l        [        XXR                  S9U l	        [        UU R                  (       a  U
OUUUUUS9U l        g )Nr   )r\   r   r|   rv   )r\   r   rv   rg   r   )r\   r   r   r   )r\   r   r|   rv   r   r   )rz   r{   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rF   r\   r   r|   rv   r   rg   r   r   r   expand_in_dimr   s              r1   r{   AlignVisionBlock.__init__~  s     	(''1,-;;6mDN 8$(KK=V#)
 <];;
 5$(KK=V
r0   r$   r?   c                     UnU R                   S:w  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X!5      nU$ Nr   )r   r   r   r   r   )rF   r$   r   s      r1   r   AlignVisionBlock.forward  sY    "
! NN=9M++M: ++M:
Br0   )r   r   r   r   r   r   r   r   s   @r1   r   r   c  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
 
r0   r   c            	       v   ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\
   S\	\
   S\4S	 jjrS
rU =r$ )AlignVisionEncoderi  z
Forward propagates the embeddings through each vision encoder (EfficientNet) block.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
r\   c                   >^ ^ [         TT ]  5         UR                  T l        U 4S jm[        UR                  5      n[        U4S jUR                   5       5      nSn/ n[        U5       H  n[        XR                  U   5      n[        XR                  U   5      nUR                  U   n	UR                  U   n
UR                  U   n[        T" UR                  U   5      5       Hc  nUS:H  nUS:  a  SOU	n	US:  a  UOUnXAR                  ;  nUR                  U-  U-  n[        UUUU	U
UUUUS9	nUR!                  U5        US-  nMe     M     ["        R$                  " U5      T l        g )Nc                 \   > [        [        R                  " TR                  U -  5      5      $ r   )rc   mathceildepth_coefficient)repeatsrF   s    r1   round_repeats2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr0   c              3   4   >#    U  H  nT" U5      v   M     g 7fr   r%   )rD   nr   s     r1   rG   .AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq))3Ks   r   r   )	r\   r   r|   rv   rg   r   r   r   r   )rz   r{   r   rT   r   sumnum_block_repeatsrangerf   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rF   r\   num_base_blocks
num_blockscurr_block_numr  ir   r|   rv   rg   r   jr   r   r   blockr   r   s   `                @r1   r{   AlignVisionEncoder.__init__  sp   !'!9!9	D f001L63K3KLL
'A"6+=+=a+@AF#F,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEFq&!e$%Ev!/7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r0   r$   output_hidden_statesreturn_dictr?   c                     U(       a  U4OS nU R                    H  nU" U5      nU(       d  M  XA4-  nM     U(       d  [        S X4 5       5      $ [        UUS9$ )Nc              3   .   #    U  H  oc  M  Uv   M     g 7fr   r%   )rD   vs     r1   rG   -AlignVisionEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)r#   r$   )r  r.   r   )rF   r$   r  r  all_hidden_statesr  s         r1   r   AlignVisionEncoder.forward  sh     1E],$[[E!-0M##!%55! !
 X]$FXXX-++
 	
r0   )r  r   )FT)r&   r'   r(   r)   r*   r   r{   r+   r,   r   r   r   r   r/   r   r   s   @r1   r   r     s\    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
 
r0   r   c                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\\R                     S\R                  4
S	 jjrS
rU =r$ )AlignTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxrx   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_ids)dtype)rz   r{   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   rB   r  register_bufferr+   rS   r   r   r  sizelongr   s     r1   r{   AlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r0   	input_idsr  r  inputs_embedsr?   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr  r   r  r   r   rO   r  )r.  r  hasattrr  r   r+   r   r/  rO   r%  r)  r  r'  r*  r   )rF   r1  r  r  r2  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr)  r   r'  s               r1   r   AlignTextEmbeddings.forward  s<     #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r0   )r*  r   r  r'  r)  r%  )NNNN)r&   r'   r(   r)   r*   r{   r   r+   
LongTensorr,   r   r   r/   r   r   s   @r1   r  r    s    Q
* 15593759&E,,-& !!1!12& u//0	&
   1 12& 
& &r0   r  modulequerykeyvalueattention_maskscalingr   	head_maskc                    [         R                  " XR                  SS5      5      U-  n	Ub"  US S 2S S 2S S 2S UR                  S   24   n
X-   n	[        R
                  R                  U	S[         R                  S9R                  UR                  5      n	[        R
                  R                  XU R                  S9n	Ub  XR                  SSSS5      -  n	[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr_   r	   r  )r   r   )r   trainingr   )r+   matmul	transposeshaper   rQ   softmaxfloat32tor   r   rE  view
contiguous)r<  r=  r>  r?  r@  rA  r   rB  kwargsattn_weightscausal_maskattn_outputs               r1   eager_attention_forwardrR  =  s     <<}}Q':;gEL!$Q1o		"o%=>#1==((2U]](SVVW\WbWbcL==((6??([L#nnQAq&AA,,|3K''1-88:K$$r0   c                      ^  \ rS rSrU 4S jr   S
S\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )AlignTextSelfAttentioniX  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rz   r{   r#  num_attention_headsr5  
ValueErrorr\   rc   attention_head_sizeall_head_sizer   Linearr=  r>  r?  r   attention_probs_dropout_probr   attention_dropoutrA  r   s     r1   r{   AlignTextSelfAttention.__init__Y  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r0   r$   r@  rB  output_attentionsr?   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
U4U R                  (       d  SOU R                  U R                  US.UD6u  pUR                  " / UQSP76 R                  5       nU(       a  X4nU$ U4nU$ )Nr  r   r_   eager        )r   rA  rB  )rH  rZ  r=  rL  rG  r>  r?  rR  r\   _attn_implementationr   rE  r^  rA  reshaperM  )rF   r$   r@  rB  r`  rN  r6  hidden_shapequery_states
key_statesvalue_statesattention_interfacerQ  rO  outputss                  r1   r   AlignTextSelfAttention.forwardn  s[    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
! "));;;;FFH1B;- JUr0   )
r[  r^  rZ  r\   r   r>  rX  r=  rA  r?  NNF)r&   r'   r(   r)   r{   r+   r   r   r,   r   r.   r   r/   r   r   s   @r1   rT  rT  X  st    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	! !r0   rT  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr  )rz   r{   r   r\  r#  denser*  r+  r   r,  r   r   s     r1   r{   AlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r0   r$   input_tensorr?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rr  r   r*  rF   r$   rt  s      r1   r   AlignTextSelfOutput.forward  5    

=1]3}'CDr0   r*  rr  r   
r&   r'   r(   r)   r{   r+   r   r   r/   r   r   s   @r1   ro  ro    6    >U\\  RWR^R^  r0   ro  c                      ^  \ rS rSrU 4S jrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )AlignTextAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )rz   r{   rT  rF   ro  outputsetpruned_headsr   s     r1   r{   AlignTextAttention.__init__  s0    *62	)&1Er0   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   )r   )rT   r   rF   rX  rZ  r  r   r=  r>  r?  r  rr  r[  union)rF   headsindexs      r1   prune_headsAlignTextAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r0   r$   r@  rB  r`  r?   c                 p    U R                   " U4UUUS.UD6nU R                  US   U5      nU4USS  -   nU$ N)r@  rB  r`  r   r   )rF   r  )	rF   r$   r@  rB  r`  rN  self_outputsattention_outputrk  s	            r1   r   AlignTextAttention.forward  s]     yy
)/	

 
  ;;|AF#%QR(88r0   )r  r  rF   rm  )r&   r'   r(   r)   r{   r  r+   r   r   r,   r   r.   r   r/   r   r   s   @r1   r~  r~    sy    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	 r0   r~  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rz   r{   r   r\  r#  intermediate_sizerr  rj   r   strr
   intermediate_act_fnr   s     r1   r{   AlignTextIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r0   r$   r?   c                 J    U R                  U5      nU R                  U5      nU$ r   rr  r  r   s     r1   r   AlignTextIntermediate.forward  s&    

=100?r0   r  r{  r   s   @r1   r  r    s(    9U\\ ell  r0   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g rq  )rz   r{   r   r\  r  r#  rr  r*  r+  r   r,  r   r   s     r1   r{   AlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r0   r$   rt  r?   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rv  rw  s      r1   r   AlignTextOutput.forward  ry  r0   rz  r{  r   s   @r1   r  r    r|  r0   r  c                      ^  \ rS rSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	 rS
rU =r$ )AlignTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g r   )
rz   r{   chunk_size_feed_forwardseq_len_dimr~  	attentionr  intermediater  r  r   s     r1   r{   AlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r0   r$   r@  rB  r`  r?   c                     U R                   " U4UUUS.UD6nUS   nUSS  n[        U R                  U R                  U R                  U5      n	U	4U-   nU$ r  )r  r   feed_forward_chunkr  r  )
rF   r$   r@  rB  r`  rN  self_attention_outputsr  rk  layer_outputs
             r1   r   AlignTextLayer.forward  s     "&"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r0   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  )rF   r  intermediate_outputr  s       r1   r  !AlignTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir0   )r  r  r  r  r  rm  )r&   r'   r(   r)   r{   r+   r   r   r,   r   r.   r   r  r/   r   r   s   @r1   r  r    sy    . 7;15,1|| !!2!23 E--.	
 $D> 
u||	2 r0   r  c                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )AlignTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rz   r{   r\   r   r  r   num_hidden_layersr  layergradient_checkpointing)rF   r\   r	  r   s      r1   r{   AlignTextEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A&r$   r@  rB  r`  r  r  r?   c           	         U(       a  SOS nU(       a  SOS n	[        U R                  5       H=  u  pU(       a  X4-   nUb  X:   OS nU" SUUUUS.UD6nUS   nU(       d  M5  XS   4-   n	M?     U(       a  X4-   n[        UUU	S9$ )Nr%   )r$   r@  rB  r`  r   r   )r#   r$   r6   )	enumerater  r   )rF   r$   r@  rB  r`  r  r  rN  r  all_self_attentionsr	  layer_modulelayer_head_masklayer_outputss                 r1   r   AlignTextEncoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO( +-)"3	
 M *!,M  &91=M<O&O#!  5$   14D D++*
 	
r0   )r\   r  r  )NNFFT)r&   r'   r(   r)   r{   r   r+   r   r   r,   r   r   r.   r   r   r/   r   r   s   @r1   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r0   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextPooleriG  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rz   r{   r   r\  r#  rr  Tanhr   r   s     r1   r{   AlignTextPooler.__init__H  s9    YYv1163E3EF
'')r0   r$   r?   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )rr  r   )rF   r$   first_token_tensorpooled_outputs       r1   r   AlignTextPooler.forwardM  s6     +1a40

#566r0   )r   rr  r{  r   s   @r1   r  r  G  s(    $
U\\ ell  r0   r  c                   J    \ rS rSr% \\S'   SrSrS\R                  4S jr
Srg)	AlignPreTrainedModeliV  r\   alignTr<  c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b$  UR                  R                  R                  5         GO1[        U[        5      (       a  [        R                  R                  UR                  R                  5        UR                  R                  R                  R                  5         UR                  R                  R!                  U R                   R"                  5        O[        U[        R$                  5      (       ab  UR                  R                  R                  SUS9  UR&                  b1  UR                  R                  UR&                     R                  5         [        U[        R(                  [        R*                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R!                  S5        gg)zInitialize the weightsrc  )meanstdNg      ?)r\   initializer_rangerj   r   r\  r~   weightdatanormal_rw   zero_
AlignModelinitxavier_uniform_text_projectiontemperaturefill_temperature_init_valuer!  r  r*  r   )rF   r<  r  s      r1   _init_weights"AlignPreTrainedModel._init_weights\  s~   kk++fryy"))455MM&&CS&9{{&  &&(
++GG##F$:$:$A$AB""'',,224##))$++*L*LM--MM&&CS&9!!-""6#5#56<<>fr||R^^<==KK""$MM$$S) >r0   r%   N)r&   r'   r(   r)   r   r-   base_model_prefixsupports_gradient_checkpointingr   Moduler  r/   r%   r0   r1   r  r  V  s$    &*#*BII *r0   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   j  ^  \ rS rSr% \\S'   S/rSS\S\4U 4S jjjrS r	S r
\\         SS\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                      S\\R                     S\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )AlignTextModelip  r\   r  add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rz   r{   r\   r  r   r  encoderr  pooler	post_init)rF   r\   r  r   s      r1   r{   AlignTextModel.__init__y  sK    
 	 -f5'/1Bof- 	r0   c                 .    U R                   R                  $ r   r   r%  rJ   s    r1   get_input_embeddings#AlignTextModel.get_input_embeddings  s    ...r0   c                 $    XR                   l        g r   r  )rF   r?  s     r1   set_input_embeddings#AlignTextModel.set_input_embeddings  s    */'r0   r1  r@  r  r  rB  r2  r`  r  r  r?   c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[	        S5      eUu  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  SS2SU24   nUR                  X5      nUnO$[        R                  " U[        R                  US9nU R!                  X+5      nU R#                  XPR                   R$                  5      nU R                  UUUUS9nU R&                  " U4UUUUS	S
.U
D6nUS   nU R(                  b  U R)                  U5      OSn[+        UUUR,                  UR.                  S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, AlignTextModel

>>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer  z5You have to specify either input_ids or inputs_embedsrN   r  r4  )r1  r  r  r2  T)r@  rB  r`  r  r  r   )r#   pooler_outputr$   r6   )r\   r`  r  use_return_dictrY  %warn_if_padding_and_no_attention_maskr.  rO   r+   onesr5  r   r  r   r   r/  get_extended_attention_maskget_head_maskr  r  r  r   r$   r6   )rF   r1  r@  r  r  rB  r2  r`  r  r  rN  r6  
batch_sizer7  rO   r8  r9  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                         r1   r   AlignTextModel.forward  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,
2/!5
 
 *!,8<8OO4UY)-')77&11	
 	
r0   r\   r   r  r  T)	NNNNNNNNN)r&   r'   r(   r)   r   r-   _no_split_modulesr   r{   r  r  r   r   r   r+   r   r,   r   r.   r   r   r/   r   r   s   @r1   r  r  p  s,    ./ 4   /0  -11515/31504,0/3&*\
ELL)\
 !.\
 !.	\

 u||,\
 E--.\
  -\
 $D>\
 'tn\
 d^\
 
u00	1\
  \
r0   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\\   SS\\R                      S\\   S	\\   S\\\4   4S
 jj5       5       rSrU =r$ )AlignVisionModeli  r\   r   Fc                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a%  [        R                  " UR                  SS9U l        OMUR                  S:X  a%  [        R                  " UR                  SS9U l        O[        SUR                   35      eU R                  5         g )Nr  T)	ceil_moderb   z2config.pooling must be one of ['mean', 'max'] got )rz   r{   r\   rn   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2drY  poolingr  r   s     r1   r{   AlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r0   r?   c                 B    U R                   R                  R                  $ r   )vision_modelr   r   rJ   s    r1   r  %AlignVisionModel.get_input_embeddings  s      ++777r0   r  r  c                 `   Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUSS9nUS   nU R                  U5      nUR                  UR                  SS 5      n[        UUUR                  S9$ )a\  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignVisionModel

>>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```Nz You have to specify pixel_valuesT)r  r  r   r_   )r#   r  r$   )r\   r  r  rY  r   r  r  re  rH  r   r$   )rF   r   r  r  r  r  r#   r  s           r1   r   AlignVisionModel.forward  s    : %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5 ' 
 ,A.$56%--m.A.A"1.EF7/')77
 	
r0   r  )NNN)r&   r'   r(   r)   r   r-   main_input_namer  r{   r   r  r  r   r   r   r+   r,   r   r   r.   r   r   r/   r   r   s   @r1   r  r    s     $O&+#0 "8bii 8  59/3&*	2
u0012
 'tn2
 d^	2

 
u>>	?2
  2
r0   r  c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\" 5       \      SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\
R                  4S jj5       5       r\" 5       \S\
R                  S
\
R                  4S j5       5       r\\           SS\	\
R                      S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S\	\   S\	\   S\	\   S\	\   S
\\\4   4S jj5       5       rSrU =r$ )r  iE  r\   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        [        U5      U l        [        U5      U l        [         R"                  " U R                  U R                  5      U l        [         R&                  " [(        R*                  " U R,                  R.                  5      5      U l        U R3                  5         g )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rz   r{   rj   text_configr   	TypeErrortypevision_configr   projection_dimr#  text_embed_dimr  
text_modelr  r  r   r\  r  	Parameterr+   tensorr\   r  r  r  )rF   r\   r  r  r   s       r1   r{   AlignModel.__init__I  s)    &,,o>>++,-Q0 
 &..0ABB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r0   r1  r@  r  r  rB  r2  r?   c           	      n    U R                  UUUUUUS9nUS   SS2SSS24   nU R                  U5      n	U	$ )am  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`AlignTextModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoTokenizer, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> with torch.inference_mode():
...     text_features = model.get_text_features(**inputs)
```)r1  r@  r  r  rB  r2  r   N)r  r  )
rF   r1  r@  r  r  rB  r2  text_outputsr#   text_featuress
             r1   get_text_featuresAlignModel.get_text_featuresg  sW    : ))%' ' 
 )OAq!G4,,->?r0   r   c                 <    U R                  US9nUR                  nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`AlignVisionModel`].

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(images=image, return_tensors="pt")
>>> with torch.inference_mode():
...     image_features = model.get_image_features(**inputs)
```)r   )r  r  )rF   r   vision_outputsimage_featuress       r1   get_image_featuresAlignModel.get_image_features  s(    2 ***E'55r0   return_lossr`  r  r  c                 R   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R	                  UU
SS9nU R                  UUUUUUU	U
SS9	nUS   nUS   SS2SSS24   nU R                  U5      nXR                  SSSS	9-  nXR                  SSSS	9-  n[        R                  " XR                  5       5      U R                  -  nUR                  5       nSnU(       a  [        U5      n[        UUUUUUUS
9$ )a^  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> import torch
>>> from transformers import AutoProcessor, AlignModel
>>> from transformers.image_utils import load_image

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = load_image(url)

>>> inputs = processor(
...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... )

>>> with torch.inference_mode():
...     outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NT)r   r  r  )	r1  r@  r  r  rB  r2  r`  r  r  r   r   r_   r  )r   r   keepdim)r:   r;   r<   r5   r"   r=   r>   )r\   r`  r  r  r  r  r  normr+   rF  rX   r  r[   r8   )rF   r1  r   r@  r  r  rB  r2  r#  r`  r  r  r  r  r"   r5   r<   r;   r:   s                      r1   r   AlignModel.forward  sl   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5 + 
 ))%'/!5 ' 

 &a("1oaAg.**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4DEHXHXX*,,.o.D-+#%* .
 	
r0   )r  r  r  r  r  r  )NNNNNN)NNNNNNNNNNN)r&   r'   r(   r)   r   r-   r{   r   r   r   r+   r   r,   r  r!  r   r;  r   r   r.   r8   r   r/   r   r   s   @r1   r  r  E  s"   { < %& -11515/3,004&ELL)& !.& !.	&
 u||,& ELL)&  -& 
		&  '&P %&u/@/@ UEVEV   '6  15481515/3,004&*,0/3&*Y
E,,-Y
 u001Y
 !.	Y

 !.Y
 u||,Y
 ELL)Y
  -Y
 d^Y
 $D>Y
 'tnY
 d^Y
 
uk!	"Y
  Y
r0   r  )r  r  r  r  r  )rc  N)Mr*   r   dataclassesr   typingr   r   r   r   r+   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_alignr   r   r   
get_loggerr&   loggerr    r3   r8   r   rU   r[   rc   rf   r.   r   rl   r  rn   r~   r   r   r   r   r   r   r   r  r   rR  rT  ro  r~  r  r  r  r  r  r  r  r  r  __all__r%   r0   r1   <module>r4     sm     ! 1 1   ! 9  G l l l l P P 
		H	% 
=[ = = 
	:; 	: 	:  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
T<")) <L (,%II%<<% 
% <<	%
 U\\*% % % %%67RYY 7v")) * *\BII  bii %/ %P.
ryy .
dbii  *? * *2 
x
) x

x
v 
M
+ M

M
` C
% C
 C
L Wr0   