
    cCi                        S r SSKrSSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKrSSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$  \!RJ                  " \&5      r'Sr(Sr)\ " S S\5      5       r*\ " S S\5      5       r+\ " S S\5      5       r,SHS jr-SIS jr.SJS jr/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S  S!\R`                  5      r3 " S" S#\R`                  5      r4 " S$ S%\R`                  5      r5 " S& S'\R`                  5      r6 " S( S)\R`                  5      r7 " S* S+\R`                  5      r8 " S, S-\R`                  5      r9 " S. S/\5      r: " S0 S1\R`                  5      r; " S2 S3\5      r<S4r=S5r>\" S6\=5       " S7 S8\<5      5       r? " S9 S:\R`                  5      r@\" S;\=5       " S< S=\<5      5       rA " S> S?\R`                  5      rB " S@ SA\R`                  5      rC " SB SC\R`                  5      rD\" SD\=5       " SE SF\<5      5       rE/ SGQrFg)KzPyTorch TVLT model.    N)deepcopy)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputSequenceClassifierOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
TvltConfigr   zZinengTang/tvlt-basec                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\R                     \	S
'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)TvltModelOutput0   a_  
Class for TvltModel's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
        Pixel sequence of hidden-states at the output of the last layer of the model.
    last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
        Audio sequence of hidden-states at the output of the last layer of the model.
    pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
        Tensor indicating which pixel patches are masked (1) and which are not (0).
    audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
        Tensor indicating which audio patches are masked (1) and which are not (0).
    pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
        Tensor containing the ids permutation of pixel masking.
    audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
        Tensor containing the ids permutation of audio masking.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
Nlast_hidden_statelast_pixel_hidden_statelast_audio_hidden_statepixel_label_masksaudio_label_maskspixel_ids_restoreaudio_ids_restore.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r    
LongTensorr!   r"   r#   r$   tupler%   __static_attributes__r&       k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.pyr   r   0   s    8 6:x 1 129;?Xe&7&78?;?Xe&7&78?48x 0 01848x 0 01848x 0 01848x 0 018=AM8E%"3"3S"89:A:>Ju00#567>r2   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Srg)	TvltDecoderOutputY   a  
Class for TvltDecoder's outputs, with potential hidden states and attentions.

Args:
    logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
Nlogits.r$   r%   r&   )r'   r(   r)   r*   r+   r7   r   r,   r-   r.   r$   r0   r%   r1   r&   r2   r3   r5   r5   Y   s\      +/FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>r2   r5   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S	'   Sr\\\R                  S4      \	S
'   Srg)TvltForPreTrainingOutputp   a  
Class for TvltForPreTraining's outputs, with potential hidden states and attentions.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
        Matching objective logits.
    pixel_logits (`torch.FloatTensor` of shape
        `(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction
        logits.
    audio_logits (`torch.FloatTensor` of shape
        `(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction
        logits.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
Nlossmatching_logitspixel_logitsaudio_logits.r$   r%   r&   )r'   r(   r)   r*   r+   r;   r   r,   r-   r.   r<   r=   r>   r$   r0   r%   r1   r&   r2   r3   r9   r9   p   s    0 )-D(5$$
%,37OXe//0704L(5,,-404L(5,,-4=AM8E%"3"3S"89:A:>Ju00#567>r2   r9   c                     U R                   SS u  p4[        R                  " X44U R                  S9n[	        USU-
  -  5      nXV4$ )!Generate noise for audio masking.N   devicer   )shaper,   randrC   int)pixel_values
pixel_mask
mask_ratio
batch_sizeseq_lennoiselen_keeps          r3   generate_pixel_mask_noiserN      sM     ',,Ra0JJJ
,\5H5HIE7a*n-.H?r2   c                 J   U R                   SS u  pVUS:X  aS  Xd-  n[        R                  " XWU R                  S9R	                  S5      R                  SSU5      R                  XV5      nO%US:X  a  [        R                  " XVU R                  S9n[        USU-
  -  5      n	WU	4$ )r@   NrA   zframe-levelrB   r   patch-level)rD   r,   rE   rC   	unsqueezerepeatviewrF   )
audio_values
audio_maskrI   	mask_typefreq_lenrJ   rK   num_time_patchesrL   rM   s
             r3   generate_audio_mask_noiserZ      s     ',,Ra0JM!".JJzL<O<OPYr]VAq(#T*&	 	 
m	#

:|7J7JK7a*n-.H(?r2   c           	         U R                   u  pEn[        R                  " USS9n[        R                  " USS9nUSS2SU24   n	[        R                  " U SU	R	                  S5      R                  SSU5      S9n
[        R                  " XE/U R                  S9nSUSS2SU24'   [        R                  " USUS9nUb  X-  n[        R                  " USU	S9nXX4$ )z
Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random
noise. sequence: [batch_size, seq_len, hidden_dim], sequence
r   dimNrP   r]   indexrB   r   )rD   r,   argsortgatherrR   rS   onesrC   )sequencerL   rM   attention_masksrJ   rK   
hidden_dimids_shuffleids_restoreids_keepsequence_maskedlabel_maskss               r3   random_maskingrk      s     '/nn#J --1-K--3K 1ixi<(Hll8(:L:LR:P:W:WXY[\^h:ijO **j28??KK !K9H9,,{EK"&,,AXN[EEr2   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )TvltPixelEmbeddings   ,Construct the patch and position embeddings.c                   > [         TU ]  5         [        U5      U l        U R                  R                  U l        [
        R                  " [        R                  " SSUR                  5      5      U l
        [
        R                  " [        R                  " SUR                  UR                  5      5      U l        [
        R                  " [        R                  " SU R                  UR                  5      5      U l        Xl        g Nr   )super__init__TvltPixelPatchEmbeddingspatch_embeddingsnum_patches_per_imager   	Parameterr,   zeroshidden_sizetype_embed_v
num_framestemporal_embedpos_embed_vconfigselfr~   	__class__s     r3   rs   TvltPixelEmbeddings.__init__   s     8 @%)%:%:%P%P"LLQ6;M;M)NO ll5;;q&:K:KVM_M_+`a<<At7Q7QSYSeSe(fgr2   c                    UR                   u  p4pVnU R                  U5      nXR                  R                  SUS5      -  nU[        R
                  " U R                  S S 2S U24   U R                  SS9-  nXR                  -  nX4$ Nr   r\   )	rD   ru   r}   rS   r,   repeat_interleaver|   rv   rz   )	r   rG   rd   rJ   r{   num_channelsheightwidth
embeddingss	            r3   forwardTvltPixelEmbeddings.forward   s    >J>P>P;
e**<8
&&--aQ??
e--d.A.A![j[..QSWSmSmstuu
'''
**r2   )r~   rv   ru   r}   r|   rz   N	r'   r(   r)   r*   r+   rs   r   r1   __classcell__r   s   @r3   rm   rm      s    6
	+ 	+r2   rm   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )TvltAudioEmbeddings   ro   c                   > [         TU ]  5         [        U5      U l        U R                  R                  U l        [
        R                  " [        R                  " SSUR                  5      5      U l
        UR                  UR                  S   -  U l        [
        R                  " [        R                  " SU R                  U R                  -  UR                  5      5      U l        [
        R                  " [        R                  " SU R                  UR                  5      5      U l        UR                  UR                  S   -  U l        Xl        g rq   )rr   rs   TvltAudioPatchEmbeddingsru   num_patchesr   rw   r,   rx   ry   type_embed_afrequency_lengthaudio_patch_sizenum_freq_patchespos_embed_a
freq_embedr~   r   s     r3   rs   TvltAudioEmbeddings.__init__   s     8 @00<<LLQ6;M;M)NO & 7 76;R;RST;U U<<At7G7G4K`K`7`bhbtbt(uv,,u{{1d6K6KVM_M_'`a & 7 76;R;RST;U Ur2   c                 ,   U R                  U5      nUR                  S5      U R                  -  nX0R                  R	                  SUS5      -  nU[
        R                  " U R                  S S 2S U24   U R                  SS9-  nX0R                  -  nX24$ r   )	ru   sizer   r   rS   r,   r   r   r   )r   rU   rd   r   rY   s        r3   r   TvltAudioEmbeddings.forward   s    **<8
%??1-1F1FFoo,,Q0@!DD
e--d.>.>qBSCSBS?S.TVZVkVkqrss
'''
**r2   )r~   r   r   r   ru   r   r   r   r   r   s   @r3   r   r      s    6	+ 	+r2   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )rt   i  z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l
        X@l        X`l        XPl        [        R                  " XEX3S9U l        g Nr   r   )kernel_sizestride)rr   rs   
image_sizeimage_patch_sizenum_image_channelsry   
isinstancecollectionsabcIterable
patch_sizer   rv   r   Conv2d
projection)r   r~   r   r   r   ry   rv   r   s          r3   rs   !TvltPixelPatchEmbeddings.__init__	  s    !'!2!2F4K4KJ$*$=$=v?Q?Qk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!+A*Q-!?JqMU_`aUbDb c$$(%:"&))L:ir2   rG   returnc                    UR                   u  p#pEnX@R                  :w  a  [        S5      eXPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eUR	                  X#-  XEU5      nU R                  U5      R                  S5      R                  SS5      nUR	                  X#U R                  -  U R                  5      nU$ )	NeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   zInput image size (*) doesn't match model ().rA   )
rD   r   
ValueErrorr   reshaper   flatten	transposerv   ry   )r   rG   rJ   r{   r   r   r   r   s           r3   r    TvltPixelPatchEmbeddings.forward  s    >J>P>P;
e,,,w  __Q''5OOA4F+F$VHAeW4KDOO\]L^K__`aeapapqras`ttvw  $++J,C\[`a__\2::1=GG1M
''
A[A[4[]a]m]mn
r2   )ry   r   r   rv   r   r   r'   r(   r)   r*   r+   rs   r,   Tensorr   r1   r   r   s   @r3   rt   rt     s.    j ELL U\\  r2   rt   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )r   i+  z
This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         T
U ]  5         UR                  UR                  UR                  pCnUR
                  UR                  peX#4n[        U[        R                  R                  5      (       a  UOXD4nUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4n	Xpl        X@l        XPl        Xl        Xl        [         R"                  " XVXDS9U l        g r   )rr   rs   spectrogram_lengthr   r   num_audio_channelsry   r   r   r   r   spectrogram_sizer   r   r   patch_shaper   r   r   )r   r~   r   r   r   r   ry   r   r   r   r   s             r3   rs   !TvltAudioPatchEmbeddings.__init__2  s    %%#### /9
 %+$=$=v?Q?Qk.A#-j+//:R:R#S#SZZdYq
'*jm;@PQR@SWabcWd@de'*jm;=Ma=PT^_`Ta=ab 0$(&&))L:ir2   rU   r   c                 d   UR                   u  p#pEX0R                  :w  a  [        S5      eX@R                  S   :  d  XPR                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R	                  U5      R                  S5      R                  SS5      nU$ )	Nr   r   r   zInput audio size (r   r   r   rA   )rD   r   r   r   r   r   r   )r   rU   rJ   r   r   r   r   s          r3   r    TvltAudioPatchEmbeddings.forwardG  s    2>2D2D/
&,,,w  ))!,,9N9Nq9Q0Q$VHAeW 5**1-.a0E0Ea0H/IM  __\2::1=GG1M
r2   )r   r   r   r   r   r   r   r   s   @r3   r   r   +  s.    j*ELL U\\  r2   r   c                   8   ^  \ rS rSrU 4S jrS rSS jrSrU =r$ )TvltSelfAttentioniW  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                   5      U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)rr   rs   ry   num_attention_headshasattrr   rF   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropoutr   s     r3   rs   TvltSelfAttention.__init__X  s1    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
zz&"E"EFr2   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrP   r   rA   r      )r   r   r   rT   permute)r   xnew_x_shapes      r3   transpose_for_scores&TvltSelfAttention.transpose_for_scoresj  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r2   c                    U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n[        R
                  " XR                  SS5      5      n	U	[        R                  " U R                  5      -  n	Ub  X-   n	[        R                  " SS9" U	5      n
U R                  U
5      n
Ub  X-  n
[        R
                  " X5      nUR                  SSSS5      R                  5       nUR                  5       S S U R                   4-   nUR"                  " U6 nU(       a  X4nU$ U4nU$ )NrP   r\   r   rA   r   r   )r   r   r   r   r,   matmulr   mathsqrtr   r   Softmaxr   r   
contiguousr   r   rT   )r   r$   attention_mask	head_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r3   r   TvltSelfAttention.forwardo  sS    JJ}5--dhh}.EF	//

=0IJ//0AB !<<5H5HR5PQ+dii8P8P.QQ%/@ **,-=> ,,7  -9O_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=2 O\M]r2   )r   r   r   r   r   r   r   NNF)	r'   r(   r)   r*   rs   r   r   r1   r   r   s   @r3   r   r   W  s    G$%
! !r2   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )TvltSelfOutputi  z
The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
r~   r   Nc                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	rr   rs   r   r   ry   denser   hidden_dropout_probr   r   s     r3   rs   TvltSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r2   r$   input_tensorc                 J    U R                  U5      nU R                  U5      nU$ r   r   r   r   r$   r   s      r3   r   TvltSelfOutput.forward  s$    

=1]3r2   r   )r'   r(   r)   r*   r+   r   rs   r,   r   r   r1   r   r   s   @r3   r   r     sI    
>z >d >
U\\  RWR^R^  r2   r   c                   8   ^  \ rS rSrU 4S jrS rSS jrSrU =r$ )TvltAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )rr   rs   r   	attentionr   outputsetpruned_headsr   s     r3   rs   TvltAttention.__init__  s0    *62$V,Er2   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r\   )lenr   r  r   r   r  r   r   r   r   r  r   r   union)r   headsr_   s      r3   prune_headsTvltAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r2   c                 f    U R                  XX45      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r  r  )r   r$   r   r   r   self_outputsattention_outputr   s           r3   r   TvltAttention.forward  s@    ~~mYb;;|AF#%QR(88r2   )r  r  r  r   )	r'   r(   r)   r*   rs   r  r   r1   r   r   s   @r3   r  r    s    ";$ r2   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	TvltIntermediatei  r~   r   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rr   rs   r   r   ry   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnr   s     r3   rs   TvltIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r2   r$   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r  r   r$   s     r3   r   TvltIntermediate.forward  s&    

=100?r2   r  r'   r(   r)   r*   r   rs   r,   r   r   r1   r   r   s   @r3   r  r    s6    9z 9d 9U\\ ell  r2   r  c                      ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S jrS	r	U =r
$ )

TvltOutputi  r~   r   Nc                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
rr   rs   r   r   r  ry   r   r   r   r   r   s     r3   rs   TvltOutput.__init__  sB    YYv779K9KL
zz&"<"<=r2   r$   r   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r3   r   TvltOutput.forward  s,    

=1]3%4r2   r   r  r   s   @r3   r!  r!    sD    >z >d >
U\\  RWR^R^  r2   r!  c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )	TvltLayeri  z?This corresponds to the Block class in the timm implementation.c                 j  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g Nr   eps)rr   rs   chunk_size_feed_forwardseq_len_dimr  r  r  intermediater!  r  r   	LayerNormry   layer_norm_epslayernorm_beforelayernorm_afterr   s     r3   rs   TvltLayer.__init__  s    '-'E'E$&v.,V4 ( "V-?-?VEZEZ [!||F,>,>FDYDYZr2   c                    U R                  U R                  U5      UUUS9nUS   nUSS  nXaR                  UR                  5      -   nU R	                  U5      nU R                  U5      nU R                  X5      nU4U-   nU$ )Nr   r   r   )r  r1  torC   r2  r.  r  )	r   r$   r   r   r   self_attention_outputsr  r   layer_outputs	            r3   r   TvltLayer.forward  s    !%!!-0/	 "0 "
 2!4(, )+;+;<L<S<S+TT ++M:((6 {{<?/G+r2   )r  r,  r.  r2  r1  r  r-  r   r   r   s   @r3   r'  r'    s    I[ r2   r'  c                   <   ^  \ rS rSrU 4S jr     SS jrSrU =r$ )TvltEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rr   rs   r~   r   
ModuleListrangenum_hidden_layersr'  layergradient_checkpointing)r   r~   _r   s      r3   rs   TvltEncoder.__init__  sR    ]]uVE]E]?^#_?^!If$5?^#_`
&+# $`s   A&c                 6   U(       a  SOS nU(       a  SOS n[        U R                  5       H9  u  pU(       a  Xq4-   nUb  X9   OS nU
" XX5      nUS   nU(       d  M1  XS   4-   nM;     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr&   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r&   .0vs     r3   	<genexpr>&TvltEncoder.forward.<locals>.<genexpr>0  s     m$[q$[   	)r   r$   r%   )	enumerater@  r0   r   )r   r$   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss                r3   r   TvltEncoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO(kM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm++*
 	
r2   )r~   rA  r@  )NNFFTr'   r(   r)   r*   rs   r   r1   r   r   s   @r3   r;  r;    s#    , ""
 "
r2   r;  c                   6    \ rS rSr% Sr\\S'   SrSrSr	S r
Srg	)
TvltPreTrainedModeli8  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r~   tvltrG   Tc                 
   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weights        )meanstdNg      ?)r   r   r   r   weightdatanormal_r~   initializer_ranger   zero_r/  fill_)r   modules     r3   _init_weights!TvltPreTrainedModel._init_weightsC  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r2   r&   N)r'   r(   r)   r*   r+   r   r.   base_model_prefixmain_input_namesupports_gradient_checkpointingre  r1   r&   r2   r3   rX  rX  8  s%    
 $O&*#
*r2   rX  aF  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`TvltConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a	  
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
            Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
            Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
            details.

        pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
            be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.

        pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See
            [`TvltProcessor.__call__`] for details.

        mask_pixel (`bool`, *optional*):
            Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining.

        mask_audio (`bool`, *optional*):
            Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.

        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.c                   4  ^  \ rS rSrU 4S jrS rS r\" \5      \	" \
\S9       SS\R                  S\R                  S\\R                     S	\\R                     S
\S\S\\   S\\   S\\   S\\\R                     \
4   4S jj5       5       rSrU =r$ )	TvltModeli  c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        [        R                  " [        R                  " SSUR                  5      5      U l        UR                  (       a  S U l        O.[        R"                  " UR                  UR$                  S9U l        U R'                  5         g r)  )rr   rs   r~   rm   pixel_embeddingsr   audio_embeddingsr;  encoderr   rw   r,   rx   ry   cls_embeddinguse_mean_pooling	layernormr/  r0  	post_initr   s     r3   rs   TvltModel.__init__  s      3F ; 3F ;"6*\\%++aF<N<N*OP""!DN\\&*<*<&BWBWXDN 	r2   c                 Z    U R                   R                  U R                  R                  4$ r   )rm  ru   rn  )r   s    r3   get_input_embeddingsTvltModel.get_input_embeddings  s%    $$55t7L7L7]7]]]r2   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsro  r@  r  r  )r   heads_to_pruner@  r  s       r3   _prune_headsTvltModel._prune_heads  s<    
 +002LELLu%//;;EB 3r2   output_typeconfig_classrG   rU   rH   rV   
mask_pixel
mask_audior   rM  rN  r   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R	                  X5      u  pU R                  X$5      u  pSnSnU(       a/  [        XU R                   R                  S9u  p[        U
UUUS9u  ppSnSnU(       ax  U R                   R                  U R                   R                  S   -  n[        UUU R                   R                  U R                   R                  US9u  nn[        UUUUS9u  pnnUR                  S5      n[        R                   " U R"                  R%                  USS5      X/S5      nU
R                  S5      nSnUb%  Ub"  [        R                   " USS2SS24   X4/S5      nUR                  5       nSnUb  U R'                  UU5      nU R)                  UUUUU	S9nUS   nU R*                  b  U R+                  U5      nUSS2SSU-   24   nUSS2SU-   S24   nU	(       d  UUUUUUU4USS -   $ [-        UUUUUUUUR.                  UR0                  S9	$ )	a  
Returns:

Examples:

```python
>>> from transformers import TvltProcessor, TvltModel
>>> import numpy as np
>>> import torch

>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))

>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")

>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```N)rH   rI   )rd   r   )rV   rI   rW   rX   r   )r   r   rM  rN  )	r   r   r   r    r!   r"   r#   r$   r%   )r~   r   rM  use_return_dictrm  rn  rN   pixel_mask_ratiork   r   r   rZ   audio_mask_ratioaudio_mask_typer   r,   catrp  rS   get_extended_attention_maskro  rr  r   r$   r%   )r   rG   rU   rH   rV   r  r  r   rM  rN  pixel_embedding_outputaudio_embedding_outputr    r"   pixel_mask_noisepixel_len_keepr!   r#   r   audio_mask_noiseaudio_len_keeprJ   embedding_outputmasked_pixel_lenr   input_shapeextended_attention_maskencoder_outputssequence_outputpixel_sequence_outputaudio_sequence_outputs                                  r3   r   TvltModel.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-1-B-B<-\*-1-B-B<-\* ! /H&$++JfJf0, Xf&  *	XT"0A ! #{{;;t{{?[?[\]?^^/H&%;;77++55)0,n Xf&  *	XT"0ACT "&&q)
 99&&z1a8:Pikl
 266q9!j&<"YY
1bqb5(9:'RTUVN&++-"&%&*&F&F~Wb&c#,,2/!5# ' 
 *!,>>%"nn_=O /1q;K7K3K0K L /17G3G3I0I J%%!!!!  #$ $ -$9$9////)77&11

 
	
r2   )rn  rp  r~   ro  rr  rm  )NNFFNNN)r'   r(   r)   r*   rs   rv  r{  r   TVLT_INPUTS_DOCSTRINGr   r   _CONFIG_FOR_DOCr,   r-   r   boolr   r0   r   r1   r   r   s   @r3   rk  rk    s   
$^C ++@A?Y
 3726  ,0/3&*@
''@
 ''@
 U../	@

 U../@
 @
 @
 $D>@
 'tn@
 d^@
 
uU&&'8	9@
 Z B@
r2   rk  c                   8   ^  \ rS rSrU 4S jr   SS jrSrU =r$ )TvltDecoderi/  c                   > [         TU ]  5         [        U5      nUR                  Ul        UR
                  Ul        UR                  Ul        UR                  Ul
        [        R                  " [        UR
                  5       Vs/ s H  n[        U5      PM     sn5      U l        [        R                   " UR                  UR"                  S9U l        SU l        Xl        g s  snf )Nr*  F)rr   rs   r   decoder_hidden_sizery   decoder_num_hidden_layersr?  decoder_num_attention_headsr   decoder_intermediate_sizer  r   r=  r>  r'  decoder_layersr/  r0  rr  rA  r~   )r   r~   decoder_configrB  r   s       r3   rs   TvltDecoder.__init__0  s    !&)%+%?%?"+1+K+K(-3-O-O*+1+K+K( mm05f6V6V0WX0W1Y~&0WX
 f&@&@fF[F[\&+# Ys   C#c                 >   U(       a  SOS nU(       a  SOS n[        U R                  5       H-  u  pxU(       a  XQ4-   nU" XS9n	U	S   nU(       d  M%  XiS   4-   nM/     U(       a  XQ4-   nU R                  U5      n
U(       d  [        S XU4 5       5      $ [	        XUS9$ )Nr&   r5  r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r&   rF  s     r3   rI  &TvltDecoder.forward.<locals>.<genexpr>]  s     f$Tq$TrK  )r7   r$   r%   )rL  r  rr  r0   r5   )r   r$   r   rM  rN  rO  rP  rQ  rR  rT  r7   s              r3   r   TvltDecoder.forwardA  s     #7BD$5b4()<)<=OA#$58H$H!(\M)!,M  &91=M<O&O#  >   14D D .fV@S$Tfff \oppr2   )r~   r  rA  rr  )FFTrV  r   s   @r3   r  r  /  s    (  "q qr2   r  zTThe TVLT Model transformer with the decoder on top for self-supervised pre-training.c                     ^  \ rS rSrU 4S jrS rS rS rS rS r	\
" \5      \" \\S9        SS	\R                   S
\R                   S\\R                      S\\R                      S\\R$                     S\\R                      S\\R                      S\\   S\\   S\\   S\\\R                      \4   4S jj5       5       rSrU =r$ )TvltForPreTrainingia  c                   > [         T	U ]  U5        Xl        UR                  U l        UR                  U l        U R                  (       d  U R                  (       d  [        S5      e[        U5      U l        U R                  (       a  [        U5      U l	        U R                  (       Ga*  [        R                  " UR                  UR                  SS9U l        [        R                  " [         R"                  " SSUR                  5      5      U l        [        R                  " [         R"                  " SSUR                  5      5      U l        [)        U5      U l        UR                  nUR,                  nU R                  R.                  R0                  n[        R                  " [         R"                  " SXB5      5      U l        [        R                  " [         R"                  " SUR,                  U5      5      U l        [        R                  " [         R"                  " SSU5      5      U l        U R                  R8                  R:                  nUR<                  UR>                  S   -  n[        R                  " [         R"                  " SXV-  U5      5      U l         [        R                  " [         R"                  " SXb5      5      U l!        [        R                  " [         R"                  " SSU5      5      U l"        U R                  RF                  S   S-  U R                  RH                  -  n[K        X5      U l&        U R                  R>                  S   U R                  R>                  S   -  U R                  RN                  -  n[K        X5      U l(        X0l        X@l        X`l)        URF                  U l#        UR>                  U l        U RU                  5         g )Nz;Must set at least one of matching task and MAE task to trueTr   r   r   rA   )+rr   rs   r~   task_matchingtask_maer   rk  rY  TvltMatchingHeadmatching_headr   r   ry   r  encoder_to_decoderrw   r,   rx   pixel_mask_tokenaudio_mask_tokenr  decoderr{   rm  rv   decoder_pixel_pos_embeddecoder_temporal_embeddecoder_pixel_type_embedrn  r   r   r   decoder_audio_pos_embeddecoder_freq_embeddecoder_audio_type_embedr   r   TvltMAEHeadpixel_mae_headr   audio_mae_headr   rs  )
r   r~   r  r{   rv   num_audio_patchesr   pixel_mae_output_dimaudio_mae_output_dimr   s
            r3   rs   TvltForPreTraining.__init__f  s    #11""dmmZ[[f%	!1&!9D===&(ii0B0BFD^D^ei&jD#$&LLQ6C]C]1^$_D!$&LLQ6C]C]1^$_D!&v.DL"("<"<**J$(II$>$>$T$T!+-<<AG\8r+sD(*,,,u{{1fFWFWYl7m*nD',.LLQK^9_,`D) $		 : : F F%66&:Q:QRS:TT+-<<A0DFYZ,D( ')ll5;;qBR3h&iD#,.LLQK^9_,`D)#';;#?#?#Ba#G$++JhJh#h "-f"KD,,Q/$++2N2Nq2QQTXT_T_TrTrr ! #.f"KD(O)>&$4!$*$;$;D!$*$;$;D! 	r2   c           
         UR                   u  p#pEnUR                   S   U R                  S   -  nUR                   S   U R                  S   -  nUR                  UUUUU R                  S   UU R                  S   4S9n	[        R                  " SU	5      n	U	R                  UXx-  U-  U R                  S   U R                  S   -  U-  4S9n	U	$ )z:
pixel_values: [batch_size, num_frames, 3, height, width]
r   r   r   r   rD   zntchpwq->nthwpqc)rD   r   r   r,   einsum)
r   rG   rJ   r{   r   r   r   num_patches_heightnum_patches_widthpatchified_pixel_valuess
             r3   patchify_pixel!TvltForPreTraining.patchify_pixel  s	    ?K>P>P;
e)//2d6K6KA6NN(..q1T5J5J15MM"."6"6"%%a(!%%a( #7 
#
 #(,,/ACZ"["9"A"A"6C%%a(4+@+@+CClR #B #
 '&r2   c           	      \   UR                   u  p#pEX@R                  S   -  nXPR                  S   -  nUR                  UUUU R                  S   UU R                  S   4S9n[        R                  " SU5      nUR                  UXg-  U R                  S   U R                  S   -  U-  4S9nU$ )z.
audio_values: [batch_size, 1, height, width]
r   r   r  znchpwq->nhwpqc)rD   r   r   r,   r  )	r   rU   rJ   r   r   r   r  r  patchified_audio_valuess	            r3   patchify_audio!TvltForPreTraining.patchify_audio  s     3?2D2D/
&#'<'<Q'??!%:%:1%=="."6"6"%%a(!%%a( #7 	#
 #(,,/?AX"Y"9"A"A"6%%a(4+@+@+CClR #B #
 '&r2   c                     U R                  U5      nX$-
  S-  nUR                  SS9nXS-  R                  5       UR                  5       -  nU$ NrA   rP   r\   )r  r\  sum)r   rG   pixel_predictionsmaskr  r;   s         r3   pixel_mae_loss!TvltForPreTraining.pixel_mae_loss  P    "&"5"5l"C!;AyyRy   "TXXZ/r2   c                     U R                  U5      nX$-
  S-  nUR                  SS9nXS-  R                  5       UR                  5       -  nU$ r  )r  r\  r  )r   rU   audio_predictionsr  r  r;   s         r3   audio_mae_loss!TvltForPreTraining.audio_mae_loss  r  r2   c           	          UR                   u  pEnUR                  XCR                   S   U-
  S5      n[        R                  " X'/SS9n[        R                  " USUR                  S5      R                  SSU5      S9nU$ )Nr   r\   rP   r^   )rD   rS   r,   r  ra   rR   )	r   
mask_tokenrc   rg   rJ   
seq_lengthr]   mask_tokenspadded_sequences	            r3   concatenate_mask#TvltForPreTraining.concatenate_mask  s~    &.nn#
 ''
4E4Ea4H:4UWXY))X$;C,,+*?*?*C*J*J1aQT*U
 r2   r}  rG   rU   rH   rV   labelspixel_values_mixedpixel_mask_mixedr   rM  rN  r   c                 &   U
b  U
OU R                   R                  n
SnU R                  (       a}  Uc  [        S5      eUc  [        S5      eU R	                  UUUUUU	U
S9nUS   nU R                  U5      n[        5       nU" UR                  S5      UR                  S5      5      nUU-  nSnSnU R                  (       Ga  U R                  (       Ga  U R	                  UUUUSSUU	U
S	9	nU
(       a  UR                  OUS
   nU
(       a  UR                  OUS   nU
(       a  UR                  OUS   nU
(       a  UR                  OUS   nU
(       a  UR                  OUS   nU
(       a  UR                  OUS   nU R!                  U5      nU R!                  U5      nUR#                  S
5      nU R%                  U R&                  UU5      nUU R(                  R+                  S
US
5      -   nU[,        R.                  " U R0                  SS2SU24   U R2                  S
S9-   nUU R4                  -   nU R7                  U5      nU R9                  UR:                  5      nU R%                  U R<                  UU5      nUR#                  S
5      U R>                  -  nUU R@                  R+                  S
US
5      -   nU[,        R.                  " U RB                  SS2SU24   U R>                  S
S9-   nUU RD                  -   nU R7                  U5      nU RG                  UR:                  5      nU RI                  UUU5      U RK                  UUU5      -   nUU-  nU
(       d  WUU4WSS -   nWb  U4U-   $ U$ [M        UWUUWRN                  URP                  S9$ )av  
pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
    Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Audio values can be
    obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.

pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Pixel masks of pixel_values_mixed. Pixel values mixed can be obtained using [`TvltProcessor`]. See
    [`TvltProcessor.__call__`] for details.

labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
    Labels for computing the vision audio matching loss. Indices should be in `[0, 1]`. num_labels has to be 1.

Return:

Examples:

```python
>>> from transformers import TvltProcessor, TvltForPreTraining
>>> import numpy as np
>>> import torch

>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(
...     images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt"
... )

>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```Nr[  zMatching task requires labelsz)Matching task requires pixel_values_mixedrH   rV   r   rM  rN  r   rP   T)rH   rV   r  r  r   rM  rN  r   rA   r   r         r\      )r;   r<   r=   r>   r$   r%   ))r~   r  r  r   rY  r  r   rT   r  trainingr   r   r    r!   r"   r#   r  r   r  r  r  rS   r,   r   r  rv   r  r  r  r7   r  r   r  r  r  r  r  r  r9   r$   r%   ) r   rG   rU   rH   rV   r  r  r  r   rM  rN  
total_lossr   r  r<   loss_fctr;   r=   r>   r  r  r    r!   r"   r#   pixel_decoder_inputaudio_decoder_inputr{   pixel_decoder_outputsrY   audio_decoder_outputsr  s                                    r3   r   TvltForPreTraining.forward  s   b &1%<k$++B]B]
~ !@AA!) !LMMii"+%"3%9'   G &ajO"00AO(*HO004fkk"oFD$J===T]]]ii%%"3%9'   
G HSG$C$CX_`aXb!GRG$C$CX_`aXb!=H 9 9gVWj=H 9 9gVWj=H 9 9gVWj=H 9 9gVWj"&"9"9%# #'"9"9%# &**1-J"&"7"78M8MObdu"v"58T8T8[8[\]_ikl8m"m"58O8O++A{
{N;T=W=W]^9 # #68U8U"U$(LL1D$E!../D/K/KLL"&"7"78M8MObdu"v277:d>S>SS"58O8O8V8VWXZjlm8n"n"58O8O,,Q0A1A0A-ABDDYDY_`9 # #68U8U"U$(LL1D$E!../D/K/KLL&&|\CTUX\XkXkl,=Y D $J%|\BWQR[PF/3/?ZMF*KVK'+%%!//))
 	
r2   )r  r  r   r~   r  r  r  r  r  r  r  r  r   r  r{   r   rv   r  r  r  r  rY  )NNNNNNNN)r'   r(   r)   r*   rs   r  r  r  r  r  r   r  r   r9   r  r,   r-   r   r/   r  r   r0   r   r1   r   r   s   @r3   r  r  a  sG   
4l'8'6 ++@A+CRab
 3726-1:>8<,0/3&*H
''H
 ''H
 U../	H

 U../H
 ))*H
 %U%6%67H
 #5#4#45H
 $D>H
 'tnH
 d^H
 
uU&&')AA	BH
 c BH
r2   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
TvltPooleriw  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rr   rs   r   r   ry   r   Tanh
activationr   s     r3   rs   TvltPooler.__init__x  s9    YYv1163E3EF
'')r2   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )r   r$   first_token_tensorpooled_outputs       r3   r   TvltPooler.forward}  s4    *1a40

#566r2   )r  r   rV  r   s   @r3   r  r  w  s    $
 r2   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r  i  c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  S5      U l        g rq   )rr   rs   r  poolerr   r   ry   fcr   s     r3   rs   TvltMatchingHead.__init__  s2     ())F..2r2   c                 F    U R                  U R                  U5      5      nU$ r   r  r  r  s     r3   r   TvltMatchingHead.forward  s    M :;r2   r   rV  r   s   @r3   r  r    s    3
 r2   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )r  i  c                 z   > [         TU ]  5         Xl        [        R                  " UR
                  U5      U l        g r   )rr   rs   r~   r   r   r  r  )r   r~   
output_dimr   s      r3   rs   TvltMAEHead.__init__  s+    yy!;!;ZHr2   c                 (    U R                  U5      nU$ r   )r  r  s     r3   r   TvltMAEHead.forward  s    ]3r2   )r~   r  r   rV  r   s   @r3   r  r    s    I
 r2   r  z
    Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
    for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
    c                   <  ^  \ rS rSrU 4S jr\" \5      \" \\	S9      SS\
R                  S\
R                  S\\
R                     S\\
R                     S\\   S	\\   S
\\   S\\
R                     S\\\
R                     \4   4S jj5       5       rSrU =r$ ) TvltForAudioVisualClassificationi  c           	        > [         TU ]  U5        [        U5      U l        [        R
                  " [        R                  " UR                  UR                  S-  5      [        R                  " UR                  S-  UR                  S9[        R                  " 5       [        R                  " UR                  S-  UR                  5      5      U l        Xl        U R                  5         g )NrA   r*  )rr   rs   rk  rY  r   
Sequentialr   ry   r/  r0  GELU
num_labels
classifierr~   rs  r   s     r3   rs   )TvltForAudioVisualClassification.__init__  s     f%	 --IIf((&*<*<q*@ALL++a/V5J5JKGGIIIf((1,f.?.?@	
  	r2   r}  rG   rU   rH   rV   r   rM  rN  r  r   c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   SS2S4   n
U R                  U
5      nSnUbY  U R                   R                  S:X  a  [        5       nU" X5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )a1  
labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
    Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
    refers to the number of classes in audiovisual tasks.

Return:

Examples:
```python
>>> from transformers import TvltProcessor, TvltForAudioVisualClassification
>>> import numpy as np
>>> import torch

>>> num_frames = 8
>>> images = list(np.random.randn(num_frames, 3, 224, 224))
>>> audio = list(np.random.randn(10000))
>>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
>>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
>>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")

>>> outputs = model(**input_dict)
>>> loss = outputs.loss
```Nr  r   
regressionclassificationr   )r;   r7   r$   r%   )
r~   r  rY  r  	loss_typer
   r	   r   r$   r%   )r   rG   rU   rH   rV   r   rM  rN  r  r   r  r7   r;   r  r  s                  r3   r   (TvltForAudioVisualClassification.forward  s	   H &1%<k$++B]B]))!!/!5#  
 "!*QT*1{{$$4"9/&&*::+-/Y,F)-)9TGf$EvE'!//))	
 	
r2   )r  r~   rY  )NNNNNN)r'   r(   r)   r*   rs   r   r  r   r   r  r,   r-   r   r  r/   r   r0   r   r1   r   r   s   @r3   r	  r	    s    " ++@A+CRab
 3726,0/3&*-1B
''B
 ''B
 U../	B

 U../B
 $D>B
 'tnB
 d^B
 ))*B
 
uU&&')AA	BB
 c BB
r2   r	  )rk  r  r	  rX  )N      ?)Nr  rQ      r   )Gr+   collections.abcr   r   copyr   dataclassesr   typingr   r   r,   r   torch.nnr   r	   r
   activationsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   r   r   configuration_tvltr   
get_loggerr'   loggerr  _CHECKPOINT_FOR_DOCr   r5   r9   rN   rZ   rk   Modulerm   r   rt   r   r   r   r  r  r!  r'  r;  rX  TVLT_START_DOCSTRINGr  rk  r  r  r  r  r  r	  __all__r&   r2   r3   <module>r)     sc       ! "   A A " : J . R  + 
		H	%,  %?k %? %?P ? ? ?, ?{ ? ?B$F:+")) +6+")) +:&ryy &R)ryy )X9		 9xRYY $BII Dryy   #* #L)
")) )
X*/ *0	 * Z d`
# `
	`
F/q")) /qd ZO
, O
	O
d
 
ryy "))   V
': V
V
r ir2   