
    cCio                     r   S r SSKJrJr  SSKrSSKJr  SSKJr  SSKJ	r	  SSK
JrJrJr  SS	KJrJr  SS
KJr  SSKJrJr  SSKJrJrJrJr  SSKJrJr  SSKJr  \R@                  " \!5      r" " S S\RF                  5      r$ " S S\RF                  5      r% S5S\RF                  S\RL                  S\RL                  S\RL                  S\\RL                     S\'S\'4S jjr( " S S\RF                  5      r) " S S\RF                  5      r* " S  S!\RF                  5      r+ " S" S#\RF                  5      r, " S$ S%\RF                  5      r- " S& S'\	5      r. " S( S)\RF                  5      r/ " S* S+\RF                  5      r0\ " S, S-\5      5       r1\ " S. S/\15      5       r2\" S0S19 " S2 S3\15      5       r3/ S4Qr4g)6zPyTorch ViViT model.    )CallableOptionalN)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack) find_pruneable_heads_and_indicesprune_linear_layer)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplecheck_model_inputs   )VivitConfigc                   v   ^  \ rS rSrSrS\4U 4S jjrS
S\R                  S\	S\R                  4S jjr
S	rU =r$ )VivitTubeletEmbeddings$   az  
Construct Vivit Tubelet embeddings.

This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of
shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder.

The seq_len (the number of patches) equals (number of frames // tubelet_size[0]) * (height // tubelet_size[1]) *
(width // tubelet_size[2]).
configc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  U R
                  S   -  U R                  U R
                  S   -  -  U R                  U R
                  S   -  -  U l        UR                  U l        [        R                  " UR                  UR                  UR                  UR                  S9U l        g )N   r   r   )kernel_sizestride)super__init__
num_frames
image_sizetubelet_size
patch_sizenum_patcheshidden_size	embed_dimr   Conv3dnum_channels
projectionselfr   	__class__s     b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/vivit/modeling_vivit.pyr"   VivitTubeletEmbeddings.__init__/   s     ++ ++ --__ 22$//!"446$//!"446 	
  ++))!3!3ATAT]c]p]p
    pixel_valuesinterpolate_pos_encodingreturnc                 b   UR                   u  p4pVnU(       dP  X`R                  :w  d  XpR                  :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eUR                  SSSSS	5      nU R	                  U5      nUR                  S5      R                  SS5      nU$ )
NzImage image size (*z) doesn't match model (r   r   z).r   r      )shaper$   
ValueErrorpermuter,   flatten	transpose)	r.   r3   r4   
batch_sizer#   r+   heightwidthxs	            r0   forwardVivitTubeletEmbeddings.forward?   s    >J>P>P;
e'V-F%SbSbJb$VHAeW4KDOO\]L^K__`aeapapqras`ttvw 
 $++Aq!Q:OOL) IIaL""1a(r2   )r)   r$   r#   r'   r&   r,   F)__name__
__module____qualname____firstlineno____doc__r   r"   torchTensorboolrB   __static_attributes____classcell__r/   s   @r0   r   r   $   s>    
{ 
 ELL D ]b]i]i  r2   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	S\R                  4S	 jr
SS
\R                  S\S\R                  4S jjrSrU =r$ )VivitEmbeddingsP   z|
Vivit Embeddings.

Creates embeddings from a video using VivitTubeletEmbeddings, adds CLS token and positional embeddings.
r   c                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        [        U5      U l	        [        R                  " [        R
                  " SU R                  R                  S-   UR                  5      5      U l        [        R                  " UR                  5      U l        UR                  SS  U l        Xl        g )Nr   )r!   r"   r   	ParameterrJ   zerosr(   	cls_tokenr   patch_embeddingsr'   position_embeddingsDropouthidden_dropout_probdropoutr%   r&   r   r-   s     r0   r"   VivitEmbeddings.__init__W   s    ekk!Q8J8J&KL 6v >#%<<KK400<<q@&BTBTU$
  zz&"<"<= --ab1r2   
embeddingsr?   r@   r5   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr   g      ?r   r   bicubicF)sizemodealign_cornersdim)r9   rX   rJ   jit
is_tracingr&   r   reshaper;   r   
functionalinterpolateviewcat)r.   r]   r?   r@   r'   num_positionsclass_pos_embedpatch_pos_embedre   
new_height	new_widthsqrt_num_positionss               r0   r4   (VivitEmbeddings.interpolate_pos_encodinge   s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr2   r3   r4   c                 "   UR                   u  p4pVnU R                  XS9nU R                  R                  USS/5      n	[        R
                  " X4SS9nU(       a  XR                  XU5      -   nOXR                  -   nU R                  U5      nU$ )Nr4   r   rd   )	r9   rW   rV   tilerJ   rl   r4   rX   r[   )
r.   r3   r4   r>   r#   r+   r?   r@   r]   
cls_tokenss
             r0   rB   VivitEmbeddings.forward   s    >J>P>P;
e**<*k
^^((*a);<
YY
7Q?
 $#&C&CJX]&^^J#&>&>>J\\*-
r2   )rV   r   r[   rW   r&   rX   rD   )rE   rF   rG   rH   rI   r   r"   rJ   rK   intr4   rL   rB   rM   rN   rO   s   @r0   rQ   rQ   P   sq    { &D5<< &D &DUX &D]b]i]i &DPELL D ]b]i]i  r2   rQ   modulequerykeyvalueattention_maskscalingr[   c                    [         R                  " XR                  SS5      5      U-  n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9nUb  X-  n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr_   )re   dtype)ptrainingr   r   )rJ   matmulr=   r   ri   softmaxfloat32tor   r[   r   
contiguous)
rz   r{   r|   r}   r~   r   r[   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <<}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#4,,|3K''1-88:K$$r2   c            	          ^  \ rS rSrS\4U 4S jjr S	S\R                  S\\R                     S\	\R                  \R                  4   4S jjr
SrU =r$ )
VivitSelfAttention   r   c                 0  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        UR                  U l        U R                  S-  U l        SU l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r!   r"   r(   num_attention_headshasattrr:   r   ry   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr   Linearqkv_biasr{   r|   r}   r-   s     r0   r"   VivitSelfAttention.__init__   sG    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r2   hidden_states	head_maskr5   c                    UR                   S   nUSU R                  U R                  4nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      nU R                  U5      R                  " U6 R                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pU	R!                  5       S S U R"                  4-   nU	R%                  U5      n	X4$ )	Nr   r_   r   r   eager        )r   r   r[   r   )r9   r   r   r|   rk   r=   r}   r{   r   r   _attn_implementationr   r   r   r   r   ra   r   rh   )r.   r   r   r>   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes               r0   rB   VivitSelfAttention.forward   sH    #((+
D$<$<d>V>VV	HH]+00)<FFq!L	jj/44i@JJ1aPjj/44i@JJ1aP(?;;++w6"9$++:Z:Z"[)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EF--r2   )
r   r   r   r   r   r|   r   r{   r   r}   N)rE   rF   rG   rH   r   r"   rJ   rK   r   tuplerB   rM   rN   rO   s   @r0   r   r      sY    ]{ ]* PT."\\.6>u||6L.	u||U\\)	*. .r2   r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
VivitSelfOutput   z
The residual connection is defined in VivitLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
r   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	r!   r"   r   r   r(   denserY   rZ   r[   r-   s     r0   r"   VivitSelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r2   r   input_tensorr5   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r[   r.   r   r   s      r0   rB   VivitSelfOutput.forward   s$    

=1]3r2   r   )rE   rF   rG   rH   rI   r   r"   rJ   rK   rB   rM   rN   rO   s   @r0   r   r      sB    
>{ >
U\\  RWR^R^  r2   r   c                      ^  \ rS rSrS\4U 4S jjrS\\   4S jrSS\	R                  S\\	R                     S\	R                  4S	 jjrS
rU =r$ )VivitAttentioni  r   c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )r!   r"   r   	attentionr   outputsetpruned_headsr-   s     r0   r"   VivitAttention.__init__  s0    +F3%f-Er2   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   rd   )lenr   r   r   r   r   r   r{   r|   r}   r   r   r   union)r.   r   indexs      r0   prune_headsVivitAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r2   r   r   r5   c                 N    U R                  X5      u  p4U R                  X15      nU$ r   )r   r   )r.   r   r   self_attn_output_r   s         r0   rB   VivitAttention.forward  s(    "nn]F-=r2   )r   r   r   r   )rE   rF   rG   rH   r   r"   r   ry   r   rJ   rK   r   rB   rM   rN   rO   s   @r0   r   r     sR    "{ ";S ;$U\\ hu||>T `e`l`l  r2   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VivitIntermediatei%  r   c                 ^  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r!   r"   r   r   r(   intermediate_sizer   rY   rZ   r[   
isinstance
hidden_actstrr   intermediate_act_fnr-   s     r0   r"   VivitIntermediate.__init__&  sv    YYv1163K3KL
zz&"<"<=f''--'-f.?.?'@D$'-'8'8D$r2   r   r5   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r[   )r.   r   s     r0   rB   VivitIntermediate.forward/  s4    

=100?]3r2   )r   r[   r   rE   rF   rG   rH   r   r"   rJ   rK   rB   rM   rN   rO   s   @r0   r   r   %  s/    9{ 9U\\ ell  r2   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	VivitOutputi7  r   c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
r!   r"   r   r   r   r(   r   rY   rZ   r[   r-   s     r0   r"   VivitOutput.__init__8  sB    YYv779K9KL
zz&"<"<=r2   r   r   r5   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r0   rB   VivitOutput.forward=  s,    

=1]3%4r2   r   r   rO   s   @r0   r   r   7  s=    >{ >
U\\  RWR^R^  r2   r   c                      ^  \ rS rSrSrS\4U 4S jjrS
S\R                  S\	\R                     S\R                  4S jjr
S	rU =r$ )
VivitLayeriD  zNThis corresponds to the EncoderBlock class in the scenic/vivit implementation.r   c                 j  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  S9U l        g )Nr   eps)r!   r"   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr(   layer_norm_epslayernorm_beforelayernorm_afterr-   s     r0   r"   VivitLayer.__init__G  s    '-'E'E$'/-f5!&) "V-?-?VEZEZ [!||F,>,>FDYDYZr2   r   r   r5   c                     U R                  U5      nU R                  X25      nXA-   nU R                  U5      nU R                  U5      nU R	                  XQ5      nU$ r   )r   r   r   r   r   )r.   r   r   hidden_states_normattention_outputlayer_outputs         r0   rB   VivitLayer.forwardQ  se    !22=A>>*<H )8 ++M:((6 {{<?r2   )r   r   r   r   r   r   r   r   )rE   rF   rG   rH   rI   r   r"   rJ   rK   r   rB   rM   rN   rO   s   @r0   r   r   D  sG    X[{ [U\\ hu||>T `e`l`l  r2   r   c                   x   ^  \ rS rSrS\4U 4S jjrS	S\R                  S\\R                     S\	4S jjr
SrU =r$ )
VivitEncoderib  r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r!   r"   r   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r.   r   r   r/   s      r0   r"   VivitEncoder.__init__c  sR    ]]fF^F^@_#`@_1Jv$6@_#`a
&+# $as   A&r   r   r5   c                 r    [        U R                  5       H  u  p4Ub  X#   OS nU" X5      nM     [        US9$ )N)last_hidden_state)	enumerater   r	   )r.   r   r   ilayer_modulelayer_head_masks         r0   rB   VivitEncoder.forwardi  s<    (4OA.7.CilO(HM  5 ??r2   )r   r   r   r   )rE   rF   rG   rH   r   r"   rJ   rK   r   r	   rB   rM   rN   rO   s   @r0   r   r   b  sA    ,{ ,@U\\ @hu||>T @`o @ @r2   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )VivitPooleriq  r   c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r!   r"   r   r   r(   r   Tanh
activationr-   s     r0   r"   VivitPooler.__init__r  s9    YYv1163E3EF
'')r2   r   r5   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r  )r.   r   first_token_tensorpooled_outputs       r0   rB   VivitPooler.forwardw  s6     +1a40

#566r2   )r  r   r   rO   s   @r0   r  r  q  s/    ${ $
U\\ ell  r2   r  c                   P    \ rS rSr% \\S'   SrSrSr/ r	Sr
SrSrSr\\S.rS rSrg	)
VivitPreTrainedModeli  r   vivitr3   T)r   
attentionsc                    [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       ax  UR                  R
                  R                  SU R                  R                  S9  UR                  b2  UR                  R
                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        g[        U[        5      (       aI  UR                   R
                  R                  5         UR"                  R
                  R                  5         gg)zInitialize the weightsr   )meanstdNg      ?)r   r   r   r*   weightdatanormal_r   initializer_ranger   zero_	Embeddingpadding_idxr   fill_rQ   rV   rX   )r.   rz   s     r0   _init_weights"VivitPreTrainedModel._init_weights  sU   fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)00!!'')&&++113 1r2    N)rE   rF   rG   rH   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr  rM   r  r2   r0   r  r    sI    $O&*#N"&#(
4r2   r  c                      ^  \ rS rSrSS\S\4U 4S jjjrS rS r\	" SS9\
   SS	\\R                     S
\\R                     S\S\\   S\4
S jj5       5       rSrU =r$ )
VivitModeli  r   add_pooling_layerc                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r   N)r!   r"   r   rQ   r]   r   encoderr   r   r(   r   	layernormr  pooler	post_init)r.   r   r*  r/   s      r0   r"   VivitModel.__init__  si    
 	 )&1#F+f&8&8f>S>ST->k&)D 	r2   c                 .    U R                   R                  $ r   )r]   rW   )r.   s    r0   get_input_embeddingsVivitModel.get_input_embeddings  s    ///r2   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)zy
Prunes heads of the model.

Args:
    heads_to_prune:
        dict of {layer_num: list of heads to prune in this layer}
N)itemsr,  r   r   r   )r.   heads_to_pruner   r   s       r0   _prune_headsVivitModel._prune_heads  s<     +002LELLu%//;;EB 3r2   F)tie_last_hidden_statesr3   r   r4   r   r5   c                 0   Uc  [        S5      eU R                  X R                  R                  5      nU R	                  XS9nU R                  XRS9nUR                  nU R                  U5      nU R                  b  U R                  U5      OSn[        XxS9$ )a(
  
Examples:

```python
>>> import av
>>> import numpy as np

>>> from transformers import VivitImageProcessor, VivitModel
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> # prepare video for the model
>>> inputs = image_processor(list(video), return_tensors="pt")

>>> # forward pass
>>> outputs = model(**inputs)
>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 3137, 768]
```Nz You have to specify pixel_valuesru   )r   )r   pooler_output)
r:   get_head_maskr   r   r]   r,  r   r-  r.  r
   )	r.   r3   r   r4   r   embedding_outputencoder_outputssequence_outputr
  s	            r0   rB   VivitModel.forward  s    h ?@@&&y++2O2OP	??<?k+/<<8H<+^);;..98<8OO4UY)Oiir2   )r   r]   r,  r-  r.  )T)NNF)rE   rF   rG   rH   r   rL   r"   r2  r7  r   r   r   rJ   FloatTensorr   r   r
   rB   rM   rN   rO   s   @r0   r)  r)    s    { t  "0	C u5 5915).	]ju001]j E--.]j #'	]j
 +,]j 
$]j  6]jr2   r)  a  
        ViViT Transformer model with a video classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for Kinetics-400.

        <Tip>

            Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
            setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
            position embeddings to the higher resolution.

        </Tip>
    )custom_introc                      ^  \ rS rSrS\4U 4S jjr\\    SS\\	R                     S\\	R                     S\\	R                     S\S\\   S	\4S
 jj5       5       rSrU =r$ )VivitForVideoClassificationi'  r   c                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NF)r*  r   )r!   r"   
num_labelsr)  r  r   r   r(   Identity
classifierr/  r-   s     r0   r"   $VivitForVideoClassification.__init__6  ss      ++%@
 OUN_N_bcNc"))F$6$68I8IJikititiv 	r2   r3   r   labelsr4   r   r5   c                     U R                   " U4X$S.UD6nUR                  nU R                  USS2SSS24   5      nSn	Ub  U R                  " X8U R                  40 UD6n	[        U	UUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import av
>>> import numpy as np
>>> import torch

>>> from transformers import VivitImageProcessor, VivitForVideoClassification
>>> from huggingface_hub import hf_hub_download

>>> np.random.seed(0)


>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


>>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
...     '''
...     Sample a given number of frame indices from the video.
...     Args:
...         clip_len (`int`): Total number of frames to sample.
...         frame_sample_rate (`int`): Sample every n-th frame.
...         seg_len (`int`): Maximum allowed index of sample's last frame.
...     Returns:
...         indices (`list[int]`): List of sampled frame indices
...     '''
...     converted_len = int(clip_len * frame_sample_rate)
...     end_idx = np.random.randint(converted_len, seg_len)
...     start_idx = end_idx - converted_len
...     indices = np.linspace(start_idx, end_idx, num=clip_len)
...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
...     return indices


>>> # video clip consists of 300 frames (10 seconds at 30 FPS)
>>> file_path = hf_hub_download(
...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample 32 frames
>>> indices = sample_frame_indices(clip_len=32, frame_sample_rate=4, seg_len=container.streams.video[0].frames)
>>> video = read_video_pyav(container=container, indices=indices)

>>> image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
>>> model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

>>> inputs = image_processor(list(video), return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     logits = outputs.logits

>>> # model predicts one of the 400 Kinetics-400 classes
>>> predicted_label = logits.argmax(-1).item()
>>> print(model.config.id2label[predicted_label])
LABEL_116
```)r   r4   Nr   )losslogitsr   r  )r  r   rH  loss_functionr   r   r   r  )
r.   r3   r   rJ  r4   r   outputsr?  rM  rL  s
             r0   rB   #VivitForVideoClassification.forwardB  s    z $(::$
$-$
dj$
 "33Aq!9:%%fdkkLVLD$!//))	
 	
r2   )rH  rF  r  )NNNF)rE   rF   rG   rH   r   r"   r   r   r   rJ   rA  
LongTensorrL   r   r   r   rB   rM   rN   rO   s   @r0   rD  rD  '  s    
{ 
  5915-1).j
u001j
 E--.j
 ))*	j

 #'j
 +,j
 
j
  j
r2   rD  )r)  r  rD  )r   )5rI   typingr   r   rJ   r   activationsr   modeling_layersr   modeling_outputsr	   r
   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   utilsr   r   r   r   utils.genericr   r   configuration_vivitr   
get_loggerrE   loggerModuler   rQ   rK   floatr   r   r   r   r   r   r   r   r  r  r)  rD  __all__r  r2   r0   <module>ra     s    %   ! 9 b b F & Q K K A , 
		H	%)RYY )XLbii Ln %II%<<% 
% <<	%
 U\\*% % %>1. 1.jbii $RYY >		 $
")) 
+ <@299 @"))   4?  4  4F j% j jD y
"6 y
y
x Pr2   