
    cCiy                       S SK r S SKJr  S SKJrJrJrJr  S SKrS SKJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJrJr  SSKJr  SSKJrJrJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+J,r,J-r-  SSK.J/r/J0r0J1r1  \%Rd                  " \35      r4 " S S\	Rj                  5      r6\# " S S\5      5       r7 SGS\	Rj                  S\Rp                  S\Rp                  S\Rp                  S\\Rp                     S\9S\94S jjr: " S  S!\	Rj                  5      r; " S" S#\	Rj                  5      r< " S$ S%\5      r= " S& S'\	Rj                  5      r> " S( S)\75      r? " S* S+\	Rj                  5      r@ " S, S-\	Rj                  5      rA " S. S/\	Rj                  5      rB " S0 S1\	Rj                  5      rC " S2 S3\	Rj                  5      rD " S4 S5\5      rE " S6 S7\	Rj                  5      rF " S8 S9\	Rj                  5      rG " S: S;\75      rH\\#" S<S=9 " S> S?\!5      5       5       rI\#" S@S=9 " SA SB\75      5       rJ\#" SCS=9 " SD SE\7\5      5       rK/ SFQrLg)H    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)OutputRecordercheck_model_inputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )!InstructBlipVideoVisionEmbeddings7   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr$   r	   )in_channelsout_channelskernel_sizestrider    )super__init__r+   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr+   	__class__s     z/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr2   *InstructBlipVideoVisionEmbeddings.__init__8   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"b    
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r$   Ng      ?r   r	   r    bicubicF)sizemodealign_cornersdim)shaper?   r8   jit
is_tracingr6   r   reshapepermuter   
functionalinterpolateviewcat)rA   rF   rG   rH   r=   r>   class_pos_embedpatch_pos_embedrQ   
new_height	new_widthsqrt_num_positionss               rC   interpolate_pos_encoding:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingJ   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCrE   pixel_valuesr`   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )Ndtyper    r$   rK   rP   )rR   r<   weightre   toflatten	transposer:   expandr8   rZ   r`   r?   rM   )rA   rb   r`   
batch_size_rG   rH   target_dtypepatch_embedsclass_embedsrF   r?   s               rC   forward)InstructBlipVideoVisionEmbeddings.forwardr   s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
rE   )	r:   r+   r4   r5   r=   r>   r<   r6   r?   F)__name__
__module____qualname____firstlineno__r'   r2   r8   Tensorintr`   FloatTensorboolrp   __static_attributes____classcell__rB   s   @rC   r)   r)   7   sr    c< c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  rE   r)   c                   J    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ SQrS rSrg)	 InstructBlipVideoPreTrainedModel   r+   blipT)"InstructBlipVideoQFormerEmbeddingsInstructBlipVideoAttention*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       a%  UR                  R                  R                  SUS9  g[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       aS  [        R                  R!                  UR"                  SUS9  [        R                  R!                  UR$                  SUS9  g[        U[&        [(        45      (       a%  UR*                  R                  R                  5         gg)zInitialize the weights        )meanstdN      ?)r+   initializer_range
isinstancer   Linearr;   rf   datanormal_biaszero_	Embedding	LayerNormfill_r)   inittrunc_normal_r?   r:   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelquery_tokens)rA   modulefactors      rC   _init_weights.InstructBlipVideoPreTrainedModel._init_weights   sX   ..fryy"))455MM&&CV&<{{&  &&( '--MM&&CV&<--KK""$MM$$S) ABBGG!!&";";#6!RGG!!&"8"8s!O!JLb cdd$$**, erE    N)rs   rt   ru   rv   r%   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr   r{   r   rE   rC   r   r      s>    ##&*#"&N!-rE   r   r   querykeyvalueattention_maskscalingdropoutc                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrK   rP   )ptrainingr$   r    )	r8   matmulri   r   rW   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             rC   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rE   c                      ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	 SS\R                  S	\
\R                     S
\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )r      z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr	   )r   )requires_grad)r1   r2   r+   r3   r4   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   r   qkvqkv_biasr7   r8   zerosrZ   
zeros_liker   
projection)rA   r+   q_biasv_biasr   rB   s        rC   r2   #InstructBlipVideoAttention.__init__   ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCrE   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr$   r    )rY   r   r   ri   r   )rA   r   r   r   s       rC   _shape!InstructBlipVideoAttention._shape   s5    {{3GQQRSUVWbbddrE   hidden_states	head_maskrI   c                 4   UR                  5       u  pEnU R                  U5      nUR                  XESU R                  X`R                  -  5      R	                  SSSSS5      nUS   US   US   pn[
        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
4SU R                  (       d  SOU R                  U R                  S	.UD6u  pUR                  XES
5      R                  5       nU R                  U5      nX4$ )z#Input shape: Batch x Time x Channelr	   r    r   r$      eagerNr   )r   r   r   rK   )rM   r   rU   r   rV   r   r+   _attn_implementationr   r   r   r   r   r   )rA   r   r   r   r   tgt_lenr4   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                 rC   rp   "InstructBlipVideoAttention.forward   s    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?;;++w6"9$++:Z:Z"[$7		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook2((rE   )	r   r+   r4   r   r   r   r   r   r   N)rs   rt   ru   rv   __doc__r2   r8   rw   rx   r   r   tuplerp   r{   r|   r}   s   @rC   r   r      s    GD>eU\\ eC ec e -1$)||$) ELL)$)
 
u||Xell3XeELL>Q5RR	S$) $)rE   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoMLPi  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r1   r2   r+   r
   
hidden_actactivation_fnr   r   r3   intermediate_sizefc1fc2r@   s     rC   r2   InstructBlipVideoMLP.__init__  sb    #F$5$5699V//1I1IJ99V55v7I7IJrE   r   rI   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rA   r   s     rC   rp   InstructBlipVideoMLP.forward  s4    /**=9/rE   )r   r+   r   r   
rs   rt   ru   rv   r2   r8   rw   rp   r{   r|   r}   s   @rC   r   r     s)    KU\\ ell  rE   r   c            	          ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  S\	\
   S\R                  4S j5       rS	rU =r$ )
InstructBlipVideoEncoderLayeri  r+   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)r1   r2   r3   r4   r   	self_attnr   r   layer_norm_epslayer_norm1r   mlplayer_norm2r@   s     rC   r2   &InstructBlipVideoEncoderLayer.__init__  sm    ++3F;<<F<Q<QR'/<<F<Q<QRrE   r   r   r   rI   c                     UnU R                  U5      nU R                  " SUUS.UD6u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU$ )N)r   r   r   )r   r   r   r   )rA   r   r   r   residualrl   s         rC   rp   %InstructBlipVideoEncoderLayer.forward$  sz     !((7>> 
'$
 

 &0 ((7/%0rE   )r4   r   r   r   r   )rs   rt   ru   rv   r%   r2   r   r8   rw   r   r   ry   rp   r{   r|   r}   s   @rC   r   r     s`    S6 S ||  +,	
 
		 rE   r   c            
          ^  \ rS rSrSrS\4U 4S jjr\ S
S\\	R                     S\\   S\\\4   4S jj5       rS	rU =r$ )InstructBlipVideoEncoderi=  a
  
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`InstructBlipVideoEncoderLayer`].

Args:
    config (`InstructBlipVideoConfig`):
        The corresponding vision configuration for the `InstructBlipVideoEncoder`.
r+   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
r1   r2   r+   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rA   r+   rl   rB   s      rC   r2   !InstructBlipVideoEncoder.__init__G  sU    mmTYZ`ZrZrTs$tTsq%B6%JTs$tu&+# %u   A&r   r   rI   c                 T    UnU R                    H  nU" U4SU0UD6nM     [        US9$ )Nr   last_hidden_state)r  r   )rA   inputs_embedsr   r   r   encoder_layers         rC   rp    InstructBlipVideoEncoder.forwardM  sC     &![[M)- M ) ??rE   )r+   r  r  r   )rs   rt   ru   rv   r   r%   r2   r   r   r8   rw   r   r   r   r   r   rp   r{   r|   r}   s   @rC   r   r   =  sj    ,6 ,  26@ !.@ +,	@
 
uo%	&@ @rE   r   c                      ^  \ rS rSr% Sr\\S'   \\S.r	S\4U 4S jjr
\" SS9\  SS\\R                     S\S	\\   S
\\\4   4S jj5       5       rS rSrU =r$ )InstructBlipVideoVisionModeli_  rb   r+   )r   
attentionsc                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )r1   r2   r+   r3   r)   rF   r   encoderr   r   r   post_layernorm	post_init)rA   r+   r4   rB   s      rC   r2   %InstructBlipVideoVisionModel.__init__g  sY     &&	;FC/7 ll9:O:OPrE   F)tie_last_hidden_statesr`   r   rI   c                     Uc  [        S5      eU R                  XS9nU R                  " SSU0UD6nUR                  nU R	                  U5      nUS S 2SS S 24   nU R	                  U5      n[        UUS9$ )Nz You have to specify pixel_values)r`   r  r   r  pooler_outputr   )r   rF   r  r  r  r   )rA   rb   r`   r   r   encoder_outputsr  pooled_outputs           rC   rp   $InstructBlipVideoVisionModel.forwardr  s     ?@@h+/<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rE   c                     U R                   $ r   )rF   rA   s    rC   get_input_embeddings1InstructBlipVideoVisionModel.get_input_embeddings  s    rE   )r+   rF   r  r  r  )rs   rt   ru   rv   main_input_namer'   r   r   r   _can_record_outputsr2   r   r   r   r8   ry   rz   r   r   r   r   r   rp   r   r{   r|   r}   s   @rC   r  r  _  s    $O))60
	< 	 u5 59).
u001
 #'
 +,	

 
u00	1
  6
6 rE   r  c                   j   ^  \ rS rSrSU 4S jjrS rS rS rS rS r	    SS\
\   4S	 jjrS
rU =r$ )r   i  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        ['        USS5      U l        U R(                  S:X  d  U R(                  S:X  aG  UR*                  U l        [        R,                  " SUR*                  -  S	-
  U R                  5      U l        S
U l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr    r$   F)r1   r2   r+   r3   r   hasattrr   rx   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   getattrr'  max_position_embeddingsr   distance_embeddingsave_attentionrA   r+   is_cross_attentionrB   s      rC   r2   3InstructBlipVideoQFormerMultiHeadAttention.__init__  s    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD##rE   c                     Xl         g r   attn_gradients)rA   r:  s     rC   save_attn_gradients>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s    ,rE   c                     U R                   $ r   r9  r  s    rC   get_attn_gradients=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    """rE   c                     Xl         g r   attention_map)rA   rB  s     rC   save_attention_map=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s    *rE   c                     U R                   $ r   rA  r  s    rC   get_attention_map<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!!rE   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrK   r   r    r$   r	   )rM   r   r,  rY   rV   )rA   xnew_x_shapes      rC   transpose_for_scores?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rE   r   c                    US LnU(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      n	UnO@U R                  U R                  U5      5      nU R                  U R                  U5      5      n	U R                  U5      n
U R                  U
5      n[        R
                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  GaC  UR                  5       S   n[        R                  " U[        R                  UR                  S9R                  SS5      n[        R                  " U[        R                  UR                  S9R                  SS5      nX-
  nU R                  UU R                  -   S-
  5      nUR                  UR                   S9nU R                  S:X  a  [        R"                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R"                  " SUU5      n[        R"                  " S	UU5      nUU-   U-   nU[$        R&                  " U R(                  5      -  nUR                   nUb  X-   n[*        R,                  " SS
9" U5      R                  U5      nU(       a=  U R.                  (       a,  U R1                  U5        UR3                  U R4                  5        U R7                  U5      nUb  UU-  n[        R
                  " UU	5      nUR9                  SSSS5      R;                  5       nUR                  5       S S U R<                  4-   nUR                  " U6 nUU4$ )NrK   r   r)  r*  r$   re   devicerd   zbhld,lrd->bhlrzbhrd,lrd->bhlrrP   r   r    r	   )rK  r   r   r   r8   r   ri   r'  rM   arangelongrO  rY   r3  r2  rg   re   einsummathsqrtr,  r   Softmaxr4  rC  register_hookr;  r   rV   r   r-  )rA   r   r   r   encoder_hidden_statesencoder_attention_maskr   r6  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                             rC   rp   2InstructBlipVideoQFormerMultiHeadAttention.forward  s    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ!1!7!7%/@ **,-=>AABXY$"5"5##O4))$*B*BC #',,"?  &=	&I#%<kJ%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDo--rE   )r-  r,  rB  r:  r+   r3  r   r   r2  r   r'  r   r4  r   rr   NNNN)rs   rt   ru   rv   r2   r;  r>  rC  rF  rK  r   r   rp   r{   r|   r}   s   @rC   r   r     sL    $8-#+"% "#I. +,I. I.rE   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )r   i  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r   )r1   r2   r   r   r3   denser   r   r/  hidden_dropout_probr   r@   s     rC   r2   +InstructBlipVideoQFormerSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rE   r   input_tensorrI   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   ro  r   r   rA   r   rr  s      rC   rp   *InstructBlipVideoQFormerSelfOutput.forward  5    

=1]3}'CDrE   r   ro  r   r   r}   s   @rC   r   r     6    >U\\  RWR^R^  rE   r   c                      ^  \ rS rSrSU 4S jjrS r    SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\
\   S
\R                  4S jjrSrU =r$ )!InstructBlipVideoQFormerAttentioni  c                    > [         TU ]  5         [        X5      U l        [	        U5      U l        [        5       U l        g r   )r1   r2   r   	attentionr   outputsetpruned_headsr5  s      rC   r2   *InstructBlipVideoQFormerAttention.__init__  s0    CF_8@ErE   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r$   rP   )lenr   r}  r   r,  r  r   r   r   r   r~  ro  r-  union)rA   headsindexs      rC   prune_heads-InstructBlipVideoQFormerAttention.prune_heads"  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:rE   r   r   r   rW  rX  r   rI   c           	      \    U R                   " SUUUUUS.UD6u  pxU R                  Xq5      n	U	$ )N)r   r   r   rW  rX  r   )r}  r~  )
rA   r   r   r   rW  rX  r   r   rl   attention_outputs
             rC   rp   )InstructBlipVideoQFormerAttention.forward4  sI      
')"7#9
 
  ;;{BrE   )r}  r~  r  rr   rl  )rs   rt   ru   rv   r2   r  r8   rw   r   ry   r   r   rp   r{   r|   r}   s   @rC   r{  r{    s    ";* 7;15=A>B ||  !!2!23  E--.	 
  ((9(9:  !)):): ;  +,  
   rE   r{  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )$InstructBlipVideoQFormerIntermediateiI  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r1   r2   r   r   r3   r   ro  r   r   strr
   intermediate_act_fnr@   s     rC   r2   -InstructBlipVideoQFormerIntermediate.__init__J  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rE   r   rI   c                 J    U R                  U5      nU R                  U5      nU$ r   ro  r  r   s     rC   rp   ,InstructBlipVideoQFormerIntermediate.forwardR  s&    

=100?rE   r  r   r}   s   @rC   r  r  I  s(    9U\\ ell  rE   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoQFormerOutputiX  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r1   r2   r   r   r   r3   ro  r   r   r/  rp  r   r@   s     rC   r2   'InstructBlipVideoQFormerOutput.__init__Y  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rE   r   rr  rI   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rt  ru  s      rC   rp   &InstructBlipVideoQFormerOutput.forward_  rw  rE   rx  r   r}   s   @rC   r  r  X  ry  rE   r  c                   V   ^  \ rS rSrU 4S jr     SS\\   4S jjrS rS r	Sr
U =r$ )	InstructBlipVideoQFormerLayerif  c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr$   r   T)r6  F)r1   r2   chunk_size_feed_forwardseq_len_dimr{  r}  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionr  intermediater  r~  intermediate_queryoutput_queryrA   r+   r  rB   s      rC   r2   &InstructBlipVideoQFormerLayer.__init__g  s    '-'E'E$:6B"7771<"CF_c"dD'+D$',D$@H4V<"Fv"N:6BrE   r   c           
      p   U R                   " U4UUS.UD6nUS:  a  US S 2S U2S S 24   n	U R                  (       a&  Uc  [        S5      eU R                  " U	4UUUUS.UD6n	[	        U R
                  U R                  U R                  U	5      n
UR                  S   U:  ag  [	        U R                  U R                  U R                  US S 2US 2S S 24   5      R                  U
R                  5      n[        R                  " X/SS9n
U
$ [	        U R                  U R                  U R                  U5      n
U
$ )N)r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r   rW  rX  r$   rP   )r}  r  r   r  r   feed_forward_chunk_queryr  r  rR   feed_forward_chunkrg   rO  r8   rZ   )rA   r   r   r   rW  rX  query_lengthr   r  query_attention_outputlayer_outputlayer_output_texts               rC   rp   %InstructBlipVideoQFormerLayer.forward{  so     >>
)
 	
 !%5a,6I%J"''(0$%eff)-)<)<**#1'*?+A* *& 5--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,)JPQR  5'',,   	L rE   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r~  rA   r  intermediate_outputr  s       rC   r  0InstructBlipVideoQFormerLayer.feed_forward_chunk  s)    "//0@A{{#6IrE   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  r  s       rC   r  6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s+    "556FG(()<OrE   )
r}  r  r  r  r  r  r  r~  r  r  NNNNr   )rs   rt   ru   rv   r2   r   r   rp   r  r  r{   r|   r}   s   @rC   r  r  f  s?    C. "#6 +,6p
 rE   r  c                   T   ^  \ rS rSrU 4S jr\     SS\\   4S jj5       rSr	U =r
$ )InstructBlipVideoQFormerEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf r  )
r1   r2   r+   r   r  r  r  r  layerr  r  s      rC   r2   (InstructBlipVideoQFormerEncoder.__init__  sY    ]]OTU[UmUmOnoOn)*6=Ono

 ',# pr  r   c                     [        U R                  R                  5       H*  nU R                  U   n	Ub  X8   OS n
U	" UUU
U4UUS.UD6nM,     [	        US9$ )N)rX  r  r
  )r  r+   r  r  r   )rA   r   r   r   rW  rX  r  r   ilayer_modulelayer_head_masks              rC   rp   'InstructBlipVideoQFormerEncoder.forward  sw     t{{445A::a=L.7.CilO(%	
 (>) M	 6 9+
 	
rE   )r+   r  r  r  )rs   rt   ru   rv   r2   r   r   r   rp   r{   r|   r}   s   @rC   r  r    s=    ,  "#
 +,
 
rE   r  c                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )r   i  z;Construct the embeddings from word and position embeddings.c                 F  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R!                  S["        R$                  " UR                  5      R'                  S5      SS9  [)        USS5      U l        Xl        g )	N)padding_idxr   position_ids)r$   rK   F)
persistentr'  r(  )r1   r2   r   r   
vocab_sizer3   pad_token_idword_embeddingsr2  position_embeddingsr   r   	layernormr/  rp  r   register_bufferr8   rP  rj   r1  r'  r+   r@   s     rC   r2   +InstructBlipVideoQFormerEmbeddings.__init__  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$rE   c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbi  U R                  U5      nU R                  S:X  a.  U R                  UR                  UR                  5      5      nXg-   nUb  [        R                  " X64SS9nOUnUR                  U R                  R                  R                  5      nU R                  U5      nU R                  U5      nU$ )Nr$   r   r(  rP   )rM   r  cloner  r'  r  rg   rO  r8   rZ   r  rf   re   r   )rA   	input_idsr  query_embedspast_key_values_lengthr^  rF   r  s           rC   rp   *InstructBlipVideoQFormerEmbeddings.forward  s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J++z9&*&>&>|zO`O`?a&b#'=
'"YY'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
rE   )r+   r   r  r'  r  r  )NNNr   )	rs   rt   ru   rv   r   r2   rp   r{   r|   r}   s   @rC   r   r     s#    E$   rE   r   c                     ^  \ rS rSrSrSrSrSrSr\	\
" \SSS9/\
" \SSS9/S.rS	\4U 4S
 jjrS rS rS r SS\R&                  S\\   S\R,                  S\S\R&                  4
S jjr\" 5       \      SS\R6                  S\\R:                     S\\R6                     S\\R&                     S\\R:                     S\\R:                     S\\R:                     S\\   S\ \\R:                     \!4   4S jj5       5       r"Sr#U =r$$ )InstructBlipVideoQFormerModeli  z
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
instruction as input.
Fr$   z
.attention)r  
layer_namez.crossattention)r   r  cross_attentionsr+   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r1   r2   r+   r   rF   r  r  r  r@   s     rC   r2   &InstructBlipVideoQFormerModel.__init__-  s7     <VD6v>rE   c                 .    U R                   R                  $ r   rF   r  r  s    rC   r   2InstructBlipVideoQFormerModel.get_input_embeddings7  s    ...rE   c                 $    XR                   l        g r   r  rA   r   s     rC   set_input_embeddings2InstructBlipVideoQFormerModel.set_input_embeddings:  s    */'rE   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  r}  r  )rA   heads_to_pruner  r  s       rC   _prune_heads*InstructBlipVideoQFormerModel._prune_heads=  s<    
 +002LELLu%//;;EB 3rE   r   input_shaperO  	has_queryrI   c                    UR                  5       S:X  a  USS2SSS2SS24   nO>UR                  5       S:X  a  USS2SSSS24   nO[        SU SUR                   S35      eUR                  U R                  S9nSU-
  S	-  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.
    device: (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
r	   Nr    z!Wrong shape for input_ids (shape z) or attention_mask (shape )rd   r   g     )rQ   r   rR   rg   re   )rA   r   r  rO  r  extended_attention_masks         rC   get_extended_attention_mask9InstructBlipVideoQFormerModel.get_extended_attention_maskE  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&rE   r  r  r  r   rW  rX  r   c           	      N   Uc  Uc  [        S5      eUb  UR                  S   OSn	U R                  UUUS9n
U
R                  5       SS nUu  pU
R                  nUc  [
        R                  " X4US9nU R                  X+U5      nUb  [        U[        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[        U[        5      (       a"  U Vs/ s H  nU R                  U5      PM     nnO>Uc'  [
        R                  " UUS9nU R                  U5      nOU R                  U5      nOSnU R                  XPR                  R                  5      nU R                  " U
4UUUUU	S.UD6nUR                  nUSS2SSS24   n[!        UUS	9$ s  snf )
a  
query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
    Hidden states to be used in the attention computation. If cross-attention,
    will be used for the query (i.e., key and value will use the encoder_hidden_states).
Nz7You have to specify query_embeds when input_ids is Noner$   r   )r  r  r  rK   )rO  )r   r   rW  rX  r  r  )r   rR   rF   rM   rO  r8   onesr  r   listinvert_attention_maskget_head_maskr+   r  r  r  r   )rA   r  r   r  r  r   rW  rX  r   r  embedding_outputr  rk   r^  rO  r  encoder_batch_sizeencoder_sequence_lengthrl   encoder_hidden_shapemaskencoder_extended_attention_maskr  sequence_outputr  s                            rC   rp   %InstructBlipVideoQFormerModel.forwardp  s   $ !5VWW0<0H|))!,a??%% + 
 '++-cr2!,
!((!"ZZ*)A6RN #'"B"B>`f"g !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y++2O2OP	+/<<,
2"7#B%,
 ,
 *;;'1a0;-'
 	
9 3xs   &F")r+   rF   r  rr   )NNNNNN)%rs   rt   ru   rv   r   r   r   r   r   r  r   r   r#  r&   r2   r   r  r  r8   rw   r   rx   rO  rz   r  r   r   
LongTensorr   ry   r   r   r   r   rp   r{   r|   r}   s   @rC   r  r    s   
 #( N 7EQ[gh
 EQ[lm
= /0C  )')' 3Z)' 	)'
 )' 
)'V  7;37/315=A>BO
##O
 !!2!23O
 u//0	O

 u||,O
 E--.O
  ((9(9:O
 !)):): ;O
 +,O
 
uU&&')UU	VO
  O
rE   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                      \ rS rSr% SrSr\\\R                        \
S'   Sr\\\R                        \
S'   Sr\\R                     \
S'   Sr\\\R                        \
S'   Sr\\\R                        \
S'   S	\\   4S
 jrSrg)4InstructBlipVideoForConditionalGenerationModelOutputi  a~  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
    Outputs of the language model.
Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrI   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)r   r  r  N)r1  to_tuple).0krA   s     rC   	<genexpr>PInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>  sC      
 ! WW Gq!**,- !s   25)r   keysr  s   `rC   r  =InstructBlipVideoForConditionalGenerationModelOutput.to_tuple  s%     
 YY[	
 
 	
rE   r   )rs   rt   ru   rv   r   r  r   r   r8   ry   r   r  r   r  r  r   r  r{   r   rE   rC   r  r    s     04D(5**+
,315FHU5,,-.526NHU../6:>OXeE$5$567>AEHU5+<+<%=>E
%* 
rE   r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c            #         ^  \ rS rSrSrS/rS\4U 4S jjrS rS r	S r
S	 rS
\R                  S\R                  4S jr\\           SS\R                  S\R                  S\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R&                     S\\   S\\   S\\   S\S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )r   i  rb   r   r+   c                 4  > [         TU ]  U5        [        UR                  5      U l        [
        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        UR                  5      U l        [
        R                  " UR                  R                  UR                   R                  5      U l        [$        R&                  " UR                   5      U l        U R(                  R*                  b/  U R*                  R-                  U R(                  R*                  5        U R(                  R.                  b/  U R.                  R-                  U R(                  R.                  5        U R1                  5         g Nr$   )r1   r2   r  vision_configvision_modelr   r7   r8   r   num_query_tokensqformer_configr3   r   r  qformerr   text_configlanguage_projectionr!   from_configlanguage_modelr   extend_keep_in_fp32_modulesr  r@   s     rC   r2   InstructBlipVideoModel.__init__  s    89M9MNLLQ8O8OQWQfQfQrQr)st4V5J5JK#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG00<""))$*=*=*O*OP44@&&--d.A.A.W.WX 	rE   c                 6    U R                   R                  5       $ r   r  r   r  s    rC   r   +InstructBlipVideoModel.get_input_embeddings      ""7799rE   c                 :    U R                   R                  U5        g r   r  r  r  s     rC   r  +InstructBlipVideoModel.set_input_embeddings      007rE   c                     U R                   R                  (       d_  U R                  R                  U R                  R                  l        U R                  R                  U R                  R                  l        g g r   r+   use_decoder_only_language_modelr  sharedr  embed_tokensdecoderr  s    rC   _tie_weights#InstructBlipVideoModel._tie_weights	  T    {{::7;7J7J7Q7QD''47;7J7J7Q7QD''4 ;rE   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggz
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
r$   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_mapr  r8   cudadevice_countloggerwarningr+  r  r/  io_same_devicerA   r1  s     rC   _preprocess_accelerate-InstructBlipVideoModel._preprocess_accelerate  |    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
33:>D((7 4rE   r  r  c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ zJ
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
rN  rK   )r   r8   r   r+   image_token_idrQ  rO  all	unsqueeze	expand_asrg   rA   r  r  special_image_masks       rC   get_placeholder_mask+InstructBlipVideoModel.get_placeholder_mask"       !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H/99"=GGVYYZgZnZno!!rE   qformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictr`   	use_cacher   rI   c                    Ub  UOU R                   R                  nUR                  u  nnnnnUR                  UU-  UUU5      nU R	                  UU	U
UUS9nUS   n[
        R                  " UR                  5       SS [
        R                  UR                  S9nU R                  R                  UR                  S   SS5      n[
        R                  " UR                  5       SS [
        R                  UR                  S9nUc  [
        R                  " U5      nUR                  USS9nUR                  USS9n[
        R                  " UU/SS9nU R                  UUUUUU	U
US9nUS   SS2SUR                  S5      2SS24   nU R!                  U5      nUR                  XR                   R"                  U-  S5      nUcR  U R$                  R'                  5       " U5      nX@R                   R(                  :H  nUc  [
        R                  " U5      nOiXR'                  5       " [
        R*                  " U R                   R(                  [
        R                  UR                  S95      :H  nUR-                  S5      nUR/                  S5      R1                  U5      R3                  UR                  5      nUR3                  UR                  UR4                  5      nUR7                  UU5      nU R                   R8                  (       a  U R$                  " SUUU	U
UUS	.UD6nOU R$                  " SUUUUU	U
UUS
.UD6n[;        UUUS9$ )aU  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
N)rb   rJ  rK  rL  r`   r   rK   rN  rP   r$   )r  r   r  rW  rX  rJ  rK  rL  r  r   rJ  rK  rL  rM  )r  r   rH  rI  rJ  rK  rL  rM  r  r   )r+   use_return_dictrR   rU   r  r8   r  rM   rQ  rO  r   rj   	ones_likerepeat_interleaverZ   r  r  r  r  r   video_token_idr   r>  r?  r@  rg   re   masked_scatterr&  r  )rA   rb   rF  rG  r  r   rH  rI  r  rJ  rK  rL  r`   rM  r   rk   frameschannelrG   rH   r   image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsrB  outputss                                 rC   rp   InstructBlipVideoModel.forward1  sw   R &1%<k$++B]B] 6B5G5G2
FGVU#++J,?&RWX**%/!5#%= + 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7/!5# % 	
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j++JfJfioJoqs t  //DDFyQM!*kk.H.H!H%!&!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)) +-"3%9'# G )) 
+-"3'="3%9'#
 
G D))#*
 	
rE   r  r  r  r   r  )NNNNNNNNNFN)rs   rt   ru   rv   r"  r  r%   r2   r   r  r*  r8  r8   r  ry   rC  r   r   r   rw   rz   r   r   r   r   r  rp   r{   r|   r}   s   @rC   r   r     s    %O+,6 &:8R
?("e.>.> "uO`O` " 
 >B15598<=A04,0/3&*).$(
''
 !,,
 !))9)9 :	

 E--.
 !!1!12
 $E$4$45
 !))9)9 :
  -
 $D>
 'tn
 d^
 #'
 D>
 -.
  
uJJ	K!
  
rE   r   a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c            %       H  ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S rS	 rS
\R                  4S jrS rS rS rS r   S$S\R*                  S\R,                  S\\R,                     S\\   S\\   4
S jjrS\R,                  S\R*                  4S jr\\            S%S\R*                  S\R*                  S\\R,                     S\\R*                     S\\R,                     S\\R,                     S\\R,                     S\\R*                     S\\   S\\   S\\R,                     S\\   S\S\\   S\\   S
\\ \!4   4 S  jj5       5       r"\RF                  " 5             S&S\R*                  S\\R,                     S\\R,                     S\\R,                     S\\R,                     S\\R*                     S\S
\R,                  4S! jj5       r$   S$S\R*                  S\R,                  S\\R,                     S\\   S\\   4
S" jjr%S#r&U =r'$ )'r   i  r+   rb   Tr   c                 r  > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        R                  UR                  5      U l        [        R                   " UR                  R                  UR"                  R                  5      U l        UR&                  (       a!  [(        R*                  " UR"                  5      nO [,        R*                  " UR"                  5      nUR.                  b%  U R.                  R1                  UR.                  5        UR2                  b%  U R2                  R1                  UR2                  5        X l        U R7                  5         g r  )r1   r2   r  _from_configr  r  r   r7   r8   r   r  r  r3   r   r  r  r   r  r  r&  r"   r  r#   r   r  r  r  r  )rA   r+   r  rB   s      rC   r2   2InstructBlipVideoForConditionalGeneration.__init__  s1    8EEfFZFZ[LLQ8O8OQWQfQfQrQr)st4AA&BWBWX#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN++7"")).*J*JK//;&&--n.R.RS, 	rE   c                 6    U R                   R                  5       $ r   r  r  s    rC   r   >InstructBlipVideoForConditionalGeneration.get_input_embeddings  r  rE   c                 :    U R                   R                  U5        g r   r!  r  s     rC   r  >InstructBlipVideoForConditionalGeneration.set_input_embeddings  r#  rE   c                 :    U R                   R                  U5        g r   )r  set_output_embeddings)rA   new_embeddingss     rC   ri  ?InstructBlipVideoForConditionalGeneration.set_output_embeddings  s    11.ArE   rI   c                 6    U R                   R                  5       $ r   )r  get_output_embeddingsr  s    rC   rm  ?InstructBlipVideoForConditionalGeneration.get_output_embeddings  s    ""88::rE   c                 6    U R                   R                  5       $ r   )r  get_encoderr  s    rC   rp  5InstructBlipVideoForConditionalGeneration.get_encoder      ""..00rE   c                 6    U R                   R                  5       $ r   )r  get_decoderr  s    rC   rt  5InstructBlipVideoForConditionalGeneration.get_decoder  rr  rE   c                     U R                   R                  (       d_  U R                  R                  U R                  R                  l        U R                  R                  U R                  R                  l        g g r   r%  r  s    rC   r*  6InstructBlipVideoForConditionalGeneration._tie_weights  r,  rE   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggr.  r0  r7  s     rC   r8  @InstructBlipVideoForConditionalGeneration._preprocess_accelerate  r:  rE   rF  rG  r`   rL  c                     g)
Encodes images into continuous embeddings that can be forwarded to the language model.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
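
        Note: this method is a no-op stub for InstructBlipVideo; video inputs are encoded by
        `get_video_features` below, which `forward` and `generate` call directly.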
        """

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, expanded to the
        embedding dimension so it can be used with `masked_scatter`.
        """
        if input_ids is None:
            special_video_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_video_mask = special_video_mask.all(-1)
        else:
            special_video_mask = input_ids == self.config.video_token_id

        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_video_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)
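>>> # clip is a NumPy array of shape (num_frames, height, width, 3), here 4 RGB frames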

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        video_embeds, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # splice the projected video features into the text embeddings at the placeholder positions
        video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        special_video_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_embeds)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                use_cache=use_cache,
                labels=labels,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        video_embeds, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if input_ids is None and inputs_embeds is None:
            # each of the 4 sampled frames contributes `num_query_tokens` placeholder tokens
            video_tokens = [self.config.video_token_id] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        special_video_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_embeds)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)
        return outputs

    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        """
        Encodes videos into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)`):
                The tensors corresponding to the input videos.
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        # step 1: forward the video frames through the vision encoder
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the frame embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # the Q-Former runs once per frame, so repeat the text prompt accordingly
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the query outputs into the language model's embedding space
        language_model_inputs = self.language_projection(query_output)

        # unbatch the frames: each video gets `num_query_tokens * frames` feature tokens
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs
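
    # Rough shape walkthrough for `get_video_features` and the placeholder splice in `forward`
    # (an illustrative sketch; it assumes 4 frames at 224x224, patch size 14 and 32 query tokens,
    # which are typical but checkpoint-dependent values):
    #   pixel_values          (B, 4, 3, 224, 224) -> reshaped to (B*4, 3, 224, 224)
    #   image_embeds          (B*4, 257, vision_hidden)   # 16x16 patches + 1 CLS token per frame
    #   query_output          (B*4, 32, qformer_hidden)   # only the query slots, prompt tokens dropped
    #   language_model_inputs (B, 4*32, text_hidden)      # folded back to one sequence per video
    # `forward` then scatters these 4*32 feature rows into the embeddings of the 4*32 video
    # placeholder tokens in the prompt via `masked_scatter`.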
?0 >B38&+'' !++ !))9)9 :	
 #+4. d^""e.>.> "uO`O` " 
 >B15598<=A59,0/3-1&*).$(N
''N
 !,,N
 !))9)9 :	N

 E--.N
 !!1!12N
 $E$4$45N
 !))9)9 :N
   1 12N
 $D>N
 'tnN
 ))*N
 d^N
 #'N
 D>N
  +,!N
" 
uJJ	K#N
  N
` ]]_ 9==A045959).C''C $E$4$45C !))9)9 :	C
 E,,-C !!1!12C   1 12C #'C 
		C CR >B38&+9%''9% !++9% !))9)9 :	9%
 #+4.9% d^9% 9%rE   r   )r  r   r  r   r   )r   )MrS  dataclassesr   typingr   r   r   r   r8   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipvideor%   r&   r'   
get_loggerrs   r4  r  r)   r   rw   floatr   r   r   r   r   r  r   r   r{  r  r  r  r  r   r  r  r   r   __all__r   rE   rC   <module>r     sp  ,  ! 1 1   ! ) B 9  G & l l j j ? I I  
		H	%G		 GT #- #- #-\ %II%<<% 
% <<	%
 U\\*% % %.I) I)X299 $> D@ryy @D1#C 1hw. w.t + 		 + \299 RYY U$> Up$
bii $
N0 0fi
$D i
X 

; 
 
: 
F
= F

F
R }%0PRa }%}%@rE   
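
# A minimal, self-contained sketch of the placeholder-splice mechanics used by `forward` and
# `generate` above. It runs on random tensors only (no checkpoint is downloaded), and the sizes
# below (2 frames, 3 query tokens, hidden size 4, video_token_id=99) are invented for
# illustration; the real values come from the model config.
if __name__ == "__main__":
    num_frames, num_query_tokens, hidden = 2, 3, 4
    video_token_id, bos_token_id = 99, 1

    # prompt layout mirrors `generate`: one placeholder per video feature token, then BOS
    demo_input_ids = torch.tensor([[video_token_id] * num_frames * num_query_tokens + [bos_token_id]])
    demo_inputs_embeds = torch.randn(1, demo_input_ids.shape[1], hidden)
    demo_video_embeds = torch.randn(1, num_frames * num_query_tokens, hidden)

    # mark the placeholder positions and broadcast the mask over the hidden dimension
    mask = (demo_input_ids == video_token_id).unsqueeze(-1).expand_as(demo_inputs_embeds)

    # copy the video features into the placeholder slots, in order, leaving other rows intact
    spliced = demo_inputs_embeds.masked_scatter(mask, demo_video_embeds)

    assert torch.equal(spliced[0, :6], demo_video_embeds[0])      # placeholders replaced
    assert torch.equal(spliced[0, 6], demo_inputs_embeds[0, 6])   # BOS embedding untouched
    print("placeholder splice OK:", tuple(spliced.shape))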