
    cCiF                       S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK	J
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJrJr  SSKJr  SSKJrJrJ r   SSK!J"r"J#r#J$r$J%r%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0J1r1J2r2  \&Rf                  " \45      r5\\$" SS9 " S S\"5      5       5       r6 " S S\
Rn                  5      r8 SHS\
Rn                  S\	Rr                  S\	Rr                  S\	Rr                  S\\	Rr                     S \:S!\:4S" jjr; " S# S$\
Rn                  5      r< " S% S&\
Rn                  5      r= " S' S(\5      r>\$ " S) S*\5      5       r? " S+ S,\
Rn                  5      r@ " S- S.\?5      rA " S/ S0\
Rn                  5      rB " S1 S2\
Rn                  5      rC " S3 S4\
Rn                  5      rD " S5 S6\
Rn                  5      rE " S7 S8\
Rn                  5      rF " S9 S:\5      rG " S; S<\
Rn                  5      rH " S= S>\
Rn                  5      rI " S? S@\?5      rJ\$" SAS9 " SB SC\?5      5       rK\$" SDS9 " SE SF\?\5      5       rL/ SGQrMg)IzPyTorch InstructBLIP model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)OutputRecordercheck_model_inputs   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipConfigInstructBlipQFormerConfigInstructBlipVisionConfigzQ
    Class defining the outputs of [`InstructBlipForConditionalGeneration`].
    """
)
class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    """

    loss: Optional[tuple[torch.FloatTensor]] = None
    logits: Optional[tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class InstructBlipVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # Always interpolate when tracing so the exported model works for dynamic input shapes.
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding
        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings
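

# A worked example for the resolution interpolation above (illustrative only, assuming a
# hypothetical image_size=224 / patch_size=14 pre-training setup; not executed on import):
#
#     embeddings = InstructBlipVisionEmbeddings(config)        # (224 // 14) ** 2 + 1 = 257 positions
#     pixels = torch.randn(1, 3, 448, 448)
#     out = embeddings(pixels, interpolate_pos_encoding=True)  # (1, 32 * 32 + 1, hidden) = (1, 1025, hidden)
#
# The pre-trained 16x16 positional grid is resized bicubically to 32x32 (1024 patches + CLS),
# so larger inputs do not raise a size mismatch.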


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InstructBlipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.is_causal = False
        self.dropout = config.attention_dropout

        # small tweak here compared to CLIP, no bias here
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
        else:
            q_bias = None
            v_bias = None

        if q_bias is not None:
            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            self.qkv.bias = nn.Parameter(qkv_bias)

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = self.qkv(hidden_states)
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        output = self.projection(attn_output)

        return output, attn_weights


class InstructBlipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InstructBlipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = InstructBlipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = InstructBlipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor]:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            **kwargs,
        )
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        return hidden_states


@auto_docstring
class InstructBlipPreTrainedModel(PreTrainedModel):
    config: InstructBlipConfig
    base_model_prefix = "blip"
    supports_gradient_checkpointing = True

    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    _can_compile_fullgraph = True
    _no_split_modules = [
        "InstructBlipQFormerEmbeddings",
        "InstructBlipAttention",
        "InstructBlipQFormerMultiHeadAttention",
        "InstructBlipQFormerSelfOutput",
    ]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=factor)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, InstructBlipVisionEmbeddings):
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
        elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
            module.query_tokens.data.zero_()


class InstructBlipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipEncoderLayer`].

    Args:
        config (`InstructBlipConfig`):
            The corresponding vision configuration for the `InstructBlipEncoder`.
    """

    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([InstructBlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                **kwargs,
            )

        return BaseModelOutput(last_hidden_state=hidden_states)


class InstructBlipVisionModel(InstructBlipPreTrainedModel):
    main_input_name = "pixel_values"
    config: InstructBlipVisionConfig

    _can_record_outputs = {
        "hidden_states": InstructBlipEncoderLayer,
        "attentions": InstructBlipAttention,
    }

    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = InstructBlipVisionEmbeddings(config)
        self.encoder = InstructBlipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @check_model_inputs(tie_last_hidden_states=False)
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )

    def get_input_embeddings(self):
        return self.embeddings


class InstructBlipQFormerMultiHeadAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        # If this is instantiated as a cross-attention module, the keys and values come
        # from the encoder; the attention mask needs to be such that the encoder's
        # padding tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        attention_scores_dtype = attention_scores.dtype

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model forward).
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might seem a bit
        # unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to.
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer, attention_probs


class InstructBlipQFormerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipQFormerAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = InstructBlipQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = InstructBlipQFormerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers.
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads.
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        attention_output, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            **kwargs,
        )
        attention_output = self.output(attention_output, hidden_states)
        return attention_output


class InstructBlipQFormerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class InstructBlipQFormerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipQFormerLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InstructBlipQFormerAttention(config)

        self.layer_idx = layer_idx

        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = InstructBlipQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        self.intermediate = InstructBlipQFormerIntermediate(config)
        self.output = InstructBlipQFormerOutput(config)

        self.intermediate_query = InstructBlipQFormerIntermediate(config)
        self.output_query = InstructBlipQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        attention_output = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            **kwargs,
        )

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                query_attention_output = self.crossattention(
                    query_attention_output,
                    attention_mask=attention_mask,
                    head_mask=head_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    **kwargs,
                )

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                ).to(layer_output.device)
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )

        return layer_output

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output


class InstructBlipQFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [InstructBlipQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            layer_head_mask = head_mask[i] if head_mask is not None else None

            hidden_states = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                query_length=query_length,
                **kwargs,
            )

        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)


class InstructBlipQFormerEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
                embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = embeddings.to(self.layernorm.weight.dtype)
        embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    """

    _supports_attention_backend = False
    _supports_flash_attn = False
    _supports_sdpa = False
    _supports_flex_attn = False

    _can_record_outputs = {
        "attentions": [OutputRecorder(InstructBlipQFormerMultiHeadAttention, index=1, layer_name=".attention")],
        "cross_attentions": [
            OutputRecorder(InstructBlipQFormerMultiHeadAttention, index=1, layer_name=".crossattention")
        ],
    }

    def __init__(self, config: InstructBlipQFormerConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InstructBlipQFormerEmbeddings(config)

        self.encoder = InstructBlipQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length] ourselves,
        # in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]; the model is
            # an encoder, so make the mask broadcastable to
            # [batch_size, num_heads, seq_length, seq_length].
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # The mask is 1.0 for positions to attend and 0.0 for masked positions, so this
        # creates a tensor that is 0.0 where we attend and -10000.0 where we do not;
        # adding it to the raw scores before the softmax effectively removes those positions.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        query_embeds: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        """
        if input_ids is None and query_embeds is None:
            raise ValueError("You have to specify query_embeds when input_ids is None")

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a mask is provided for the cross-attention inputs, invert and broadcast it so
        # that padded encoder positions receive large negative scores.
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            query_length=query_length,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    InstructBLIP base Model consisting of language model, qformer and vision encoder.
    """
)
class InstructBlipModel(InstructBlipPreTrainedModel):
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens", "qformer"]

    def __init__(self, config: InstructBlipConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVisionModel(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipQFormerModel(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        self.language_model = AutoModel.from_config(config.text_config)

        if self.language_model._no_split_modules is not None:
            self._no_split_modules.extend(self.language_model._no_split_modules)

        if self.language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(self.language_model._keep_in_fp32_modules)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def _tie_weights(self):
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # Warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be
            provided to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
            also be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        """
        # step 1: forward the images through the vision encoder, to get image embeddings
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **kwargs,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)

        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                **kwargs,
            )

        return InstructBlipForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


@auto_docstring(
    custom_intro="""
    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """
)
class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
    config: InstructBlipConfig
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _keep_in_fp32_modules = ["query_tokens", "qformer"]

    def __init__(self, config: InstructBlipConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVisionModel._from_config(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipQFormerModel._from_config(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)

        if language_model._no_split_modules is not None:
            self._no_split_modules.extend(language_model._no_split_modules)

        if language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)

        self.language_model = language_model

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # Warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models.",
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        """
Encodes images into continuous embeddings that can be forwarded to the language model.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
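
        Example (an illustrative sketch, not a pinned test; the exact output shape depends on the checkpoint, and
        the values shown assume the Vicuna-7B variant with 32 query tokens and a 4096-dim language model):

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        >>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> inputs = processor(images=image, text="What is unusual about this image?", return_tensors="pt")

        >>> # one projected embedding per learned query token, in the language model's hidden size
        >>> language_model_inputs = model.get_image_features(
        ...     pixel_values=inputs.pixel_values,
        ...     qformer_input_ids=inputs.qformer_input_ids,
        ...     qformer_attention_mask=inputs.qformer_attention_mask,
        ... )
        >>> list(language_model_inputs.shape)
        [1, 32, 4096]
        ```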
T)r   r   rs  r   Nrj   r`  r$   ro   )r  r   r  ri  rj  rs  )r  rH   r   rl   rc  ra  r  r   rq   rN  ry   r  r  )r9   r   rG  rH  r   rs  r.   rP  rQ  r  rR  rS  rT  rU  s                 r:   get_image_features7InstructBlipForConditionalGeneration.get_image_features  sT   " **%%= + 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"!&,@+Y_`!a'1%".#7 % 
 %Q'+A\->->q-A+A1(DE !% 8 8 F(-GG$$rA   r  r   c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ r<  r=  rB  s       r:   rD  9InstructBlipForConditionalGeneration.get_placeholder_mask  rF  rA   r   rI  rJ  labelsr   c           	      ~   U R                  UUUU
SS9u  pnUc  U R                  5       " U5      nUc  [        R                  " U5      nUR	                  UR
                  UR                  5      nU R                  XHS9nUR                  X5      nU R                  R                  (       aS  U R                  " SUUS.UD6nUS   nSnU	b3  U R                  " SUXR                  R                  R                  S.UD6nO5SUS'   U R                  " SUUUUU	S	.UD6nUR                  nUR                   n[#        UUUUUS
9$ )a  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
    1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size]`

Examples:

```python
>>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
>>> import torch
>>> from PIL import Image
>>> import requests

>>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model.to(device)  # doctest: +IGNORE_RESULT

>>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> prompt = "What is unusual about this image?"
>>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     min_length=1,
...     top_p=0.9,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
...     temperature=1,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
        ```"""
        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected image features into the image placeholder positions
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                **kwargs,
            )
            logits = outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            kwargs["use_cache"] = False
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                labels=labels,
                **kwargs,
            )
            loss = outputs.loss
            logits = outputs.logits

        return InstructBlipForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_image_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if input_ids is None:
            # default prompt: one placeholder per query token, followed by the BOS token
            image_tokens = [self.config.image_token_id] * self.config.num_query_tokens
            start_tokens = image_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected image features into the image placeholder positions
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs
 #+4./% d^/%b"e.>.> "uO`O` " 
 >B15598<=A59-1).}
''}
 !,,}
 !))9)9 :	}

 E--.}
 !!1!12}
 $E$4$45}
 !))9)9 :}
   1 12}
 ))*}
 #'}
 +,}
 
uEE	F}
  }
~ ]]_ 9==A045959).C''C $E$4$45C !))9)9 :	C
 E,,-C !!1!12C   1 12C #'C 
		C CrA   r  )r  r   r  r  r$  )r   )NrG   re  dataclassesr   typingr   r   r   r   rH   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipr%   r&   r'   
get_loggerrC   r4  r*   r  rM   r   floatr   r   r   r   r   r  r$  r   r   r  r  r  r  r  r   r  r  r  __all__rB   rA   r:   <module>r     sn   "  ! 1 1   ! ) B 9  G & l l j j ? I I o o 
		H	% 
k 
 
<G299 Gd %II%<<% 
% <<	%
 U\\*% % %0I)BII I)Zbii  9 D #-/ #- #-N@")) @F19 1hw.BII w.vBII + 299 + ^bii  		 U9 Ur$
 $
N0BII 0fi
: i
X 
e
3 e

e
P S+F SSl
rA   