
    +hj                        S SK JrJrJr  S SKrS SKrS SKJr  S SKJr  S SK	J
r  S SKJrJrJr  S SKJrJr  S SKJrJrJrJrJr  S S	KJr  S S
KJrJr  \R<                  " \5      r  " S S\RB                  5      r" " S S\RB                  5      r# " S S\RB                  5      r$ " S S\RB                  5      r% " S S\RB                  5      r& " S S\5      r' " S S\5      r(g)    )OptionalTupleUnionN)nn)BertTokenizer)QuickGELUActivation))BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)Blip2ConfigBlip2VisionConfig)Blip2EncoderBlip2PreTrainedModelBlip2QFormerAttentionBlip2QFormerIntermediateBlip2QFormerOutput)apply_chunking_to_forward)loggingreplace_return_docstringsc                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )Blip2TextEmbeddings/   z;Construct the embeddings from word and position embeddings.c                 H  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR                  5      R%                  S5      5        ['        USS5      U l        Xl        g )N)padding_idxepsposition_ids)   position_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr    configselfr5   	__class__s     k/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/blip_diffusion/modeling_blip2.pyr#   Blip2TextEmbeddings.__init__2   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c  f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ij'.v7PR\']$    c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbr  U R                  U5      nU R                  S:X  a  U R                  U5      nXg-   nUb8  UR                  S   nUR                  USS5      n[        R                  " X64SS9nOUnUR                  UR                  5      nU R                  U5      nU R                  U5      nU$ )Nr   r   r!   dim)sizer   cloner(   r    r*   shaperepeatr1   cattodtyper+   r/   )	r7   	input_idsr   query_embedspast_key_values_length
seq_length
embeddingsr*   
batch_sizes	            r9   forwardBlip2TextEmbeddings.forwardB   s     ")!,JJ,,Q0FVlIl0l-lmssuL --i8J++z9&*&>&>|&L#'=
''--a0
+22:q!D"YY'AqI
%J]]<#5#56
^^J/
\\*-
r;   )r+   r5   r/   r    r*   r(   )NNNr   )	__name__
__module____qualname____firstlineno____doc__r#   rL   __static_attributes____classcell__r8   s   @r9   r   r   /   s#    E$   r;   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Blip2VisionEmbeddingse   r5   c                 t  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr      F)in_channelsout_channelskernel_sizestridebias   )r"   r#   r5   r&   	embed_dim
image_size
patch_sizer   	Parameterr1   randnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingr6   s     r9   r#   Blip2VisionEmbeddings.__init__f   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\krw 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"br;   pixel_valuesreturnc                    UR                   S   nU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n[        R                  " XT/SS9nX`R                  S S 2S UR                  S5      2S S 24   R	                  U5      -   nU$ )Nr   rE   r`   r   r   r=   )rA   rh   weightrE   rD   flatten	transposerf   r3   r1   rC   rk   r?   )r7   rm   rK   target_dtypepatch_embedsclass_embedsrJ   s          r9   rL   Blip2VisionEmbeddings.forwardx   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
"9"9!=Qzq?Q=QST:T"U"X"XYe"ff
r;   )	rf   r5   ra   rb   ri   rj   rh   rc   rk   )rN   rO   rP   rQ   r   r#   r1   TensorrL   rS   rT   rU   s   @r9   rW   rW   e   s2    c0 c$	ELL 	U\\ 	 	r;   rW   c                   F   ^  \ rS rSrU 4S jr          SS jrSrU =r$ )Blip2QFormerEncoder   c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf )NF)
r"   r#   r5   r   
ModuleListrangenum_hidden_layersBlip2QFormerLayerlayergradient_checkpointingr7   r5   	layer_idxr8   s      r9   r#   Blip2QFormerEncoder.__init__   sX    ]]CHIaIaCbcCbiv1Cbc

 ',# ds   A&c                    U	(       a  SOS nU(       a  SOS nU(       a  SOS nU(       a  SOS n[        U R                  R                  5       H  nU R                  U   nU	(       a  X4-   nUb  UU   OS nUb  UU   OS n[	        U R                  SS5      (       aR  [
        R                  " 5       (       a8  U(       a  [        R                  S5        SnU R                  UUUUUUUUU5	      nOU" UUUUUUUU5      nUS   nU(       a	  UUS   4-  nU(       d  M  UUS   4-   nUR                  (       d  M  UUS   4-   nM     U	(       a  X4-   nU
(       d  [        S	 UUUUU4 5       5      $ [        UUUUUS
9$ )N r   FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...r   r   r   r`   c              3   0   #    U  H  nUc  M  Uv   M     g 7fNr   ).0vs     r9   	<genexpr>.Blip2QFormerEncoder.forward.<locals>.<genexpr>   s"      
A  s   	)last_hidden_statepast_key_valueshidden_states
attentionscross_attentions)r~   r5   r   r   r4   r1   is_grad_enabledloggerwarning_gradient_checkpointing_funchas_cross_attentiontupler	   )r7   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskr   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictquery_lengthall_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskpast_key_valuelayer_outputss                        r9   rL   Blip2QFormerEncoder.forward   s    #7BD$5b4%6rD#,R$t{{445A::a=L#$58H$H!.7.CilO3B3N_Q/TXNt{{$<eDDI^I^I`I`NNt !&I $ A A !"#)*"% 
! !-!"#)*"% 	! *!,M"}R'8&::"  &9]1=M<O&O#333+?=QRCSBU+U(Y 6\   14D D 
 "&%'(
 
 
 9+.+*1
 	
r;   )r5   r   r   )
NNNNNNFFTr   rN   rO   rP   rQ   r#   rL   rS   rT   rU   s   @r9   rz   rz      s4    , "#"W
 W
r;   rz   c                   L   ^  \ rS rSrU 4S jr       SS jrS rS rSrU =r	$ )r      c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr   r   T)is_cross_attentionF)r"   r#   chunk_size_feed_forwardseq_len_dimr   	attentionr   cross_attention_frequencycrossattentionr   r   intermediateintermediate_queryr   output_queryoutputr   s      r9   r#   Blip2QFormerLayer.__init__   s    '-'E'E$.v6"7771<"7SW"XD'+D$',D$4V<":6"B.v6(0r;   c	           
         Ub  US S OS n	U R                  UUUUU	S9n
U
S   nU
SS nU
S   nUS:  a  US S 2S U2S S 24   nU R                  (       a.  Uc  [        S5      eU R                  UUUUUUS9nUS   nXSS -   n[	        U R
                  U R                  U R                  U5      nUR                  S   U:  aO  [	        U R                  U R                  U R                  US S 2US 2S S 24   5      n[        R                  " UU/SS9nO,[	        U R                  U R                  U R                  U5      nU4U-   nX4-   nU$ )	Nr`   )r   r   r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r=   )r   r   
ValueErrorr   r   feed_forward_chunk_queryr   r   rA   feed_forward_chunkr1   rC   )r7   r   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsattention_outputoutputspresent_key_valuequery_attention_outputcross_attention_outputslayer_outputlayer_output_texts                     r9   rL   Blip2QFormerLayer.forward   s    :H9S>"1#5Y] !%/3 "0 "
 2!4(2.226!%5a,6I%J"''(0$%eff*.*=*=*")*&7 +> +' *A)C&!Ab$AA4--,,  &	L  %%a(<7$=++00$$$Qq%89	%!  %yy,8I)JPQR4'',,   	L  /G+00r;   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   r7   r   intermediate_outputr   s       r9   r   $Blip2QFormerLayer.feed_forward_chunkD  s)    "//0@A{{#6Ir;   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r   r   r   s       r9   r   *Blip2QFormerLayer.feed_forward_chunk_queryI  s+    "556FG(()<Or;   )
r   r   r   r   r   r   r   r   r   r   )NNNNNFr   )
rN   rO   rP   rQ   r#   rL   r   r   rS   rT   rU   s   @r9   r   r      s5    1, "#EN
 r;   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )	ProjLayeriP  c                   > [         TU ]  5         [        R                  " X5      U l        [        5       U l        [        R                  " X25      U l        [        R                  " U5      U l	        [        R                  " X%S9U l
        g Nr   )r"   r#   r   Lineardense1	QuickGELUact_fndense2r-   r/   r+   )r7   in_dimout_dim
hidden_dimdrop_pr   r8   s         r9   r#   ProjLayer.__init__Q  sY     ii3kii
4zz&)g7r;   c           	          UnU R                  U5      nU R                  U R                  U R                  U R	                  U5      5      5      5      U-   nU$ r   )r+   r/   r   r   r   )r7   xx_ins      r9   rL   ProjLayer.forward\  sG    NN1LLT[[Q%@ABTIr;   )r+   r   r   r   r/   )皙?-q=r   rU   s   @r9   r   r   P  s    	8 r;   r   c                      ^  \ rS rSrSr\rS\4U 4S jjr\" \	\S9    SS\
\R                     S\
\   S\
\   S\
\   S	\\\	4   4
S
 jj5       rS rSrU =r$ )Blip2VisionModelif  rm   r5   c                 4  > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )r"   r#   r5   r&   rW   rJ   r   r+   r,   pre_layernormr   encoderpost_layernorm	post_init)r7   r5   ra   r8   s      r9   r#   Blip2VisionModel.__init__j  sp     &&	/7\\)9N9NO#F+ ll9:O:OPr;   )output_typeconfig_classr   r   r   rn   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nUSS2SSS24   nU R                  U5      nU(       d	  Xx4USS -   $ [        UUUR                  UR                  S9$ )z
Returns:

Nz You have to specify pixel_values)inputs_embedsr   r   r   r   r   )r   pooler_outputr   r   )r5   r   r   use_return_dictr   rJ   r   r   r   r
   r   r   )	r7   rm   r   r   r   r   encoder_outputsr   pooled_outputs	            r9   rL   Blip2VisionModel.forwardu  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@5**=9,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%58KKK)/')77&11	
 	
r;   c                     U R                   $ r   )rJ   r7   s    r9   get_input_embeddings%Blip2VisionModel.get_input_embeddings  s    r;   )r5   rJ   r   r   r   )NNNN)rN   rO   rP   rQ   main_input_namer   r   r#   r   r
   r   r1   rx   boolr   r   rL   r   rS   rT   rU   s   @r9   r   r   f  s    $O$L	0 	 +ETef 04,0/3&**
u||,*
 $D>*
 'tn	*

 d^*
 
u00	1*
 g*
X r;   r   c                      ^  \ rS rSrSrS\4U 4S jjrS rS rS r	 SS\
R                  S	\\   S
\
R                  S\S\
R                  4
S jjr          SS jrSrU =r$ )Blip2QFormerModeli  z2
Querying Transformer (Q-Former), used in BLIP-2.
r5   c                    > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [!        US5      (       a  UR"                  c  [$        R&                  " SSS9U l        O$[$        R&                  " UR"                  SS9U l        U R"                  R)                  SS05        [+        UR                  R                  UR                  R                  UR                  R                  S-  S	S
S9U l        [/        UR                  5      U l        U R3                  5         g )Nr   	tokenizerzbert-base-uncasedright)truncation_side	bos_tokenz[DEC]   r   r   )r   r   r   r   r   )r"   r#   r5   r   qformer_configrJ   r   vision_configvisual_encoderr   rd   r1   zerosnum_query_tokensr&   query_tokenshasattrr   r   from_pretrainedadd_special_tokensr   
proj_layerrz   r   r   r6   s     r9   r#   Blip2QFormerModel.__init__  s-    -f.C.CD.v/C/CDLLQ8O8OQWQfQfQrQr)stv{++v/?/?/G*::;N`ghDN*::6;K;K]deDN));*@A#((44))55,,881<
 +6+@+@Ar;   c                 .    U R                   R                  $ r   rJ   r(   r   s    r9   r   &Blip2QFormerModel.get_input_embeddings  s    ...r;   c                 $    XR                   l        g r   r  )r7   values     r9   set_input_embeddings&Blip2QFormerModel.set_input_embeddings  s    */'r;   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr   r   r   prune_heads)r7   heads_to_pruner   headss       r9   _prune_headsBlip2QFormerModel._prune_heads  s<    
 +002LELLu%//;;EB 3r;   r   input_shapedevice	has_queryrn   c                    UR                  5       S:X  a  USS2SSS2SS24   nOFUR                  5       S:X  a  USS2SSSS24   nO$[        SR                  X!R                  5      5      eUR	                  U R
                  S9nSU-
  S-  nU$ )a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`Tuple[int]`):
        The shape of the input to the model.
    device (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
rZ   Nr`   zAWrong shape for input_ids (shape {}) or attention_mask (shape {})rp   g      ?g     )r>   r   formatrA   rD   rE   )r7   r   r  r  r  extended_attention_masks         r9   get_extended_attention_mask-Blip2QFormerModel.get_extended_attention_mask  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#SZZ!5!5  #:"<"<4::"<"N#&)@#@H"L&&r;   c                    U R                  USSS9nUR                  U R                  5      nUR                  nUR                  S   n[
        R                  " XR                  R                  5       S   4[
        R                  S9R                  U R                  5      n[
        R                  " XR                  /SS9nUb  UOU R                  R                  nU	b  U	OU R                  R                  n	U
b  U
OU R                  R                  n
Ub,  US   S   R                  S	   U R                  R                   -
  OSnU R                  R                  S   nU R#                  UU R                  US
9nUR                  5       SS nUu  nnUR                  nU R%                  U5      R&                  nUnUc  [
        R                  " UUU-   4US9nU R)                  UUU5      nUb  [+        U[,        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[+        U[,        5      (       a"  U Vs/ s H  nU R/                  U5      PM     nnO>Uc'  [
        R                  " UUS9nU R/                  U5      nOU R/                  U5      nOSnU R1                  X0R                  R2                  R4                  5      nU R7                  UUUUUUUUU	U
US9nUS   nUSS2SSS24   n U
(       d  U R9                  USS2SU2SS24   5      $ [;        UU UR<                  UR>                  UR@                  URB                  S9$ s  snf )ay  
encoder_hidden_states  (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, `optional`):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.Tensor))` of length `config.n_layers` with each tuple having 4 tensors of:
    shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
    value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
    used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
    value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
    `(batch_size, sequence_length)`.
use_cache (`bool`, `optional`):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
ptT)return_tensorspaddingr   r   rp   r=   Nr`   )rF   rG   rH   r   )r  )
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )"r   rD   r  rF   rA   r1   onesr  r?   longrC   r   r5   r   r   r   r   rJ   r  r   r!  
isinstancelistinvert_attention_maskget_head_maskr  r   r   r  r   r   r   r   r   )!r7   
text_inputimage_inputr   r   r   r   r   r   r   r   textrF   rK   
query_attsr   rH   r   embedding_outputr  rI   r  image_embeds_frozenr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapemaskencoder_extended_attention_maskr   sequence_outputr   s!                                    r9   rL   Blip2QFormerModel.forward  s~   B ~~jt~Lwwt{{#NN	__Q'
ZZ->->-C-C-Ea-H IQVQ[Q[\__`d`k`kl
J0C0C#D!L1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] JYIdOAq!''*T[[-E-EEjk 	 ((..q1??**#9 + 
 '++-cr2!,
J!(("11+>PP 3!"ZZ*jCY6Y)ZdjkN #'"B"B>S^`f"g !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y++2L2L2^2^_	,,2"7#B+/!5#% ' 
 *!,'1a0???1m|mQ3F#GHH;-'+;;)77&11,==
 	
G 3xs   M)r5   rJ   r   r  r  r   r  )F)
NNNNNNNNNN)rN   rO   rP   rQ   rR   r   r#   r   r  r  r1   rx   r   intr  r   r!  rL   rS   rT   rU   s   @r9   r   r     s    { ./0C  +'+' 3Z+' 	+'
 +' 
+'^ "#!A
 A
r;   r   ))typingr   r   r   r1   torch.utils.checkpointr   transformersr   transformers.activationsr   r   transformers.modeling_outputsr	   r
   r   /transformers.models.blip_2.configuration_blip_2r   r   *transformers.models.blip_2.modeling_blip_2r   r   r   r   r   transformers.pytorch_utilsr   transformers.utilsr   r   
get_loggerrN   r   Moduler   rW   rz   r   r   r   r   r   r;   r9   <module>rG     s    * )    & E 
 [  A 
		H	%2")) 2lBII @`
")) `
Hc		 cN		 ,=+ =BX
, X
r;   