
    bCi                     (   S SK r S SKJrJr  S SKrS SKJrJrJr  S SKJ	r	  SSK
Jr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJr  SSKJrJrJr  SSKJr  SSK J!r!  SSK"J#r#  \RH                  " \%5      r& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r* " S S\RN                  5      r+ " S S\RN                  5      r, " S S\RN                  5      r- " S S\5      r. " S S \RN                  5      r/ " S! S"\RN                  5      r0 " S# S$\RN                  5      r1 " S% S&\RN                  5      r2 " S' S(\RN                  5      r3 " S) S*\5      r4 " S+ S,\45      r5 " S- S.\4\5      r6/ S/Qr7g)0    N)OptionalUnion)Tensordevicenn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging)deprecate_kwarg   )BlipTextConfigc                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\
S\R                  4
S	 jjrS
rU =r$ )BlipTextEmbeddings,   z;Construct the embeddings from word and position embeddings.c                 F  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l
        [        R                  " UR                  5      U l        U R                  S[         R"                  " UR                  5      R%                  S5      SS9  ['        USS5      U l        Xl        g )	N)padding_idxepsposition_ids)r   F)
persistentposition_embedding_typeabsolute)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandgetattrr%   configselfr:   	__class__s     e/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/blip/modeling_blip_text.pyr(   BlipTextEmbeddings.__init__/   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c  f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$    	input_idsr"   inputs_embedspast_key_values_lengthreturnc                 L   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XFU-   24   nUc  U R                  U5      nUnU R                  S:X  a  U R	                  U5      nXx-  nU R                  U5      nU R                  U5      nU$ )Nr#   r   r&   )sizer"   r-   r%   r/   r0   r4   )	r<   rA   r"   rB   rC   input_shape
seq_length
embeddingsr/   s	            r>   forwardBlipTextEmbeddings.forwardA   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL  00;M"
'':5"&":":<"H-J^^J/
\\*-
r@   )r0   r:   r4   r%   r/   r-   )NNNr   )__name__
__module____qualname____firstlineno____doc__r(   r   r6   
LongTensorFloatTensorintr   rJ   __static_attributes____classcell__r=   s   @r>   r   r   ,   sx    E( 153759&'E,,- u//0   1 12	
 !$ 
 r@   r   c                   V  ^  \ rS rSrSU 4S jjrS rS rS rS r\	" SSS	S
9       SS\
R                  S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\   S\\   S\\
R                     S\\
R                     4S jj5       rSrU =r$ )BlipTextSelfAttention`   c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        X0l
        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R"                  " UR$                  5      U l        [)        USS5      U l        U R*                  S:X  d  U R*                  S:X  aH  UR,                  U l        [        R.                  " SUR,                  -  S	-
  U R                  5      U l        g g )
Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)r%   r&   relative_keyrelative_key_query   r   )r'   r(   r:   r+   num_attention_headshasattr
ValueErrorrS   attention_head_sizeall_head_size	layer_idxr   Linearqueryencoder_hidden_sizekeyvaluer2   attention_probs_dropout_probr4   r9   r%   r.   r)   distance_embeddingr<   r:   is_cross_attentionrd   r=   s       r>   r(   BlipTextSelfAttention.__init__a   s    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PP"YYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr@   c                     Xl         g Nattn_gradients)r<   rr   s     r>   save_attn_gradients)BlipTextSelfAttention.save_attn_gradients}   s    ,r@   c                     U R                   $ rp   rq   r<   s    r>   get_attn_gradients(BlipTextSelfAttention.get_attn_gradients   s    """r@   c                     Xl         g rp   attention_map)r<   r{   s     r>   save_attention_map(BlipTextSelfAttention.save_attention_map   s    *r@   c                     U R                   $ rp   rz   rv   s    r>   get_attention_map'BlipTextSelfAttention.get_attention_map   s    !!!r@   past_key_valuepast_key_values4.58new_nameversionhidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskoutput_attentionscache_positionrD   c	                 	   UR                   u  pnU R                  U5      R                  U	SU R                  U R                  5      R                  SS5      nUS LnU(       a  UOUnSnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      R                  U	SU R                  U R                  5      R                  SS5      nU R#                  U5      R                  U	SU R                  U R                  5      R                  SS5      nUbc  U(       d  UOS nWR%                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nU R*                  S:X  d  U R*                  S	:X  GaD  UR-                  5       S   n
[&        R.                  " U
[&        R0                  UR2                  S
9R                  SS5      n[&        R.                  " U
[&        R0                  UR2                  S
9R                  SS5      nUU-
  nU R5                  UU R6                  -   S-
  5      nUR9                  UR:                  S9nU R*                  S:X  a  [&        R<                  " SUU5      nUU-   nOHU R*                  S	:X  a8  [&        R<                  " SUU5      n[&        R<                  " SUU5      nUU-   U-   nU[>        R@                  " U R                  5      -  nUb  UUR9                  UR2                  5      -   n[B        RD                  " SS9" U5      nU RG                  U5      nUb  UU-  n[&        R(                  " UU5      nURI                  SSSS5      RK                  5       nUR-                  5       S S U RL                  4-   nUR                  " U6 nUU4$ )Nr#   r   r^   Fr   Tr\   r]   )dtyper   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   )'shaperf   viewr_   rb   	transpose
isinstancer   
is_updatedgetrd   cross_attention_cacheself_attention_cachelayerskeysvaluesrh   ri   updater6   matmulr%   rF   r7   longr   rk   r.   tor   einsummathsqrtr   Softmaxr4   permute
contiguousrc   )r<   r   r   r   r   r   r   r   r   
batch_sizerH   _query_layerrm   r   curr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                                  r>   rJ   BlipTextSelfAttention.forward   s    %2$7$7!
JJ}%T*b$":":D<T<TUYq!_ 	 3$>3E/>
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*+224>>BGGI-44T^^DKKK (j"d&>&>@X@XY1a  

>*j"d&>&>@X@XY1a  *7It)<)C)C{DNN=M~<^*&	; &*_FY*Z*ZAEO..t~~> !<<Y5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.2C2CDTD[D[2\\ **,-=> #',,"?  &=	&I#%<kJ%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDo--r@   )rc   rb   r{   rr   r:   rk   r4   rh   rd   r.   r_   r%   rf   ri   rp   NNNNNFN)rL   rM   rN   rO   r(   rs   rw   r|   r   r   r6   r   r   rR   r   booltuplerJ   rT   rU   rV   s   @r>   rX   rX   `   s   u8-#+" %0A6R 7;15=A>B+/,115h.||h. !!2!23h. E--.	h.
  ((9(9:h. !)):): ;h. "%h. $D>h. !.h. 
u||	h. Sh.r@   rX   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BlipTextSelfOutput   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr    )r'   r(   r   re   r+   denser0   r1   r2   r3   r4   r;   s     r>   r(   BlipTextSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r@   r   input_tensorrD   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rp   r   r4   r0   r<   r   r   s      r>   rJ   BlipTextSelfOutput.forward   5    

=1]3}'CDr@   r0   r   r4   
rL   rM   rN   rO   r(   r6   r   rJ   rT   rU   rV   s   @r>   r   r      6    >U\\  RWR^R^  r@   r   c                   $  ^  \ rS rSrSU 4S jjrS r\" SSSS9      SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\   S\	\   S\	\R                     S\\R                     4S jj5       rSrU =r$ )BlipTextAttentioni  c                 ~   > [         TU ]  5         [        XUS9U l        [	        U5      U l        [        5       U l        g )Nrd   )r'   r(   rX   r<   r   outputsetpruned_headsrl   s       r>   r(   BlipTextAttention.__init__  s3    )&PYZ	(0Er@   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r<   r_   rb   r   r   rf   rh   ri   r   r   rc   union)r<   headsindexs      r>   prune_headsBlipTextAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r@   r   r   r   r   r   r   r   r   r   r   rD   c           
      l    U R                  UUUUUUUS9nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   r   r   r   r   r   r   )r<   r   )r<   r   r   r   r   r   r   r   self_outputsattention_outputoutputss              r>   rJ   BlipTextAttention.forward  s\     yy)"7+/) ! 
  ;;|AF#%QR(88r@   )r   r   r<   )FN)NNNNFN)rL   rM   rN   rO   r(   r   r   r6   r   r   rR   r   r   r   rJ   rT   rU   rV   s   @r>   r   r     s    ";$ %0A6R 7;15=A+/,115|| !!2!23 E--.	
  ((9(9: "% $D> !. 
u||	 Sr@   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextIntermediatei8  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rp   )r'   r(   r   re   r+   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnr;   s     r>   r(   BlipTextIntermediate.__init__9  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r@   r   rD   c                 J    U R                  U5      nU R                  U5      nU$ rp   r   r   r<   r   s     r>   rJ   BlipTextIntermediate.forwardA  s&    

=100?r@   r   r   rV   s   @r>   r   r   8  s(    9U\\ ell  r@   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BlipTextOutputiH  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r'   r(   r   re   r   r+   r   r0   r1   r2   r3   r4   r;   s     r>   r(   BlipTextOutput.__init__I  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r@   r   r   rD   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ rp   r   r   s      r>   rJ   BlipTextOutput.forwardO  r   r@   r   r   rV   s   @r>   r   r   H  r   r@   r   c                   @  ^  \ rS rSrU 4S jr\" SSSS9       SS\R                  S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rS rSrU =r$ )BlipTextLayeriV  c                 B  > [         TU ]  5         Xl        UR                  U l        SU l        [        XS9U l        X l        U R                  R                  (       a#  [        XR                  R                  US9U l	        [        U5      U l        [        U5      U l        g )Nr   r   )rm   rd   )r'   r(   r:   chunk_size_feed_forwardseq_len_dimr   	attention	layer_num
is_decodercrossattentionr   intermediater   r   )r<   r:   r   r=   s      r>   r(   BlipTextLayer.__init__W  s~    '-'E'E$*6G";;!!"3;;+A+AY#D 18$V,r@   r   r   r   r   r   r   r   r   r   r   r   rD   c	           
          U R                  UUUUUUS9n	U	S   n
U	SS  nUb!  U R                  U
UUUUUUS9nUS   n
XSS  -   n[        U R                  U R                  U R
                  U
5      nU4U-   $ )N)r   r   r   r   r   r   r   r   )r   r   r   feed_forward_chunkr   r   )r<   r   r   r   r   r   r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 r>   rJ   BlipTextLayer.forwarde  s     "&)/+) "0 "
 2!4(, ,&*&9&9 5#&; /"3- ': '#  7q9 ;;G0##T%A%A4CSCSUe
 ((r@   c                 J    U R                  U5      nU R                  X!5      nU$ rp   )r   r   )r<   r   intermediate_outputr  s       r>   r   BlipTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir@   )r   r   r:   r   r   r   r   r   r   )rL   rM   rN   rO   r(   r   r6   r   r   rR   r   r   r   rJ   r  rT   rU   rV   s   @r>   r   r   V  s    - %0A6R 7;15=A>B+/,115%)||%) !!2!23%) E--.	%)
  ((9(9:%) !)):): ;%) "%%) $D>%) !.%) 
u||	%) S%)N r@   r   c                   R  ^  \ rS rSrU 4S jr          SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\
   S
\\
   S\\
   S\\
   S\\R                     S\\\R                     \4   4S jjrSrU =r$ )BlipTextEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        SU l	        g s  snf )NF)
r'   r(   r:   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r<   r:   ir=   s      r>   r(   BlipTextEncoder.__init__  sR    ]]eFLdLdFe#fFeM&$<Fe#fg
&+# $gs   A&r   r   r   r   r   r   	use_cacher   output_hidden_statesreturn_dictr   rD   c                    U R                   (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  [	        U[
        5      (       a,  [        R                  S5        [        R                  " U5      nOd[	        U[        5      (       a  [        U[        U R                  S95      nO1Uc.  [        [        U R                  S9[        U R                  S95      nU	(       a  SOS nU(       a  SOS nU(       a  Ub  SOS n[        U R                  R                  5       H[  nU R                  U   nU	(       a  X4-   nUb  X?   OS nU" UUUUUUUU5      nUS   nU(       d  MD  UUS   4-   nUc  MR  UUS   4-   nM]     U	(       a  X4-   nU
(       d  [        S	 UUUUU4 5       5      $ [        UUUUUS
9$ )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r:    r   r   r^   c              3   0   #    U  H  nUc  M  Uv   M     g 7frp   r  ).0vs     r>   	<genexpr>*BlipTextEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater   r   
attentionscross_attentions)r  trainingloggerwarningr   r   warning_oncer   from_legacy_cacher   r:   r  r  r  r   )r<   r   r   r   r   r   r   r  r   r  r  r   all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                      r>   rJ   BlipTextEncoder.forward  s    &&4==p "	/511##`
 #6"G"G"X O\::"5o|[_[f[fGg"h ("5 4l$++6V# #7BD$5b4%6;P;\rbft{{445A::a=L#$58H$H!.7.CilO(%&!	M *!,M  &9]1=M<O&O#(4+?=QRCSBU+U(- 60   14D D 
 "#%'(
 
 
 9+++*1
 	
r@   )r:   r  r  )
NNNNNNFFTN)rL   rM   rN   rO   r(   r6   r   r   rR   r   r   r   r   r   rJ   rT   rU   rV   s   @r>   r  r    s   , 7;15=A>B+/$(,1/4&*15W
||W
 !!2!23W
 E--.	W

  ((9(9:W
 !)):): ;W
 "%W
 D>W
 $D>W
 'tnW
 d^W
 !.W
 
uU\\"$MM	NW
 W
r@   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g rp   )r'   r(   r   re   r+   r   Tanh
activationr;   s     r>   r(   BlipTextPooler.__init__  s9    YYv1163E3EF
'')r@   r   rD   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r2  )r<   r   first_token_tensorpooled_outputs       r>   rJ   BlipTextPooler.forward  s6     +1a40

#566r@   )r2  r   r   rV   s   @r>   r/  r/    s(    $
U\\ ell  r@   r/  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )r'   r(   r   re   r+   r   r   r   r   r
   transform_act_fnr0   r1   r;   s     r>   r(   (BlipTextPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr@   r   rD   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rp   )r   r;  r0   r   s     r>   rJ   'BlipTextPredictionHeadTransform.forward  s4    

=1--m<}5r@   )r0   r   r;  r   rV   s   @r>   r9  r9    s)    UU\\ ell  r@   r9  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )BlipTextLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)bias)r'   r(   r9  	transformr   re   r+   r*   decoder	Parameterr6   zerosrB  r;   s     r>   r(   !BlipTextLMPredictionHead.__init__  sm    8@ yy!3!3V5F5FUSLLV->->!?@	 !IIr@   c                 :    U R                   U R                  l         g rp   )rB  rD  rv   s    r>   _tie_weights%BlipTextLMPredictionHead._tie_weights&  s     IIr@   c                 J    U R                  U5      nU R                  U5      nU$ rp   )rC  rD  r   s     r>   rJ    BlipTextLMPredictionHead.forward)  s$    }5]3r@   )rB  rD  rC  )	rL   rM   rN   rO   r(   rI  rJ   rT   rU   rV   s   @r>   r@  r@    s    && r@   r@  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BlipTextOnlyMLMHeadi0  c                 B   > [         TU ]  5         [        U5      U l        g rp   )r'   r(   r@  predictionsr;   s     r>   r(   BlipTextOnlyMLMHead.__init__1  s    3F;r@   sequence_outputrD   c                 (    U R                  U5      nU$ rp   rP  )r<   rR  prediction_scoress      r>   rJ   BlipTextOnlyMLMHead.forward5  s     ,,_=  r@   rT  r   rV   s   @r>   rN  rN  0  s(    <!u|| ! ! !r@   rN  c                   2    \ rS rSr% Sr\\S'   Sr/ rS r	Sr
g)BlipTextPreTrainedModeli;  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r:   bertc                 H   [        U[        R                  [        R                  45      (       a9  UR                  R
                  R                  SU R                  R                  S9  Oh[        U[        R                  5      (       aI  UR                  R
                  R                  5         UR                  R
                  R                  S5        [        U[        R                  5      (       a3  UR                  b%  UR                  R
                  R                  5         ggg)zInitialize the weightsg        )meanstd      ?N)r   r   re   r)   weightdatanormal_r:   initializer_ranger0   rB  zero_fill_)r<   modules     r>   _init_weights%BlipTextPreTrainedModel._init_weightsE  s    fryy",,788 MM&&CT[[5R5R&S--KK""$MM$$S)fbii((V[[-DKK""$ .E(r@   r  N)rL   rM   rN   rO   rP   r   __annotations__base_model_prefix_no_split_modulesre  rT   r  r@   r>   rX  rX  ;  s     
 
%r@   rX  c            #         ^  \ rS rSrSrSU 4S jjrS rS rS rS\	S\
\   S	\S
\S\	4
S jr               SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S
\\   S\\R                     S\\
\R                     \4   4 S jjrSrU =r$ )BlipTextModeliS  a  
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.
c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OS U l        U R                  5         g rp   )
r'   r(   r:   r   rI   r  encoderr/  pooler	post_init)r<   r:   add_pooling_layerr=   s      r>   r(   BlipTextModel.__init__\  sG     ,V4&v.0AnV,tr@   c                 .    U R                   R                  $ rp   rI   r-   rv   s    r>   get_input_embeddings"BlipTextModel.get_input_embeddingsf  s    ...r@   c                 $    XR                   l        g rp   rs  )r<   ri   s     r>   set_input_embeddings"BlipTextModel.set_input_embeddingsi  s    */'r@   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrm  r  r   r   )r<   heads_to_pruner  r   s       r>   _prune_headsBlipTextModel._prune_headsm  s<    
 +002LELLu%//;;EB 3r@   r   rG   r   r   rD   c                    UR                  5       S:X  a  USS2SSS2SS24   nGO1UR                  5       S:X  Ga   U(       a  Uu  pg[        R                  " XsS9nUSSSS24   R                  XgS5      USSS2S4   :*  n	U	R	                  UR
                  5      n	U	R                  S   UR                  S   :  aU  UR                  S   U	R                  S   -
  n
[        R                  " [        R                  " XgU
4X9R
                  S9U	/SS9n	U	SS2SSS2SS24   USS2SSSS24   -  nO*USS2SSSS24   nO[        S	U S
UR                   S35      eUR	                  U R
                  S9nSU-
  S-  nU$ )a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.
    device (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
r	   Nr^   r   r   )r   r   r#   )axisz!Wrong shape for input_ids (shape z) or attention_mask (shape )r   r]  g     )
r   r6   r7   repeatr   r   r   catonesra   )r<   r   rG   r   r   extended_attention_maskr   rH   seq_idscausal_maskprefix_seq_lens              r>   get_extended_attention_mask)BlipTextModel.get_extended_attention_masku  s   & 1$&4Qa]&C#!Q& )4&
,,zA%dD!m4;;JTUVZabfhikoboZpp)nn^-A-AB$$Q'.*>*>q*AA%3%9%9!%<{?P?PQR?S%SN"'))!JJ!+ HQW_p_p (	  #K +6aq!m*D~VWY]_cefVfGg*g'*8D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&r@   rA   r"   r   rB   encoder_embedsr   r   r   r  r   r  r  r   c                 :   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  U
b  U
OU R                   R                  n
OSn
Ub  Ub  [        S5      eUb3  U R                  X5        UR                  5       nUu  nnUR                  nO[Ub%  UR                  5       SS nUu  nnUR                  nO3Ub%  UR                  5       SS nUu  nnUR                  nO[        S5      eSnU	b:  [        U	[        5      (       d  U	S   S   R                  S   OU	R                  5       nUc*  [        R                  " UUU-   45      R                  U5      nU R!                  UUUU5      nUb  [        U["        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[        U["        5      (       a"  U Vs/ s H  nU R%                  U5      PM     nnO>Uc'  [        R                  " UUS9nU R%                  U5      nOU R%                  U5      nOSnU R'                  X@R                   R(                  5      nUc  U R+                  UUUUS	9nOUnU R-                  UUUUUU	U
UUUUS
9nUS   nU R.                  b  U R/                  U5      OSnU(       d
  UU4USS -   $ [1        UUUR2                  UR4                  UR6                  UR8                  S9$ s  snf )a  
encoder_hidden_states  (`torch.FloatTensor`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
past_key_values (`Cache`, *optional*):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
NFzDYou cannot specify both input_ids and inputs_embeds at the same timer#   zGYou have to specify either input_ids or inputs_embeds or encoder_embedsr   r   r  )rA   r"   rB   rC   )
r   r   r   r   r   r  r   r  r  r   r   )r  pooler_outputr   r   r   r!  )r:   r   r  use_return_dictr  ra   %warn_if_padding_and_no_attention_maskrF   r   r   r   r   get_seq_lengthr6   r  r   r  listinvert_attention_maskget_head_maskr  rI   rm  rn  r   r   r   r   r!  ) r<   rA   r   r"   r   rB   r  r   r   r   r  r   r  r  r   r   rG   r   rH   r   rC   r  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapemaskencoder_extended_attention_maskembedding_outputencoder_outputsrR  r6  s                                    r>   rJ   BlipTextModel.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%.%:	@U@UII ]%>cdd"66yQ#..*K%0"J
%%F&',,.s3K%0"J
"))F'(--/4K%0"J
#**Ffgg!"& "/599  "1%++B/$335 # !"ZZZBX5X(YZ]]^deN 150P0PK1
 !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y++2O2OP	!##)+'=	  /    .,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
[ 3xs   :L)r:   rI   rm  rn  )T)NNNNNNNNNNNNNFN)rL   rM   rN   rO   rP   r(   rt  rw  r|  r   r   rS   r   r   r  r   r6   r   r   r   rJ   rT   rU   rV   s   @r>   rk  rk  S  s   /0C<'$<'38:<'GM<'[_<'	<'@ -115/3,004158<9=+/$(,0/3&*%*15!P
ELL)P
 !.P
 u||,	P

 ELL)P
  -P
 !.P
  (5P
 !) 6P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 TNP
  !.!P
" 
uU\\"$PP	Q#P
 P
r@   rk  c            '         ^  \ rS rSrSS/rU 4S jrS rS rS rS r	                 SS	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\\R                     \4   4$S jjrSU 4S jjrSrU =r$ ) BlipTextLMHeadModeliG  zcls.predictions.decoder.weightzcls.predictions.decoder.biasc                    > [         TU ]  U5        [        USS9U l        [	        U5      U l        UR                  U l        g )NF)rp  )r'   r(   rk  rY  rN  clslabel_smoothingr;   s     r>   r(   BlipTextLMHeadModel.__init__J  s8     !&EB	&v.%55r@   c                 6    U R                   R                  5       $ rp   )rY  rt  rv   s    r>   rt  (BlipTextLMHeadModel.get_input_embeddingsQ  s    yy--//r@   c                 :    U R                   R                  U5        g rp   )rY  rw  r<   new_embeddingss     r>   rw  (BlipTextLMHeadModel.set_input_embeddingsT  s    		&&~6r@   c                 B    U R                   R                  R                  $ rp   )r  rP  rD  rv   s    r>   get_output_embeddings)BlipTextLMHeadModel.get_output_embeddingsW  s    xx##+++r@   c                     XR                   R                  l        UR                  U R                   R                  l        g rp   )r  rP  rD  rB  r  s     r>   set_output_embeddings)BlipTextLMHeadModel.set_output_embeddingsZ  s*    '5$$2$7$7!r@   rA   r   r"   r   rB   r   r   labelsr   r  r   r  r  return_logitsr   	reductionr   rD   c                 @   Ub  UOU R                   R                  nUb  Sn
U R                  UUUUUUUU	U
UUUUUS9nUS   nU R                  U5      nU(       a  USS2SS2SS24   R	                  5       $ SnUb  USS2SS2SS24   R	                  5       nUSS2SS24   R	                  5       R                  UR                  5      n[        UU R                  S9nU" UR                  SU R                   R                  5      UR                  S5      5      nUS:X  a0  UR                  UR                  S5      S5      R                  S5      nU(       d  U4US	S -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   UR"                  S
9$ )a  
encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
    hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
    configured as a decoder.
encoder_attention_mask (`torch.FloatTensor`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
labels (`torch.LongTensor`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
past_key_values (`Cache`, *optional*):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
NF)r   r"   r   rB   r   r   r   r  r   r  r  r   r   r   r#   r   )r  r  noner^   )losslogitsr   r   r   r!  )r:   r  rY  r  r   r   r   r   r  r   r*   rF   sumr   r   r   r   r!  )r<   rA   r   r"   r   rB   r   r   r  r   r  r   r  r  r  r   r  r   r   rR  rU  lm_lossshifted_prediction_scoresloss_fctr   s                            r>   rJ   BlipTextLMHeadModel.forward^  s   T &1%<k$++B]B]I)))%'"7#9+/!5#!)  
" "!* HH_5$QQY/::<<(9!SbS!)(D(O(O(Q%AqrE]--/223L3S3STF')TMaMabH8==b$++BXBXY[a[f[fgi[jkGF"!,,'8'='=a'@"EII!L')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r@   c                 :   > [         TU ]  " U4UUS.UD6nSUS'   U$ )N)r   r   Tr   )r'   prepare_inputs_for_generation)r<   rA   r   r   model_kwargsmodel_inputsr=   s         r>   r  1BlipTextLMHeadModel.prepare_inputs_for_generation  s>     w<
+)
 	
 &*\"r@   )rY  r  r  )NNNNNNNNNNNNNFTr[  N)NN)rL   rM   rN   rO   _tied_weights_keysr(   rt  rw  r  r  r   r6   r   r   r   r   r   r   r   rJ   r  rT   rU   rV   s   @r>   r  r  G  s   :<Z[607,8 -115/3,0048<9=)-+/$(,0/3&*(-%)#)15%Z
ELL)Z
 !.Z
 u||,	Z

 ELL)Z
  -Z
  (5Z
 !) 6Z
 &Z
 "%Z
 D>Z
 $D>Z
 'tnZ
 d^Z
  ~Z
  TN!Z
" C=#Z
$ !.%Z
& 
uU\\"$EE	F'Z
x r@   r  )rk  r  rX  )8r   typingr   r   r6   r   r   r   torch.nnr   activationsr
   cache_utilsr   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   utils.deprecationr   configuration_blipr   
get_loggerrL   r#  Moduler   rX   r   r   r   r   r   r  r/  r9  r@  rN  rX  rk  r  __all__r  r@   r>   <module>r     sT  "  "  $ $ % ! C C ) 9 
 . l l  0 . 
		H	%0 0hR.BII R.l /		 /f299  RYY :. :|^
bii ^
DRYY  bii $ryy 0!")) !%o %0p
+ p
h~1? ~B Nr@   