
    cCi                    "   S r SSKrSSKJrJr  SSKrSSKJr  SSKJr  SSK	J
r
JrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  SSKJrJrJrJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(  \" 5       (       a  SSK)J*r*  SSK+J,r,  \"RZ                  " \.5      r/ " S S\R`                  5      r1 SSK2J3r3  \3r1\/Ri                  S5         " S S\R`                  5      r8 " S S\R`                  5      r9 " S S\R`                  5      r: " S S\5      r; " S  S!\R`                  5      r<\ " S" S#\5      5       r=\ " S$ S%\=5      5       r> " S& S'\R`                  5      r? " S( S)\R`                  5      r@ " S* S+\R`                  5      rA " S, S-\R`                  5      rB " S. S/\R`                  5      rC " S0 S1\5      rD\" S2S39 " S4 S5\=5      5       rE\" S6S39 " S7 S8\=\5      5       rF/ S9QrGg! \5 a     GN*\6 a    \/Ro                  S5         GNCf = f):zPix2Struct modeling file    N)OptionalUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging)deprecate_kwarg   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Pix2StructLayerNorm=   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zS
Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      l/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr'   Pix2StructLayerNorm.__init__>   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor)   float32powmeanrsqrtr,   r+   dtypefloat16bfloat16)r-   hidden_statesvariances      r1   forwardPix2StructLayerNorm.forwardF   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r3   )r,   r+   )gư>__name__
__module____qualname____firstlineno__r'   rB   __static_attributes____classcell__r0   s   @r1   r#   r#   =   s    $+ +r3   r#   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
Pix2StructVisionEmbeddingsd   a  
Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
is represented by a vector of `hidden_size` values.
configreturnNc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l
        [        R                  " UR                  5      U l        g N)r&   r'   r   Linearpatch_embed_hidden_sizer.   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr-   rP   r0   s     r1   r'   #Pix2StructVisionEmbeddings.__init__k   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r3   flattened_patchesc                     US S 2S S 2S4   R                  5       nUS S 2S S 2S4   R                  5       nUS S 2S S 2SS 24   nU R                  U5      nU R                  U5      nU R                  U5      nXE-   U-   nU R	                  U5      nU$ )Nr   r   r5   )longrV   rY   rZ   r]   )r-   r`   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r1   rB   "Pix2StructVisionEmbeddings.forwardt   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  0>A
\\*-
r3   )rZ   r]   rV   rY   )rE   rF   rG   rH   __doc__r   r'   r)   TensorrB   rI   rJ   rK   s   @r1   rN   rN   d   s<    7/ 7D 7 %,,  r3   rN   c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )Pix2StructVisionAttention   c                 l  > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        U R                  U R                  -  U l	        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        SU l        g NFbias)r&   r'   r.   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutr]   	inner_dimr   rT   querykeyvalueoutputgradient_checkpointingr^   s     r1   r'   "Pix2StructVisionAttention.__init__   s    !--"(++11//(?(?? YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r3   c                 N  ^ ^ UR                   SS u  mnUU 4S jnU" T R                  U5      5      nU" T R                  U5      5      n	U" T R                  U5      5      n
[        R
                  " XR                  SS5      5      nUGc  [        R                  " ST R                  Xf4UR                  UR                  S9nT R                  (       a  T R                  (       a  SUl        UR                  5       S:X  a)  X2SS2SSSS24   R                  UR                  5      -   nOyUb  X2R                  UR                  5      -   nOX[!        5       (       dI  [        R"                  " TU4UR                  UR                  S9nX2R                  UR                  5      -   nSU-
  nUR%                  US:H  [        R&                  " UR                  5      R(                  5      nX-  n[        R*                  " U[        R,                  " [        R&                  " UR                  5      R(                  5      5      n[.        R0                  R3                  US[        R4                  S	9R7                  U5      n[.        R0                  R9                  UT R8                  T R                  S
9nUb  X-  n[        R
                  " X5      nUR                  SS5      R;                  5       R=                  TST R>                  5      nT RA                  U5      nU4U4-   nU(       a  X4-   nU$ )z
Self-attention block
Nr5   c                    > U R                  5       R                  TSTR                  TR                  5      R	                  SS5      $ )
projectionr6   r   r5   )
contiguousviewru   rs   	transpose)states
batch_sizer-   s    r1   to_projection_shape>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr3   r   r   devicer=   Tr6   )dimr=   ptraining)!shaperx   ry   rz   r)   matmulr   zerosru   r   r=   r|   r   requires_gradr   r8   r   r*   masked_fillfinfominmaxtensorr   
functionalsoftmaxr9   type_asr]   r   r   rw   r{   )r-   r@   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthr   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr   s   `               @r1   rB   !Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE l,@,@A,FG !KKDLL*9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-//!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,b &'9Lll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr3   )
r]   r|   r.   rw   ry   rs   ru   r{   rx   rz   )NNNFrD   rK   s   @r1   rl   rl      s"    ,& M Mr3   rl   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructVisionMlp   rP   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g ro   r&   r'   r   rT   r.   d_ffwi_0wi_1wor[   r\   r]   r   dense_act_fnactr^   s     r1   r'   Pix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r3   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rS   r   r   r   r]   
isinstancer   r+   r)   rj   r=   int8r8   r-   r@   hidden_geluhidden_linears       r1   rB   Pix2StructVisionMlp.forward       hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r3   r   r]   r   r   r   )	rE   rF   rG   rH   r   r'   rB   rI   rJ   rK   s   @r1   r   r      s    /5 / r3   r   c                      ^  \ rS rSrS\SS4U 4S jjr   SS\R                  S\\R                     S\\R                     S	\	S\
\\R                  \R                  4   \\R                     4   4
S
 jjrSrU =r$ )Pix2StructVisionLayeri  rP   rQ   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r/   )r&   r'   chunk_size_feed_forwardseq_len_dimrl   	attentionr   mlpr#   r.   layer_norm_epspre_mlp_layer_normpre_attention_layer_normr^   s     r1   r'   Pix2StructVisionLayer.__init__  ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r3   r@   r   	head_maskr   c                     UnU R                  U5      nU R                  UUUUS9nUS   nUSS  nXu-   nU R                  U5      n	U R                  U	5      U-   n	U	4U-   nU$ )N)r   r   r   r   r   )r   r   r   r   )
r-   r@   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r1   rB   Pix2StructVisionLayer.forward  s     ! 55mD!%)%/	 "0 "
 2!4(, )3 ..}=xx-=/G+r3   )r   r   r   r   r   r   )NNF)rE   rF   rG   rH   r   r'   r)   rj   r   boolr   tuplerB   rI   rJ   rK   s   @r1   r   r     s    k/ kD k 26,0"'|| !. ELL)	
   
uU\\5<</0%2EE	F r3   r   c                      ^  \ rS rSrS\SS4U 4S jjr     SS\R                  S\\R                     S\\R                     S	\	S
\	S\	S\
\\4   4S jjrSrU =r$ )Pix2StructVisionEncoderi2  rP   rQ   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r&   r'   rP   r   
ModuleListrangenum_hidden_layersr   layerr|   )r-   rP   _r0   s      r1   r'    Pix2StructVisionEncoder.__init__3  sT    ]]5QWQiQiKj#kKja$9&$AKj#kl
&+# $ls   A&r@   r   r   r   output_hidden_statesreturn_dictc                 6   U(       a  SOS nU(       a  SOS n[        U R                  5       H9  u  pU(       a  Xq4-   nUb  X9   OS nU
" XX5      nUS   nU(       d  M1  XS   4-   nM;     U(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )N r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frS   r   .0vs     r1   	<genexpr>2Pix2StructVisionEncoder.forward.<locals>.<genexpr>V  s     m$[q$[s   	last_hidden_stater@   
attentions)	enumerater   r   r   )r-   r@   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r1   rB   Pix2StructVisionEncoder.forward9  s     #7BD$5b4(4OA#$58H$H!.7.CilO(kM)!,M  &91=M<O&O#  5   14D Dm]GZ$[mmm++*
 	
r3   )rP   r|   r   )NNFFT)rE   rF   rG   rH   r   r'   r)   rj   r   r   r   r   r   rB   rI   rJ   rK   s   @r1   r   r   2  s    ,5 ,$ , 26,0"'%* "
||"
 !."
 ELL)	"

  "
 #"
 "
 
uo%	&"
 "
r3   r   c                   @    \ rS rSr% \\S'   Sr\S 5       rS r	S r
Srg)	Pix2StructPreTrainedModeli^  rP   Fc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r)   r   r   r   )r-   r   
input_maskdummy_inputss       r1   r   &Pix2StructPreTrainedModel.dummy_inputsd  s6    LL.	\\*-
!*"&0

 r3   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g[        U[        5      (       Gaf  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR                  R                  R
                  R                  SX#S-  -  S9  [        UR                  S5      (       aE  UR                  R                  b.  UR                  R                  R
                  R!                  5         UR"                  R                  R
                  R                  SX#S-  -  S9  [        UR"                  S5      (       aE  UR"                  R                  b.  UR"                  R                  R
                  R!                  5         UR$                  R                  R
                  R                  SX$S-  -  S9  [        UR$                  S5      (       aG  UR$                  R                  b/  UR$                  R                  R
                  R!                  5         ggg[        U[&        5      (       Ga  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R(                  OU R                   R                  n[        U R                   [        5      (       a   U R                   R                  R*                  OU R                   R*                  nUR,                  R                  R
                  R                  SX#U-  S-  -  S9  UR.                  R                  R
                  R                  SX#S-  -  S9  UR0                  R                  R
                  R                  SX#S-  -  S9  UR2                  R                  R
                  R                  SX&U-  S-  -  S9  UR4                  (       a4  UR6                  R                  R
                  R                  SX#S-  -  S9  gg[        U[8        R:                  5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR                  R
                  R                  SX#S-  -  S9  UR<                  b2  UR                  R
                  UR<                     R!                  5         gg[        U[>        5      (       a  [        U R                   [        5      (       a   U R                   R                  R                  OU R                   R                  nUR@                  R                  R
                  R                  SX#S-  -  S9  g[        U[8        RB                  [8        RD                  45      (       a  [8        RF                  RI                  UR                  R
                  RK                  [L        RN                  5      SU R                   RP                  S9RK                  UR                  RR                  5      UR                  l        UR                  b%  UR                  R
                  R!                  5         gg[        U[        5      (       a4  UR                  b&  UR                  R
                  R                  S5        gg[        U[8        R:                  5      (       ax  UR                  R
                  R                  SU R                   RP                  S9  UR<                  b2  UR                  R
                  UR<                     R!                  5         ggg)zInitialize the weights      ?        g      )r;   stdrq   N)*rP   initializer_factorr   r#   r+   datafill_ Pix2StructTextDenseGatedActDenser   text_configr.   r   r   normal_hasattrrq   zero_r   r   Pix2StructTextAttentionrr   	num_headsrx   ry   rz   r{   has_relative_attention_biasrelative_attention_biasr   rW   padding_idxPix2StructTextModellm_headrT   Conv2dinittrunc_normal_r8   r)   r9   initializer_ranger=   )r-   modulefactorr.   r   rs   ru   s          r1   _init_weights'Pix2StructPreTrainedModel._init_weightso  s   //f122MM$$Vc\2 @AA dkk+;<< ''33[[,, 
 4>dkkK[3\3\4;;**//bfbmbmbrbrDKK##++&UYDY:Z+[v{{F++0@0@0L  %%++-KK##++&UYDY:Z+[v{{F++0@0@0L  %%++-II!!))sD.8Q)Rvyy&))fiinn.H		##))+ /I) 788
 dkk+;<< ''33[[,,  1;4;;HX0Y0Y'',,_c_j_j_v_v 
 dkk+;<< ''11[[**  LL$$,,#6TfFfkoEo;p,qJJ""**PTCT9U*VLL$$,,#6RVEV;W,XMM  %%--3FQcGchlFl<m-n11..55::BBQWlp[pQqBr 2-- dkk+;<< ''33[[,,  MM&&CVPT?T5U&V!!-""6#5#56<<> . 344 dkk+;<< ''33[[,,  NN!!&&..CVX\G\=].^BII 677 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( ' 344}}(""((- )--MM&&CT[[5R5R&S!!-""6#5#56<<> . .r3   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r6   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rP   decoder_start_token_idpad_token_id
ValueErrorr   r)   fullr   cat	new_zerosclonemasked_fill_)r-   r   r  r  shifted_input_idss        r1   _shift_right&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)<  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r3   r   N)rE   rF   rG   rH   r   __annotations___can_compile_fullgraphpropertyr   r  r!  rI   r   r3   r1   r   r   ^  s,    " M?`!r3   r   c                     ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S\\\\   4   S	S
4S jr\      SS\\R$                     S\\R$                     S\\R$                     S\\   S\\   S\\   S	\\\4   4S jj5       rSrU =r$ )Pix2StructVisionModeli  rP   r`   Tr   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        UR                  UR                  S9U l
        U R                  5         g Nr   )r&   r'   rP   rN   re   r   encoderr#   r.   r   	layernorm	post_initr^   s     r1   r'   Pix2StructVisionModel.__init__  sS     4V<.v6,V-?-?VEZEZ[ 	r3   c                 .    U R                   R                  $ rS   )re   rV   r-   s    r1   get_input_embeddings*Pix2StructVisionModel.get_input_embeddings  s    ///r3   heads_to_prunerQ   Nc                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr*  r   r   prune_heads)r-   r2  r   headss       r1   _prune_heads"Pix2StructVisionModel._prune_heads  s<    
 +002LELLu%//;;EB 3r3   r   r   r   r   r   c           	      &   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUc   UR                  SS9S:g  R                  5       nU R                  X0R                   R                  5      nU R                  U5      nU R                  UUUUUUS9nUS   n	U R                  U	5      n	U(       d
  U	4n
XSS -   $ [        U	UR                  UR                  S9$ )	a  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
    Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
    [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
    paper](https://huggingface.co/papers/2210.03347) (figure 5) for more details.

Example:

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, Pix2StructVisionModel

>>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
>>> list(last_hidden_states.shape)
[1, 2048, 768]
```
Nz%You have to specify flattened_patchesr6   r  r   )r   r   r   r   r   r   r   )rP   r   r   use_return_dictr  sumfloatget_head_maskr   re   r*  r+  r   r@   r   )r-   r`   r   r   r   r   r   embedding_outputencoder_outputssequence_outputhead_outputss              r1   rB   Pix2StructVisionModel.forward  s4   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$DEE!/333;q@GGIN &&y++2O2OP	??+<=,,)/!5# ' 
 *!,..9+-L!""555-)77&11
 	
r3   )rP   re   r*  r+  )NNNNNN)rE   rF   rG   rH   r   r#  main_input_namesupports_gradient_checkpointing_no_split_modulesr'   r0  dictintlistr7  r   r   r)   rj   r   r   r   r   rB   rI   rJ   rK   s   @r1   r'  r'    s    "")O&*#01
5 
0C4T#Y+? CD C  5915,0,0/3&*N
#ELL1N
 !.N
 ELL)	N

 $D>N
 'tnN
 d^N
 
u00	1N
 N
r3   r'  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )r  iL  rP   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g ro   r   r^   s     r1   r'   )Pix2StructTextDenseGatedActDense.__init__M  r   r3   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rS   r   r   s       r1   rB   (Pix2StructTextDenseGatedActDense.forwardU  r   r3   r   	rE   rF   rG   rH   r   r'   rB   rI   rJ   rK   s   @r1   r  r  L  s    /3 / r3   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Pix2StructTextLayerFFii  rP   c                    > [         TU ]  5         [        U5      U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r)  )r&   r'   r  DenseReluDenser#   r.   layer_norm_epsilon
layer_normr   r[   r\   r]   r^   s     r1   r'   Pix2StructTextLayerFF.__init__j  sK    >vF-f.@.@fF_F_`zz&"5"56r3   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ rS   )rT  rR  r]   )r-   r@   forwarded_statess      r1   rB   Pix2StructTextLayerFF.forwardr  s;    ??=9../?@%5E(FFr3   )rR  r]   rT  rN  rK   s   @r1   rP  rP  i  s    73 7 r3   rP  c                      ^  \ rS rSr SS\S\\   4U 4S jjjr\SS j5       r	SS jr
\" SSS	S
9         SS j5       rSrU =r$ )r  iy  rP   	layer_idxc                   > [         TU ]  5         X l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        X0l        Uc-  [        R                  SU R                   R"                   S35        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        [$        R&                  " U R
                  U R
                  SS9U l        U R                  (       a0  [$        R0                  " U R                  U R                  5      U l        [5        5       U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Frp   )r&   r'   r  relative_attention_num_bucketsrelative_attention_max_distancer.   rr   rs   r  ru   r\   r]   rw   rZ  loggerwarning_oncer0   rE   r   rT   rx   ry   rz   r{   rW   r	  setpruned_headsr|   r-   rP   r  rZ  r0   s       r1   r'    Pix2StructTextAttention.__init__z  so    	+F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(E&+#r3   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r5   r   )r8   r)   rb   absr   
zeros_likelogr<  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r1   _relative_position_bucket1Pix2StructTextAttention._relative_position_bucket  s   . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r3   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nOUSS2S4   R                  U5      n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  USU R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r=   r   F)rl  rm  rn  )r5   r   r   r   )r	  r+   r   r)   arangerb   r8   rs  r\  r]  permute	unsqueeze)
r-   query_length
key_lengthr   cache_positioncontext_positionmemory_positionrk  relative_position_bucketvaluess
             r1   compute_bias$Pix2StructTextAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+>#'#A#A;;==	 $B $
  --.FG	*44Q7r3   past_key_valuepast_key_values4.58new_nameversionc                    UR                   SS u  pUSLnU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUb[  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aU  U(       aN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbN  U(       d  U
OSn
UR%                  UUU R                  SU
05      u  nnU(       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nUc  UR                   S   nUb  UOU
S   S-   nU R*                  (       db  [&        R,                  " SU R                  UU4UR.                  UR0                  S	9nU R2                  (       a  U R4                  (       a  SUl        O.U R9                  UUUR.                  U
S
9nUSS2SS2U* S2SS24   nUb#  USS2SS2SS2SUR                   S   24   nUU-   nU R:                  (       aS  [&        R<                  " UR                   S   5      nSU[?        U R:                  5      '   USS2URA                  5       4   nOUnUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UU RL                  U R4                  S9nUb  UU-  n[&        R(                  " UU5      nUR                  SS5      RO                  5       nUR                  USU RP                  5      nU RS                  U5      nUU4nU	(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nr5   r6   r   r{  Tr   r   )r   r{  r   r  r   )*r   rx   r   ru   rs   r   r   r
   
is_updatedgetrZ  cross_attention_cacheself_attention_cachelayerskeysr  ry   rz   updater)   r   r  r   r   r=   r|   r   r   r  ra  r*   rH  r   r   r   r   r<  r   r]   r   rw   r{   )r-   r@   maskkey_value_statesr   r  r   ry  	use_cacher   r{  r   r   is_cross_attentionr   r  curr_past_key_valuecurrent_statesr   r   r   rz  real_seq_lengthcausal_maskr   r   r   r   s                               r1   rB   Pix2StructTextAttention.forward  s   & "/!4!4Ra!8
 .T9zz-0#((RtG^G^_iijkmno &:oGZ+[+[(3377GJ!&5&K&K#&5&J&J#"1-?)]/j,33DNNCHHJ.55dnnELLL.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+>+E+Ednn?OQ_>`,(
L &AEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fkk+../Gr3   )r]   r|   r  r.   rw   ry   rs   rZ  ru   r{   ra  rx   r	  r]  r\  rz   FN)T       )NN)	NNNNNNFFN)rE   rF   rG   rH   r   r   rG  r'   staticmethodrs  r  r   rB   rI   rJ   rK   s   @r1   r  r  y  s    jn,*,ZbcfZg, ,> -  - `* %0A6R l Slr3   r  c                   h   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9       SS j5       rS	r	U =r
$ ) Pix2StructTextLayerSelfAttentioniQ  rZ  c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr  rZ  r   r&   r'   r  r   r#   r.   rS  rT  r   r[   r\   r]   rb  s       r1   r'   )Pix2StructTextLayerSelfAttention.__init__R  sR    0W`
 .f.@.@fF_F_`zz&"5"56r3   r  r  r  r  c	                     U R                  U5      n	U R                  U	UUUUUUUS9n
XR                  U
S   5      -   nU4U
SS  -   nU$ )N)r  r   r   r  r  r   r{  r   r   rT  r   r]   )r-   r@   r   r   r   r  r  r   r{  normed_hidden_statesr   r   s               r1   rB   (Pix2StructTextLayerSelfAttention.forwardZ  sr      $}=>> '++/) * 	
 &5Ea5H(II "%5ab%99r3   r   r]   rT  r  )NNNNFFNrE   rF   rG   rH   r   rG  r'   r   rB   rI   rJ   rK   s   @r1   r  r  Q  sP    7XVY] 7 7 %0A6R  Sr3   r  c                   j   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9        SS j5       rS	r	U =r
$ )!Pix2StructTextLayerCrossAttentioniw  rZ  c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   r  )r-   rP   rZ  r0   s      r1   r'   *Pix2StructTextLayerCrossAttention.__init__x  sP    0UZfop-f.@.@fF_F_`zz&"5"56r3   r  r  r  r  c                     U R                  U5      nU R                  UUUUUUUUU	U
S9
nXR                  US   5      -   nU4USS  -   nU$ )N)	r  r  r   r   r  r  ry  r   r{  r   r   r  )r-   r@   r  r   r   r   r  r  ry  r   r{  r  r   r   r   s                  r1   rB   )Pix2StructTextLayerCrossAttention.forward~  sw      $}=>> -'++%/) * 
 %||4DQ4G'HH/$4QR$88r3   r  rS   )NNNNFNFNr  rK   s   @r1   r  r  w  sR    7(3- 7 7 %0A6R
  Sr3   r  c                   r   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9            SS j5       rS	r	U =r
$ )Pix2StructTextBlocki  rZ  c                    > [         TU ]  5         [        UUUS9U l        [	        UUS9U l        [        U5      U l        g )Nr  )rZ  )r&   r'   r  self_attentionr  encoder_decoder_attentionrP  r   rb  s       r1   r'   Pix2StructTextBlock.__init__  sH    >(C
 *K*
&
 )0r3   r  r  r  r  c                     U R                  UUUUU	U
UUS9nUS   nUSS  nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUS LnU(       a  U R                  UUUUUU	US   S-   U
US9	nUS   nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUUSS  -   nU R                  U5      nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU4nUU-   $ )N)r   r   r   r  r  r   r{  r   r   i  )r   r   r6   )r  r   r   r   r  ry  r  r   )r  r=   r)   r>   isinfanyr   r   clampr  r   )r-   r@   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr  r  r   r   r{  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r1   rB   Pix2StructTextBlock.forward  s   " "&!4!4)'++/) "5 	"
 /q12126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM2$>&*&D&D!65; : /+B/!3#"3 'E 
'# 4A6M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM "***r3   )r  r   r  r  )NNNNNNNNFFTNr  rK   s   @r1   r  r    sa    1XVY] 1 1  %0A6R "#&*#'C+ SC+r3   r  z3
    The standalone text decoder of Pix2Struct
    )custom_introc            #         ^  \ rS rSr% \\S'   S/rS/rSrU 4S jr	S r
\              S"S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                      S\\   S\\   S\\   S\\   S\\R                     S\\   S\\R                     S\\\R                  S4   \4   4S jj5       r S#S	\\R                   S4   S\R                   S\R                   S\S\4
S jjr\S	\R                   S\S\S\R4                  S\R                   S\4S  j5       rS!rU =r$ )$r  i  rP   r  zlm_head.weightTc                 R  > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U[        US:H  5      US9PM     sn5      U l        [        UR
                  UR                  S9U l        [        R                   " UR"                  5      U l        [        R&                  " UR
                  UR                  SS9U l        U R+                  5         SU l        g s  snf )Nr   r  r   Frp   )r&   r'   r   rW   
vocab_sizer.   embed_tokensr   r   
num_layersr  r   r   r#   rS  final_layer_normr[   r\   r]   rT   r  r,  r|   )r-   rP   r   r0   s      r1   r'   Pix2StructTextModel.__init__  s     LL):):F<N<NO]] v0011A $FQRSV`ab1

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   (!D$c                     Xl         g rS   )r  r-   new_embeddingss     r1   set_input_embeddings(Pix2StructTextModel.set_input_embeddings  s    *r3   r   r   r  r  inputs_embedsr   cross_attn_head_maskr  r  r   r   labelsr   r{  rQ   .c                 
   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nU R
                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	Ub  Ub  [        S5      eUb&  UR                  5       nUR                  SUS   5      nO"Ub  UR                  5       SS nO[        S5      eUc%  U R                  c   S5       eU R                  U5      nUu  nnU	(       a`  Uc]  U R                   R                  (       a/  [        [        U R                   S9[        U R                   S95      nO[        U R                   S9nS	nUb  US	   nOUb  UR!                  5       nUc#  ["        R$                  " UUU-   UR&                  S
9nUc8  Ub  UR!                  5       U-   OUn["        R(                  " UUUR&                  S
9nU R                   R*                  (       a7  U R-                  UUU[/        U[        5      (       a  UR0                  OUU
5      nOVUSS2SSSS24   nUR3                  UR4                  S9nSU-
  ["        R6                  " UR4                  5      R8                  -  nUbL  UR                  5       u  nnnUU4nUc  ["        R(                  " UUR&                  S
9nU R;                  U5      nOSnU R=                  X`R                   R>                  5      nU R=                  XpR                   R>                  5      nU(       a  SOSnU
(       a  SOSnU
(       a  SOSnSnSnU RA                  U5      n [C        U RD                  5       Hi  u  n!n"UU!   n#UU!   n$U(       a  UU 4-   nU"" U UUUUUU#U$UU	U
US9n%U%S	   n U%S   nUb  U%U
(       a  SOS   nU
(       d  MR  UU%S   4-   nUc  M`  UU%S   4-   nMk     U RG                  U 5      n U RA                  U 5      n U RI                  U 5      n&U(       a  UU 4-   nSn'Ub  UR3                  U&R&                  5      n[J        RL                  " SSS9n(U(" U&RO                  5       R                  SU&R                  S5      5      URO                  5       R                  S5      5      n'U(       d  [Q        S U'U&UUUU4 5       5      $ [S        U'U&UUUUS9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
    embeddings so you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
    Training](./t5#training).
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoProcessor, Pix2StructTextModel

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> loss = outputs.loss
```
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer6   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddings)rP   r   r   )r=   r   r   )r   r  r  r  r   r{  r   r   r5      r  r;   )ignore_index	reductionc              3   0   #    U  H  nUc  M  Uv   M     g 7frS   r   r   s     r1   r   .Pix2StructTextModel.forward.<locals>.<genexpr>  s"      A  s   	)losslogitsr  r@   r   cross_attentions)*rP   r  r   r   r:  r|   r   r^  warningr  sizer   r  is_encoder_decoderr
   r	   get_seq_lengthr)   rv  r   r*   
is_decoder_update_causal_maskr   r  r8   r=   r   r   invert_attention_maskr=  r  r]   r   r   r  r  r   CrossEntropyLossr   r   r   ))r-   r   r   r  r  r  r   r  r  r  r   r   r  r   r{  kwargsinput_shaper   r   past_key_values_lengthmask_seq_lengthr  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  r@   r   r   r   r  r   r  r  loss_fcts)                                            r1   rB   Pix2StructTextModel.forward  s=   f "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]&&4==YNNl I ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
J0{{--"5 4l$++6V# #/dkk"B!"%%3A%6"(%4%C%C%E"!"\\&(>(KTaThThN ! BQA\..0:=bl  #ZZ
OML`L`aN;;!!22o/BCC  44$!K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK !,=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d&7rd(,%]3(4OA|'lO)=a)@&#$58H$H!(%/- /+E /#"3-M *!,M
 *!,M$00=CTaZ[0\-  !/=3C2E!E(4+?=QRCSBU+U(C  5F --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD  #%"(   1++%1
 	
r3   r    input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fsdpa)r  r  is_trainingr   r6   )sequence_lengthtarget_lengthr=   r{  r   )cudaxpunpu)rP   _attn_implementationr  r   r)   rj   r!   r  is_compileabler   _ignore_causal_mask_sdpar   r=   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r-   r   r  r{  r  r   past_seen_tokensusing_compilable_cacher=   r  r  r  	min_dtypes                r1   r  'Pix2StructTextModel._update_causal_mask  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr3   r  r  r=   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr  )
fill_valuer=   r   r   )diagonalr  r6   r   )r   r)   r   r   r  r   triurv  reshapeexpandr  r   r8   r   )r   r  r  r=   r{  r   r  r  r  mask_lengthpadding_masks              r1   r   IPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position5  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r3   )r]   r  r  r|   r   r  )NNNNNNNNNNNNNN)F)rE   rF   rG   rH   r   r#  rE  _tied_weights_keysrD  r'   r  r   r   r)   
LongTensorFloatTensorrj   r   r   r   r   r   rB   r  r  rG  r=   r   rI   rJ   rK   s   @r1   r  r    sR    ! ./*+&*#,&+  156:=A>B48157;+/$(,0/3-1&*59V
E,,-V
 !!2!23V
  ((9(9:	V

 !)):): ;V
   0 01V
 E--.V
 'u||4V
 "%V
 D>V
 $D>V
 'tnV
 ))*V
 d^V
 !!1!12V
" 
uU&&+,.OO	P#V
 V
~ #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r3   r  zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c            &       h  ^  \ rS rSr% \\S'   SrS/rS\4U 4S jjrS r	S r
S\R                  4S	 jrS
 rS r\                SS\\R&                     S\\R&                     S\\R(                     S\\R*                     S\\R&                     S\\R&                     S\\R,                     S\\\\R&                           S\\   S\\R(                     S\\R,                     S\\   S\\   S\\   S\\   S\\R(                     S\\\R&                     \4   4"S jj5       rSrU =r$ )"Pix2StructForConditionalGenerationin  rP   r`   zdecoder.lm_head.weightc                    > [         TU ]  U5        [        UR                  5      U l        [        UR                  5      U l        UR                  U l        U R                  5         g rS   )
r&   r'   r'  vision_configr*  r  r  decoderis_vqar,  r^   s     r1   r'   +Pix2StructForConditionalGeneration.__init__x  sK     ,V-A-AB*6+=+=>mm 	r3   c                 6    U R                   R                  5       $ rS   )r  r0  r/  s    r1   r0  7Pix2StructForConditionalGeneration.get_input_embeddings  s    ||0022r3   c                 :    U R                   R                  U5        g rS   )r  r  r  s     r1   r  7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).9r3   rQ   c                 6    U R                   R                  5       $ rS   )r  get_output_embeddingsr/  s    r1   r  8Pix2StructForConditionalGeneration.get_output_embeddings  s    ||1133r3   c                 :    U R                   R                  U5        g rS   )r  set_output_embeddingsr  s     r1   r"  8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:r3   c                     U R                   $ rS   )r*  r/  s    r1   get_encoder.Pix2StructForConditionalGeneration.get_encoder  s    ||r3   r   r   r   r   decoder_head_maskr  r?  r  r  decoder_inputs_embedsr  r   r   r   r{  c                 <   Ub  UOU R                   R                  R                  nUb  UOU R                   R                  nUc  U R	                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU
bX  UcU  UcR  U R                  U
5      nUb  UO2UR                  U R                   R                  5      R                  5       nSUSS2S4'   U R                  UUUU	UUUUUUUU
UUS9nU(       d  UU-   $ [        UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  UR"                  UR$                  S9	$ )	a}  
flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
    Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
    `num_channels` * `patch_size` * `patch_size`

    The process of flattening the pixel patches is done by `Pix2StructProcessor`.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss for the decoder.

Example:

Inference:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> # autoregressive generation
>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A stop sign is on a street corner.

>>> # conditional generation
>>> text = "A picture of"
>>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

>>> generated_ids = model.generate(**inputs, max_new_tokens=50)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> print(generated_text)
A picture of a stop sign with a red stop sign
```

Training:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

>>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
>>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "A stop sign is on the street corner."

>>> inputs = processor(images=image, return_tensors="pt")
>>> labels = processor(text=text, return_tensors="pt").input_ids

>>> # forward pass
>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> print(f"{loss.item():.5f}")
5.94282
```N)r`   r   r   r   r   r   r   r   r5   r   )r   r   r  r  r  r  r   r  r  r   r   r  r   r{  )	r  r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rP   r  r  r:  r*  r   r   lenr!  ner  r<  r  r   r  r  r  r@   r   r  r   )r-   r`   r   r   r   r   r'  r  r?  r  r  r(  r  r   r   r   r{  r@   decoder_outputss                      r1   rB   *Pix2StructForConditionalGeneration.forward  s   d "+!6IDKK<S<S<]<]	%0%<k$++B]B] ""ll"3-#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1'!5/!5#) ' 
" "_44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r3   )r  r*  r  )NNNNNNNNNNNNNNNN)rE   rF   rG   rH   r   r#  rC  r  r'   r0  r  r   Moduler  r"  r%  r   r   r)   r  r  
BoolTensorrj   r   r   r   r   r   rB   rI   rJ   rK   s   @r1   r  r  n  s    )O23	/ 	3:4ryy 4;  :>6:8<=A159=7;EI+/-18<$(,0/3&*59#q
#E$5$56q
 !!2!23q
 $E$4$45	q

 !))9)9 :q
 E--.q
 $E$5$56q
 'u||4q
 "%e.?.?(@"ABq
 "%q
 ))*q
  (5q
 D>q
 $D>q
 'tnq
  d^!q
" !!1!12#q
$ 
uU&&');;	<%q
 q
r3   r  )r   r  r'  r  )Hri   rh  typingr   r   r)   r   activationsr   cache_utilsr   r	   r
   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r   utils.deprecationr   configuration_pix2structr   r   r   !torch.nn.attention.flex_attentionr    integrations.flex_attentionr!   
get_loggerrE   r^  r2  r#   apex.normalizationrL   infoImportError	Exceptionr  rN   rl   r   r   r   r   r'  r  rP  r  r  r  r  r  r  __all__r   r3   r1   <module>rG     s      "   ! C C ) > 9  .   1 d d  !!;J 
		H	%+")) +2	/&
KKij! !H^		 ^D")) :(6 (V)
bii )
X y! y! y!x l
5 l
 l
`ryy :BII  Tbii Tp"ryy "L$		 $NU+4 U+p 
p3 p
pf 
Q
)BO Q

Q
hW/  	 	
NN_`	s   1G+ +H4HH