
    cCi                     >   S r SSKrSSKJrJr  SSKrSSKJr  SSKJr  SSK	J
r
  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SSK J!r!  \RD                  " \#5      r$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\RP                  5      r) " S S\RP                  5      r* " S S\5      r+\ " S S\5      5       r, " S S\,5      r-\" S S!9 " S" S#\,5      5       r.\" S$S!9 " S% S&\,\5      5       r/S&S/r0g)'z/PyTorch TrOCR decoder model (based on RoBERTa).    N)OptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )TrOCRConfigc                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S\	\R                     4U 4S	 jjjr
S
rU =r$ )TrOCRLearnedPositionalEmbedding+   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g )N   )offsetsuper__init__)selfr   r   	__class__s      b/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/trocr/modeling_trocr.pyr!   (TrOCRLearnedPositionalEmbedding.__init__0   s"     ++5}E    	input_idspast_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nr   )dtypedevicer   )shapetorcharangelongweightr,   expand	unsqueezer    forwardr   )r"   r'   r(   r)   bszseq_lenr#   s         r$   r5   'TrOCRLearnedPositionalEmbedding.forward6   s    
 $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r&   )r   )r   N)__name__
__module____qualname____firstlineno____doc__intr!   r/   Tensorr   r5   __static_attributes____classcell__r#   s   @r$   r   r   +   sW    Fs F3 F pt;;?B;V^_d_k_kVl; ;r&   r   c            
       r   ^  \ rS rSrSrSS\S\S\S\\   4U 4S jjjrS\	R                  4U 4S	 jjrS
rU =r$ )TrOCRScaledWordEmbeddingG   zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
r   r   padding_idxembed_scalec                 2   > [         TU ]  XU5        X@l        g N)r    r!   rG   )r"   r   r   rF   rG   r#   s        r$   r!   !TrOCRScaledWordEmbedding.__init__L   s    D&r&   r'   c                 <   > [         TU ]  U5      U R                  -  $ rI   )r    r5   rG   )r"   r'   r#   s     r$   r5    TrOCRScaledWordEmbedding.forwardP   s    wy)D,<,<<<r&   rG   )      ?)r9   r:   r;   r<   r=   r>   r   floatr!   r/   r?   r5   r@   rA   rB   s   @r$   rD   rD   G   sJ    's '3 'S '_ghm_n ' '= = =r&   rD   c            	          ^  \ rS rSrSrSS\S\S\\   4U 4S jjjr\SS\S\S\\   4S jj5       r	\
R                  " 5       SS	\
R                  S
\4S jj5       r SS	\
R                  S\S
\\   4S jjrSrU =r$ )"TrOCRSinusoidalPositionalEmbeddingT   zDThis module produces sinusoidal positional embeddings of any length.num_positionsr   rF   c                    > [         TU ]  5         SU l        X l        X0l        U R                  XU5      U l        U R                  S[        R                  " S5      5        g )Nr   _float_tensorr   )
r    r!   r   r   rF   get_embeddingweightsregister_bufferr/   FloatTensor)r"   rS   r   rF   r#   s       r$   r!   +TrOCRSinusoidalPositionalEmbedding.__init__W   sP    *&))-T_e.?.?.BCr&   r   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   )r+   r   dimr-   N)mathlogr/   expr0   int64rO   r4   catsincosviewzerostoget_default_dtype)r   r   rF   half_dimembs        r$   rV   0TrOCRSinusoidalPositionalEmbedding.get_embedding_   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r&   r'   r(   c                 J   UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nU R
                  b  X`R
                  R                  S5      :  a+  U R                  X`R                  U R                  5      U l        U R
                  R                  U R                  5      U l        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       nU$ )Nr   r   r-   )size"create_position_ids_from_input_idsrF   rg   r,   rW   rV   r   rU   index_selectre   detach)r"   r'   r(   r6   r7   r)   max_posxs           r$   r5   *TrOCRSinusoidalPositionalEmbedding.forwardr   s     ~~'>>yJZJZ\rsvv

 ""Q&0<<7\\->->q-A#A--g7I7I4K[K[\DL||t'9'9:LL%%a):):2)>?DDSSUV]]_r&   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.
r   r\   )ner>   r/   cumsumtype_asr1   )r"   r'   rF   r(   maskincremental_indicess         r$   rn   ETrOCRSinusoidalPositionalEmbedding.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r&   )r   r   rF   rW   rI   )r   )r9   r:   r;   r<   r=   r>   r   r!   staticmethodrV   r/   no_gradr?   r5   rn   r@   rA   rB   s   @r$   rQ   rQ   T   s    NDc D# DHUXM D D 1c 1# 1HUXM 1 1$ ]]_ s  & bc
8
847
8QYZ]Q^
8 
8r&   rQ   c                     ^  \ rS rSrSr       SS\S\S\\   S\\   S\\   S\\   S	\\   S
\\   S\\   4U 4S jjjr	\
" SSSS9      SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\\   S\\R                     S\\R                  \\R                     \\\R                        4   4S jj5       rSrU =r$ )TrOCRAttention   z>Multi-headed attention from 'Attention Is All You Need' paper.	embed_dim	num_headskdimvdimdropout
is_decoderbiasis_cross_attention	layer_idxc                 2  > [         TU ]  5         X l        Ub  UOUU l        Ub  UOUU l        X0l        X`l        X#-  U l        U R                  U-  U R                  :X  d  [        SU R                   SU S35      eU R                  S-  U l	        Xpl
        Xl        [        R                  " U R                  X(S9U l        [        R                  " U R                  X(S9U l        [        R                  " X"US9U l        [        R                  " X"US9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r   )r    r!   r   r   r   r   r   head_dim
ValueErrorscalingr   r   r   Lineark_projv_projq_projout_proj)r"   configr   r   r   r   r   r   r   r   r   r#   s              r$   r!   TrOCRAttention.__init__   s     	" ,D)	 ,D)	"!.	)T^^;MdnnM] ^;b"  }}d*$"ii		9@ii		9@ii	4@		)TBr&   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskoutput_attentionscache_positionreturnc                    USLnUR                  5       u  pnU R                  U5      U R                  -  nSnUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U	SU R                   U R"                  5      R%                  SS5      nUR                  U	SU R                   U R"                  5      R%                  SS5      nUbc  U(       d  UOSnWR'                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR
                  U R                  '   XR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR(                  " U6 nUR(                  " U6 nUR(                  " U6 nUR                  S5      n[*        R,                  " UUR%                  SS5      5      nUR                  5       XR                   -  U
U4:w  a.  [/        SXR                   -  U
U4 S	UR                  5        35      eUbz  UR                  5       U	SU
U4:w  a#  [/        S
U	SU
U4 S	UR                  5        35      eUR                  XR                   U
U5      U-   nUR                  XR                   -  U
U5      n[0        R2                  R5                  USS9nUb  UR                  5       U R                   4:w  a*  [/        SU R                   4 S	UR                  5        35      eUR                  SSSS5      UR                  XR                   U
U5      -  nUR                  XR                   -  U
U5      nU(       a=  UR                  XR                   U
U5      nUR                  XR                   -  U
U5      nOSn[0        R2                  R7                  UU R6                  U R8                  S9n[*        R,                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [/        SXR                   XR"                  4 S	UR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR)                  XU5      nU R;                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNFr-   r   r   r   Tz$Attention weights should be of size z	, but is z!Attention mask should be of size r\   z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )rm   r   r   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   re   r   r   	transposeupdatereshaper/   bmmr   r   
functionalsoftmaxr   r   r   )r"   r   r   r   r   r   r   r   r   r6   tgt_lenr   query_statesr   curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                           r$   r5   TrOCRAttention.forward   s     .T9"/"4"4"6i {{=1DLL@
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~>NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK0111r&   )r   r   r   r   r   r   r   r   r   r   r   r   r   )NN        FTFN)NNNNFN)r9   r:   r;   r<   r=   r>   r   rO   boolr!   r   r/   r?   r	   tupler5   r@   rA   rB   s   @r$   r~   r~      s   H #"#&%*#-2$(!C !C 	!C
 sm!C sm!C %!C TN!C tn!C %TN!C D>!C !CF %0A6R 48+/1526,115q2||q2 #5<<0q2 "%	q2
 !.q2 "%,,/q2 $D>q2 !.q2 
u||Xell3XeELL>Q5RR	Sq2 Sq2r&   r~   c                   T  ^  \ rS rSrSS\4U 4S jjjr\" SSSS9         SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\
   S\	\   S\	\   S\	\R                     4S jj5       rSrU =r$ )TrOCRDecoderLayeri-  r   c                 j  > [         TU ]  5         UR                  U l        [	        UU R                  UR
                  UR                  SUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        UR                   (       am  [	        UU R                  UR
                  UR"                  UR"                  UR                  SSUS9	U l        [        R                  " U R                  5      U l        [        R(                  " U R                  UR*                  5      U l        [        R(                  " UR*                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)r   r   r   r   r   )r   r   r   r   r   r   r   r   )r    r!   hidden_sizer   r~   decoder_attention_headsattention_dropout	self_attnr   r   activation_functionactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normr   cross_attention_hidden_sizeencoder_attnencoder_attn_layer_normr   decoder_ffn_dimfc1fc2final_layer_norm)r"   r   r   r#   s      r$   r!   TrOCRDecoderLayer.__init__.  s=   ++'nn44,,
 ~~#F$>$>?"(";";$&LL$@! ... 88777700#'#
!D ,.<<+GD(99T^^V-C-CD99V33T^^D "T^^ <r&   r   r   r   r   r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacher   c           
         UnU R                  UUUUUU
S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSnUb`  UnU R                  UUUUUUU
S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU(       a  XU4-  nU$ )au  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size *(decoder_attention_heads,)*.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   r   r   r   r   r   N)r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   )r"   r   r   r   r   r   r   r   r   r   r   residualself_attn_weightscross_attn_weightsoutputss                  r$   r5   TrOCRDecoderLayer.forwardR  s   @ ! ,0>>'+)+/) ,: ,
( --m||VZVcVc-d 011-@ " ,$H040A0A+!65 : /"3- 1B 1-M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< "+=>>Gr&   )r   r   r   r   r   r   r   r   r   r   r   rI   )	NNNNNNFTN)r9   r:   r;   r<   r   r!   r   r/   r?   r   r	   r   r5   r@   rA   rB   s   @r$   r   r   -  s   "={ "= "=H %0A6R 268<9=26=A+/,1$(15Q||Q !.Q  (5	Q
 !) 6Q "%,,/Q %-U\\$:Q "%Q $D>Q D>Q !.Q SQr&   r   c                   4    \ rS rSr% \\S'   SrSrS/rS r	Sr
g)	TrOCRPreTrainedModeli  r   modelTr   c                 B   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_stdr   r   r   Conv1dr2   datanormal_r   zero_	EmbeddingrF   )r"   moduler   s      r$   _init_weights"TrOCRPreTrainedModel._init_weights  s    kk""fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r&    N)r9   r:   r;   r<   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr   r@   r   r&   r$   r   r     s"    &*#,-	?r&   r   c                   X   ^  \ rS rSrSrS\4U 4S jjr             SS jrSrU =r	$ )TrOCRDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]

Args:
    config: TrOCRConfig
r   c           
      |  > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  (       a   [        R                  " UR                  5      OSn[        UR                  UR                  U R                  US9U l        UR                  (       a&  [        UR                   UR                  5      U l        O@[%        UR                   U R                  -   S-   UR                  U R                  5      U l        UR&                  (       a&  [(        R*                  " UR                  5      U l        OS U l        [(        R,                  " [/        UR0                  5       Vs/ s H  n[3        XS9PM     sn5      U l        SU l        U R9                  5         g s  snf )NrN   rM   r   )r   F)r    r!   r   decoder_layerdrop	layerdroppad_token_idrF   scale_embeddingr^   sqrtr   rD   
vocab_sizeembed_tokensuse_learned_position_embeddingsr   max_position_embeddingsembed_positionsrQ   layernorm_embeddingr   r   
ModuleListrangedecoder_layersr   r   gradient_checkpointing	post_init)r"   r   rG   ir#   s       r$   r!   TrOCRDecoder.__init__  sT    ~~11!..7=7M7Mdii 2 23SV4v1143C3CQ\
 11#B6CaCacicucu#vD #E..1A1AAAE""  $D  %%')||F4F4F'GD$'+D$mmUZ[a[p[pUq$rUqPQ%6v%KUq$rs&+#	 %ss   F9c                 :   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUb"  UnUR                  SUR                  S   5      nO.Ub   UR                  5       SS nUSS2SS2S4   nO[        S5      eU R                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	U	(       aG  UcD  Ub.  [        [        U R                   S9[        U R                   S95      O[        U R                   S9nU	(       a@  [        U[         5      (       a+  [        R                  S5        [        R"                  " U5      nUb  UR%                  5       OS	nUc  U R'                  U5      nU R                   R(                  (       a  U R+                  UUS
9nOU R+                  UUS
9nUU-   nU R,                  b  U R-                  U5      n[.        R0                  R3                  UU R2                  U R                  S9nUR                  n[5        X/UU5      nUb  Ub  [7        XHR8                  US   S9nU(       a  SOSnU
(       a  SOSnU
(       a  Ub  SOSn[;        XV/SS/5       Hn  u  nnUc  M  UR                  5       S	   [=        U R>                  5      :w  d  M7  [        SU S[=        U R>                  5       SUR                  5       S	    S35      e   [A        U R>                  5       H  u  nnU(       a  UU4-  nU R                  (       a(  [B        RD                  " / 5      nUU RF                  :  a  ML  U" UUUUUb  UU   OSUb  UU   OSUU
U	US9
nUS	   nU
(       d  My  UUS   4-  nUc  M  UUS   4-  nM     U(       a  UU4-  nU(       d  [!        S UUUUU4 5       5      $ [I        UUUUUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
        on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer-   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz^`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r(   r   )r   r   	head_maskcross_attn_head_maskzThe `z` should be specified for z layers, but it is for .)r   r   r   r   r   r   r   r   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frI   r   ).0vs     r$   	<genexpr>'TrOCRDecoder.forward.<locals>.<genexpr>  s      rA rs   	)last_hidden_stater   r   
attentionscross_attentions)%r   r   output_hidden_statesr   use_return_dictr   re   r.   rm   r  r   loggerwarning_oncer   r
   r   r   from_legacy_cacheget_seq_lengthr   r   r  r  r   r   r   r   r   r+   ziplenr   	enumerater/   randr   r   )r"   r'   r   r   r   r  r  r   inputs_embedsr   r   r  return_dictr   inputinput_shaper(   	embed_posr   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_nameidxdecoder_layerdropout_probabilitylayer_outputss                               r$   r5   TrOCRDecoder.forward  sL   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E!r5;;r?;I&',,.s3K!!Q(+Edee&&4==##t "	0 )4 $L$DlZ^ZeZeFfg!5 
 OU;;\
 2CCOTOETE`!?!?!Afg  --i8M;;66,,UKa,bI,,YOe,fI%	1##/ 44]CM--mt||VZVcVc-dkk:8N

 !,1G1S%?&(;(;[QS_&"
 #7BD0d&7<Q<]rdh %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&7)%'=3<3H3dI]Ii,@,Eos /"3#-M *!,M  =#3"55(4(]1-=,??(7 #9<  -!11 ':K^]qr  
 9+++%1
 	
r&   )r   r  r   r  r   r  r   rF   )NNNNNNNNNNNNN)
r9   r:   r;   r<   r=   r   r!   r5   r@   rA   rB   s   @r$   r   r     sJ    { B "#!!O
 O
r&   r   a  
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    )custom_introc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )TrOCRDecoderWrapperi  c                 D   > [         TU ]  U5        [        U5      U l        g rI   )r    r!   r   decoderr"   r   r#   s     r$   r!   TrOCRDecoderWrapper.__init__  s     #F+r&   c                 &    U R                   " U0 UD6$ rI   r4  )r"   argskwargss      r$   r5   TrOCRDecoderWrapper.forward  s    ||T,V,,r&   r8  )r9   r:   r;   r<   r!   r5   r@   rA   rB   s   @r$   r2  r2    s    ,- -r&   r2  zy
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    c            "         ^  \ rS rSrS/rU 4S jrS rS rS rS r	S r
S	 r\              SS
\\R                     S\\R                      S\\R"                     S\\R                     S\\R                      S\\R                      S\\   S\\R"                     S\\R                     S\\   S\\   S\\   S\\   S\\R                      S\\\4   4S jj5       rSrU =r$ )TrOCRForCausalLMi  zoutput_projection.weightc                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr   )r   is_encoder_decoderr    r!   r2  r   r   r   r   r   output_projectionr  r5  s     r$   r!   TrOCRForCausalLM.__init__  sZ     $)! (0
!#6+=+=v?P?PW\!] 	r&   c                 B    U R                   R                  R                  $ rI   r   r4  r   r"   s    r$   get_input_embeddings%TrOCRForCausalLM.get_input_embeddings  s    zz!!...r&   c                 8    XR                   R                  l        g rI   rC  )r"   values     r$   set_input_embeddings%TrOCRForCausalLM.set_input_embeddings  s    */

'r&   c                     U R                   $ rI   r@  rD  s    r$   get_output_embeddings&TrOCRForCausalLM.get_output_embeddings  s    %%%r&   c                     Xl         g rI   rL  )r"   new_embeddingss     r$   set_output_embeddings&TrOCRForCausalLM.set_output_embeddings  s    !/r&   c                 $    XR                   l        g rI   r   r4  )r"   r4  s     r$   set_decoderTrOCRForCausalLM.set_decoder  s    $

r&   c                 .    U R                   R                  $ rI   rT  rD  s    r$   get_decoderTrOCRForCausalLM.get_decoder  s    zz!!!r&   r'   r   r   r   r  r  r   r!  labelsr   r   r  r"  r   r   c                 N   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUUS9nU R                  US   5      nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a	  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import (
...     TrOCRConfig,
...     TrOCRProcessor,
...     TrOCRForCausalLM,
...     ViTConfig,
...     ViTModel,
...     VisionEncoderDecoderModel,
... )
>>> import requests
>>> from PIL import Image

>>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
>>> # init vision2text model with random weights
>>> encoder = ViTModel(ViTConfig())
>>> decoder = TrOCRForCausalLM(TrOCRConfig())
>>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

>>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
>>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
>>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

>>> # load image from the IAM dataset
>>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> pixel_values = processor(image, return_tensors="pt").pixel_values
>>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

>>> # training
>>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
>>> model.config.pad_token_id = processor.tokenizer.pad_token_id
>>> model.config.vocab_size = model.config.decoder.vocab_size

>>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
>>> outputs = model(pixel_values, labels=labels)
>>> loss = outputs.loss
>>> round(loss.item(), 2)
5.30

>>> # inference
>>> generated_ids = model.generate(pixel_values)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> generated_text
'industry, " Mr. Brown commented icily. " Let us have a'
```N)r'   r   r   r   r  r  r   r!  r   r   r  r"  r   r   r-   r   )losslogitsr   r   r  r  )r   r   r  r  r   r4  r@  r   re   r   r   r   r   r  r  )r"   r'   r   r   r   r  r  r   r!  rZ  r   r   r  r"  r   r   r]  r\  loss_fctoutputs                       r$   r5   TrOCRForCausalLM.forward  sK   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5#) % 
  ''
3')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r&   )r   r@  )NNNNNNNNNNNNNN)r9   r:   r;   r<   _tied_weights_keysr!   rE  rI  rM  rQ  rU  rX  r   r   r/   
LongTensorr?   rY   r	   r   r   r   r   r5   r@   rA   rB   s   @r$   r=  r=    s    55	/0&0%"  1515=A=A,07;+/59-1$(,0/3&*15u
E,,-u
 !.u
  ((9(9:	u

 !))9)9 :u
 ELL)u
 'u||4u
 "%u
   1 12u
 ))*u
 D>u
 $D>u
 'tnu
 d^u
 !.u
  
u77	8!u
 u
r&   r=  )1r=   r^   typingr   r   r/   r   torch.nnr   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_trocrr   
get_loggerr9   r  r   r   rD   ModulerQ   r~   r   r   r   r2  r=  __all__r   r&   r$   <module>rr     s.   6  "   % ! C C ) : l - , 0 , 
		H	%;bll ;8
=r|| 
=;8 ;8|X2RYY X2vw2 wt ?? ? ?$v
' v
r -. -- 
V
+_ V

V
r 5
6r&   