
    cCi                       S r SSKrSSKrSSKrSSKJrJrJr  SSKrSSKJ	r	  SSK
Jr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-  \&" 5       (       a  SSK.J/r/  SSK0J1r1  \)Rd                  " \35      r4SUS\Rj                  S\6S\6S\6S\Rj                  4
S jjr7S\Rj                  S\6S\6S\Rj                  4S jr8SUS\Rj                  S\6S\6S\6S\Rj                  4
S jjr9S\6S\Rj                  4S jr:S \Rj                  S\6S\Rj                  4S! jr;S"\Rj                  S\6S#\Rx                  S\Rj                  4S$ jr=S"\Rj                  S%\6S\>\Rj                  \Rj                  4   4S& jr?S"\Rj                  S%\6S\Rj                  4S' jr@S(\Rj                  S)\Rj                  S*\6S\Rj                  4S+ jrA " S, S-\	R                  5      rC SS.KDJErE  \ErC\4R                  S/5         " S1 S2\	R                  5      rJ " S3 S4\	R                  5      rK " S5 S6\	R                  5      rL " S7 S8\	R                  5      rM " S9 S:\	R                  5      rN " S; S<\	R                  5      rO " S= S>\	R                  5      rP " S? S@\	R                  5      rQ " SA SB\	R                  5      rR " SC SD\	R                  5      rS " SE SF\5      rT\% " SG SH\5      5       rU " SI SJ\U5      rVSKrW\% " SL SM\U5      5       rX\%" SNSO9 " SP SQ\U\5      5       rY\% " SR SS\U5      5       rZ/ STQr[g! \G a     GN=\H a    \4R                  S05         GNVf = f)VzPyTorch LongT5 model.    N)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
def _pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
    """Pad a tensor so that a sequence length will be a multiple of `block_len`"""
    pad_len = -x.shape[dim] % block_len
    # Handle cases when an empty input sequence is given
    if not all(x.shape):
        new_shape = list(x.shape)
        new_shape[dim] += pad_len
        return torch.zeros(new_shape, dtype=x.dtype)

    pad = [(0, 0)] * x.ndim
    pad[dim] = (0, pad_len)
    pad = sum(pad[::-1], ())
    x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)
    return x


def _split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
    """Split an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    """
    # pad tensor to multiple of block_len
    if x.shape[dim] % block_len != 0:
        x = _pad_to_multiple(x, block_len, dim, pad_value=0)
    num_blocks = x.shape[dim] // block_len
    output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[(dim + 1) :]
    # If 0 is in output_shape, we cannot apply reshape because of incompatibility with ONNX conversion
    if 0 in output_shape:
        return torch.empty(output_shape, dtype=x.dtype, device=x.device)
    return x.reshape(output_shape)
def _concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
    """Concatenate three consecutive blocks for each input block for local attention.

    For more information, see: https://huggingface.co/papers/2112.07916.
    """
    num_blocks = x.shape[block_dim]

    pad = [(0, 0)] * x.ndim
    pad[block_dim] = (1, 1)
    pad = sum(pad[::-1], ())
    # [batch_size, num_blocks, block_len] -> [batch_size, num_blocks + 2, block_len]
    x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)

    blocks_list: list[torch.Tensor] = []
    for i in range(3):
        # We use indexing approach here:
        # https://numpy.org/doc/stable/user/basics.indexing.html#dealing-with-variable-numbers-of-indices-within-programs
        indices = [slice(0, None)] * x.ndim
        indices[block_dim] = slice(i, i + num_blocks)
        indices = tuple(indices)
        blocks_list.append(x[indices])
    # [batch_size, num_blocks, 3 * block_len, ...]
    return torch.cat(blocks_list, dim=sequence_dim)
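# Continuing the sketch above (illustrative only): after blocking a (2, 10, 8) input into
# (2, 3, 4, 8), concatenating each block with its left and right neighbours (zero-padded at the
# edges) triples the per-block sequence dimension:
#
#   blocked = _split_into_blocks(torch.zeros(2, 10, 8), block_len=4, dim=1)  # (2, 3, 4, 8)
#   _concatenate_3_blocks(blocked, block_dim=1, sequence_dim=2).shape        # (2, 3, 12, 8)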
def _make_3block_relative_position_ids(block_len: int) -> torch.Tensor:
    """Makes 3-blocked relative position ids for local attention."""
    position_ids = torch.arange(3 * block_len, dtype=torch.int32)
    center_position_ids = position_ids[block_len:-block_len]
    # [block_len, 3 * block_len]
    relative_position_ids = position_ids.unsqueeze(0) - center_position_ids.unsqueeze(1)
    return relative_position_ids


def _mask_local_attention_mask(local_attention_mask: torch.Tensor, block_len: int) -> torch.Tensor:
    """Mask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius``."""
    relative_position_ids = _make_3block_relative_position_ids(block_len)
    locality_mask = torch.abs(relative_position_ids) < block_len
    locality_mask = locality_mask[None, None, :, :]
    locality_mask = locality_mask.to(local_attention_mask.device)
    return torch.logical_and(local_attention_mask, locality_mask)


def _get_local_attention_mask(attention_mask: torch.Tensor, block_len: int, device: torch.device) -> torch.Tensor:
    """Prepare attention mask to be applied for a local attention."""
    # [batch_size, num_blocks, block_len]
    _blocked_attention_mask = _split_into_blocks(attention_mask, block_len, dim=1)
    # [batch_size, num_blocks, 3 * block_len]
    _3blocked_attention_mask = _concatenate_3_blocks(_blocked_attention_mask, block_dim=1, sequence_dim=2)

    _blocked_attention_mask = _blocked_attention_mask.unsqueeze(-1)
    _3blocked_attention_mask = _3blocked_attention_mask.unsqueeze(-2)

    # [batch_size, num_blocks, block_len, 3 * block_len]
    local_attention_mask = torch.logical_and(_blocked_attention_mask, _3blocked_attention_mask)
    local_attention_mask = _mask_local_attention_mask(local_attention_mask, block_len)
    # [batch_size, 1, num_blocks, block_len, 3 * block_len]
    return local_attention_mask.unsqueeze(1).to(device)
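# Illustrative shapes for the mask helpers above (a sketch, assuming the same toy inputs as before):
# for an attention_mask of shape (batch=2, seq_len=10) and block_len=4, `_get_local_attention_mask`
# returns a boolean mask of shape (2, 1, 3, 4, 12), i.e. (batch, 1, num_blocks, block_len,
# 3 * block_len), in which a query token may only attend keys within `local_radius`
# (= block_len - 1) positions of itself.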
def _make_global_fixed_block_ids(
    attention_mask: torch.Tensor, global_block_size: int
) -> tuple[torch.Tensor, torch.Tensor]:
    """Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    """
    batch_size, seq_len = attention_mask.shape[:2]

    def handle_orphan_tokens(block_ids: torch.Tensor) -> torch.Tensor:
        block_ends = (torch.arange(seq_len) % global_block_size) == global_block_size - 1
        block_ends = block_ends.to(block_ids.device)
        true_block_ends = torch.logical_and(block_ends, block_ids >= 0)
        full_blocks = true_block_ends.sum(-1).unsqueeze(-1).type(block_ids.dtype) - 1
        block_ids = torch.where(block_ids < full_blocks, block_ids, full_blocks)
        return block_ids

    fixed_block_mask = torch.ones_like(attention_mask, device=attention_mask.device) / global_block_size
    fixed_block_mask = torch.cumsum(fixed_block_mask, axis=1) - fixed_block_mask
    mask = torch.where(attention_mask != 0.0, 1.0, -1000.0).type(attention_mask.dtype)
    global_block_ids = torch.floor(mask + fixed_block_mask - 1.0).type(attention_mask.dtype)
    _global_block_ids_lower_bound = torch.tensor(-1, dtype=global_block_ids.dtype, device=global_block_ids.device)
    global_block_ids = torch.where(
        global_block_ids > _global_block_ids_lower_bound, global_block_ids, _global_block_ids_lower_bound
    )
    # set padding tokens to -1
    global_block_ids = (global_block_ids * attention_mask) + (attention_mask - 1)
    # [batch_size, seq_len]
    global_block_ids = handle_orphan_tokens(global_block_ids)
    num_globals = seq_len // global_block_size
    # [batch_size, seq_len // global_block_size]
    if num_globals > 0:
        _sequence_block_ids_max = torch.max(global_block_ids, dim=-1).values.repeat(num_globals, 1).transpose(0, 1)
    else:
        _sequence_block_ids_max = torch.zeros(
            batch_size, 0, dtype=global_block_ids.dtype, device=global_block_ids.device
        )
    global_segment_ids = torch.cumsum(torch.ones(batch_size, num_globals), dim=-1) - 1
    global_segment_ids = global_segment_ids.to(attention_mask.device)
    global_segment_ids = torch.where(global_segment_ids <= _sequence_block_ids_max, 1, 0)
    return global_block_ids.type(torch.int), global_segment_ids.type(torch.int)


def _make_side_relative_position_ids(attention_mask: torch.Tensor, global_block_size: int) -> torch.Tensor:
    """Create the relative position tensor for local -> global attention."""
    block_ids, global_segment_ids = _make_global_fixed_block_ids(attention_mask, global_block_size)
    global_seq_len = global_segment_ids.shape[-1]
    global_positions = torch.arange(global_seq_len, device=block_ids.device)
    side_relative_position = global_positions - block_ids[..., None]
    return side_relative_position.type(torch.int64)


def _create_global_aggregates(
    hidden_states: torch.Tensor, block_ids: torch.Tensor, global_seq_len: int
) -> torch.Tensor:
    """Compute individual block aggregates by summing over individual blocks."""
    # (batch..., seq_len, global_seq_len)
    block_ids = block_ids.where(
        block_ids >= 0, torch.tensor(global_seq_len, dtype=block_ids.dtype, device=block_ids.device)
    )
    one_hot_block_ids = nn.functional.one_hot(block_ids.type(torch.int64), global_seq_len + 1)[:, :, :-1]
    return torch.einsum("...nd,...ng->...gd", hidden_states, one_hot_block_ids.type(hidden_states.dtype))
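# A worked toy example for the global-block helpers (a sketch under assumed inputs): with
# attention_mask = torch.ones(1, 5) and global_block_size=2, `_make_global_fixed_block_ids` yields
# block ids [[0, 0, 1, 1, 1]] -- the orphan fifth token is folded into the preceding block -- and
# global_segment_ids [[1, 1]] marking both global blocks as non-padding. `_create_global_aggregates`
# then sums the token embeddings sharing a block id into one aggregate vector per global block.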
class LongT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # LongT5 uses a layer_norm which only scales and doesn't shift, which is also known as Root Mean
        # Square Layer Normalization, thus the variance is calculated w/o mean and there is no bias.
        # Additionally we want to make sure that the accumulation for half-precision inputs is done in fp32.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


try:
    from apex.normalization import FusedRMSNorm

    LongT5LayerNorm = FusedRMSNorm  # noqa

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNorm")
except ImportError:
    # using the normal LongT5LayerNorm
    pass
except Exception:
    logger.warning("discovered apex but it failed to load, falling back to LongT5LayerNorm")
    pass


class LongT5DenseActDense(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class LongT5DenseGatedActDense(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class LongT5LayerFF(nn.Module):
    def __init__(self, config: LongT5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = LongT5DenseGatedActDense(config)
        else:
            self.DenseReluDense = LongT5DenseActDense(config)

        self.layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class LongT5Attention(nn.Module):
    def __init__(
        self,
        config: LongT5Config,
        has_relative_attention_bias=False,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)
    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values
Nrf   r,   r   Fr.  Tr   rh   rB   r*   )rB   r.  r   rK   ptraining)*r3   r   viewr   r   r   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater6   matmulr   r7   rB   r*   r  r>  requires_gradr2  r  r   r5   boolr   r:   softmaxr  type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biasr5  layer_head_maskr,  	use_cacheoutput_attentionsr.  r   
seq_lengthis_cross_attentionquery_statesr@  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr-  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r=   r   LongT5Attention.forward  s   & "/!4!4Ra!8
 .T9vvm,#((RtG^G^_iijkmno 
o':;;(3377GJ!&5&K&K#&5&J&J#"1-?)]/"=*,33DNNCHHJ.55dnnELLL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fff[)./Gr?   )r   r   r  r   r   r   r   r   r   r   r   r  r   r  r   r   r   FNT       )NN)	NNNNNNFFN)r   r   r   r   r    r   r   r   r  staticmethodr%  r2  r   r   r   r   r   s   @r=   r   r   E  s     %*#'	!,!, C=	!, !,F;  -  - ^( %0A6R m Smr?   r   c                   v   ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jr    SS jrSrU =r$ )LongT5LocalAttentioni.  r   r   r'   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        U R                  U R                  -  U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        U R                  (       a0  [         R,                  " U R                  U R                  5      U l        [1        5       U l        SU l        g )Nr   Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr$   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r   r   s      r=   r   LongT5LocalAttention.__init__/  sQ    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r?   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r  r	  r  s      r=   r   LongT5LocalAttention.prune_headsI  r  r?   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r=   r%  .LongT5LocalAttention._relative_position_bucketY     . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r?   block_lengthc                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ r(  metaNr   rA   r)  r*  r   r  r   rB   rp   r6   rV   r  r%  r   r   r   r+  rX   r   rq  target_devicer0  r/  r  r1  r   s           r=   r2  !LongT5LocalAttention.compute_bias       ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr?   c                 "  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jnU" T R                  U5      5      n	U" T R                  U5      5      n
U" T R                  U5      5      n[	        U	T R
                  SS9n	[	        U
T R
                  SS9n
[	        UT R
                  SS9n[        U
SSS9n
[        USSS9n[        R                  " SX5      nUc  T R                  (       dz  [        R                  " SST R                  T R
                  ST R
                  -  4UR                  UR                  S	9nT R                  (       a  T R                  (       a  S
Ul        OT R#                  T R
                  5      nUb/  [        R$                  " US:  SS5      nX2R'                  SS5      -   nX-  n[(        R*                  R-                  UR/                  5       SS9R1                  U5      n[(        R*                  R3                  UT R2                  T R                  S9nUb  X-  nUR5                  UR                  5      nU" [        R                  " SX5      5      nUS S 2S U2S S 24   nT R7                  U5      nUU4nU(       a  X4-   nU$ )Nrf   c                 T   > U R                  TSTR                  TR                  5      $ 
projectionr,   r?  r   r   statesr   r   s    r=   r3   +LongT5LocalAttention.forward.<locals>.shape  "    ;;z2t||T=T=TUUr?   c                 Z   > U R                  5       R                  TSTR                  5      $ rD   r,   rL  r?  r   r  s    r=   unshape-LongT5LocalAttention.forward.<locals>.unshape  %    $$&++JDNNKKr?   r   rK   rg   ...qhd,...khd->...hqkr   r;  Tr   rz       _r,   r<  ...hqk,...khd->...qhd)r3   r   r   r   rG   r$   rT   r6   r   r   r7   r   rB   r*   r  r>  rH  r2  rq   r   r   r:   rJ  r  rK  r   rp   r   )r   r   r   rN  rO  rQ  rR  r3   r  rT  rW  rX  rY  r]  r^  r_  r   s   `               @r=   r   LongT5LocalAttention.forward  sW    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'9L#((););<ell+BL_`!![j[!"34ff[) 

 /Gr?   )r$   r   r   r  r   r   r   r   r   ri  r   r   r  r   r  r   r   r   Frb  NNNF)r   r   r   r   r    rI  r   r  re  r%  r   r2  r   r   r   r   s   @r=   rg  rg  .  sc    ,| ,$ ,[_ , ,4;  -  - ^ 6 K Kr?   rg  c                      ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jrS\R                  S\R                  S\R                  4S jr    SS jrSrU =r$ )LongT5TransientGlobalAttentioni  r   r   r'   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                   U R                  SS9U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [3        5       U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [9        UR                  UR:                  S9U l        g )Nr   Fr   r   )r   r   r   r   r   r   r   r   r   r   r   ri  r$   rl   r   r   r   r   r   r   r   r   r   r   r  r  r  global_relative_attention_biasr   r   global_input_layer_normrj  s      r=   r   'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r?   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r  r	  r  s      r=   r  *LongT5TransientGlobalAttention.prune_heads  r  r?   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r=   r%  8LongT5TransientGlobalAttention._relative_position_bucket   rp  r?   rq  c                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ rs  ru  rv  s           r=   r2  +LongT5TransientGlobalAttention.compute_biasQ  ry  r?   r   r   c                 x   [         R                  " US   US S 2S S S 24   5      S S 2S S4   n[         R                  " US:  SS5      n[        XR                  5      nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      nXG-   nU$ )Nr   .r   rz   r  r)  )r   r   r   rf   )r6   eqrq   r   rl   r%  r   r   r   r  r+  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r=   compute_side_bias0LongT5TransientGlobalAttention.compute_side_biasi  s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1=""r?   c                 >	  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jn[        Ub  UO"[        R                  " UR                   S S 5      T R                  5      u  pU
R                   S   n[        XU5      nT R                  U5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      n[        UT R                  SS9n[        UT R                  SS9n[        UT R                  SS9n[        USSS9n[        USSS9nS/UR                  S-   -  nUR                   S   US'   UR                  S5      R                  U5      nUR                  S5      R                  U5      n[        R                   " UU/SS9n[        R                   " UU/SS9n[        R"                  " SX5      nUb=  [%        UT R                  UR&                  5      n[        R(                  " US	:  S
S5      nOS nUGct  T R*                  (       dz  [        R,                  " SST R.                  T R                  ST R                  -  4UR&                  UR0                  S9nT R2                  (       a  T R4                  (       a  SUl        OT R9                  T R                  5      nUb  UUR;                  SS5      -   nUR=                  UR0                  5      nUc  [        R                  " TU5      nT R?                  X*5      n[        UT R                  SS9R;                  SS5      nUR=                  UR0                  5      RA                  UR&                  5      n[        R                   " UU/SS9nUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UT RL                  T R4                  S9nUb  UU-  nUR=                  UR0                  5      nU" [        R"                  " SUU5      5      nUS S 2S U2S S 24   nT RO                  U5      nUU4nU(       a  UU4-   nU$ )Nrf   c                 T   > U R                  TSTR                  TR                  5      $ r|  r~  r  s    r=   r3   5LongT5TransientGlobalAttention.forward.<locals>.shape  r  r?   c                 Z   > U R                  5       R                  TSTR                  5      $ r  r  r  s    r=   r  7LongT5TransientGlobalAttention.forward.<locals>.unshape  r  r?   r,   r   rK   rg   r  r   rz   r  r   r;  Trh   r<  r  )(r3   r   r6   r   rl   r   r  r   r   r   rG   r$   rT   r8   rX   r   rP   r   rk   rB   rq   r   r7   r   r*   r  r>  rH  r2  r   rp   r  r`   r   r:   rJ  r  rK  r   r   )r   r   r   rN  rO  rQ  rR  r3   r  rn   r   _global_seq_lenglobal_inputsrT  rW  rX  side_key_statesside_value_statesrepsrY  r]   side_position_biasr]  r^  r_  r   s   `                        @r=   r   &LongT5TransientGlobalAttention.forward~  s9    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
O<!D
yy,0A!BJ 5|P#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9L#((););<ell+BLR^_`!![j[!"34ff[)./Gr?   )r$   r   r   rl   r  r  r   r   r   r   r   ri  r   r   r  r   r  r   r   r   r  rb  r  )r   r   r   r   r    rI  r   r  re  r%  r   r2  r6   r|   r  r   r   r   r   s   @r=   r  r    s    f| f$ f[_ f f>;  -  - ^ 0#ell # #Y^YeYe #0 u ur?   r  c                   h   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9       SS j5       rS	r	U =r
$ )LongT5LayerSelfAttentioni  r   c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r  s       r=   r   !LongT5LayerSelfAttention.__init__  sQ    ,W`
 *&..f>W>WXzz&"5"56r?   r4  r5  r6  r7  c	                     U R                  U5      n	U R                  U	UUUUUUUS9n
XR                  U
S   5      -   nU4U
SS  -   nU$ )N)r   rN  rO  r5  rP  rQ  r.  r   r   )r   r  r   )r   r   rd   rN  rO  r5  rP  rQ  r.  normed_hidden_statesattention_outputr_  s               r=   r    LongT5LayerSelfAttention.forward   st      $}=-- '++/) . 	
 &5Ea5H(II "%5ab%99r?   )r  r   r   ra  )NNNNFFNr   r   r   r   r   r   r   r   r   r   r   r   s   @r=   r  r    sP    7XVY] 7 7 %0A6R  Sr?   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
LongT5LayerLocalSelfAttentioni  z$Local self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g N)r   r   )r   r   rg  LocalSelfAttentionr   r   r   r   r   r   r   r   r  s       r=   r   &LongT5LayerLocalSelfAttention.__init__  sI    "6v"w)&..f>W>WXzz&"5"56r?   kwargsc                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ N)r   rN  rO  rQ  r   r   )r   r  r   
r   r   rd   rN  rO  rQ  r  r  r  r_  s
             r=   r   %LongT5LayerLocalSelfAttention.forward%  sk      $}=22 '+/ 3 
 &5Ea5H(II "%5ab%99r?   )r  r   r   ra  r  r   r   r   r   __doc__r   r   r   r   r   r   r   r   s   @r=   r  r    s>    .7XVY] 7 7   r?   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
'LongT5LayerTransientGlobalSelfAttentioni;  z/Transient-Global self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r   r   r  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r  s       r=   r   0LongT5LayerTransientGlobalSelfAttention.__init__>  sN    ,J-
) *&..f>W>WXzz&"5"56r?   r  c                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ r  )r   r  r   r  s
             r=   r   /LongT5LayerTransientGlobalSelfAttention.forwardF  sk      $}=<< '+/ = 
 &5Ea5H(II "%5ab%99r?   )r  r   r   ra  r  r  r   s   @r=   r  r  ;  s>    97XVY] 7 7   r?   r  c                   j   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9        SS j5       rS	r	U =r
$ )LongT5LayerCrossAttentioni]  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r=   r   "LongT5LayerCrossAttention.__init__^  sO    .vSXdmn)&..f>W>WXzz&"5"56r?   r4  r5  r6  r7  c                     U R                  U5      nU R                  UUUUUUUUU	U
S9
nXR                  US   5      -   nU4USS  -   nU$ )N)	r   rM  rN  rO  r5  rP  r,  rQ  r.  r   r   )r   r  r   )r   r   rM  rd   rN  rO  r5  rP  r,  rQ  r.  r  r  layer_outputr_  s                  r=   r   !LongT5LayerCrossAttention.forwardd  sy      $}=// -'++%/) 0 
 %||4DQ4G'HH/$4QR$88r?   )r  r   r   r   )NNNNFNFNr  r   s   @r=   r  r  ]  sR    7(3- 7 7 %0A6R
  Sr?   r  c                   r   ^  \ rS rSrS
S\\   4U 4S jjjr\" SSSS9            SS j5       rS	r	U =r
$ )LongT5Blocki  r   c                 $  > [         TU ]  5         UR                  U l        UR                  (       a  [        nOGUR                  S:X  a  [
        nO0UR                  S:X  a  [        nO[        SUR                   S35      e[        R                  " 5       U l
        U R                  R                  U" XUS95        U R                  (       a"  U R                  R                  [        XS95        U R                  R                  [        U5      5        g )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrO   r  r   )r   r   r   r   attention_layerr   s        r=   r   LongT5Block.__init__  s     ++6O**g5;O**.@@EO!889<  ]]_


Fgpq	
 ??JJ7TU

-/0r?   r4  r5  r6  r7  c                 \   U R                   S   " UUUUU	U
UUS9nUS   nUSS  nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU	US   S-   U
UUS9
nUS   nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUUSS  -   nU R                   S   " U5      nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU4U-   $ )Nr   )rd   rN  rO  r5  rP  rQ  r.  r   i  )r  r   r,   )	rM  rd   rN  rO  r5  r,  rP  rQ  r.  )
r  r*   r6   r   isinfanyfinfor   clampr   )r   r   rd   rN  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasrO  cross_attn_layer_head_maskr5  rP  rQ  return_dictr.  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputss                      r=   r   LongT5Block.forward  s   " "&A)'++/)	"
 /q12126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM!__R1Fd1R&*jjm!65; : /+B/!3#"3-'# 4A6M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O 

2}5 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM 00	
r?   )r   r  ra  )NNNNNNNNFFTNr  r   s   @r=   r  r    sa    1XVY] 1 1. %0A6R "#&*#'D
 SD
r?   r  c                   r   ^  \ rS rSr% \\S'   SrSrS/rSr	\
S 5       rS r\U 4S	 j5       rS
 rS rSrU =r$ )LongT5PreTrainedModeli  r   transformerTr  Fc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r6   r   r   r   )r   r  
input_maskdummy_inputss       r=   r  "LongT5PreTrainedModel.dummy_inputs  s8     LL.	\\*-
!*"&0

 r?   c                     U nUR                  S5      nUR                  S5       H   n[        X#5      (       d    g [        X#5      nM"     U R	                  X R
                  5        g )Nz.weightr  )removesuffixsplithasattrgetattr_tie_or_clone_weightsshared)r   keymodulesub_keys       r=   _try_load_missing_tied_module3LongT5PreTrainedModel._try_load_missing_tied_module  sU    y)yy~G6++V-F &
 	""6;;7r?   c                 B  > UR                  SS5      nSUS'   [        TU ]  " U0 UD6u  pEUR                  S/ 5      n[        US5      (       aG  [        US5      (       a6  U H0  n[        R                  SU SU S	35        UR                  U5        M2     U(       a  XE4$ U$ )
Noutput_loading_infoFTmissing_keysr  _tied_weights_keysz!Recovering a missing tied weight z2 from a legacy LongT5 checkpoint. Consider saving zF in your checkpoint or updating the config (tie_word_embeddings=true).)rA  r   from_pretrainedr  r   warningr  )	r   argsr  requested_loading_infomodelloading_infor	  missing_keyr   s	           r=   r  %LongT5PreTrainedModel.from_pretrained  s    !',A5!I(,$%#g5tFvF#'';5(##7K(L(L+7} E''2m3y{ 33K@  , "&&r?   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g[        U[        [        [        45      (       a  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       aN  U R                   R                  (       d2  UR                  R                  R
                  R                  SUS-  S9  ggg[        U[        5      (       GaQ  UR                   R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR                   S5      (       aE  UR                   R$                  b.  UR                   R$                  R
                  R'                  5         UR(                  R                  R
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, LongT5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, (LongT5Attention, LongT5LocalAttention, LongT5TransientGlobalAttention)):
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5))
                if isinstance(module, LongT5TransientGlobalAttention):
                    module.global_relative_attention_bias.weight.data.normal_(
                        mean=0.0, std=factor * ((d_model) ** -0.5)
                    )

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the "
                "pad_token_id. See LongT5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids
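    # Worked illustration of `_shift_right` (an assumed toy label row, shown for
    # documentation only): with decoder_start_token_id=0 and pad_token_id=0,
    #   labels:         [  37,  423, -100, -100]
    #   decoder inputs: [   0,   37,  423,    0]
    # The row is shifted one step to the right, the start token is prepended, and any
    # remaining -100 loss-ignore sentinel is replaced by pad_token_id.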

class LongT5Stack(LongT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)

        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
        if embed_tokens is not None:
            self.embed_tokens.weight = embed_tokens.weight
        self.is_decoder = config.is_decoder

        self.local_radius = config.local_radius
        self.block_len = self.local_radius + 1

        self.block = nn.ModuleList(
            [
                LongT5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i)
                for i in range(config.num_layers)
            ]
        )
        self.final_layer_norm = LongT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            assert self.embed_tokens is not None, "You have to initialize the model with valid token embeddings"
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if self.is_decoder and use_cache and past_key_values is None:
            if self.config.is_encoder_decoder:
                past_key_values = EncoderDecoderCache(
                    DynamicCache(config=self.config), DynamicCache(config=self.config)
                )
            else:
                past_key_values = DynamicCache(config=self.config)
        elif not self.is_decoder:
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache
                if isinstance(past_key_values, EncoderDecoderCache)
                else past_key_values,
                output_attentions,
            )
        elif self.config.encoder_attention_type == "local":
            causal_mask = _get_local_attention_mask(attention_mask, self.block_len, inputs_embeds.device)
        else:  # we need to use both local attention mask and standard extended mask for transient-global attention
            causal_mask = attention_mask

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=causal_mask,
                position_bias=position_bias,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                encoder_decoder_position_bias=encoder_decoder_position_bias,
                layer_head_mask=layer_head_mask,
                cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_values,
                use_cache=use_cache,
                output_attentions=output_attentions,
                return_dict=return_dict,
                cache_position=cache_position,
            )

            hidden_states = layer_outputs[0]
            position_bias = layer_outputs[1]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[3 if output_attentions else 2]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[2],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[4],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        if (
            self.config._attn_implementation == "sdpa"
            and not using_compilable_cache
            and not output_attentions
            and AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            )
        ):
            return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows to avoid NaN propagation with SDPA.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
NrH  )
fill_valuer*   rB   r   )diagonalrx   r,   r   )r%   r6   r  r  r'  rB   triurV   rD   expandr)  r3   r`   masked_fill)rd   rp  rq  r*   r.  r   r  r[  r}  mask_lengthpadding_masks              r=   ry  ALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positiony  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r?   )r;  r$   r   r9  r<  r  r   ri  r   )NNNNNNNNNNNNNr  )r   r   r   r   r   rB  r   r   r6   r|   r
   rI  rU  re  r   r*   ry  r   r   r   s   @r=   r6  r6  c  s    4+
 "#!!q
t #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r?   r6  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""


@auto_docstring
class LongT5Model(LongT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: LongT5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = LongT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = LongT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, LongT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

>>> # Let's try a very long encoder input.
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1

>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
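>>> # The decoder states can be pooled for quick probing; mean pooling here is an
>>> # illustrative choice, not something the checkpoint prescribes.
>>> pooled = last_hidden_states.mean(dim=1)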
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    LONGT5 Model with a `language modeling` head on top.
    """
)
class LongT5ForConditionalGeneration(LongT5PreTrainedModel, GenerationMixin):
    _keys_to_ignore_on_load_unexpected = [
        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: LongT5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = LongT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = LongT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
>>> model = LongT5ForConditionalGeneration.from_pretrained(
...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
... )

>>> # Let's try a very long input.
>>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
>>> input_ids = inputs.input_ids

>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
abstractthe aim of this article is to provide an overview of the literature on the role of dog
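>>> # `generate` accepts the usual decoding controls; the values below are illustrative
>>> # assumptions rather than tuned settings.
>>> summary_ids = model.generate(input_ids, max_length=64, num_beams=2)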
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)


@auto_docstring
class LongT5EncoderModel(LongT5PreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight"]
    _keys_to_ignore_on_load_unexpected = [r"decoder"]

    def __init__(self, config: LongT5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = LongT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
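>>> # A fixed-size sequence embedding can be pooled from the token states; mean pooling
>>> # over the sequence dimension is an illustrative choice.
>>> sequence_embedding = last_hidden_states.mean(dim=1)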
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


__all__ = ["LongT5EncoderModel", "LongT5ForConditionalGeneration", "LongT5Model", "LongT5PreTrainedModel"]