
    cCi                       S r SSKrSSKJrJr  SSKrSSKJs  Jr	  SSKJr  SSK
JrJrJr  SSKJr  SSKJr  SS	KJrJrJrJrJrJrJr  SS
KJr  SSKJrJrJrJ r J!r!  SSK"J#r#  SSK$J%r%  \ RL                  " \'5      r(Sr)Sr* " S S\RV                  5      r, " S S\RV                  5      r- " S S\RV                  5      r. " S S\RV                  5      r/ " S S\RV                  5      r0 " S S\RV                  5      r1 " S S\RV                  5      r2 " S S \RV                  5      r3 " S! S"\RV                  5      r4 " S# S$\RV                  5      r5 " S% S&\RV                  5      r6 " S' S(\RV                  5      r7 " S) S*\RV                  5      r8 " S+ S,\5      r9S-r:S.r;\" S/\:5       " S0 S1\95      5       r<\" S2\:5       " S3 S4\95      5       r=\" S5\:5       " S6 S7\95      5       r>\" S8\:5       " S9 S:\95      5       r?\" S;\:5       " S< S=\95      5       r@\" S>\:5       " S? S@\95      5       rA " SA SB\RV                  5      rB\" SC\:5       " SD SE\95      5       rC/ SFQrDg)GzPyTorch MEGA model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)Cache),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings)deprecate_kwarg   )
MegaConfigzmnaylor/mega-base-wikitextr   c                   >   ^  \ rS rSrSrS\4U 4S jjrSS jrSrU =r	$ )MegaEmbeddings6   z
Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
RoBERTa's embeddings which optionally includes token types
configc                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        UR                  U l	        U R                  (       a{  [        R                  " UR                  UR
                  5      U l        U R                  S[        R                  " UR                  [        R                   S9R#                  S5      SS9  UR                  U l        g )N)padding_idxtoken_type_idsdtype)r   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsadd_token_type_embeddingsuse_token_typestype_vocab_sizetoken_type_embeddingsregister_buffertorchzerosmax_positionslongexpandr!   selfr   	__class__s     k/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deprecated/mega/modeling_mega.pyr(   MegaEmbeddings.__init__<   s    !||F,=,=v?Q?Q_e_r_rs%??)+f6L6LfN`N`)aD&    %++f.B.B%**"U"\"\]d"erw !  "..    c                    Uc  Uc  [        S5      eUb.  UR                  5       nUR                  nU R                  U5      nOUR                  5       S S nUR                  nU R                  (       a  Uch  [        U S5      (       a3  U R                  S S 2S US   24   nUR                  US   US   5      nUnO$[        R                  " U[        R                  US9nU R                  U5      nX8-   n	U	$ Un	U	$ )Nz.Must provide one of input_ids or inputs_embedsr%   r"   r   r   r$   device)
ValueErrorsizer@   r-   r/   hasattrr"   r7   r3   r4   r6   r1   )
r9   	input_idsr"   inputs_embedsinput_shaper@   buffered_token_type_ids buffered_token_type_ids_expandedr1   
embeddingss
             r;   forwardMegaEmbeddings.forwardJ   s   M$9MNN"#..*K%%F !00;M',,.s3K"))F
 %4!122.2.A.A!EU{ST~EUBU.V+7N7U7UVabcVdfqrsft7u4%EN%*[[EJJW]%^N %)$>$>~$N!&>J  'Jr=   )r!   r1   r/   r-   NNN
__name__
__module____qualname____firstlineno____doc__r   r(   rJ   __static_attributes____classcell__r:   s   @r;   r   r   6   s    
/z / r=   r   c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ ) MegaSimpleRelativePositionalBiasl   zk
Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
r   c                 B  > [         TU ]  5         Xl        U R                  R                  S:  a  U R                  R                  OU R                  R                  U l        [
        R                  " [        R                  " SUR                  -  S-
  5      5      U l	        g )Nr      r   )
r'   r(   r   
chunk_sizer5   r   	Parameterr3   Tensorrel_pos_biasr8   s     r;   r(   )MegaSimpleRelativePositionalBias.__init__q   sq    :>++:P:PST:TT[[66Z^ZeZeZpZpLLa&:N:N6NQR6R)STr=   c                    XR                   :  a  [        SU SU R                    35      eU R                  U R                   U-
  U R                   U-   S-
   n[        R                  " USU45      n[
        R                  " X145      nUS U*  nUR                  USU-  S-
  5      nSU-  S-
  S-  nUR                  S5      U-
  nUS S 2XE24   nU$ )NzSequence length z going beyond max length r   r      rZ   )	r5   rA   r^   Fpadr3   tileviewrB   )r9   seq_lenbiasrd   startends         r;   rJ   (MegaSimpleRelativePositionalBias.forwardw   s    '''/y8QRVRdRdQefgg   $"4"4w">4CUCUX_C_bcCceuuTAw<(zz$
+IgXyy!g+/2WqQ&iilU"AuyL!r=   )r   r5   r^   rM   rU   s   @r;   rW   rW   l   s     Uz U r=   rW   c                   \   ^  \ rS rSrSrS\4U 4S jjr\S\S\4S j5       r	S r
S	 rS
rU =r$ ) MegaRotaryRelativePositionalBias   a\  
Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
repo due to differences in implementation.

When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
extrapolate to longer sequences. Can be indexed according to input position IDs
r   c                   > [         TU ]  5         UR                  S-  S:w  a  [        S5      eXl        UR
                  U l        U R                  R                  S:  a  U R                  R                  OU R                  R                  U l        [        R                  UR                  U R                  5      u  U l        U l        [        R                  " [        R                   " SU R                  5      5      U l        [        R                  " [        R                   " SU R                  5      5      U l        U R'                  S[        R(                  " S/5      5        g )NrZ   r   zCRotary positional bias requires `hidden_size` to be a multiple of 2r   _float_tensor        )r'   r(   r+   RuntimeErrorr   shared_representation_size	embed_dimr[   r5   rl   get_sinusoid_embeddingssinecosiner   r\   r3   r]   alphab_paramr2   FloatTensorr8   s     r;   r(   )MegaRotaryRelativePositionalBias.__init__   s    !Q&dee:::>++:P:PST:TT[[66Z^ZeZeZpZp!A!Y!Y  $.."
	4;
 \\%,,q$.."AB
||ELLDNN$CD_e.?.?.FGr=   r5   embedding_dimc                    US-  n[         R                  " S5      U-  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R                  S9R                  S5      UR                  S5      -  n[        R                  " U5      [        R                  " U5      4$ )NrZ   i'  r#   r   r   )
mathlogr3   exparangeint64float	unsqueezesincos)r5   r{   half_dimembs       r;   rt   8MegaRotaryRelativePositionalBias.get_sinusoid_embeddings   s     A%hhuo(iiXU[[AGGISDPQll=<FFqICMMZ[L\\yy~uyy~--r=   c                 *   UR                  5       u  p#[        R                  " USSS9u  pEU R                  b  X R                  R                  S5      :  a(  [        R                  X#5      u  U l        U l        X l        U R                  R                  U R                  5      U l        U R                  R                  U R                  5      U l        U R                  S U nU R                  S U n[        R                  " XG-  XV-  -
  XW-  XF-  -   /SS9$ )NrZ   r%   dimr   r   )rB   r3   chunkru   rl   rt   rv   r5   toro   cat)r9   inputrf   rs   chunk_1chunk_2r   r   s           r;   rotary'MegaRotaryRelativePositionalBias.rotary   s    "ZZ\ ;;uaR899))..*; ;%E%]%]^e%q"DIt{!(IILL!3!34	kknnT%7%78ii!kk(7#yy'-'-79VW]^__r=   c                    U R                  U R                  R                  XR                  5      5      nU R                  U R                  R                  XR                  5      5      n[
        R                  " SX#5      nU$ )Nz	mk,nk->mn)r   rw   r7   rs   rx   r3   einsum)r9   rf   rotary_alpharotary_betarg   s        r;   rJ   (MegaRotaryRelativePositionalBias.forward   sY    {{4::#4#4Wnn#MNkk$,,"5"5g~~"NO||KCr=   )rw   rx   r   rv   rs   r5   ru   )rN   rO   rP   rQ   rR   r   r(   staticmethodintrt   r   rJ   rS   rT   rU   s   @r;   rl   rl      sH    Hz H  .s .3 . .` r=   rl   c                   B   ^  \ rS rSrSrSU 4S jjrSS\4S jjrSrU =r	$ )MegaDropout   aN  
A unified class for standard dropout functionality and featurewise dropout.

The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
is retained here as well.
c                 :   > [         TU ]  5         Xl        X l        g N)r'   r(   dropout_probabilityis_featurewise)r9   r   r   r:   s      r;   r(   MegaDropout.__init__   s    #6 ,r=   batch_firstc                    U R                   (       a  U(       aJ  [        R                  " UR                  SS5      U R                  U R
                  S9R                  SS5      $ UR                  5       S:w  a  [        S5      e[        R                  " UR                  SSS5      U R                  U R
                  S9R                  SSS5      $ [        R                  " XR                  U R
                  S9$ )	Nr%   )ptrainingra   zzFeature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]r   rZ   r   )
r   rb   	dropout2d	transposer   r   r   rA   permutedropout)r9   r   r   s      r;   rJ   MegaDropout.forward   s     {{OOB+t/G/GRVR_R_)B#$ 99;!#$ U  {{5==Aq#9T=U=U`d`m`mnvvq!  99U&>&>WWr=   )r   r   )F)
rN   rO   rP   rQ   rR   r(   boolrJ   rS   rT   rU   s   @r;   r   r      s!    -
X$ X Xr=   r   c                   <   ^  \ rS rSrSrSU 4S jjrS rS rSrU =r	$ )MegaRMSNorm   z
RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
root (as opposed to after in T5)
c                    > [         TU ]  5         Xl        X l        X0l        U(       a:  [
        R                  " [        R                  " U R                  5      5      U l	        g U R                  SS 5        g )Nweight)r'   r(   num_featuresepsaffiner   r\   r3   r]   r   register_parameter)r9   number_featuresr   r   r:   s       r;   r(   MegaRMSNorm.__init__   sP    +,,u||D4E4E'FGDK##Hd3r=   c                     [         R                  " [         R                  " U5      SSS9nU R                  b  XR                  -  nU[         R                  " X R
                  -   5      -    U$ )Nr%   Tr   keepdim)r3   meansquarer   rsqrtr   )r9   r   mean_squares      r;   rJ   MegaRMSNorm.forward   sP    jje!4"dK;;"KK'EK((233r=   c                 R    U R                    SU R                   SU R                   3$ )Nz, eps=z	, affine=)r   r   r   r9   s    r;   
extra_reprMegaRMSNorm.extra_repr   s(    ##$F488*Idkk]KKr=   )r   r   r   r   gư>T)
rN   rO   rP   rQ   rR   r(   rJ   r   rS   rT   rU   s   @r;   r   r      s    
4L Lr=   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )MegaScaleNormi  z
Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
multiplication instead of a vector, and applies over a specified dimension
c                    > [         TU ]  5         Xl        X l        X0l        U(       a0  [
        R                  " [        R                  " S5      5      U l	        g U R                  SS 5        g )Nr   scalar)r'   r(   r   r   r   r   r\   r3   r]   r   r   )r9   r   r   r   r:   s       r;   r(   MegaScaleNorm.__init__  sH    ,,u||A7DK##Hd3r=   c                     [         R                  " [         R                  " U5      U R                  SS9nU R                  b  U R                  U-  nU[         R
                  " X R                  -   5      -  nU$ )NTr   )r3   r   r   r   r   r   r   )r9   r   r   outputs       r;   rJ   MegaScaleNorm.forward  sX    jje!4$((DQ;;"KK%'E[88%;<<r=   )r   r   r   r   r   	rN   rO   rP   rQ   rR   r(   rJ   rS   rT   rU   s   @r;   r   r     s    
4 r=   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )MegaSequenceNormi  z
A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
input axis locations for different normalization methods.
c                 Z  > [         TU ]  5         US:X  a  [        R                  " X#US9U l        g US:X  a  [        SX4S9U l        g US:X  a  [        X#US9U l        g US:X  a  [        R                  " X#US9U l        g US	:X  a  [        R                  " X#US9U l        g [        S
U 35      e)N	layernorm)elementwise_affine	scalenormr%   )r   r   r   rmsnorm)r   r   	batchnormsyncbatchnormzUnknown norm type: )
r'   r(   r   	LayerNormnormr   r   BatchNorm1dSyncBatchNormrA   )r9   	norm_typer{   r   r   exportr:   s         r;   r(   MegaSequenceNorm.__init__   s    #]FSDI+%%"#EDI)##M6JDI+%}fMDI/)((ODI29+>??r=   c                 J   [        U R                  [        R                  R                  R
                  5      (       aV  UR                  5       S:w  a  [        S5      eUR                  SSS5      nU R                  U5      nUR                  SSS5      $ U R                  U5      $ )Nra   z.BatchNorm inputs must be exactly 3-dimensionalr   rZ   r   )	
isinstancer   r   modulesr   
_BatchNormr   rA   r   )r9   r   s     r;   rJ   MegaSequenceNorm.forward/  s    dii!5!5!@!@AAyy{a !QRRMM!Q*EIIe$E==Aq))99U##r=   )r   )gh㈵>TFr   rU   s   @r;   r   r     s    
@$ $r=   r   c            	          ^  \ rS rSrSrS\4U 4S jjrS rS\4S jr	S r
S\4S	 jrS
 rSS jrSS jr   SS\\R"                     S\\R"                     S\S\R"                  4S jjrSrU =r$ )MegaMultiDimensionDampedEmai:  a  
Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
variable names and moving away from the stateful representation of incremental decoding state. See
"https://huggingface.co/papers/2209.10655" for more details.
r   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        [        R                  " SU R                  -  5      U l        U R                  (       a  SUR                  -  OUR                  n[        R                  " [        R                  " X R                  S5      5      U l        [        R                  " [        R                  " X R                  S5      5      U l        [        R                  " [        R                  " X R                  S5      5      U l        [        R                  " [        R                  " X R                  5      5      U l        [        R                  " [        R                  " UR                  5      5      U l        S U l        S U l        g )N      ?rZ   r   )r'   r(   r   r+   rs   ema_projection_sizendimbidirectional
truncationr}   sqrtscaler   r\   r3   r]   damping_factordecay_factorema_expansion_matrixkernel_projection_matrixresidual_weight_kernel_coeffs)r9   r   
kernel_dimr:   s      r;   r(   $MegaMultiDimensionDampedEma.__init__A  s/   ++..	#11 ++YYsTYY/
/3/A/AQ+++vGYGY
 ll5<<
IIq+QRLLj))Q)OP %'LLj))UV1W$X!(*U\\*ii5X(Y%!||ELL9K9K,LMr=   c                     S U l         [        R                  " U R                  5      n[        R                  " U R                  5      nSX-  -
  nX4$ )Nr   )r   r3   sigmoidr   r   )r9   r   r   previous_timestep_weights       r;   _compute_ema_coefficients5MegaMultiDimensionDampedEma._compute_ema_coefficientsY  sH    t':':;}}T%6%67#&)F#F 77r=   lengthc                 t   S U l         U R                  5       u  p#[        R                  " U5      R	                  U5      R                  SSU5      [        R                  " U5      -  nX R                  -  [        R                  " U5      -  n[        R                  " SXPR                  U R                  -  5      $ )Nr   
dnl,dn->dl)r   r   r3   r   r   re   r~   r   r   r   r   r   )r9   r   r   r   vanderkernels         r;   _compute_efficient_ema_kernel9MegaMultiDimensionDampedEma._compute_efficient_ema_kernela  s    373Q3Q3S0 f%((8==aFKeiiXpNqq #<#<<		&@QQ||L&2O2ORVR\R\2\]]r=   c                     U R                   (       a  U R                  5       $ U R                  c  U R                  5       U l        U R                  $ r   )r   r   r   r   s    r;   get_ema_coefficients0MegaMultiDimensionDampedEma.get_ema_coefficientsm  s<    ==1133||##==?<<r=   c                 :   U R                   c  UO[        U R                   U5      nU R                  (       a  U R                  U5      $ U R                  b  U R                  R                  S5      U:  a  U R                  U5      U l        U R                  SS U24   $ )Nr%   .)r   minr   r  r   rB   )r9   r   kernel_sizes      r;   get_ema_kernel*MegaMultiDimensionDampedEma.get_ema_kernelu  s     $ 7fSRX=Y==55kBB||#t||'8'8'<{'J#AA+N<<\k\ 122r=   c                    [         R                  R                  UR                  5       SU-  S9n[         R                  R                  UR                  5       SU-  S9n[         R                  R	                  XE-  SU-  S9nU$ )NrZ   )n)r3   fftrfftr   irfft)r9   inputsr   r   
inputs_fft
kernel_fftconvolved_sequences          r;   fft_convolution+MegaMultiDimensionDampedEma.fft_convolution~  sf    YY^^FLLNa&j^A
YY^^FLLNa&j^A
"YY__Z-DF
_S!!r=   c           	      t   US:X  a  U R                  XS9$ U R                  5       u  pE[        R                  " US-   5      R	                  U5      R                  SSUS-   5      [        R                  " U5      -  n[        R                  " U5      nUb^  US S 2S S 2SS 24   U R                  U R                  -  R                  S5      -  n[        R                  " SX75      nUS S 2S S 2S4   U-  n	OS nS n	US S 2S S 2S S24   nX@R                  -  U-  n
[        R                  " SXR                  U R                  -  5      nU R                  XUS9SSU24   nUR                  U5      nUb  X-   n[        R                  " S	U[        R                  " U
S
/S95      nU	b  X-   nUR!                  S
SS5      U4$ )Nr   
past_stater%   zbdn,dnl->bdlr   r   .r   zbdl,dnl->bdnrZ   )dims)one_ema_stepr  r3   r   r   re   r~   r   r   r   r   r   r   r  type_asflipr   )r9   r  r   r  r   r   r   past_ema_projpast_ema_statepast_vandermonder   kernel_proj
ema_outputupdated_hidden_states                 r;   ema_step$MegaMultiDimensionDampedEma.ema_step  s   Q;$$V$CC 483L3L3N0fqj),,^<AA!QQR
SV[V_V_$W
 
 6"! #1a8,0M0MPTPZPZ0Z/e/efh/iiM"\\.*TN  &aBh/*<!N# 1crc	" #<#<<Fll<9V9VY]YcYc9cd))&f)McSTU[S[m\
''/
%#4J$||NFEJJv]^\_D`a'#7#J  !!!Q*,@@@r=   c                    U R                  5       u  p4X0R                  -  R                  S5      U-  nUb  XTR                  S5      U-  -   n[        R                  " SXPR
                  U R                  -  5      nUR                  S5      U4$ )Nr%   z
bdn,dn->bdr   )r  r   squeezer3   r   r   r   r   )r9   r  r  r   r   updated_stateouts          r;   r  (MegaMultiDimensionDampedEma.one_ema_step  s    373L3L3N0 (*C*CCLLRPSYY!),L,LR,PS],]]Mll<8U8UX\XbXb8bc}}Q..r=   attention_mask
prev_state	use_cachereturnc                    UR                  5       u  pVnXpR                  :w  a  [        SU SU R                   35      eXR                  -  nUR	                  SSS5      nUb"  XR                  S5      R                  U5      -  nU R                  (       a  U(       a  [        S5      eU(       a-  U R                  XUS9u  p[        R                  " X-   5      n	X4$ U R                  U5      nUnSnUR                  S5      nU R                  (       a  [        R                  " XR                  U R                  /SS	9u  nn[        R                  " XS-
  S45      [        R                  " UR!                  S
5      SUS-
  45      -   n[        R                  " XS-
  S45      nX-   S-
  nSU-  S-
  nU R#                  XUS9SXU-   24   nUR                  U5      n[        R                  " UR	                  SSS5      U-   5      nUS4$ )a{  
Mega's exponential moving average (EMA) sub-layer applied prior to single-headed (traditional) self-attention

Args:
    inputs (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
        Hidden state / embedding input to update via EMA based on FFT convolution
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored (mostly due to padding), where elements are either 1 for *not
        masked* or 0 for *masked*
    prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*):
        The hidden state returned from the previous timestep during incremental decoding.
    use_cache (`bool`, default `False`):
        Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
        updated EMA hidden state for use in the next step

Returns:
    `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
    inputs:
    - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
      states updated by EMA, with same shapes as inputs
    - **updated_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor of shape `(batch_size,
      config.ndim)` -- The incremental EMA state for use in the next step of incremental decoding
2Unexpected embedding dimension received: input is z, model expects r   rZ   r   Nz4Bidirectional EMA does not support incremental stater  r   r%   r  .)rB   rs   rA   r   r   r   r  r   rq   r$  rb   silur	  r3   splitrc   r  r  )r9   r  r+  r,  r-  rf   bszrs   residualr)  r(  r   fft_lens_indexr  k1k2r"  gated_ema_outputs                      r;   rJ   #MegaMultiDimensionDampedEma.forward  s    > #)++-i&DYKO_`d`n`n_op 
 000 1a(%77:BB6JKF)UVV!%v:!VC &&(C %% ((1FGG ++a.K!!Vnndnn-MSTUBr!OQ#78155qR]`aRaNb;ccva';<!/!3k/A---fW-McSZgn]nSnNnoJ#++F3J vvj&8&8Aq&AH&LM#T))r=   )r   r   r   r   r   r   r   rs   r   r   r   r   r   r   )NNF)rN   rO   rP   rQ   rR   r   r(   r   r   r  r  r	  r  r$  r  r   r3   r]   r   rJ   rS   rT   rU   s   @r;   r   r   :  s    z 08
^C 
^ 3S 3"(AT
/ 26-1M* !.M* U\\*	M*
 M* 
M* M*r=   r   c                      ^  \ rS rSrSrS\4U 4S jjrS rS r    SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\   S\S\S\\
R                  \	\
R                     4   4S jjrSrU =r$ )MegaGatedCrossAttentioni  a,  
Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
`static_kv` arguments, and the stateful representation of incremental decoder state.
r   c                 n  > [         TU ]  5         Xl        [        U R                  R                     U l        U R                  R
                  U l        U R
                  S:X  a  U R                  R                  S-  OS U l        [        U R                  R                  U R                  R                  S9U l        [        U R                  R                  U R                  R                  S9U l        [        U R                  R                  SS9U l        U R                  R                   U l        [%        U R                  R&                  U R                  R(                  U R                  R*                  S9U l        [.        R0                  " U R                  R(                  U R                  R                  5      U l        [.        R0                  " U R                  R(                  U R                  R(                  5      U l        [.        R0                  " U R                  R(                  SU R                  R(                  -  U R                  R                  -   5      U l        [.        R0                  " U R                  R(                  U R                  R(                  5      U l        U R                  R:                  S:X  a  [=        U5      U l        OMU R                  R:                  S:X  a  [A        U5      U l        O"[C        S	U R                  R:                   35      e[.        RD                  " S
S9U l#        g )Nsoftmax      ࿩r   Fr   rZ   simpler   z unknown relative position bias: r%   r   )$r'   r(   r   r
   
activationattention_activationrr   scalingr   dropout_probuse_feature_dropoutr   hidden_dropout_probhidden_dropoutattention_probs_dropout_probattention_dropoutnormalize_before_megaprenormr   normalization_typer+   norm_affiner   r   Lineark_projv_projq_projh_projrelative_positional_biasrW   r^   rl   rA   Softmaxr>  r8   s     r;   r(    MegaGatedCrossAttention.__init__  s'    !7!78$(KK$D$D!GKG`G`dmGmt{{==tCsw"4;;#;#;DKKLkLkl)KK++DKK<[<[
 "-T[[-U-Ufk!l{{88$KK**DKK,C,CDKKLcLc
	 ii 7 79_9_`ii 7 79P9PQiiKK##Q)@)@%@4;;CiCi%i
 ii 7 79P9PQ;;//8; @ HD[[11X= @ HD?@d@d?efggzzb)r=   c                    UR                  5       u  pVnUc  UR                  S5      OUS-   nUb!  UR                  SS9R                  USS5      n	OUn	U R                  [	        X5      5      S S 2S U24   n
Ub%  UR                  S5      S:w  a  [        S5      eX   n
OU
S U n
[        R                  " XR                  SS5      5      U	-  U
-   n[        U R                     " U5      R                  U5      nUb  XR                  S5      -  nU$ )Nr   r%   r   9Position offset provided with queries longer than 1 tokenrZ   )rB   sumre   r^   maxrA   r3   bmmr   r
   rD  r  r   )r9   querykeykey_padding_maskpidxr3  src_len_tgt_lenlengthsrg   qkattn_weightss                r;   element_attention)MegaGatedCrossAttention.element_attention6  s   ((*a#'<%**Q-TAX'&**r*277QBGG   W!678G8Dzz!}! !\]]:D >D YYummAq12W<tCd778<DDRH''*D*DQ*GGLr=   c                 F   UR                  5       u  pVnUc  UR                  S5      OUS-   nU R                  [        X5      5      S S 2S U24   n	Ub%  UR                  S5      S:w  a  [        S5      eX   n	OU	S U n	XR                  -  n[
        R                  " XR                  SS5      5      U	-   n
UbJ  U
R                  SU-
  R                  S5      R                  [
        R                  5      [        S5      5      n
U R                  U
5      R                  U
5      nU$ )Nr   rY  rZ   -inf)rB   r^   r[  rA   rE  r3   r\  r   masked_fillr   r   r   r   r>  r  )r9   r]  r^  r_  r`  r3  ra  rb  rc  rg   re  rf  s               r;   softmax_attention)MegaGatedCrossAttention.softmax_attentionT  s
   ((*a#'<%**Q-TAX   W!678G8Dzz!}! !\]]:D >D $YYummAq12T9'%5!5 @ @ C F Fuzz RTYZ`TabB||B'//3r=   r^  valuer_  past_key_valuesoutput_attentionsr-  r.  c                    UR                  5       u  pn
XR                  R                  :w  a%  [        SU
 SU R                  R                   35      eUb8  US:w  a  [        SU 35      eUSS u  pS=p#US   nUR                  S5      S-   nOS=pU(       a  US:X  a  SOSnUnU R                  (       a  U R                  U5      nU R                  U5      n[        R                  " UU R                  R                  U R                  R                  U R                  R                  /SS	9u  nnn[        R                  " U5      n[        R                  " U5      nUc  Ub  [        S
5      eS=nnO1U R                  U5      nU R                  U R                  U5      5      nUR!                  SS5      nUb  UR!                  SS5      nUb  UR!                  SS5      nUb  UnUnU(       a  UnUnUR                  S5      nUb  UR#                  5       S:X  a  SnUb@  UR                  S5      U	:w  a  [        S5      eUR                  S5      U:w  a  [        S5      eU R$                  S:X  a  U R'                  UUXN5      nOU R)                  UUXN5      nU R+                  USS9nU R-                  U5      n[        R.                  " UU5      R!                  SS5      nU R                  U R1                  UU-  5      5      nU R3                  U5      n[        R4                  " UUUU-
  5      nU R                  (       d  U R                  U5      nU(       a  UU4OU4nU(       a  UWW4-   nU$ )a	  
Gated cross-attention used in Mega

Args:
    query (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
        The self (or target) sequence input used as query inputs for cross-attention
    key (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
        The cross (or source) sequence input with shape used as keys in cross-attention
    value (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
        The cross (or source) sequence input with shape used as values in cross-attention
    key_padding_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
        Padding mask corresponding to the source sequence, where entries are 1 for *not masked* and 0 for
        *masked* tokens
    past_key_values (`tuple(torch.FloatTensor)`, *optional*):
        If provided, the hidden state returned from the previous timestep during incremental decoding; expects
        that prior cross-attention keys and values will be the last two items in the tuple
    output_attentions (`bool`, defaults to `False`):
        Whether or not to return the cross-attention weights.
    use_cache (`bool`, defaults to `False`):
        Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
        updated EMA hidden state for use in the next step

Returns:
    `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
    inputs:
    - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
      Hidden states from target sequence updated by gated cross-attention
    - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
      `(batch_size, source_sequence_length, target_sequence_length)` -- The pairwise cross-attention weights
      corresponding to each token in the source and target sequences
    - **cross_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      source_sequence_length, config.shared_representation_size)` -- The cross-attention key state for use in
      the next step of incremental decoding
    - **cross_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      source_sequence_length, config.hidden_size)` -- The cross-attention value state for use in the next step
      of incremental decoding
r0  z but expected Nr   z>Incremental decoding requested with self-sequence length > 1: r   r   r%   r   z+Key and value must be `None` simultaneouslyz6Key padding mask does not align on the batch dimensionz@Key padding mask does not align on the sequence length dimensionr>  Tr   )rB   r   r+   rA   rM  r   rS  r3   r2  rr   r   rb   r1  rQ  rC  rR  r   r   rD  rl  rg  rI  rK  r\  rT  r   addcmul)r9   r]  r^  rn  r_  ro  rp  r-  rf   r3  rs   prev_cross_keyprev_cross_valueprev_self_keynum_incremental_steps
full_queryquery_projectedr   target_gateattention_queryprojected_keyprojected_valueupdated_cross_keyupdated_cross_valuectx_lenrf  r   weighted_targetsr)  outputss                                 r;   rJ   MegaGatedCrossAttention.forwardn  s~   ` #(**,i///DYK~^b^i^i^u^u]vw  &!| #abiaj!kll/>rs/C,NC ,A.M$1$6$6q$9A$=!044N)21A4!
<<:.J ++j1
 9>[[$$dkk&=&=t{{?e?ef9
5o  --8ff[);  !NOO.22MO !KK,M"oodkk#.>?O *33Aq9$)33Aq9M&-771=O &*M.O  -"1$$Q' ',<,@,@,Ba,G#'$$Q'3. !YZZ$$Q'72 !cdd$$	1110@L  110@L --o4-P''5 !99V_=GG1M??4;;7G+7U+VW<<(89mmE?4Du4LM||))C.C):3%!24G HHGr=   )rC  rD  rK  r   r   rT  rI  rQ  r   rM  rS  r^   rE  r>  rR  )NNFF)rN   rO   rP   rQ   rR   r   r(   rg  rl  r   r3   r]   r   r   tuplerJ   rS   rT   rU   s   @r;   r<  r<    s    "*z "*H<> 48+/"'\ ell#\ %	\
 #5<<0\ "%\  \ \ 
u||Xell33	4\ \r=   r<  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r     SS\	\
R                     S\	\
R                     S	\	\   4S
 jjrSrU =r$ )MegaMovingAverageGatedAttentioni  an  
Pure PyTorch implementation of Mega block; see https://huggingface.co/papers/2209.10655 and original fairseq implementation
at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)

Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
multiplicative attention masks
r   c                 f  > [         TU ]  5         Xl        [        U R                  R                     U l        U R                  R
                  S:X  a  U R                  R                  S-  OS U l        [        U R                  R                  U R                  R                  S9U l        [        U R                  R                  U R                  R                  S9U l        [        U R                  R                  SS9U l        [!        U R                  R"                  U R                  R$                  U R                  R&                  S9U l        [+        U5      U l        [.        R0                  " U R                  R$                  U R                  R2                  5      U l        [.        R0                  " U R                  R$                  U R                  R                  U R                  R2                  -   SU R                  R$                  -  -   5      U l        [.        R0                  " U R                  R2                  U R                  R$                  5      U l        [.        R:                  " [<        R>                  " SU R                  R                  5      5      U l         [.        R:                  " [<        R>                  " SU R                  R                  5      5      U l!        U R                  RD                  S:X  a  [G        U5      U l$        OMU R                  RD                  S:X  a  [K        U5      U l$        O"[M        S	U R                  RD                   35      e[.        RN                  " S
S9U l(        U R                  R
                  S:X  a  U RR                  U l+        g U RT                  U l+        g )Nr>  r?  r@  FrA  rZ   rB  r   z"Unknown relative positional bias: r%   r   ),r'   r(   r   r
   rC  rD  rr   rE  r   rF  rG  r   rH  rI  rJ  rK  r   rN  r+   rO  r   r   ema_gater   rP  intermediate_sizerR  mx_projrT  r\   r3   r]   	qk_weightqk_biasrU  rW   r^   rl   rA   rV  r>  rl  rg  attention_functionr8   s     r;   r(   (MegaMovingAverageGatedAttention.__init__  s    !7!78<@KK<\<\`i<iDKK22D8os 	 #4;;#;#;DKKLkLkl)KK++DKK<[<[
 "-T[[-U-Ufk!l$KK**DKK,C,CDKKLcLc
	 4F;ii 7 79V9VWyyKK##KK22T[[5R5RRUVY]YdYdYpYpUpp
 ii = =t{{?V?VWell1dkk6\6\&]^||ELLDKK4Z4Z$[\;;//8; @ HD[[11X= @ HDA$++BfBfAghiizzb)&*kk&F&F)&SD"" 	Y]YoYo 	r=   c                 @   UR                  S5      nUb/  UR                  SSS9nUR                  SS9R                  S5      nOUnUb  UR                  SSS9nU R	                  U5      nXQR                  S5      :w  a%  UR                  S5      S	:w  a  [        S
5      eUSS n[        R                  " XR                  SS5      5      U-  U-   n[        U R                  R                     " U5      R                  U5      n	Ub  XR                  S5      -  n	Ub  X-  n	U	$ )z
Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
masked.
rZ   Nr%   T)r   r   )r  r   r   z2Size mismatch between Q and K in element attentionra   )rB   rZ  clampr   r^   rA   r3   matmulr   r
   r   rD  r  )
r9   r]  r^  padding_maskcausal_maskrf   rd  rg   re  rf  s
             r;   rg  1MegaMovingAverageGatedAttention.element_attention?  s!    ((1+#"&&r4&8Gmmm,66r:GG"!oo"do;G   )jjm#zz!}! !UVV9D \\%q!!45?$Fdkk>>?CKKBO#'*@*@*CCL"'5Lr=   c                    UR                  S5      nU R                  U5      nXQR                  S5      :w  a%  UR                  S5      S:w  a  [        S5      eUSS nXR                  -  n[        R
                  " XR                  SS5      5      U-   nUbN  [        R                  " XGR                  S9nUR                  SU-
  R                  5       [        S5      5      nXx-   nUbs  SU-
  nUR                  SS	S
9n	[        R                  " X9) 5      nUR                  UR                  S5      R                  [        R                  5      [        S5      5      nU R!                  U5      R#                  U5      n
U
$ )zEStandard softmax self-attention, as in the original Transformer paperrZ   r   z2Size mismatch between Q and K in softmax attentionr%   Nra   r#   rj  Tr   )rB   r^   rA   rE  r3   r  r   
zeros_liker$   rk  r   r   alllogical_andr   r   r>  r  )r9   r]  r^  r  r  rf   rg   re  additive_causal_maskpadding_mask_allrf  s              r;   rl  1MegaMovingAverageGatedAttention.softmax_attentionf  sW   ((1+  )jjm#zz!}! !UVV9D $ \\%q!!45< "#(#3#3Kxx#P #7#C#CQ_DZDZD\^cdj^k#l *B#
 |+L+//B/E ,,\;LML 6 6q 9 < <UZZ H%PV-XB||B'//3r=   r  r  ro  c                    UR                  5       u  pxn	XR                  R                  :w  a%  [        SU R                  R                   SU	 35      eUn
U R                  R                  (       a  U R                  U5      nU R                  U R                  U5      5      nU R                  R                  (       a   Ub  US:  a  [        SU 35      eUSS u  pnOS=n=pU R                  XXS9u  nnU R                  U5      nU R                  U5      n[        R                  " UU R                  R                  U R                  R                  U R                  R                  -   U R                  R                  /S	S
9u  nnn[        R                   " U5      n["        R$                  " U5      n[        R                  " UU R                  R                  U R                  R                  /S	S
9u  nnUR'                  S5      U R(                  -  U R*                  -   n[        R,                  " USS
9u  nnUR/                  SS5      nUR/                  SS5      nUR/                  SS5      nU R                  R                  (       a  Ub  [        R0                  " UU/SS
9nUb  [        R0                  " X/SS
9nU R                  R2                  (       d  UnUnO7UR                  S5      U R                  R4                  -  nUS:X  a  SnSnOUnUnUR                  S5      nU R                  R2                  (       dI  UR'                  S5      nUR'                  S5      nUR'                  S5      nUb  UR'                  S5      nGOXpR                  R4                  :  a  UR'                  S5      nOTXpR                  R4                  -  nUR7                  UUU R                  R4                  U R                  R                  5      nUU R                  R4                  :  a7  UR'                  S5      nUR'                  S5      nUb  UR'                  S5      nOUU R                  R4                  -  nUR7                  UUU R                  R4                  U R                  R                  5      nUR7                  UUU R                  R4                  U R                  R                  5      nUb'  UR9                  UUU R                  R4                  5      nUb  UR;                  5       S:X  a  SnU R=                  UUX#S9nU R?                  USS9nU RA                  U5      n[        RB                  " UU5      R9                  XU R                  R                  5      R/                  SS5      n U R                  UU RE                  U U-  5      -   5      n U R                  U 5      n [        RF                  " U
UU U
-
  5      n!U R                  R                  (       d  U R                  U!5      n!U(       a  U!U4OU!4n"U R                  R                  (       a  U"WWU4-   n"U"$ )a	  
Mega's self-attention block, which combines multi-headed EMA with traditional self-attention

Args:
    input (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
        Hidden states to be updated by Mega's self-attention
    padding_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
        or 0 for *masked*
    causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
        masked* or 0 for *masked*
    past_key_values (`tuple(torch.Tensor)`, *optional*):
        The hidden states returned from the previous timestep during incremental decoding; expects that
        self-attention key, value, and EMA states are the first 3 entries in the tuple
    output_attentions (`bool`, default `False`):
        Whether to return self-attention weights
    use_cache (`bool`, default `False`):
        Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
        states for use in the next step

Returns:
    `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
    inputs:
    - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
      states from target sequence updated by Mega's self-attention
    - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
      `(batch_size, 1, sequence_length, sequence_length)` -- The self-attention weights corresponding to how
      each token in the input sequence attends to every other token
    - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
      step of incremental decoding
    - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
      incremental decoding
    - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
      `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
z$Input embedding dimension should be z; received Nr   zGIncremental decoding only supports self sequence length of 1; received r   ra   )r+  r,  r-  r%   r   rZ   )r  r  Trr  )$rB   r   r+   rA   rL  r   rC  rR  
is_decoderr  r   r  r3   r2  rr   r  r   rb   r1  r   r  r  unbindr   r   use_chunkingr[   reshapere   r   r  rI  rK  r  rT  rs  )#r9   r   r  r  ro  rp  r-  rf   r3  rs   r4  rn  rv  prev_self_valueprev_ema_stateema_outupdated_ema_statebaser   query_key_gatesintermediate_state	query_keyattention_gater]  r^  updated_self_keyupdated_self_valuecurr_lenr  n_chunksrf  r   weighted_self_outputr)  return_valuess#                                      r;   rJ   'MegaMovingAverageGatedAttention.forward  s   ` #(**,i///CDKKD[D[C\\ghqgrstt ;;,,IIe$E E 23
 ;;!!'B{ #jkrjs!tuu=LQq=Q:MN?CCMCO &*]]> &3 &
"" ,,w' ||G$?D{{''669V9VV''
 @
<*<  --8 &&1 %*KKdkkDDdkkFcFcdjl%
!	> ''*T^^;dllJ	 \\)3
s
 1%mmAq!1%;;!!
 (ii 4!<*		?":B ;;++#& %*"88A;)?)??q='+$)-&'*$).&((1+{{'' OOA&E--"COOA&E'+55a8 ///* #kk&<&<<c8T[[5K5KT[[MsMst///mmA&*+#/#9#9!#<L #dkk&<&<<kk#x1G1GIoIopc8T[[5K5KT[[MjMjk+#/#4#4S(DKKDZDZ#[L #(8(8(:a(?L..uc.n##Et#<''5 LL',,S4;;;X;XYccdeghi 	
  $/ADKKPdguPuDv/vw#||,@AmmHo7Kh7VW{{00))C.C/@l+sf;;!!)-=?QSd,eeMr=   )rC  rK  r  r   r   r  rT  rI  r  r   r  r  r^   rE  r>  rR  )NNNFF)rN   rO   rP   rQ   rR   r   r(   rg  rl  r   r3   r]   r   rJ   rS   rT   rU   s   @r;   r  r    su    '
z '
R%N$R 04.2+/N u||,N ell+	N
 "%N Nr=   r  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ ) MegaNormalizedFeedForwardNetworki]  z
Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args
from Hugging Face config
r   c                 \  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        [        UR
                     U l        [        U R                  R                  U R                  R                  S9U l        [        U R                  R                  U R                  R                  S9U l        U R                  R                  U l        [!        U R                  R"                  U R                  R$                  U R                  R&                  S9U l        [*        R,                  " U R                  R$                  U R                  R                  5      U l        [*        R,                  " U R                  R                  U R                  R$                  5      U l        g )Nr@  rA  )r'   r(   r   nffn_hidden_size
hidden_dimrC  act_fnr
   r   rF  rG  r   nffn_activation_dropout_probrI  normalize_before_ffnrM  r   rN  r+   rO  r   r   rP  fc1fc2r8   s     r;   r(   )MegaNormalizedFeedForwardNetwork.__init__c  s    11'' !2!23"4;;#;#;DKKLkLkl)KK44T[[EdEd
 {{77$KK**DKK,C,CDKKLcLc
	 99T[[44dkk6R6RS99T[[994;;;R;RSr=   c                 @   UnU R                   (       a  U R                  U5      nU R                  U R                  U5      5      nU R	                  U5      nU R                  U5      nU R                  U5      nXB-   nU R                   (       d  U R                  U5      nU$ r   )rM  r   rC  r  rI  r  r   )r9   r  r4  hiddenr   s        r;   rJ   (MegaNormalizedFeedForwardNetwork.forwardx  s    <<YYv&F&!12$$V,&!f%"||YYv&Fr=   )
r  rC  r   r   r  r  r  rI  r   rM  rM   rU   s   @r;   r  r  ]  s     
Tz T* r=   r  c                   (  ^  \ rS rSrS\4U 4S jjr\" SSSS9       SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\R                     S\	\   S\	\   S\S\\R                     4S jj5       rSrU =r$ )	MegaBlocki  r   c                 j  > [         TU ]  5         SU l        [        U5      U l        UR
                  (       a  [        U5      OS U l        UR                  U l        UR                  U l	        U R                  (       a0  U R                  (       d  [        U  S35      e[        U5      U l        g S U l        g )Nr   z> should be used as a decoder model if cross attention is added)r'   r(   seq_len_dimr  
mega_layeruse_normalized_ffnr  nffnr  add_cross_attentionrA   r<  
cross_attnr8   s     r;   r(   MegaBlock.__init__  s    9&A@F@Y@Y4V<_c	 ++#)#=#= ##?? D6)g!hii5f=DO"DOr=   past_key_valuero  z4.58)new_nameversionhidden_statesr+  r  encoder_hidden_statesencoder_attention_maskrp  r-  r.  c	           
      <   U(       a  Ub  Ub  USS2S4   R                  S5      n	OUn	U R                  UU	UUUUS9n
U
S   nU(       a  U
SS OSu  pnU(       a  U
S   OSnU R                  bG  Uc  [        S5      eU R                  UUUUUUUS	9nUS   nU(       a  US
S OSu  nnU(       a  US   OSnU R                  b  U R	                  U5      nU4nU(       a  UU4-   nU R                  b  UW4-   nU(       a  UUU4nU R                  b  UWW4-   nUU4-   nU$ )ac  
A single Mega layer: either encoder or decoder, with optional cross-attention and optional normalized
feed-forward layer

Args:
    hidden_states (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
        Hidden states to be updated by the Mega block
    attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
        Indicates which entries in the self/target sequence are to be ignored (mostly due to padding), where
        elements are either 1 for *not masked* or 0 for *masked*. Causal attention is enforced internally.
    causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
        Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
        masked* or 0 for *masked*
    encoder_hidden_states (`torch.Tensor`, of shape `(source_sequence_length, batch_size, hidden_size)`, *optional*):
        Encoder hidden states to be used for cross-attention (and required for encoder-decoder model setup)
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
        Indicates which entries in the cross/source sequence are to be ignored (mostly due to padding), where
        elements are either 1 for *not masked* or 0 for *masked*.
    past_key_values (`tuple(torch.Tensor)`, *optional*):
        The hidden states returned from the previous timestep during incremental decoding; expects that
        self-attention key, value, and EMA states are the first 3 entries in the tuple, and (if doing
        cross-attention) cross-attention key and value are the last 2 entries in the tuple
    output_attentions (`bool`, default `False`):
        Whether to return self-attention weights
    use_cache (`bool`, default `False`):
        Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
        states for use in the next step

Returns:
    `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
    inputs:
    - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
      Hidden states from target sequence updated by Mega
    - **self_attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
      `(batch_size, 1, target_sequence_length, target_sequence_length)` -- The self-attention weights
      corresponding to how each token in the input sequence attends to every other token
    - **cross_attn_weights** (*optional*, returned when `output_attentions=True` and
      `config.add_cross_attention=True`) `torch.FloatTensor` of shape `(batch_size, source_sequence_length,
      target_sequence_length)` -- Pairwise cross-attention weights between every entry in the source sequence
      and target sequence
    - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
      step of incremental decoding
    - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
      sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
      incremental decoding
    - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
      `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
    - **cross_key** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
      `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.shared_representation_size)` --
      The cross-attention key state for use in the next step of incremental decoding
    - **cross_value** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
      `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.hidden_size)` -- The
      cross-attention value state for use in the next step of incremental decoding
Nr%   )r   r  r  ro  rp  r-  r   rL   r   zARequested cross-attention without providing encoder hidden states)r]  r^  rn  r_  ro  rp  r-  r   NN)r   r  r  rA   r  )r9   r  r+  r  r  r  ro  rp  r-  mega_padding_maskmega_outputsnew_hidden_statesself_key
self_valueself_ema_stateself_attention_weightscross_attn_outputs	cross_keycross_valuecross_attention_weightsoutsnew_key_valuess                         r;   rJ   MegaBlock.forward  s   P /5N<V .q"u 5 ? ? C .*#+/ ' 
 )ODM|BC/@Se,n4Ea4 ??&$, !dee!%')+!7 /"3# "1 " !31 5@I%7%<|"I{?P&8&;VZ# 99  $		*; <!#133D*688N
 *!/9k2J!J>++Dr=   )r  r  r  r  r  r  )NNNNNFF)rN   rO   rP   rQ   r   r(   r   r3   r]   r   
LongTensorry   r   r   r  rJ   rS   rT   rU   s   @r;   r  r    s    #z # %0A6R 6:26=A>B+/,1C||C !!1!12C e../	C
  ((9(9:C !)):): ;C "%C $D>C C 
u||	C SCr=   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
MegaPooleri!  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r'   r(   r   rP  r+   denseTanhrC  r8   s     r;   r(   MegaPooler.__init__"  s9    YYv1163E3EF
'')r=   r  r.  c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r  rC  )r9   r  first_token_tensorpooled_outputs       r;   rJ   MegaPooler.forward'  s6     +1a40

#566r=   )rC  r  )
rN   rO   rP   rQ   r(   r3   r]   rJ   rS   rT   rU   s   @r;   r  r  !  s(    $
U\\ ell  r=   r  c                   8    \ rS rSr% Sr\\S'   SrSrS/r	S r
Srg	)
MegaPreTrainedModeli0  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
r   megaFr  c           
      b   [        U[        5      (       Ga  [        R                  " 5          [        R
                  R                  UR                  SU R                  R                  S9  [        R
                  R                  UR                  SU R                  R                  S9  [        R                  " U R                  R                  S5      nU R                  R                  S:  aQ  [        R                  " [        [        SU R                  R                  S5      5      5      nUR!                  SUS5        UR"                  R                  SU R                  R$                  S9R'                  U5        [        R
                  R                  UR(                  SU R                  R*                  S9  [        R
                  R                  UR,                  SU R                  R*                  S9  SSS5        g[        U[.        5      (       a>  [        R
                  R                  UR0                  SU R                  R2                  S9  g[        U[4        5      (       a{  [        R
                  R                  UR6                  SU R                  R2                  S9  [        R
                  R                  UR8                  SU R                  R2                  S9  g[        U[:        5      (       aG  U R                  R<                  (       a+  [        R
                  R?                  UR@                  S5        gg[        U[B        5      (       aG  U R                  R<                  (       a+  [        R
                  R?                  URD                  S5        gg[        U[F        5      (       ah  [        R
                  R                  URH                  SU R                  R2                  S9  [        R
                  R?                  URJ                  S5        g[        U[        RL                  5      (       ak  URD                  RN                  R                  SU R                  R2                  S9  URP                  b%  URP                  RN                  RS                  5         gg[        U[        RT                  5      (       ax  URD                  RN                  R                  SU R                  R2                  S9  URV                  b2  URD                  RN                  URV                     RS                  5         gg[        U[        RX                  5      (       aJ  URP                  RN                  RS                  5         URD                  RN                  R[                  S5        gg! , (       d  f       g= f)	zInitialize the weightsrp   )r   stdr   rZ   r   g      Nr   ).r   r   r3   no_gradr   initnormal_r   r   ema_delta_alpha_ranger   onesr   tensorlistrangeindex_fill_r   ema_beta_rangeadd_r   ema_gamma_omega_ranger   rW   r^   initializer_rangerl   rw   rx   r   rO  	constant_r   r   r   r  r  r  rP  datarg   zero_r)   r!   r   fill_)r9   modulevalidxs       r;   _init_weights!MegaPreTrainedModel._init_weights;  s   f9:: 5 5CT[[EfEfg 3 3#4;;CdCdejj!@!@!D;;22Q6,,tE!T[[5T5TVW,X'YZCOOAsD1++33$++B\B\3]bbcfg ? ?ct{{OpOpq 6 6SdkkFgFgh !  @AAGGOOF//ct{{?\?\O] @AAGGOOFLLs8U8UOVGGOOFNN$++:W:WOX..{{&&!!&--5 ',,{{&&!!&--5 ' ?@@GGOOF,,3DKK<Y<YOZGGfnnc2		**MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) .K !s   GV  
V. N)rN   rO   rP   rQ   rR   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesr  rS   r  r=   r;   r  r  0  s)    
 &+#:;**r=   r  a>  

    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`MegaConfig`]): Model configuration class with all the parameters of the
            model. Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            This parameter can only be used when the model is initialized with `add_token_type_embeddings` parameter
            set to `True`. All the value in this tensor should be always < config.type_vocab_size.

            [What are token type IDs?](../glossary#token-type-ids)
        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.c                     ^  \ rS rSrSrSS\4U 4S jjjrS rS r\	" \
R                  S5      5      \" \\\S9           SS	\\R$                     S
\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\   S\\   S\\   S\\   S\\   S\\\R$                     \4   4S jj5       5       rSrU =r$ )	MegaModeli  ae  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
Jonathan May, and Luke Zettlemoyer

To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
`True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to initialized with both
`is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an
`encoder_hidden_states` is then expected as an input to the forward pass.

.. _*Mega: Moving Average Equipped Gated Attention*: https://huggingface.co/papers/2209.10655

r   c                 <  > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l
        U(       a  [        U5      OS U l        U R                  5         g s  snf r   )r'   r(   r   r   embedding_layerr   
ModuleListr  num_hidden_layersr  layersr  pooler	post_init)r9   r   add_pooling_layerrb  r:   s       r;   r(   MegaModel.__init__  su     -f5mmfF^F^@_$`@_1Yv%6@_$`a,=j(4 	 %as   Bc                 .    U R                   R                  $ r   r  r-   r   s    r;   get_input_embeddingsMegaModel.get_input_embeddings  s    ##333r=   c                 $    XR                   l        g r   r  )r9   rn  s     r;   set_input_embeddingsMegaModel.set_input_embeddings  s    /4,r=   batch_size, sequence_length
checkpointoutput_typeconfig_classrD   r+  r"   rE   r  r  ro  r-  rp  output_hidden_statesreturn_dictr.  c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  Ub  [	        S5      eUb.  U R                  X5        UR                  5       nUR                  nO.Ub   UR                  5       SS nUR                  nO[	        S5      eU R                   R                  (       a/  [        R                  " US   U R                   R                  /5      nUu  pU R                   R                  (       aZ  XR                   R                  :  aA  XR                   R                  -  S:w  a%  [	        SU SU R                   R                   35      eU R                   R                  (       ae  Ub  UOU R                   R                  n[        R                  " SU4[        R                  US	9nU R!                  UU5      nUR#                  S5      nOS
nSnUbQ  [%        U5      U R                   R&                  :w  a.  [	        SU R                   R&                   S[%        U5       35      eU R)                  XUS9nUR+                  SS5      nUb  UR+                  SS5      nU
(       a  U4OSnU	(       a  SOSnU	(       a  U R                   R,                  (       a  SOSnU(       a  SOSn[/        U R0                  5       H  u  nnUb  UU   OSnU" UUUUUUU	US9nUS   nU
(       a  UUR+                  SS5      4-  nU	(       a1  US   nUU4-  nU R                   R,                  (       a  US   nUU4-  nU(       d  M  US   nUU4-  nM     UR+                  SS5      nU R2                  b  U R3                  U5      OSnU(       d  UU4UUUU4-   $ [5        UUUUUUS9$ )a  
encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
NzDYou cannot specify both input_ids and inputs_embeds at the same timer%   z5You have to specify either input_ids or inputs_embedsr   zconfig.use_chunking is activated; input sequence length must be shorter than or a multiple of config.chunk_size
received sequence length of z with chunk size r   r?   Fz;Received past key/value cache with size mismatch; expected z, received )rD   r"   rE   r  )r  r+  r  r  r  ro  rp  r-  rZ   )last_hidden_statepooler_outputro  r  
attentionscross_attentions)r   rp  r#  use_return_dictrA   %warn_if_padding_and_no_attention_maskrB   r@   r  r3   r  r[   r  r-  r  r6   *create_extended_attention_mask_for_decoderr'  lenr  r  r   r  	enumerater  r  r   ) r9   rD   r+  r"   rE   r  r  ro  r-  rp  r#  r$  rF   r@   
batch_sizesequence_lengthtemp_mask_for_extensionr  embedding_outputr  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheir  current_decoder_cacher  self_attn_weightscross_attn_weightsupdated_cacher  s                                    r;   rJ   MegaModel.forward  s,   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K%%F&',,.s3K"))FTUU;;##,,A8N8N'OPK&1#
;;##;;;Q;Q)Q!7!771<  d  et  du  uF  GK  GR  GR  G]  G]  F^  _  ;;!!%.%:	@U@UI
 ',jj!_1EUZZ`f&g#II+WnoK &--a0KIK 'c/.BdkkFcFc.cMdkkNkNkMllwx{  }L  yM  xN  O 
  //m 0 

 )221a8
 !,$9$C$CAq$I! 4H-/T$5b4%64;;;Z;Zr`d#,R$&t{{3MAz:I:UOA$6[_!%+-'&;'= 5"3#	L )OM# "m&=&=a&C%EE! $0O!#(9';;#;;22)5a&(-?,AA(y ,R 0"}&66"5 4: &//15 7;kk6MM2SW!=1!"#$	5   <+'.+*1
 	
r=   )r   r  r  r  )T)NNNNNNNNNNN)rN   rO   rP   rQ   rR   r   r(   r  r  r   MEGA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr   r3   r]   r   r   r   r  rJ   rS   rT   rU   s   @r;   r  r    se   
 
z 
 
45 ++@+G+GHe+fg&@$ -11515048<9=+/$(,0/3&*Y
ELL)Y
 !.Y
 !.	Y

  -Y
  (5Y
 !) 6Y
 "%Y
 D>Y
 $D>Y
 'tnY
 d^Y
 
uU\\"$PP	QY
 hY
r=   r  zFMEGA Model with a `language modeling` head on top for CLM fine-tuning.c                     ^  \ rS rSrS/rS\4U 4S jjr\" \R                  S5      5      \
" \\S9            SS\\R                     S\\R                      S	\\R                     S
\\R                      S\\R                      S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\   S\\\R*                     \4   4S jj5       5       rSS jrS rSrU =r$ )MegaForCausalLMii  zlm_head.weightr   c                   > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        UR                  (       aK  [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        OS U l        S U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g )NzLIf you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`Fr  )r'   r(   r  loggerwarningr  r  add_lm_hidden_dense_layerr   rP  r+   r  r  hidden_activationr*   lm_headr  r8   s     r;   r(   MegaForCausalLM.__init__o  s       NNijf>	++6#5#5v7I7IJDJ%'WWYD"DJ%)D"yy!3!3V5F5FG 	r=   r  )r!  r"  rD   r+  r"   rE   r  r  labelsro  r-  rp  r#  r$  r.  c                    Ub  UOU R                   R                  nUb  Sn	U R                  UUUUUUUU	U
UUS9nUS   nU R                  b"  U R                  U5      nU R	                  U5      nU R                  U5      nSnUb{  USS2SS2SS24   R                  5       nUSS2SS24   R                  5       n[        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )	az	  
encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
    Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

    If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
    don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
    `decoder_input_ids` of shape `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).

Returns:

Example:

```python
>>> from transformers import AutoTokenizer, MegaForCausalLM, AutoConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
>>> config = AutoConfig.from_pretrained("mnaylor/mega-base-wikitext")
>>> config.is_decoder = True
>>> config.bidirectional = False
>>> model = MegaForCausalLM.from_pretrained(
...     "mnaylor/mega-base-wikitext", config=config, ignore_mismatched_sizes=True
... )

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NF)
r+  r"   rE   r  r  ro  r-  rp  r#  r$  r   r%   r   rZ   )losslogitsro  r  r(  r)  )r   r*  r  r  rH  rI  
contiguousr   re   r*   r   ro  r  r(  r)  )r9   rD   r+  r"   rE   r  r  rK  ro  r-  rp  r#  r$  r  sequence_outputprediction_scoreslm_lossshifted_prediction_scoresloss_fctr   s                       r;   rJ   MegaForCausalLM.forward  sz   ~ &1%<k$++B]B]I))))'"7#9+/!5#  
 "!*::!"jj9O"44_EO LL9(9!SbS!)(D(O(O(Q%AqrE]--/F')H8==b$++BXBXY[a[f[fgi[jkG')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r=   c                 h    UR                   nUc  UR                  U5      nUb  US S 2SS 24   nXUS.$ )Nr%   )rD   r+  ro  )shapenew_ones)r9   rD   ro  r+  model_kwargsrF   s         r;   prepare_inputs_for_generation-MegaForCausalLM.prepare_inputs_for_generation  sE    oo!&//<N &!!RS&)I&]lmmr=   c                 P   ^ SnU H  nU[        U4S jU 5       5      4-  nM     U$ )Nr  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr   r@   ).0r  beam_idxs     r;   	<genexpr>1MegaForCausalLM._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)r  )r9   ro  r`  reordered_past
layer_pasts     `  r;   _reorder_cacheMegaForCausalLM._reorder_cache  s8    )Jncmnn N * r=   )r  rH  rI  r  )NNNNNNNNNNNNr  )rN   rO   rP   rQ   _tied_weights_keysr   r(   r   r=  r>  r   r   r@  r   r3   r  ry   r   r   r   r  r]   rJ   rZ  re  rS   rT   rU   s   @r;   rB  rB  i  s    ++z ( ++@+G+GHe+fg+L[jk 156:5959=A>B-1+/$(,0/3&*j
E,,-j
 !!2!23j
 !!1!12	j

   1 12j
  ((9(9:j
 !)):): ;j
 ))*j
 "%j
 D>j
 $D>j
 'tnj
 d^j
 
uU\\"$EE	Fj
 l hj
X
n r=   rB  z2MEGA Model with a `language modeling` head on top.c                     ^  \ rS rSrS/rS\4U 4S jjrS rS r\	" \
R                  S5      5      \" \\\SS	S
S9          SS\\R$                     S\\R&                     S\\R$                     S\\R&                     S\\R&                     S\\R&                     S\\R$                     S\\   S\\   S\\   S\\\R.                     \4   4S jj5       5       rSrU =r$ )MegaForMaskedLMi  zmlm_head.weightr   c                 ,  > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        UR                  (       aK  [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        OS U l        S U l        [        R                  " UR                  UR                  5      U l        [        R                   " UR"                  5      U l        U R'                  5         g )NzfIf you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for bi-directional self-attention.FrD  )r'   r(   r  rE  rF  r  r  rG  r   rP  r+   r  r  rH  r*   mlm_headDropoutrF  r   r  r8   s     r;   r(   MegaForMaskedLM.__init__
  s     NN1
 f>	++6#5#5v7I7IJDJ%'WWYD"DJ%)D"		&"4"4f6G6GHzz&"5"56 	r=   c                     U R                   $ r   rk  r   s    r;   get_output_embeddings%MegaForMaskedLM.get_output_embeddings   s    }}r=   c                     Xl         g r   ro  )r9   new_embeddingss     r;   set_output_embeddings%MegaForMaskedLM.set_output_embeddings#  s    &r=   r  z<mask>z' Paris'g?)r   r!  r"  maskexpected_outputexpected_lossrD   r+  r"   rE   r  r  rK  rp  r#  r$  r.  c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  b"  U R                  U5      nU R	                  U5      nU R                  U5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`dict[str, any]`, optional, defaults to *{}*):
    Used to hide legacy arguments that have been deprecated.
N)r+  r"   rE   r  r  rp  r#  r$  r   r%   rZ   rM  rN  r  r(  )r   r*  r  r  rH  rk  r   re   r*   r   r  r(  )r9   rD   r+  r"   rE   r  r  rK  rp  r#  r$  r  rP  rQ  masked_lm_lossrT  r   s                    r;   rJ   MegaForMaskedLM.forward&  s"   < &1%<k$++B]B]))))'"7#9/!5#  

 "!*::!"jj9O"44_EO MM/:')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r=   )r  r   rH  r  rk  )
NNNNNNNNNN)rN   rO   rP   rQ   rg  r   r(   rp  rt  r   r=  r>  r   r?  r   r@  r   r3   r  ry   r   r   r  r]   rJ   rS   rT   rU   s   @r;   ri  ri    s`   +,z ,' ++@+G+GHe+fg&"$" 156:5959=A>B-1,0/3&*6
E,,-6
 !!2!236
 !!1!12	6

   1 126
  ((9(9:6
 !)):): ;6
 ))*6
 $D>6
 'tn6
 d^6
 
uU\\"N2	36
 h6
r=   ri  z
    MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   l  ^  \ rS rSrU 4S jr\" \R                  S5      5      \" \	\
\S9        SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\\R&                     \
4   4S jj5       5       rSrU =r$ )MegaForSequenceClassificationih  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFrD  )	r'   r(   
num_labelsr   r  r  MegaClassificationHead
classifierr  r8   s     r;   r(   &MegaForSequenceClassification.__init__p  sH      ++f>	08 	r=   r  r  rD   r+  r"   rE   rK  rp  r#  r$  r.  c	           
      ,   Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   n
U R                  U
5      nSnUGb  U R                   R                  c  U R
                  S:X  a  SU R                   l        OoU R
                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R
                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R
                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [        5       nU" X5      nU(       d  U4U	S	S -   nUb  U4U-   $ U$ [        UUU	R                   U	R"                  S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr+  r"   rE   rp  r#  r$  r   r   
regressionsingle_label_classificationmulti_label_classificationr%   rZ   rz  )r   r*  r  r  problem_typer  r$   r3   r6   r   r   r'  r   re   r   r   r  r(  r9   rD   r+  r"   rE   rK  rp  r#  r$  r  rP  rN  rM  rT  r   s                  r;   rJ   %MegaForSequenceClassification.forward{  s   . &1%<k$++B]B]))))'/!5#  
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r=   )r  r   r  r  NNNNNNNN)rN   rO   rP   rQ   r(   r   r=  r>  r   r?  r   r@  r   r3   r  ry   r   r   r  r]   rJ   rS   rT   rU   s   @r;   r~  r~  h  s   	 ++@+G+GHe+fg&,$ 156:5959-1,0/3&*?
E,,-?
 !!2!23?
 !!1!12	?

   1 12?
 ))*?
 $D>?
 'tn?
 d^?
 
uU\\"$<<	=?
 h?
r=   r~  z
    MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    c                   l  ^  \ rS rSrU 4S jr\" \R                  S5      5      \" \	\
\S9        SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\\R&                     \
4   4S jj5       5       rSrU =r$ )MegaForMultipleChoicei  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr   )r'   r(   r  r  r   rl  rH  r   rP  r+   r  r  r8   s     r;   r(   MegaForMultipleChoice.__init__  sV     f%	zz&"<"<=))F$6$6: 	r=   z(batch_size, num_choices, sequence_lengthr  rD   r"   r+  rK  rE   rp  r#  r$  r.  c	           
         Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   n	Ub!  UR                  SUR	                  S5      5      OSn
Ub!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  U
UUUUUUS9nUS   nU R                  U5      nU R                  U5      nUR                  SU	5      nSnUb  [        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a"  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr   r%   r   )r"   r+  rE   rp  r#  r$  rZ   rz  )r   r*  rW  re   rB   r  r   r  r   r   r  r(  )r9   rD   r"   r+  rK  rE   rp  r#  r$  num_choicesflat_input_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr  r  rN  reshaped_logitsrM  rT  r   s                        r;   rJ   MegaForMultipleChoice.forward  s   . &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 ))..,/!5#  
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r=   )r  r   r  r  )rN   rO   rP   rQ   r(   r   r=  r>  r   r?  r   r@  r   r3   r  ry   r   r   r  r]   rJ   rS   rT   rU   s   @r;   r  r    s    ++@+G+GHr+st&-$ 15596:-159,0/3&*:
E,,-:
 !!1!12:
 !!2!23	:

 ))*:
   1 12:
 $D>:
 'tn:
 d^:
 
uU\\"$==	>:
 u:
r=   r  z
    MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                   l  ^  \ rS rSrU 4S jr\" \R                  S5      5      \" \	\
\S9        SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\\R&                     \
4   4S jj5       5       rSrU =r$ )MegaForTokenClassificationi  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r'   r(   r  r  r  classifier_dropoutrH  r   rl  r   rP  r+   r  r  r9   r   r  r:   s      r;   r(   #MegaForTokenClassification.__init__   s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r=   r  r  rD   r+  r"   rE   rK  rp  r#  r$  r.  c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r%   rZ   rz  )r   r*  r  r   r  r   re   r  r   r  r(  r  s                  r;   rJ   "MegaForTokenClassification.forward.  s    * &1%<k$++B]B]))))'/!5#  
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r=   )r  r   r  r  r  )rN   rO   rP   rQ   r(   r   r=  r>  r   r?  r   r@  r   r3   r  ry   r   r   r  r]   rJ   rS   rT   rU   s   @r;   r  r    s    ++@+G+GHe+fg&)$ 156:5959-1,0/3&*.
E,,-.
 !!2!23.
 !!1!12	.

   1 12.
 ))*.
 $D>.
 'tn.
 d^.
 
uU\\"$99	:.
 h.
r=   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  if  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r'   r(   r   rP  r+   r  r  rH  rl  r   r  out_projr  s      r;   r(   MegaClassificationHead.__init__i  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHr=   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r  )r   r  r3   tanhr  )r9   featureskwargsxs       r;   rJ   MegaClassificationHead.forwardr  sY    Q1WLLOJJqMJJqMLLOMM!r=   )r  r   r  r   rU   s   @r;   r  r  f  s    7I r=   r  z
    MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                     ^  \ rS rSrU 4S jr\" \R                  S5      5      \" \	\
\S9         SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\\R&                     \
4   4S jj5       5       rSrU =r$ )MegaForQuestionAnsweringi|  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r'   r(   r  r  r  r   rP  r+   
qa_outputsr  r8   s     r;   r(   !MegaForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r=   r  r  rD   r+  r"   rE   start_positionsend_positionsrp  r#  r$  r.  c
           
         U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9n
U
S   nU R                  U5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU	(       d  X4U
SS -   nUb  U4U-   $ U$ [        UUUU
R                  U
R                  S	9$ )
a  
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for position (index) of the start of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for position (index) of the end of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
Nr  r   r   r%   r   )ignore_indexrZ   )rM  start_logits
end_logitsr  r(  )r   r*  r  r  r2  r'  rO  r-  rB   r  r   r   r  r(  )r9   rD   r+  r"   rE   r  r  rp  r#  r$  r  rP  rN  r  r  
total_lossignored_indexrT  
start_lossend_lossr   s                        r;   rJ    MegaForQuestionAnswering.forward  s   8 &1%<k$++B]B]))))'/!5#  
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r=   )r  r  r  )	NNNNNNNNN)rN   rO   rP   rQ   r(   r   r=  r>  r   r?  r   r@  r   r3   r  ry   r   r   r  r]   rJ   rS   rT   rU   s   @r;   r  r  |  s2    ++@+G+GHe+fg&0$ 156:59596:48,0/3&*D
E,,-D
 !!2!23D
 !!1!12	D

   1 12D
 "%"2"23D
   0 01D
 $D>D
 'tnD
 d^D
 
uU\\"$@@	AD
 hD
r=   r  )rB  ri  r  r  r~  r  r  r  )ErR   r}   typingr   r   r3   torch.nn.functionalr   
functionalrb   torch.nnr   r   r   activationsr
   cache_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   utils.deprecationr   configuration_megar   
get_loggerrN   rE  r?  r@  Moduler   rW   rl   r   r   r   r   r   r<  r  r  r  r  r  MEGA_START_DOCSTRINGr=  r  rB  ri  r~  r  r  r  r  __all__r  r=   r;   <module>r     s     "     A A " !   /  2 * 
		H	%2 3RYY 3lryy <2ryy 2j#X")) #XLL")) L8BII 2$ryy $@N*")) N*bbii DMbii M`
*ryy *ZS		 Sn 5*/ 5*p  % P dB
# B
	B
J PRfW) WWt NPde^
) ^
 f^
B  Q
$7 Q
Q
h  K
/ K
K
\  C
!4 C
C
NRYY ,  U
2 U
U
p	r=   