
    cCiA                       S r SSKrSSKJrJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJr  SSKJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$  SSK%J&r&  SSK'J(r(  \$RR                  " \*5      r+S\RX                  S\-S\-4S jr. " S S\R^                  5      r0 " S S\Rb                  5      r2 " S S\5      r3 " S S\5      r4 " S S\Rb                  5      r5 " S  S!\Rb                  5      r6\# " S" S#\!5      5       r7 " S$ S%\75      r8 " S& S'\75      r9\# " S( S)\75      5       r:\#" S*S+9 " S, S-\7\5      5       r;\#" S.S+9 " S/ S0\75      5       r<\# " S1 S2\75      5       r= " S3 S4\75      r> " S5 S6\7\5      r?/ S7Qr@g)8zPyTorch MVP model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_rightr+   3   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    c                      ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\S\	\R                     4U 4S	 jjjr
S
rU =r$ )MvpLearnedPositionalEmbeddingD   zF
This module learns positional embeddings up to a fixed maximum size.
num_embeddingsembedding_dimc                 L   > SU l         [        TU ]	  XR                   -   U5        g N   )offsetsuper__init__)selfr0   r1   	__class__s      r*   r7   &MvpLearnedPositionalEmbedding.__init__I   s"     ++5}Er,   r   past_key_values_lengthposition_idsc                   > Uc]  UR                   SS u  pE[        R                  " X"U-   [        R                  U R                  R
                  S9R                  US5      nOUR                  S5      n[        TU ]%  X0R                  -   5      $ )z3`input_ids' shape is expected to be [bsz x seqlen].Nr4   )dtypedevicer#   r   )r%   torcharangelongweightr?   expand	unsqueezer6   forwardr5   )r8   r   r;   r<   bszseq_lenr9   s         r*   rF   %MvpLearnedPositionalEmbedding.forwardO   s    
 $??2A.LC <<&(HPUPZPZcgcncncucufS"o  (11!4Lw|kk9::r,   )r5   )r   N)__name__
__module____qualname____firstlineno____doc__intr7   r@   Tensorr   rF   __static_attributes____classcell__r9   s   @r*   r.   r.   D   sW    Fs F3 F pt;;?B;V^_d_k_kVl; ;r,   r.   c                     ^  \ rS rSrSr    SS\S\S\\   S\\   S\\   S\\   4U 4S	 jjjr	\
" S
SSS9       SS\R                  S\\R                     S\\   S\\R                     S\\R                     S\\R                     S\S\\R                     S\\R                  \\R                     \\\R                        4   4S jj5       rSrU =r$ )MvpAttention_   z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropout
is_decoderbias	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩r[   )r6   r7   rW   rX   rY   head_dimr'   scalingrZ   r\   r   Lineark_projv_projq_projout_proj)r8   rW   rX   rY   rZ   r[   r\   r9   s          r*   r7   MvpAttention.__init__b   s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr,   past_key_valuepast_key_values4.58new_nameversionhidden_stateskey_value_statesattention_masklayer_head_maskattn_promptoutput_attentionscache_positionreturnc	                    USLn	UR                  5       u  pnU R                  U5      U R                  -  nSnUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUbc  U	(       d  UOSnWR'                  UUU R                  SU05      u  nnU	(       a.  [        U[        5      (       a  SUR
                  U R                  '   Ub  [(        R*                  " US   R-                  U
SSS5      U/SS	9n[(        R*                  " US   R-                  U
SSS5      U/SS	9nUbZ  [(        R.                  " U
SXS   R                  S5      5      R1                  UR2                  5      n[(        R*                  " UU/SS	9nXR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR4                  " U6 nUR4                  " U6 nUR4                  " U6 nUR                  S5      n[(        R6                  " UUR%                  SS5      5      nUR                  5       XR                   -  UU4:w  a.  [9        S
XR                   -  UU4 SUR                  5        35      eUbz  UR                  5       U
SUU4:w  a#  [9        SU
SUU4 SUR                  5        35      eUR                  XR                   UU5      U-   nUR                  XR                   -  UU5      n[:        R<                  R?                  USS	9nUb  UR                  5       U R                   4:w  a*  [9        SU R                   4 SUR                  5        35      eUR                  SSSS5      UR                  XR                   UU5      -  nUR                  XR                   -  UU5      nU(       a=  UR                  XR                   UU5      nUR                  XR                   -  UU5      nOSn[:        R<                  RA                  UU R@                  U RB                  S9n[(        R6                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [9        SXR                   XR"                  4 SUR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR5                  XU RD                  5      nU RG                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNFr#   r   r4   rs   Tr   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )$sizerd   r`   
isinstancer   
is_updatedgetr\   cross_attention_cacheself_attention_cachelayerskeysvaluesrb   rc   viewrX   r_   	transposeupdater@   catrD   zerostor?   reshapebmmr'   r   
functionalsoftmaxrY   rz   rW   re   )r8   rm   rn   rh   ro   rp   rq   rr   rs   is_cross_attentionrG   tgt_len_query_statesr}   curr_past_key_valuecurrent_states
key_statesvalue_statesprompt_mask
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                             r*   rF   MvpAttention.forward   s     .T9',,.a {{=1DLL@
&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#-?)]/"=*,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL*7It+>+E+Ednn?OQ_>`,(
L &*_FY*Z*ZAEO..t~~>"KN$9$9#r2r$JJ#W]^_J 99k!n&;&;CR&Ll%[abcL)#kk#q'q>;N;Nq;QRUUVdVkVkl!&K+Hr!SNN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r,   )rY   rW   r_   rZ   rb   r\   rX   re   rd   r`   rc   )        FTN)NNNNNFN)rJ   rK   rL   rM   rN   rO   r   floatboolr7   r   r@   rP   r   tuplerF   rQ   rR   rS   s   @r*   rU   rU   _   sf   G $'%*#$(CC C %	C
 TNC tnC D>C C: %0A6R 48+/1526.2"'15}2||}2 #5<<0}2 "%	}2
 !.}2 "%,,/}2 ell+}2  }2 !.}2 
u||Xell3XeELL>Q5RR	S}2 S}2r,   rU   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\R                  S\\	   S	\
\R                  \\R                     4   4S
 jjrSrU =r$ )MvpEncoderLayer   configc                 h  > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  S9U l        [        R                  " U R                  5      U l
        UR                  U l        [        UR                     U l        UR                  U l        [        R                   " U R                  UR"                  5      U l        [        R                   " UR"                  U R                  5      U l        [        R                  " U R                  5      U l        g )N)rW   rX   rY   )r6   r7   d_modelrW   rU   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrY   r
   activation_functionactivation_fnactivation_dropoutra   encoder_ffn_dimfc1fc2final_layer_normr8   r   r9   s     r*   r7   MvpEncoderLayer.__init__  s    %nn44,,

 %'LL$@!~~#F$>$>?"(";";99T^^V-C-CD99V33T^^D "T^^ <r,   rm   ro   rp   self_attn_promptrr   rt   c                    UnU R                  UUUUUS9u  p[        R                  R                  XR                  U R                  S9nXa-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nXa-   nU R                  U5      nUR                  [        R                  :X  a  [        R                  " U5      R                  5       (       d)  [        R                   " U5      R                  5       (       aC  [        R"                  " UR                  5      R$                  S-
  n[        R&                  " X* US9nX4$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
        `(2, encoder_attention_heads, pro_len, head_dim)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rm   ro   rp   rq   rr   rx   i  )minmax)r   r   r   rY   rz   r   r   r   r   r   r   r>   r@   float16isinfanyisnanfinfor   clamp)	r8   rm   ro   rp   r   rr   residualr   clamp_values	            r*   rF   MvpEncoderLayer.forward  su   * !&*nn')+(/ '5 '
# --m||VZVcVc-d 011-@ **488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m<%--/KK&**,,M0J0N0N0P0P++m&9&9:>>EK!KK<[YM**r,   )	r   r   rY   rW   r   r   r   r   r   )F)rJ   rK   rL   rM   r   r7   r@   FloatTensorr   r   r   rF   rQ   rR   rS   s   @r*   r   r      s    =y =, -2/+((/+ ))/+ **	/+
  ++/+ $D>/+ 
u  (5+<+<"==	>/+ /+r,   r   c            !         ^  \ rS rSrSS\4U 4S jjjr\" SSSS9           SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\R                     S\	\
   S\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jj5       rSrU =r$ )MvpDecoderLayeriC  r   c                   > [         TU ]  5         UR                  U l        [	        U R                  UR
                  UR                  SUS9U l        UR                  U l        [        UR                     U l        UR                  U l        [        R                  " U R                  5      U l        [	        U R                  UR
                  UR                  SUS9U l        [        R                  " U R                  5      U l        [        R$                  " U R                  UR&                  5      U l        [        R$                  " UR&                  U R                  5      U l        [        R                  " U R                  5      U l        g )NT)rW   rX   rY   rZ   r\   )rY   rZ   r\   )r6   r7   r   rW   rU   decoder_attention_headsr   r   rY   r
   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normra   decoder_ffn_dimr   r   r   )r8   r   r\   r9   s      r*   r7   MvpDecoderLayer.__init__D  s   %nn44,,
 ~~#F$>$>?"(";";$&LL$@!(NN**,,
 (*||DNN'C$99T^^V-C-CD99V33T^^D "T^^ <r,   rg   rh   ri   rj   rm   ro   encoder_hidden_statesencoder_attention_maskrp   cross_attn_layer_head_maskr   cross_attn_promptrr   	use_cachers   rt   c           
         UnU R                  UU	UUUU
US9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nSnUb`  UnU R                  UUUUUU	U
S9u  p[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nUnU R                  U R                  U5      5      n[        R                  R                  XR                  U R                  S9nU R                  U5      n[        R                  R                  XR                  U R                  S9nX-   nU R                  U5      nU4nU
(       a  UX4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
        `(2, decoder_attention_heads, pro_len, head_dim)`.
    cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
        `(2, decoder_attention_heads, pro_len, head_dim)`.
    past_key_values (`Cache`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rm   rh   ro   rp   rq   rr   rs   rx   N)rm   rn   ro   rp   rq   rh   rr   )r   r   r   rY   rz   r   r   r   r   r   r   r   r   )r8   rm   ro   r   r   rp   r   r   r   rh   rr   r   rs   r   self_attn_weightscross_attn_weightsoutputss                    r*   rF   MvpDecoderLayer.forward`  s   L ! ,0>>'+)+(/) ,: ,
( --m||VZVcVc-d 011-@ " ,$H040A0A+!65 :- /"3 1B 1-M MM11-<<Z^ZgZg1hM$4M 88GM !**488M+BC--m?V?Vaeanan-o/--m||VZVcVc-d 0--m< ")>>Gr,   )r   r   rY   rW   r   r   r   r   r   r   r   N)NNNNNNNNFTN)rJ   rK   rL   rM   r   r7   r   r@   rP   r   r   r   r   r   rF   rQ   rR   rS   s   @r*   r   r   C  si   =y = =8 %0A6R 268<9=26=A3748+/,1$(15U||U !.U  (5	U
 !) 6U "%,,/U %-U\\$:U #5<<0U $ELL1U "%U $D>U D>U !.U 
u  (51B1BEDUDU1U+V"WW	XU SUr,   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jr
SrU =r$ )MvpClassificationHeadi  z-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    > [         TU ]  5         [        R                  " X5      U l        [        R
                  " US9U l        [        R                  " X#5      U l        g )Nry   )r6   r7   r   ra   denseDropoutrY   re   )r8   r   r   r   r   r9   s        r*   r7   MvpClassificationHead.__init__  s@     	YYy4
zzN3		)9r,   rm   rt   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r   )rY   r   r@   tanhre   )r8   rm   s     r*   rF   MvpClassificationHead.forward  sN    ]3

=1

=1]3m4r,   )r   rY   re   )rJ   rK   rL   rM   rN   rO   r   r7   r@   rP   rF   rQ   rR   rS   s   @r*   r   r     sQ    7
:
: 
: 	
:
 
:U\\ ell  r,   r   c                   l   ^  \ rS rSrSrU 4S jrS\R                  S\\R                     4S jr	Sr
U =r$ )	MvpPrompti  z)Layer-wise prompt for encoder or decoder.c           	      :  > [         TU ]  5         UR                  U l        X l        X0l        UR
                  U-  U l        [        R                  " UR                  S9U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " [        R                  " UR
                  UR                  5      [        R                  " 5       [        R                  " UR                  US-  UR
                  -  5      5      U l        g )Nr   r4   )r6   r7   prompt_length
num_layersrX   r   r_   r   r   rY   	Embeddingprompt_embedding
Sequentialra   prompt_mid_dimGELUprompt_trans)r8   r   r   rX   r9   s       r*   r7   MvpPrompt.__init__  s    #11$")3zzFNN3 "V-A-A6>> RMMIIfnnf&;&;<GGIIIf++Z!^fnn-LM
r,   
prompt_idsrt   c                 *   U R                  U R                  U5      5      nUR                  U R                  U R                  S-  U R
                  U R                  5      nU R                  U5      nUR                  / SQ5      R                  S5      nU$ )Nr4   )r   r4   r   r	   )
r   r   r   r   r   rX   r_   rY   permutesplit)r8   r   prompts      r*   rF   MvpPrompt.forward  sw    ""4#8#8#DET//11DdnnVZVcVcdf%-33A6r,   )rY   r_   rX   r   r   r   r   )rJ   rK   rL   rM   rN   r7   r@   rP   r   rF   rQ   rR   rS   s   @r*   r   r     s0    3
%,, 53F  r,   r   c                   >    \ rS rSr% \\S'   SrSrS r\	S 5       r
Srg)	MvpPreTrainedModeli  r   modelTc                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Nr   )meanstd)r   init_stdr|   r   ra   rC   datanormal_r[   zero_r   padding_idx)r8   moduler   s      r*   _init_weights MvpPreTrainedModel._init_weights  s    kk""fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .r,   c                     U R                   R                  n[        R                  " / SQSSSSU//U R                  S9nUR                  U5      US.nU$ )N)r      
      r4   r         r4   r?   )ro   r   )r   r    r@   tensorr?   ne)r8   	pad_tokenr   dummy_inputss       r*   r  MvpPreTrainedModel.dummy_inputs  sW    KK,,	LL"2Q2q)4L!MVZVaVab	'll95"
 r,    N)rJ   rK   rL   rM   r   __annotations__base_model_prefixsupports_gradient_checkpointingr  propertyr  rQ   r  r,   r*   r   r     s-    &*#	?  r,   r   c                   "  ^  \ rS rSrSr SS\S\\R                     S\\	   4U 4S jjjr
       SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\\\4   4S jjrSrU =r$ )
MvpEncoderi  z
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MvpEncoderLayer`].

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
r   embed_tokens
use_promptc                 P  > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  nUR                  U l        UR                  U l	        UR                  (       a  [        R                  " U5      OSU l        Ub  X l        O0[        R                   " UR"                  X@R                  5      U l        [%        UR                  U5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        U5      PM     sn5      U l        [        R2                  " U5      U l        X0l        U(       a7  UR8                  U l        [;        UUR,                  UR<                  5      U l        SU l         U RC                  5         g s  snf )N      ?F)"r6   r7   rY   encoder_layerdrop	layerdropr   r    r  max_position_embeddingsmax_source_positionsscale_embeddingmathsqrtembed_scaler  r   r   
vocab_sizer.   embed_positions
ModuleListrangeencoder_layersr   r   r   layernorm_embeddingr  r   r   r   r   gradient_checkpointing	post_init)r8   r   r  r  rW   r   r9   s         r*   r7   MvpEncoder.__init__  sD    	 ~~11NN	!..$*$B$B!393I3I499Y/s# , "V->->	K[K[ \D<** 
 mmeFLaLaFb$cFb_V%<Fb$cd#%<<	#: $!'!5!5D$-%%..%D! ',# %ds   F#r   ro   	head_maskinputs_embedsrr   output_hidden_statesreturn_dictrt   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb$  UnUR
                  n	UR                  SU	S   5      nO.Ub   UR                  5       SS n	USS2SS2S4   nO[	        S5      eUc  U R                  U5      U R                  -  nU R                  U5      n
XJ-   nU R                  U5      n[        R                  R                  XR                  U R                  S9nU R                   (       aJ  ["        R$                  " U R&                  5      R)                  U R*                  5      nU R-                  U5      nUb  [/        X$R0                  5      nU(       a  SOSnU(       a  SOSnUb`  UR                  5       S   [3        U R4                  5      :w  a6  [	        S[3        U R4                  5       S	UR                  5       S    S
35      e[7        U R4                  5       H  u  nnU(       a  X4-   nSnU R                  (       a(  ["        R8                  " / 5      nUU R:                  :  a  SnU(       a  SnO-U" UUUb  UU   OSU R                   (       a  WU   OSUS9nUS   nU(       d  M  UUS   4-   nM     U(       a  X4-   nU(       d  [=        S XU4 5       5      $ [?        XUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzDYou cannot specify both input_ids and inputs_embeds at the same timer#   z5You have to specify either input_ids or inputs_embedsrx   r  r   z&The head_mask should be specified for  layers, but it is for .FT)NN)rp   r   rr   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r  .0vs     r*   	<genexpr>%MvpEncoder.forward.<locals>.<genexpr>  s     e$Sq$Ss   	last_hidden_staterm   
attentions) r   rr   r/  use_return_dictr'   r%   r   r{   r  r#  r%  r)  r   r   rY   rz   r  r@   rA   r   r   r?   r   r   r>   lenr   	enumeraterandr  r   r   )r8   r   ro   r-  r.  rr   r/  r0  inputinput_shape	embed_posrm   r   r   encoder_statesall_attentionsidxencoder_layerto_dropdropout_probabilitylayer_outputss                        r*   rF   MvpEncoder.forward8  s   \ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  ]%>cdd"E++K!r;r?;I&',,.s3K!!Q(+ETUU  --i84;K;KKM((/	%100?--m||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ %7H[H[\N30d  ~~"s4;;'78 <S=M<N O!(+,A/ 
 #,DKK"8C#!/2B!BG}}&+jjn#&7"G , -!"7@7LYs^RV?C&6s&;TX&7! !.a 0  !/=3C2E!E1 #94  +.>>Ne]N$Seee+Vd
 	
r,   )rY   r%  r#  r  r*  r  r)  r   r  r  r   r   r  NF)NNNNNNN)rJ   rK   rL   rM   rN   r   r   r   r   r   r7   r@   
LongTensorrP   r   r   r   r   rF   rQ   rR   rS   s   @r*   r  r    s     lq$$/7/E$ZbcgZh$ $P 1515,059,0/3&*@
E,,-@
 !.@
 ELL)	@

   1 12@
 $D>@
 'tn@
 d^@
 
uo%	&@
 @
r,   r  c                     ^  \ rS rSrSr SS\S\\R                     S\\	   4U 4S jjjr
             SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                     S\\	   S\\	   S\\	   S\\	   S\\R                     S\\\4   4S jjrSrU =r$ )
MvpDecoderi  z
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

Args:
    config: MvpConfig
    embed_tokens (nn.Embedding): output embedding
    use_prompt (bool): whether to use prompt
r   r  r  c           
        > [         TU ]  U5        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  (       a   [        R                  " UR                  5      OSU l        Ub  X l        O;[        R                   " UR"                  UR                  U R                  5      U l        [%        UR                  UR                  5      U l        [        R(                  " [+        UR,                  5       Vs/ s H  n[/        XS9PM     sn5      U l        [        R2                  " UR                  5      U l        X0l        U(       a]  UR8                  U l        [;        UUR,                  UR<                  5      U l        [;        UUR,                  UR<                  5      U l         SU l!        U RE                  5         g s  snf )Nr  )r\   F)#r6   r7   rY   decoder_layerdropr  r    r  r  max_target_positionsr   r!  r"  r   r#  r  r   r   r$  r.   r%  r&  r'  decoder_layersr   r   r   r)  r  r   r   r   r   r   r*  r+  )r8   r   r  r  ir9   s        r*   r7   MvpDecoder.__init__  su    	 ~~11!..$*$B$B!8>8N8N499V^^4TW# , "V->->PTP`P` aD<**NN 
 mmSXY_YnYnSo$pSoa_V%ISo$pq#%<<#? $!'!5!5D$-%%..%D!
 &/%%..&D" ',#' %qs   G$r   ro   r   r   r-  cross_attn_head_maskrh   r.  r   rr   r/  r0  rs   rt   c                 	   U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  Ub  [        S5      eUb$  UnUR                  nUR                  SUS   5      nO.Ub   UR                  5       SS nUSS2SS2S4   nO[        S5      eUc  U R                  U5      U R                  -  nU R                  (       a/  U R                  (       a  U	(       a  [        R                  S5        Sn	U	(       aG  UcD  Ub.  [        [!        U R                   S9[!        U R                   S95      O[!        U R                   S9nU	(       a@  [#        U[$        5      (       a+  [        R                  S5        [        R&                  " U5      nUb  UR)                  5       OS	n[+        X/UU5      nUb  Ub  [-        XHR.                  US   S
9nU R1                  UU5      nUU-   nU R3                  U5      n[4        R6                  R9                  UU R8                  U R                  S9nU R:                  (       a[  [<        R>                  " U R@                  5      RC                  U RD                  5      nU RG                  U5      nU RI                  U5      nU(       a  SOSnU
(       a  SOSnU
(       a  Ub  SOSn[K        XV/SS/5       Hn  u  nnUc  M  UR                  5       S	   [M        U RN                  5      :w  d  M7  [        SU S[M        U RN                  5       SUR                  5       S	    S35      e   [Q        U RN                  5       H  u  nnU(       a  UU4-  nU R                  (       a(  [<        RR                  " / 5      nUU RT                  :  a  ML  U" UUUUUb  UU   OSUb  UU   OSU R:                  (       a  WU   OSU R:                  (       a  WU   OSUU
U	US9nUS	   nU
(       d  M  UUS   4-  nUc  M  UUS   4-  nM     U(       a  UU4-  nU(       d  [%        S UUUUU4 5       5      $ [W        UUUUUS9$ )a  
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
        provide it.

        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
        [`PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
        cross-attention on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer#   zEYou have to specify either decoder_input_ids or decoder_inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   )r   rx   r  r-  rV  zThe `z` should be specified for r2  r3  )	r   rp   r   r   r   rh   rr   r   rs   r   r4   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r  r5  s     r*   r8  %MvpDecoder.forward.<locals>.<genexpr>  s      rA rs   	)r;  rh   rm   r<  cross_attentions),r   rr   r/  r   r=  r'   r%   r   r{   r  r#  r*  rz   loggerwarning_oncer   r   r|   r   from_legacy_cacheget_seq_lengthr   r   r>   r%  r)  r   r   rY   r  r@   rA   r   r   r?   r   r   zipr>  r   r?  r@  r  r   )r8   r   ro   r   r   r-  rV  rh   r.  r   rr   r/  r0  rs   rA  rB  r;   	positionsrm   r   r   r   all_hidden_statesall_self_attnsall_cross_attentions	attn_mask	mask_namerF  decoder_layerrI  rJ  s                                  r*   rF   MvpDecoder.forward  s   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E#//K!r;r?;I&',,.s3K!!Q(+Edee  --i84;K;KKM&&4==##p "	0 )4 $L$DlZ^ZeZeFfg!5 
 OU;;\
 2CCOTOETE`!?!?!Afg:8N

 !,1G1S%?&(;(;[QS_&"
 ((0FG	%	100?--mt||VZVcVc-d ??d&8&89<<T[[IJ#44Z@ $ 6 6z B #7BD0d&7<Q<]rdh %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#!m%55!}}&+jjn#&7)%'=3<3H3dI]Ii,@,Eos;???"23"7PT=A__#4S#9RV /"3#-M *!,M  =#3"55(4(]1-=,??(9 #9>  -!11 ':K^]qr  
 9+++%1
 	
r,   )r   rY   r%  r#  r  r*  r  r)  r   rR  r  r   r   r  rL  )NNNNNNNNNNNNN)rJ   rK   rL   rM   rN   r   r   r   r   r   r7   r@   rM  rP   r   r   r   r   r   rF   rQ   rR   rS   s   @r*   rO  rO    s}    lq&&/7/E&ZbcgZh& &T 1515=A=A,07;+/59$(,0/3&*15Q
E,,-Q
 !.Q
  ((9(9:	Q

 !))9)9 :Q
 ELL)Q
 'u||4Q
 "%Q
   1 12Q
 D>Q
 $D>Q
 'tnQ
 d^Q
 !.Q
 
u??	@Q
 Q
r,   rO  c            &         ^  \ rS rSrS/rSS/rS\4U 4S jjrS rS r	S	 r
S
 r\                SS\\R                     S\\R                      S\\R                     S\\R                     S\\R                      S\\R                      S\\R                      S\\\R$                        S\\   S\\R$                     S\\R$                     S\\   S\\   S\\   S\\   S\\R                      S\\\4   4"S jj5       rSrU =r$ )MvpModeli  final_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                 t  > [         TU ]  U5        UR                  UR                  p2UR                  U l        [
        R                  " X1R                  U5      U l        [        XR                  UR                  5      U l
        [        XR                  UR                  5      U l        U R                  5         g r   )r6   r7   r    r$  r  r   r   r   sharedr  encoderrO  decoderr+  )r8   r   r  r$  r9   s       r*   r7   MvpModel.__init__  s     "("5"5v7H7HZ ++ll:~~{K!&++v7H7HI!&++v7H7HI 	r,   c                     U R                   $ r   )rn  r8   s    r*   get_input_embeddingsMvpModel.get_input_embeddings  s    {{r,   c                 |    Xl         U R                   U R                  l        U R                   U R                  l        g r   )rn  ro  r  rp  r8   values     r*   set_input_embeddingsMvpModel.set_input_embeddings  s'    $(KK!$(KK!r,   c                     U R                   $ r   )ro  rs  s    r*   get_encoderMvpModel.get_encoder  s    ||r,   c                 4   U R                   (       d   S5       eU R                  S5        U R                  R                  R                  S5        U R                  R                  R                  S5        U R                  R
                  R                  S5        g )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r  requires_grad_ro  r   rp  r   rs  s    r*   set_lightweight_tuningMvpModel.set_lightweight_tuning  sj    j jjE"%%44T:%%44T:&&55d;r,   r   ro   decoder_input_idsdecoder_attention_maskr-  decoder_head_maskrV  encoder_outputsrh   r.  decoder_inputs_embedsr   rr   r/  r0  rs   rt   c                 T   UcE  UcB  Uc  [        S5      e[        XR                  R                  U R                  R                  5      nUb  UOU R                  R
                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUc  U R                  UUUU
UUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nU R                  UUUS   UUUU	UUUUUUS9nU(       d  UU-   $ [        UR                  UR                   UR"                  UR$                  UR&                  UR                  UR"                  UR$                  S	9$ )
az  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   ro   r-  r.  rr   r/  r0  r   r   r4   r:  )r   ro   r   r   r-  rV  rh   r.  r   rr   r/  r0  rs   )r;  rh   decoder_hidden_statesdecoder_attentionsrZ  encoder_last_hidden_stater   encoder_attentions)r'   r+   r   r    r!   rr   r/  r   r=  ro  r|   r   r>  rp  r   r;  rh   rm   r<  rZ  )r8   r   ro   r  r  r-  r  rV  r  rh   r.  r  r   rr   r/  r0  rs   decoder_outputss                     r*   rF   MvpModel.forward  s   f $)>)F  U  !3;;33T[[5W5W! 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]""ll#-#+"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1'!5+//!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r,   )rp  ro  rn  r  NNNNNNNNNNNNNNNN)rJ   rK   rL   rM   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r7   rt  ry  r|  r  r   r   r@   rM  rP   listr   r   r   r   r   r   rF   rQ   rR   rS   s   @r*   ri  ri    s   *=)>&79VWy 0
<  15158<=A,0487;=A+/59=A$(,0/3&*15#t
E,,-t
 !.t
 $E$4$45	t

 !))9)9 :t
 ELL)t
 $ELL1t
 'u||4t
 "$u'8'8"9:t
 "%t
   1 12t
  ((9(9:t
 D>t
 $D>t
 'tnt
  d^!t
" !.#t
$ 
u((	)%t
 t
r,   ri  ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )custom_introc            (         ^  \ rS rSr/ SQrS\4U 4S jjrS rS r S#S\	S	\
\	   S
\S\R                  4U 4S jjjrS\	SS4S jrS r\                 S$S\
\R&                     S\
\R(                     S\
\R&                     S\
\R&                     S\
\R(                     S\
\R(                     S\
\R(                     S\
\\R,                        S\
\   S\
\R,                     S\
\R,                     S\
\R&                     S\
\   S\
\   S\
\   S\
\   S\
\R(                     S\\\4   4$S  jj5       rS\R(                  4S! jrS"rU =r$ )%MvpForConditionalGenerationi^  )rk  rl  lm_head.weightr   c                 v  > [         TU ]  U5        [        U5      U l        U R	                  S[
        R                  " SU R                  R                  R                  45      5        [        R                  " UR                  U R                  R                  R                  SS9U l        U R                  5         g )Nrj  r   Fr^   )r6   r7   ri  r   register_bufferr@   r   rn  r0   r   ra   r   lm_headr+  r   s     r*   r7   $MvpForConditionalGeneration.__init__f  s     f%
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^ 	r,   c                 6    U R                   R                  5       $ r   )r   r|  rs  s    r*   r|  'MvpForConditionalGeneration.get_encodero      zz%%''r,   c                 6    U R                   R                  5       $ r   )r   get_decoderrs  s    r*   r  'MvpForConditionalGeneration.get_decoderr  r  r,   Nnew_num_tokenspad_to_multiple_ofmean_resizingrt   c                 J   > [         TU ]  XU5      nU R                  U5        U$ r   )r6   resize_token_embeddings_resize_final_logits_bias)r8   r  r  r  new_embeddingsr9   s        r*   r  3MvpForConditionalGeneration.resize_token_embeddingsu  s+     8]jk&&~6r,   c                 ,   U R                   R                  S   nX::  a  U R                   S S 2S U24   nON[        R                  " SX-
  4U R                   R                  S9n[        R
                  " U R                   U/SS9nU R                  SU5        g )Nr#   r   r  rv   rj  )rj  r%   r@   r   r?   r   r  )r8   r  old_num_tokensnew_bias
extra_biass        r*   r  5MvpForConditionalGeneration._resize_final_logits_bias|  s    //55b9+--a..@AHa)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r,   c                 n    U R                   R                  5         U R                  R                  S5        g rL  r   r  r  r  rs  s    r*   r  2MvpForConditionalGeneration.set_lightweight_tuning  $    

))+##E*r,   r   ro   r  r  r-  r  rV  r  rh   r.  r  labelsr   rr   r/  r0  rs   c                    Ub  UOU R                   R                  nUbX  U(       a  [        R                  S5        SnUc7  Uc4  [	        XR                   R
                  U R                   R                  5      nU R                  UUUUUUUUU	U
UUUUUUS9nU R                  US   5      U R                  -   nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  S9	$ )	aE  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example of summarization:

Fine-tuning a model
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
...     return_tensors="pt",
... )
>>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     generated_ids = model.generate(**inputs)

>>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)ro   r  r  r  r-  r  rV  rh   r.  r  r   rr   r/  r0  rs   r   r#   r   	losslogitsrh   r  r  rZ  r  r   r  )r   r=  r[  warningr+   r    r!   r   r  rj  r   r   r$  r   rh   r  r  rZ  r  r   r  )r8   r   ro   r  r  r-  r  rV  r  rh   r.  r  r  r   rr   r/  r0  rs   r   	lm_logitsmasked_lm_lossloss_fctoutputs                          r*   rF   #MvpForConditionalGeneration.forward  s   d &1%<k$++B]B]klI (-B-J$6KK44dkk6X6X%! **)/+#9/!5+'"7/!5#)!  
$ LL,t/E/EE	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   c                 j    [        XR                  R                  U R                  R                  5      $ r   )r+   r   r    r!   )r8   r  s     r*   %prepare_decoder_input_ids_from_labelsAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labels  s#    !&++*B*BDKKDfDfggr,   r  r   )NT)NNNNNNNNNNNNNNNNN)rJ   rK   rL   rM   r  r   r7   r|  r  rO   r   r   r   r   r  r  r  r   r@   rM  rP   r  r   r   r   r   r   rF   r  rQ   rR   rS   s   @r*   r  r  ^  s8    jy (( dh!7?}\`	 < < <+  15158<=A,0487;=A+/59=A-1$(,0/3&*15%C
E,,-C
 !.C
 $E$4$45	C

 !))9)9 :C
 ELL)C
 $ELL1C
 'u||4C
 "$u'8'8"9:C
 "%C
   1 12C
  ((9(9:C
 ))*C
 D>C
 $D>C
  'tn!C
" d^#C
$ !.%C
& 
uo%	&'C
 C
JhELL h hr,   r  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $         ^  \ rS rSrSS/rS\4U 4S jjrS r\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\4   4 S jj5       rSrU =r$ )MvpForSequenceClassificationi  rk  rl  r   c                    > [         TU ]  " U40 UD6  [        U5      U l        [	        UR
                  UR
                  UR                  UR                  5      U l        U R                  5         g r   )
r6   r7   ri  r   r   r   
num_labelsclassifier_dropoutclassification_headr+  )r8   r   kwargsr9   s      r*   r7   %MvpForSequenceClassification.__init__  sZ    *6*f%
#8NNNN%%	$
  	r,   c                 n    U R                   R                  5         U R                  R                  S5        g rL  )r   r  r  r  rs  s    r*   r  3MvpForSequenceClassification.set_lightweight_tuning)  s&    

))+  //6r,   r   ro   r  r  r-  r  rV  r  r.  r  r  r   rr   r/  r0  rt   c                    Ub  UOU R                   R                  nUb  SnUc%  U	b"  [        SU R                  R                   35      eU R                  UUUUUUUUU	U
UUUUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      5      S:  a  [        S5      eUUSS24   R                  UR!                  S5      SUR!                  S5      5      SS2SSS24   nU R#                  U5      nSnUGb  U R                   R$                  c  U R                   R&                  S:X  a  S	U R                   l        OyU R                   R&                  S:  aN  UR(                  [        R*                  :X  d  UR(                  [        R,                  :X  a  S
U R                   l        OSU R                   l        U R                   R$                  S	:X  aT  [/        5       nU R                   R&                  S:X  a&  U" UR1                  5       UR1                  5       5      nOU" UU5      nOU R                   R$                  S
:X  aG  [3        5       nU" UR                  SU R                   R&                  5      UR                  S5      5      nO-U R                   R$                  S:X  a  [5        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  UR>                  UR@                  URB                  URD                  S9	$ )a
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Example of single-label classification:

Fine-tuning a model on `num_labels` classes
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForSequenceClassification

>>> num_labels = 2  # for example, this is a binary classification task
>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

>>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
>>> labels = torch.tensor(1)  # the real label for inputs

>>> loss = model(**inputs, labels=labels).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> predicted_class_id = logits.argmax()
```
NFz8Passing input embeddings is currently not supported for ro   r  r  r-  r  rV  r  r.  r  r   rr   r/  r0  r   r   z7All examples must have the same number of <eos> tokens.r#   
regressionsingle_label_classificationmulti_label_classificationr  )#r   r=  NotImplementedErrorr9   rJ   r   eqeos_token_idr   r?   r>  r@   unique_consecutivesumr'   r   r{   r  problem_typer  r>   rB   rO   r   squeezer   r   r   rh   r  r  rZ  r  r   r  )r8   r   ro   r  r  r-  r  rV  r  r.  r  r  r   rr   r/  r0  r   rm   eos_masksentence_representationr  r  r  r  s                           r*   rF   $MvpForSequenceClassification.forward-  s   Z &1%<k$++B]B]I!:%J4>>KbKbJcd  **)/#9/!5+'"7/!5#  
   
<< 8 89<<]=Q=QRu''Q89A=VWW"/!"<"A"A-BTBTUVBWY[]j]o]opr]s"tr1H#
 ))*AB{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   )r  r   )NNNNNNNNNNNNNNN)rJ   rK   rL   rM   r  r   r7   r  r   r   r@   rM  rP   r  r   r   r   r   r   rF   rQ   rR   rS   s   @r*   r  r    s    89VWy 7  15158<=A,0487;=A59=A-1$(,0/3&*!T
E,,-T
 !.T
 $E$4$45	T

 !))9)9 :T
 ELL)T
 $ELL1T
 'u||4T
 "$u'8'8"9:T
   1 12T
  ((9(9:T
 ))*T
 D>T
 $D>T
 'tnT
  d^!T
" 
u55	6#T
 T
r,   r  c            &         ^  \ rS rSrSS/rU 4S jrS r\                SS\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\	R                     S\\	R                     S\\\	R                        S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S\\\4   4"S jj5       rSrU =r$ )MvpForQuestionAnsweringi  rk  rl  c                    > [         TU ]  U5        SUl        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r3   )
r6   r7   r  ri  r   r   ra   hidden_size
qa_outputsr+  r   s     r*   r7    MvpForQuestionAnswering.__init__  s[      ++f%
))F$6$68I8IJ 	r,   c                 n    U R                   R                  5         U R                  R                  S5        g rL  )r   r  r  r  rs  s    r*   r  .MvpForQuestionAnswering.set_lightweight_tuning  s$    

))+&&u-r,   r   ro   r  r  r-  r  rV  r  start_positionsend_positionsr.  r  r   rr   r/  r0  rt   c                    Ub  UOU R                   R                  nU	b  U
b  SnU R                  UUUUUUUUUUUUUUS9nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnU	b  U
b  [        U	R                  5       5      S:  a  U	R                  S5      n	[        U
R                  5       5      S:  a  U
R                  S5      n
UR                  S5      nU	R                  SU5      n	U
R                  SU5      n
[        US9nU" UU	5      nU" UU
5      nUU-   S	-  nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  UR                  UR                  UR                   UR"                  UR$                  S
9
$ )a
  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    For translation and summarization training, `decoder_input_ids` should be provided. If no
    `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
    for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

Fine-tuning a model for extrative question answering, and our model also supports generative question answering
using `BartForConditionalGeneration`
```python
>>> import torch
>>> from transformers import AutoTokenizer, MvpForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

>>> inputs = tokenizer(
...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
...     return_tensors="pt",
... )
>>> target_start_index = torch.tensor([18])
>>> target_end_index = torch.tensor([19])

>>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
>>> loss.backward()
```

Inference after the model fine-tuned
```python
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> answer_start_index = outputs.start_logits.argmax()
>>> answer_end_index = outputs.end_logits.argmax()

>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
>>> predict_answer = tokenizer.decode(predict_answer_tokens)
```
NFr  r   r   r#   rv   )ignore_indexr4   )
r  start_logits
end_logitsrh   r  r  rZ  r  r   r  )r   r=  r   r  r   r  
contiguousr>  r{   r   r   r   rh   r  r  rZ  r  r   r  )r8   r   ro   r  r  r-  r  rV  r  r  r  r.  r  r   rr   r/  r0  r   sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                               r*   rF   MvpForQuestionAnswering.forward  s   f &1%<k$++B]B]&=+DI**)/#9/!5+'"7/!5#  
" "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J F 0:/EZMF*Q6Q2%!#33")"?"?&99$55&-&G&G")"?"?&99
 	
r,   )r   r  r  r  )rJ   rK   rL   rM   r  r7   r  r   r   r@   rP   rM  r  r   r   r   r   r   rF   rQ   rR   rS   s   @r*   r  r    s   79VW
.  -1158<=A,0487;=A6:4859=A$(,0/3&*#Q
ELL)Q
 !.Q
 $E$4$45	Q

 !))9)9 :Q
 ELL)Q
 $ELL1Q
 'u||4Q
 "$u'8'8"9:Q
 "%"2"23Q
   0 01Q
   1 12Q
  ((9(9:Q
 D>Q
 $D>Q
  'tn!Q
" d^#Q
$ 
u99	:%Q
 Q
r,   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )MvpDecoderWrapperio  z
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
c                 D   > [         TU ]  U5        [        U5      U l        g r   )r6   r7   rO  rp  r   s     r*   r7   MvpDecoderWrapper.__init__u  s     !&)r,   c                 &    U R                   " U0 UD6$ r   rp  )r8   argsr  s      r*   rF   MvpDecoderWrapper.forwardy  s    ||T,V,,r,   r  )	rJ   rK   rL   rM   rN   r7   rF   rQ   rR   rS   s   @r*   r  r  o  s    
*- -r,   r  c            "         ^  \ rS rSrS/rU 4S jrS rS rS rS r	S r
\              SS	\\R                     S
\\R                     S\\R                      S\\R                      S\\R                     S\\R                     S\\   S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )MvpForCausalLMi}  r  c                    > SUl         SUl        [        TU ]  U5        [	        U5      U l        [        R                  " UR                  UR                  SS9U l
        U R                  5         g )NTFr^   )rZ   is_encoder_decoderr6   r7   r  r   r   ra   r  r$  r  r+  r   s     r*   r7   MvpForCausalLM.__init__  sX     $)! &v.
yy!3!3V5F5FUS 	r,   c                 B    U R                   R                  R                  $ r   r   rp  r  rs  s    r*   rt  #MvpForCausalLM.get_input_embeddings  s    zz!!...r,   c                 8    XR                   R                  l        g r   r  rw  s     r*   ry  #MvpForCausalLM.set_input_embeddings  s    */

'r,   c                 $    XR                   l        g r   r   rp  )r8   rp  s     r*   set_decoderMvpForCausalLM.set_decoder  s    $

r,   c                 .    U R                   R                  $ r   r  rs  s    r*   r  MvpForCausalLM.get_decoder  s    zz!!!r,   c                 n    U R                   R                  5         U R                  R                  S5        g rL  r  rs  s    r*   r  %MvpForCausalLM.set_lightweight_tuning  r  r,   r   ro   r   r   r-  rV  rh   r.  r  r   rr   r/  r0  rs   rt   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUS9nU R                  US   5      nSnU	bF  [        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a9  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MvpForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
>>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits
>>> list(logits.shape)
[1, 8, 50267]
```N)r   ro   r   r   r-  rV  rh   r.  r   rr   r/  r0  r   r#   r   )r  r  rh   rm   r<  rZ  )r   rr   r/  r=  r   rp  r  r   r   r$  r   rh   rm   r<  rZ  )r8   r   ro   r   r   r-  rV  rh   r.  r  r   rr   r/  r0  rs   r   r  r  r  r  s                       r*   rF   MvpForCausalLM.forward  sF   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 gaj)')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r,   r  )NNNNNNNNNNNNNN)rJ   rK   rL   rM   r  r7   rt  ry  r  r  r  r   r   r@   rM  rP   r   r   r   r   r   r   rF   rQ   rR   rS   s   @r*   r  r  }  s   *+	/0%"+  1515=A>B,07;+/59-1$(,0/3&*15T
E,,-T
 !.T
  ((9(9:	T

 !)):): ;T
 ELL)T
 'u||4T
 "%T
   1 12T
 ))*T
 D>T
 $D>T
 'tnT
 d^T
 !.T
  
u77	8!T
 T
r,   r  )r  r  r  r  ri  r   )ArN   r!  typingr   r   r@   r   torch.nnr   r   r   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mvpr   
get_loggerrJ   r[  rP   rO   r+   r   r.   ModulerU   r   r   r   r   r   r  rO  ri  r  r  r  r  r  __all__r  r,   r*   <module>r     s     "   A A ! C C ) :   . , 0 ( 
		H	%%,, c [^ ";BLL ;6^2299 ^2B@+0 @+Fs0 snBII 0		 2   6q
# q
hC
# C
L Y
! Y
 Y
x 
mh"4o mh
mh` i
#5 i
i
X e
0 e
 e
R-* -s
' s
lr,   