
    cCib                     N   S r SSKrSSKJrJr  SSKrSSKJr  SSKJrJ	r	J
r
Jr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJrJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%  \!RL                  " \'5      r(S)S jr) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\5      r-\  " S S\5      5       r.\  " S S\.5      5       r/\ " SS9 " S S \.\5      5       r0\ " S!S9 " S" S#\.5      5       r1\  " S$ S%\.5      5       r2\  " S& S'\.5      5       r3/ S(Qr4g)*zPyTorch MPT model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )CacheDynamicCache)GenerationMixin)!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)PreTrainedModel)auto_docstringlogging)deprecate_kwarg   )	MptConfigc                 H   [         R                  " SU-
  S[         R                  US9R                  SSSU5      nS[        R
                  " [        R                  " U 5      5      -  n[         R                  " SUS-   [         R                  US9R                  5       nXbU-  -  nS[         R                  " SU5      -  nUR                  SUSS5      nXP:w  a7  [         R                  " USS2SSS2S4   USS2SSS2S4   /SS9SS2SU 2S4   nXG-  nUR                  S5      $ )	a  
Link to paper: https://huggingface.co/papers/2108.12409 - Alibi tensor is not causal as the original paper mentions, it
relies on a translation invariance of softmax for quick implementation. This implementation has been copied from
the alibi implementation of MPT source code that led to slightly different results than the Bloom alibi:
https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L292
r   )dtypedevice         ?N.dimr   )torcharangeint32viewmathceillog2int64floatpowconcatsqueeze)	num_headssequence_lengthalibi_bias_maxr   alibinum_heads_power_of_2baseslopess           ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mpt/modeling_mpt.pybuild_mpt_alibi_tensorr7   -   s    LL_,au{{6RWWXY[\^_apqE		$))I*> ??<</!35;;vV\\^D$889D599Q%%F[[0!Q7F(vaAsl3VAssCK5HIqQRSU_V_U_adRdeNE==    c                      ^  \ rS rSrSrSS\S\\   4U 4S jjjr\	" SSSS	9   SS
\
R                  S\
R                  S\\   S\\
R                     S\\
R                     4
S jj5       rSrU =r$ )MptAttentionD   zrMulti-head self attention.
Using torch or triton attention implementation enables user to also use additive bias.
config	layer_idxc                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        UR                  R                  U l        U R                  c5  S[        R                  " U R                  U R                  -  5      -  U l        UR                  R                  U l        UR                  R                  U l        [        R                  " U R                  SU R                  -  SS9U l        [        R                  " U R                  U R                  SS9U l        X l        g )Nr   r   Fbias)super__init__hidden_sizen_headsmax_seq_lenmax_seq_lengthhead_dimattn_configsoftmax_scaler'   sqrt
attn_pdropattn_dropout_pclip_qkvr   LinearWqkvout_projr=   )selfr<   r=   	__class__s      r6   rB   MptAttention.__init__I   s   !--~~$00((DLL8#//==%!"TYYt/?/?$,,/N%O!OD$00;;**33IId..D4D4D0D5Q			$"2"2D4D4D5Q"r8   past_key_valuepast_key_values4.58new_nameversionhidden_statesposition_biasattention_maskcache_positionc                 \   UR                   S S u  pgU R                  U5      nU R                  (       a%  UR                  U R                  * U R                  S9nUR	                  SSS9u  pnU	R                  XgU R                  U R                  5      R                  SS5      n	U
R                  XgU R                  U R                  5      R                  SS5      n
UR                  XgU R                  U R                  5      R                  SS5      nUb#  SU0nUR                  XU R                  U5      u  p[        R                  " XR                  SS5      5      U R                  -  nUc  UOXsR                  5       -   nUb  [        UR                   5      S:w  a!  [!        S	[        UR                   5       35      eU
R                   S   n[#        S
UR%                  S5      U-
  5      n[#        S
UR%                  S5      U-
  5      nUS S 2US 2US 24   nX-   nUb:  UR'                  U[        R(                  " U	R*                  5      R,                  5      n[.        R0                  R3                  UR5                  5       SS9R7                  UR*                  5      n[.        R0                  R9                  UU R:                  U R<                  S9n[        R                  " UU5      nUR?                  S
SSS5      RA                  5       RC                  XgS5      nU RE                  U5      nUU4$ )Nr   )minmaxr   r!   r   r]   z6Expecting position_bias shape to be 3 dimensions, got r   ptraining)#shaperO   rM   clampchunkreshaperD   rG   	transposeupdater=   r#   matmulrI   get_seq_lengthlen
ValueErrorr`   sizemasked_fillfinfor   r_   r   r
   softmaxr+   todropoutrL   re   permute
contiguousr&   rP   )rQ   rZ   r[   rU   r\   r]   
batch_size
seq_length	mixed_qkvquery_states
key_statesvalue_statescache_kwargsattention_scoresquery_length
key_lengthposition_bias_query_indexposition_bias_key_indexattn_weightscontext_statesattn_outputs                        r6   forwardMptAttention.forwardY   s    "/!4!4Ra!8
IIm,	==!T]]NNI1:1J.,#++JDLLRVR_R_`jjklnop''
dmm\ffghjkl
#++JDLLRVR_R_`jjklnop&,n=L'6'='=jX\XfXfht'u$J <<6J6J2r6RSVZVhVhh%4%<z*OmOmOoBo$=&&'1, #YZ]^k^q^qZrYs!tuu#))"-J(+A}/A/A!/D|/S(T%&)!]-?-?-BZ-O&P#)!-F-GI`Ia*abM/?%/;;NEKKXdXjXjLkLoLop }},,-=-C-C-E2,NQQR^RdRde}},,\T=P=P[_[h[h,ilLA'//1a;FFHMMjfhimmN3L((r8   )
rO   rL   rM   rG   rC   r=   rF   rD   rP   rI   N)NNN)__name__
__module____qualname____firstlineno____doc__r   r   intrB   r   r#   Tensorr   r   __static_attributes____classcell__rR   s   @r6   r:   r:   D   s    #y #Xc] # #  %0A6R
 ,015151)||1) ||1) "%	1)
 !.1) !.1) S1)r8   r:   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jrSr	U =r
$ )	MptMLP   r<   c                   > [         TU ]  5         UR                  n[        R                  " USU-  SS9U l        [        R                  " SS9U l        [        R                  " SU-  USS9U l        UR                  R                  U l        g )N   Fr?   none)approximate)rA   rB   rC   r   rN   up_projGELUact	down_projrH   rK   hidden_dropout)rQ   r<   rC   rR   s      r6   rB   MptMLP.__init__   sm    ((yya+oEJ77v.1{?KeL$00;;r8   rZ   residualreturnc                     U R                  U R                  U5      5      nU R                  U5      n[        R                  " X0R
                  U R                  S9nXB-   nU$ )Nrc   )r   r   r   Fru   r   re   )rQ   rZ   r   intermediate_outputoutputs        r6   r   MptMLP.forward   sS    m!<="nn];.2E2EPTP]P]^"r8   )r   r   r   r   )r   r   r   r   r   rB   r#   r   r   r   r   r   s   @r6   r   r      s:    <y <U\\ U\\ ell  r8   r   c                      ^  \ rS rSrSS\S\\   4U 4S jjjr    SS\R                  S\R                  S\R                  S\\
   S	\S
\S\\R                     4S jjrSrU =r$ )MptBlock   r<   r=   c                   > [         TU ]  5         UR                  n[        X1R                  S9U l        S U R
                  l        UR                  U l        [        X5      U l
        [        X1R                  S9U l        S U R                  l        [        U5      U l        UR                  R                  U l        ["        R$                  " U R                   5      U l        g )Neps)rA   rB   rC   r   layer_norm_epsilonnorm_1r@   rD   r/   r:   attnnorm_2r   ffnrH   rK   dropout_rater   Dropoutresid_attn_dropout)rQ   r<   r=   rC   rR   s       r6   rB   MptBlock.__init__   s    ((1J1JK 3	1J1JK&>"..99"$**T->->"?r8   rZ   r[   r\   
layer_past	use_cacheoutput_attentionsr]   c                     U R                  U5      nUn	U R                  UUUUUS9u  pU R                  U
5      U	-   nU R                  U5      nUn	U R	                  X5      nX4$ )N)r[   r\   rU   r]   )r   r   r   r   r   )rQ   rZ   r[   r\   r   r   r   r]   layernorm_outputr   attn_outputsr   r   s                r6   r   MptBlock.forward   s      ;;}5  &*YY')&) &/ &
" //=H;;}5 ! *5##r8   )r   r   r   r   r   r/   r   r   )NFFN)r   r   r   r   r   r   r   rB   r#   r   r   boolr   r   r   r   s   @r6   r   r      s    @y @Xc] @ @2 '+"'15"$||"$ ||"$ 	"$
 UO"$ "$  "$ !."$ "$r8   r   c            	         ^  \ rS rSr% \\S'   SrSrS/rS/r	U 4S jr
S\R                  4S	 jr\\" S
SSS9S\\\R$                  \R$                  4      S\\\R$                  \R$                  4      4S j5       5       rSrU =r$ )MptPreTrainedModel   r<   transformerTr   z
lm_head.*.c                 &   > [         TU ]  " U0 UD6  g r   )rA   rB   )rQ   inputskwargsrR   s      r6   rB   MptPreTrainedModel.__init__   s    &+F+r8   modulec                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        5      (       aW  UR                  b$  UR                  R                  R                  5         UR                  R                  R                  S5        gg)zInitialize the weights.g        )meanstdNr    )
isinstancer   rN   weightdatanormal_r<   initializer_ranger@   zero_	Embeddingpadding_idxr   fill_)rQ   r   s     r6   _init_weights MptPreTrainedModel._init_weights   s   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .	**{{&  &&(MM$$S) +r8   rT   rU   rV   rW   r   c                 j   ^^^ U S   S   R                   u  pmmX-  m[        UUU4S jU  5       5      $ )zg
Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
r   c              3   |   >#    U  H1  nUS    R                  TTT5      US   R                  TTT5      4v   M3     g7f)r   r   N)ri   ).0r   batch_size_times_num_headsrG   ry   s     r6   	<genexpr>;MptPreTrainedModel._convert_to_mpt_cache.<locals>.<genexpr>  sL      

 .
 1%%&@(JW1%%&@*hW .s   9<)rf   tuple)rU   rx   r/   r   rG   ry   s      @@@r6   _convert_to_mpt_cache(MptPreTrainedModel._convert_to_mpt_cache   sI     7Fa6H6K6Q6Q3
x%/%;"  

 .
 
 	
r8    )r   r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_keys_to_ignore_on_load_missingrB   r   Moduler   staticmethodr   r   r#   r   r   r   r   r   s   @r6   r   r      s    %&*##'4o#,*BII *" %0A6R
uU\\5<<%?@A
	uU\\5<</0	1
 S 
r8   r   c                   b  ^  \ rS rSrS\4U 4S jjrS rSS jrS\R                  4S jr
\         SS\\R                     S	\\   S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                  S4   \4   4S jj5       rSrU =r$ )MptModeli  r<   c           
        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        [        U R                  UR                  S9U l        S U R                   l        SU l        U R'                  5         g s  snf )N)r=   r   F)rA   rB   rC   rD   r/   r   r   
vocab_sizewte
ModuleListrangen_layersr   blocksr   r   norm_fr@   gradient_checkpointing	post_init)rQ   r<   irR   s      r6   rB   MptModel.__init__  s     !-- << 1 143C3CD mmERXRaRaLb$cLbqXf%BLb$cd   0 0f6O6OP&+# 	 %ds   
C5c                     U R                   $ r   r   )rQ   s    r6   get_input_embeddingsMptModel.get_input_embeddings&  s    xxr8   c                     [        XX45      $ r   )r7   )rQ   r/   r0   r1   r   s        r6   r7   MptModel.build_mpt_alibi_tensor)  s    %i.YYr8   new_embeddingsc                     Xl         g r   r   rQ   r   s     r6   set_input_embeddingsMptModel.set_input_embeddings,  s    !r8   	input_idsrU   r\   inputs_embedsr   r   output_hidden_statesreturn_dictr]   r   .c
                 r   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [        S5      eUb  UR                  u  pOUb  UR                  u  pnO[        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc  [        U R                   S9nU(       a@  [        U[        5      (       a+  [        R                  S5        [        R                  " U5      nUnU(       a  SOSnU(       a  SOSnUb  UR!                  5       OS	nUU-   nUc"  ["        R$                  " UU4UR&                  S
9nOUR)                  UR&                  5      nU R+                  U R,                  U R                   R.                  UR&                  S
9n[1        X;U4UU5      nUR3                  5       nU R4                   H3  nU(       a  UU4-   nU" UUUUUUU	S9nUS	   nU(       d  M*  UUS   4-   nM5     U R7                  U5      nU(       a  UU4-   nU(       d  [        S XUU4 5       5      $ [9        UUUUS9$ )j  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
NzDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r<   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `DynamicCache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.r   r   r   )r   r\   r   r   r[   r]   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   r   )r   vs     r6   r   #MptModel.forward.<locals>.<genexpr>  s      cacs   	)last_hidden_staterU   rZ   
attentions)r<   r   r  r   use_return_dictro   rf   r   re   loggerwarning_oncer   r   r   r   from_legacy_cacherm   r#   onesr   rt   r7   r/   rE   r   r   r   r   r   )rQ   r   rU   r\   r  r   r   r  r  r]   r   rx   ry   _rZ   all_self_attentionsall_hidden_statespast_key_values_lengthseq_length_with_pastr2   causal_maskblockoutputss                          r6   r   MptModel.forward/  s   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%JATUU&&4==##p "	  HHY/M0*$++>OOU;;U
 +<<_MO%$5b4"6BD FUE`!?!?!Afg),BB!"ZZ5I(JS`SgSghN+..}/C/CDN++DNNDKK<S<S\i\p\p+q74mE[
 "&&([[E#$58H$H!**#"3#-G $AJM  &9WQZM&I#! !& M2 1]4D D )<MObc   9+++*	
 	
r8   )r   r   rC   r   r/   r      N	NNNNNNNNN)r   r   r   r   r   rB   r   r7   r#   r   r   r   r   
LongTensorr   r   r   r   r   r   r   r   r   s   @r6   r   r     s   y ,Z"5<< "  15+/1548$(,0/3&*15t
E,,-t
 "%t
 !.	t

   0 01t
 D>t
 $D>t
 'tnt
 d^t
 !.t
 
uU\\3&')RR	St
 t
r8   r   z
    The MPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                   t  ^  \ rS rSrS/rS\4U 4S jjrS\R                  4S jr	\
          SS\\R                     S\\   S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                     \4   4S jj5       rSrU =r$ )MptForCausalLMi  zlm_head.weightr<   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g NFr?   )
rA   rB   r   r   r   rN   rC   r   lm_headr   rQ   r<   rR   s     r6   rB   MptForCausalLM.__init__  sI     #F+yy!3!3V5F5FUS 	r8   r   c                     Xl         g r   )r#  r   s     r6   set_output_embeddings$MptForCausalLM.set_output_embeddings  s    %r8   r   rU   r\   r  labelsr   r   r  r  r]   r   c                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUbE  UR	                  UR
                  5      nU R                  " UU4SU R                   R                  0UD6nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
N)rU   r\   r  r   r   r  r  r]   r   r   r   losslogitsrU   rZ   r  )r<   r  r   r#  rt   r   loss_functionr   r   rU   rZ   r  )rQ   r   rU   r\   r  r)  r   r   r  r  r]   r   transformer_outputsrZ   	lm_logitsr,  r   s                    r6   r   MptForCausalLM.forward  s   @ &1%<k$++B]B]"..+)'/!5#) / 

 ,A.LL/	YYy//0F%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r8   )r#  r   )
NNNNNNNNNN)r   r   r   r   _tied_weights_keysr   rB   r#   r   r'  r   r   r  r   r   r   r   r   r   r   r   r   s   @r6   r   r     s.    ++y &ELL &  15+/1504)-$(,0/3&*15F
E,,-F
 "%F
 !.	F

  -F
 &F
 D>F
 $D>F
 'tnF
 d^F
 !.F
 
uU\\"$EE	FF
 F
r8   r   a  
    The MPT Model transformer with a sequence classification head on top (linear layer).

    [`MptForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   ,  ^  \ rS rSrS\4U 4S jjr\         SS\\R                     S\\
   S\\R                     S\\R                     S\\R                     S	\\   S
\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )MptForSequenceClassificationi  r<   c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r"  )
rA   rB   
num_labelsr   r   r   rN   rC   scorer   r$  s     r6   rB   %MptForSequenceClassification.__init__  sV      ++#F+YYv1163D3D5Q
 	r8   r   rU   r\   r  r)  r   r   r  r  r   c
                    U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9n
U
S   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGbg  U R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOeU" UU5      nO[U R                   R"                  S:X  a  [1        5       nU" UU5      nO-U R                   R"                  S:X  a  [3        5       nU" UU5      nU	(       d  U4U
SS -   nUb  U4U-   $ U$ [5        UUU
R6                  U
R8                  U
R:                  S9$ )  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values.get_seq_length()`
    (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NrU   r\   r  r   r   r  r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.ra   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  
regressionsingle_label_classificationmulti_label_classificationr+  )r<   r  r   r7  rf   pad_token_idro   rt   r   r#   r%   r$   argmaxr  r  rR   r   problem_typer6  r   longr   r	   r.   r   r   r   rU   rZ   r  )rQ   r   rU   r\   r  r)  r   r   r  r  r/  rZ   r-  rx   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr,  loss_fctr   s                        r6   r   $MptForSequenceClassification.forward  s   < &1%<k$++B]B]"..+)'/!5# / 	
 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r8   )r6  r7  r   r  )r   r   r   r   r   rB   r   r   r#   r  r   r   r   r   r   r   r   r   r   r   s   @r6   r4  r4    s    y   15+/1504)-$(,0/3&*d
E,,-d
 "%d
 !.	d

  -d
 &d
 D>d
 $D>d
 'tnd
 d^d
 
uU\\"$DD	Ed
 d
r8   r4  c                   ,  ^  \ rS rSrS\4U 4S jjr\         SS\\R                     S\\
   S\\R                     S\\R                     S\\R                     S	\\   S
\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )MptForTokenClassificationi  r<   c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [        US5      (       a  UR                  b  UR                  nO-[        US5      (       a  UR                  b  UR                  nOSn[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )Nclassifier_dropoutr   g?)rA   rB   r6  r   r   hasattrrL  r   r   r   ru   rN   rC   
classifierr   )rQ   r<   rL  rR   s      r6   rB   "MptForTokenClassification.__init__  s      ++#F+6/00V5N5N5Z!'!:!:V-..63H3H3T!'!6!6!$zz"45))F$6$68I8IJ 	r8   r   rU   r\   r  r)  r   r   r  r  r   c
                 
   U	b  U	OU R                   R                  n	U R                  UUUUUUUU	S9nUS   nU R                  U5      nU R	                  U5      nSnUbl  UR                  UR                  5      nUR                  u  nn[        5       nU" UR                  UU-  U R                  5      UR                  UU-  5      5      nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )r:  Nr;  r   r   )r,  r-  rZ   r  )r<   r  r   ru   rN  rt   r   rf   r   r&   r6  r   rZ   r  )rQ   r   rU   r\   r  r)  r   r   r  r  deprecated_argumentsr/  rZ   r-  r,  rx   ry   rG  r   s                      r6   r   !MptForTokenClassification.forward  s+   > &1%<k$++B]B]"..+)'/!5# / 	
 ,A.]3/YYv}}-F%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r8   )rN  ru   r6  r   r  )r   r   r   r   r   rB   r   r   r#   r  r   r   r   r   r   r   r   r   r   r   s   @r6   rJ  rJ    s    y "  15+/1504)-$(,0/3&*B
E,,-B
 "%B
 !.	B

  -B
 &B
 D>B
 $D>B
 'tnB
 d^B
 
uU\\"$99	:B
 B
r8   rJ  c                     ^  \ rS rSrU 4S jr\        SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\
   S	\\
   S
\\
   S\\\4   4S jj5       rSrU =r$ )MptForQuestionAnsweringi  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  S5      U l        U R                  5         g )Nr   )	rA   rB   r   r   r   rN   rC   
qa_outputsr   r$  s     r6   rB    MptForQuestionAnswering.__init__  sA     #F+))F$6$6: 	r8   r   r\   r  start_positionsend_positionsr   r  r  r   c	           	         Ub  UOU R                   R                  nU R                  UUUUUUS9n	U	S   n
U R                  U
5      nUR	                  SSS9u  pUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" X5      nUU-   S-  nU(       d  X4U	SS -   nUb  U4U-   $ U$ [        UUUU	R                  U	R                  S	9$ )
r  N)r\   r  r   r  r  r   r   ra   r!   )ignore_indexr   )r,  start_logits
end_logitsrZ   r  )r<   r  r   rV  splitr.   rw   rn   rp   rg   r   r   rZ   r  )rQ   r   r\   r  rX  rY  r   r  r  r  sequence_outputr-  r\  r]  
total_lossignored_indexrG  
start_lossend_lossr   s                       r6   r   MptForQuestionAnswering.forward  s   2 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: #++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
:H$x/14J"/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r8   )rV  r   )NNNNNNNN)r   r   r   r   rB   r   r   r#   r  FloatTensorr   r   r   r   r   r   r   r   s   @r6   rT  rT    s      156:596:48,0/3&*E
E,,-E
 !!2!23E
   1 12	E

 "%"2"23E
   0 01E
 $D>E
 'tnE
 d^E
 
u22	3E
 E
r8   rT  )r   r   r   r4  rJ  rT  r  )5r   r'   typingr   r   r#   r   torch.nnr   r   r   r	   r
   r   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   utils.deprecationr   configuration_mptr   
get_loggerr   r  r7   r   r:   r   r   r   r   r   r4  rJ  rT  __all__r   r8   r6   <module>rs     sd     "   L L $ . ) I 9  . , 0 ( 
		H	%.G)299 G)TRYY *7$) 7$t -
 -
 -
` U
! U
 U
p U
' U
U
p o
#5 o
o
d U
 2 U
 U
p O
0 O
 O
dr8   