
    bCio                      S r SSKJr  SSKrSSKJr  SSKrSSKr	SSK
Jr  SSKJrJrJrJrJr  SSKJrJrJrJrJrJrJrJrJr  SS	KJrJrJr  SS
K J!r!J"r"J#r#J$r$  SSK%J&r&  \$RN                  " \(5      r)Sr*Sr+ " S S\RX                  RZ                  5      r. " S S\RX                  RZ                  5      r/ " S S\RX                  RZ                  5      r0 " S S\RX                  RZ                  5      r1 " S S\RX                  RZ                  5      r2 " S S\RX                  RZ                  5      r3 " S S\RX                  RZ                  5      r4 " S S\RX                  RZ                  5      r5 " S S \RX                  RZ                  5      r6 " S! S"\RX                  RZ                  5      r7S# r8S$ r9S% r:S& r;S' r< " S( S)\RX                  RZ                  5      r= " S* S+\RX                  RZ                  5      r> " S, S-\RX                  RZ                  5      r? " S. S/\RX                  RZ                  5      r@ " S0 S1\RX                  RZ                  5      rA " S2 S3\RX                  RZ                  5      rB " S4 S5\5      rCS6rDS7rE\"" S8\D5       " S9 S:\C5      5       rF\"" S;\D5       " S< S=\C\5      5       rG\"" S>\D5       " S? S@\C\5      5       rH\"" SA\D5       " SB SC\C\5      5       rI\"" SD\D5       " SE SF\C\5      5       rJ/ SGQrKg)HzTF 2.0 DeBERTa model.    )annotationsN)Sequence   )get_tf_activation)TFBaseModelOutputTFMaskedLMOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)	TFMaskedLanguageModelingLossTFModelInputTypeTFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFTokenClassificationLossget_initializerkerasunpack_inputs)check_embeddings_within_bounds
shape_liststable_softmax)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardlogging   )DebertaConfigr   zkamalkraj/deberta-basec                  X   ^  \ rS rSrSU 4S jjrSS	S jjr\S
S j5       rSS jrSr	U =r
$ )TFDebertaContextPooler8   c                   > [         TU ]  " S0 UD6  [        R                  R	                  UR
                  SS9U l        [        UR                  SS9U l	        Xl
        g )Ndensenamedropout )super__init__r   layersDensepooler_hidden_sizer"   TFDebertaStableDropoutpooler_dropoutr%   configselfr.   kwargs	__class__s      i/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/deberta/modeling_tf_deberta.pyr(   TFDebertaContextPooler.__init__9   sM    "6"\\''(A(A'P
-f.C.C)T    c                    US S 2S4   nU R                  X2S9nU R                  U5      n[        U R                  R                  5      " U5      nU$ )Nr   training)r%   r"   r   r.   pooler_hidden_act)r0   hidden_statesr8   context_tokenpooled_outputs        r3   callTFDebertaContextPooler.call?   sO     &ad+]F

=1)$++*G*GHWr5   c                .    U R                   R                  $ N)r.   hidden_sizer0   s    r3   
output_dim!TFDebertaContextPooler.output_dimH   s    {{&&&r5   c                   U R                   (       a  g SU l         [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       Nl= f! , (       d  f       g = f)NTr"   r%   )
builtgetattrtf
name_scoper"   r$   buildr.   r+   r%   r0   input_shapes     r3   rJ   TFDebertaContextPooler.buildL   s    ::
4$'3tzz/

  $dkk.L.L!MN 04D)5t||001""4( 21 6 0/ 21s   3C+C<+
C9<
D
)rF   r.   r"   r%   r.   r   Fr8   bool)returnintr@   )__name__
__module____qualname____firstlineno__r(   r=   propertyrC   rJ   __static_attributes____classcell__r2   s   @r3   r   r   8   s+     ' '	) 	)r5   r   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )TFDebertaXSoftmaxX   a&  
Masked Softmax which is optimized for saving memory

Args:
    input (`tf.Tensor`): The input tensor that will apply softmax.
    mask (`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
    dim (int): The dimension that will apply softmax
c                2   > [         TU ]  " S0 UD6  Xl        g Nr&   )r'   r(   axis)r0   ra   r1   r2   s      r3   r(   TFDebertaXSoftmax.__init__b   s    "6"	r5   c                   [         R                  " [         R                  " U[         R                  5      5      n[         R                  " U[         R                  " [        S5      U R                  S9U5      n[        [         R                  " U[         R                  S9U R                  5      n[         R                  " USU5      nU$ )Nz-infdtype        )
rH   logical_notcastrQ   wherefloatcompute_dtyper   float32ra   )r0   inputsmaskrmaskoutputs        r3   r=   TFDebertaXSoftmax.callf   s}    rwwtRWW56%vd>P>P!QSYZbjj A499M%f-r5   ra   ))rm   	tf.Tensorrn   rt   )	rT   rU   rV   rW   __doc__r(   r=   rY   rZ   r[   s   @r3   r]   r]   X   s     r5   r]   c                  ^   ^  \ rS rSrSrU 4S jr\R                  S 5       rSSS jjr	Sr
U =r$ )	r,   n   zo
Optimized dropout module for stabilizing the training

Args:
    drop_prob (float): the dropout probabilities
c                2   > [         TU ]  " S0 UD6  Xl        g r`   )r'   r(   	drop_prob)r0   ry   r1   r2   s      r3   r(   TFDebertaStableDropout.__init__v   s    "6""r5   c                   ^ ^^ [         R                  " S[         R                  R                  R                  R                  ST R                  -
  S9R                  [        U5      S9-
  [         R                  5      m[         R                  " SST R                  -
  -  T R                  S9mT R                  S:  a8  [         R                  " T[         R                  " ST R                  S9U5      T-  nUUU 4S jnX4$ )	zn
Applies dropout to the inputs, as vanilla dropout, but also scales the remaining elements up by 1/drop_prob.
r   g      ?)probs)sample_shaperd   r   rf   c                   > TR                   S:  a8  [        R                  " T[        R                  " STR                  S9U 5      T-  $ U $ )Nr   rf   rd   )ry   rH   ri   rh   rk   )upstreamrn   scaler0   s    r3   grad-TFDebertaStableDropout.xdropout.<locals>.grad   s>    ~~!xxbggc9K9K&LhWZ___r5   )rH   rh   compatv1distributions	Bernoulliry   sampler   rQ   convert_to_tensorrk   ri   )r0   rm   r   rn   r   s   `  @@r3   xdropoutTFDebertaStableDropout.xdropoutz   s    
 wwiill((22t~~9M2NUUcmntcuUvwGG

 $$SA,>%?tGYGYZ>>AXXdBGGCt7I7I$JFSV[[F	  |r5   c                6    U(       a  U R                  U5      $ U$ r@   )r   )r0   rm   r8   s      r3   r=   TFDebertaStableDropout.call   s    ==((r5   )ry   rO   )rm   rt   r8   rt   )rT   rU   rV   rW   ru   r(   rH   custom_gradientr   r=   rY   rZ   r[   s   @r3   r,   r,   n   s1    #  * r5   r,   c                  F   ^  \ rS rSrSrSU 4S jjrU 4S jrSS jrSrU =r	$ )	TFDebertaLayerNorm   zBLayerNorm module in the TF style (epsilon inside the square root).c                >   > [         TU ]  " S0 UD6  Xl        X l        g r`   )r'   r(   sizeeps)r0   r   r   r1   r2   s       r3   r(   TFDebertaLayerNorm.__init__   s    "6"	r5   c                   > U R                  U R                  /[        R                  " 5       SS9U l        U R                  U R                  /[        R
                  " 5       SS9U l        [        TU ]!  U5      $ )Nweight)shapeinitializerr$   bias)	
add_weightr   rH   ones_initializergammazeros_initializerbetar'   rJ   )r0   rL   r2   s     r3   rJ   TFDebertaLayerNorm.build   s^    __DII;BDWDWDY`h_i
OO499+2CWCWCY`fOg	w}[))r5   c                "   [         R                  " US/SS9n[         R                  " [         R                  " X-
  5      S/SS9n[         R                  R	                  X0R
                  -   5      nU R                  X-
  -  U-  U R                  -   $ )Nrs   T)ra   keepdims)rH   reduce_meansquaremathsqrtr   r   r   )r0   xmeanvariancestds        r3   r=   TFDebertaLayerNorm.call   sm    ~~ardT:>>"))AH"5RD4Pggll8hh./zzQX&,tyy88r5   )r   r   r   r   )g-q=)r   rt   rR   rt   
rT   rU   rV   rW   ru   r(   rJ   r=   rY   rZ   r[   s   @r3   r   r      s    L
*
9 9r5   r   c                  D   ^  \ rS rSrSU 4S jjrSSS jjrS	S jrSrU =r$ )
TFDebertaSelfOutput   c                  > [         TU ]  " S0 UD6  [        R                  R	                  UR
                  SS9U l        [        R                  R                  UR                  SS9U l	        [        UR                  SS9U l        Xl        g )Nr"   r#   	LayerNormepsilonr$   r%   r&   )r'   r(   r   r)   r*   rA   r"   LayerNormalizationlayer_norm_epsr   r,   hidden_dropout_probr%   r.   r/   s      r3   r(   TFDebertaSelfOutput.__init__   so    "6"\\''(:(:'I
88AVAV]h8i-f.H.HyYr5   c                l    U R                  U5      nU R                  XS9nU R                  X-   5      nU$ )Nr7   r"   r%   r   r0   r:   input_tensorr8   s       r3   r=   TFDebertaSelfOutput.call   s7    

=1]F}'CDr5   c                "   U R                   (       a  g SU l         [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       N= f! , (       d  f       N}= f! , (       d  f       g = fNTr"   r   r%   )rF   rG   rH   rI   r"   r$   rJ   r.   rA   r   r%   rK   s     r3   rJ   TFDebertaSelfOutput.build   s
   ::
4$'3tzz/

  $dkk.E.E!FG 04d+7t~~223$$dD$++2I2I%JK 44D)5t||001""4( 21 6 0/ 43 21$   3E3E/8F 
E,/
E= 
Fr   rF   r.   r"   r%   rN   rO   rP   r@   	rT   rU   rV   rW   r(   r=   rJ   rY   rZ   r[   s   @r3   r   r      s    ) )r5   r   c                  l   ^  \ rS rSrSU 4S jjr     S               SS jjrS	S jrSrU =r$ )
TFDebertaAttention   c                n   > [         TU ]  " S0 UD6  [        USS9U l        [	        USS9U l        Xl        g )Nr0   r#   rp   r&   )r'   r(   "TFDebertaDisentangledSelfAttentionr0   r   dense_outputr.   r/   s      r3   r(   TFDebertaAttention.__init__   s5    "6"6vFK	/XFr5   c           
     r    U R                  UUUUUUUS9nUc  UnU R                  US   X7S9n	U	4USS  -   n
U
$ )Nr:   attention_maskquery_statesrelative_posrel_embeddingsoutput_attentionsr8   r   r:   r   r8   r   )r0   r   )r0   r   r   r   r   r   r   r8   self_outputsattention_outputrp   s              r3   r=   TFDebertaAttention.call   ss     yy&)%%)/ ! 
 'L,,&q/ - 
 #$|AB'77r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       Nl= f! , (       d  f       g = f)NTr0   r   )rF   rG   rH   rI   r0   r$   rJ   r   rK   s     r3   rJ   TFDebertaAttention.build   s    ::
4&2tyy~~.		% /4.:t00556!!''- 76 ; /. 76   C.C%
C"%
C3)rF   r.   r   r0   rN   NNNFF)r   rt   r   rt   r   tf.Tensor | Noner   r   r   r   r   rQ   r8   rQ   rR   tuple[tf.Tensor]r@   r   r[   s   @r3   r   r      sv     *.)-+/"' " '	
 ' )    
:	. 	.r5   r   c                  @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	TFDebertaIntermediate   c                J  > [         TU ]  " S0 UD6  [        R                  R	                  UR
                  [        UR                  5      SS9U l        [        UR                  [        5      (       a  [        UR                  5      U l        OUR                  U l        Xl        g )Nr"   unitskernel_initializerr$   r&   )r'   r(   r   r)   r*   intermediate_sizer   initializer_ranger"   
isinstance
hidden_actstrr   intermediate_act_fnr.   r/   s      r3   r(   TFDebertaIntermediate.__init__   s    "6"\\''**vOgOg?hov ( 

 f''--'89J9J'KD$'-'8'8D$r5   c                F    U R                  US9nU R                  U5      nU$ Nrm   )r"   r   r0   r:   s     r3   r=   TFDebertaIntermediate.call  s(    

-
800?r5   c                @   U R                   (       a  g SU l         [        U SS 5      bf  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        g g ! , (       d  f       g = f)NTr"   )	rF   rG   rH   rI   r"   r$   rJ   r.   rA   rK   s     r3   rJ   TFDebertaIntermediate.build  sm    ::
4$'3tzz/

  $dkk.E.E!FG 0/ 4//s   3B
B)rF   r.   r"   r   rN   r:   rt   rR   rt   r@   r   r[   s   @r3   r   r      s    H Hr5   r   c                  D   ^  \ rS rSrSU 4S jjrSSS jjrS	S jrSrU =r$ )
TFDebertaOutputi  c                @  > [         TU ]  " S0 UD6  [        R                  R	                  UR
                  [        UR                  5      SS9U l        [        R                  R                  UR                  SS9U l        [        UR                  SS9U l        Xl        g )Nr"   r   r   r   r%   r#   r&   )r'   r(   r   r)   r*   rA   r   r   r"   r   r   r   r,   r   r%   r.   r/   s      r3   r(   TFDebertaOutput.__init__  s    "6"\\''$$IaIa9bip ( 

 88AVAV]h8i-f.H.HyYr5   c                h    U R                  US9nU R                  XS9nU R                  X-   5      nU$ )Nr   r7   r   r   s       r3   r=   TFDebertaOutput.call  s9    

-
8]F}'CDr5   c                "   U R                   (       a  g SU l         [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       N= f! , (       d  f       N}= f! , (       d  f       g = fr   )rF   rG   rH   rI   r"   r$   rJ   r.   r   r   rA   r%   rK   s     r3   rJ   TFDebertaOutput.build&  s
   ::
4$'3tzz/

  $dkk.K.K!LM 04d+7t~~223$$dD$++2I2I%JK 44D)5t||001""4( 21 6 0/ 43 21r   r   rN   rO   )r:   rt   r   rt   r8   rQ   rR   rt   r@   r   r[   s   @r3   r   r     s    ) )r5   r   c                  l   ^  \ rS rSrSU 4S jjr     S               SS jjrS	S jrSrU =r$ )
TFDebertaLayeri5  c                   > [         TU ]  " S0 UD6  [        USS9U l        [	        USS9U l        [        USS9U l        g )N	attentionr#   intermediaterp   r&   )r'   r(   r   r   r   r   r   bert_outputr/   s      r3   r(   TFDebertaLayer.__init__6  s?    "6"+FE1&~N*6Ar5   c           
         U R                  UUUUUUUS9nUS   n	U R                  U	S9n
U R                  XUS9nU4USS  -   nU$ )N)r   r   r   r   r   r   r8   r   r:   r   r   )r   r   r   )r0   r:   r   r   r   r   r   r8   attention_outputsr   intermediate_outputlayer_outputoutputss                r3   r=   TFDebertaLayer.call=  s     !NN&)%%)/ + 
 -Q/"//>N/O''-W_ ( 
  /$5ab$99r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       N= f! , (       d  f       N}= f! , (       d  f       g = f)NTr   r   r   )	rF   rG   rH   rI   r   r$   rJ   r   r   rK   s     r3   rJ   TFDebertaLayer.buildY  s    ::
4d+7t~~223$$T* 44.:t00556!!''- 74-9t//445  &&t, 65 : 43 76 65s$   D0.E
E0
D>
E
E )r   r   rF   r   rN   r   r:   rt   r   rt   r   r   r   r   r   r   r   rQ   r8   rQ   rR   r   r@   r   r[   s   @r3   r   r   5  sw    B *.)-+/"'  " '	
 ' )    
8- -r5   r   c                     ^  \ rS rSrS	U 4S jjrS
S jrS rS rSS jr      S                 SS jjr	Sr
U =r$ )TFDebertaEncoderih  c                X  > [         TU ]  " S0 UD6  [        UR                  5       Vs/ s H  n[	        USU 3S9PM     snU l        [        USS5      U l        Xl        U R                  (       a5  [        USS5      U l	        U R                  S:  a  UR                  U l	        g g g s  snf )	Nzlayer_._r#   relative_attentionFmax_relative_positionsrs   r   r&   )r'   r(   rangenum_hidden_layersr   layerrG   r  r.   r  max_position_embeddings)r0   r.   r1   ir2   s       r3   r(   TFDebertaEncoder.__init__i  s    "6"KPQWQiQiKjkKjanVHQC.AKjk
")&2F"N""*1&:RTV*WD'**Q..4.L.L+ / # ls   B'c                   U R                   (       a  g SU l         U R                  (       aV  U R                  SU R                  S-  U R                  R
                  /[        U R                  R                  5      S9U l        [        U SS 5      bN  U R                   H=  n[        R                  " UR                  5         UR                  S 5        S S S 5        M?     g g ! , (       d  f       MR  = f)NTzrel_embeddings.weight   r$   r   r   r  )rF   r  r   r  r.   rA   r   r   r   rG   r  rH   rI   r$   rJ   )r0   rL   r  s      r3   rJ   TFDebertaEncoder.buildt  s    ::
"""&//,22Q68O8OP+DKK,I,IJ #2 #D
 4$'3]]5::.KK% /. $ 4..s   >C
C-	c                H    U R                   (       a  U R                  nU$ S nU$ r@   )r  r   )r0   r   s     r3   get_rel_embedding"TFDebertaEncoder.get_rel_embedding  s*    040G0G,, NRr5   c                   [        [        U5      5      S::  a  [        R                  " [        R                  " US5      S5      nU[        R                  " [        R                  " US5      S5      -  n[        R
                  " U[        R                  5      nU$ [        [        U5      5      S:X  a  [        R                  " US5      nU$ )Nr  r   rs   r   )lenr   rH   expand_dimssqueezerh   uint8)r0   r   extended_attention_masks      r3   get_attention_mask#TFDebertaEncoder.get_attention_mask  s    z.)*a/&(nnR^^NTU5VXY&Z#4r~~bjjQhjlFmoq7rrNWW^RXX>N  N+,1^^NA>Nr5   c                    U R                   (       a:  Uc7  Ub  [        U5      S   O[        U5      S   n[        U[        U5      S   5      nU$ )Nr  )r  r   build_relative_position)r0   r:   r   r   qs        r3   get_rel_posTFDebertaEncoder.get_rel_pos  sO    ""|';0<0H
<(,jYfNghjNkA21j6OPR6STLr5   c	                L   U(       a  SOS n	U(       a  SOS n
U R                  U5      nU R                  XU5      n[        U[        5      (       a  US   nOUnU R	                  5       n[        U R                  5       Hu  u  pU(       a  X4-   n	U" UUUUUUUS9nUS   nUb=  Un[        U[        5      (       a%  US-   [        U R                  5      :  a  XS-      OS nOUnU(       d  Mm  XS   4-   n
Mw     U(       a  X4-   n	U(       d  [        S XU
4 5       5      $ [        XU
S9$ )Nr&   r   r   r   c              3  .   #    U  H  oc  M  Uv   M     g 7fr@   r&   ).0vs     r3   	<genexpr>(TFDebertaEncoder.call.<locals>.<genexpr>  s     h$Vq$Vs   	last_hidden_stater:   
attentions)
r"  r'  r   r   r  	enumerater  r  tupler   )r0   r:   r   r   r   r   output_hidden_statesreturn_dictr8   all_hidden_statesall_attentionsnext_kvr   r  layer_modulelayer_outputss                   r3   r=   TFDebertaEncoder.call  sE    #7BD0d00@''\RmX..#A&G#G//1(4OA#$58H$H!(%-))-"3!M *!,M',mX6667!ec$**o6MmE2SWG'  !/3C2E!E/  54   14D Dh]~$Vhhh +Yg
 	
r5   )rF   r.   r  r  r   r  rN   r@   )NN)NNFFTF)r:   rt   r   rt   r   r   r   r   r   rQ   r4  rQ   r5  rQ   r8   rQ   rR   $TFBaseModelOutput | tuple[tf.Tensor])rT   rU   rV   rW   r(   rJ   r  r"  r'  r=   rY   rZ   r[   s   @r3   r
  r
  h  s    	M& *.)-"'%* :
 :
 ":
 '	:

 ':
  :
 #:
 :
 :
 
.:
 :
r5   r
  c                   [         R                  " U [         R                  S9n[         R                  " U[         R                  S9nUSS2S4   [         R                  " [         R                  " USS/5      U S/5      -
  nUSU 2SS24   n[         R
                  " USS9n[         R                  " U[         R                  5      $ )a  
Build relative position according to the query and key

We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
\(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
P_k\)

Args:
    query_size (int): the length of query
    key_size (int): the length of key

Return:
    `tf.Tensor`: A tensor with shape [1, query_size, key_size]

rd   Nr   rs   r   rr   )rH   r  int32tilereshaper  rh   int64)
query_sizekey_sizeq_idsk_idsrel_pos_idss        r3   r%  r%    s      HHZrxx0EHHXRXX.E4.2772::eaW+E
TU#WWKkzk1n-K..15K77;))r5   c                    [        U5      S   [        U5      S   [        U5      S   [        U5      S   /n[        R                  " X5      $ )Nr   r   r  rs   r   rH   broadcast_to)c2p_posquery_layerr   shapess       r3   c2p_dynamic_expandrM    sN    ;";";"< $	F ??7++r5   c                    [        U5      S   [        U5      S   [        U5      S   [        U5      S   /n[        R                  " X5      $ )Nr   r   r  rH  )rJ  rK  	key_layerrL  s       r3   p2c_dynamic_expandrP    sN    ;";"9b!9b!	F ??7++r5   c                    [        U5      S S [        U 5      S   [        U5      S   /-   n[        R                  " X5      $ )Nr  r  rH  )	pos_indexp2c_attrO  rL  s       r3   pos_dynamic_expandrT     sA     !$
9(=b(A:iCXY[C\']]F??9--r5   c                z   US:  a  [         R                  " U 5      U-   nU[         R                  " U 5      S-
  :w  a  [         R                  " U 5      S-
  U-
  n[         R                  " [         R                  " [         R                  " U 5      5      USS9n[         R                  " XS9n [         R                  " XS9nOSn[         R
                  " U S[         R                  " U 5      S   45      n[         R
                  " US[         R                  " U5      S   45      n[         R                  " XVSS9n[         R
                  " U[         R                  " U5      5      nUS:w  aS  [         R                  " [         R                  " [         R                  " U 5      5      U* SS9n[         R                  " XtS9nU$ )Nr   r   rr   permrs   )
batch_dims)rH   rankrollr  	transposer@  r   gather)r   indicesgather_axispre_rollpermutationflat_xflat_indicesgathereds           r3   torch_gatherrd    s5   Qggaj;.bggaj1n$771:>K/ggbhhrwwqz2H1ELL-,,w9ZZBB01F::gBHHW,=b,A'BCLyy!<Hzz(BHHW$56H1}ggbhhrwwqz2XIAF<<;Or5   c                     ^  \ rS rSrSrS	U 4S jjrS
S jrSS jr     S               SS jjrS r	Sr
U =r$ )r   i  z
Disentangled self-attention module

Parameters:
    config (`str`):
        A model config class instance with the configuration to build a new model. The schema is similar to
        *BertConfig*, for more details, please refer [`DebertaConfig`]

c                  > [         TU ]  " S0 UD6  UR                  UR                  -  S:w  a&  [	        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  R                  U R                  S-  [        UR                  5      SSS9U l        UR                  b  UR                  O/ U l        [        US	S5      U l        [        US
S5      U l        U R"                  (       a  [        R                  R                  U R                  [        UR                  5      SSS9U l        [        R                  R                  U R                  [        UR                  5      SSS9U l        [)        SS9U l        U R                   (       a  [        USS5      U l        U R,                  S:  a  UR.                  U l        [1        UR2                  SS9U l        SU R                  ;   aB  [        R                  R                  U R                  [        UR                  5      SSS9U l        SU R                  ;   aA  [        R                  R                  U R                  [        UR                  5      SS9U l        [1        UR:                  SS9U l        Xl        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   in_projFr   r$   use_biasr  talking_headhead_logits_projhead_weights_projrs   rr   r  r   pos_dropoutr#   c2ppos_projp2c
pos_q_proj)r   r$   r%   r&   ) r'   r(   rA   num_attention_heads
ValueErrorrS   attention_head_sizeall_head_sizer   r)   r*   r   r   rh  pos_att_typerG   r  rk  rl  rm  r]   softmaxr  r  r,   r   rn  rp  rr  attention_probs_dropout_probr%   r.   r/   s      r3   r(   +TFDebertaDisentangledSelfAttention.__init__(  s   "6" : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PP||))".v/G/GH	 * 
 4:3F3F3RF//XZ")&2F"N#FNEB$)LL$6$6((#263K3K#L'	 %7 %D! &+\\%7%7((#263K3K#L(	 &8 &D" )b1""*1&:RTV*WD'**Q..4.L.L+5f6P6PWdeD))) % 2 2&&'6v7O7O'P#"	 !3 ! )))"',,"4"4&&?6KcKc;dkw #5 # .f.Q.QXabr5   c                   U R                   (       a  g SU l         U R                  SU R                  [        R                  R                  5       S9U l        U R                  SU R                  [        R                  R                  5       S9U l        [        U SS 5      be  [        R                  " U R                  R                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      bN  [        R                  " U R                   R                  5         U R                   R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R"                  R                  5         U R"                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R$                  R                  5         U R$                  R                  S 5        S S S 5        [        U S	S 5      bN  [        R                  " U R&                  R                  5         U R&                  R                  S 5        S S S 5        [        U S
S 5      bc  [        R                  " U R(                  R                  5         U R(                  R                  U R                  R                  /5        S S S 5        [        U SS 5      bd  [        R                  " U R*                  R                  5         U R*                  R                  U R                  R                  /5        S S S 5        g g ! , (       d  f       GNc= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN;= f! , (       d  f       N= f! , (       d  f       g = f)NTq_biasr  v_biasrh  r%   rl  rm  rn  rp  rr  )rF   r   rv  r   initializersZerosr|  r}  rG   rH   rI   rh  r$   rJ   r.   rA   r%   rl  rm  rn  rp  rr  rK   s     r3   rJ   (TFDebertaDisentangledSelfAttention.builda  s   ::
oo$"4"45CUCUC[C[C] & 
 oo$"4"45CUCUC[C[C] & 
 4D)5t||001""D$0G0G#HI 24D)5t||001""4( 24+T2>t4499:%%++D1 ;4,d3?t55::;&&,,T2 <4-9t//445  &&t, 64T*6t}}112##T[[%<%<$=> 34t,8t334%%t{{'>'>&?@ 54 9# 21 21 ;: <; 65 32 54sT   
3M=M+M=5NN!-1N31O
M(+
M:=
N
N!
N03
O
Oc                    [        U5      S S U R                  S/-   n[        R                  " XS9n[        R                  " U/ SQS9$ )Nrs   tensorr   r   r  r   r   rV  )r   rs  rH   r@  r[  )r0   r  r   s      r3   transpose_for_scores7TFDebertaDisentangledSelfAttention.transpose_for_scores  sD    6"3B'4+C+CR*HH67 ||F66r5   c           	        Uc;  U R                  U5      n[        R                  " U R                  U5      SSS9u  pnGOS n[        R                  " [        R                  " U R                   R
                  S   5      U R                  S-  SS9n[        R                  " U R                  SS9n[        R                  " S5       H  n[        R                  " U R                  U R                  S9n[        R                  " U R                  5       H  nUR                  UUUS-  U-      5      nM      UR                  UUR                  5       5      nM     S/S-  nU" US   US   U5      nU" US   US   U5      nU" US	   US	   U5      nU R                  U5      n	U R                  U5      n
U R                  U5      nXR                  U R                  SSSS24   5      -   n	XR                  U R                  SSSS24   5      -   nSnS[        U R                  5      -   n[         R"                  " [%        U	5      S   U-  5      nU	U-  n	[        R&                  " U	[        R                  " U
/ S
Q5      5      nU R(                  (       a"  U R+                  XWS9nU R-                  XXEU5      nUb  UU-   nU R.                  (       a?  [        R                  " U R1                  [        R                  " U/ SQ5      5      / SQ5      nU R3                  UU5      nU R5                  UUS9nU R.                  (       a?  [        R                  " U R7                  [        R                  " U/ SQ5      5      / SQ5      n[        R&                  " UU5      n[        R                  " U/ SQ5      n[%        U5      nUSS US   US   -  /-   n[        R8                  " UU5      nU(       a  UU4nU$ U4nU$ )a  
Call the module

Args:
    hidden_states (`tf.Tensor`):
        Input states to the module usually the output from previous layer, it will be the Q,K and V in
        *Attention(Q,K,V)*

    attention_mask (`tf.Tensor`):
        An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
        sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
        th token.

    return_att (`bool`, *optional*):
        Whether return the attention matrix.

    query_states (`tf.Tensor`, *optional*):
        The *Q* state in *Attention(Q,K,V)*.

    relative_pos (`tf.Tensor`):
        The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
        values ranging in [*-max_relative_positions*, *max_relative_positions*].

    rel_embeddings (`tf.Tensor`):
        The embedding of relative distances. It's a tensor of shape [\(2 \times
        \text{max_relative_positions}\), *hidden_size*].


Nr   rs   )num_or_size_splitsra   c                h    [         R                  " X SS9nUb  U[         R                  " U5      -  nU$ )NT)transpose_b)rH   matmulr[  )wbr   outs       r3   linear7TFDebertaDisentangledSelfAttention.call.<locals>.linear  s.    ii$7=2<<?*C
r5   r   )re   r   r   r  r   r   r   r  r7   )r   r  r   r   )r   r   r   r  r  r  )rh  rH   splitr  r[  r   rs  TensorArrayre   r  writeconcatr|  r}  r  rw  r   r   r   r  r  rn  disentangled_att_biasrk  rl  rx  r%   rm  r@  )r0   r:   r   r   r   r   r   r8   qprK  rO  value_layerr  wsqkvwkqkvw_insider  qkvbr&  r,  rel_attscale_factorr   attention_scoresattention_probscontext_layercontext_layer_shapenew_context_layer_shaper  s                                 r3   r=   'TFDebertaDisentangledSelfAttention.call  s   N m,B24(())"-!"3/KK
 T\\0034IaIadeIelmB >>

;DXXa[ nn4::DD\D\]$":":;A"-"3"3Ar!a%!)}"EK <zz![%7%7%9:	 !
 6A:DtAwQ6AtAwQ7AtAwQ7A33A6K11!4I33A6K!$=$=dkk$PTVW->X$YY!$=$=dkk$PTVW->X$YY3t0011		*[1"5DE!E)99[",,y,2WX""!--n-PN00gstG/'9!||%%bll3C\&RSUa  ,,'7H,,,J ll&&r||O\'RSUaO 		/;?]LA(7
 #6cr":>QRT>UXklnXo>o=p"p

=2IJ6G=/2 O\M]r5   c           
     T   Uc&  [        U5      S   n[        U[        U5      S   5      n[        U5      n[        U5      S:X  a-  [        R                  " [        R                  " US5      S5      nOM[        U5      S:X  a  [        R                  " US5      nO&[        U5      S:w  a  [        S[        U5       35      e[        R                  " [        R                  " [        R                  " [        U5      S   [        U5      S   5      U R                  5      [        R                  5      n[        R                  " X@R                  U-
  U R                  U-   2S S 24   S5      nSn	SU R                  ;   a  U R                  U5      n
U R                  U
5      n
[        R                  " U[        R                  " U
/ S	Q5      5      n[        R                   " X8-   SUS-  S-
  5      n[#        U[%        XU5      S
5      nX-  n	SU R                  ;   Ga  U R'                  U5      nU R                  U5      nU[        R(                  R+                  [        R                  " [        U5      S
   U-  U R,                  S95      -  n[        U5      S   [        U5      S   :w  a%  [        [        U5      S   [        U5      S   5      nOUn[        R                   " U* U-   SUS-  S-
  5      n[        R                  " U[        R                  " U/ S	Q5      5      n[        R                  " [#        U[/        XU5      S
5      / S	Q5      n[        U5      S   [        U5      S   :w  a<  [        R                  " US S 2S S 2S S 2S4   S
5      n[#        U[1        UUU5      S5      nU	U-  n	U	$ )Nr  r  r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. ro  r  rs   rq  rd   )r   r%  r  rH   r  rt  rh   minimummaximumr  rA  rw  rp  r  r  r[  clip_by_valuerd  rM  rr  r   r   rk   rP  rT  )r0   rK  rO  r   r   r  r&  shape_list_posatt_spanscorepos_key_layerc2p_attrJ  pos_query_layerr_posp2c_posrS  rR  s                     r3   r  8TFDebertaDisentangledSelfAttention.disentangled_att_bias  s6   ;'+A21j6KB6OPL#L1~!#>>"..q*I1ML A%>>,:L A%QRUVdReQfghh77JJ

:k226
98Mb8QRTXToTo HH	
 66ADD_D_bjDjjlmmnpq
  D%%% MM.9M 55mDMiiR\\--VWG&&|'>8a<RSCSTG"7,>wUa,bdfgGE D%%%"oon=O"77HOrww||
?3B7,FdN`N`a  O +&r*j.CB.GG/
90Eb0I:V_K`acKde$&&v'8!X\A=MNGii	2<<+VWGllW&8y&Y[]^`lG +&r*j.CB.GGNN<1a
+CRH	&w0B9gW`0acefWEr5   )rv  ru  rF   r.   r%   rl  rm  rh  r  rs  rw  rn  rp  rr  r|  r  rx  rk  r}  rN   r@   )r  rt   rR   rt   r   r  )rT   rU   rV   rW   ru   r(   rJ   r  r=   r  rY   rZ   r[   s   @r3   r   r     s    7rA@7 *.)-+/"'m m "m '	m
 'm )m  m m 
m^7 7r5   r   c                  j   ^  \ rS rSrSrU 4S jrSS jr      S             S	S jjrSrU =r	$ )
TFDebertaEmbeddingsi2  zGConstruct the embeddings from word, position and token_type embeddings.c                8  > [         TU ]  " S0 UD6  Xl        [        USUR                  5      U l        UR                  U l        UR                  U l        [        USS5      U l        UR                  U l        U R
                  UR                  :w  aB  [        R                  R                  UR                  [        UR                  5      SSS9U l        [        R                  R                  UR                  SS9U l        [#        UR$                  S	S
9U l        g )Nembedding_sizeposition_biased_inputT
embed_projFri  r   r   r%   r#   r&   )r'   r(   r.   rG   rA   r  r  r  r   r   r)   r*   r   r  r   r   r   r,   r   r%   r/   s      r3   r(   TFDebertaEmbeddings.__init__5  s    "6"%f.>@R@RS!--'-'E'E$%,V5Ld%S"!'!9!9&"4"44#ll00""#263K3K#L!	 1 DO 88AVAV]h8i-f.H.HyYr5   c                N   [         R                  " S5         U R                  SU R                  R                  U R
                  /[        U R                  5      S9U l        S S S 5        [         R                  " S5         U R                  R                  S:  aJ  U R                  SU R                  R                  U R
                  /[        U R                  5      S9U l
        OS U l
        S S S 5        [         R                  " S5         U R                  (       a@  U R                  SU R                  U R                  /[        U R                  5      S9U l        OS U l        S S S 5        U R                  (       a  g SU l        [!        U S	S 5      be  [         R                  " U R"                  R$                  5         U R"                  R'                  S S U R                  R                  /5        S S S 5        [!        U S
S 5      bN  [         R                  " U R(                  R$                  5         U R(                  R'                  S 5        S S S 5        [!        U SS 5      b\  [         R                  " U R*                  R$                  5         U R*                  R'                  S S U R
                  /5        S S S 5        g g ! , (       d  f       GNc= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       g = f)Nword_embeddingsr   r  token_type_embeddingsr   
embeddingsposition_embeddingsTr   r%   r  )rH   rI   r   r.   
vocab_sizer  r   r   r   type_vocab_sizer  r  r  rA   r  rF   rG   r   r$   rJ   r%   r  rK   s     r3   rJ   TFDebertaEmbeddings.buildH  sG   ]],-//{{--t/B/BC+D,B,BC * DK . ]]23{{**Q.-1__%;;668K8KL /0F0F G .= .* .2* 4 ]]01))+/??%779I9IJ /0F0F G ,; ,( ,0( 2 ::
4d+7t~~223$$dD$++2I2I%JK 44D)5t||001""4( 24t,8t334%%tT43F3F&GH 54 9I .- 43 21 43 21 54sJ   A
J=?A,K	AK!;3K3.L
)L=
K
K!
K03
L
L
L$c                   Uc  Uc  [        S5      eUb>  [        XR                  R                  5        [        R
                  " U R                  US9n[        U5      SS nUc  [        R                  " USS9nUc+  [        R                  " [        R                  " SUS   S9SS9nUnU R                  (       a#  [        R
                  " U R                  US9n	X-  nU R                  R                  S:  a#  [        R
                  " U R                  US9n
X-  nU R                  U R                   :w  a  U R#                  U5      nU R%                  U5      nUb  ['        [        U5      5      ['        [        U5      5      :w  ar  ['        [        U5      5      S	:X  a(  [        R(                  " [        R(                  " US
S9S
S9n[        R*                  " [        R                  " USS9U R,                  S9nX-  nU R/                  XS9nU$ )zr
Applies embedding based on inputs tensor.

Returns:
    final_embeddings (`tf.Tensor`): output embedding tensor.
Nz5Need to provide either `input_ids` or `input_embeds`.)paramsr]  rs   r   dimsvalue)startlimitrr   r  r   r  rd   r7   )rt  r   r.   r  rH   r\  r   r   fillr  r  r  r  r  r  r  rA   r  r   r  r  rh   rk   r%   )r0   	input_idsposition_idstoken_type_idsinputs_embedsrn   r8   rL   final_embeddingsposition_embedstoken_type_embedss              r3   r=   TFDebertaEmbeddings.callq  s    !6TUU *9kk6L6LMIIT[[)LM /4!WW+Q?N>>"((+b/*RYZ[L(%% iit/G/GQ]^O/;;&&* "		1K1KUc d1$"2"22#/?@>>*:;:d#$J7G,H(IIz$'(A-::bjjA&>QGDwwr~~d;4CUCUV/6<<(8<Lr5   )r   rF   r.   r%   r  r  rA   r   r  r  r  r  r   r@   )NNNNNF)r  r   r  r   r  r   r  r   rn   r   r8   rQ   rR   rt   r   r[   s   @r3   r  r  2  su    QZ&'IV '+)-+/*.!%5 #5  '5  )	5 
 (5  5  5  
5  5 r5   r  c                  @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	 TFDebertaPredictionHeadTransformi  c                  > [         TU ]  " S0 UD6  [        USUR                  5      U l        [
        R                  R                  U R                  [        UR                  5      SS9U l
        [        UR                  [        5      (       a  [        UR                  5      U l        OUR                  U l        [
        R                  R!                  UR"                  SS9U l        Xl        g )Nr  r"   r   r   r   r&   )r'   r(   rG   rA   r  r   r)   r*   r   r   r"   r   r   r   r   transform_act_fnr   r   r   r.   r/   s      r3   r(   )TFDebertaPredictionHeadTransform.__init__  s    "6"%f.>@R@RS\\''%%.v/G/GH ( 

 f''--$5f6G6G$HD!$*$5$5D!88AVAV]h8ir5   c                h    U R                  US9nU R                  U5      nU R                  U5      nU$ r   )r"   r  r   r   s     r3   r=   %TFDebertaPredictionHeadTransform.call  s6    

-
8--m<}5r5   c                4   U R                   (       a  g SU l         [        U SS 5      be  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       Ny= f! , (       d  f       g = f)NTr"   r   )rF   rG   rH   rI   r"   r$   rJ   r.   rA   r   r  rK   s     r3   rJ   &TFDebertaPredictionHeadTransform.build  s    ::
4$'3tzz/

  $dkk.E.E!FG 04d+7t~~223$$dD$2E2E%FG 43 8 0/ 43s   3C8)D	8
D	
D)r   rF   r.   r"   r  r  rN   r   r@   r   r[   s   @r3   r  r    s    $	H 	Hr5   r  c                  h   ^  \ rS rSrS
U 4S jjrSS jrSS jrSS jrSS jrSS jr	SS jr
S	rU =r$ )TFDebertaLMPredictionHeadi  c                   > [         TU ]  " S0 UD6  Xl        [        USUR                  5      U l        [        USS9U l        X l        g )Nr  	transformr#   r&   )	r'   r(   r.   rG   rA   r  r  r  input_embeddingsr0   r.   r  r1   r2   s       r3   r(   "TFDebertaLMPredictionHead.__init__  sF    "6"%f.>@R@RS9&{S !1r5   c                j   U R                  U R                  R                  4SSSS9U l        U R                  (       a  g SU l        [        U SS 5      bO  [        R                  " U R                  R                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NzerosTr   )r   r   	trainabler$   r  )r   r.   r  r   rF   rG   rH   rI   r  r$   rJ   rK   s     r3   rJ   TFDebertaLMPredictionHead.build  s    OO4;;+A+A*CQXdhouOv	::
4d+7t~~223$$T* 43 833s   >B$$
B2c                    U R                   $ r@   )r  rB   s    r3   get_output_embeddings/TFDebertaLMPredictionHead.get_output_embeddings  s    $$$r5   c                ^    XR                   l        [        U5      S   U R                   l        g Nr   )r  r   r   r  r0   r  s     r3   set_output_embeddings/TFDebertaLMPredictionHead.set_output_embeddings  s&    ',$+5e+<Q+?(r5   c                    SU R                   0$ )Nr   )r   rB   s    r3   get_bias"TFDebertaLMPredictionHead.get_bias  s    		""r5   c                X    US   U l         [        US   5      S   U R                  l        g )Nr   r   )r   r   r.   r  r  s     r3   set_bias"TFDebertaLMPredictionHead.set_bias  s'    &M	!+E&M!:1!=r5   c                x   U R                  US9n[        U5      S   n[        R                  " USU R                  /S9n[        R
                  " XR                  R                  SS9n[        R                  " USX R                  R                  /S9n[        R                  R                  XR                  S9nU$ )Nr   r   rs   r  T)ar  r  )r  r   )r  r   rH   r@  r  r  r  r   r.   r  nnbias_addr   )r0   r:   
seq_lengths      r3   r=   TFDebertaLMPredictionHead.call  s    ]C.q1


-DDWDW?XY		M5J5J5Q5Q_cd

-JP[P[PfPf?gh]Kr5   )r   rF   r.   r  r  r  r.   r   r  keras.layers.Layerr@   rR   r  r  ztf.Variable)rR   zdict[str, tf.Variable]r   )rT   rU   rV   rW   r(   rJ   r  r  r  r  r=   rY   rZ   r[   s   @r3   r  r    s,    
1+%@#> r5   r  c                  @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	TFDebertaOnlyMLMHeadi  c                D   > [         TU ]  " S0 UD6  [        XSS9U l        g )Npredictionsr#   r&   )r'   r(   r  r  r  s       r3   r(   TFDebertaOnlyMLMHead.__init__   s#    "6"4VTabr5   c                $    U R                  US9nU$ )Nr   )r  )r0   sequence_outputprediction_scoress      r3   r=   TFDebertaOnlyMLMHead.call  s     ,,?,K  r5   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTr  )rF   rG   rH   rI   r  r$   rJ   rK   s     r3   rJ   TFDebertaOnlyMLMHead.build	  sb    ::
4-9t//445  &&t, 65 :55   A88
B)rF   r  r  )r  rt   rR   rt   r@   r   r[   s   @r3   r   r     s    c!
- -r5   r   c                     ^  \ rS rSr\rS	U 4S jjrS
S jrSS jrS r	\
         S                   SS jj5       rSS jrSrU =r$ )TFDebertaMainLayeri  c                n   > [         TU ]  " S0 UD6  Xl        [        USS9U l        [        USS9U l        g )Nr  r#   encoderr&   )r'   r(   r.   r  r  r
  r  r/   s      r3   r(   TFDebertaMainLayer.__init__  s4    "6"-f<H'Y?r5   c                    U R                   $ r@   )r  rB   s    r3   get_input_embeddings'TFDebertaMainLayer.get_input_embeddings  s    r5   c                ^    XR                   l        [        U5      S   U R                   l        g r  )r  r   r   r  r  s     r3   set_input_embeddings'TFDebertaMainLayer.set_input_embeddings!  s"    !&%/%6q%9"r5   c                    [         e)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
)NotImplementedError)r0   heads_to_prunes     r3   _prune_headsTFDebertaMainLayer._prune_heads%  s
    
 "!r5   c
           	        Ub  Ub  [        S5      eUb  [        U5      n
OUb  [        U5      S S n
O[        S5      eUc  [        R                  " U
SS9nUc  [        R                  " U
SS9nU R	                  UUUUUU	S9nU R                  UUUUUU	S9nUS   nU(       d	  U4USS  -   $ [        UUR                  UR                  S	9$ )
NzDYou cannot specify both input_ids and inputs_embeds at the same timers   z5You have to specify either input_ids or inputs_embedsr   r  r   )r  r  r  r  rn   r8   )r:   r   r   r4  r5  r8   r/  )	rt  r   rH   r  r  r  r   r:   r1  )r0   r  r   r  r  r  r   r4  r5  r8   rL   embedding_outputencoder_outputsr  s                 r3   r=   TFDebertaMainLayer.call,  s     ]%>cdd"$Y/K&$]3CR8KTUU!WW+Q?N!WW+Q?N??%)' + 
 ,,*)/!5# ' 
 *!,#%(;;; -)77&11
 	
r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       Nl= f! , (       d  f       g = f)NTr  r  )rF   rG   rH   rI   r  r$   rJ   r  rK   s     r3   rJ   TFDebertaMainLayer.builde  s    ::
4t,8t334%%d+ 54D)5t||001""4( 21 6 54 21r   )rF   r.   r  r  rN   r  r  	NNNNNNNNF)r  TFModelInputType | Noner   np.ndarray | tf.Tensor | Noner  r#  r  r#  r  r#  r   bool | Noner4  r$  r5  r$  r8   rQ   rR   r<  r@   )rT   rU   rV   rW   r   config_classr(   r  r  r  r   r=   rJ   rY   rZ   r[   s   @r3   r  r    s     L@:"  .28<8<6:7;)-,0#'6
*6
 66
 6	6

 46
 56
 '6
 *6
 !6
 6
 
.6
 6
p	) 	)r5   r  c                       \ rS rSrSr\rSrSrg)TFDebertaPreTrainedModeliq  zz
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
debertar&   N)	rT   rU   rV   rW   ru   r   r%  base_model_prefixrY   r&   r5   r3   r'  r'  q  s    
 !L!r5   r'  a9
  
    The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled
    Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build
    on top of BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two
    improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data.

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`DebertaConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a	  
    Args:
        input_ids (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput``] instead of a plain tuple.
zaThe bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.c                     ^  \ rS rSrSU 4S jjr\\" \R                  S5      5      \	" \
\\S9         S	                   S
S jj5       5       5       rSS jrSrU =r$ )TFDebertaModeli  c                L   > [         TU ]  " U/UQ70 UD6  [        USS9U l        g )Nr(  r#   )r'   r(   r  r(  r0   r.   rm   r1   r2   s       r3   r(   TFDebertaModel.__init__  s(    3&3F3)&yAr5   batch_size, sequence_length
checkpointoutput_typer%  c
                4    U R                  UUUUUUUUU	S9	n
U
$ )N	r  r   r  r  r  r   r4  r5  r8   )r(  )r0   r  r   r  r  r  r   r4  r5  r8   r  s              r3   r=   TFDebertaModel.call  s9    & ,,))%'/!5#  

 r5   c                   U R                   (       a  g SU l         [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       g = f)NTr(  )rF   rG   rH   rI   r(  r$   rJ   rK   s     r3   rJ   TFDebertaModel.build  s^    ::
4D)5t||001""4( 21 611r
  )rF   r(  rN   r!  )r  r"  r   r#  r  r#  r  r#  r  r#  r   r$  r4  r$  r5  r$  r8   r$  rR   r<  r@   )rT   rU   rV   rW   r(   r   r   DEBERTA_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCr=   rJ   rY   rZ   r[   s   @r3   r+  r+    s    
B
 *+C+J+JKh+ij&%$ .28<8<6:7;)-,0#' %* 6 6	
 4 5 ' * !  
. k 4) )r5   r+  z5DeBERTa Model with a `language modeling` head on top.c                     ^  \ rS rSrS	U 4S jjrS
S jr\\" \R                  S5      5      \
" \\\S9          S                     SS jj5       5       5       rSS jrSrU =r$ )TFDebertaForMaskedLMi  c                   > [         TU ]  " U/UQ70 UD6  UR                  (       a  [        R	                  S5        [        USS9U l        [        XR                  R                  SS9U l	        g )NzpIf you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.r(  r#   cls)r  r$   )
r'   r(   
is_decoderloggerwarningr  r(  r   r  mlmr-  s       r3   r(   TFDebertaForMaskedLM.__init__  s]    3&3F3NN1
 *&yA'AXAX_der5   c                .    U R                   R                  $ r@   )rC  r  rB   s    r3   get_lm_head TFDebertaForMaskedLM.get_lm_head  s    xx###r5   r/  r0  c                    U R                  UUUUUUUUU
S9	nUS   nU R                  XS9nU	c  SOU R                  XS9nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR
                  S9$ )a  
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
r4  r   )r  r8   Nlabelslogitsr  lossrK  r:   r1  )r(  rC  hf_compute_lossr   r:   r1  )r0   r  r   r  r  r  r   r4  r5  rJ  r8   r  r  r  rM  rp   s                   r3   r=   TFDebertaForMaskedLM.call  s    4 ,,))%'/!5#  

 "!* HH_HX~t4+?+?v+?+h')GABK7F)-)9TGf$EvE$!//))	
 	
r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bO  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        g g ! , (       d  f       Nl= f! , (       d  f       g = f)NTr(  rC  )rF   rG   rH   rI   r(  r$   rJ   rC  rK   s     r3   rJ   TFDebertaForMaskedLM.buildJ  s    ::
4D)5t||001""4( 24%1txx}}-t$ .- 2 21 .-r   )rF   r(  rC  rN   r  
NNNNNNNNNF)r  r"  r   r#  r  r#  r  r#  r  r#  r   r$  r4  r$  r5  r$  rJ  r#  r8   r$  rR   z#TFMaskedLMOutput | tuple[tf.Tensor]r@   )rT   rU   rV   rW   r(   rF  r   r   r8  r9  r   r:  r   r;  r=   rJ   rY   rZ   r[   s   @r3   r=  r=    s    
f$ *+C+J+JKh+ij&$$ .28<8<6:7;)-,0#'04 %+
*+
 6+
 6	+

 4+
 5+
 '+
 *+
 !+
 .+
 +
 
-+
 k +
Z	% 	%r5   r=  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                     ^  \ rS rSrSU 4S jjr\\" \R                  S5      5      \	" \
\\S9          S	                     S
S jj5       5       5       rSS jrSrU =r$ )"TFDebertaForSequenceClassificationiV  c                  > [         TU ]  " U/UQ70 UD6  UR                  U l        [        USS9U l        [        USS9U l        [        USS 5      nUc  U R                  R                  OUn[        USS9U l        [        R                  R                  UR                  [        UR                   5      SS9U l        U R                  R$                  U l        g )Nr(  r#   poolercls_dropout
classifierr   )r'   r(   
num_labelsr  r(  r   rV  rG   r.   r   r,   r%   r   r)   r*   r   r   rX  rC   )r0   r.   rm   r1   drop_outr2   s        r3   r(   +TFDebertaForSequenceClassification.__init__^  s    3&3F3 ++)&yA,V(C6=$76>6F4;;22H-h]K,,,,##.v/G/GH - 

 ++00r5   r/  r0  c                6   U R                  UUUUUUUUU
S9	nUS   nU R                  XS9nU R                  XS9nU R                  U5      nU	c  SOU R	                  XS9nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )an  
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
r4  r   r7   NrI  r   rL  )r(  rV  r%   rX  rN  r
   r:   r1  )r0   r  r   r  r  r  r   r4  r5  rJ  r8   r  r  r<   rK  rM  rp   s                    r3   r=   'TFDebertaForSequenceClassification.callp  s    4 ,,))%'/!5#  

 "!*OG]F/~t4+?+?v+?+]Y,F)-)9TGf$EvE)!//))	
 	
r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      b\  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  /5        S S S 5        g g ! , (       d  f       GN2= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       g = f)NTr(  rV  r%   rX  )rF   rG   rH   rI   r(  r$   rJ   rV  r%   rX  rC   rK   s     r3   rJ   (TFDebertaForSequenceClassification.build  s:   ::
4D)5t||001""4( 244(4t{{//0!!$' 14D)5t||001""4( 24t,8t334%%tT4??&CD 54 9 21 10 21 54s0   F.F+
F<&)G
F(+
F9<
G

G)rF   rX  r(  r%   rY  rC   rV  rN   rR  )r  r"  r   r#  r  r#  r  r#  r  r#  r   r$  r4  r$  r5  r$  rJ  r#  r8   r$  rR   z-TFSequenceClassifierOutput | tuple[tf.Tensor]r@   )rT   rU   rV   rW   r(   r   r   r8  r9  r   r:  r
   r;  r=   rJ   rY   rZ   r[   s   @r3   rT  rT  V  s    1$ *+C+J+JKh+ij&.$ .28<8<6:7;)-,0#'04 %.
*.
 6.
 6	.

 4.
 5.
 '.
 *.
 !.
 ..
 .
 
7.
 k .
`E Er5   rT  z
    DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    c                     ^  \ rS rSrSU 4S jjr\\" \R                  S5      5      \	" \
\\S9          S	                     S
S jj5       5       5       rSS jrSrU =r$ )TFDebertaForTokenClassificationi  c                T  > [         TU ]  " U/UQ70 UD6  UR                  U l        [        USS9U l        [
        R                  R                  UR                  S9U l	        [
        R                  R                  UR                  [        UR                  5      SS9U l        Xl        g )Nr(  r#   )raterX  r   )r'   r(   rY  r  r(  r   r)   Dropoutr   r%   r*   r   r   rX  r.   r-  s       r3   r(   (TFDebertaForTokenClassification.__init__  s    3&3F3 ++)&yA||++1K1K+L,,,,##H`H`8aht - 
 r5   r/  r0  c                   U R                  UUUUUUUUU
S9	nUS   nU R                  XS9nU R                  US9nU	c  SOU R                  XS9nU(       d  U4USS -   nUb  U4U-   $ U$ [	        UUUR
                  UR                  S9$ )	z
labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
r4  r   r7   r   NrI  r   rL  )r(  r%   rX  rN  r   r:   r1  )r0   r  r   r  r  r  r   r4  r5  rJ  r8   r  r  rK  rM  rp   s                   r3   r=   $TFDebertaForTokenClassification.call  s    0 ,,))%'/!5#  

 "!*,,,J8~t4+?+?v+?+]Y,F)-)9TGf$EvE&!//))	
 	
r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bf  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        g g ! , (       d  f       N= f! , (       d  f       g = f)NTr(  rX  )
rF   rG   rH   rI   r(  r$   rJ   rX  r.   rA   rK   s     r3   rJ   %TFDebertaForTokenClassification.build       ::
4D)5t||001""4( 24t,8t334%%tT4;;3J3J&KL 54 9 21 54   C+.3C<+
C9<
D
)rF   rX  r.   r(  r%   rY  rN   rR  )r  r"  r   r#  r  r#  r  r#  r  r#  r   r$  r4  r$  r5  r$  rJ  r#  r8   r$  rR   z*TFTokenClassifierOutput | tuple[tf.Tensor]r@   )rT   rU   rV   rW   r(   r   r   r8  r9  r   r:  r   r;  r=   rJ   rY   rZ   r[   s   @r3   ra  ra    s    
 *+C+J+JKh+ij&+$ .28<8<6:7;)-,0#'04 %*
**
 6*
 6	*

 4*
 5*
 '*
 **
 !*
 .*
 *
 
4*
 k *
X	M 	Mr5   ra  z
    DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    c                     ^  \ rS rSrSU 4S jjr\\" \R                  S5      5      \	" \
\\S9           S	                       S
S jj5       5       5       rSS jrSrU =r$ )TFDebertaForQuestionAnsweringi  c                   > [         TU ]  " U/UQ70 UD6  UR                  U l        [        USS9U l        [
        R                  R                  UR                  [        UR                  5      SS9U l
        Xl        g )Nr(  r#   
qa_outputsr   )r'   r(   rY  r  r(  r   r)   r*   r   r   ro  r.   r-  s       r3   r(   &TFDebertaForQuestionAnswering.__init__  sp    3&3F3 ++)&yA,,,,##H`H`8aht - 
 r5   r/  r0  c                   U R                  UUUUUUUUUS9	nUS   nU R                  US9n[        R                  " USSS9u  nn[        R                  " USS9n[        R                  " USS9nSnU	b  U
b  S	U	0nU
US
'   U R                  UUU4S9nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )a  
start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
    Labels for position (index) of the start of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
    Labels for position (index) of the end of the labelled span for computing the token classification loss.
    Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
    are not taken into account for computing the loss.
r4  r   r   r  rs   )r  r  ra   )inputra   Nstart_positionend_positionrI  )rM  start_logits
end_logitsr:   r1  )	r(  ro  rH   r  r  rN  r	   r:   r1  )r0   r  r   r  r  r  r   r4  r5  start_positionsend_positionsr8   r  r  rK  ru  rv  rM  rJ  rp   s                       r3   r=   "TFDebertaForQuestionAnswering.call  s   > ,,))%'/!5#  

 "!*8#%88&QUW#X jzz2>ZZjr:
&=+D&8F%2F>"''v|Z>X'YD"J/'!"+=F)-)9TGf$EvE-%!!//))
 	
r5   c                   U R                   (       a  g SU l         [        U SS 5      bN  [        R                  " U R                  R
                  5         U R                  R                  S 5        S S S 5        [        U SS 5      bf  [        R                  " U R                  R
                  5         U R                  R                  S S U R                  R                  /5        S S S 5        g g ! , (       d  f       N= f! , (       d  f       g = f)NTr(  ro  )
rF   rG   rH   rI   r(  r$   rJ   ro  r.   rA   rK   s     r3   rJ   #TFDebertaForQuestionAnswering.builda  rj  rk  )rF   r.   r(  rY  ro  rN   )NNNNNNNNNNF)r  r"  r   r#  r  r#  r  r#  r  r#  r   r$  r4  r$  r5  r$  rw  r#  rx  r#  r8   r$  rR   z1TFQuestionAnsweringModelOutput | tuple[tf.Tensor]r@   )rT   rU   rV   rW   r(   r   r   r8  r9  r   r:  r	   r;  r=   rJ   rY   rZ   r[   s   @r3   rm  rm    s    	 *+C+J+JKh+ij&2$ .28<8<6:7;)-,0#'9=7; %9
*9
 69
 6	9

 49
 59
 '9
 *9
 !9
 79
 59
 9
 
;9
 k 9
v	M 	Mr5   rm  )r=  rm  rT  ra  r+  r'  )Lru   
__future__r   r   collections.abcr   numpynp
tensorflowrH   activations_tfr   modeling_tf_outputsr   r   r	   r
   r   modeling_tf_utilsr   r   r   r   r   r   r   r   r   tf_utilsr   r   r   utilsr   r   r   r   configuration_debertar   
get_loggerrT   rA  r;  r:  r)   Layerr   r]   r,   r   r   r   r   r   r   r
  r%  rM  rP  rT  rd  r   r  r  r  r   r  r'  DEBERTA_START_DOCSTRINGr8  r+  r=  rT  ra  rm  __all__r&   r5   r3   <module>r     s    "  $   / 
 
 
 S R u u 0 
		H	% ". )U\\// )@** ,%U\\// %P9++ 9()%,,,, ):-.++ -.`HELL.. H:)ell(( )B0-U\\'' 0-fi
u||)) i
X*0,,.
0R);); Rjt %,,,, t n#Hu||'9'9 #HL- 2 2 -`-5<<-- -([)++ [)|"0 "( T) X g-)- -)	-)` QSjkM%35Q M% lM%`  YE)AC_ YEYEx  IM&>@Y IMIMX  WM$<>U WMWMtr5   