
    bCi                        S r SSKrSSKJrJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJrJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJrJrJr  SSKJ r   SSK!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+  \'RX                  " \-5      r.Sr/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S S\R`                  5      r3S\20r4 " S S\R`                  5      r5 " S S\R`                  5      r6 " S S\R`                  5      r7 " S  S!\5      r8 " S" S#\R`                  5      r9 " S$ S%\R`                  5      r:\& " S& S'\ 5      5       r;\& " S( S)\;5      5       r<\&" S*S+9 " S, S-\;\5      5       r=\& " S. S/\;5      5       r> " S0 S1\R`                  5      r?\&" S2S+9 " S3 S4\;5      5       r@\& " S5 S6\;5      5       rA\& " S7 S8\;5      5       rB " S9 S:\R`                  5      rC\& " S; S<\;5      5       rDS?S= jrE/ S>QrFg)@zPyTorch Data2VecText model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FNgelu)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging)deprecate_kwarg   )Data2VecTextConfig   c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )Data2VecTextForTextEmbeddings4   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r    F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr*   register_buffertorcharangeexpandzerosr,   sizelongr'   selfconfig	__class__s     m/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/data2vec/modeling_data2vec_text.pyr3   &Data2VecTextForTextEmbeddings.__init__:   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
     c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr-   r    r/   r   r1   devicer+   )"create_position_ids_from_input_idsr'   &create_position_ids_from_inputs_embedsrH   hasattrr/   rF   rD   rG   rI   r,   rS   r8   r<   r*   r:   r=   rA   )rK   	input_idsr/   r,   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr<   
embeddingsr:   s                rN   forward%Data2VecTextForTextEmbeddings.forwardS   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
rP   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr-   r    rR   r   )rH   rD   rE   r'   rI   rS   	unsqueezerF   )rK   rX   rZ   sequence_lengthr,   s        rN   rU   DData2VecTextForTextEmbeddings.create_position_ids_from_inputs_embeds{   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<rP   )r=   rA   r'   r*   r:   r<   r8   )NNNNr   )
__name__
__module____qualname____firstlineno____doc__r3   r_   rU   __static_attributes____classcell__rM   s   @rN   r$   r$   4   s$    

4 rs&P= =rP   r$   c                     ^  \ rS rSrSU 4S jjr\" SSSS9      SS\R                  S\\R                     S	\\R                     S
\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rSrU =r$ )Data2VecTextSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        X0l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r*   r+   relative_keyrelative_key_queryr"   r    )r2   r3   r6   num_attention_headsrV   
ValueErrorintattention_head_sizeall_head_sizer   Linearquerykeyvaluer?   attention_probs_dropout_probrA   rB   r*   r9   r4   distance_embedding
is_decoder	layer_idxrK   rL   r*   r   rM   s       rN   r3   "Data2VecTextSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"rP   past_key_valuepast_key_values4.58new_nameversionhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionreturnc                 	   UR                   u  pn
U R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nSnUS LnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nUR                  USU R                  U R                  5      R                  SS5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUbc  U(       d  UOS nWR%                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nU R*                  S:X  d  U R*                  S	:X  Ga  UR                   S   UR                   S   nnUbB  [&        R,                  " US-
  [&        R.                  UR0                  S
9R                  SS5      nO>[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      n[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      nUU-
  nU R5                  UU R6                  -   S-
  5      nUR9                  UR:                  S9nU R*                  S:X  a  [&        R<                  " SUU5      nUU-   nOHU R*                  S	:X  a8  [&        R<                  " SUU5      n[&        R<                  " SUU5      nUU-   U-   nU[>        R@                  " U R                  5      -  nUb  UU-   n[B        RD                  RG                  USS9nU RI                  U5      nUb  UU-  n[&        R(                  " UU5      nURK                  SSSS5      RM                  5       nURO                  5       S S U RP                  4-   nUR                  U5      nUU4$ )Nr-   r    r"   Fr   Trs   rt   rR   r0   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r	   ))shaper{   viewru   rx   	transpose
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr|   r}   updaterD   matmulr*   tensorrI   rS   rE   r   r9   tor1   einsummathsqrtr   
functionalsoftmaxrA   permute
contiguousrH   ry   )rK   r   r   r   r   r   r   r   
batch_sizer[   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  rN   r_   !Data2VecTextSelfAttention.forward   sa    %2$7$7!
jj/!&&z2t7O7OQUQiQijttq
 
2$>&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  *7It)<)C)C{DNN=M~<^*&	; &*_FY*Z*ZAEO..t~~> !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L*!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--rP   )ry   rx   r   rA   r   r|   r   r9   ru   r*   r{   r}   NNNNNNFN)re   rf   rg   rh   r3   r   rD   Tensorr   FloatTensorr   booltupler_   rj   rk   rl   s   @rN   rn   rn      s    #6 %0A6R 7;15=A+/,115e.||e. !!2!23e. E--.	e.
  ((9(9:e. "%e. $D>e. !.e. 
u||	e. Se.rP   rn   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )Data2VecTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr(   )r2   r3   r   rz   r6   denser=   r>   r?   r@   rA   rJ   s     rN   r3   Data2VecTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rP   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr   rA   r=   rK   r   r   s      rN   r_   Data2VecTextSelfOutput.forward  5    

=1]3}'CDrP   r=   r   rA   
re   rf   rg   rh   r3   rD   r   r_   rj   rk   rl   s   @rN   r   r     6    >U\\  RWR^R^  rP   r   eagerc                   $  ^  \ rS rSrSU 4S jjrS r\" SSSS9      SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\   S\	\   S\	\R                     S\\R                     4S jj5       rSrU =r$ )Data2VecTextAttentioni(  c                    > [         TU ]  5         [        UR                     " UUUS9U l        [        U5      U l        [        5       U l        g )Nr*   r   )	r2   r3   $DATA2VEC_TEXT_SELF_ATTENTION_CLASSES_attn_implementationrK   r   outputsetpruned_headsr   s       rN   r3   Data2VecTextAttention.__init__)  sF    89T9TU$;
	
 -V4ErP   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r    r   )lenr   rK   ru   rx   r   r   r{   r|   r}   r   r   ry   union)rK   headsindexs      rN   prune_heads!Data2VecTextAttention.prune_heads3  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rP   r   r   r   r   r   r   r   r   r   r   r   c           
      l    U R                  UUUUUUUS9nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   r   r   r   r   r   r    )rK   r   )rK   r   r   r   r   r   r   r   self_outputsattention_outputoutputss              rN   r_   Data2VecTextAttention.forwardE  s\     yy)"7+/) ! 
  ;;|AF#%QR(88rP   )r   r   rK   r   r   )re   rf   rg   rh   r3   r   r   rD   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r   r   (  s    ";$ %0A6R 7;15=A+/,115|| !!2!23 E--.	
  ((9(9: "% $D> !. 
u||	 SrP   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Data2VecTextIntermediatei_  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r2   r3   r   rz   r6   intermediate_sizer   r   
hidden_actstrr
   intermediate_act_fnrJ   s     rN   r3   !Data2VecTextIntermediate.__init__`  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rP   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   )rK   r   s     rN   r_    Data2VecTextIntermediate.forwardh  s&    

=100?rP   r   r   rl   s   @rN   r   r   _  s(    9U\\ ell  rP   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )Data2VecTextOutputio  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r2   r3   r   rz   r   r6   r   r=   r>   r?   r@   rA   rJ   s     rN   r3   Data2VecTextOutput.__init__p  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rP   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      rN   r_   Data2VecTextOutput.forwardv  r   rP   r   r   rl   s   @rN   r   r   o  r   rP   r   c                   D  ^  \ rS rSrSU 4S jjr\" SSSS9       SS\R                  S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rS rSrU =r$ )Data2VecTextLayeri~  c                 r  > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        USUS9U l	        [        U5      U l        [        U5      U l        g )Nr    r   z> should be used as a decoder model if cross attention is addedr+   r   )r2   r3   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionrv   crossattentionr   intermediater   r   )rK   rL   r   rM   s      rN   r3   Data2VecTextLayer.__init__  s    '-'E'E$.vK ++#)#=#= ##?? D6)g!hii"7
i#D 5V<(0rP   r   r   r   r   r   r   r   r   encoder_attention_maskr   r   r   c	           
      P   U R                  UUUUUUS9n	U	S   n
U	SS  nU R                  (       aD  UbA  [        U S5      (       d  [        SU  S35      eU R	                  U
UUUUUUS9nUS   n
XSS  -   n[        U R                  U R                  U R                  U
5      nU4U-   nU$ )N)r   r   r   r   r   r   r    r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r   rV   rv   r  r   feed_forward_chunkr  r  )rK   r   r   r   r   r  r   r   r   self_attention_outputsr   r   cross_attention_outputslayer_outputs                 rN   r_   Data2VecTextLayer.forward  s    "&)/+) "0 "
 2!4(,??4@4!122 =dV DD D 
 '+&9&9 5#&; /"3- ': '#  7q9 ;;G0##T%A%A4CSCSUe
  /G+rP   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r	  r   )rK   r   intermediate_outputr  s       rN   r  $Data2VecTextLayer.feed_forward_chunk  s)    "//0@A{{#6IrP   )r  r  r  r  r	  r   r   r  r   )NNNNNFN)re   rf   rg   rh   r3   r   rD   r   r   r   r   r   r   r_   r  rj   rk   rl   s   @rN   r  r  ~  s    1  %0A6R 7;15=A>B+/,115.||. !!2!23. E--.	.
  ((9(9:. !)):): ;. "%. $D>. !.. 
u||	. S.` rP   r  c                   V  ^  \ rS rSrSU 4S jjr          SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\
   S
\\
   S\\
   S\\
   S\\R                     S\\\R                     \4   4S jjrSrU =r$ )Data2VecTextEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l	        g s  snf )Nr  F)
r2   r3   rL   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)rK   rL   r   irM   s       rN   r3   Data2VecTextEncoder.__init__  sU    ]]TYZ`ZrZrTs#tTsq$5f$JTs#tu
&+# $us   A$r   r   r   r   r  r   	use_cacher   output_hidden_statesreturn_dictr   r   c                    U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       aL  U R                   R                  (       a1  Uc.  [        [        U R                   S9[        U R                   S95      nU(       a[  U R                   R                  (       a@  [        U[        5      (       a+  [        R                  S5        [        R                  " U5      n[        U R                  5       He  u  nnU	(       a  X4-   nUb  X?   OS nU" UUUUUUUUS9nUS   nU(       d  M6  UUS   4-   nU R                   R                  (       d  M\  UUS	   4-   nMg     U	(       a  X4-   nU
(       d  [        S
 UUUUU4 5       5      $ [        UUUUUS9$ )N zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rL   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r  r   r   r   r   r    r"   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r#  ).0vs     rN   	<genexpr>.Data2VecTextEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater   r   
attentionscross_attentions)rL   r  r  trainingloggerwarning_oncer   r   r   r   r   from_legacy_cache	enumerater  r   )rK   r   r   r   r   r  r   r  r   r   r!  r   all_hidden_statesall_self_attentionsall_cross_attentionsr  layer_modulelayer_head_masklayer_outputss                      rN   r_   Data2VecTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	//O4K1,dkk2RT`hlhshsTtuO//JPU4V4V\
 2CCOTO(4OA|#$58H$H!.7.CilO(%'= /"3-	M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U(+  5.   14D D 
 "#%'(
 
 
 9+++*1
 	
rP   )rL   r  r  r   )
NNNNNNFFTN)re   rf   rg   rh   r3   rD   r   r   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s   , 7;15=A>B+/$(,1/4&*15P
||P
 !!2!23P
 E--.	P

  ((9(9:P
 !)):): ;P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 !.P
 
uU\\"$MM	NP
 P
rP   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )Data2VecTextPooleri"  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )r2   r3   r   rz   r6   r   Tanh
activationrJ   s     rN   r3   Data2VecTextPooler.__init__#  s9    YYv1163E3EF
'')rP   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ Nr   )r   r<  )rK   r   first_token_tensorpooled_outputs       rN   r_   Data2VecTextPooler.forward(  s6     +1a40

#566rP   )r<  r   r   rl   s   @rN   r9  r9  "  s(    $
U\\ ell  rP   r9  c                   6    \ rS rSr% \\S'   SrSrSS/rS r	Sr
g	)
Data2VecTextPreTrainedModeli1  rL   data2vec_textTr$   r  c                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       a  [        US5      (       a1  UR                  b$  UR                  R                  R                  5         [        US5      (       a4  UR                  b&  UR                  R                  R                  S5        gggg)zInitialize the weightsg        )meanstdNbiasweightg      ?)r   r   rz   rJ  datanormal_rL   initializer_rangerI  zero_r4   r'   r=   rV   fill_)rK   modules     rN   _init_weights)Data2VecTextPreTrainedModel._init_weights8  sD   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--vv&&6;;+B  &&(vx((V]]-F""((- .G( .rP   r#  N)re   rf   rg   rh   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modulesrQ  rj   r#  rP   rN   rD  rD  1  s%    '&*#8:MN.rP   rD  c            "         ^  \ rS rSrSrSU 4S jjrS rS rS r\	              SS\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\\R                     \4   4S jj5       rSrU =r$ )Data2VecTextModeliK  a  

The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in *Attention is
all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
Kaiser and Illia Polosukhin.

To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

.. _*Attention is all you need*: https://huggingface.co/papers/1706.03762

c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
r2   r3   rL   r$   r^   r  encoderr9  pooler	post_init)rK   rL   add_pooling_layerrM   s      rN   r3   Data2VecTextModel.__init__\  sL    
 	 7?*624E(04 	rP   c                 .    U R                   R                  $ r   r^   r8   rK   s    rN   get_input_embeddings&Data2VecTextModel.get_input_embeddingsl  s    ...rP   c                 $    XR                   l        g r   r`  )rK   r}   s     rN   set_input_embeddings&Data2VecTextModel.set_input_embeddingso  s    */'rP   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrZ  r  r  r   )rK   heads_to_pruner  r   s       rN   _prune_headsData2VecTextModel._prune_headsr  s<    
 +002LELLu%//;;EB 3rP   rW   r   r/   r,   r   rX   r   r  r   r  r   r   r!  r   r   c                 
   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nSnU	b:  [        U	[        5      (       d  U	S   S   R                  S   OU	R                  5       nUc  [        R                  " UUU-   4US9nUcs  [!        U R"                  S5      (       a4  U R"                  R$                  S S 2S U24   nUR'                  UU5      nUnO$[        R(                  " U[        R*                  US	9nU R-                  X/5      nU R                   R                  (       aE  UbB  UR                  5       u  nnnUU4nUc  [        R                  " UUS9nU R/                  U5      nOS nU R1                  XPR                   R2                  5      nU R#                  UUUUUS
9nU R5                  UUUUUU	U
UUUUS9nUS   nU R6                  b  U R7                  U5      OS nU(       d
  UU4USS  -   $ [9        UUUR:                  UR<                  UR>                  UR@                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsr   r   )rS   r/   rR   )rW   r,   r/   rX   rY   )
r   r   r   r  r   r  r   r   r!  r   r    )r)  pooler_outputr   r   r*  r+  )!rL   r   r   use_return_dictr   r  rv   %warn_if_padding_and_no_attention_maskrH   rS   r   r   r   get_seq_lengthrD   onesrV   r^   r/   rF   rG   rI   get_extended_attention_maskinvert_attention_maskget_head_maskr  rZ  r[  r   r   r   r*  r+  ) rK   rW   r   r/   r,   r   rX   r   r  r   r  r   r   r!  r   rZ   r   r[   rS   rY   r\   r]   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputrA  s                                    rN   r_   Data2VecTextModel.forwardz  s3   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"& "/599  "1%++B/$335 # !"ZZ*jCY6Y)ZdjkN!t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
rP   )rL   r^   rZ  r[  )T)NNNNNNNNNNNNNN)re   rf   rg   rh   ri   r3   rb  re  rj  r   r   rD   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   rX  rX  K  s{    /0C  -11515/3,0048<9=+/$(,0/3&*15s
ELL)s
 !.s
 !.	s

 u||,s
 ELL)s
  -s
  (5s
 !) 6s
 "%s
 D>s
 $D>s
 'tns
 d^s
 !.s
  
uU\\"$PP	Q!s
 s
rP   rX  zX
    Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc            $         ^  \ rS rSrSS/rU 4S jrS rS r\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4 S jj5       rSrU =r$ )Data2VecTextForCausalLMi  lm_head.decoder.weightlm_head.decoder.biasc                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzTIf you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`Fr]  
r2   r3   r   r-  warningrX  rE  Data2VecTextLMHeadlm_headr\  rJ   s     rN   r3    Data2VecTextForCausalLM.__init__  sM       NNqr.vO)&1 	rP   c                 .    U R                   R                  $ r   r  decoderra  s    rN   get_output_embeddings-Data2VecTextForCausalLM.get_output_embeddings      ||###rP   c                 $    XR                   l        g r   r  rK   new_embeddingss     rN   set_output_embeddings-Data2VecTextForCausalLM.set_output_embeddings      -rP   rW   r   r/   r,   r   rX   r   r  labelsr   r  r   r   r!  r   r   c                    Ub  UOU R                   R                  nU	b  SnU R                  UUUUUUUUU
UUUUUS9nUS   nU R                  U5      nSnU	b*  U R                  " UU	4SU R                   R
                  0UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
>>> config = Data2VecTextConfig.from_pretrained("facebook/data2vec-text-base")
>>> config.is_decoder = True
>>> model = Data2VecTextForCausalLM.from_pretrained("facebook/data2vec-text-base", config=config)

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.logits
```NF)r   r/   r,   r   rX   r   r  r   r  r   r   r!  r   r   r5   r"   )losslogitsr   r   r*  r+  )rL   rn  rE  r  loss_functionr5   r   r   r   r*  r+  )rK   rW   r   r/   r,   r   rX   r   r  r  r   r  r   r   r!  r   kwargsr   r|  prediction_scoreslm_lossr   s                         rN   r_   Data2VecTextForCausalLM.forward  s$   T &1%<k$++B]B]I$$))%'"7#9+/!5#) % 
" "!* LL9((!  ;;11 	G ')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
rP   rE  r  )NNNNNNNNNNNNNNN)re   rf   rg   rh   _tied_weights_keysr3   r  r  r   r   rD   
LongTensorr   r   r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s    34JK
$.  156:59371559=A>B-1+/$(,0/3&*15!U
E,,-U
 !!2!23U
 !!1!12	U

 u//0U
 E--.U
   1 12U
  ((9(9:U
 !)):): ;U
 ))*U
 "%U
 D>U
 $D>U
 'tnU
 d^U
  !.!U
$ 
u77	8%U
 U
rP   r  c                     ^  \ rS rSrSS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )Data2VecTextForMaskedLMid  r  r  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzsIf you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  r  rJ   s     rN   r3    Data2VecTextForMaskedLM.__init__h  sS     NN1
 /vO)&1 	rP   c                 .    U R                   R                  $ r   r  ra  s    rN   r  -Data2VecTextForMaskedLM.get_output_embeddingsw  r  rP   c                 $    XR                   l        g r   r  r  s     rN   r  -Data2VecTextForMaskedLM.set_output_embeddingsz  r  rP   rW   r   r/   r,   r   rX   r   r  r  r   r   r!  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUS9nUS   nU R                  U5      nSnU	ba  [	        5       nU	R                  UR                  5      n	U" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
N)
r   r/   r,   r   rX   r   r  r   r   r!  r   r-   r"   r  r  r   r*  )rL   rn  rE  r  r   r   rS   r   r5   r   r   r*  )rK   rW   r   r/   r,   r   rX   r   r  r  r   r   r!  r   r|  r  masked_lm_lossloss_fctr   s                      rN   r_   Data2VecTextForMaskedLM.forward}  s   , &1%<k$++B]B]$$))%'"7#9/!5# % 
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
rP   r  )NNNNNNNNNNNN)re   rf   rg   rh   r  r3   r  r  r   r   rD   r  r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r  d  sQ   24JK$.  156:59371559=A>B-1,0/3&*7
E,,-7
 !!2!237
 !!1!12	7

 u//07
 E--.7
   1 127
  ((9(9:7
 !)):): ;7
 ))*7
 $D>7
 'tn7
 d^7
 
un$	%7
 7
rP   r  c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )r  i  z/Data2VecText Head for masked language modeling.c                   > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  UR                  5      U l
        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g r   )r2   r3   r   rz   r6   r   r=   r>   
layer_normr5   r  	ParameterrD   rG   rI  rJ   s     rN   r3   Data2VecTextLMHead.__init__  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FGLLV->->!?@	 IIrP   c                     U R                  U5      n[        U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  r  rK   featuresr  xs       rN   r_   Data2VecTextLMHead.forward  s;    JJx GOOA LLOrP   c                     U R                   R                  R                  R                  S:X  a  U R                  U R                   l        g U R                   R                  U l        g )Nmeta)r  rI  rS   typera  s    rN   _tie_weightsData2VecTextLMHead._tie_weights  sC     <<##((F2 $		DLL))DIrP   )rI  r  r   r  )
re   rf   rg   rh   ri   r3   r_   r  rj   rk   rl   s   @rN   r  r    s    9&* *rP   r  z
    Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )%Data2VecTextForSequenceClassificationi  c                    > [         TU ]  U5        UR                  U l        Xl        [	        USS9U l        [        U5      U l        U R                  5         g NFr  )	r2   r3   
num_labelsrL   rX  rE  Data2VecTextClassificationHead
classifierr\  rJ   s     rN   r3   .Data2VecTextForSequenceClassification.__init__  sI      ++.vO8@ 	rP   rW   r   r/   r,   r   rX   r  r   r   r!  r   c                 f   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUGb  UR	                  UR
                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [!        5       nU" X5      nU
(       d  U4US	S -   nUb  U4U-   $ U$ [#        UUUR$                  UR&                  S
9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r/   r,   r   rX   r   r   r!  r   r    
regressionsingle_label_classificationmulti_label_classificationr-   r"   r  )rL   rn  rE  r  r   rS   problem_typer  r1   rD   rI   rw   r   squeezer   r   r   r   r   r*  rK   rW   r   r/   r,   r   rX   r  r   r   r!  r   r|  r  r  r  r   s                    rN   r_   -Data2VecTextForSequenceClassification.forward  s   ( &1%<k$++B]B]$$))%'/!5# % 

 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
rP   )r  rL   rE  r  
NNNNNNNNNN)re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s   	  156:59371559-1,0/3&*E
E,,-E
 !!2!23E
 !!1!12	E

 u//0E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 E
rP   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )Data2VecTextForMultipleChoicei3  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  S5      U l
        U R                  5         g )Nr    )r2   r3   rX  rE  r   r?   r@   rA   rz   r6   r  r\  rJ   s     rN   r3   &Data2VecTextForMultipleChoice.__init__5  sW     .v6zz&"<"<=))F$6$6: 	rP   rW   r/   r   r  r,   r   rX   r   r   r!  r   c                    U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb.  [        5       nUR                  UR                  5      nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
Nr    r-   r   )r,   r/   r   r   rX   r   r   r!  r"   r  )rL   rn  r   r   rH   rE  rA   r  r   r   rS   r   r   r*  )rK   rW   r/   r   r  r,   r   rX   r   r   r!  num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   rA  r  reshaped_logitsr  r  r   s                           rN   r_   %Data2VecTextForMultipleChoice.forward?  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 $$*..,/!5# % 

  
]3/ ++b+6')HYY556FOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
rP   )r  rE  rA   r  )re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r  3  s     15596:-1371559,0/3&*Y
E,,-Y
 !!1!12Y
 !!2!23	Y

 ))*Y
 u//0Y
 E--.Y
   1 12Y
 $D>Y
 'tnY
 d^Y
 
u//	0Y
 Y
rP   r  c                   R  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )"Data2VecTextForTokenClassificationi  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g r  )r2   r3   r  rX  rE  classifier_dropoutr@   r   r?   rA   rz   r6   r  r\  rK   rL   r  rM   s      rN   r3   +Data2VecTextForTokenClassification.__init__  s      ++.vO)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	rP   rW   r   r/   r,   r   rX   r  r   r   r!  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUbW  [        5       nUR                  UR                  5      nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   r-   r"   r  )rL   rn  rE  rA   r  r   r   rS   r   r  r   r   r*  r  s                    rN   r_   *Data2VecTextForTokenClassification.forward  s
   $ &1%<k$++B]B]$$))%'/!5# % 

 "!*,,71')HYYv}}-FFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
rP   )r  rE  rA   r  r  )re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s     156:59371559-1,0/3&*4
E,,-4
 !!2!234
 !!1!12	4

 u//04
 E--.4
   1 124
 ))*4
 $D>4
 'tn4
 d^4
 
u++	,4
 4
rP   r  c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r  i  z-Head for sentence-level classification tasks.c                 b  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        g r   )r2   r3   r   rz   r6   r   r  r@   r?   rA   r  out_projr  s      rN   r3   'Data2VecTextClassificationHead.__init__  s    YYv1163E3EF
)/)B)B)NF%%TZTnTn 	 zz"45		&"4"4f6G6GHrP   c                     US S 2SS S 24   nU R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ r?  )rA   r   rD   tanhr  r  s       rN   r_   &Data2VecTextClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!rP   )r   rA   r  )	re   rf   rg   rh   ri   r3   r_   rj   rk   rl   s   @rN   r  r    s    7I rP   r  c                   r  ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\\4   4S jj5       rSrU =r$ ) Data2VecTextForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g r  )
r2   r3   r  rX  rE  r   rz   r6   
qa_outputsr\  rJ   s     rN   r3   )Data2VecTextForQuestionAnswering.__init__  sV      ++.vO))F$6$68I8IJ 	rP   rW   r   r/   r,   r   rX   start_positionsend_positionsr   r   r!  r   c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r    r-   r   )ignore_indexr"   )r  start_logits
end_logitsr   r*  )rL   rn  rE  r  splitr  r   r   rH   clampr   r   r   r*  )rK   rW   r   r/   r,   r   rX   r  r  r   r   r!  r   r|  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          rN   r_   (Data2VecTextForQuestionAnswering.forward  s    &1%<k$++B]B]$$))%'/!5# % 

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
rP   )rE  r  r  )NNNNNNNNNNN)re   rf   rg   rh   r3   r   r   rD   r  r   r   r   r   r   r_   rj   rk   rl   s   @rN   r  r    s$     156:593715596:48,0/3&*>
E,,->
 !!2!23>
 !!1!12	>

 u//0>
 E--.>
   1 12>
 "%"2"23>
   0 01>
 $D>>
 'tn>
 d^>
 
u22	3>
 >
rP   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r    r   )nerw   rD   cumsumtype_asrI   )rW   r'   rY   maskincremental_indicess        rN   rT   rT   I  sW     <<$((*D <<!4<<TBE[[_cc##%33rP   )r  r  r  r  r  r  rX  rD  )r   )Gri   r   typingr   r   rD   r   torch.nnr   r   r   activationsr
   r   cache_utilsr   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   utils.deprecationr   configuration_data2vec_textr!   
get_loggerre   r-  _HIDDEN_STATES_START_POSITIONModuler$   rn   r   r   r   r   r   r  r  r9  rD  rX  r  r  r  r  r  r  r  r  rT   __all__r#  rP   rN   <module>r     sO   "  "   A A ' C C ) 9	 	 	 . l l , 0 ; 
		H	% !" V=BII V=tB.		 B.LRYY  &( $3BII 3nryy   E2 ERW
")) W
v  ./ . .2 b
3 b
 b
J 
k
9? k

k
\ P
9 P
 P
h* *> R
,G R
R
j e
$? e
 e
P D
)D D
 D
PRYY , J
'B J
 J
Z4 	rP   