
    bCi1                    J   S r SSKrSSKrSSKrSSKJr  SSKJrJr  SSK	r	SSK	J
r
  SSKJrJrJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJrJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+  SSK,J-r-J.r.J/r/  SSK0J1r1  SSK2J3r3  \/Rh                  " \55      r6S r7 " S S\
Rp                  5      r9 " S S\
Rp                  5      r: " S S\:5      r; " S S\
Rp                  5      r<\:\;S.r= " S S\
Rp                  5      r> " S  S!\
Rp                  5      r? " S" S#\
Rp                  5      r@ " S$ S%\5      rA " S& S'\
Rp                  5      rB " S( S)\
Rp                  5      rC " S* S+\
Rp                  5      rD " S, S-\
Rp                  5      rE " S. S/\
Rp                  5      rF " S0 S1\
Rp                  5      rG " S2 S3\
Rp                  5      rH\. " S4 S5\'5      5       rI\\." S6S79 " S8 S9\-5      5       5       rJ\." S:S79 " S; S<\I5      5       rK\." S=S79 " S> S?\I5      5       rL\." S@S79 " SA SB\I\5      5       rM\. " SC SD\I5      5       rN\." SES79 " SF SG\I5      5       rO\." SHS79 " SI SJ\I5      5       rP\. " SK SL\I5      5       rQ\. " SM SN\I5      5       rR\. " SO SP\I5      5       rS/ SQQrTg)RzPyTorch BERT model.    N)	dataclass)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)#_prepare_4d_attention_mask_for_sdpa*_prepare_4d_causal_attention_mask_for_sdpa)GradientCheckpointingLayer)	)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputNextSentencePredictorOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging)deprecate_kwarg   )
BertConfigc           	          SSK nSSKnSSKn[        R                  R                  U5      n[        R                  SU 35        UR                  R                  U5      n/ n/ n	U H]  u  p[        R                  SU
 SU 35        UR                  R                  Xj5      nUR                  U
5        U	R                  U5        M_     [        X5       GH  u  pU
R                  S5      n
[!        S U
 5       5      (       a)  [        R                  S	SR#                  U
5       35        MW  U nU
 H  nUR%                  S
U5      (       a  UR                  SU5      nOU/nUS   S:X  d	  US   S:X  a  ['        US5      nOZUS   S:X  d	  US   S:X  a  ['        US5      nO;US   S:X  a  ['        US5      nO%US   S:X  a  ['        US5      nO ['        XS   5      n[+        U5      S:  d  M  [-        US   5      nUU   nM     WSS S:X  a  ['        US5      nOUS:X  a  UR/                  U5      n UR0                  UR0                  :w  a&  [3        SUR0                   SUR0                   S35      e [        R                  SU
 35        [6        R8                  " U5      Ul        GM     U $ ! [         a    [        R                  S5        e f = f! [(         a,    [        R                  S	SR#                  U
5       35         GM  f = f! [2         a1  nU=R4                  UR0                  UR0                  4-  sl        e SnAff = f)z'Load tf checkpoints in a pytorch model.r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z&Converting TensorFlow checkpoint from zLoading TF weight z with shape /c              3   ,   #    U  H
  nUS ;   v   M     g7f))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     `/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/bert/modeling_bert.py	<genexpr>*load_tf_weights_in_bert.<locals>.<genexpr>S   s      
 nns   z	Skipping z[A-Za-z]+_\d+z_(\d+)kernelgammaweightoutput_biasbetabiasoutput_weightssquad
classifier   r$   i_embeddingszPointer shape z and array shape z mismatchedzInitialize PyTorch weight )renumpy
tensorflowImportErrorloggererrorospathabspathinfotrainlist_variablesload_variableappendzipsplitanyjoin	fullmatchgetattrAttributeErrorlenint	transposeshape
ValueErrorargstorch
from_numpydata)modelconfigtf_checkpoint_pathr?   nptftf_path	init_varsnamesarraysnamerW   arraypointerm_namescope_namesnumes                     r1   load_tf_weights_in_bertrm   6   s   
 ggoo01G
KK8	BC''0IEF (l5'BC&&w5Te	 ! 5)zz#  

 
 
 KK)CHHTN#345F||,f55 hhy&9%h1~)[^w-F!'84Q=0KNf4L!'62Q#33!'84Q7*!'<8%g1~>G ;1$+a.)!#,+ , #$<=(gx0GxLL'E	}}+ >'--@QRWR]R]Q^^i!jkk ,
 	078''.Y *Z L  Q	
 	Z & KK)CHHTN+; <=  	FFw}}ekk22F	s6   J' K,A L'!K1L L
L?,L::L?c                      ^  \ rS rSrSrU 4S jr     SS\\R                     S\\R                     S\\R                     S\\R                     S\
S	\R                  4S
 jjrSrU =r$ )BertEmbeddings   zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r$   F)
persistenttoken_type_idsdtype)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutrR   ru   register_bufferrZ   arangeexpandzerosrw   sizelongselfr^   	__class__s     r1   r~   BertEmbeddings.__init__   s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
    	input_idsrz   rw   inputs_embedspast_key_values_lengthreturnc                 d   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2XWU-   24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      n	U	nO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R                  U5      nU R                  U5      nU$ )Nrx   r$   rz   r   r|   devicerv   )r   rw   hasattrrz   r   rZ   r   r   r   r   r   ru   r   r   r   )r   r   rz   rw   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s                r1   forwardBertEmbeddings.forward   sC     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r   )r   r   ru   r   r   r   )NNNNr   )__name__
__module____qualname____firstlineno____doc__r~   r   rZ   
LongTensorFloatTensorrU   Tensorr   __static_attributes____classcell__r   s   @r1   ro   ro      s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
' 'r   ro   c                     ^  \ rS rSrSU 4S jjr\" SSSS9      SS\R                  S\\R                     S	\\R                     S
\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rSrU =r$ )BertSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aG  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        UR,                  U l        X0l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()ru   rv   relative_keyrelative_key_queryr=   r$   )r}   r~   r   num_attention_headsr   rX   rU   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   rR   ru   r   r   distance_embedding
is_decoder	layer_idxr   r^   ru   r   r   s       r1   r~   BertSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++"r   past_key_valuepast_key_values4.58new_nameversionhidden_statesattention_mask	head_maskencoder_hidden_statesoutput_attentionscache_positionr   c                 	   UR                   u  pn
U R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nSnUS LnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R!                  U5      nUR                  USU R                  U R                  5      R                  SS5      nU R#                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUbc  U(       d  UOS nWR%                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR                  U R                  '   [&        R(                  " UUR                  SS5      5      nU R*                  S:X  d  U R*                  S	:X  Ga  UR                   S   UR                   S   nnUbB  [&        R,                  " US-
  [&        R.                  UR0                  S
9R                  SS5      nO>[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      n[&        R2                  " U[&        R.                  UR0                  S
9R                  SS5      nUU-
  nU R5                  UU R6                  -   S-
  5      nUR9                  UR:                  S9nU R*                  S:X  a  [&        R<                  " SUU5      nUU-   nOHU R*                  S	:X  a8  [&        R<                  " SUU5      n[&        R<                  " SUU5      nUU-   U-   nU[>        R@                  " U R                  5      -  nUb  UU-   n[B        RD                  RG                  USS9nU RI                  U5      nUb  UU-  n[&        R(                  " UU5      nURK                  SSSS5      RM                  5       nURO                  5       S S U RP                  4-   nUR                  U5      nUU4$ )Nrx   r$   r=   Fr   Tr   r   r   r{   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r
   ))rW   r   viewr   r   rV   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   updaterZ   matmulru   tensorr   r   r   r   r   tor|   einsummathsqrtr   
functionalsoftmaxr   permute
contiguousr   r   )r   r   r   r   r   r   r   r   
batch_sizer   _query_layerr   is_cross_attentioncurr_past_key_valuecurrent_states	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapes                                  r1   r   BertSelfAttention.forward   sa    %2$7$7!
jj/!&&z2t7O7OQUQiQijttq
 
2$>&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*+224>>BGGI-44T^^DKKK0I!z2t7O7OQUQiQijtt1I **^4K%**B 8 8$:R:Ri1o  *7It)<)C)C{DNN=M~<^*&	; &*_FY*Z*ZAEO..t~~> !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L*!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r   )r   r   r   r   r   r   r   r   r   ru   r   r   NNNNNNFNr   r   r   r   r~   r#   rZ   r   r   r   r   booltupler   r   r   r   s   @r1   r   r      s    #6 %0A6R 7;15=A+/,115e.||e. !!2!23e. E--.	e.
  ((9(9:e. "%e. $D>e. !.e. 
u||	e. Se.r   r   c                   $  ^  \ rS rSrSU 4S jjr\" SSSS9      SS\R                  S\\R                     S	\\R                     S
\\R                     S\\
   S\\   S\\R                     S\\R                     4U 4S jjj5       rSrU =r$ )BertSdpaSelfAttentioniD  c                 D   > [         TU ]  XUS9  UR                  U l        g Nru   r   )r}   r~   r   dropout_probr   s       r1   r~   BertSdpaSelfAttention.__init__E  s$    \ef"??r   r   r   r   r   r   r   r   r   r   r   r   c           	      (  > U R                   S:w  d
  U(       d  Ub*  [        R                  S5        [        TU ]  UUUUUUU5      $ UR                  5       u  pn
U R                  U5      R                  USU R                  U R                  5      R                  SS5      nSnUS LnU(       a  UOUnUb]  [        U[        5      (       aF  UR                  R                  U R                  5      nU(       a  UR                   nOUR"                  nOUnU(       a  UOUnU(       aQ  UbN  U(       aG  WR$                  U R                     R&                  nUR$                  U R                     R(                  nOU R+                  U5      R                  USU R                  U R                  5      R                  SS5      nU R-                  U5      R                  USU R                  U R                  5      R                  SS5      nUbc  U(       d  UOS nWR/                  UUU R                  SU05      u  nnU(       a.  [        U[        5      (       a  SUR                  U R                  '   U R0                  =(       a    U(       + =(       a    US L =(       a    U	S:  n[2        R4                  R6                  R9                  UUUUU R:                  (       a  U R<                  OS	US
9nUR                  SS5      nUR?                  XU R@                  5      nUS 4$ )Nrv   a  BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rx   r$   r=   Fr   T        )	attn_mask	dropout_p	is_causal)!ru   rC   warning_oncer}   r   r   r   r   r   r   rV   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rZ   r   r   scaled_dot_product_attentiontrainingr  reshaper   )r   r   r   r   r   r   r   r   bsztgt_lenr   r   r   r   r   r   r   r   r  attn_outputr   s                       r1   r   BertSdpaSelfAttention.forwardJ  s    '':59JiNcH 7?%!  (,,.a JJ}%**3D4L4LdNfNfgqqrsuvw 	 
2$>2D.-&/+>??,77;;DNNK
%*9*O*O'*9*N*N'&5#2D.-/"=*+224>>BGGI-44T^^DKKK (c2t779Q9QR1a  

>*c2t779Q9QR1a  *7It)<)C)C{DNN=M~<^*&	; &*_FY*Z*ZAEO..t~~> OOi,>(>i>UYCYi^ehi^i	hh))FF$+/==d''c G 
 "++Aq1!))#8J8JKD  r   )r  r  r  r  r   s   @r1   r  r  D  s    @
 %0A6R 2615=A+/,115^!||^! !.^! E--.	^!
  ((9(9:^! "%^! $D>^! !.^! 
u||	^! S^!r   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BertSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nrs   )r}   r~   r   r   r   denser   r   r   r   r   r   s     r1   r~   BertSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ Nr  r   r   r   r   r   s      r1   r   BertSelfOutput.forward  5    

=1]3}'CDr   r   r  r   
r   r   r   r   r~   rZ   r   r   r   r   r   s   @r1   r  r    6    >U\\  RWR^R^  r   r  )eagersdpac                   $  ^  \ rS rSrSU 4S jjrS r\" SSSS9      SS\R                  S	\	\R                     S
\	\R                     S\	\R                     S\	\   S\	\   S\	\R                     S\\R                     4S jj5       rSrU =r$ )BertAttentioni  c                    > [         TU ]  5         [        UR                     " UUUS9U l        [        U5      U l        [        5       U l        g r	  )	r}   r~   BERT_SELF_ATTENTION_CLASSES_attn_implementationr   r  outputsetpruned_headsr   s       r1   r~   BertAttention.__init__  sF    /0K0KL$;
	
 %V,Er   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r$   r   )rT   r   r   r   r   r3  r   r   r   r   r1  r  r   union)r   headsindexs      r1   prune_headsBertAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r   r   r   r   r   r   r   r   r   r   r   r   c           
      l    U R                  UUUUUUUS9nU R                  US   U5      n	U	4USS  -   n
U
$ )Nr   r   r   r   r   r   r   r$   )r   r1  )r   r   r   r   r   r   r   r   self_outputsattention_outputoutputss              r1   r   BertAttention.forward  s\     yy)"7+/) ! 
  ;;|AF#%QR(88r   )r1  r3  r   r  r  )r   r   r   r   r~   r9  r#   rZ   r   r   r   r   r  r  r   r   r   r   s   @r1   r-  r-    s    ";$ %0A6R 7;15=A+/,115|| !!2!23 E--.	
  ((9(9: "% $D> !. 
u||	 Sr   r-  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r"  )r}   r~   r   r   r   intermediate_sizer  r   
hidden_actstrr   intermediate_act_fnr   s     r1   r~   BertIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r"  r  rG  r   r   s     r1   r   BertIntermediate.forward  s&    

=100?r   rJ  r(  r   s   @r1   rB  rB    s(    9U\\ ell  r   rB  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )
BertOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r}   r~   r   r   rD  r   r  r   r   r   r   r   r   s     r1   r~   BertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r"  r#  r$  s      r1   r   BertOutput.forward  r&  r   r'  r(  r   s   @r1   rN  rN    r)  r   rN  c                   D  ^  \ rS rSrSU 4S jjr\" SSSS9       SS\R                  S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\   S\\R                     S\\R                     4S jj5       rS rSrU =r$ )	BertLayeri  c                 r  > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        USUS9U l	        [        U5      U l        [        U5      U l        g )Nr$   r   z> should be used as a decoder model if cross attention is addedrv   r
  )r}   r~   chunk_size_feed_forwardseq_len_dimr-  	attentionr   add_cross_attentionrX   crossattentionrB  intermediaterN  r1  )r   r^   r   r   s      r1   r~   BertLayer.__init__  s    '-'E'E$&vC ++#)#=#= ##?? D6)g!hii"/PZfo"pD,V4 (r   r   r   r   r   r   r   r   r   encoder_attention_maskr   r   r   c	           
      P   U R                  UUUUUUS9n	U	S   n
U	SS  nU R                  (       aD  UbA  [        U S5      (       d  [        SU  S35      eU R	                  U
UUUUUUS9nUS   n
XSS  -   n[        U R                  U R                  U R                  U
5      nU4U-   nU$ )N)r   r   r   r   r   r   r$   r[  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r<  )	rY  r   r   rX   r[  r   feed_forward_chunkrW  rX  )r   r   r   r   r   r^  r   r   r   self_attention_outputsr>  r?  cross_attention_outputslayer_outputs                 r1   r   BertLayer.forward"  s    "&)/+) "0 "
 2!4(,??4@4!122 =dV DD D 
 '+&9&9 5#&; /"3- ': '#  7q9 ;;G0##T%A%A4CSCSUe
  /G+r   c                 J    U R                  U5      nU R                  X!5      nU$ r"  )r\  r1  )r   r>  intermediate_outputrc  s       r1   r`  BertLayer.feed_forward_chunkS  s)    "//0@A{{#6Ir   )rZ  rY  rW  r[  r\  r   r1  rX  r"  )NNNNNFN)r   r   r   r   r~   r#   rZ   r   r   r   r   r  r  r   r`  r   r   r   s   @r1   rT  rT    s    ) %0A6R 7;15=A>B+/,115.||. !!2!23. E--.	.
  ((9(9:. !)):): ;. "%. $D>. !.. 
u||	. S.` r   rT  c                   V  ^  \ rS rSrSU 4S jjr          SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\
   S
\\
   S\\
   S\\
   S\\R                     S\\\R                     \4   4S jjrSrU =r$ )BertEncoderiY  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l	        g s  snf )NrV  F)
r}   r~   r^   r   
ModuleListrangenum_hidden_layersrT  layergradient_checkpointing)r   r^   r   ir   s       r1   r~   BertEncoder.__init__Z  sS    ]]ERXRjRjLk#lLkqIf$BLk#lm
&+# $ms   A$r   r   r   r   r^  r   	use_cacher   output_hidden_statesreturn_dictr   r   c                    U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       aL  U R                   R                  (       a1  Uc.  [        [        U R                   S9[        U R                   S95      nU(       a[  U R                   R                  (       a@  [        U[        5      (       a+  [        R                  S5        [        R                  " U5      n[        U R                  5       He  u  nnU	(       a  X4-   nUb  X?   OS nU" UUUUUUUUS9nUS   nU(       d  M6  UUS   4-   nU R                   R                  (       d  M\  UUS	   4-   nMg     U	(       a  X4-   nU
(       d  [        S
 UUUUU4 5       5      $ [        UUUUUS9$ )Nr.   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r^   zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.)r^  r   r   r   r   r$   r=   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr"  r.   )r/   vs     r1   r2   &BertEncoder.forward.<locals>.<genexpr>  s"      
A  s   	)last_hidden_stater   r   
attentionscross_attentions)r^   rZ  ro  r  rC   r  r   r   r   r   r  from_legacy_cache	enumeratern  r   )r   r   r   r   r   r^  r   rr  r   rs  rt  r   all_hidden_statesall_self_attentionsall_cross_attentionsrp  layer_modulelayer_head_masklayer_outputss                      r1   r   BertEncoder.forward`  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	//O4K1,dkk2RT`hlhshsTtuO//JPU4V4V\
 2CCOTO(4OA|#$58H$H!.7.CilO(%'= /"3-	M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U(+  5.   14D D 
 "#%'(
 
 
 9+++*1
 	
r   )r^   ro  rn  r"  )
NNNNNNFFTN)r   r   r   r   r~   rZ   r   r   r   r   r  r   r  r   r   r   r   r   s   @r1   ri  ri  Y  s   , 7;15=A>B+/$(,1/4&*15P
||P
 !!2!23P
 E--.	P

  ((9(9:P
 !)):): ;P
 "%P
 D>P
 $D>P
 'tnP
 d^P
 !.P
 
uU\\"$MM	NP
 P
r   ri  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
BertPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r"  )r}   r~   r   r   r   r  Tanh
activationr   s     r1   r~   BertPooler.__init__  s9    YYv1163E3EF
'')r   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r  r  )r   r   first_token_tensorpooled_outputs       r1   r   BertPooler.forward  s6     +1a40

#566r   )r  r  r(  r   s   @r1   r  r    s(    $
U\\ ell  r   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r  )r}   r~   r   r   r   r  r   rE  rF  r   transform_act_fnr   r   r   s     r1   r~   $BertPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r"  )r  r  r   rK  s     r1   r   #BertPredictionHeadTransform.forward  s4    

=1--m<}5r   )r   r  r  r(  r   s   @r1   r  r    s)    UU\\ ell  r   r  c                   4   ^  \ rS rSrU 4S jrS rS rSrU =r$ )BertLMPredictionHeadi  c                 H  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  SS9U l        [        R                  " [        R                  " UR                  5      5      U l        U R                  U R                  l        g )NF)r9   )r}   r~   r  	transformr   r   r   r   decoder	ParameterrZ   r   r9   r   s     r1   r~   BertLMPredictionHead.__init__  sm    4V< yy!3!3V5F5FUSLLV->->!?@	 !IIr   c                 :    U R                   U R                  l         g r"  )r9   r  r   s    r1   _tie_weights!BertLMPredictionHead._tie_weights  s     IIr   c                 J    U R                  U5      nU R                  U5      nU$ r"  )r  r  rK  s     r1   r   BertLMPredictionHead.forward  s$    }5]3r   )r9   r  r  )	r   r   r   r   r~   r  r   r   r   r   s   @r1   r  r    s    && r   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )BertOnlyMLMHeadi  c                 B   > [         TU ]  5         [        U5      U l        g r"  )r}   r~   r  predictionsr   s     r1   r~   BertOnlyMLMHead.__init__  s    /7r   sequence_outputr   c                 (    U R                  U5      nU$ r"  r  )r   r  prediction_scoress      r1   r   BertOnlyMLMHead.forward  s     ,,_=  r   r  r(  r   s   @r1   r  r    s(    8!u|| ! ! !r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertOnlyNSPHeadi  c                 n   > [         TU ]  5         [        R                  " UR                  S5      U l        g Nr=   )r}   r~   r   r   r   seq_relationshipr   s     r1   r~   BertOnlyNSPHead.__init__  s'     "		&*<*<a @r   c                 (    U R                  U5      nU$ r"  r  )r   r  seq_relationship_scores      r1   r   BertOnlyNSPHead.forward  s    !%!6!6}!E%%r   r  r   r   r   r   r~   r   r   r   r   s   @r1   r  r    s    A& &r   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BertPreTrainingHeadsi  c                    > [         TU ]  5         [        U5      U l        [        R
                  " UR                  S5      U l        g r  )r}   r~   r  r  r   r   r   r  r   s     r1   r~   BertPreTrainingHeads.__init__  s4    /7 "		&*<*<a @r   c                 L    U R                  U5      nU R                  U5      nX44$ r"  r  r  )r   r  r  r  r  s        r1   r   BertPreTrainingHeads.forward  s-     ,,_=!%!6!6}!E 88r   r  r  r   s   @r1   r  r    s    A
9 9r   r  c                   6    \ rS rSr% \\S'   \rSrSr	Sr
S rSrg)BertPreTrainedModeli
  r^   bertTc                    [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       a%  UR                  R                  R                  5         gg)zInitialize the weightsr  )meanstdNg      ?)r   r   r   r6   r\   normal_r^   initializer_ranger9   zero_r   rr   r   fill_r  )r   modules     r1   _init_weights!BertPreTrainedModel._init_weights  s3   fbii(( MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) 455KK""$ 6r   r.   N)r   r   r   r   r%   __annotations__rm   load_tf_weightsbase_model_prefixsupports_gradient_checkpointing_supports_sdpar  r   r.   r   r1   r  r  
  s#    -O&*#N%r   r  z0
    Output type of [`BertForPreTraining`].
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
BertForPreTrainingOutputi%  ar  
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the masked language modeling loss and the next sequence prediction
    (classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
    Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
    before SoftMax).
Nlossprediction_logitsseq_relationship_logitsr   rz  r.   )r   r   r   r   r   r  r   rZ   r   r  r  r  r   r  rz  r   r.   r   r1   r  r  %  s~    	 )-D(5$$
%,59x 1 129;?Xe&7&78?8<M8E%"3"345<59Ju00129r   r  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c            "         ^  \ rS rSrSS/rSU 4S jjrS rS rS r\	              SS\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\\R                     \4   4S jj5       rSrU =r$ )	BertModeli>  ro   rT  c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        UR                  U l
        UR                  U l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)r}   r~   r^   ro   r   ri  encoderr  poolerr0  attn_implementationru   	post_init)r   r^   add_pooling_layerr   s      r1   r~   BertModel.__init__M  sg    
 	 (0"6*,=j(4#)#>#> '-'E'E$ 	r   c                 .    U R                   R                  $ r"  r   r   r  s    r1   get_input_embeddingsBertModel.get_input_embeddings`  s    ...r   c                 $    XR                   l        g r"  r  )r   r   s     r1   set_input_embeddingsBertModel.set_input_embeddingsc  s    */'r   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rn  rY  r9  )r   heads_to_prunern  r7  s       r1   _prune_headsBertModel._prune_headsf  s<    
 +002LELLu%//;;EB 3r   r   r   rz   rw   r   r   r   r^  r   rr  r   rs  rt  r   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  (       a  U
b  U
OU R                   R
                  n
OSn
Ub  Ub  [        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       S S nO[        S5      eUu  nnUb  UR                  OUR                  nSnU	b:  [        U	[        5      (       d  U	S   S   R                  S   OU	R                  5       nUcs  [        U R                  S5      (       a4  U R                  R                   S S 2S U24   nUR#                  UU5      nUnO$[$        R&                  " U[$        R(                  US9nU R                  UUUUUS	9nUc  [$        R*                  " UUU-   4US
9nU R,                  S:H  =(       a(    U R.                  S:H  =(       a    US L =(       a    U(       + nU(       aT  UR1                  5       S:X  a@  U R                   R                  (       a  [3        UUUU5      nO'[5        UUR6                  US9nOU R9                  X/5      nU R                   R                  (       av  Ubs  UR                  5       u  nnnUU4nUc  [$        R*                  " UUS
9nU(       a*  UR1                  5       S:X  a  [5        UUR6                  US9nOU R;                  U5      nOS nU R=                  XPR                   R>                  5      nU RA                  UUUUUU	U
UUUUS9nUS   nU RB                  b  U RC                  U5      OS n U(       d
  UU 4USS  -   $ [E        UU URF                  URH                  URJ                  URL                  S9$ )NFzDYou cannot specify both input_ids and inputs_embeds at the same timerx   z5You have to specify either input_ids or inputs_embedsr   r   rz   r   )r   rw   rz   r   r   )r   r+  rv   r=   )r  )
r   r   r   r^  r   rr  r   rs  rt  r   r$   )ry  pooler_outputr   r   rz  r{  )'r^   r   rs  use_return_dictr   rr  rX   %warn_if_padding_and_no_attention_maskr   r   r   r   rW   get_seq_lengthr   r   rz   r   rZ   r   r   onesr  ru   r   r   r   r|   get_extended_attention_maskinvert_attention_maskget_head_maskrm  r  r  r   r   r   rz  r{  )!r   r   r   rz   rw   r   r   r   r^  r   rr  r   rs  rt  r   r   r   r   r   r   r   r   embedding_outputuse_sdpa_attention_masksextended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskencoder_outputsr  r  s!                                    r1   r   BertModel.forwardn  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66yQ#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"& "/599  "1%++B/$335 # !t(899*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'#9 + 
 !"ZZZBX5X(YbhiN $$. &,,
:&T!& &%	 	! $(:(:(<(A {{%%*T"$*	+' +N"$4$:$:J+' '+&F&F~&c# ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&',B,F,F,HA,M 3V*,<,B,BJ3/ 372L2LMc2d/.2+ &&y++2O2OP	,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r   )r  r^   r   r  r  ru   )T)NNNNNNNNNNNNNN)r   r   r   r   _no_split_modulesr~   r  r  r  r!   r   rZ   r   r   r  r   r  r   r   r   r   r   s   @r1   r  r  >  s    *;7&/0C  -11515/3,0048<9=+/$(,0/3&*15S
ELL)S
 !.S
 !.	S

 u||,S
 ELL)S
  -S
  (5S
 !) 6S
 "%S
 D>S
 $D>S
 'tnS
 d^S
 !.S
  
uU\\"$PP	Q!S
 S
r   r  z
    Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
    sentence prediction (classification)` head.
    c                     ^  \ rS rSrSS/rU 4S jrS rS r\           SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )BertForPreTrainingi  predictions.decoder.biascls.predictions.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r"  )r}   r~   r  r  r  clsr  r   s     r1   r~   BertForPreTraining.__init__  s4     f%	'/ 	r   c                 B    U R                   R                  R                  $ r"  r  r  r  r  s    r1   get_output_embeddings(BertForPreTraining.get_output_embeddings      xx##+++r   c                     XR                   R                  l        UR                  U R                   R                  l        g r"  r  r  r  r9   r   new_embeddingss     r1   set_output_embeddings(BertForPreTraining.set_output_embeddings  *    '5$$2$7$7!r   r   r   rz   rw   r   r   labelsnext_sentence_labelr   rs  rt  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUSS u  pU R                  X5      u  nnSnUbv  Ubs  [	        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU" UR                  SS5      UR                  S5      5      nUU-   nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
    the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
    pair (see `input_ids` docstring) Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, BertForPreTraining
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> prediction_logits = outputs.prediction_logits
>>> seq_relationship_logits = outputs.seq_relationship_logits
```
Nr   rz   rw   r   r   r   rs  rt  r=   rx   )r  r  r  r   rz  )
r^   r  r  r  r   r   r   r  r   rz  )r   r   r   rz   rw   r   r   r  r  r   rs  rt  r?  r  r  r  r  
total_lossloss_fctmasked_lm_lossnext_sentence_lossr1  s                         r1   r   BertForPreTraining.forward  sC   V &1%<k$++B]B]))))%'/!5#  

 *1!&48HH_4\11
"5"A')H%&7&<&<RAWAW&XZ`ZeZefhZijN!)*@*E*Eb!*LNaNfNfgiNj!k'*<<J')?@712;NF/9/EZMF*Q6Q'/$:!//))
 	
r   r  r  NNNNNNNNNNN)r   r   r   r   _tied_weights_keysr~   r  r  r!   r   rZ   r   r  r   r  r  r   r   r   r   s   @r1   r   r     sC    56VW,8  -11515/3,004)-6:,0/3&*L
ELL)L
 !.L
 !.	L

 u||,L
 ELL)L
  -L
 &L
 &ell3L
 $D>L
 'tnL
 d^L
 
uU\\"$<<	=L
 L
r   r   zP
    Bert Model with a `language modeling` head on top for CLM fine-tuning.
    c            $         ^  \ rS rSrSS/rU 4S jrS rS r\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\
R                     \4   4 S jj5       rSrU =r$ )BertLMHeadModelin  zcls.predictions.decoder.biasr  c                    > [         TU ]  U5        UR                  (       d  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzLIf you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`Fr  
r}   r~   r   rC   warningr  r  r  r  r  r   s     r1   r~   BertLMHeadModel.__init__v  sL       NNijf>	"6* 	r   c                 B    U R                   R                  R                  $ r"  r  r  s    r1   r  %BertLMHeadModel.get_output_embeddings  r
  r   c                     XR                   R                  l        UR                  U R                   R                  l        g r"  r  r  s     r1   r  %BertLMHeadModel.set_output_embeddings  r  r   r   r   rz   rw   r   r   r   r^  r  r   rr  r   rs  rt  r   r   c                    Ub  UOU R                   R                  nU	b  SnU R                  UUUUUUUUU
UUUUUS9nUS   nU R                  U5      nSnU	b(  U R                  " UXR                   R
                  40 UD6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
NF)r   rz   rw   r   r   r   r^  r   rr  r   rs  rt  r   r   r=   )r  logitsr   r   rz  r{  )r^   r  r  r  loss_functionr   r   r   r   rz  r{  )r   r   r   rz   rw   r   r   r   r^  r  r   rr  r   rs  rt  r   loss_kwargsr?  r  r  lm_lossr1  s                         r1   r   BertLMHeadModel.forward  s   4 &1%<k$++B]B]I))))%'"7#9+/!5#)  
" "!* HH_5(():FKKDZDZj^ijG')GABK7F,3,?WJ'KVK0$#33!//))$55
 	
r   r  )NNNNNNNNNNNNNNN)r   r   r   r   r  r~   r  r  r!   r   rZ   r   r   r  r   r  r   r   r   r   r   s   @r1   r  r  n  s    9:Z[
,8  -11515/3,0048<9=)-+/$(,0/3&*15!@
ELL)@
 !.@
 !.	@

 u||,@
 ELL)@
  -@
  (5@
 !) 6@
 &@
 "%@
 D>@
 $D>@
 'tn@
 d^@
  !.!@
$ 
uU\\"$EE	F%@
 @
r   r  c                     ^  \ rS rSrSS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSS jr\S\4S j5       rSrU =r$ )BertForMaskedLMi  r  r  c                    > [         TU ]  U5        UR                  (       a  [        R	                  S5        [        USS9U l        [        U5      U l        U R                  5         g )NzkIf you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr!  r"  r   s     r1   r~   BertForMaskedLM.__init__  sR     NN1
 f>	"6* 	r   c                 B    U R                   R                  R                  $ r"  r  r  s    r1   r  %BertForMaskedLM.get_output_embeddings  r
  r   c                     XR                   R                  l        UR                  U R                   R                  l        g r"  r  r  s     r1   r  %BertForMaskedLM.set_output_embeddings  r  r   r   r   rz   rw   r   r   r   r^  r  r   rs  rt  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU
UUS9nUS   nU R                  U5      nSnU	bF  [	        5       nU" UR                  SU R                   R                  5      U	R                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )az  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
N)
r   rz   rw   r   r   r   r^  r   rs  rt  r   rx   r=   r  r*  r   rz  )
r^   r  r  r  r   r   r   r   r   rz  )r   r   r   rz   rw   r   r   r   r^  r  r   rs  rt  r?  r  r  r  r  r1  s                      r1   r   BertForMaskedLM.forward  s    . &1%<k$++B]B]))))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r   c                    UR                   nUS   nU R                  R                  c  [        S5      e[        R
                  " X"R                  UR                   S   S45      /SS9n[        R                  " US4U R                  R                  [        R                  UR                  S9n[        R
                  " X/SS9nXS.$ )Nr   z.The PAD token should be defined for generationr$   rx   r   r   )r   r   )
rW   r^   r   rX   rZ   cat	new_zerosfullr   r   )r   r   r   model_kwargsr   effective_batch_sizedummy_tokens          r1   prepare_inputs_for_generation-BertForMaskedLM.prepare_inputs_for_generation!  s    oo*1~ ;;##+MNNN4L4LnNbNbcdNeghMi4j#kqstjj!1%t{{'?'?uzzZcZjZj
 IIy6A>	&IIr   c                     g)z
Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a
`prepare_inputs_for_generation` method.
Fr.   )r  s    r1   can_generateBertForMaskedLM.can_generate1  s     r   r  )NNNNNNNNNNNNr"  )r   r   r   r   r  r~   r  r  r!   r   rZ   r   r  r   r  r   r   rA  classmethodrD  r   r   r   s   @r1   r0  r0    sj   46VW,8  -11515/3,0048<9=)-,0/3&*7
ELL)7
 !.7
 !.	7

 u||,7
 ELL)7
  -7
  (57
 !) 67
 &7
 $D>7
 'tn7
 d^7
 
uU\\"N2	37
 7
rJ  T  r   r0  zT
    Bert Model with a `next sentence prediction (classification)` head on top.
    c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )BertForNextSentencePredictioni:  c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r"  )r}   r~   r  r  r  r  r  r   s     r1   r~   &BertForNextSentencePrediction.__init__@  s4     f%	"6* 	r   r   r   rz   rw   r   r   r  r   rs  rt  r   c                    SU;   a,  [         R                  " S[        5        UR                  S5      nU
b  U
OU R                  R
                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nSnUb2  [        5       nU" UR                  SS5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )	a"  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
    (see `input_ids` docstring). Indices should be in `[0, 1]`:

    - 0 indicates sequence B is a continuation of sequence A,
    - 1 indicates sequence B is a random sequence.

Example:

```python
>>> from transformers import AutoTokenizer, BertForNextSentencePrediction
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")

>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")

>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
```
r  zoThe `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.Nr  r$   rx   r=   r8  )warningswarnFutureWarningpopr^   r  r  r  r   r   r   r   rz  )r   r   r   rz   rw   r   r   r  r   rs  rt  kwargsr?  r  seq_relationship_scoresr  r  r1  s                     r1   r   %BertForNextSentencePrediction.forwardI  s   T !F*MM%
 ZZ 56F%0%<k$++B]B]))))%'/!5#  

  
"&((="9!')H!)*A*F*Fr1*Mv{{[]!_-/'!"+=F7I7U')F2a[aa*#*!//))	
 	
r   r  
NNNNNNNNNN)r   r   r   r   r~   r!   r   rZ   r   r  r   r  r   r   r   r   r   s   @r1   rH  rH  :  s     -11515/3,004)-,0/3&*Q
ELL)Q
 !.Q
 !.	Q

 u||,Q
 ELL)Q
  -Q
 &Q
 $D>Q
 'tnQ
 d^Q
 
uU\\"$??	@Q
 Q
r   rH  z
    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )BertForSequenceClassificationi  c                 r  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        UR                  b  UR                  OUR                  n[        R                  " U5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g r"  )r}   r~   
num_labelsr^   r  r  classifier_dropoutr   r   r   r   r   r   r<   r  r   r^   rX  r   s      r1   r~   &BertForSequenceClassification.__init__  s      ++f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r   r   r   rz   rw   r   r   r  r   rs  rt  r   c                 R   U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [!        UUUR"                  UR$                  S	9$ )
ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r$   
regressionsingle_label_classificationmulti_label_classificationrx   r=   r8  )r^   r  r  r   r<   problem_typerW  r|   rZ   r   rU   r	   squeezer   r   r   r   r   rz  )r   r   r   rz   rw   r   r   r  r   rs  rt  r?  r  r*  r  r  r1  s                    r1   r   %BertForSequenceClassification.forward  s   ( &1%<k$++B]B]))))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r   )r  r<   r^   r   rW  rS  )r   r   r   r   r~   r!   r   rZ   r   r  r   r  r   r   r   r   r   s   @r1   rU  rU    s     -11515/3,004)-,0/3&*E
ELL)E
 !.E
 !.	E

 u||,E
 ELL)E
  -E
 &E
 $D>E
 'tnE
 d^E
 
uU\\"$<<	=E
 E
r   rU  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )BertForMultipleChoicei  c                 0  > [         TU ]  U5        [        U5      U l        UR                  b  UR                  OUR
                  n[        R                  " U5      U l        [        R                  " UR                  S5      U l        U R                  5         g )Nr$   )r}   r~   r  r  rX  r   r   r   r   r   r   r<   r  rY  s      r1   r~   BertForMultipleChoice.__init__  su     f%	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$6: 	r   r   r   rz   rw   r   r   r  r   rs  rt  r   c                 Z   U
b  U
OU R                   R                  n
Ub  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        5       nU" X5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
    1]`:

    - 0 corresponds to a *sentence A* token,
    - 1 corresponds to a *sentence B* token.

    [What are token type IDs?](../glossary#token-type-ids)
position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
    Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
    config.max_position_embeddings - 1]`.

    [What are position IDs?](../glossary#position-ids)
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
    Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
    is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
    model's internal embedding lookup matrix.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
    `input_ids` above)
Nr$   rx   r   r  r=   r8  )r^   r  rW   r   r   r  r   r<   r   r   r   rz  )r   r   r   rz   rw   r   r   r  r   rs  rt  num_choicesr?  r  r*  reshaped_logitsr  r  r1  s                      r1   r   BertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  

  
]3/ ++b+6')HO4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r   )r  r<   r   rS  )r   r   r   r   r~   r!   r   rZ   r   r  r   r  r   r   r   r   r   s   @r1   rc  rc    s     -11515/3,004)-,0/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 $D>X
 'tnX
 d^X
 
uU\\"$==	>X
 X
r   rc  c                   l  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )BertForTokenClassificationih  c                 d  > [         TU ]  U5        UR                  U l        [        USS9U l        UR
                  b  UR
                  OUR                  n[        R                  " U5      U l	        [        R                  " UR                  UR                  5      U l        U R                  5         g NFr!  )r}   r~   rW  r  r  rX  r   r   r   r   r   r   r<   r  rY  s      r1   r~   #BertForTokenClassification.__init__j  s      ++f>	)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r   r   r   rz   rw   r   r   r  r   rs  rt  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nU R	                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )z
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
Nr  r   rx   r=   r8  )r^   r  r  r   r<   r   r   rW  r   r   rz  )r   r   r   rz   rw   r   r   r  r   rs  rt  r?  r  r*  r  r  r1  s                    r1   r   "BertForTokenClassification.forwardx  s    $ &1%<k$++B]B]))))%'/!5#  

 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r   )r  r<   r   rW  rS  )r   r   r   r   r~   r!   r   rZ   r   r  r   r  r   r   r   r   r   s   @r1   rk  rk  h  s     -11515/3,004)-,0/3&*2
ELL)2
 !.2
 !.	2

 u||,2
 ELL)2
  -2
 &2
 $D>2
 'tn2
 d^2
 
uU\\"$99	:2
 2
r   rk  c                     ^  \ rS rSrU 4S jr\           SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\	   S\\	   S\\	   S\
\\R                     \4   4S jj5       rSrU =r$ )BertForQuestionAnsweringi  c                    > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rm  )
r}   r~   rW  r  r  r   r   r   
qa_outputsr  r   s     r1   r~   !BertForQuestionAnswering.__init__  sU      ++f>	))F$6$68I8IJ 	r   r   r   rz   rw   r   r   start_positionsend_positionsr   rs  rt  r   c                 $   Ub  UOU R                   R                  nU R                  UUUUUUU	U
US9	nUS   nU R                  U5      nUR	                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nS nUb  Ub  [        UR                  5       5      S:  a  UR                  S5      n[        UR                  5       5      S:  a  UR                  S5      nUR                  S5      nUR                  SU5      nUR                  SU5      n[        US9nU" X5      nU" UU5      nUU-   S-  nU(       d  UU4USS  -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )	Nr  r   r$   rx   r   )ignore_indexr=   )r  start_logits
end_logitsr   rz  )r^   r  r  rt  rN   r`  r   rT   r   clampr   r   r   rz  )r   r   r   rz   rw   r   r   rv  rw  r   rs  rt  r?  r  r*  rz  r{  r  ignored_indexr  
start_lossend_lossr1  s                          r1   r    BertForQuestionAnswering.forward  s    &1%<k$++B]B]))))%'/!5#  

 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r   )r  rW  rt  r  )r   r   r   r   r~   r!   r   rZ   r   r  r   r  r   r   r   r   r   s   @r1   rr  rr    s     -11515/3,0042604,0/3&*>
ELL)>
 !.>
 !.	>

 u||,>
 ELL)>
  ->
 "%,,/>
  ->
 $D>>
 'tn>
 d^>
 
uU\\"$@@	A>
 >
r   rr  )r0  rc  rH  r   rr  rU  rk  rT  r  r  r  rm   )Ur   r   rE   rL  dataclassesr   typingr   r   rZ   r   torch.nnr   r   r	   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr    r!   r"   utils.deprecationr#   configuration_bertr%   
get_loggerr   rC   rm   Modulero   r   r  r  r/  r-  rB  rN  rT  ri  r  r  r  r  r  r  r  r  r  r   r  r0  rH  rU  rc  rk  rr  __all__r.   r   r1   <module>r     s!      	  ! "   A A ! C C ) w 9
 
 
 . l l 9 9 0 * 
		H	%FR=RYY =@B.		 B.Je!- e!PRYY  ! 3BII 3lryy  C* CLW
")) W
t ")) "299 .!bii !&bii &	9299 	9 %/ % %4 
:{ : :& 	x
# x
x
v `
, `
`
F 
W
)? W

W
t i) i iX 
\
$7 \

\
~ V
$7 V
V
r g
/ g
 g
T B
!4 B
 B
J J
2 J
 J
Zr   