
    cCi                       S SK r S SKrS SKJr  S SKJrJr  S SKrS SKJ	s  J
r  S SKJ	r	  S SKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  \$" 5       (       a  S SK*J+r+  S SK,J-r-  S SK.J/r/  O\0r-\%Rb                  " \25      r3 " S S\Rh                  Rj                  5      r6  STS\\Rn                     S\\8   4S jjr9 " S S\-5      r: " S S\	Rv                  5      r< " S S\	Rv                  5      r= " S S \	Rv                  5      r>S! r?SUS" jr@ SVS#S$S%\Rn                  S&\Rn                  S'\Rn                  S(\\R                     S)\B\8\84   S*\8S+\8S,\\C   S-\\B\Rn                  \Rn                  4   \B\Rn                     4   4S. jjrD\R                  4S#S$S%\Rn                  S/\:S\Rn                  S\8S)\B\8\84   S*\8S+\8S0\R                  S-\B\Rn                     4S1 jjrGS#S$S%\Rn                  S&\Rn                  S'\Rn                  S(\\R                     S)\B\8\84   S*\8S+\8S-\B\Rn                     4S2 jrH\G\D\HS3.rI " S4 S$\	Rv                  5      rJ " S5 S6\5      rK\# " S7 S8\!5      5       rL  STS9\Rn                  S&\Rn                  S(\\Rn                     S:\\Rn                     S-\B\Rn                  \Rn                  \Rn                  \8\\Rn                     \\Rn                     4   4
S; jjrMS9\Rn                  S<\Rn                  S=\8S>\8S-\Rn                  4
S? jrN\# " S@ SA\L5      5       rO " SB SC\	Rv                  5      rP\#" SDSE9 " SF SG\L5      5       rQ\#" SHSE9 " SI SJ\L5      5       rR\#" SKSE9 " SL SM\L5      5       rS\# " SN SO\L5      5       rT\#" SPSE9 " SQ SR\L5      5       rU/ SSQrVg)W    N)nullcontext)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringis_flash_attn_2_availablelogging)is_triton_available   )ModernBertConfig) flash_attn_varlen_qkvpacked_func)RotaryEmbedding)apply_rotaryc                   h    \ rS rSr\  SS\\R                     S\\   4S jj5       r	\S 5       r
Srg)	ApplyRotaryEmbUnpad=   N
cu_seqlens
max_seqlenc                     UR                  5       nUR                  u  pgpUS S 2S S24   R                  USU	5      n
[        U
UUSUUSSS9  U R	                  X#U5        XPl        U$ )N   r   FT)seqlen_offsetsr#   r$   interleavedinplace)
contiguousshapeviewr   save_for_backwardr$   )ctxqkvcossinr#   r$   	total_nnz_three_nheadsheaddimqks              l/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/modernbert/modeling_modernbert.pyforwardApplyRotaryEmbUnpad.forward>   sz     nn.1ii+	7 BQBZ__YG4!!		
 	c
3#
    c                     U R                   u  p#nUR                  5       nUR                  u  pVpxUS S 2S S24   R                  USU5      n	[	        U	UUSUU R
                  SSSS9	  US S S S S S 4$ )Nr&   r'   r   FT)r(   r#   r$   r)   r*   	conjugate)saved_tensorsr+   r,   r-   r   r$   )
r/   dor1   r2   r#   r3   r4   r5   r6   dqks
             r8   backwardApplyRotaryEmbUnpad.backward]   s    "00*]]_.0hh+	7 BQBinnYG4!~~
	
 4tT455r;    NN)__name__
__module____qualname____firstlineno__staticmethodr   torchTensorintr9   rA   __static_attributes__rC   r;   r8   r!   r!   =   sQ     .2$(
 U\\* SM < 6 6r;   r!   r#   r$   c                 0    [         R                  XX#U5      $ )a  
Arguments:
    qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
    cos, sin: (seqlen_rotary, rotary_dim / 2)
    interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
        of 1st half and 2nd half (GPT-NeoX style).
    inplace: if True, apply rotary embedding in-place.
    seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
        Most commonly used in inference when we have KV cache.
    cu_seqlens: (batch + 1,) or None
    max_seqlen: int
Return:
    out: (total_nnz, dim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
)r!   apply)r0   r1   r2   r#   r$   s        r8   apply_rotary_unpaddedrP   t   s    . $$Ss
KKr;   c                   6  ^  \ rS rSrSr    SS\S\S\\   S\\R                     S\\R                     4
U 4S jjjr SS	\R                  S
\R                  S\\   S\\R                  \\R                  \R                  4   4   4S jjrS\4S jrSrU =r$ )!ModernBertUnpaddedRotaryEmbedding   zH
The rotary position embeddings applied directly to unpadded sequences.
dimbaser$   devicedtypec                 h   > [         TU ]  XUSS9  X0l        Ub  Ub  Ub  U R                  X4US9  gggg)z
max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
    up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
    the cos_sin_cache will be recomputed during the forward pass.
F)rT   rU   rV   r)   NrV   rW   )super__init__r$   _update_cos_sin_cache)selfrT   rU   r$   rV   rW   	__class__s         r8   r[   *ModernBertUnpaddedRotaryEmbedding.__init__   sM     	SFN$!f&8U=N&&z&N >O&8!r;   r0   r#   returnc                     Ub$  U R                  X1R                  UR                  S9  [        UU R                  U R
                  UUS9nU$ )z
Apply rotary embedding *inplace* to qkv.
qkv: (total_nnz, 3, nheads, headdim)
cu_seqlens: (batch + 1,) cumulative sequence lengths
max_seqlen: int max seq length in the batch
rY   r#   r$   )r\   rV   rW   rP   _cos_cached_sin_cached)r]   r0   r#   r$   s       r8   r9   )ModernBertUnpaddedRotaryEmbedding.forward   sQ     !&&z**CII&V#!!
 
r;   c                 T    SU R                    SU R                   SU R                   3$ )Nzdim=z, base=z, scale_base=)rT   rU   
scale_baser]   s    r8   
extra_repr,ModernBertUnpaddedRotaryEmbedding.extra_repr   s(    dhhZwtyykt>OPPr;   )r$   )g     @NNNN)rE   rF   rG   rH   __doc__rL   floatr   rJ   rV   rW   r[   rK   r   tupler9   strri   rM   __classcell__r^   s   @r8   rR   rR      s     $()-'+OO O SM	O
 &O $O O. %)	\\ LL SM	
 
u||U5<<#=>>	?2QC Q Qr;   rR   c                      ^  \ rS rSrSrS\4U 4S jjr\R                  " SS9S\R                  S\R                  4S	 j5       r SS\\R                     S
\\R                     S\R                  4S jjrSrU =r$ )ModernBertEmbeddings   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
configc                 \  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)padding_idxepsbias)rZ   r[   ru   r   	Embedding
vocab_sizehidden_sizepad_token_idtok_embeddings	LayerNormnorm_eps	norm_biasnormDropoutembedding_dropoutdropr]   ru   r^   s     r8   r[   ModernBertEmbeddings.__init__   su     ll6+<+<f>P>P^d^q^qrLL!3!3vO_O_`	JJv778	r;   Tdynamic	input_idsr`   c                 `    U R                  U R                  U R                  U5      5      5      $ rk   )r   r   r   )r]   r   s     r8   compiled_embeddings(ModernBertEmbeddings.compiled_embeddings   s%    yy4#6#6y#ABCCr;   inputs_embedsc                    Ub"  U R                  U R                  U5      5      nU$ U R                  R                  (       a  U R	                  U5      O.U R                  U R                  U R                  U5      5      5      nU$ rk   )r   r   ru   reference_compiler   r   )r]   r   r   hidden_statess       r8   r9   ModernBertEmbeddings.forward   su     $ IIdii&>?M  ;;00 ((3YYtyy)<)<Y)GHI 
 r;   )ru   r   r   r   rD   )rE   rF   rG   rH   rl   r   r[   rJ   compile
LongTensorrK   r   r   r9   rM   rp   rq   s   @r8   rs   rs      s    9/ 9 ]]4 DU-=-= D%,, D !D ei!%"2"23KSTYT`T`Ka	 r;   rs   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ModernBertMLP   a*  Applies the GLU at the end of each ModernBERT layer.

Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
ru   c                   > [         TU ]  5         Xl        [        R                  " UR
                  [        UR                  5      S-  UR                  S9U l	        [        UR                     U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR
                  UR                  S9U l        g )Nr&   rz   )rZ   r[   ru   r   Linearr}   rL   intermediate_sizemlp_biasWir   hidden_activationactr   mlp_dropoutr   Wor   s     r8   r[   ModernBertMLP.__init__   s    ))F..F4L4L0MPQ0QX^XgXgh&223JJv112	))F44f6H6Hv_r;   r   r`   c                     U R                  U5      R                  SSS9u  p#U R                  U R                  U R	                  U5      U-  5      5      $ )Nr&   r'   rT   )r   chunkr   r   r   )r]   r   inputgates       r8   r9   ModernBertMLP.forward   sG    ggm,221"2=wwtyy%4!7899r;   )r   r   r   ru   r   )rE   rF   rG   rH   rl   r   r[   rJ   rK   r9   rM   rp   rq   s   @r8   r   r      s7    `/ `:U\\ :ell : :r;   r   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )ModernBertRotaryEmbedding   inv_freqru   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultr   F)
persistent)rZ   r[   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenru   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r]   ru   rV   r   r^   s       r8   r[   "ModernBertRotaryEmbedding.__init__   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r;   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r'   r   mpscpuF)device_typeenabledr&   r   )rW   )r   rm   expandr,   torV   r   r   ro   rJ   autocast	transposecatr1   r   r2   rW   )
r]   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr1   r2   s
             r8   r9   !ModernBertRotaryEmbedding.forward	  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   ru   r   r   r   r   r   rk   )rE   rF   rG   rH   rJ   rK   __annotations__r   r[   no_gradr   r9   rM   rp   rq   s   @r8   r   r      sA    ll// / /" ]]_<  <r;   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr'   r&   r   )r,   rJ   r   )r   x1x2s      r8   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr1   r2   r   unsqueeze_dimq_embedk_embeds           r8   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr;   moduleModernBertAttentionr0   attention_masksliding_window_maskr   local_attentionbsrT   output_attentionsr`   c	                    U R                  XS9u  pUR                  SS5      R                  SS9u  pn[        XX5      u  pU R                  S-  n[
        R                  " XR                  SS5      5      U-  nUS:w  a  UnUU-   n[        R                  R                  US[
        R                  S	9R                  UR                  5      n[        R                  R                  UU R                  U R                  S
9n[
        R                  " UU5      nUR                  SS5      R!                  5       nUR#                  USU5      nU(       a  UU4$ U4$ )Nr   r
   r   r&   r         ࿩r'   r'   r'   rT   rW   )ptraining)
rotary_embr   unbindr   head_dimrJ   matmulr   
functionalsoftmaxfloat32r   rW   dropoutattention_dropoutr   r+   r-   )r   r0   r   r   r   r   r   rT   r   _kwargsr1   r2   querykeyvaluescaleattn_weightsattn_outputs                     r8   eager_attention_forwardr   ;  s=       @HCa+22q29E%e#;JEOOT!E<<}}Q':;eCL(",.0L ==((2U]](SVVW\WbWbcL==((9Q9Q\b\k\k(lL,,|U3K''1-88:K""2r3/K\**>r;   r   target_dtypec	           	         U" XUS9nUR                   [        R                  [        R                  4;  n
U
(       ad  UR                   nUR	                  U5      n[        UUUU R                  (       a  U R                  OSU R                  US9nUR	                  U5      nO5[        UUUU R                  (       a  U R                  OSU R                  US9nUR                  Xg5      4$ )Nrb           )r#   r$   	dropout_pdeterministicwindow_size)
rW   rJ   float16bfloat16r   r   r   r   deterministic_flash_attnr-   )r   r0   r   r#   r$   r   r   rT   r   r   convert_dtype
orig_dtypeattns                r8   flash_attention_forwardr
  `  s     SJ
GCIIemmU^^%DDM YY
ff\"/!!28//f..s 99'
 wwz"/!!28//f..s 99'
 IIb  r;   c                 f   U R                  XS9u  pUR                  SS5      R                  SS9u  pn[        XX5      u  pUS:w  a  Un[        R
                  " UUUU R                  (       a  U R                  OSUS9R                  SS5      R                  5       nUR                  US	U5      nU4$ )
Nr   r
   r   r&   r   r   r   )r  	attn_maskr'   )
r   r   r   r   Fscaled_dot_product_attentionr   r   r+   r-   )r   r0   r   r   r   r   r   rT   r   r1   r2   r   r   r   r   s                  r8   sdpa_attention_forwardr    s        @HCa+22q29E%e#;JE(", 	
&&28//f..s$	
 
1a	  ""2r3/K>r;   )flash_attention_2eagersdpac                      ^  \ rS rSrSrSS\S\\   4U 4S jjjr SS\	R                  S\\   S\	R                  4S	 jjrS
rU =r$ )r   i  an  Performs multi-headed self attention on a batch of unpadded sequences.

If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.

See `forward` method for additional details.
ru   layer_idc                   > [         TU ]  5         Xl        X l        UR                  UR
                  -  S:w  a&  [        SUR                   SUR
                   S35      eUR                  U l        UR                  U l        UR
                  U l	        UR                  UR
                  -  U l
        U R                  U R                  -  U l        [        R                  " UR                  SU R                  -  UR                  S9U l        X!R                   -  S:w  aU  UR"                  S-  UR"                  S-  4U l        UR$                  b  UR$                  OUR&                  nUR"                  nOSU l        UR(                  nUR&                  nUR*                  S	:X  a  [-        U R                  XCS
9U l        O*[0        R2                  " U5      nX5l        [7        US9U l        [        R                  " UR                  UR                  UR                  S9U l        UR                  S:  a   [        R:                  " UR                  5      O[        R<                  " 5       U l        [A        5       U l!        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r
   r   r&   r   r  )rT   r$   rU   )ru   r   )"rZ   r[   ru   r  r}   num_attention_heads
ValueErrorr   r  	num_headsr   all_head_sizer   r   attention_biasWqkvglobal_attn_every_n_layersr   local_rope_thetaglobal_rope_thetar   _attn_implementationrR   r   copydeepcopy
rope_thetar   r   r   Identityout_dropsetpruned_heads)r]   ru   r  r#  r   config_copyr^   s         r8   r[   ModernBertAttention.__init__  s     : ::a?#F$6$6#77mnt  oI  oI  nJ  JK  L  "(!9!9(.(G(G%33**f.H.HH!]]T^^;IIf00!d6H6H2HvOdOde	7771<$*$:$:a$?AWAW[\A\#]D 4:4K4K4W00]c]u]uJ&,&<&<##+D &,&D&D#11J&&*==?MM.EDO --/K%/"7{KDO))F..0B0BI^I^_@F@X@X[^@^

6#;#;<dfdododqEr;   r   r   r`   c           
         U R                  U5      nUR                  S   nU R                  R                  S:X  a)  UR	                  SSU R
                  U R                  5      nO)UR	                  USSU R
                  U R                  5      n[        U R                  R                     " U 4UU R                  U R                  UU R                  US.UD6nUS   nU R                  U R                  U5      5      nU4USS  -   $ )Nr   r  r'   r
   )r0   r   r   r   rT   r   r   )r  r,   ru   r   r-   r  r   MODERNBERT_ATTENTION_FUNCTIONr   r   r  r%  r   )r]   r   r   kwargsr0   r   attn_outputss          r8   r9   ModernBertAttention.forward  s     ii&  #;;++/BB((2q$..$--@C((2r1dnndmmDC4T[[5U5UV	
 00""/	
 	
 %Qdggm&<=,qr"222r;   )r   r  r  r   ru   r  r   r  r   r  r%  r'  r   rk   F)rE   rF   rG   rH   rl   r   r   rL   r[   rJ   rK   boolr9   rM   rp   rq   s   @r8   r   r     s]    %"/ %"8C= %" %"T -23||3 $D>3
 
3 3r;   c                   t  ^  \ rS rSrSS\S\\   4U 4S jjjr\R                  " SS9S\R                  S\R                  4S	 j5       r      SS\R                  S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\R                  4S jjrSrU =r$ )ModernBertEncoderLayeri  ru   r  c                   > [         TU ]  5         Xl        US:X  a  [        R                  " 5       U l        O9[        R                  " UR                  UR                  UR                  S9U l        [        XS9U l        [        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        g )Nr   rx   )ru   r  )rZ   r[   ru   r   r$  	attn_normr   r}   r   r   r   r	  mlp_normr   mlpr]   ru   r  r^   s      r8   r[   ModernBertEncoderLayer.__init__  s    q=[[]DN\\&*<*<&//X^XhXhiDN'vI	V%7%7V__SYScScd (r;   Tr   r   r`   c                 B    U R                  U R                  U5      5      $ rk   )r6  r5  r]   r   s     r8   compiled_mlp#ModernBertEncoderLayer.compiled_mlp  s    xxm455r;   r   r   r   r#   r$   r   c           
      
   U R                  U R                  U5      UUUUUUS9nXS   -   nU R                  R                  (       a  U R	                  U5      OU R                  U R                  U5      5      n	X-   nU4USS  -   $ )Nr   r   r   r#   r$   r   r   r   )r	  r4  ru   r   r;  r6  r5  )
r]   r   r   r   r   r#   r$   r   r-  
mlp_outputs
             r8   r9   ModernBertEncoderLayer.forward  s     yyNN=)) 3%!!/ ! 
 &Q7 {{,, m,$--67 	
 &2,qr"222r;   )r	  r4  ru   r6  r5  rk   )NNNNNF)rE   rF   rG   rH   r   r   rL   r[   rJ   r   rK   r;  r   r0  r9   rM   rp   rq   s   @r8   r2  r2    s    	)/ 	)8C= 	) 	) ]]4 6%,, 65<< 6 !6 266:37-1$(,13||3 !.3 &ell3	3
 u//03 U\\*3 SM3 $D>3 
3 3r;   r2  c                      ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrS\R                  4S	 jr SS
\\   S\S\4U 4S jjjrS rU 4S jrSrU =r$ )ModernBertPreTrainedModeli2  ru   modelTrs   r2  Fr   c                   ^ U R                   R                  mTc  SmS[        R                  S[        4U4S jjnU R                   R
                  U R                   R
                  [        R                  " SU R                   R                  -  5      -  U R                   R
                  U R                   R                  S-  S.n[        U[        5      (       a  U" UR                  US   5        g [        U[        5      (       a-  U" UR                  US	   5        U" UR                  US
   5        g [        U[         5      (       a-  U" UR"                  US	   5        U" UR                  US
   5        g [        U[$        5      (       a  U" UR&                  US
   5        g [        U[(        5      (       a  U" UR*                  US
   5        g [        U[,        [.        [0        [2        45      (       a  U" UR4                  US   5        g [        U[        R6                  5      (       aX  UR8                  R:                  R=                  S5        UR>                  b%  UR>                  R:                  RA                  5         g g g )Nr
   r   stdc                   > [         R                  R                  U R                  SUT* U-  TU-  S9  [	        U [         R
                  5      (       a8  U R                  b*  [         R                  R                  U R                  5        g g g )Nr   )meanrE  ab)r   inittrunc_normal_weightr   r   rz   zeros_)r   rE  cutoff_factors     r8   init_weight<ModernBertPreTrainedModel._init_weights.<locals>.init_weightA  st    GG!! .3&#% "  &")),,;;*GGNN6;;/ + -r;   g       @r   )inout	embedding	final_outrS  rQ  rR  rT  g      ?)!ru   initializer_cutoff_factorr   Modulerm   initializer_rangemathsqrtnum_hidden_layersr}   r   rs   r   r   r   r   r   r  ModernBertPredictionHeaddenseModernBertForMaskedLMdecoder#ModernBertForSequenceClassificationModernBertForMultipleChoice ModernBertForTokenClassificationModernBertForQuestionAnswering
classifierr   rL  datafill_rz   zero_)r]   r   rO  stdsrN  s       @r8   _init_weights'ModernBertPreTrainedModel._init_weights<  s   == M	0		 	0 	0 ++//;;00499S4;;C`C`=`3aa6600$6	
 f233--tK/@A..		4:.		4;/ 344T$Z0		4;/ 899d5k2 566U43+0.	
 
 ))4+<=--MM$$S){{&  &&( ' .r;   attn_implementationis_init_checkr`   c                    >  Uc  U R                  5       (       a  SOUn[        TU ]  XS9$ ! [        [        4 a     Nf = f)zB
Checks and dispatches to hhe requested attention implementation.
r  )rj  rk  )_flash_attn_2_can_dispatchr  ImportErrorrZ   %_check_and_adjust_attn_implementation)r]   rj  rk  r^   s      r8   ro  ?ModernBertPreTrainedModel._check_and_adjust_attn_implementationp  s`    	 '.43R3R3T3T $(   w< 3 = 
 	
 K( 		s   , ??c                    U R                   R                  SL a  g [        U S5      (       aZ  [        U R                  5      S:  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                   R                  c  [        5       U R                   l        g g )	NFhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.r   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
ru   r   r   lenrr  loggerwarning_oncerV   r   r   rh   s    r8   _maybe_set_compile,ModernBertPreTrainedModel._maybe_set_compile  s   ;;((E14))c$2D2D.E.I{{,,##9 -2DKK);;u${{,,##9 -2DKK);;u${{,,##9 -2DKK);;((0,?,ADKK) 1r;   c                    > [         TU ]  " U0 UD6nU R                  R                  S;   aA  U R                  R                  (       a  [        R                  S5        SU R                  l        U$ )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rZ   resize_token_embeddingsru   r   rt  ru  )r]   argsr,  model_embedsr^   s       r8   ry  1ModernBertPreTrainedModel.resize_token_embeddings  s[    w6GG;;((L8{{,,##y -2DKK)r;   rC   r/  )rE   rF   rG   rH   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr   rV  rh  r   ro   r0  ro  rv  ry  rM   rp   rq   s   @r8   rB  rB  2  s    &*#/1IJN2)BII 2)j IN
#+C=
AE
	
 
.B>
 
r;   rB  inputslabelsc                    UR                  S[        R                  S9n[        R                  " UR	                  5       SS9R	                  5       n[        UR                  5       R                  5       5      n[        R                  R                  R                  [        R                  " US[        R                  S9S5      nU R                  5       S:X  a  U R	                  5       U   nO(U R                  tpnX-  nU R                  " U/UQ76 U   nUb  UR	                  5       U   OSnUb  UR	                  5       U   OSnXXvX4$ )	aP  
Remove padding from input sequences.

Args:
    inputs: (batch, seqlen, ...) or (batch, seqlen)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    position_ids: (batch, seqlen), int, position ids
    labels: (batch, seqlen), int, labels

Returns:
    unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    cu_seqlens: (batch + 1), the cumulative sequence lengths
    max_seqlen_in_batch: int
    unpadded_position_ids: (total_nnz) or None
    unpadded_labels: (total_nnz) or None
r'   r   F)as_tupler   )r   r   r&   N)sumrJ   int32nonzeroflattenrL   maxitemr   r   padcumsumrT   r,   r-   )r  r   r   r  seqlens_in_batchindicesmax_seqlen_in_batchr#   unpadded_inputsbatchseqlenrestr,   unpadded_position_idsunpadded_labelss                  r8   _unpad_modernbert_inputr    s   . &))b)DmmN224uEMMOG.22499;<$$((6FAUZU`U`)acijJzz|q ..*73%|| ++e3d3G<?K?WL0027;]a393Efnn&w/4OZF[llr;   r  r  r  c                 ^   U R                  5       S:X  aC  [        R                  " X#-  U R                  U R                  S9nXU'   UR                  X#5      nU$ U R                  tpg[        R                  " X#-  /UQ7U R                  U R                  S.6nXU'   UR
                  " X#/UQ76 nU$ )a-  
Add padding to sequences.

Args:
    inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    batch: int, batch size
    seqlen: int, max sequence length

Returns:
    padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
r   rW   rV   )rT   rJ   zerosrW   rV   r-   r,   )r  r  r  r  outputpadded_inputs_r  s           r8   _pad_modernbert_outputr    s    $ zz|qU^6<<V wE2  <<U^]d]&,,v}}] wE9D9r;   c            !         ^  \ rS rSrS\4U 4S jjrS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\   S\\\
R                  S4   \4   4S jj5       rS\
R                  S\S\
R                  4S jrSrU =r$ )ModernBertModeli  ru   c           	        > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l
        [
        R                  " UR                  UR                  UR                  S9U l        SU l        U R#                  5         g s  snf )Nrx   F)rZ   r[   ru   rs   
embeddingsr   
ModuleListrangerZ  r2  layersr   r}   r   r   
final_normgradient_checkpointing	post_initr7  s      r8   r[   ModernBertModel.__init__  s     .v6mmFKFLdLdFefFe(#F5Fef
 ,,v'9'9vU[UeUef&+#	 gs   C c                 .    U R                   R                  $ rk   r  r   rh   s    r8   get_input_embeddings$ModernBertModel.get_input_embeddings  s    ---r;   c                 $    XR                   l        g rk   r  )r]   r   s     r8   set_input_embeddings$ModernBertModel.set_input_embeddings  s    ).&r;   r   r   r   r   r   r  r#   r$   
batch_sizeseq_lenr   output_hidden_statesreturn_dictr`   .c                   ^^	^
 Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU(       a  SOSnU(       a  SOSnU R                  5         Ub  U R                  X5        T	c+  T
c(  Ub  UR                  SS u  m	m
OUR                  SS u  m	m
Ub  UR                  OUR                  nUc&  [        R                  " T	T
4U[        R                  S9nSnU R                   R                  S:X  aH  TcD  UcA  Uc>  SnUc,  [        R                  " 5          [        XS	9tnmpxnSSS5        OF[        XRS	9tnmpxnO8Uc$  [        R                  " T
US
9R!                  S5      nU R#                  X+S9u  p#U R%                  XS9nU R&                   HD  nU(       a  UU4-   nU" UUUUUUUS9nUS   nU(       d  M*  [)        U5      S:  d  M;  UUS   4-   nMF     U(       a  UU4-   nU R+                  U5      nU(       a'  [-        UTT	T
S9nUb  [/        U	UU
4S jU 5       5      nOWU R                   R                  S:X  a=  Ub:  US   R1                  5       S:X  a#  UR!                  S5      n[/        S U 5       5      nU(       d  [/        S UX4 5       5      $ [3        UUUS9$ ! , (       d  f       GN;= f)F  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nz:You must specify exactly one of input_ids or inputs_embedsrC   r&   rY   Fr  T)r  r   rV   r   )r   )r   r   r>  r   r  r  r  r  c              3   <   >#    U  H  n[        UTTTS 9v   M     g7f)r  N)r  ).0hsr  r  r  s     r8   	<genexpr>*ModernBertModel.forward.<locals>.<genexpr>|  s$      */ +"gZ`gh/s   r'   c              3   B   #    U  H  oR                  S 5      v   M     g7f)r   N)r   )r  r  s     r8   r  r    s     %R@Q"ll1oo@Qs   c              3   .   #    U  H  oc  M  Uv   M     g 7frk   rC   )r  vs     r8   r  r    s     m$[q$[s   	)last_hidden_stater   
attentions)ru   r   r  use_return_dictr  rv  %warn_if_padding_and_no_attention_maskr,   rV   rJ   onesr0  r   r   r  aranger   _update_attention_maskr  r  rs  r  r  rn   rT   r   )r]   r   r   r   r   r   r  r#   r$   r  r  r   r  r  all_hidden_statesall_self_attentionsrV   repadr  r   encoder_layerlayer_outputss         `  ``           r8   r9   ModernBertModel.forward  s8   B 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]-t";<YZZ"6BD$5b4! 66yQ'/(&3&9&9"1&=#
G&/oobq&9#
G%.%:!!@T@T!"ZZW(=fTYT^T^_N;;++/BB:#5*:L (I`#,JF	7JQ )
 Ja,JFM7JQ #$||GFCMMaP262M2M 3N 3/N )Y![[M#$58H$H!)-$7)%%"3M *!,M  S%7!%;&9]1=M<O&O# )"   1]4D D62$gZPWM !,$) */* %! KK,,0CC!-!"%))+q0)33A6M %%R@Q%R Rm]4E$[mmm++*
 	
{ )s   K""
K1c                 ,   U(       a  U R                   R                  S:X  a'  [        R                  S5        SU R                   l        OGU R                   R                  S:w  a-  [        R                  SU R                   R                   S35        [	        XR
                  5      n[        R                  " UR                  S   5      R                  S5      n[        R                  " XDR                  -
  5      nXPR                   R                  S-  :*  R                  S5      R                  S5      R                  UR                  5      nUR                  UR!                  5       [        R"                  " U R
                  5      R$                  5      nX74$ )Nr  zOutputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r  zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r&   r   )ru   r   rt  ru  r   rW   rJ   r  r,   r   absTr   r   rV   masked_filllogical_notfinfomin)r]   r   r   global_attention_maskrowsdistancewindow_maskr   s           r8   r  &ModernBertModel._update_attention_mask  sJ   {{//69##V 4;011W<##  $ @ @A B:: !;>:: V ||177:;EEaH99TFF]+ 4499DDQGQQRSTWWXfXmXmn 	 4??@W@W@Y[`[f[fgkgqgq[r[v[vw$99r;   )ru   r  r  r  r  NNNNNNNNNNNNN)rE   rF   rG   rH   r   r[   r  r  r   r   rJ   r   rK   rL   r0  r   rn   r   r9   r  rM   rp   rq   s   @r8   r  r    s   	/ 	./  15156:3704*.-1$($(!%,0/3&*A
E,,-A
 !.A
 &ell3	A

 u//0A
  -A
 %,,'A
 U\\*A
 SMA
 SMA
 #A
 $D>A
 'tnA
 d^A
 
uU\\3&'8	9A
 A
F:U\\ :VZ :_d_k_k : :r;   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )r[  i  ru   c                 F  > [         TU ]  5         Xl        [        R                  " UR
                  UR
                  UR                  5      U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        g )Nrx   )rZ   r[   ru   r   r   r}   classifier_biasr\  r   classifier_activationr   r   r   r   r   r   s     r8   r[   !ModernBertPredictionHead.__init__  so    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r;   r   r`   c                 `    U R                  U R                  U R                  U5      5      5      $ rk   )r   r   r\  r:  s     r8   r9    ModernBertPredictionHead.forward  s#    yy$**]";<==r;   )r   ru   r\  r   )rE   rF   rG   rH   r   r[   rJ   rK   r9   rM   rp   rq   s   @r8   r[  r[    s2    a/ a>U\\ >ell > >r;   r[  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )custom_introc            "       F  ^  \ rS rSrS/rS\4U 4S jjrS rS\R                  4S jr
\R                  " SS	9S
\R                  S\R                  4S j5       r\              SS\\R"                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r]  i  zdecoder.weightru   c                 n  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  UR                  S9U l        U R                  R                  U l        U R                  R                  U l        U R                  5         g )Nr   )rZ   r[   ru   r  rC  r[  headr   r   r}   r|   decoder_biasr^  sparse_predictionsparse_pred_ignore_indexr  r   s     r8   r[   ModernBertForMaskedLM.__init__  s     $V,
,V4	yy!3!3V5F5FVM`M`a!%!>!>(,(L(L% 	r;   c                     U R                   $ rk   r^  rh   s    r8   get_output_embeddings+ModernBertForMaskedLM.get_output_embeddings  s    ||r;   new_embeddingsc                     Xl         g rk   r  )r]   r  s     r8   set_output_embeddings+ModernBertForMaskedLM.set_output_embeddings  s    %r;   Tr   r  r`   c                 B    U R                  U R                  U5      5      $ rk   )r^  r  )r]   r  s     r8   compiled_head#ModernBertForMaskedLM.compiled_head  s    ||DIIf-..r;   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  c                 v   Ub  UOU R                   R                  nU R                  5         U R                   R                  S:X  a  Uc  Uc  U	c  U
c)  Uc&  Ub  UR                  SS u  pOUR                  SS u  pUb  UR
                  OUR
                  nUc%  [        R                  " X4U[        R                  S9nUc-  [        R                  " 5          [        XXFS9u  pppFSSS5        O[        XRXFS9u  pWppFU R                  UUUUUUUU	U
UUUUS9nUS   nU R                  (       aK  UbH  UR                  S5      nUR                  UR                  S   S5      nX`R                  :g  nUU   nUU   nU R                   R                  (       a  U R!                  U5      OU R#                  U R%                  U5      5      nSnUb*  U R&                  " UU4S	U R                   R(                  0UD6nU R                   R                  S:X  a  U R                   R*                  (       d  Uc
  [-        5       O[        R                  " 5          [/        UXzUS
9nSSS5        [1        USS5      bw  / nUR2                   HU  nUR5                  5       S:X  a$  UR                  S   S:X  a  UR7                  S5      nUR9                  [/        UXzUS
95        MW     [;        U5      Ul        U(       d  U4nUb  U4U-   $ U$ [=        UUUR2                  UR>                  S9$ ! , (       d  f       GN+= f! , (       d  f       N= f)r  Nr  r&   rY   )r  r   r   r  r   r   r   r   r   r  r#   r$   r  r  r   r  r  r   r'   r|   r  r   r
   r   losslogitsr   r  ) ru   r  rv  r   r,   rV   rJ   r  r0  r   r  rC  r  r-   r  r   r  r^  r  loss_functionr|   repad_logits_with_gradr   r  getattrr   rT   squeezeappendrn   r   r  )r]   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r,  rV   outputsr  mask_tokensr  r  padded_hidden_statesr  r  s                            r8   r9   ModernBertForMaskedLM.forward  s:   F &1%<k$++B]B]!;;++/BB:#5*:L%'/$0.;.A.A"1.E+
G.7oobq.A+
-6-B))H\H\!)%*ZZ0Ef\a\f\f%gN ([r#,Zf\X	JL )
 \s,Zf\XMJL **) 3%'!!!/!5#  
 $AJ!!f&8[[_F 1 6 6v||A K !$A$AAK 1+ >K(F {{,, 01dii(9:; 	 %%ffbAWAWb[abD;;++/BB"&++"D"D\a\i\i\kk/vwipq l w6B')$!//Bvvx1}!)9ZZ](//.b'dkl 0 )..B(C%YF)-)9TGf$EvE!//))	
 	
C )` lks   L
L*
L'*
L8)ru   r^  r  rC  r  r  NNNNNNNNNNNNNN)rE   rF   rG   rH   _tied_weights_keysr   r[   r  r   r   r  rJ   r   rK   r  r   r   r   rL   r0  r   rn   r   r9   rM   rp   rq   s   @r8   r]  r]    s    ++/ &BII & ]]4 /ELL /U\\ / !/  15156:/304)-*.-1$($(!%,0/3&*x
E,,-x
 !.x
 &ell3	x

 u||,x
  -x
 &x
 %,,'x
 U\\*x
 SMx
 SMx
 #x
 $D>x
 'tnx
 d^x
" 
uU\\"N2	3#x
 x
r;   r]  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c            "         ^  \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r_  iY  ru   c                 n  > [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g rk   )rZ   r[   
num_labelsru   r  rC  r[  r  rJ   r   r   classifier_dropoutr   r   r}   rc  r  r   s     r8   r[   ,ModernBertForSequenceClassification.__init___  s      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r;   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r`   c                 z   Ub  UOU R                   R                  nU R                  5         Ub  U R                  X5        U
c)  Uc&  Ub  UR                  SS u  pOUR                  SS u  pUb  UR
                  OUR
                  nUc%  [        R                  " X4U[        R                  S9nU R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a
  USS2S4   nOLU R                   R                  S:X  a2  UUR                  S5      -  R                  S	S
9UR                  S	SS9-  nU R                  U5      nU R                  U5      nU R                  U5      nSnUGb  U R                   R                   c  U R"                  S	:X  a  SU R                   l        OoU R"                  S	:  aN  UR$                  [        R&                  :X  d  UR$                  [        R(                  :X  a  SU R                   l        OSU R                   l        U R                   R                   S:X  aJ  [+        5       nU R"                  S	:X  a&  U" UR-                  5       UR-                  5       5      nOU" UU5      nOU R                   R                   S:X  a=  [/        5       nU" UR1                  SU R"                  5      UR1                  S5      5      nO-U R                   R                   S:X  a  [3        5       nU" UU5      nU(       d  U4nUb  U4U-   $ U$ [5        UUUR6                  UR8                  S9$ )a  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nr&   rY   r  r   clsrG  r'   r   r   TrT   keepdim
regressionsingle_label_classificationmulti_label_classificationr  )ru   r  rv  r  r,   rV   rJ   r  r0  rC  classifier_poolingr   r  r  r   rc  problem_typer  rW   longrL   r	   r  r   r-   r   r   r   r  )r]   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r,  rV   r   r  pooled_outputr  r  loss_fctr  s                           r8   r9   +ModernBertForSequenceClassification.forwardl  s   N &1%<k$++B]B]! 66yQ'/(&3&9&9"1&=#
G&/oobq&9#
%.%:!!@T@T!"ZZ(=fTYT^T^_N**) 3%'!!!/!5#  
 $AJ;;))U2 1!Q$ 7[[++v5!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./YF)-)9TGf$EvE'!//))	
 	
r;   )rc  ru   r   r  rC  r  r  )rE   rF   rG   rH   r   r[   r   r   rJ   r   rK   rL   r0  r   rn   r   r9   rM   rp   rq   s   @r8   r_  r_  Y  sk   /   15156:/304)-*.-1$($(!%,0/3&*r
E,,-r
 !.r
 &ell3	r

 u||,r
  -r
 &r
 %,,'r
 U\\*r
 SMr
 SMr
 #r
 $D>r
 'tnr
 d^r
" 
uU\\"$<<	=#r
 r
r;   r_  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c            "         ^  \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )ra  i  ru   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rk   rZ   r[   r  r  rC  r[  r  rJ   r   r   r	  r   r   r}   rc  r  r   s     r8   r[   )ModernBertForTokenClassification.__init__  s{      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJ 	r;   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r`   c                    Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nr  r   r'   r   r  )ru   r  rv  rC  r  r   rc  r   r-   r  r   r   r  )r]   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r   r  r  r  r  r  s                        r8   r9   (ModernBertForTokenClassification.forward  s#   H &1%<k$++B]B]!**) 3%'!!!/!5#  
 $AJ II&78 II&78!23')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r;   rc  r   r  rC  r  r  )rE   rF   rG   rH   r   r[   r   r   rJ   r   rK   rL   r0  r   rn   r   r9   rM   rp   rq   s   @r8   ra  ra    sk   
/ 
  15156:/304)-*.-1$($(!%,0/3&*I
E,,-I
 !.I
 &ell3	I

 u||,I
  -I
 &I
 %,,'I
 U\\*I
 SMI
 SMI
 #I
 $D>I
 'tnI
 d^I
  
uU\\"$99	:!I
 I
r;   ra  c            "         ^  \ rS rSrS\4U 4S jjr\             SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\
   S\\
   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )rb  iA  ru   c                 b  > [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rk   r  r   s     r8   r[   'ModernBertForQuestionAnswering.__init__C  sy      ++$V,
,V4	HH$$V%>%>?	))F$6$68I8IJr;   r   r   r   r   start_positionsend_positionsr  r#   r$   r  r  r   r  r  r`   c                 R   Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nUR                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  U R                  " UUXV40 UD6nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )r  N)r   r   r   r  r#   r$   r  r  r   r  r  r   r   r'   r   )r  start_logits
end_logitsr   r  )ru   r  rv  rC  r  r   rc  splitr  r+   r  r   r   r  )r]   r   r   r   r   r"  r#  r  r#   r$   r  r  r   r  r  r,  r   r  r  r%  r&  r  r  s                          r8   r9   &ModernBertForQuestionAnswering.forwardN  sc   F &1%<k$++B]B]!**) 3%!!!/!5#  
 $AJ II&78 II&78!23#)<<r<#: j#++B/::<''+668
&=+D%%lJibhiD"J/'!"+=F)-)9TGf$EvE+%!!//))
 	
r;   r  r  )rE   rF   rG   rH   r   r[   r   r   rJ   rK   rL   r0  r   rn   r   r9   rM   rp   rq   s   @r8   rb  rb  A  sf   	/ 	  266:/32604*.-1$($(!%,0/3&*K
ELL)K
 !.K
 &ell3	K

 u||,K
 "%,,/K
  -K
 %,,'K
 U\\*K
 SMK
 SMK
 #K
 $D>K
 'tnK
 d^K
" 
uU\\"$@@	A#K
 K
r;   rb  z
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c            "         ^  \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r`  i  ru   c                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g Nr   )rZ   r[   ru   r  rC  r[  r  rJ   r   r   r	  r   r   r}   rc  r  r   s     r8   r[   $ModernBertForMultipleChoice.__init__  sm     $V,
,V4	HH$$V%>%>?	))F$6$6: 	r;   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r`   c                 \   Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a  [        R                  " UR                  S   UR                  S9nUb)  UR                  SS	9R                  UR                  5      nO.[        R                  " S[        R                  UR                  S
9nUUU4   nONU R                   R                  S:X  a4  UR                  SSS9nUUR!                  S5      -  R                  SS	9U-  nU R#                  U5      nU R%                  U5      nU R'                  U5      nUR                  SU5      nSnUb  [(        R*                  " 5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S9$ )aK  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nr   r'   r  r   r  r  r   r  rG  Tr  r  )ru   r  r,   r-   sizerv  rC  r  rJ   r  rV   argmaxr   tensorr  r  r   r  r   rc  r   r   r   r   r  )r]   r   r   r   r   r   r  r  r#   r$   r  r  r   r  r  r,  num_choicesr   r  	indices_0cls_masknum_non_pad_tokensr  r  reshaped_logitsr  r  r  s                               r8   r9   #ModernBertForMultipleChoice.forward  s   L &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 	!**) 3%'!!!/!5#  
 $AJ ;;))U2%6%<%<Q%?HYH`H`aI))00R08;;<M<T<TU !<<DUD\D\] 1)X2E F [[++v5!/!3!34!3!H!2^5M5Mb5Q!Q V V[\ V ]`r r		"34		-0/ ++b+6**,HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r;   )rc  ru   r   r  rC  r  )rE   rF   rG   rH   r   r[   r   r   rJ   r   rK   rL   r0  r   rn   r   r9   rM   rp   rq   s   @r8   r`  r`    sk   
/ 
  15156:/304)-*.-1$($(!%,0/3&*i
E,,-i
 !.i
 &ell3	i

 u||,i
  -i
 &i
 %,,'i
 U\\*i
 SMi
 SMi
 #i
 $D>i
 'tni
 d^i
" 
uU\\"$==	>#i
 i
r;   r`  )r  rB  r]  r_  ra  rb  r`  rD   r+  r/  )Wr!  rX  
contextlibr   typingr   r   rJ   torch.nn.functionalr   r   r  torch.nnr   r   r	   activationsr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   r   utils.import_utilsr   configuration_modernbertr   flash_attn.flash_attn_interfacer   flash_attn.layers.rotaryr   flash_attn.ops.triton.rotaryr   object
get_loggerrE   rt  autogradFunctionr!   rK   rL   rP   rR   rV  rs   r   r   r   r   r   rn   r0  r   r  rW   r
  r  r+  r   r2  rB  r  r  r  r[  r]  r_  ra  rb  r`  __all__rC   r;   r8   <module>rM     s  ,   " "     A A ! B 9  L - G G 5 6 P89O 
		H	%46%..11 46v *. $L &	L
 L42Q 2Qj299 <:BII :(!<		 !<H(H )."!"	" LL" 	"
 5++," 38_" 	" 
"  ~" 5u||+,eELL.AAB"\ !&(!!(!	(! 2(! 	(!
 (! 38_(! 	(! 
(! ++(! 5<<(!V ! 	  LL  	 
 5++,  38_  	  
  5<< H 1$"! L3")) L3^+37 +3\ } } }F ,0%)	&mLL&mLL&m 5<<(&m U\\"	&m
 5<<u||S(5<<:PRZ[`[g[gRhhi&mRLL\\  	
 \\> s:/ s: s:l	>ryy 	> 
S
5 S

S
l 
A
*C A

A
H 
W
'@ W

W
t X
%> X
 X
v 
w
"; w

w
tr;   