
    cCi                     v   S SK r S SKJrJrJr  S SKrS SKJs  Jr	  S SKJr  SSK
Jr  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/  SSK0J1r1  \*" 5       (       a  S SK2J3r3  \" S5       " S S\Rh                  5      5       r5 " S S\Rh                  5      r6S r7SES jr8S\Rr                  S\:S\Rr                  4S  jr; SFS!\Rh                  S"\Rr                  S#\Rr                  S$\Rr                  S%\\Rr                     S&\<S'\<S(\%\'   4S) jjr=   SGS!\Rh                  S"\Rr                  S#\Rr                  S$\Rr                  S%\\Rr                  S*4   S&\\<   S+\\<   S,\\Rr                     S\>\Rr                  \Rr                  4   4S- jjr?\"" 5       r@\?\@S.'    " S/ S0\Rh                  5      rA " S1 S2\Rh                  5      rB " S3 S4\Rh                  5      rC " S5 S6\5      rD\( " S7 S8\#5      5       rE\( " S9 S:\E5      5       rF    SHS;\\Rr                  \>\Rr                     S4   S<\\:   S=\\:   S>\:S%\\Rr                     S\\Rr                  \:4   4S? jjrG\( " S@ SA\E\5      5       rH " SB SC\\E5      rI/ SDQrJg)I    N)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)compile_friendly_flex_attention)create_causal_mask!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_available)deprecate_kwarg)OutputRecordercheck_model_inputs   )
DogeConfig)	BlockMaskRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )DogeRMSNorm4   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z*
DogeRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      `/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/doge/modeling_doge.pyr)   DogeRMSNorm.__init__6   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor+   float32powmeanrsqrtr.   r-   )r/   hidden_statesinput_dtypevariances       r3   forwardDogeRMSNorm.forward>   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r5   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler-   shaper.   )r/   s    r3   
extra_reprDogeRMSNorm.extra_reprE   s*    ))*+6$2G2G1HIIr5   )r.   r-   )gư>)	__name__
__module____qualname____firstlineno__r)   rC   rH   __static_attributes____classcell__r2   s   @r3   r%   r%   4   s    $;J Jr5   r%   c                      ^  \ rS rSr% \R
                  \S'   SS\4U 4S jjjr\R                  " 5       \
S 5       5       rSrU =r$ )DogeRotaryEmbeddingI   inv_freqconfigc                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultrT   F)
persistent)r(   r)   hasattr
isinstancerW   dictgetrX   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrU   r   rope_init_fnattention_scalingregister_bufferrT   original_inv_freq)r/   rU   devicerT   r2   s       r3   r)   DogeRotaryEmbedding.__init__L   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r5   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r8   r    mpscpuF)device_typeenabledr7   dim)r:   )rT   floatexpandrG   r;   rg   r]   rY   strr+   autocast	transposecatcosrd   sinr:   )
r/   xposition_idsinv_freq_expandedposition_ids_expandedrl   freqsembrv   rw   s
             r3   rC   DogeRotaryEmbedding.forward]   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)rd   rU   ra   rf   rb   rc   rX   N)rJ   rK   rL   rM   r+   Tensor__annotations__r!   r)   no_gradr   rC   rN   rO   rP   s   @r3   rR   rR   I   s@    ll/z / /" ]]_<  <r5   rR   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr8   r7   rn   )rG   r+   ru   )rx   x1x2s      r3   rotate_halfr   m   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r5   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrv   rw   ry   unsqueeze_dimq_embedk_embeds           r3   apply_rotary_pos_embr   t   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr5   r@   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)rG   rq   reshape)r@   r   batchnum_key_value_headsslenhead_dims         r3   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr5   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr7   r   r8   )ro   r:   ptrainingr    )r   num_key_value_groupsr+   matmulrt   rG   r   
functionalsoftmaxr<   r;   r:   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r3   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r5   r"   softcap	head_maskc                 6  ^^^ S n	S m[        U[        5      (       a  Un	OUmTb  TS S 2S S 2S S 2S UR                  S   24   mUUU4S jn
[        UUUU
U	SUSS9u  pUR	                  UR
                  5      nUR                  SS5      R                  5       nX4$ )Nr   c                    > Tb  T[         R                  " U T-  5      -  n Tb  U TU   U   U   U   -   n Tb  U TU   U   S   S   -   n U $ )Nr   )r+   tanh)score	batch_idxhead_idxq_idxkv_idxr   r   r   s        r3   	score_mod)flex_attention_forward.<locals>.score_mod   sm    ejj99E"K	28<UCFKKE Ii0:1=a@@Er5   T)r   
block_mask
enable_gqascale
return_lser    r7   )r]   r"   rG   r   r;   r:   rt   r   )r   r   r   r   r   r   r   r   r   r   r   r   attention_weightsr   s         ``     @r3   flex_attention_forwardr      s     JK.),,#
$!!Q?SYYr]?":; &E &"K *,,U[[9''1-88:K))r5   doge_flex_attentionc                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9   SS	\	R                  S
\\	R                  \	R                  4   S\\	R                     S\\   S\\	R                     S\\	R                  \\	R                     \\\	R                        4   4S jj5       r  SS	\	R                  S\	R                  S\S\\	R                     4S jjrSrU =r$ )DogeAttention   rU   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R&                  " [(        R*                  " UR                  5      5      U l        [        R                  " UR                  U R                  -  UR                  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [3        U R                  UR4                  S9U l        [3        U R                  UR4                  S9U l        g )Nr   g      ࿩biasr1   )r(   r)   rU   r   getattrr0   num_attention_headsr   r   r   r   attention_dropoutkeep_window_sizer   Linearattention_biasq_projk_projv_projr*   r+   zerosAdt_projo_projr%   rms_norm_epsq_normk_normr/   rU   r   r2   s      r3   r)   DogeAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9 & 7 7ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ekk&*D*DEFyy&&68R8RY_YnYn
 ii&&68J8JQWQfQf
 "$--V5H5HI!$--V5H5HIr5   past_key_valuepast_key_values4.58new_nameversionr@   position_embeddingsr   cache_positionr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U R                  U5      R	                  U5      5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  pU R                  UR                  SS5      R                  UR                   S   UR                   S   S5      5      n[        R                  " U R                   ["        R$                  " U5      -  5      R                  SS5      nU R'                  UUU R(                  US9n[+        UU R,                  5      n[.        nU R0                  R2                  S:w  a  [4        U R0                  R2                     nU" U U	U
U4UU R6                  (       d  S	OU R8                  U R:                  S
.UD6u  nnUR                  " / UQSP76 R=                  5       nU R?                  U5      nUU4$ )Nr8   r    r7   )rw   rv   r   r   r   )r@   	dt_statesr   r   eager        )r   r   r   ) rG   r   r   r   viewrt   r   r   r   r   updater   r   r   r+   expr   Fsoftplusprepare_dynamic_maskr   r   r   r   rU   _attn_implementationALL_ATTENTION_FUNCTIONSr   r   r   r   r   )r/   r@   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rv   rw   cache_kwargsr   	attn_maskattention_interfacer   r   s                       r3   rC   DogeAttention.forward	  s^    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J LL""1a(001C1CA1FHZHZ[]H^`bc
	 IIdffqzz)'<<=GGBO	--'!22)	 . 
	 i)B)BC	(?;;++w6"9$++:Z:Z"[$7		%

 %#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r5   r   r   c           	         [         R                  " UR                  5      R                  nUR                  nUSS2SS2SSS24   R	                  SSUR
                  S   S5      nUb  [        U[        5      (       d  UR                  [         R                  :X  aB  UR                  n[         R                  " U[         R                  " SUR                  US9U5      nUR                  USS2SS2SS2SUR
                  S   24   S:g  U5      nUR
                  S   U:  ah  [         R                  " XvUR                  S9n[         R                  " XsSSS	S
9R                  n	UR!                  SU	S5      nUR                  US:H  U5      nU$ )a  
The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

Args:
    hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
    dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
    keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
Nr8   r    r   )rg   r:   r   r:   rg   TF)ro   largestsorted      ?)r+   finfor:   minrq   rG   r]   r"   boolwheretensorrg   masked_fill
zeros_liketopkindicesscatter)
r/   r@   r   r   r   	min_dtyper:   r   active_masktopk_indicess
             r3   r   "DogeAttention.prepare_dynamic_maskB  se   $ KK 3 3488	##aD!m,33M''*B
	 %j.S.S##uzz1%++!&"ELL^=R=RZ_$`bk" "--nQ1F[	XZH[F[=[.\`a.aclmI??2!11**9)JZJZ[K ::irSW`efnnL%--b,DK!--kS.@)LIr5   )r   r   rU   r   r   r   r   r   r   r   r   r   r   r   r   r   NNN)i   N)rJ   rK   rL   rM   r!   r   intr)   r   r+   r   rF   r	   
LongTensorrC   r   rN   rO   rP   s   @r3   r   r      s'   Jz Jhsm J J< %0A6R
 26+/596)||6) #5<<#=>6) !.	6)
 "%6) !!1!126) 
u||Xell3XeELL>Q5RR	S6) S6)x !%15#||# <<# 	#
 !.# #r5   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DogeMLPih  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nr   )r(   r)   rU   r0   intermediate_sizer   r   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr/   rU   r2   s     r3   r)   DogeMLP.__init__i  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r5   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r  r  r  r  )r/   rx   r  s      r3   rC   DogeMLP.forwards  s6    NN4;;t~~a/@#ADLLQRO#ST	r5   )r  rU   r  r  r0   r  r  )rJ   rK   rL   rM   r)   rC   rN   rO   rP   s   @r3   r  r  h  s    0 r5   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DogeCDMoEix  rU   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UR
                     U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        UR                  U l        UR                  U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  S-  SS9U l        [        R,                  " U R                  U R                  5      U l        [        R,                  " U R                  U R                  5      U l        g )Nr   r7   F)r(   r)   r0   r  r   r  r  num_expertsmathfloorsqrtnum_keysnum_experts_per_toktop_knorm_topk_probr   r   r  r  r  r  router_gate	Embedding
down_embedup_embedr  s     r3   r)   DogeCDMoE.__init__y  s_   !--!'!9!9V../!--

499T-=-=#>?//
$33 4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRab 99T%5%5t}}q7HuU ,,t'7'79I9IJT%5%5t7G7GHr5   r@   r   c                    UR                   u  p4nU R                  U5      R                  SX4-  S5      nUR                  U R                  SS9u  u  pxu  pUR                  S5      UR                  S5      -   nU	R                  S5      U R                  -  U
R                  S5      -   nUR                  " / UR                   S S QSP76 nUR                  " / UR                   S S QSP76 nUR                  U R                  SS9u  pUR                  SU5      n[        R                  " USS9nU R                  (       a  UUR                  SSS9-  nU R                  U5      nU R                  U5      n[        R                  " UUR                  X4-  SS5      5      R                  X4-  S5      nU R!                  U5      U-  n[        R                  " UR                  X4-  SS5      U5      R                  X4S5      nU R#                  U R!                  U R%                  U5      5      U R'                  U5      -  5      nUU-   nX4$ )Nr7   r8   rn   r   T)ro   r9   r    )rG   r'  r   r  r#  r   r%  gatherr   r   r&  sumr)  r*  r+   r   r  r  r  r  )r/   r@   r   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr  routing_weightsr)  r*  experts_weightsexperts_statess                        r3   rC   DogeCDMoE.forward  s+   
 (--a ((7<<QrR 8E7I7I$--]_7I7`44y''+h.@.@.DD
))"-=	@S@STV@WW__@j&6&6s&;@R@
!&&C(9(9#2(>CC#-??4::2?#F $$R)9:))F322r42HHO __W-
==),,z=3E3EcmUWYZ3[\aabeboqst++o6Ho&:&:3=!R&PRZ[``adoqrt{{4>>-3P'QTXT`T`anTo'op%6++r5   )r  r)  r  r  r0   r  r&  r  r#  r'  r%  r*  r  )rJ   rK   rL   rM   r!   r)   r+   r   rC   rN   rO   rP   s   @r3   r  r  x  s5    Iz I.,||, 
	, ,r5   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9     SS	\	R                  S
\\	R                  \	R                  4   S\\	R                     S\\	R                     S\\   S\\   S\\	R                     S\\   S\\	R"                  \\\	R"                  \	R"                  4      4   4S jj5       rSrU =r$ )DogeDecoderLayeri  rU   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  S9U l        [        XS9U l        [        R                  " [        R                  " UR                  5      5      U l        [        UR                  UR
                  S9U l        UR                  (       d  [!        U5      O
[#        U5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   )rU   r   )r(   r)   hidden_dropoutr%   r0   r   input_layernormr   	self_attnr   r*   r+   r,   input_residualpost_attention_layernormis_moer  r  mlppost_attention_residualr   s      r3   r)   DogeDecoderLayer.__init__  s    $33*6+=+=6CVCVW&fJ ll5::f6H6H+IJ(3F4F4FFL_L_(`%*0--76?Yv=N')||EJJv?Q?Q4R'S$r5   r   r   r   r   r@   r   r   ry   	use_cacher   r   r   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  p[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nUn	U R                  U5      nU R                  U5      n[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nU$ )N)r@   r   r   ry   r   rK  r   r    )
rC  rD  r   r   rB  r   rE  rF  rH  rI  )r/   r@   r   r   ry   r   rK  r   r   residualself_attn_weightss              r3   rC   DogeDecoderLayer.forward  s     !,,];+/>> 	,
' 3)%+)	,
 	,
( 		-3F3FQUQ^Q^_++h6F !55mD/		-3F3FQUQ^Q^_44x?-Or5   )rB  rC  rE  rH  rF  rI  rD  r   )NNNFN)rJ   rK   rL   rM   r!   r   r  r)   r   r+   r   rF   r  r	   r   r   r   FloatTensorrC   rN   rO   rP   s   @r3   r@  r@    s   
Tz 
Thsm 
T 
T %0A6R
 2637+/$)59"||" #5<<#=>" !.	"
 u//0" "%" D>" !!1!12" +," 
u  (51B1BEDUDU1U+V"WW	X" S"r5   r@  c                   r   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\" \SS	9\\S
.rU 4S jrSrU =r$ )DogePreTrainedModeli  rU   modelTr@  r   Fr    )index)r2  r@   
attentionsc                   > [         TU ]  U5        [        U[        5      (       a7  [	        US5      (       a%  UR
                  R                  R                  5         gg[        U[        5      (       an  [	        US5      (       a%  UR                  R                  R                  S5        [	        US5      (       a&  UR                  R                  R                  S5        ggg)zInitialize the weightsr   rE  r   rI  N)r(   _init_weightsr]   r   r\   r   datazero_r@  rE  fill_rI  )r/   r   r2   s     r3   rX  !DogePreTrainedModel._init_weights  s    f%fm,,vs####% $ 011v/00%%**005v899..3399#> : 2r5   rM  )rJ   rK   rL   rM   r!   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r  r@  r   _can_record_outputsrX  rN   rO   rP   s   @r3   rS  rS    sf    &*#+,#4"5 N""&'	;)#
? 
?r5   rS  c                   "  ^  \ rS rSrS\4U 4S jjr\" 5       \       SS\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\	R                     S\\   S\4S jj5       5       rSrU =r$ )	DogeModeli  rU   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   rU   F)r(   r)   pad_token_idpadding_idx
vocab_sizer   r(  r0   embed_tokens
ModuleListrangenum_hidden_layersr@  layersr%   r   normrR   
rotary_embgradient_checkpointing	post_initr   s      r3   r)   DogeModel.__init__  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabBaYf0Bab
   2 28K8KL	-V<&+# 	 cs   C?	input_idsr   ry   r   inputs_embedsrK  r   r   r   c                    US L US L-  (       a  [        S5      eU(       a  Uc  [        U R                  S9nUc  U R                  U5      nUcD  Ub  UR	                  5       OSn	[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nU R                  R                  c  [        O[        n
U
" U R                  UUUUUS9nUnU R                  X5      nU R                  S U R                  R                    H  nU" U4UUUUUUS.UD6nM     U R!                  U5      n[#        UUS9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsrj  r   r    )rg   )rU   input_embedsr   r   r   ry   )r   r   ry   r   rK  r   )last_hidden_stater   )
ValueErrorr
   rU   rn  get_seq_lengthr+   arangerG   rg   r   sliding_windowr   r   rt  rr  rq  rs  r   )r/   rx  r   ry   r   ry  rK  r   r   past_seen_tokensmask_functionr   r@   r   decoder_layers                  r3   rC   DogeModel.forward  so    -t";<YZZ0*$++>O  --i8M!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;&))+%
 & #oomJ![[)H4;;+H+HIM)	$7*) /#-	 	M J 		-0%++
 	
r5   )rn  ru  rr  rs  rl  rt  rm  )NNNNNNN)rJ   rK   rL   rM   r!   r)   r   r   r   r+   r  r   r	   rQ  r   r   r   r   rC   rN   rO   rP   s   @r3   rh  rh    s    z    151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
 <
  <
r5   rh  gate_logitsr  r#  r%  c                    U b  [        U [        5      (       d  gU S   R                  nU S   R                  n/ n/ nU  GH  n	U	R	                  U5      n	U	R                  USS9u  u  pu  pU
R                  S5      UR                  S5      -   nUR                  S5      U-  UR                  S5      -   nUR                  " / UR                  SS QSP76 nUR                  " / UR                  SS QSP76 nUR                  USS9u  nnUR                  SU5      n[        R                  " USS9nUR                  U5        UR                  U5        GM     [        R                  " USS9n[        R                  " USS9nUcu  UR                  S5      n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      UR                  S   -  n[        R$                  " USS9nGO;UR                  u  nn['        U 5      nUSSS2SS2S4   R)                  UUUU45      R+                  S5      R	                  U5      nUR                  S5      UR-                  5          n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      [        R.                  " U5      -  nUSSS2SS2S4   R)                  UUUU45      R+                  SU5      R	                  U5      n[        R.                  " UU-  SS9[        R.                  " USS9-  n[        R.                  " UU-  5      nUU-  $ )a  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [2, batch_size * sequence_length, num_keys].
    num_experts:
        Number of experts
    num_keys:
        Number of keys
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r8   rn   r   r   )r]   rF   r:   rg   r;   r  r   r   rG   r-  r   r   appendr+   ru   r   	ones_likescatter_add_r>   lenrq   r   r   r.  )r  r  r#  r%  r   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr3  r4  r5  r6  r7  r8  r1  r:  expert_indicesr;  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthrq  expert_attention_mask router_per_expert_attention_maskoverall_losss                                r3   load_balancing_loss_funcr  V  si   @ *[%"@"@N((M ^**N(-00@7H7M7Mh\^7M7_44y''+h.@.@.DD
))"-89;N;Nr;RR__@j&6&6s&;@R@
!&&C(9(9#2(>CC(ooeo<$++B0@A))JB7!!.1""?3! )" #51=))$7Q?/44R8!KKQ_`oo0n]-::1>PRUVYkYqYqrsYtt "',?Q!G&4&:&:#
O, 4At+,V&
OUKLWR[R	 	 044R89N9S9S9UV "KKQ_`oo0n]-::1>PRUVY^YbYb!Z
 
 4At+,V&
O[QRWR%R	 	) "'+>Aa+agh!ilqlulu,!m
 "
 99.1GGHL+%%r5   c                   ~  ^  \ rS rSrS/rSS0rSS/S/40rU 4S jr\\	          SS\
\R                     S	\
\R                     S
\
\R                     S\
\   S\
\R                     S\
\R                     S\
\   S\
\R                     S\\\R                  4   S\
\   S\\   S\4S jj5       5       rSrU =r$ )DogeForCausalLMi  zlm_head.weightlm_headcolwise_repr@   logitsc                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l
        UR                  U l        U R                  5         g )NFr   )r(   r)   rh  rT  rm  r   r   r0   r  router_aux_loss_coefr  r$  rv  r  s     r3   r)   DogeForCausalLM.__init__  s     v&
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r5   rx  r   ry   r   ry  labelsrK  r   logits_to_keepoutput_router_logitsr   r   c                    U
b  U
OU R                   R                  n
U R                  " SUUUUUUUS.UD6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 UD6nSnU
(       a  [        UR                  U R                  [        R                  " [        R                  " U R                  5      5      U R                   U5      nUb+  UU R"                  UR%                  UR&                  5      -  -  n[)        UUUUR*                  UR,                  UR.                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DogeForCausalLM

>>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
>>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)rx  r   ry   r   ry  rK  r   )lossaux_lossr  r   r@   rV  r2  rM  )rU   r  rT  r|  r]   r  slicer  loss_functionrm  r  r2  r  r   r!  r"  r$  r  r;   rg   r   r   r@   rV  )r/   rx  r   ry   r   ry  r  rK  r   r  r  r   outputsr@   slice_indicesr  r  r  s                     r3   rC   DogeForCausalLM.forward  sm   N %9$D $++JjJj 	
 +/** 	+
)%+')	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  

499T%5%567((H !11HKK4LLL(#33!//))!//
 	
r5   )r  rT  r  r$  r  rm  )
NNNNNNNNr   N)rJ   rK   rL   rM   _tied_weights_keys_tp_plan_pp_planr)   r   r   r   r+   r  r   r	   rQ  r   r   r  r   r   r   rC   rN   rO   rP   s   @r3   r  r    sO   *+=)H_-z:;H
  151537+/59-1$(5934/3Q
E,,-Q
 !.Q
 u//0	Q

 "%Q
   1 12Q
 ))*Q
 D>Q
 !!1!12Q
 c5<</0Q
 'tnQ
 +,Q
 
#Q
  Q
r5   r  c                       \ rS rSrSrg)DogeForSequenceClassificationi(  rM  N)rJ   rK   rL   rM   rN   rM  r5   r3   r  r  (  s    r5   r  )r  rh  rS  r  )Nr    )r   r  )NNr7   N)Kr   typingr   r   r   r+   torch.nn.functionalr   r   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   integrations.flex_attentionr   masking_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_doger!   !torch.nn.attention.flex_attentionr"   Moduler%   rR   r   r   r   r  r   rp   r   rF   r   r   r   r  r  r@  rS  rh  r  r  r  __all__rM  r5   r3   <module>r     s9  0  , ,     ! . ) 7 J R [ Q K A & g g 0 ? *  !!; Y'J")) J (J(!<")) !<H(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%@  $#(,.*II.*<<.* 
.* <<	.*
 %,,34.* e_.* e_.* %.* 5<<%&.*b -. 1G - .{BII {|bii  6,		 6,r01 0f ?/ ? ?< O
# O
 O
h "&"-1g&u||U5<<%8$>?g&#g& smg& 	g&
 U\\*g& 5<<g&T d
)? d
 d
N	$DFY 	 cr5   