
    cCi                     &   S r SSKrSSKJrJrJr  SSKrSSKJs  J	r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*J+r+J,r,J-r-J.r.  SSK/J0r0J1r1  \!" 5       (       a  SSK2J3r3   " S S\5      r4 " S S\*5      r5 " S S\+5      r6   S>S\Rn                  S\Rp                  S\Rp                  S\Rp                  S \\Rp                  S!4   S"\\9   S#\\9   S$\\Rp                     S%\:\Rp                  \Rp                  4   4S& jjr;\" 5       r<\;\<S''    " S( S)\Rn                  5      r= " S* S+\(5      r> " S, S-\Rn                  5      r? " S. S/\5      r@ " S0 S1\)5      rA " S2 S3\15      rB    S?S4\\Rp                  \:\Rp                     S4   S5\\C   S6\\C   S7\CS \\Rp                     S%\\Rp                  \C4   4S8 jjrD " S9 S:\05      rE " S; S<\'5      rF/ S=QrGg)@zPyTorch Doge model.    N)CallableOptionalUnion)nn   )ACT2FN)Cache)PretrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)rope_config_validation)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsis_torch_flex_attn_available)deprecate_kwarg)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskc                      ^  \ rS rSrSrSrS/r0 SS_SS_SS_S	S
_SS
_SS_SS_SS_SS_SS_SS_SS_SS
_SS_SS_SS_rS/S/4SS/S/4S/S/4S.r                          S!U 4S jjr	S r
U =r$ )"
DogeConfig7   a4  
This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 32768):
        Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer decoder.
    hidden_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for each sequence transformation and state transformation module.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether the model's input and output word embeddings should be tied.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings.
        NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
        Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'.
                The original max position embeddings used during pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation.
                If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
                Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention.
        If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
        When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
        For more details checkout [this paper](https://huggingface.co/papers/2305.13245).
        If it is not specified, will default to `num_attention_heads`.
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
    sliding_window (`int`, *optional*):
        Sliding window attention window size. If not specified, will default to `None`.
    keep_window_size (`int`, *optional*, defaults to 2048):
        The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    is_moe (`bool`, *optional*, defaults to `False`):
        Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize.
    num_experts (`int`, *optional*, defaults to 16384):
        Number of routed experts in the model. This is only used when `is_moe=True`.
    num_experts_per_tok (`int`, *optional*, defaults to 64):
        Number of selected experts to route per-token.
    norm_topk_prob (`bool`, *optional*, defaults to `False`):
        Whether to normalize the topk probabilities.
    output_router_logits (`bool`, *optional*, defaults to `False`):
        Whether or not the router logits should be returned by the model. Enabling this will also
        allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
    router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
        The aux loss factor for the total loss.

```python
>>> from transformers import DogeConfig, DogeModel

>>> # Initializing a Doge-320M style configuration
>>> configuration = DogeConfig()

>>> # Initializing a model from the Doge-320M style configuration
>>> model = DogeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```dogepast_key_valueszlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projrowwisezlayers.*.self_attn.o_projzlayers.*.input_layernorm.weightsequence_parallelzlayers.*.input_residual.weightz(layers.*.post_attention_layernorm.weightz'layers.*.post_attention_residual.weightznorm.weightzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatecolwise_repzlayers.*.mlp.down_embedrowwise_repzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                   > Xl         X l        X0l        X@l        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xl        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        U R                  b,  SU R                  ;   a  U R                  S   U R                  S'   [3        U 5        Uc  Xl        [4        TU ]l  " SSU
0UD6  g )Ntype	rope_typetie_word_embeddings )
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachemax_position_embeddings
rope_thetarope_scalingnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefr   super__init__)selfr9   r:   r;   r<   r=   r>   r?   r@   rA   r7   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   kwargs	__class__s                               _/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/doge/modular_doge.pyrS   DogeConfig.__init__   s   < %&!2!2,$!2("'>$$(#6 #6 ,!2 , 0&#6 ,$8!$8! (Vt7H7H-H-1->->v-FDk*t$ &':$ 	
 3	
	
    )rG   rH   r>   r=   r:   r?   r;   rL   rK   rB   rI   rO   rE   rM   rN   r<   rF   rP   r@   rD   rC   rQ   rJ   rA   r9   )i   i                  silug{Gz?gư>TFrZ   g     @N   NFr\   FNrZ   Fi @  @   FFgMbP?)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrS   __static_attributes____classcell__rV   s   @rW   r$   r$   7   su   n` J#4"5#Y#Y 	$Y 	%i	
 	$Y 	*+> 	)*= 	34G 	23F 	* 	!) 		 	!) 	#M 	"=  	 !& &(9:#%568IJ!"_$56 ! $ ""7G
 G
rY   r$   c                       \ rS rSrSrg)DogeRMSNormi  r8   Nr`   ra   rb   rc   ri   r8   rY   rW   rm   rm         rY   rm   c                       \ rS rSrSrg)DogeRotaryEmbeddingi  r8   Nrn   r8   rY   rW   rq   rq     ro   rY   rq   modulequerykeyvaluer0   r"   scalingsoftcap	head_maskreturnc                 6  ^^^ S n	S m[        U[        5      (       a  Un	OUmTb  TS S 2S S 2S S 2S UR                  S   24   mUUU4S jn
[        UUUU
U	SUSS9u  pUR	                  UR
                  5      nUR                  SS5      R                  5       nX4$ )Nc                    > Tb  T[         R                  " U T-  5      -  n Tb  U TU   U   U   U   -   n Tb  U TU   U   S   S   -   n U $ )Nr   )torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskrx   rw   s        rW   	score_mod)flex_attention_forward.<locals>.score_mod*  sm    ejj99E"K	28<UCFKKE Ii0:1=a@@ErY   T)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer"   shaper   todtype	transpose
contiguous)rr   rs   rt   ru   r0   rv   rw   rx   rU   r   r   attn_outputattention_weightsr   s         ``     @rW   flex_attention_forwardr     s     JK.),,#
$!!Q?SYYr]?":; &E &"K *,,U[[9''1-88:K))rY   doge_flex_attentionc                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9   SS	\	R                  S
\\	R                  \	R                  4   S\\	R                     S\\   S\\	R                     S\\	R                  \\	R                     \\\	R                        4   4S jj5       r  SS	\	R                  S\	R                  S\S\\	R                     4S jjrSrU =r$ )DogeAttentioniJ  config	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R&                  " [(        R*                  " UR                  5      5      U l        [        R                  " UR                  U R                  -  UR                  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [3        U R                  UR4                  S9U l        [3        U R                  UR4                  S9U l        g )Nhead_dimg      ࿩biaseps)rR   rS   r   r   getattrr:   rE   r   rF   num_key_value_groupsrv   rH   rK   r   LinearrG   q_projk_projv_proj	Parameterr}   zerosAdt_projo_projrm   r@   q_normk_normrT   r   r   rV   s      rW   rS   DogeAttention.__init__K  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9 & 7 7ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ekk&*D*DEFyy&&68R8RY_YnYn
 ii&&68J8JQWQfQf
 "$--V5H5HI!$--V5H5HIrY   past_key_valuer'   4.58new_nameversionr/   position_embeddingsr0   cache_positionry   c                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U R                  U5      R	                  U5      5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  pU R                  UR                  SS5      R                  UR                   S   UR                   S   S5      5      n[        R                  " U R                   ["        R$                  " U5      -  5      R                  SS5      nU R'                  UUU R(                  US9n[+        UU R,                  5      n[.        nU R0                  R2                  S:w  a  [4        U R0                  R2                     nU" U U	U
U4UU R6                  (       d  S	OU R8                  U R:                  S
.UD6u  nnUR                  " / UQSP76 R=                  5       nU R?                  U5      nUU4$ )Nr   r   )sincosr   r   r{   )r/   	dt_statesrK   r0   eagerr\   )r0   dropoutrv   ) r   r   r   r   viewr   r   r   r   r   updater   r   reshaper}   expr   Fsoftplusprepare_dynamic_maskrK   r   r   r   r   _attn_implementationALL_ATTENTION_FUNCTIONStrainingrH   rv   r   r   )rT   r/   r   r0   r'   r   rU   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   	attn_maskattention_interfacer   attn_weightss                       rW   forwardDogeAttention.forwardi  s^    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ &#&nUL'6'='=jX\XfXfht'u$J LL""1a(001C1CA1FHZHZ[]H^`bc
	 IIdffqzz)'<<=GGBO	--'!22)	 . 
	 i)B)BC	(?;;++w6"9$++:Z:Z"[$7		%

 %#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((rY   r   rK   c           	         [         R                  " UR                  5      R                  nUR                  nUSS2SS2SSS24   R	                  SSUR
                  S   S5      nUb  [        U[        5      (       d  UR                  [         R                  :X  aB  UR                  n[         R                  " U[         R                  " SUR                  US9U5      nUR                  USS2SS2SS2SUR
                  S   24   S:g  U5      nUR
                  S   U:  ah  [         R                  " XvUR                  S9n[         R                  " XsSSS	S
9R                  n	UR!                  SU	S5      nUR                  US:H  U5      nU$ )a  
The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

Args:
    hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
    dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
    keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
Nr   r   r\   )devicer   r   r   r   TF)dimlargestsorted      ?)r}   finfor   minexpandr   r   r"   boolwheretensorr   masked_fill
zeros_liketopkindicesscatter)
rT   r/   r   rK   r0   	min_dtyper   r   active_masktopk_indicess
             rW   r   "DogeAttention.prepare_dynamic_mask  se   $ KK 3 3488	##aD!m,33M''*B
	 %j.S.S##uzz1%++!&"ELL^=R=RZ_$`bk" "--nQ1F[	XZH[F[=[.\`a.aclmI??2!11**9)JZJZ[K ::irSW`efnnL%--b,DK!--kS.@)LIrY   )r   rH   r   r   r   r   r   rK   r   r   r   r   r   rv   r   NNNN)rZ   N)r`   ra   rb   rc   r$   r   intrS   r   r}   Tensortupler	   
LongTensorr   r   ri   rj   rk   s   @rW   r   r   J  s'   Jz Jhsm J J< %0A6R
 26+/596)||6) #5<<#=>6) !.	6)
 "%6) !!1!126) 
u||Xell3XeELL>Q5RR	S6) S6)x !%15#||# <<# 	#
 !.# #rY   r   c                       \ rS rSrSrg)DogeMLPi  r8   Nrn   r8   rY   rW   r   r     ro   rY   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DogeCDMoEi  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UR
                     U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        UR                  U l        UR                  U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  S-  SS9U l        [        R,                  " U R                  U R                  5      U l        [        R,                  " U R                  U R                  5      U l        g )Nr   r   F)rR   rS   r:   r;   r   r>   act_fnrM   mathfloorsqrtnum_keysrN   top_krO   r   r   rI   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedrT   r   rV   s     rW   rS   DogeCDMoE.__init__  s_   !--!'!9!9V../!--

499T-=-=#>?//
$33 4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRab 99T%5%5t}}q7HuU ,,t'7'79I9IJT%5%5t7G7GHrY   r/   ry   c                    UR                   u  p4nU R                  U5      R                  SX4-  S5      nUR                  U R                  SS9u  u  pxu  pUR                  S5      UR                  S5      -   nU	R                  S5      U R                  -  U
R                  S5      -   nUR                  " / UR                   S S QSP76 nUR                  " / UR                   S S QSP76 nUR                  U R                  SS9u  pUR                  SU5      n[        R                  " USS9nU R                  (       a  UUR                  SSS9-  nU R                  U5      nU R                  U5      n[        R                  " UUR                  X4-  SS5      5      R                  X4-  S5      nU R!                  U5      U-  n[        R                  " UR                  X4-  SS5      U5      R                  X4S5      nU R#                  U R!                  U R%                  U5      5      U R'                  U5      -  5      nUU-   nX4$ )Nr   r   r   r{   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxrO   sumr   r   r}   matmulr   r   r   r   )rT   r/   rU   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statess                        rW   r   DogeCDMoE.forward  s+   
 (--a ((7<<QrR 8E7I7I$--]_7I7`44y''+h.@.@.DD
))"-=	@S@STV@WW__@j&6&6s&;@R@
!&&C(9(9#2(>CC#-??4::2?#F $$R)9:))F322r42HHO __W-
==),,z=3E3EcmUWYZ3[\aabeboqst++o6Ho&:&:3=!R&PRZ[``adoqrt{{4>>-3P'QTXT`T`anTo'op%6++rY   )r   r   r   r   r:   r;   rO   rM   r   r   r   r   r   )r`   ra   rb   rc   r$   rS   r}   r   r   ri   rj   rk   s   @rW   r   r     s5    Iz I.,||, 
	, ,rY   r   c                     ^  \ rS rSrSS\S\\   4U 4S jjjr\" SSSS9     SS	\	R                  S
\\	R                  \	R                  4   S\\	R                     S\\	R                     S\\   S\\   S\\	R                     S\\   S\\	R"                  \\\	R"                  \	R"                  4      4   4S jj5       rSrU =r$ )DogeDecoderLayeri  r   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  S9U l        [        XS9U l        [        R                  " [        R                  " UR                  5      5      U l        [        UR                  UR
                  S9U l        UR                  (       d  [!        U5      O
[#        U5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   )r   r   )rR   rS   r=   rm   r:   r@   input_layernormr   	self_attnr   r   r}   onesinput_residualpost_attention_layernormrL   r   r   mlppost_attention_residualr   s      rW   rS   DogeDecoderLayer.__init__  s    $33*6+=+=6CVCVW&fJ ll5::f6H6H+IJ(3F4F4FFL_L_(`%*0--76?Yv=N')||EJJv?Q?Q4R'S$rY   r   r'   r   r   r/   r   r0   position_idsrA   r   rU   ry   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  p[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nUn	U R                  U5      nU R                  U5      n[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nU$ )N)r/   r   r0   r%  r'   rA   r   )pr   r8   )
r  r  r   r   r=   r   r   r!  r"  r#  )rT   r/   r   r0   r%  r'   rA   r   rU   residualself_attn_weightss              rW   r   DogeDecoderLayer.forward  s     !,,];+/>> 	,
' 3)%+)	,
 	,
( 		-3F3FQUQ^Q^_++h6F !55mD/		-3F3FQUQ^Q^_44x?-OrY   )r=   r  r   r"  r!  r#  r  r   )NNNFN)r`   ra   rb   rc   r$   r   r   rS   r   r}   r   r   r   r	   r   r   r   FloatTensorr   ri   rj   rk   s   @rW   r  r    s   
Tz 
Thsm 
T 
T %0A6R
 2637+/$)59"||" #5<<#=>" !.	"
 u//0" "%" D>" !!1!12" +," 
u  (51B1BEDUDU1U+V"WW	X" S"rY   r  c                   8    \ rS rSrSrSr\" \SS9\\	S.r
S rSrg)	DogePreTrainedModeli8  Fr   )index)r  r/   
attentionsc                    [         R                  " X5        [        U[        5      (       a7  [	        US5      (       a%  UR
                  R                  R                  5         gg[        U[        5      (       an  [	        US5      (       a%  UR                  R                  R                  S5        [	        US5      (       a&  UR                  R                  R                  S5        ggg)zInitialize the weightsr   r   r   r#  N)r   _init_weightsr   r   hasattrr   datazero_r  r   fill_r#  )rT   rr   s     rW   r1  !DogePreTrainedModel._init_weightsA  s    %%d3fm,,vs####% $ 011v/00%%**005v899..3399#> : 2rY   r8   N)r`   ra   rb   rc   _supports_flash_attn_can_compile_fullgraphr   r   r  r   _can_record_outputsr1  ri   r8   rY   rW   r-  r-  8  s+     "'	;)#
?rY   r-  c                       \ rS rSrSrg)	DogeModeliN  r8   Nrn   r8   rY   rW   r;  r;  N  ro   rY   r;  gate_logitsrM   r   r   c                    U b  [        U [        5      (       d  gU S   R                  nU S   R                  n/ n/ nU  GH  n	U	R	                  U5      n	U	R                  USS9u  u  pu  pU
R                  S5      UR                  S5      -   nUR                  S5      U-  UR                  S5      -   nUR                  " / UR                  SS QSP76 nUR                  " / UR                  SS QSP76 nUR                  USS9u  nnUR                  SU5      n[        R                  " USS9nUR                  U5        UR                  U5        GM     [        R                  " USS9n[        R                  " USS9nUcu  UR                  S5      n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      UR                  S   -  n[        R$                  " USS9nGO;UR                  u  nn['        U 5      nUSSS2SS2S4   R)                  UUUU45      R+                  S5      R	                  U5      nUR                  S5      UR-                  5          n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      [        R.                  " U5      -  nUSSS2SS2S4   R)                  UUUU45      R+                  SU5      R	                  U5      n[        R.                  " UU-  SS9[        R.                  " USS9-  n[        R.                  " UU-  5      nUU-  $ )a  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [2, batch_size * sequence_length, num_keys].
    num_experts:
        Number of experts
    num_keys:
        Number of keys
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r   r  r{   r   )r   r   r   r   r   r   r  r   r   r  r   r  appendr}   catr   	ones_likescatter_add_meanlenr   r   r   r  )r<  rM   r   r   r0   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr  r  r  r  r  r  r  r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthr<   expert_attention_mask router_per_expert_attention_maskoverall_losss                                rW   load_balancing_loss_funcrR  R  si   @ *[%"@"@N((M ^**N(-00@7H7M7Mh\^7M7_44y''+h.@.@.DD
))"-89;N;Nr;RR__@j&6&6s&;@R@
!&&C(9(9#2(>CC(ooeo<$++B0@A))JB7!!.1""?3! )" #51=))$7Q?/44R8!KKQ_`oo0n]-::1>PRUVYkYqYqrsYtt "',?Q!G&4&:&:#
O, 4At+,V&
OUKLWR[R	 	 044R89N9S9S9UV "KKQ_`oo0n]-::1>PRUVY^YbYb!Z
 
 4At+,V&
O[QRWR%R	 	) "'+>Aa+agh!ilqlulu,!m
 "
 99.1GGHL+%%rY   c                   L  ^  \ rS rSrU 4S jr          SS\\R                     S\\R                     S\\R                     S\\	   S\\R                     S\\R                     S	\\   S
\\R                     S\\\R                  4   S\\   S\\   S\4S jjrSrU =r$ )DogeForCausalLMi  c                 f   > [         TU ]  U5        [        U5      U l        UR                  U l        g r   )rR   rS   r;  modelrM   r   s     rW   rS   DogeForCausalLM.__init__  s*     v&
!--rY   r-   r0   r%  r'   r.   labelsrA   r   logits_to_keeprP   rU   ry   c                    U
b  U
OU R                   R                  n
U R                  " SUUUUUUUS.UD6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 UD6nSnU
(       a  [        UR                  U R                  [        R                  " [        R                  " U R                  5      5      U R                   U5      nUb+  UU R"                  UR%                  UR&                  5      -  -  n[)        UUUUR*                  UR,                  UR.                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DogeForCausalLM

>>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
>>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r-   r0   r%  r'   r.   rA   r   )lossaux_losslogitsr'   r/   r/  r  r8   )r   rP   rV  last_hidden_stater   r   slicelm_headloss_functionr9   rR  r  rM   r   r   r   rN   rQ   r   r   r   r'   r/   r/  )rT   r-   r0   r%  r'   r.   rX  rA   r   rY  rP   rU   outputsr/   slice_indicesr]  r[  r\  s                     rW   r   DogeForCausalLM.forward  sm   J %9$D $++JjJj 	
 +/** 	+
)%+')	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  

499T%5%567((H !11HKK4LLL(#33!//))!//
 	
rY   )rV  rM   )
NNNNNNNNr   N)r`   ra   rb   rc   rS   r   r}   r   r   r	   r+  r   r   r   r   r   r   r   ri   rj   rk   s   @rW   rT  rT    s   . 151537+/59-1$(5934/3Q
E,,-Q
 !.Q
 u//0	Q

 "%Q
   1 12Q
 ))*Q
 D>Q
 !!1!12Q
 c5<</0Q
 'tnQ
 +,Q
 
#Q
 Q
rY   rT  c                       \ rS rSrSrg)DogeForSequenceClassificationi  r8   Nrn   r8   rY   rW   rf  rf    ro   rY   rf  )r$   rT  r;  r-  rf  r   )NNr   N)Hrd   r   typingr   r   r   r}   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr	   configuration_utilsr
   integrations.flex_attentionr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.deprecationr   utils.genericr   llama.modeling_llamar   r   r   r   r   r   r   r   mixtral.modeling_mixtralr    r!   !torch.nn.attention.flex_attentionr"   r$   rm   rq   Moduler   floatr   r   r   r   r   r   r  r-  r;  r   rR  rT  rf  __all__r8   rY   rW   <module>r|     sT  "   , ,     !   3 J 9 Q 9 A & E 0 +	 	 	 H  !!;S
! S
l	, 		. 	  $#(,.*II.*<<.* 
.* <<	.*
 %,,34.* e_.* e_.* %.* 5<<%&.*b -. 1G - .{BII {|	h 	6,		 6,r01 0f?. ?,	 	 "&"-1g&u||U5<<%8$>?g&#g& smg& 	g&
 U\\*g& 5<<g&TW
( W
t	$B 	rY   