
    oiH                        S SK JrJr  S SKrS SKJr  S SKJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJr  S SKJrJrJr  S S	KJrJr  S S
KJrJr  S\S\4S jrS\4S jr S4S\S\S\4S jjr\ " S S5      5       r\ " S S5      5       r S\S\S\!S\!4S jr"S\S\S\!S\!4S jr#S\S\S\!S\!4S jr$S\S\S\!S\!4S jr% S5S\RL                  S\RL                  S\!S\!S\S \4S! jjr' S5S"\ S#\ S\!S\!S \4
S$ jjr( S5S"\ S#\ S\!S\!S \4
S% jjr) S5S"\S#\S\!S\!S \4
S& jjr* S5S'\S(\S)\S\!S\!S \4S* jjr+S5S+\RX                  S,\RL                  S-\4S. jjr-S+\RX                  S/\RL                  S0\RL                  S,\RL                  4S1 jr. " S2 S3\5      r/g)6    )	dataclassfieldsN)HfApi)_safetensors)Qwen3MoeConfig)Qwen3MoeSparseMoeBlock)grouped_gemm)KernelConfigBackward_dWKernelConfigBackward_dXKernelConfigForward)GroupedGEMMResultQwen3MoeGroupedGEMMBlock)permute	unpermute	moe_blockconfigc                 H   UR                   nUR                  nUR                  nU R                  S   R                  R
                  R                  nU R                  S   R                  R
                  R                  n[        R                  " X$X5US9n[        R                  " X$X5US9n[        R                  " X#XEUS9n	[        U R                  5       H  u  pXz   R                  UR                  R
                  R                  5        X   R                  UR                  R
                  R                  5        X   R                  UR                  R
                  R                  5        M     [        U R                  5       H  u  p[        R                  R!                  Xz   5      UR                  l        [        R                  R!                  X   5      UR                  l        [        R                  R!                  X   5      UR                  l        M     XxU	4$ )Nr   )devicedtype)num_expertshidden_sizemoe_intermediate_sizeexperts	down_projweightr   r   torchempty	enumeratecopy_up_projdata	gate_projnn	Parameter)r   r   r   r   interm_sizer   r   	buffer_upbuffer_gatebuffer_downiexperts               ]/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/moe/tests/moe_utils.pyrebind_experts_to_shared_bufferr,      s    $$K$$K..Kq!++2299Fa **1177E+I +++K +++K
 y001	6>>00556V--4499:V--4499: 2 y001	 % 2 29< @"'(("4"4[^"D"'(("4"4[^"D 2
 ;..    model_idc                 P    [        5       nUR                  U 5      nUR                  $ N)r   get_safetensors_metadatafiles_metadata)r.   apimetadatas      r+   get_expert_metadatar5   <   s*    
'C585Q5Q6H """r-   copyc                    [         R                  " UR                  UR                  UR                  5      n[         R                  " UR                  UR                  UR                  5      n[         R                  " UR                  UR                  UR                  5      n[        U R                  5       H  u  pgX6   R                  UR                  R                  R                  5        XF   R                  UR                  R                  R                  5        XV   R                  UR                  R                  R                  5        M     XTU4$ r0   )r   r   r   r   r   r   r   r   r   r   r!   r    r"   )r   r   r6   
down_projsup_projs
gate_projs
expert_idxr*   s           r+   clone_expertsr<   D   s    F..0L0LJ {{F88&:L:LH F88&:L:LJ (	(9(9:
$$V%5%5%<%<%A%AB""6>>#8#8#=#=>$$V%5%5%<%<%A%AB ; ++r-   c                   ~    \ rS rSr% \R
                  \S'   \R
                  \S'   \R
                  \S'   Sr\\S'   Sr	g)ForwardResultW   outputrouter_logitsXNgrouped_gemm_result )
__name__
__module____qualname____firstlineno__r   Tensor__annotations__rC   r   __static_attributes__rD   r-   r+   r>   r>   W   s+    LL<<||O-1*1r-   r>   c                       \ rS rSr% \R
                  \S'   \R
                  \S'   \R
                  \S'   \R
                  \S'   \R
                  \S'   Srg)	BackwardResult`   X_grad	gate_gradgate_proj_gradup_proj_graddown_proj_gradrD   N)rE   rF   rG   rH   r   rI   rJ   rK   rD   r-   r+   rM   rM   `   s4    LL||LL ,,LL r-   rM   grouped_gemm_blockatolrtolc                    [        U R                  5       H  u  pEUR                  R                  R                  nUc   eUR                  R                  U   nUc   eXg-
  R                  5       R                  5       n[        R                  " XgX#S9(       a  M  [        SU SUR                  5       R                  5       R                  5       S 35        M     g )NrU   rV   expert z down_proj_grad_diff: .6f)r   r   r   r   gradabsmaxr   allcloseprintdetachcpuitem)	r   rT   rU   rV   r)   r*   ref_grad	test_graddiffs	            r+   check_down_proj_gradrf   i   s     y001	##**//###&0055a8	$$$$))+//1~~h$LLGA34T[[]5F5F5H5M5M5OPS4TUV 2r-   c                 <   UR                   n[        U R                  5       GH  u  pVUR                  R                  R
                  nUR                  R                  R
                  nUc   eUc   eUR                  R
                  US U24   n	UR                  R
                  XTS 24   n
U	c   eU
c   eUR                  U	R                  :X  d    UR                   SU	R                   35       eUR                  U
R                  :X  d    UR                   SU
R                   35       eXy-
  R                  5       R                  5       n[        R                  " XyX#S9(       d<  [        SU SUR                  5       R                  5       R!                  5       S 35        X-
  R                  5       R                  5       n[        R                  " XX#S9(       a  GM  [        SU SUR                  5       R                  5       R!                  5       S 35        GM     g )N != rX   rY   z gate_proj_grad_diff: rZ   z up_proj_grad_diff: )r   r   r   r"   r   r[   r    gate_up_projshaper\   r]   r   r^   r_   r`   ra   rb   )r   rT   rU   rV   r   r)   r*   ref_gate_proj_gradref_up_proj_gradtest_gate_proj_gradtest_up_proj_gradre   s               r+   check_gate_up_proj_gradro   y   s    /DDy001	#--4499!>>0055!---+++ 1==BB%%%%
 /;;@@%%
 #... ,,, $$(;(A(AA	H &&'t,?,E,E+FG	HA ""&7&=&==	D$$%T*;*A*A)BC	D= #8==?CCE~~D
 GA34T[[]5F5F5H5M5M5OPS4TUV 499;??A~~
 
 GA324;;=3D3D3F3K3K3Mc2RSTE 2r-   c                 l   U R                   R                  R                  nUc   eUR                   R                  nUc   eXE-
  R                  5       R	                  5       n[
        R                  " XEX#S9(       d:  [        SUR                  5       R                  5       R                  5       S 35        g g )NrX   zgate_grad_diff: rZ   )gater   r[   r\   r]   r   r^   r_   r`   ra   rb   )r   rT   rU   rV   rc   rd   re   s          r+   check_gate_gradrr      s     ~~$$))H"'',,I    %%'++-D>>(dH !2!2!4!9!9!;C @AB Ir-   c                 L    [        XX#5        [        XX#5        [        XX#5        g r0   )rf   ro   rr   )r   rT   rU   rV   s       r+   check_wgradrt      s"     CI4FI4>r-   X_refX_testnameverbosec                 j   X-
  R                  5       R                  5       nU(       a;  [        U SUR                  5       R	                  5       R                  5       S 35        [        R                  " XX#S9(       d7   U SUR                  5       R	                  5       R                  5       S 35       eg )N diff: rZ   rX   )r\   r]   r_   r`   ra   rb   r   r^   )ru   rv   rU   rV   rw   rx   re   s          r+   check_tensor_allcloser{      s     N!%%'Dgdkkm//1668=>?>>d 8
wt{{}((*//1#678 r-   
ref_resulttest_resultc           	         [        [        5       Vs/ s H!  nSUR                  ;   d  M  UR                  PM#     nn[        U5      S:X  d   eU GH  n[	        X5      n[	        X5      n	UR
                  U	R
                  :X  d#   U SUR
                   SU	R
                   35       e[        UR
                  S   5       H  n
X   nX   nX-
  R                  5       R                  5       n[        R                  " XX#S9(       a  MG   U SU
 SUR                  5       R                  5       R                  5       S	 35       e   X-
  R                  5       R                  5       nU(       a;  [        U S
UR                  5       R                  5       R                  5       S	 35        [        R                  " XX#S9(       a  GMq   U S
UR                  5       R                  5       R                  5       S	 35       e   g s  snf )Nproj   z: rh   r   rX   [z] diff: rZ   rz   )r   rM   rw   lengetattrrj   ranger\   r]   r   r^   r`   ra   rb   r_   )r|   r}   rU   rV   rx   ffields_to_checkfield	ref_grads
test_gradsr)   rc   rd   re   s                 r+   check_expert_gradsr      s    (.n'=R'=!166AQvqvv'=OR1$$$ J.	[0
OOz///	?WBy'tJ,<,<+=>	?/ yq)*A |H"I(--/335D>>D  F!HT[[]%6%6%8%=%=%?$DEF 	 + &++-113UG74;;=#4#4#6#;#;#=c"BCD~~$
 
 	=WGDKKM--/446s;<	= 
) ! Ss
   G:G:c                     [        U R                  UR                  X#SU5        [        U R                  UR                  X#SU5        [        XX#U5        g )NzX.gradz	gate.grad)r{   rO   rP   r   )r|   r}   rU   rV   rx   s        r+   check_gradsr      sR     ;--t8W k33Tg zGDr-   c                     U R                   nUR                   nXV-
  R                  5       R                  5       nU(       a9  [        SUR	                  5       R                  5       R                  5       S 35        [        R                  " XVX#S9(       d5   SUR	                  5       R                  5       R                  5       S 35       eU R                  nUR                  n	X-
  R                  5       R                  5       nU(       a9  [        SUR	                  5       R                  5       R                  5       S 35        [        R                  " XX#S9(       d5   SUR	                  5       R                  5       R                  5       S 35       eg )Nzoutput diff: rZ   rX   zrouter_logits diff: )
r@   r\   r]   r_   r`   ra   rb   r   r^   rA   )
r|   r}   rU   rV   rx   
ref_outputtest_outputre   ref_router_logitstest_router_logitss
             r+   	check_fwdr     sM    ""J$$K$))+//1Ddkkm//1668=>?>> 8	t{{}((*//1#678 
 #00$222779==?D$T[[]%6%6%8%=%=%?$DEF>>d ?	dkkm//1668=>? r-   grouped_resultfused_result	permute_yc                 F   [        [        5       GH  n[        XR                  5      n[        XR                  5      nXx-
  R	                  5       R                  5       n	UR                  S:X  a	  U(       a  Mg  U(       aE  [        UR                   SU	R                  5       R                  5       R                  5       S 35        [        R                  " XxX4S9(       a  M   UR                   SU	R                  5       R                  5       R                  5       S 35       e   g )Nsecond_gemmrz   rZ   rX   )r   r   r   rw   r\   r]   r_   r`   ra   rb   r   r^   )
r   r   r   rU   rV   rx   r   	ref_value
test_valuere   s
             r+   check_grouped_gemm_resultsr     s     )*NJJ7	\::6
&++-113 ::&9UZZL(9(9(;(@(@(B3'GHI~~$
 
 	Bjj\!2!2!4!9!9!;C @A	B 
 +r-   modelrB   is_grouped_gemmc                     UR                  5       R                  5       R                  S5      nU " U5      u  p4U(       a  [        UR                  UUUS9nU$ [        X4US9nU$ )NT)r@   rA   rB   rC   )r@   rA   rB   )r`   clonerequires_grad_r>   hidden_states)r   rB   r   r@   rA   results         r+   run_forwardr   8  sg    	
))$/A!!HF)))"(	
 M STUMr-   grad_outputr@   c                 .   UR                  U5        UR                  c   eU R                  5        H  u  pEUR                  b  M   U S35       e   [        U [        5      (       Ga  U R
                  R                  R                  n[        R                  " U R                   Vs/ s H"  owR                  R                  R                  PM$     sn5      n[        R                  " U R                   Vs/ s H"  owR                  R                  R                  PM$     sn5      n	[        R                  " U R                   Vs/ s H"  owR                  R                  R                  PM$     sn5      n
O[        U [        5      (       aS  U R
                  R                  nU R                  R                  R                  SSS9u  pU R                  R                  n
O[!        S[#        U 5       35      e[%        UR                  UUU	U
S9$ s  snf s  snf s  snf )Nz grad is None      dimzUnsupported model type: )rO   rP   rQ   rR   rS   )backwardr[   named_parameters
isinstancer   rq   r   r   stackr   r"   r    r   r   ri   chunk
ValueErrortyperM   )r   r   r@   rB   rw   paramrP   r*   rQ   rR   rS   s              r+   run_backwardr   G  s    OOK 66--/zz%=$}'==% 0%/00JJ%%**	8=Ff$$))F
 {{6;mmDmF^^""''mD
 8=Ff$$))F
 
E3	4	4JJOO	','9'9'>'>'D'DQa'D'P$--3DK=ABB'#'  G E Gs   %)H3)H)Hc                   &  ^  \ rS rSrSr      SS\S\R                  S\R                  S\R                  S\S\S	\S
\	S\
S\4U 4S jjjr\      SS\S\S\S	\S
\	S\
S\4S jj5       rSS\R                  S\S\R                  4S jjrSrU =r$ )Qwen3MoeFusedGroupedGEMMBlockih  a  
Reference implementation of MoE block using grouped gemm.

This is the same as the Qwen3MoeGroupedGEMMBlock but with triton grouped gemm in place of torch-native grouped gemm implementation.

NOTE: This is NOT to be used for production as it contains many extra checks and saves all intermediate results for debugging.
See grouped_gemm/reference/moe_block.py for a cleaner implementation.
r   rq   ri   r   	permute_xr   autotunekernel_config_fwdkernel_config_bwd_dWkernel_config_bwd_dXc                    > [         TU ]  XX45        XPl        X`l        Xpl        U(       d  Ub  U	b  U
c   S5       eXl        Xl        Xl        g )Nz4Kernel configs must be provided if autotune is False)super__init__r   r   r   r   r   r   )selfr   rq   ri   r   r   r   r   r   r   r   	__class__s              r+   r   &Qwen3MoeFusedGroupedGEMMBlock.__init__r  sc     	|?"" !-(4(4F F	F5 "3$8!$8!r-   r   c                     UR                   S   R                  n[        R                  " U5      u  pnU " UU	U
UUUUUUUS9
$ )Nr   )r   r   r   r   r   r   )r   r   r   extract_hf_weights)clsr   r   r   r   r   r   r   r   rq   ri   r   s               r+   from_hf%Qwen3MoeFusedGroupedGEMMBlock.from_hf  s_     "+!2!21!5!<!<(@(S(S)
%I !! 1#7#7
 	
r-   r   debugreturnc                    UR                   u  p4nX4-  nX`R                  -  nUR                  SU5      nU R                  U5      u  pn
U R	                  U
5      u  pU R
                  (       d)  [        XU R                  5      nUR                   Xu4:X  d   e[        UU R                  UUU R                  U R
                  SU R                  U R                  U R                  U R                  SS9nUR                   USU R                  -  4:X  d   eU R                  U5      nUR                   XpR                  4:X  d   e[        UU R                  UUU R                  SU R                   U R                  U R                  U R                  U R                  SS9nUR                   Xu4:X  d   eU R                   (       d  [#        X5      nUR                   Xu4:X  d   eOUnUR                  X`R                  U5      U	S   -  nUR%                  SS9nUR                   Xe4:X  d   eUR                  X4U5      n['        UUU	UUUUUS	9U4$ )
NFT)rB   Wm_sizesgather_indicestopkr   r   r   r   r   r   is_first_gemmr   ).Nr   r   )token_counts_by_expertr   topk_weights
first_gemmintermediater   hidden_states_unpermuter   )rj   top_kview
run_router#get_token_counts_and_gather_indicesr   r   r	   ri   r   r   r   r   r   act_and_mulr   r   r   sumr   )r   r   r   
batch_sizesequence_length
hidden_dim
num_tokenstotal_tokensrA   routing_weightsselected_expertsr   r   r   r   r   r   s                    r+   forward%Qwen3MoeFusedGroupedGEMMBlock.forward  sn   2?2E2E/
Z1
!JJ.%**2z:;???<
8(8 445EF 	/
 ~~#M4::NM &&<*DDDD "!!,+::}} $ 6 6#'#<#<#'#<#< 

 L!d6P6P2P#QQQQ''
3!!l4N4N%OOOO",+::}} $ 6 6#'#<#<#'#<#<!
   \$>>>> ~~&/&L#*00\4NNNN&1# $((ZZLi() 	 &)))2""z&>>>>%**:
S %;+*#'%&=)	
 	 		r-   )r   r   r   r   r   r   )FFTNNNF)rE   rF   rG   rH   __doc__r   r   rI   boolr   r
   r   r   classmethodr   r   r   rK   __classcell__)r   s   @r+   r   r   h  s,     158<8<99 ll9 ll	9
 <<9 9 9 9 /9 69 69 96   158<8<
)
 
 	

 
 /
 6
 6
 
6RU\\ R$ R5<< R Rr-   r   )Tr   )0dataclassesr   r   r   torch.nnr#   huggingface_hubr   huggingface_hub.utilsr   5transformers.models.qwen3_moe.configuration_qwen3_moer   0transformers.models.qwen3_moe.modeling_qwen3_moer   grouped_gemm.interfacer	   grouped_gemm.kernels.tuningr
   r   r   'grouped_gemm.reference.layers.qwen3_moer   r   grouped_gemm.reference.moe_opsr   r   r,   strr5   r   r<   r>   rM   floatrf   ro   rr   rt   rI   r{   r   r   r   r   Moduler   r   r   rD   r-   r+   <module>r      s	   *   ! . P S / 
 >/%//=/D## # MQ,%,/=,EI,& 2 2 2 ! ! !W%W0W W 	W )U%)U0)U )U 	)UXC%C0C C 	C?%?0? ? 	?" 8<<8LL8 8 	8
 8 8*  = = =  = 	 =
  =P EEE E 	E
 E* ??? ? 	?
 ?D B%B#B B 	B
 B B4ryy U\\ D 99#(<<9>JO,,BS$< Sr-   