
    oi	#                         S SK r S SKJrJrJrJr  S SKrS SKrS SKJ	r	  S SK
r
S SKrSSKJrJrJr  SSKJrJr  S r \R(                  " \5        g)    N)UnionListOptionalTuple   )TEMPORARY_PATCHESUNSLOTH_ENABLE_LOGGINGlogger)patch_functionraise_errorc                    ^^^  SSK n SSK Jm  [        R
                  SS.S[        R                  S[        S	[        R                  4U4S
 jjjm[        U R                  R                  ST5          SSK n SSKJm  UU4S jn[        U R                  R                  SU5          SSK n SSK Jm  [        R
                  SS.S[        R                  S[        S	[        R                  4U4S jjjn[        U R                  R                  S5      (       a=  X0R                  R                  l        [         (       a  ["        R$                  " S5        gg[         (       a  ["        R$                  " S5        gg! [         a  n[        SU5      s SnA$ SnAff = f! [         a  n[        SU5      s SnA$ SnAff = f! [         a  n[        SU5      s SnA$ SnAff = f)zg
Pin the original GPU-optimized version of convert_moe_packed_tensors with smaller default chunk size.
r   N)
FP4_VALUESztransformers.integrations.mxfp4i   )dtyperows_per_chunkr   r   returnc                B  > U R                   (       dC  [        R                  R                  5       (       a   U R                  5       n UR                  5       nUR	                  [        R
                  5      S-
  nU R                  SS UR                  :X  d#   SU R                  < SUR                  < 35       e[        R                  " TX R                  S9nU R                  Gt pVn[        R                  " U5      U-  nU R                  X5      n UR                  US5      n[        R                  " XS-  X R                  S9n	[        S	X5       H  n
[        X-   U5      nX
U nXU nUS
-  R	                  [        R                  5      nUS-	  R	                  [        R                  5      nXU nXN   USS2S	SS24'   XO   USS2SSS24'   [        R                   " UUUS9  AAAAAM     U	R                  " / UQUPUS-  P76 R"                  " / UQXg-  S-  P76 n	A AAU	$ )a  
Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
pass of GPT_OSS.

Args:
    blocks: Packed quantized weights
    scales: Quantization scales
    dtype: Output data type
    rows_per_chunk: Number of rows to process per chunk. .
   Nzblocks.shape= does not match scales.shape=r   devicer      r         out)is_cudatorchcudais_availabletoint32shapetensorr   mathprodreshapeemptyrangeminlongldexpviewblocksscalesr   r   lutprefix_shapeGB
rows_totalr   r0r1blkexpidx_loidx_hisubr   s                    ]/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/temporary_patches/mxfp4.pyconvert_moe_packed_tensorsDpatch_convert_moe_packed_tensors.<locals>.convert_moe_packed_tensors%   s   $ ~~%**"9"9";";[[]F[[]F5;;'#-||CR FLL0c]V\\OCaTZT`T`Sb2cc0ll:U==I$ll!YY|,q0

.
A.kk*!e5O:6BR(*5BB-CB-C Dj__UZZ0FQh]]5::.F*C;C14a4L;C14a4LKKSc*S# 7" kk2<22AE277QQquqyQFC
    r>   )shard_and_distribute_modulez*transformers.integrations.mxfp4.dequantizec                 ^  > UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  SS 5      n	UR                  SS 5      n
UR                  SS 5      nS GH9  nX;   d  M  Ub  T" UUUUUU	U
USS	9	nU S
3nU S3n[        XR                  SS5      S   U5        [        X5      (       d  MW  [        X5      (       d  Mi  T" [	        X5      [	        X5      5      nUR                  SS5      R                  5       R                  U5      nUS:X  aA  [        R                  R                  5       (       a  [        R                  R                  5         [        X[        R                  R                  U5      5        [        X5        [        X5        GM<     g )Nmodelempty_paramcasting_dtypeto_contiguousrankdevice_mesh)gate_up_proj	down_projF)	set_param_blocks_scales.r   r   cpu)getsetattrrsplithasattrgetattr	transpose
contiguousr!   r   r   r    empty_cachenn	Parameterdelattr)module
param_nameparam_valuetarget_devicedq_param_namekwargsrC   rD   rE   rF   rG   rH   projblocks_attrscales_attrdequantizedr>   rA   s                   r=   
dequantize4patch_convert_moe_packed_tensors.<locals>.dequantizei   sz   

7D)jj5

?D9

?D9zz&$'jj51D!*"=##%%%#"'
#K "&g.!%g. 1 1#q 9! <kJ6//GF4P4P"<WV=Y[bci[w"xK"-"7"71"="H"H"J"M"Mm"\K$-%**2I2I2K2K

..0F%((*<*<[*IJF0F05 2r@   re   z#transformers.integrations.mxfp4_CPUi   c                   > U R                   (       a  U R                  5       n UR                   (       a  UR                  5       nUR                  [        R                  5      S-
  nU R
                  SS UR
                  :X  d&   SU R
                  SS < SUR
                  < 35       e[        R                  " TUSS9nU R
                  Gt pVn[        R                  " U5      U-  nU R                  X5      n UR                  US5      n[        R                  " XS	-  USS9n	[        S
X5       H  n
[        X-   U5      nX
U nXU nUS-  R                  [        R                  5      nUS-	  R                  [        R                  5      nXU nXN   USS2S
SS	24'   XO   USS2SSS	24'   [        R                  " UUUS9  AAAAAM     U	R                  " / UQUPUS	-  P76 R                  " / UQXg-  S	-  P76 n	A AAU	$ )a  
Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
pass of GPT_OSS. CPU-optimized version with smaller default chunk size.

Args:
    blocks: Packed quantized weights
    scales: Quantization scales
    dtype: Output data type
    rows_per_chunk: Number of rows to process per chunk. CPU-optimized default: 1M rows.
                   Memory usage per chunk (assuming B=128):
                   - 8192: ~22 MB
                   - 1048576 (1M): ~2.6 GB
                   - 33554432 (32M): ~90 GB
r   Nr   zblocks.shape[:-1]=r   rO   r   r   r   r   r   r   r   )r   rO   r!   r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   s                    r=   convert_moe_packed_tensors_cpuHpatch_convert_moe_packed_tensors.<locals>.convert_moe_packed_tensors_cpu   s   , >>ZZ\F>>ZZ\F5;;'#-||CR FLL0h5GV\\#25F4HHfY_YeYeXg2hh0 ll:U5A$ll!YY|,q0

.
A. kk*!e5G:6BR(*5BB-CB-C Dj__UZZ0FQh]]5::.F*C;C14a4L;C14a4LKKSc*S# 7" kk2<22AE277QQquqyQFC
r@   zDUnsloth: Successfully added convert_moe_packed_tensors_cpu function.zTUnsloth: Failed to add convert_moe_packed_tensors_cpu - original function not found.)transformers.integrations.mxfp4r   	Exceptionr   r   bfloat16r   intTensorr   integrationsmxfp4)transformers.integrations.tensor_parallelrA   rS   rh   r	   r
   info)transformersere   rh   r   r>   rA   s       @@@r=    patch_convert_moe_packed_tensorsru      s   A.> #^^*7 {{	7
 7 
7 7p <,,224PRlmL.Y"1F <,,22L*ME.> #^^)> {{	>
 > 
> >B |((..0LMMIg!!''F!!KK^_ " "!KKno "w  A<a@@AH  LGKKL\  E@!DDEsY   
E7 3
F '
F= 7
FFFF
F:$F5/F:5F:=
GGGG)retypingr   r   r   r   inspectr   torch.nnrX   osr%   commonr   r	   r
   utilsr   r   ru   append r@   r=   <module>r      sD   " 
 / /    	  E E .CpH    9 :r@   