
    oiZ                        S SK r S SKrS SKJr  S SKrS SKJr  S SKJr	  S SK
r
S SKJr  S SKJr  S SKJr  \R"                  r S SKJr   S SKJr   S S
KJr   S SKJr  \R:                  S\R<                  4S j5       rS\R@                  4S\RB                  S\RB                  S\"S\RB                  4S jjr#\R@                  4S\RB                  S\RB                  4S jjr$\R:                  S\R<                  4S j5       r% S;S\RB                  S\"S\&\RB                  \RB                  4   4S jjr'\R:                  S\R<                  S\R<                  S\R<                  S\R<                  4S j5       r(\RR                  4S\RB                  S\RB                  S \RB                  S!\RB                  S\*\"   S"\RV                  S\RB                  4S# jjr,\R@                  4S$\RB                  S%\RB                  S&\RB                  S'\RB                  S\&\"\"4   S"\RV                  4S( jjr-\b  \-O\,r. " S) S*\R^                  R`                  5      r1\S+ 5       r2 " S, S-\R^                  R`                  5      r3\S<S. j5       r4 " S/ S0\R^                  R`                  5      r5\S<S1 j5       r6S2 r7\2r8S3\ Rr                  ;  a  S4\ Rr                  S3'    S SK:r:\" \:Rv                  5      \" S55      :  a?  \7" 5       (       a$  S6\ Rr                  S3'   \R*                  " S75        \6r8OS4\ Rr                  S3'   \S<S8 j5       r<S=S9 jr=\b  \=" \4S'5      \l>        \b  \=" \8S:5      \l>        gg!   Sr\R*                  " S5         GN1= f!   Sr\R*                  " S	5         GNF= f!   Sr\R*                  " S5         GN[= f!   Sr\R*                  " S5         GNp= f!    N= f)>    N)
functional)Version)logger)torch_compile)	FP8LinearzrUnsloth: FP8 models need importing FP8Linear from `transformers.integrations.finegrained_fp8` but we don't see it.)FbgemmFp8LinearzsUnsloth: FP8 models need importing FbgemmFP8Linear from `transformers.integrations.fbgemm_fp8` but we don't see it.)triton_quantize_fp8_blockzcUnsloth: Could not find fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm.triton_quantize_fp8_block)blockwise_fp8_gemmzkUnsloth: Could not find torchao.prototype.blockwise_fp8_inference.blockwise_quantization.blockwise_fp8_gemm
BLOCK_SIZEc                 (   [         R                  " SS9n[         R                  " SS9n[         R                  " XE5      nXe-  [         R                  " SU5      -   n	Xu-  [         R                  " SU5      -   n
U	S S 2S 4   U-  U
S S S 24   -   nU	S S 2S 4   U:  U
S S S 24   U:  -  n[         R                  " X-   US9R                  [         R                  5      n[         R                  " XU-  -   U-   5      nX-  n[         R                  " X+-   XS9  g )Nr   axis   mask)tl
program_idcdivarangeloadtofloat32store)x_ptrs_ptry_ptrMNr   pid_mpid_nnoffs_moffs_noffsr   xsys                   M/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/fp8.pyweight_dequant_kernelr)   @   s    MM#EMM#E
A"))Az"::F"))Az"::F!T'?Qa0D1d7OaF47Oa$78D
T*--bjj9A
	!E)*A	AHHU\1*       r%   r&   
block_sizereturnc           	      p  ^^ U R                  5       (       d  U R                  5       n UR                  5       (       d  UR                  5       nU R                  5       S:X  a  UR                  5       S:X  d   eU R                  5       u  mm[        R
                  " XS9nUU4S jn[        U   " XUTTUS9  U$ )N   dtypec                 l   > [         R                  " TU S   5      [         R                  " TU S   5      4$ Nr   tritonr   )metar   r   s    r(   <lambda>&weight_dequant_block.<locals>.<lambda>Y   s.    AtL)*AtL)*r*   r   )is_contiguous
contiguousdimsizetorch
empty_liker)   )r%   r&   r,   r1   r'   gridr   r   s         @@r(   weight_dequant_blockrA   O   s     ??LLN??LLN557a<AEEGqL((668DAq*AD $aAJGHr*   c                    UR                   S   S:X  a  U R                   S   UR                   S   :X  a%  U R                  U5      UR                  U5      -  nU$ U R                   S   UR                   S   :X  aC  U R                  5       R                  U5      UR                  U5      -  nUR                  5       nU$ [        SU R                   < SUR                   < 35      e[	        XUS9$ )Nr   r   zIncompatible shapes x.shape = z, s.shape = r0   )shaper   t
ValueErrorrA   )r%   r&   r1   r'   s       r(   weight_dequantrF   a   s    wwqzQ771:#Uadd5k)A  WWQZ1771:%!$$u+-AA  >AGG<}!''NOO $A%88r*   c                    [         R                  " SS9nXC-  [         R                  " SU5      -   n[         R                  " X-   5      R	                  [         R
                  5      n[         R                  " [         R                  " U5      5      S-  nUS:X  a  SOUnXg-  nUR	                  UR                  R                  5      n[         R                  " X-   U5        [         R                  " X$-   U5        g )Nr   r   g      |@g      ?)r   r   r   r   r   r   maxabsr1   
element_tyr   )	r   r   r   r   pidr$   r%   r&   r'   s	            r(   act_quant_kernelrL   s   s    
--q
!Cbii:66D
  ,A
rvvayE!A Av1A	A	U[[##$AHHU\1HHU[!r*   c                   ^  T R                  5       (       d  T R                  5       m T R                  S   U-  S:X  d   e[        R                  " T [        R
                  S9nT R                  " / T R                  5       S S QT R                  S5      U-  P7S[        R                  06nU 4S jn[        U   " T X#US9  X#4$ )Nr   r0   r1   c                 V   > [         R                  " TR                  5       U S   5      4$ r3   )r5   r   numel)r6   r%   s    r(   r@   act_quant.<locals>.grid   s"    AGGItL'9:<<r*   r9   )
r:   r;   rC   r>   r?   float8_e4m3fn	new_emptyr=   r   rL   )r%   r,   r'   r&   r@   s   `    r(   	act_quantrT      s     ??LLN772;#q(((E$7$78A	TQVVXcr]TAFF2J*$<TemmTA= T1a<4Kr*   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    [         R                  " SS9n[         R                  " UU5      n[         R                  " UU5      nUU-  nUU-  nUU-  n[        UU-
  U5      nUUU-  -   nUU-  U-  n UU-  [         R                  " SU5      -   U-  n!U U-  [         R                  " SU5      -   U-  n"[         R                  " SU5      n#U U!SS2S4   U
-  U#SSS24   U-  -   -   n$UU#SS2S4   U-  U"SSS24   U-  -   -   n%UU!U-  -   n&U"U-  n'UU'U-  -   n([         R
                  " UU4[         R                  S9n)[        S[         R                  " UU5      5       H  n*[         R                  " U$U#SSS24   UU*U-  -
  :  SS9n+[         R                  " U%U#SS2S4   UU*U-  -
  :  SS9n,U*U-  n-U-U	-  n.[         R                  " U&U.U-  -   5      n/[         R                  " U(U.U-  -   5      n0U)[         R                  " U+U,5      U/SS2S4   -  U0SSS24   -  -  n)U$UU-  -  n$U%UU-  -  n%M     UR                  R                  [         R                  :X  a   U)R                  [         R                  5      n1OgUR                  R                  [         R                  :X  a   U)R                  [         R                  5      n1OU)R                  [         R                  5      n1UU-  [         R                  " SU5      -   n2U U-  [         R                  " SU5      -   n3X.U2SS2S4   -  -   UU3SSS24   -  -   n4U2SS2S4   U:  U3SSS24   U:  -  n5[         R                  " U4U1U5S9  g)zTriton-accelerated function used to perform linear operations (dot
product) on input tensors `A` and `B` with block-wise quantization, and
store the result in output tensor `C`.
r   r   Nr0   g        )r   otherr   )r   r   r   minr   zerosr   ranger   dotr1   rJ   bfloat16r   float16r   )6ABCAsBsr   r   Kgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nrU   rV   rW   rX   rK   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mr   r    offs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r(   _w8a8_block_fp8_matmulr      s]   J --q
!C<(I<(I#i/&&H\)Ky;.=L3-.E##4E|#bii<&@@AEG|#bii<&@@AEGYYq,'F'!T'"Y.a91LLMF&D/I-a0@90LLMF7[((G'!H8k))G((L,7LK1bgga./GGF6$'?Q\9I5I#ISVWGGF6!T'?Q\9I5I#ISVWl"W$ggg+ 556ggg+ 556rvva|c!T'l2Sq\AA,**,** 0 	wwR[[(NN2;;'	
		rzz	)NN2::&NN2::&l"RYYq,%??Gl"RYYq,%??GWQW---	GD!G<L0LLFag"wtQw'7!';<FHHVQv&r*   ra   rb   rd   re   output_dtypec                   ^^ [        U5      S:X  d   eUS   US   pvU R                  S   UR                  S   :X  d   eU R                  SS UR                  SS :X  a  U R                  5       (       d   e[        R                  " U R                  S   U5      UR                  S   :X  d   eU R                  5       U R                  S   -  mUR                  S:X  a%  UR                  5       (       a  UR                  S:X  d   eUR                  u  mn[        R                  " TU5      UR                  S   :X  d   e[        R                  " X5      UR                  S   :X  d   eU R                  SS T4-   n	U R                  XS9n
SnTU:  a"  [        R                  " T5      n[        US5      nUnX|-  S:X  d   eUnUU4S	 jn[        U   " U UU
UUTTUUUU R                  S
5      U R                  S5      UR                  S5      UR                  S5      U
R                  S
5      U
R                  S5      UR                  S
5      UR                  S5      UR                  S5      UR                  S5      UUUSS9  U
$ )aQ  This function performs matrix multiplication with block-wise
quantization.
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
The output is returned in the specified `output_dtype`.
Args:
    A: The input tensor, e.g., activation.
    B: The input tensor, e.g., weight.
    As: The per-token-group quantization scale for `A`.
    Bs: The per-block quantization scale for `B`.
    block_size: The block size for per-block quantization. It should
    be 2-dim, e.g., [128, 128].
    output_dytpe: The dtype of the returned tensor.
Returns:
    torch.Tensor: The result of matmul.
r/   r   r   rN   Nr0   r+      c                 p   > [         R                  " TU S   5      [         R                  " TU S   5      -  4$ )NrU   rV   r4   )METAr   r   s    r(   r@   *w8a8_block_fp8_matmul_triton.<locals>.grid  s5    KK4/06;;q$~BV3WW
 	
r*      )rU   rV   rW   rX   )lenrC   r:   r5   r   rP   ndimrS   next_power_of_2rH   r   stride)ra   rb   rd   re   r,   r   block_nblock_krf   C_shaperc   rU   rW   rV   r@   r   r   s                  @@r(   w8a8_block_fp8_matmul_tritonr      sD   . z?a!!}jmW772;!''"+%%%773B<288CR=(Q__->->>>;;qwwr{G,<<<		QWWR[ A66Q;1??,,A==77DAq;;q'"bhhqk111;;q"bhhqk111ggcrlaT!G	G2AL<--a0<,L!Q&&&L

 4 			

									
		"
		"
		!
		!###14 Hr*   act_qweight_q	act_scaleweight_scalec                     [        U R                  5       UR                  5       UR                  5       UR                  5       US   S9nUR                  U5      $ )Nr   )r,   )torchao_blockwise_gemmr;   r   )r   r   r   r   r,   r   outs          r(   torchao_block_matmulr   :  sV     !!]C 66,r*   c                   4    \ rS rSr\S 5       r\S 5       rSrg)FP8BlockQuantLineariV  c           	         UR                   u  pEUR                   u  pg[        USS 5      =(       d    [        USSS/5      nUc   S5       e[        R                  " XHS   5      U:w  d  [        R                  " XXS   5      U:w  ao  [        R                  " XHS   5      U:X  a*  [        R                  " XXS   5      U:X  a  UR                  nO([        SUR                    SUR                    SU 35      eUR                  5       (       d  UR                  5       n[        XS   5      u  p[        U	UU
UUUR                  S	9nX l        X0l        Xl        UR                  UR                  5      $ )
Nr,   r+   zblock_size is not setr   r   Weight shape  and scales shape # is not compatible with block size )r   )rC   getattrr5   r   TrE   r:   r;   rT   fp8_block_matmulr1   weightr   r,   r   )ctxXr   r   mr!   pqr,   qinputscaleoutputs               r(   forwardFP8BlockQuantLinear.forwardW  se    ||!!V\48 
G,c
=

 %>'>>%;;qQ-(A-Q11NRS1SA!}-2KKa=1Q6  ,~~ #FLL>1CLDVDVCWWz  |F  {G  H  ##%%&&(F!!]3!77
 
'#yy!!r*   c                 d    [        U R                  U R                  5      n[        X5      nAUS S 4$ NrF   r   r   torch_matmulr   grad_outputW_deqgrad_Xs       r(   backwardFP8BlockQuantLinear.backward~  s2    szz3+;+;<k1tT!!r*    N__name__
__module____qualname____firstlineno__staticmethodr   r   __static_attributes__r   r*   r(   r   r   V  s)    $" $"L " "r*   r   c                 .    [         R                  XU5      $ r   )r   apply)r   r   r   s      r(   fp8_torch_block_quant_forwardr     s    $$Q==r*   c                   8    \ rS rSr\SS j5       r\S 5       rSrg)FbgemmFp8Linear_matmuli  Nc                    UR                   S   UR                   S   :X  Gaw  UR                   S   S-  S:X  Ga`  UR                   S   S-  S:X  GaI  / UR                   S S QSP7n[        R                  R                  R	                  UR                  SUR                   S   5      R                  5       [        USS 5      S9u  pgUR                  [        R                  5      nUR                  5       (       d  UR                  5       nUR                  5       (       d  UR                  5       n[        R                  R                  R                  XbXxSS9n	Ub  X-   OU	n	U	R                  UR                  UR                  5      n	U	R                  U5      n	AAOUR                   S   UR                   S   :w  a   UR                   S   UR                   S   :X  d,  UR                   S   S-  S:w  d  UR                   S   S-  S:w  a"  [        X#5      R                   n
[#        X5      n	A
O5[%        S	UR                   < S
UR                   < SUR                   < 35      eX l        X0l        U	$ )Nr   r   r   rN   input_scale_ub)scale_ubT)use_fast_accumz'Shapes are incompatible weight.shape = z, weight_scale.shape = z, x.shape = )rC   r>   opsfbgemmquantize_fp8_per_rowviewr;   r   r   r   r:   f8f8bf16_rowwisedevicer1   reshaperF   r   r   rE   r   r   )r   r%   r   r   biasoutput_shapex_quantizedx_scaleweight_scale_float32r   r   s              r(   r   FbgemmFp8Linear_matmul.forward  s,   <<?l0033LLOa1$a1)<)A /QWWSb\.2.L $)99#3#3#H#Hr1772;'224"6+;TB $I $ K $0??5==#A ''))**,--//+668YY%%66WUY 7 F '+&6V]FFYYqxx1F^^L1FWLLO|11!44Q<#5#5a#88ll1o"a'6<<?a+?1+D
 #68::E!!+F:6<<*;;Sl>P>P=TTaWXW^W^Vbc  
'r*   c                 h    [        U R                  U R                  5      n[        X5      nAUS S S S 4$ r   r   r   s       r(   r   FbgemmFp8Linear_matmul.backward  6    szz3+;+;<k1tT4--r*   r   r   r   r   r*   r(   r   r     s)    6 6p . .r*   r   c                 .    [         R                  XX#5      $ r   )r   r   r   r   r   r   s       r(   fbgemm_fp8_linearr     s    !''<FFr*   c                   8    \ rS rSr\SS j5       r\S 5       rSrg)FP8_fbgemm_block_lineari  Nc           	      :   UR                   nUR                  SUR                   S   5      n[        USS 5      =(       d    [        USSS/5      u  pgUnUR                   u  pUR                   u  p[        R                  " X5      U:w  d  [        R                  " X5      U:w  aj  [        R                  " X5      U:X  a'  [        R                  " X5      U:X  a  UR
                  nO)[        SUR                    SUR                    SXg4 35      e[        XUS 5      u  p[        R                  R                  R                  XR                  5       XR                  5       XU5      nUb  X-   OUnUR                  " / US S QSP76 nAAX l        X0l        XU/U l        U$ )NrN   r,   r+   r   r   r   )rC   r   r   r5   r   r   rE   r	   r>   r   r   f8f8bf16_blockwiser;   r   r   r,   )r   r   r   r   r   
orig_shapebs_nbs_kbs_mr   r!   r   r   xqxsr   s                   r(   r   FP8_fbgemm_block_linear.forward  s   WW
FF2qwwr{#V\48 
G,c
=

 ||!!;;q1$A(<(A{{1#q(V[[-AQ-F  ,~~ #FLL>1CLDVDVCWWz{  |F  {G  H  +1D$?
 !!44!!#R)@)@)BDPT
 #'"22j"o2r2
'd+r*   c                 h    [        U R                  U R                  5      n[        X5      nAUS S S S 4$ r   r   r   s       r(   r    FP8_fbgemm_block_linear.backward  r   r*   r   r   r   r   r*   r(   r   r     s)    ( (T . .r*   r   c                 .    [         R                  XX#5      $ r   )r   r   r   s       r(   fp8_fbgemm_block_linearr     s    "((LGGr*   c                    ^ Su  pn[         R                  " X[         R                  SS9nUnUR                  u  pUR                  u  p[         R                  " U S-  US-  [         R                  SS9nSn [         R
                  R                  R                  X4Xf5      n[         R                  " U5      R                  5       S:X  d   eSnAAA[         R                  R!                  5         U$ ! [         a^  n	[        U	5      R                  5       mSn
[        U4S jU
 5       5      nU(       a  [        S	5        O[        S
U	 S35        Sn S n	A	NS n	A	ff = f)N)r+   r+   r+   cuda)r1   r   r+   FT)	cutlassz
cuda errorzcuda runtime errorzno kernel imagezarch conditionalzmma instructionzcompute capabilitycute_invalid_control_pathtmac              3   ,   >#    U  H	  oT;   v   M     g 7fr   r   ).0err	error_strs     r(   	<genexpr>"test_has_fbgemm.<locals>.<genexpr>+  s     #T@S9$4@Ss   zNUnsloth: FBGEMM on the current GPU cannot load - will switch to Triton kernelsz<Unsloth: FBGEMM on the current GPU cannot load with error = z  - will switch to Triton kernels)r>   onesrR   rC   r   r   r   r   uniqueitem	Exceptionstrloweranyprintr   empty_cache)r   r   rf   r   wq_block_scale
has_fbgemmr   ecutlass_cuda_errorsis_cutlass_cuda_errorr   s               @r(   test_has_fbgemmr    s6   
 GA!	A%"5"5	GB	B88DA88DA**Q#XqCxQWXKJii11"+S||C %%'3...
6 	R	JJ9  FLLN	

 !$#T@S#T T ` NqcQqr 
3s   7AC0 0
E:AEEUNSLOTH_HAS_FBGEMM0z1.4.01z+Using fbgemm_gpu block quantized FP8 matmulc                     UR                   S:X  a!  UR                  S   S:  a  [        XU5      nU$ [        XX#5      nU$ )Nr/   r   )r   rC   fp8_block_quant_linearr   )r   r   r   r   r   s        r(   
fp8_linearr  Q  sH    A,"4"4Q"7!";$Q= J  <>Jr*   c                    ^ ^ U U4S jnU$ )Nc                 >   > T" XR                   [        U T5      5      $ r   )r   r   )selfr   forward_function
scale_attrs     r(   patched_forward-module_forward_patch.<locals>.patched_forward]  s    ;;j0IJJr*   r   )r  r  r  s   `` r(   module_forward_patchr!  \  s    K r*   weight_scale_inv)r+   r   )r   )?osr>   torch.nnnnr5   triton.languagelanguager   r   Fmathunsloth_zoo.utilsr   unsloth_zoo.logr   $unsloth_zoo.temporary_patches.commonr   matmulr   )transformers.integrations.finegrained_fp8r   info$transformers.integrations.fbgemm_fp8r   1fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemmr	   @torchao.prototype.blockwise_fp8_inference.blockwise_quantizationr
   r   jit	constexprr)   r_   TensorintrA   rF   rL   tuplerT   r   r   listr1   r   r   r   autogradFunctionr   r   r   r   r   r   r  r  environ
fbgemm_gpu__version__r  r!  r   r   r*   r(   <module>r>     sC   
     $  % " >||CD + + + 9<U^^||25
\\$ >C^^ 9ell 9u|| 9$ bll    (+||!$
5<<%&" Q'4 ,,5Q'6 ,,7Q'8 ,,9Q': ,,;Q' Q't !&N||N||N 	N 		N
 S	N ++N \\Nn !& << ll  ||  ,,	 
 c3h  ++ . ) 	% -"%..11 -"` > >?.U^^44 ?.D G G1.enn55 1.h H H-` 7 rzz)'*BJJ#$	
 z%%&''*:: /2BJJ+,KKEG%<"/2BJJ+,
   23DnUO,-CEWXI SI
KK|O
KK} $
KKm!
KKud	sC   M  	M< N N4 AO O  M9<NN14OO