
    h?              (       D   S SK r S SKJr  S SKrS SKrS SKJr  SSKJ	r	J
r
JrJr  S rSrSrSrSrSr\\\\\\S	.r\R(                  S
\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  4S j5       r\R(                  S
\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  4S j5       r\R(                  S\R*                  S
\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  4S j5       r\R(                  S\R*                  S
\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S\R*                  4S j5       r\\S.\\S.\\S.\\S.\\S.\\S.S.r  S8S\S\R8                  S\R8                  S\R8                  S\\R8                     S\\R8                     S\S\S
\S\S \S!\S\S\S"\S#\S\S$S4$S% jjr\R@                  S&\R8                  S'\R8                  S(\R8                  S)\S*\RB                  S$\R8                  4S+ j5       r"\R@                  S&\R8                  S(\R8                  S)\S$\#\R8                  \R8                  4   4S, j5       r$S\R8                  S\R8                  S\R8                  S\\R8                     S
\S\S \S!\S\S"\S#\S-\R8                  S.\\R8                     S/\R8                  S0\\R8                     S\S\S1\%S\S$S4(S2 jr&S\R8                  S\R8                  S\R8                  S\\R8                     S
\S\S \S!\S\S"\S#\S-\R8                  S.\\R8                     S/\R8                  S0\\R8                     S\S\S1\%S\S$S4(S3 jr'\R(                  S
\R*                  S\R*                  S\R*                  S4\R*                  S\R*                  S\R*                  4S5 j5       r(\R(                  S
\R*                  S\R*                  S\R*                  S\R*                  S\R*                  S4\R*                  S\R*                  S\R*                  4S6 j5       r)\(\(\(\)\(\)S	.r*   S9S\S\R8                  S\R8                  S\R8                  S\\R8                     S
\S\S \S!\S\S"\S#\S-\R8                  S.\\R8                     S/\R8                  S0\\R8                     S\S\S$S4&S7 jjr+\+r+g):    N)Optional   )dequant_8bit_blockwise"dequant_8bit_blockwise_kernel_util#quantize_8bit_blockwise_kernel_utilquantize_blockwise_triton            )momentumrmspropadagradadamlionademamixbeta1beta2epsweight_decaygnorm_scaleOPTIMIZER_ID
BLOCK_SIZEN_PER_THc                 x   [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " U U-   USS9n[         R                  " UU-   USS9n[         R                  " UU-   USS9nUU-  nSSU
-
  -  nSSU-
  -  nUS:X  aK  UU-  SU-
  U-  -   nUU-  SU-
  U-  U-  -   nUU-  nUU-  nU[         R                  " U5      U-   -  nUU-  nOUS:X  a  Un[         R
                  " [         R                  " UWS5      5      n[         R                  " UU5        g)	zBPreprocessing optimizer, computing update norm (2-state optimizer)r   axis        maskother      ?r
   r   Ntl
program_idarangeloadsqrtsumwhere
atomic_add)g_ptrp_ptr
state1_ptr
state2_ptr	unorm_ptrr   r   r   r   step
beta1_step
beta2_steplrr   
n_elementsr   r   r   pidblock_start_idxoffsetsr    g_valss1_valss2_valscorrection1correction2update_valsupdate_norm
total_norms                                 d/home/james-whalen/.local/lib/python3.13/site-packages/bitsandbytes/backends/triton/kernels_optim.py$_optimizer_precondition_2state_32bitrB   #   sW   , --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;Fggj7*SAGggj7*SAG6!Fz)*Kz)*KqE/S5[F$::E/S5[F$:V$CCK'K'!1C!78!K/		{C89JMM)Z(    c                    [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " U U-   USS9n[         R                  " UU-   USS9nUU-  nUS:X  a  U	S:X  a  UnOUU-  U-   nUU-  nOUS:X  a  UU-  SU-
  U-  -   nUnOhUS:X  a3  UU-  SU-
  U-  U-  -   nU[         R                  " U5      U-   -  nUU-  nO/US:X  a)  UUU-  -   nU[         R                  " U5      U-   -  nUU-  n[         R
                  " [         R                  " UWS5      5      n[         R                  " UU5        g	)
zBPreprocessing optimizer, computing update norm (1-state optimizer)r   r   r   r   r   r   r"   r	   Nr#   )r,   r-   r.   r/   r0   r   r   r   r   r1   r2   r3   r4   r   r5   r   r   r   r6   r7   r8   r    r9   r:   r?   r>   r@   s                              rA   $_optimizer_precondition_1state_32bitrE   Z   sr   , --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;Fggj7*SAG6!Fq19Go.G'		E/S5[F$::		E/S5[F$:V$CC 03 67!K/		FVO+ 03 67!K/{C89JMM)Z(rC   	max_unormc                    [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " U U-   USS9R	                  [         R
                  5      n[         R                  " UU-   USS9R	                  [         R
                  5      n[         R                  " UU-   USS9n[         R                  " UU-   USS9nUS:X  a  [         R                  " UU-   U-   USS9nUU-  nSn US:  a9  [         R                  " [         R                  " U5      5      n!U!XV-  :  a  XV-  U!-  n US:X  a  UU-  SU-
  U-  -   nUU-  SU-
  U-  U-  -   nSU-
  n"[         R                  " SU-
  5      n#U* U#-  U"-  n$US:  a  USUU-  -
  -  nU U$-  U[         R                  " U5      UU#-  -   -  -  n%UU%-   nOUS:X  a  UU-  SU-
  U-  -   nWU	-  SU	-
  U-  -   nUU-  SU-
  U-  U-  -   nSU-
  n"[         R                  " SU-
  5      n#US:  a  USUU-  -
  -  nUU"-  U
U-  -   n&[         R                  " U5      U#-  U-   n'UUU&U'-  -  -
  n[         R                  " UU-   UUS9  [         R                  " UU-   UUS9  [         R                  " UU-   UUS9  US:X  a  [         R                  " UU-   U-   WUS9  g	g	)
z2-state optimizer kernelr   r   r   r   r   r"   r
   r    N)r$   r%   r&   r'   tofloat32r(   store)(r,   r-   r.   r/   r0   rF   
param_normr   r   beta3alphar   r   r1   r2   r3   r4   r   
skip_zerosr5   r   r   r   r6   r7   r8   r    r9   p_valsr:   r;   s3_valsupdate_scalecurrent_unormr<   r=   	step_size
update_valmixed_momentumadaptive_terms(                                           rA   ,_optimizer_update_2state_32bit_triton_kernelrX      s   6 --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;>>rzzJFWWUW_4s;>>rzzJFggj7*SAGggj7*SAGq''*z1G;$cR6!FL3	 23911%2mCLqE/S5[F$::E/S5[F$:V$CCJ&ggcJ./C+%3	#sR,%667F!I-BGGG<LsU`O`<`1ab
*$		E/S5[F$::E/S5[F$::E/S5[F$:V$CCJ&ggcJ./#sR,%667F!K/EGOD)K73>" >??HHUW_f40HHZ'!76HHZ'!76q
j(72G$G rC   c           
         [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " U U-   USS9R	                  [         R
                  5      n[         R                  " UU-   USS9R	                  [         R
                  5      n[         R                  " UU-   USS9nUU-  nUS:  a  UUU-  -   nSnUS:  a?  [         R                  " [         R                  " U5      5      nUXV-  U-   :  a
  XV-  U-   U-  nUS:X  a   US:X  a  UnOUU-  U-   nUU* U-  -  n UU -   nOUS:X  a\  UU-  SU-
  U-  -   n!UU-  [         R                  " U!S:  S[         R                  " U!S:  SS5      5      -  n UU -
  nUU-  SU-
  U-  -   nOqUS:X  a9  UU-  SU-
  U-  U-  -   nUU-  U-  [         R                  " U5      U-   -  n UU -
  nO2US	:X  a,  UUU-  -   nUU-  [         R                  " U5      U-   -  n UU -
  n[         R                  " UU-   UUS
9  [         R                  " UU-   UUS
9  g)z1-state optimizer kernelr   r   r   r   r"   r   r         r	   rH   N)	r$   r%   r&   r'   rI   rJ   r(   r*   rK   )"r,   r-   r.   r/   r0   rF   rL   r   r   rM   rN   r   r   r1   r2   r3   r4   r   rO   r5   r   r   r   r6   r7   r8   r    r9   rP   r:   rR   rS   rU   momentum_updates"                                     rA   ,_optimizer_update_1state_32bit_triton_kernelr\      ss   6 --Q
CHnO
*RYYq*x:O-PPGZDWWUW_4s;>>rzzJFWWUW_4s;>>rzzJFggj7*SAG6!Fc&<//L3	 2391C77%2S8MILq19Go.G!bS7]3
*$		!E/S5[F,BB!B&/A2EsBHHUdghUhjnpsLt)uu
*$E/S5[F$::		E/S5[F$:V$CC!B&/27773Cc3IJ
*$		FVO+&[BGGG$4s$:;
*$HHUW_f40HHZ'!76rC   )
preprocessupdate)r   r   r   r   r   r   optimizer_namegpstate1state2	unorm_vecrL   rM   rN   r1   r4   returnc                    U(       a  [        S5      eSnSn[        R                  " UR                  5       UU-  5      4n[        U    n[
        U    S   n[
        U    S   nX-  nX-  nU S:X  ao  UU   " UUUUUUUUU	U
UUUUUUUUUUR                  5       UUUSS9  US	:  a:  UR                  5         UU   " UUUUUUU	UUUUUUUUR                  5       UUUSS9  g
g
US	:  a9  UR                  5         UU   " UUUUUUU	UUUUUUUUR                  5       UUUSS9  UU   " UUUUUUUUU	U
UUUUUUUUUUR                  5       UUUSS9  g
)z(
32-bit optimizer implemented by Triton
&skip_zeros is not supported on XPU yet   r   r]   r^   r   r	   )	num_warpsr   N)NotImplementedErrortritoncdivnumelname2optimizer_idname2optimizer_32bit_fnzero_)r_   r`   ra   rb   rc   rd   rF   rL   r   r   rM   rN   r   r   r1   r4   r   rO   r   r   gridoptimizer_idfn_preprocess	fn_updater2   r3   s                             rA   optimizer_update_32bit_implru   S  s   . !"JKKJHKK	:#89;D$^4L+N;LIM'7AI JJ$GGI1	
6 s?OO$	' 2 s?OO$	', 	$GGI1	
rC   Aabsmaxcode	blocksizedtypec                 ~   U R                  5       S:X  a  [        R                  " XS9$ U R                  5       nUR                  5       nUR	                  U R
                  5      UR                  5          R	                  U5      n[        R                  " Xc-  5      nX-  U-
  n	U	S:  a,  [        R                  R                  R                  USU	45      nUR                  X5      n
XR                  S5      R	                  U5      -  nUR                  5       nU	S:  a  USU	*  nUR                  U R                  5      $ )zF
Pure PyTorch reference implementation for block-wise dequantization.
r   rz   r   N)rm   torch
empty_likeflattenrI   devicelongmathceilnn
functionalpadreshape	unsqueezeshape)rv   rw   rx   ry   rz   A_flatnum_elementsdequantized_flat
num_blockspad_lendequantized_blocksrescaled_blocksrescaled_flats                rA   _dequantize_blockwise_pytorchr     s    	wwyA~//YY[F<<>Lwwqxx(7::5A<34J$|3G{ 88..223Ca\R)11*H(+;+;A+>+A+A%+HHO#++-M{%ix0  ))rC   c                    U R                  5       S:X  aQ  [        R                  " U [        R                  S9[        R                  " S[        R
                  U R                  S94$ U R                  5       nUR                  5       n[        R                  " XB-  5      nXR-  U-
  nUS:  a,  [        R                  R                  R                  USU45      nUR                  XR5      n[        R                  " [        R                  " U5      SSS9S   nSXS:H  '   Xx-  n	[        R                  " U	R!                  S5      UR#                  U R                  5      -
  5      n
[        R$                  " U
SS	9R#                  [        R                  5      nUR                  5       nUS:  a  US
U*  nUR                  U R&                  5      UR                  5       4$ )zD
Pure PyTorch reference implementation for block-wise quantization.
r   r|   )rz   r   r   T)dimkeepdimr"   r	   )r   N)rm   r}   r~   uint8emptyrJ   r   r   r   r   r   r   r   r   maxabsr   rI   argminr   )rv   rx   ry   r   r   r   r   A_blocksrw   scaled_blocksdiffquantized_indicesquantized_flats                rA   _quantize_blockwise_pytorchr   
  ss    	wwyA~5u{{1EMMbcbjbj7kkkYY[F<<>L<34J$|3G{$$((!W>~~j4HYYuyy*4@CFFQ;%M 99],,Q/$''!((2CCDDTq144U[[A&..0N{'	'2!!!''*FNN,<<<rC   qmap1qmap2absmax1absmax2rO   c                   U(       a  [        S5      eSn[        R                  " 5          US:X  ao  UR                  S:X  a_  [	        US   US   UU[        R
                  5      n[	        US   US   UU[        R
                  5      n[        R                  " UU/5      nO[	        X-UU[        R
                  5      nSnUb  [	        X>UU[        R
                  5      nUR                  5       U-  nU R                  R                  5       nUS:X  a  UR                  U5      R                  US	U-
  S
9  UR                  U5      R                  UUS	U-
  S9  S	XI-  -
  nS	XY-  -
  nUR                  5       [        R                  " U5      -  R                  U5      nUS:  a  UR                  S	X-  -
  5        UR                  UUU
* U-  S9  GOUS:X  a  US   US   nnUnUR                  U5      R                  US	U-
  S
9  UR                  U5      R                  US	U-
  S
9  UR                  U5      R                  UUS	U-
  S9  S	XI-  -
  n[        R                  " S	XY-  -
  5      nUU-  UU-  -   UR                  5       U-  U-   -  n US:  a  UR                  S	X-  -
  5        UR                  U U
* S
9  [        R                  " UU/5      nGOUS:X  a[  UR                  UUS
9  U	S:X  a  UR                  U5        O UR                  U5      R                  U5        UR                  UU
* S
9  GO_US:X  ac  UR                  UUS
9  UR                  U5      R                  UUS	U-
  S9  UR                  UUR                  5       R                  U5      U
* S9  OUS:X  a  US:  a  UR                  S	X-  -
  5        [        R                   " UR#                  U5      UR#                  S	U-
  5      -   5      n!UR                  U!U
* S
9  UR                  U5      R                  US	U-
  S
9  OfUS:X  aQ  UR                  UUS
9  UR                  UUS	S9  UR                  UUR                  5       R                  U5      U
* S9  O[%        SU S35      eU R                  R                  U5        US:X  a  ['        US   UU5      u  n"n#['        US   UU5      u  n$n%US   R                  U"5        US   R                  U$5        US   R                  U#5        US   R                  U%5        ['        UUU5      u  n&n'UR                  U&5        UR                  U'5        Og['        UUU5      u  n(n)UR                  U(5        UR                  U)5        Ub2  ['        UUU5      u  n&n'UR                  U&5        UR                  U'5        SSS5        g! , (       d  f       g= f)
Pure PyTorch implementation of the 8-bit block-wise optimizer update step.
This version ensures high-precision updates for float16 parameters.
z'skip_zeros is not supported on XPU yet.rh   r   r	   r   r   Nr   r"   rN   valuer   r   r   r   r   +Pure PyTorch implementation for optimizer '' is not available.)
ValueErrorr}   no_gradndimr   rJ   stackfloatdatamul_add_addcmul_r(   r   addcdiv_copy_signmulrj   r   )*ra   r`   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r_   ry   	s1_1_fp32	s1_2_fp32state1_fp32state2_fp32gradp_fp32bias_correction1bias_correction2denomm1_fp32m2_fp32nu_fp32r^   
update_dirnew_m1_8bitnew_absmax_m1new_m2_8bitnew_absmax_m2new_state2_8bitnew_absmax2new_state1_8bitnew_absmax1s*                                             rA   'optimizer_update_8bit_blockwise_pytorchr   2  s   6 BCCI	Z'GLLA,=5fQiUT]_d_l_lmI5fQiUT]_d_l_lmI++y)&<=K7PY[`[h[hiK7PY[`[h[hiKwwy;& V#U#((S5[(AU#,,T4sU{,K"U[0"U[0 %%'$))4D*EEKKCPEc!C""334OOKrc<L6LOMz)*1~{1~WG!GLL$$Tu$=LL$$Tu$=LL((t3;(G"U[0#yyu{):; 0057?Bw||~XhGhknGnoFc!C""334KKrcK*++w&89Kz)IIfLI1qy!!$'  ',,T2KKB3K/y(IIfLI1U#,,T4sU{,KOOD+"2"2"4"9"9#">rcOJv%c!C""334KOOE$:TXXcEk=R$RSJKK
2#K.U#((S5[(Ay(IIfLI1  t3 7OOD+"2"2"4"9"9#">rcOJ &=n=MM`a 
 	
V Z')D[QR^UZ\e)f&K)D[QR^UZ\e)f&K1IOOK(1IOOK(AJ]+AJ]++F{TY[d+e(O[LL)MM+&+F{TY[d+e(O[LL)MM+&&/J;X]_h/i,_-k*U 
s   VW


Wc          	         U(       a  [         R                  " U5      (       d  gSnUR                  5       U-  n[         R                  " 5          U R                  R                  5       nUS:X  ak  UR
                  S:X  a[  [        US   US   UU[         R                  S9n[        US   US   UU[         R                  S9n[         R                  " UU/5      nO[        X-UU[         R                  S9nSnUb  [        X>UU[         R                  S9nUS:X  a  US	:  a  UR                  S
X-  -
  5        UR                  U5      R                  US
U-
  S9  UR                  U5      R                  UUS
U-
  S9  S
XI-  -
  nS
XY-  -
  nUR                  5       [        R                  " U5      -  R                  U5      nUR                  UUU
* U-  S9  GOUS:X  a  US   US   nnUnUR                  U5      R                  US
U-
  S9  UR                  U5      R                  US
U-
  S9  UR                  U5      R                  UUS
U-
  S9  S
XI-  -
  n[        R                  " S
XY-  -
  5      nUU-  UU-  -   UR                  5       U-  U-   -  n US	:  a  UR                  S
X-  -
  5        UR                  U U
* S9  [         R                  " UU/5      nGOUS:X  a[  UR                  UUS9  U	S:X  a  UR                  U5        O UR                  U5      R                  U5        UR                  UU
* S9  GO_US:X  ac  UR                  UUS9  UR                  U5      R                  UUS
U-
  S9  UR                  UUR                  5       R                  U5      U
* S9  OUS:X  a  US	:  a  UR                  S
X-  -
  5        [         R                   " UR#                  U5      UR#                  S
U-
  5      -   5      n!UR                  U!U
* S9  UR                  U5      R                  US
U-
  S9  OfUS:X  aQ  UR                  UUS9  UR                  UUS
S9  UR                  UUR                  5       R                  U5      U
* S9  O[%        SU S35      eU R                  R                  U5        US:X  a  ['        US   UU5      u  n"n#['        US   UU5      u  n$n%US   R                  U"5        US   R                  U$5        US   R                  U#5        US   R                  U%5        ['        UUU5      u  n&n'UR                  U&5        UR                  U'5        Og['        UUU5      u  n(n)UR                  U(5        UR                  U)5        Ub2  ['        UUU5      u  n&n'UR                  U&5        UR                  U'5        SSS5        g! , (       d  f       g= f)r   Nrh   r   r	   r   r|   r   r   r   r"   r   r   r   r   r   r   r   r   )r}   anyr   r   r   r   r   rJ   r   r   r   r   r(   r   r   r   r   r   rj   r   )*ra   r`   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r_   ry   r   r   r   r   r   r   r   r   r   r   r   r   r^   r   r   r   r   r   r   r   r   r   s*                                             rA   ,optimizer_update_8bit_blockwise_triton_quantr     s   6 %))A,,I779{"D	 Z'GLLA,=.vay'!*eY^c^k^klI.vay'!*eY^c^k^klI++y)&<=K0%Z_ZgZghK0%Z_ZgZghK V#c!C""334U#((S5[(AU#,,T4sU{,K"U[0"U[0 %%'$))4D*EEKKCPEOOKrc<L6LOMz)*1~{1~WG!GLL$$Tu$=LL$$Tu$=LL((t3;(G"U[0#yyu{):; 0057?Bw||~XhGhknGnoFc!C""334KKrcK*++w&89Kz)IIfLI1qy!!$'  ',,T2KKB3K/y(IIfLI1U#,,T4sU{,KOOD+"2"2"4"9"9#">rcOJv%c!C""334KOOE$:TXXcEk=R$RSJKK
2#K.U#((S5[(Ay(IIfLI1  t3 7OOD+"2"2"4"9"9#">rcOJ &=n=MM`a 
 	
V Z')B;q>SXZc)d&K)B;q>SXZc)d&K1IOOK(1IOOK(AJ]+AJ]++D[RWYb+c(O[LL)MM+&+D[RWYb+c(O[LL)MM+&&/HV[]f/g,_-k*S 
s   U<W
W!BLOCK_SIZE_Nc           	         [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " UU-   USS9R	                  [         R
                  5      U-  n[         R                  " U U-   USS9R	                  [         R
                  5      n[        UUXUU5      nUS:  a  US:X  a  USUU-  -
  -  nOUS:  a  UUU-  -  nUS:X  a  U	S:X  a  UnOUU-  U-   nUUU-  -  nOUS:X  a4  UU-  SU-
  U-  U-  -   nUUU[         R                  " U5      U-   -  -  -  nOUS:X  a+  UUU-  -  nUUU[         R                  " U5      U-   -  -  -  nO^US:X  aX  UU-  SU-
  U-  -   n[         R                  " US:  S[         R                  " US:  S	S5      5      nUUU-  -  nUU-  SU-
  U-  -   n[         R                  " U U-   UR	                  U R                  R                  5      US
9  [        UUSUU5      u  n n![         R                  " UU-   U US
9  [         R                  " UU-   [         R                  " SU5      -   U!5        g)zm
Triton kernel for 8-bit optimizers that use one momentum state.
Supports: Momentum, RMSprop, Adagrad, Lion.
r   r   r   r   r	   r"   r   r   rZ   rH   rh   N)r$   r%   r&   r'   rI   rJ   r   r(   r*   rK   rz   
element_tyr   )"r-   r,   r.   r/   r   r   rM   rN   r   r1   r2   r3   r4   	qmap1_ptr	qmap2_ptrabsmax1_ptrabsmax2_ptrr   r   r5   r   r   r   r6   r7   r8   r    r`   ra   s1valr^   s1_codesr   s"                                     rA   5_optimizer_update_1state_8bit_blockwise_triton_kernelr   W  s|   B --Q
CHnO,ryyL8<S/TTGZD 	d#699"**ESA
d#699"**EA	+JY]_k	lB cla/	S2$$$			Q q19BeaB	R"W 
	%Z3;!+a//	R1c)*++ 
	
a!e	R1c)*++ 
	5jC%K1,,#)S"((39dC*HI	R&[%Z3;!++ HHUW_add5;;#9#9:F?IsT`bjkHkHHZ'!8$7HH[?*RYYq(-CC[QrC   c                    [         R                  " SS9nUU-  nUU-  [         R                  " SUU-  5      -   nUU:  n[         R                  " UU-   USS9R	                  [         R
                  5      U-  n[         R                  " U U-   USS9R	                  [         R
                  5      nUS:X  Ga  [        UUXUU5      n[        UUUUUU5      nUU-  SU-
  U-  -   nUU-  SU-
  U-  U-  -   nSU
-
  nSU-
  n US:  a  USUU-  -
  -  n[         R                  " U5      [         R                  " U 5      -  U-   n!UUU-  UU!-  -  -  n[         R                  " U U-   UR	                  U R                  R                  5      US9  [        UUSUU5      u  n"n#[         R                  " UU-   U"US9  [         R                  " UU-   [         R                  " SU5      -   U#5        [        UUSUU5      u  n$n%[         R                  " UU-   U$US9  [         R                  " UU-   [         R                  " SU5      -   U%5        g
US	:X  Ga!  [        UUXUU5      n&[        UU-   UUUUU-  -   UU5      n'[        UUUUUU5      n(U&U-  SU-
  U-  -   n&U'U-  SU-
  U-  -   n'U(U-  SU-
  U-  U-  -   n(SU
-
  n[         R                  " SU-
  5      n U&U-  UU'-  -   [         R                  " U(5      U -  U-   -  n)US:  a  USUU-  -
  -  nUUU)-  -  n[         R                  " U U-   UR	                  U R                  R                  5      US9  [        U&USUU5      u  n*n+[         R                  " UU-   U*US9  [         R                  " UU-   [         R                  " SU5      -   U+5        [        U'USUU5      u  n,n-[         R                  " UU-   U-   U,US9  [         R                  " UU-   [         R                  " SU5      -   UU-  -   U-5        [        U(USUU5      u  n.n/[         R                  " UU-   U.US9  [         R                  " UU-   [         R                  " SU5      -   U/5        g
g
)z\
Triton kernel for 8-bit optimizers that use two momentum states.
Supports: Adam, AdEMAMix.
r   r   r   r   r
   r"   rH   rh   r   N)r$   r%   r&   r'   rI   rJ   r   r(   rK   rz   r   r   )0r-   r,   r.   r/   r   r   rM   rN   r   r1   r2   r3   r4   r   r   r   r   r   r   r5   r   r   r   r6   r7   r8   r    r`   ra   r   s2r   r   r   r   r   s2_codesr   m1m2nur^   m1_codesr   m2_codesr   nu_codesnew_absmax_nus0                                                   rA   5_optimizer_update_2state_8bit_blockwise_triton_kernelr     s   F --Q
CHnO,ryyL8<S/TTGZD 	d#699"**ESA
d#699"**EA q/
GY]acop/
GYP[]acop%Z3;!++%Z3;!+a// ++#rL(((Abgg&677#=	b##U
33 	!$$u{{'='=">TJ !DB	SVXdfn o+
g%xd;
.1h1GGU CB	SVXdfn o+
g%xd;
.1h1GGU		/
GY]acop/#*44
 0
GYP[]acop%Z3;!++%Z3;!++%Z3;!+a// +773#34''%"*4GW9WZ]9]^#rL(((A	R&[ 	!$$u{{'='=">TJ #Fb)UXZfhp"q-
g%xd;
.1h1GGW"Eb)UXZfhp"q-
j(72H4H
/)BIIa,BBZS_E__	

 #Fb)UXZfhp"q-
g%xd;
.1h1GGWa 
rC   c                    U(       a  [        S5      eU S:X  a~  UR                  5       S:  d  UR                  S   S:w  a  [        SUR                   35      eUR                  5       S:  d  UR                  S   S:w  a  [        SUR                   35      eSnSn[        R
                  " UR                  5       UU-  5      4n[        U    n[        U    nXZ-  nXj-  nUU   " UUUUUUUUU	U
UUUUUUUUUUR                  5       UUUSS	9  g )
Nrg   r   r	   r   zIFor ademamix, state1 must be a stacked tensor of shape (2, ...), but got zJFor ademamix, absmax1 must be a stacked tensor of shape (2, ...), but got rh   r   )r   r   r   ri   )	rj   r   r   r   rk   rl   rm   name2optimizer_fnrn   )r_   r`   ra   rb   rc   r   r   rM   rN   r   r1   r4   r   r   r   r   r   r   rO   r   r   rq   fnrr   r2   r3   s                             rA   $optimizer_update_8bit_blockwise_implr   4  s>   * !"JKK#::<!v||A!3[\b\h\h[ij  ;;=1a 0A 5\]d]j]j\kl  JHKK	:#89;D	>	*B$^4L JJtH		
		!1rC   )r"   F)r   r"   F),r   typingr   r}   rk   triton.languagelanguager$   kernels_8bit_quantr   r   r   r   MOMENTUMRMSPROPADAGRADADAMLIONADEMAMIXrn   jit	constexprrB   rE   rX   r\   ro   strTensorr   intru   compilerz   r   tupler   boolr   r   r   r   r   r    rC   rA   <module>r      s	         

   3) <<3) <<3) 
3) ,,3) 3)  ,,!3)" #3)$ ll%3) 3)l 6) <<6) <<6) 
6) 6)  ,,!6)" #6)$ ll%6) 6)r QH ||QH <<QH <<QH 
QH ,,QH$ %QH* ,,+QH, -QH. ll/QH QHh J7 ||J7 <<J7 <<J7 
J7 ,,J7$ %J7* ,,+J7, -J7. ll/J7 J7^ ;>
 ;>
 ;>
 ;>
 ;>
 ;>+ Z %L
L
||L
 ||L
 LL	L

 U\\"L
 %L
 L
 L
 L
 L
 L
 L
 
L
 L
 L
  	!L
" #L
& 
'L
h *||*LL* ,,* 	*
 ;;* \\* *D #=||#=
,,#= #= 5<<%&	#= #=NJ+||J+||J+ LLJ+ U\\"	J+
 J+ J+ J+ J+ 
J+ J+ 	J+ <<J+ ELL!J+ \\J+ ell#J+  !J+" #J+$ %J+* +J+, 
-J+fJ+||J+||J+ LLJ+ U\\"	J+
 J+ J+ J+ J+ 
J+ J+ 	J+ <<J+ ELL!J+ \\J+ ell#J+  !J+" #J+$ %J+* +J+, 
-J+d OR <<OR <<OR 
OR. ,,/OR0 ll1OR2 ,,3OR ORd |X <<|X <<|X 
|X* ,,+|X, -|X2 ,,3|X4 ll5|X6 ,,7|X |X@ FDDAAE 6 'GG||G ||G LL	G
 U\\"G G G G G 
G G 	G <<G ELL!G \\G  ell#!G" #G$ %G( 
)G\ (L $rC   