
    hT                     V   S SK r S SKrS SKJr  \R
                  S\R                  S\R                  4S j5       r\R
                  S\R                  S\R                  4S j5       rS r	\R
                  S\R                  4S j5       r
\R
                  S	 5       r\R
                  S\R                  4S
 j5       r\R
                  S 5       r\R
                  S\R                  4S j5       r\R
                  S\R                  S\R                  4S j5       r\R
                  S\R                  S\R                  4S j5       r\R
                  S\R                  S\R                  4S j5       rS\ R$                  S\ R$                  S\S\S\ R*                  S\ R$                  SS4S jrS\ R$                  S\ R$                  S\S\ R$                  S\ R*                  S\ R$                  SS4S jr\R
                  S\R                  S\R                  S\R                  4S j5       rg)    N
BLOCK_SIZESPLIT_NUM_BLOCKSc                 
   US-  n[         R                  " S5      U-  n[         R                  " SXd-  5      nXt-  U-   n	X:  n
[         R                  " X	-   U
SS9n[         R                  " XU45      n[         R
                  " [         R                  " U5      SS9n[         R                  " X-   [         R                  " SU5      -   U5        XS S 2S 4   -  n[         R                  " USS5      n[         R                  " US:  S	S5      n[         R                  " U5      n[         R                  " US
:  [         R                  " US:  [         R                  " US:  SS5      [         R                  " US:  SS5      5      [         R                  " US:  [         R                  " US:  SS5      [         R                  " US:  SS5      5      5      nUU-  R                  [         R                  5      nUR	                  XdS-  S45      nUR                  5       u  nnUS-  US-  -  n[         R                  " UXE-  45      nXt-  S-  [         R                  " SXT-  5      -   nUUS-  :  n[         R                  " UU-   UUS9  g )N   r           maskother   axis            ?   g&>?g?gD^Ř?   gyCuΪ?      g      ?g%?      gْvWUe?   r	   tl
program_idarangeloadreshapemaxabsstoreclampwheretouint8split)A_ptr
absmax_ptrout_ptr
n_elementsr   r   PAIRED_SPLIT_NUM_BLOCKSblock_start_idx
thread_idxoffsetsr	   A
A_reshapedabsmaxA_normalizedsignA_absfresult	quantizedleftrightpackedpacked_flatout_offsetsout_masks                            c/home/james-whalen/.local/lib/python3.13/site-packages/bitsandbytes/backends/triton/kernels_4bit.pyquantize_fp4_blockwise_kernelr=      s@    -=q,@mmA&)@@O15BCJ*Z7GD
d#6A ADEJ VVBFF:&Q/FHHZ)BIIa9P,QQSYZq$w/L88L$4L88L1$ff5DVVL!FXX
Xrxx(:E5I288TZ]fTfhmotKu	
 	YHHVj(&&9HHVj(&&9	

F $""288,I!!#:!OQ"OPI//#KD%QY%#+&F**Vj&C%EFK!.!3biiCSC`6aaKZ1_,HHHW{"Kh?    c                 2   US-  n[         R                  " S5      U-  n[         R                  " SXd-  5      nXt-  U-   n	X:  n
[         R                  " X	-   U
SS9n[         R                  " XU45      n[         R
                  " [         R                  " U5      SS9n[         R                  " X-   [         R                  " SU5      -   U5        XS S 2S 4   -  n[         R                  " USS5      n[         R                  " US	:  [         R                  " US
:  [         R                  " US:  [         R                  " US:  SS5      [         R                  " US:  SS5      5      [         R                  " US:  [         R                  " US:  SS5      [         R                  " US:  SS5      5      5      [         R                  " US:  [         R                  " US:  [         R                  " US:  SS5      [         R                  " US:  SS 5      5      [         R                  " US!:  [         R                  " US":  S#S5      [         R                  " US$:  SS5      5      5      5      nUR                  [         R                  5      nUR	                  XdS-  S45      nUR                  5       u  nnUS -  US-  -  n[         R                  " UXE-  45      nXt-  S-  [         R                  " SXT-  5      -   nUUS-  :  n[         R                  " UU-   UUS%9  g )&Nr   r   r   r   r   r   r   r   g   __?g    ?g   ൑?g   0;?r      g   p?      g    ?g   Z?   
   g   Pɾ?	   r   g   Nտg   xg    Or   r   g   οr   r   g   Ng    pݿr   g   #r   r   )r&   r'   r(   r)   r   r   r*   r+   r,   r-   r	   r.   r/   r0   r1   r4   r5   r6   r7   r8   r9   r:   r;   s                          r<   quantize_nf4_blockwise_kernelrF   U   s    -=q,@mmA&)@@O15BCJ*Z7GD
d#6A ADEJ VVBFF:&Q/FHHZ)BIIa9P,QQSYZq$w/L88L$4LXX**
--HH11(::FFK(::FFK
 HH11(::FFK(::FFK	
 	//HH33(==vvN(<<ffM
 HH22(;;VVL(;;VVL	
F: 		"((#I!!#:!OQ"OPI//#KD%QY%#+&F**Vj&C%EFK!.!3biiCSC`6aaKZ1_,HHHW{"Kh?r>   c           	          Sn[         R                  " X75      4nUS:X  a  [        U   " U UUUUUS9  Xd4$ [        U   " U UUUUUS9  Xd4$ )Nr   fp4)r&   r'   r(   r)   r   r   )tritoncdivr=   rF   )	r.   	blocksize
quant_typeblocksr0   num_elementsquantized_outsplit_num_blocksgrids	            r<   quantize_4bit_blockwise_tritonrR      sw    KK13DU%d+!# -	
"    	&d+!# -	
   r>   QUANT_BLOCKc                     US-  nX:  nU S-  nU S-	  n	X-  n
[         R                  " X:-   USSS9n[         R                  " X)-   SS9n[         R                  " X(-   SS9nX-  nX-  n[         R                  " X5      nU$ )Nr   r   r   r   
evict_lastr	   r
   eviction_policyrW   )r   r   
interleave)ar-   	quant_ptrr'   n_elemsrS   PAIRED_QUANT_BLOCKr	   higherlowerabs_offsetsr0   lower_4higher_4mul_highmul_lowout_dqs                    r<   dequant_4bit_body_utilrf      s    '2a'7DWFFE/KWWZ-DUabF ggi'FGwwy)<HH HG]]7-FMr>   c           	         [         R                  " U S-  S:H  SS5      nU S-  S:H  nU S-  S:H  nU S-  S:H  n[         R                  " U[         R                  " USS5      [         R                  " US	S
5      5      n[         R                  " U[         R                  " USS5      [         R                  " USS5      5      n[         R                  " X6U5      nX-  U-  $ )Nr   r   r   r   r   r   g      ?gK}\UU?g      ?gQUU?gvWUU?gDpTUu?r   r   r"   )	valr0   r2   	third_bit
second_bit	first_bitbranch1branch2outs	            r<   dequantize_fp4_treerp      s    
 88S6\f,dC8Dv&(I,6)Jv&(Ihh
D*-
C,G
 hh
C,
J,G
 ((9w
/C:r>   c                     US-  nX:  nU S-  nU S-	  nX-  n	[         R                  " X)-   USSS9n
[        Xz5      n[        X5      n[         R                  " X5      nU$ Nr   r   r   r   rU   rV   )r   r   rp   rY   rZ   r-   r'   r\   rS   r]   r	   r^   r_   r`   r0   rc   rd   re   s                 r<   dequant_fp4_body_utilrt      sn    '2a'7DWFFE/KWWZ-DUabF"62H!%0G]]7-FMr>   c                    U S-  S:H  nU S-  S:H  nU S-  S:H  nU S-  S:H  n[         R                  " U[         R                  " U[         R                  " USS5      [         R                  " USS5      5      [         R                  " U[         R                  " US	S
5      [         R                  " USS5      5      5      n[         R                  " U[         R                  " U[         R                  " USS5      [         R                  " USS5      5      [         R                  " U[         R                  " USS5      [         R                  " USS5      5      5      n[         R                  " XU5      $ )Nr   r   r   r   r   g   `v"?g   ` ?g    4?g   @g?g   ?g   `\?g   __?r   g    Og   ০ǿg   I4ҿg    TFٿg    fg    6Gr   rh   )ri   cond0cond1cond2cond3
branch_pos
branch_negs          r<   dequantize_nf4_treer|      sB    6\f$E6\f$E6\f$E6\f$E 
HHUC!34HHU.0CD	

 	HHU/1DEHHU/1DE	
J 
HHUC!56HHU02FG	

 	HHU02EFHHU/6	
J 88Ez22r>   c                     US-  nX:  nU S-  nU S-	  nX-  n	[         R                  " X)-   USSS9n
[        U5      U
-  n[        U5      U
-  n[         R                  " X5      nU$ rr   )r   r   r|   rY   rs   s                 r<   dequant_nf4_body_utilr~     sx    '2a'7DWFFE/KWWZ-DUabF"6*V3H!%(61G]]7-FMr>   
SPLIT_SIZEc           	      D   [         R                  " SS9nXv-  nU[         R                  " SU5      -   n	X:  n
[         R                  " X	-   U
SS9n[	        UU	UUUUS9nXv-  S-  nU[         R                  " SUS-  5      -   nXS-  :  n
[         R
                  " X-   X5        g )Nr   r   evict_firstrX   )rZ   r-   r[   r'   r\   rS   r   )r   r   r   r   rf   r    )a_ptrc_ptrr[   r'   num_paired_elementsrS   r   pidblock_startr-   r	   rZ   re   out_block_startoffss                  r<   dequant_4bit_kernelr   K  s     --Q
C"KBIIa44G(D
}EA#
#F &*ORYYq*q.99D))DHHU\6(r>   c                 B   [         R                  " SS9nXe-  nU[         R                  " SU5      -   nX:  n	[         R                  " X-   U	SS9n
[	        U
UUUUS9nXe-  S-  nU[         R                  " SUS-  5      -   nXS-  :  n	[         R
                  " X-   X5        g Nr   r   r   rX   )rZ   r-   r'   r\   rS   r   )r   r   r   r   rt   r    r   r   r'   r   rS   r   r   r   r-   r	   rZ   re   r   r   s                 r<   dequant_fp4_kernelr   p       --Q
C"KBIIa44G(D
}EA"
#F &*ORYYq*q.99D))DHHU\6(r>   c                 B   [         R                  " SS9nXe-  nU[         R                  " SU5      -   nX:  n	[         R                  " X-   U	SS9n
[	        U
UUUUS9nXe-  S-  nU[         R                  " SUS-  5      -   nXS-  :  n	[         R
                  " X-   X5        g r   )r   r   r   r   r~   r    r   s                 r<   dequant_nf4_kernelr     r   r>   r.   r0   rK   rL   dtypero   returnc                     U R                  5       nSn[        R                  " Xg5      4nUS:X  a  [        U   " XXX'5        g [        U   " XXX'5        g )N   rH   )numelrI   rJ   r   r   )	r.   r0   rK   rL   r   ro   number_of_paired_elementsr   rQ   s	            r<   dequantize_4bit_implr     sS     !"	 JKK1>@DU4 Ib4 Ibr>   codec           	      z    U R                  5       nSn[        R                  " Xg5      4n[        U   " XX1XbU5        g )Nr   )r   rI   rJ   r   )	r.   r0   rK   r   r   ro   r   r   rQ   s	            r<   !dequantize_4bit_impl_passing_coder     s>     !"	 JKK1>@Dad4MZder>   	CODE_SIZEc                    US-  n[         R                  " S5      U-  n	[         R                  " SX-  5      n
X-  U
-   nX:  n[         R                  " X-   USS9n[         R                  " XU45      n[         R
                  " [         R                  " U5      SS9n[         R                  " X)-   [         R                  " SU5      -   U5        XS S 2S 4   -  n[         R                  " USS5      n[         R                  " X4[         R                  S	9n[         R                  " X4US-
  [         R                  S	9n[        S
5       HY  nUU-   S-  n[         R                  " UU-   5      nUU:  n[         R                  " UUU5      n[         R                  " UUU5      nM[     [         R                  " UU-   5      n[         R                  " UU-   5      n[         R                  " UU-
  5      n[         R                  " UU-
  5      n[         R                  " UU:*  UU5      R                  [         R                  5      nUR	                  XS-  S45      nUR                  [         R                  SS9nUR!                  5       u  nnUS
-  US-  -  n[         R                  " UXW-  45      nX-  S-  [         R                  " SXu-  5      -   n U US-  :  n![         R                  " UU -   UU!S9  g )Nr   r   r   r   r   r   r   r   )r   r   T)bitcastr   r   )r   r   r   r   r   r   r   r    r!   zerosint32fullranger"   r#   r$   r%   )"r&   code_ptrr'   r(   r)   r   r   r   r*   r+   r,   r-   r	   r.   r/   r0   r1   lower_pivotupper_pivot_pivotri   	is_higher	lower_val	upper_val
lower_dist
upper_distr5   r6   r7   r8   r9   r:   r;   s"                                     r<   quantize_4bit_blockwise_kernelr     s    -=q,@mmA&)@@O15BCJ*Z7GD
d#6A ADEJ VVBFF:&Q/FHHZ)BIIa9P,QQSYZq$w/L88L$4L((3@QK''2?QVXV^V^_K1X{*q0ggh&' 3&	hhy%=hhy+u=  ;./I;./Iy01Jy01Jz1;LOOPRPXPXYI!!#:!OQ"OPIRXXt4I//#KD%QY%#+&F **Vj&C%EFK!.!3biiCSC`6aaKZ1_,HHHW{"Kh?r>   )torchrI   triton.languagelanguager   jit	constexprr=   rF   rR   rf   rp   rt   r|   r~   r   r   r   Tensorintstrr   r   r   r    r>   r<   <module>r      s      2@
 2@ ll2@ 2@B B@
 B@ llB@ B@J!2 TVT`T`  *  .    $3 $3N   ` )KM<<)egeqeq) )H )@B)Z\ZfZf) )F )@B)Z\ZfZf) )0c||cLLc c 	c
 ;;c 
c 
c0f||fLLf f ,,	f
 ;;f 
f 
fP :@ :@ ||:@ ll:@ :@r>   