
    h"                     H   S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJr  SSK	J
r
JrJrJr  \R                  R                  r\R                  R                   r\R                  R"                  rS SKJr  \" SS	9S
 5       r\" SS	9S 5       r " S S\5      r\R/                  \R0                  R2                  5      S 5       r\R/                  \R6                  R2                  5      S 5       r\R/                  \R8                  R:                  5      S 5       r\R/                  \R<                  R2                  5      S 5       r\R/                  \R>                  R2                  \R>                  R2                  \R@                  R2                  \R@                  R2                  \RB                  R2                  /5      S 5       r\R/                  \RD                  R2                  5      S 5       r\R/                  \RF                  R                  5      S 5       r\" \/5        g)    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensor   )create_dynamic_mapdequant_with_qmapquantize_4bit_with_qmapscale_tensor)	lru_cache)maxsizec                      [        SSS5      $ )NT      )r        U/home/james-whalen/.local/lib/python3.13/site-packages/torchao/optim/subclass_4bit.pyget_qmap_signedr   "   s    dAq))r   c                  R    [         R                  " SSSSS9SS  R                  5       $ )Nr   r      cpudevice)torchlinspacetolistr   r   r   get_qmap_unsignedr   '   s'    >>!Q51!"5<<>>r   c            	           \ rS rSr/ SQr\S\S\S\S\4S j5       rS\S\S\S\4S jr	S	 r
\ SS j5       rSS jr\SS\S\4S jj5       rS rSrg
)OptimState4bit,   )codesscaleqmapr!   r"   r#   signedc                 @    [         R                  " XUR                  S9$ )Nr   )r   _make_wrapper_subclassr   )clsr!   r"   r#   r$   shapes         r   __new__OptimState4bit.__new__/   s    ,,SMMr   c                 V   UR                   [        R                  L d   eUR                  S:X  d   eUR                  S:X  d   eUR                   [        R                  L d   eXl        X l        X0l        X@l        XPl	        UR                  5       S-  UR                  5       -  U l        g)a  Create quantized 4-bit optimizer state as proposed in https://arxiv.org/abs/2309.01507

Args
    codes: quantized and packed 4-bit data stored as uint8.
    scale: scale data for block-wise quantization.
    qmap: lookup table that maps between quantized value (code) and float value.
    signed: whether the tensor is signed or unsigned.
    shape: shape of original float tensor.

NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
Given `codes` and `scale`, `block_size` is calculated as `codes.numel() * 2 // scale.numel()`.
The extra `* 2` is because `codes` is 4-bit data packed in 8-bit storage.
r      N)dtyper   uint8ndimfloat32r!   r"   r#   r$   _shapenumel
block_size)selfr!   r"   r#   r$   r(   s         r   __init__OptimState4bit.__init__3   s     {{ekk)))zzQzzQzzU]]***

	++-!+u{{}<r   c                 J    U R                   U R                  U R                  /4$ N)tensor_attrsr$   r1   r4   s    r   __tensor_flatten__!OptimState4bit.__tensor_flatten__M   s       4;;"<<<r   Nc                 Z    U " / U R                    Vs/ s H  oQU   PM	     snQUQ76 $ s  snf r8   )r9   )r'   tensor_data_dicttensor_attributes
outer_sizeouter_stridenames         r   __tensor_unflatten__#OptimState4bit.__tensor_unflatten__P   s>      
141A1AB1At$1AB
EV
 	
Bs   (
c                    [         R                  " U R                  S-	  U R                  S-  /SS9n[        X R                  U R
                  5      nUb  UR                  U5      nUR                  U R                  5      $ )Nr      )dim)	r   stackr!   r	   r#   r"   toviewr1   )r4   output_dtyper!   
float_datas       r   
dequantizeOptimState4bit.dequantizeX   sc    TZZ1_djj6.ABK&uiiD
##|4Jt{{++r   r3   c                 p   [        U[        5      (       a  U4OUn[        R                  " U5      n[        R
                  " US-  [        R                  US9n[        R
                  " XS-  US9nU(       a
  [        5       O	[        5       n[        R                  " U[        R                  US9n	U " XgXU5      $ )Nr,   )r-   r   r   )
isinstanceintmathprodr   zerosr.   r   r   tensorr0   )
r'   r(   r$   r3   r   n_elemsr!   r"   	qmap_listr#   s
             r   rU   OptimState4bit.zeros_   s    &uc22))E"GqLFKG1&A)/O%5F5H	||IU]]6J5u55r   c                     U R                   R                   SU R                   SU R                   S[	        U R
                  5       SU R                   SU R                   S3$ )Nz(signed=z, block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r$   r3   tupler(   r   requires_gradr:   s    r   __repr__OptimState4bit.__repr__j   sb    ~~&&'x}M$//IZ [4::&'y=MdN`N`Maabd	
r   )r1   r3   r!   r#   r"   r$   )NNr8   )T   N)r]   
__module____qualname____firstlineno__r9   staticmethodr   boolr)   r5   r;   classmethodrC   rN   rR   rU   r`   __static_attributes__r   r   r   r   r   ,   s    -LNF N6 N N N N=f =V =6 =4 =4= PT
 
, 6$ 63 6 6
r   r   c                    US   nUS   n[        U[        5      (       a  [        U[        5      (       a  UR                  UR                  :X  a4  UR                  UR                  :X  a  UR                  UR                  :X  d   eUR
                  R                  UR
                  5        UR                  R                  UR                  5        U$ [        U[        5      (       a  [        UR                  S5      UR                  5      u  pg[        XdR                  5      nUR
                  R                  US S S2   S-  USS S2   -  5        UR                  R                  U5        U$ UR                  UR                  5       5        U$ )Nr   r   rG   r,   r   )rQ   r   r$   r3   r1   r!   copy_r"   r   rK   r
   r#   rN   )	functypesargskwargsdstsrc
scaled_srcr"   r!   s	            r   _rs   q   s,   
q'C
q'C#~&&:c>+J+JJJ#**$#..0

cjj(	
) 					"				" J 
C	(	(("s~~F
'
HH=		ssqE!$Q$K78		
 J 			#.."#Jr   c                 ,   UR                  SS 5      n[        US   R                  R                  US9US   R                  R                  US9US   R
                  R                  US9US   R                  US   R                  5      n[        XX55      $ )Nr   r   r   )	getr   r!   rJ   r"   r#   r$   r(   r   )rl   rm   rn   ro   r   outs         r   rs   rs      s     ZZ$'F
Q'Q'Qv&QQC 't6??r   c                     U Vs/ s H*  n[        U[        5      (       a  UR                  5       OUPM,     nnU " U0 UD6$ s  snf r8   )rQ   r   rN   )rl   rm   rn   ro   xs        r   rs   rs      sC    LPQDqjN;;ALLNBDDQ    Rs   1A c                    Uu  pE[        UR                  5      [        U5      :X  aA  [        UR                  UR                  UR
                  UR                  UR                  5      $ [        U5      S:X  aO  US   S:X  aF  [        UR                  UR                  UR
                  UR                  UR                  5       45      $ [        UR                  R                   S35      e)Nr   r   rG   z4 only supports .view() with same shape or shape=[-1])r^   r(   r   r!   r"   r#   r$   r1   lenr2   
ValueErrorr\   r]   )rl   rm   rn   ro   rx   r(   s         r   rs   rs      s    HAQWW~u%aggqww!((KK
5zQ58r>aggqww1779,OO
;;
  TU r   c                    US   n[        U[        5      (       d  [        S[        U5       35      eU " UR                  /USS  Q70 UD6nU " UR
                  /USS  Q70 UD6nUR                  S   UR                  5       -  UR                  R                  5       -  4UR                  SS  -   n[        XVUR                  R                  5       UR                  U5      $ )Nr   z%expecting a OptimState4bit but found r   )rQ   r   r{   typer!   r"   r1   r2   r#   cloner$   )rl   rm   rn   ro   rx   r!   r"   r(   s           r   rs   rs      s     	QAa((@a	JKK.48.v.E.48.v.E XXa[5;;=(AGGMMO;=LE %%HHr   c                     US   R                   R                  5       =(       aA    US   R                  R                  5       =(       a    US   R                  R                  5       $ )Nr   )r!   	is_pinnedr"   r#   )rl   rm   rn   ro   s       r   rs   rs      sO     	Q! 	%GMM##%	%GLL""$r   c                    US S u  pEpg[        U5      S:  a  US   OSnUS:w  a  [        S5      eUS:w  a  [        S5      eUR                  n	[        R                  " UR
                  SS  5      n
Xj-  U	-  S:w  d  Xz-  U	-  S:w  a"  [        SUR
                   SU	 SU S	U S
3	5      eUR                  Xj-  S-  Xz-  S-   nUR                  Xj-  U	-  Xz-  U	-   nUR
                  S   UR                  5       -  UR                  R                  5       -  4UR
                  SS  -   n[        XUR                  R                  5       UR                  U5      $ )Nr   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.r,   )rz   r{   r3   rS   rT   r(   r!   r"   r2   r   r#   r~   r$   )rl   rm   rn   ro   rx   rH   startendstepr3   strider!   r"   r(   s                 r   rs   rs      s^   bqAE$i!m47D axFGGqy>??JYYqwwqr{#F 	*$)clj-HA-M-aggY6Fzl S#WF3%q2
 	
 GGENa'#,!*;<EGGENj03<:3MNE WWQZ%++-'177==?:<qwwqr{JE%%HHr   )$rS   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   quant_utilsr   r	   r
   r   opsatenc10d_functional_c10d_functional	functoolsr   r   r   r   
implementsrk   defaultrs   _to_copylerpScalarrK   all_gather_into_tensorwait_tensordetachr   slicer   r   r   <module>r      s      0 D +  yy~~))++99--    1* * 1? ?B
& B
J 4::--. /4 4==001
@ 2
@ 499++,! -! 499,,- .  	..66//77##++$$,,
I
I$ 4>>112 3 4::,,-I .IB .! "r   