
    h                     H   S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJr  SSK	J
r
JrJrJr  \R                  R                  r\R                  R                   r\R                  R"                  rS SKJr  \" SS	9S
 5       r\" SS	9S 5       r " S S\5      r\R/                  \R0                  R2                  5      S 5       r\R/                  \R6                  R2                  5      S 5       r\R/                  \R8                  R:                  5      S 5       r\R/                  \R<                  R2                  5      S 5       r\R/                  \R>                  R2                  \R>                  R2                  \R@                  R2                  \R@                  R2                  \RB                  R2                  /5      S 5       r\R/                  \RD                  R2                  5      S 5       r\R/                  \RF                  R                  5      S 5       r\" \/5        g)    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensor   )create_dynamic_mapdequant_with_qmapquantize_8bit_with_qmapscale_tensor)	lru_cache)maxsizec                      [        SS9$ )NTsignedr        U/home/james-whalen/.local/lib/python3.13/site-packages/torchao/optim/subclass_8bit.pyget_qmap_signedr      s    T**r   c                      [        SS9$ )NFr   r   r   r   r   get_qmap_unsignedr   #   s    U++r   c            	           \ rS rSr/ SQr\S\S\S\S\4S j5       rS\S\S\S\4S jr	S	 r
\ SS j5       rSS jr\SS\S\4S jj5       rS rSrg
)OptimState8bit(   )codesscaleqmapr   r   r   r   c                 T    [         R                  " XR                  UR                  S9$ )Ndevice)r   _make_wrapper_subclassshaper    )clsr   r   r   r   s        r   __new__OptimState8bit.__new__+   s    ,,S++ellSSr   c                     UR                   [        R                  L d   eUR                  S:X  d   eUR                   [        R                  L d   eXl        X l        X0l        X@l        UR                  5       UR                  5       -  U l
        g)a  Create quantized 8-bit optimizer state as proposed in https://arxiv.org/abs/2110.02861

Args
    codes: quantized 8-bit data stored as uint8. Has the same shape as the original float tensor.
    scale: scale data for block-wise quantization.
    qmap: lookup table that maps between quantized value (code) and float value.
    signed: whether the tensor is signed or unsigned.

NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
Given `codes` and `scale`, `block_size` is calculated as `codes.numel() // scale.numel()`.
r   N)dtypetorchuint8ndimfloat32r   r   r   r   numel
block_size)selfr   r   r   r   s        r   __init__OptimState8bit.__init__/   sl     {{ekk)))zzQzzU]]***

	++-5;;=8r   c                 4    U R                   U R                  /4$ N)tensor_attrsr   r.   s    r   __tensor_flatten__!OptimState8bit.__tensor_flatten__E   s      4;;-//r   Nc                 Z    U " / U R                    Vs/ s H  oQU   PM	     snQUQ76 $ s  snf r2   )r3   )r#   tensor_data_dicttensor_attributes
outer_sizeouter_stridenames         r   __tensor_unflatten__#OptimState8bit.__tensor_unflatten__H   s>      
141A1AB1At$1AB
EV
 	
Bs   (
c                     [        U R                  U R                  U R                  5      nUb  UR	                  U5      nU$ r2   )r	   r   r   r   to)r.   output_dtype
float_datas      r   
dequantizeOptimState8bit.dequantizeP   s6    &tzz499djjI
##|4Jr   r-   c                 &   [         R                  " U[         R                  US9n[         R                  " UR                  5       U-  US9nU(       a
  [	        5       O	[        5       n[         R                  " U[         R                  US9nU " XVX5      $ )N)r'   r    r   )r(   zerosr)   r,   r   r   tensorr+   )	r#   r"   r   r-   r    r   r   	qmap_listr   s	            r   rF   OptimState8bit.zerosV   sf    EVDEKKMZ7G)/O%5F5H	||IU]]6J5..r   c                     U R                   R                   SU R                   SU R                   S[	        U R
                  5       SU R                   SU R                   S3$ )Nz(signed=z, block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r   r-   tupler"   r    requires_gradr4   s    r   __repr__OptimState8bit.__repr__^   sb    ~~&&'x}M$//IZ [4::&'y=MdN`N`Maabd	
r   )r-   r   r   r   r   )NNr2   )T   N)rM   
__module____qualname____firstlineno__r3   staticmethodr   boolr$   r/   r5   classmethodr=   rC   intrF   rP   __static_attributes__r   r   r   r   r   (   s    -LTF T6 T T T T9f 9V 96 94 9,0 PT
 
 /$ /3 / /
r   r   c                    US   nUS   n[        U[        5      (       a  [        U[        5      (       a  UR                  UR                  :X  a  UR                  UR                  :X  d   eUR                  R                  UR                  5        UR                  R                  UR                  5        U$ [        U[        5      (       ad  [        XTR                  5      u  pg[        XdR                  5      nUR                  R                  U5        UR                  R                  U5        U$ UR                  UR                  5       5        U$ )Nr   r   )
isinstancer   r   r-   r   copy_r   r   r
   r   rC   )	functypesargskwargsdstsrc
scaled_srcr   r   s	            r   _re   e   s    
q'C
q'C#~&&:c>+J+JzzSZZ'CNNcnn,LLL				"				" J 
C	(	((nn=
'
HH=				
 J 			#.."#Jr   c                    UR                  SS 5      n[        US   R                  R                  US9US   R                  R                  US9US   R
                  R                  US9US   R                  5      n[        XX55      $ )Nr    r   r   )getr   r   r@   r   r   r   r   )r^   r_   r`   ra   r    outs         r   re   re   |   s     ZZ$'F
Q'Q'Qv&Q	C 't6??r   c                     U Vs/ s H*  n[        U[        5      (       a  UR                  5       OUPM,     nnU " U0 UD6$ s  snf r2   )r\   r   rC   r^   r_   r`   ra   xs        r   re   re      sC    LPQDqjN;;ALLNBDDQ    Rs   1A c                     Uu  pE[        UR                  R                  U5      UR                  UR                  UR
                  5      $ r2   )r   r   viewr   r   r   )r^   r_   r`   ra   rk   r"   s         r   re   re      s3    HA!'',,u-qwwIIr   c           	          US   n[        U[        5      (       d  [        S[        U5       35      e[        U " UR                  /USS  Q70 UD6U " UR
                  /USS  Q70 UD6UR                  R                  5       UR                  5      $ )Nr   z%expecting a OptimState8bit but found r   )	r\   r   
ValueErrortyper   r   r   cloner   rj   s        r   re   re      s     	QAa((@a	JKK QWW*tABx*6*QWW*tABx*6*			 r   c                     US   R                   R                  5       =(       aA    US   R                  R                  5       =(       a    US   R                  R                  5       $ )Nr   )r   	is_pinnedr   r   )r^   r_   r`   ra   s       r   re   re      sO     	Q! 	%GMM##%	%GLL""$r   c                    US S u  pEpg[        U5      S:  a  US   OSnUS:w  a  [        S5      eUS:w  a  [        S5      eUR                  n	[        R                  " UR
                  SS  5      n
Xj-  U	-  S:w  d  Xz-  U	-  S:w  a"  [        SUR
                   SU	 SU S	U S
3	5      e[        UR                  Xg UR                  Xj-  U	-  Xz-  U	-   UR                  R                  5       UR                  5      $ )N   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.)lenro   r-   mathprodr"   r   r   r   r   rq   r   )r^   r_   r`   ra   rk   dimstartendstepr-   strides              r   re   re      s   bqAE$i!m47D axFGGqy>??JYYqwwqr{#F 	*$)clj-HA-M-aggY6Fzl S#WF3%q2
 	
 		*,s|z/IJ			 r   )$rx   r(   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   quant_utilsr   r	   r
   r   opsatenc10d_functional_c10d_functional	functoolsr   r   r   r   
implementsr]   defaultre   _to_copylerpScalarrm   all_gather_into_tensorwait_tensordetachrs   slicer   r   r   <module>r      s      0 D +  yy~~))++99--    1+ + 1, ,:
& :
z 4::--. /, 4==001	@ 2	@ 499++,! -! 499,,-J .J
  	..66//77##++$$,,

" 4>>112 3 4::,,- .< .! "r   