
    h                        S SK r S SKrS SKJr  S SKJr  S SKJr  S SKJr  \R                  R                  r
\R                  R                  r\R                  R                  r\R                  rS\S\4S jr " S	 S
\5      r\R%                  \
R&                  R(                  5      S 5       r\R%                  \
R,                  R(                  5      S 5       r\R%                  \
R.                  R0                  5      S 5       r\R%                  \
R2                  R(                  5      S 5       r\R%                  \R4                  R(                  \R4                  R(                  \R6                  R(                  \R6                  R(                  \
R8                  R(                  /5      S 5       r\R%                  \
R:                  R(                  5      S 5       r\R%                  \
R<                  R                  5      S 5       r\" \/5        g)    N)Tensor)add_safe_globals)return_and_correct_aliasing)TorchAOBaseTensorinput
block_sizec                 z   U R                   nU R                  SU5      n U R                  5       R                  S5      R	                  S5      [
        R                  " [        5      R                  -  nXR                  SS5      -  n U R                  [        5      R                  S5      nUR                  U5      U4$ )Ng-q=   )
shapeviewabsamaxcliptorchfinfoDTYPEmaxto)r   r   r   scalecodess        T/home/james-whalen/.local/lib/python3.13/site-packages/torchao/optim/subclass_fp8.pyquantize_fp8r      s    KKEJJr:&EIIKR %%e,u{{5/A/E/EEEJJr1%%EHHUO  $E::ee##    c                       \ rS rSrSS/r\S\S\4S j5       rS\S\4S jrS r	\
 SS j5       rSS	 jr\
SS
\4S jj5       rS rSrg)OptimStateFp8"   r   r   c                 T    [         R                  " XR                  UR                  S9$ )Ndevice)r   _make_wrapper_subclassr   r    )clsr   r   s      r   __new__OptimStateFp8.__new__%   s    ,,S++ellSSr   c                     UR                   [        L d   eUR                  S:X  d   eXl        X l        UR                  5       UR                  5       -  U l        g)a  Create quantized FP8 optimizer state.

Args
    codes: quantized FP8 E4M3FN data. Has the same shape as the original float tensor.
    scale: scale data for block-wise quantization.

NOTE: To get block-wise scale, the original float tensor is first reshape to (-1, block_size).
Thus, the last dimension of the original float tensor is not necessarily divisible by block size.
Given `codes` and `scale`, `block_size` is calculated as `codes.numel() // scale.numel()`.
r   N)dtyper   ndimr   r   numelr   )selfr   r   s      r   __init__OptimStateFp8.__init__)   sH     {{e###zzQ

++-5;;=8r   c                     U R                   / 4$ Ntensor_attrsr)   s    r   __tensor_flatten__ OptimStateFp8.__tensor_flatten__:   s      "$$r   Nc                 Z    U " / U R                    Vs/ s H  oQU   PM	     snQUQ76 $ s  snf r-   r.   )r"   tensor_data_dicttensor_attributes
outer_sizeouter_stridenames         r   __tensor_unflatten__"OptimStateFp8.__tensor_unflatten__=   s>      
141A1AB1At$1AB
EV
 	
Bs   (
c                    U R                   R                  5       nUR                  SU R                  5      U R                  R                  SS5      -  nUb  UR                  U5      nUR                  U R                   R                  5      $ )Nr
   r   )r   floatr   r   r   r   r   )r)   output_dtype
float_datas      r   
dequantizeOptimStateFp8.dequantizeE   sj    ZZ%%'
__R9DJJOOBPQ<RR
##|4Jtzz//00r   r   c                     [         R                  " U[        US9n[         R                  " UR                  5       U-  US9nU " XE5      $ )N)r&   r    r   )r   zerosr   r(   )r"   r   r   r    r   r   s         r   rB   OptimStateFp8.zerosM   s:    Ev>EKKMZ7G5  r   c           
          U R                   R                   SU R                   S[        U R                  5       SU R
                   SU R                   S3
$ )Nz(block_size=z, shape=z	, device=z, requires_grad=))	__class____name__r   tupler   r    requires_gradr0   s    r   __repr__OptimStateFp8.__repr__S   sX    ~~&&'|DOO3D E4::&'y=MdN`N`Maabd	
r   )r   r   r   )NNr-   )   N)rG   
__module____qualname____firstlineno__r/   staticmethodr   r#   r*   r1   classmethodr9   r?   intrB   rJ   __static_attributes__ r   r   r   r   "   s    W%LTF T6 T T9f 9V 9"% PT
 
1 !c ! !

r   r   c                 D   US   nUS   n[        U[        5      (       a}  [        U[        5      (       ah  UR                  UR                  :X  d   eUR                  R	                  UR                  5        UR
                  R	                  UR
                  5        U$ [        U[        5      (       aO  [        XTR                  5      u  pgUR                  R	                  U5        UR
                  R	                  U5        U$ UR	                  UR                  5       5        U$ )Nr   r   )
isinstancer   r   r   copy_r   r   r?   )functypesargskwargsdstsrcr   r   s           r   _r^   Z   s    
q'C
q'C#}%%*S-*H*H~~///				"				" J 
C	'	'#C8				
 J 			#.."#Jr   c                     UR                  SS 5      n[        US   R                  R                  US9US   R                  R                  US95      n[        XX55      $ )Nr    r   r   )getr   r   r   r   r   )rX   rY   rZ   r[   r    outs         r   r^   r^   o   s_     ZZ$'F
Q'Q'C 't6??r   c                     U Vs/ s H*  n[        U[        5      (       a  UR                  5       OUPM,     nnU " U0 UD6$ s  snf r-   )rV   r   r?   rX   rY   rZ   r[   xs        r   r^   r^   z   sC    KOP4ajM::ALLNA4DP    Qs   1A c                 h    Uu  pE[        UR                  R                  U5      UR                  5      $ r-   )r   r   r   r   )rX   rY   rZ   r[   rd   r   s         r   r^   r^      s'    HAe,agg66r   c           	          US   n[        U[        5      (       d  [        S[        U5       35      e[        U " UR                  /USS  Q70 UD6U " UR
                  /USS  Q70 UD65      $ )Nr   z$expecting a OptimStateFp8 but found r   )rV   r   
ValueErrortyper   r   rc   s        r   r^   r^      sx     	QAa''?QyIJJ QWW*tABx*6*QWW*tABx*6* r   c                     US   R                   R                  5       =(       a    US   R                  R                  5       $ )Nr   )r   	is_pinnedr   )rX   rY   rZ   r[   s       r   r^   r^      s/    7==""$Ba)@)@)BBr   c                    US S u  pEpg[        U5      S:  a  US   OSnUS:w  a  [        S5      eUS:w  a  [        S5      eUR                  n	[        R                  " UR
                  SS  5      n
Xj-  U	-  S:w  d  Xz-  U	-  S:w  a"  [        SUR
                   SU	 SU S	U S
3	5      e[        UR                  Xg UR                  Xj-  U	-  Xz-  U	-   5      $ )N   r   r   z+Only support aten.slice along the first dimz#Only support aten.slice with step=1zInvalid start or end for shape=z and block_size=zD. Make sure start and end align with block boundary. Received start=z, end=.)	lenrg   r   mathprodr   r   r   r   )rX   rY   rZ   r[   rd   dimstartendstepr   strides              r   r^   r^      s    bqAE$i!m47D axFGGqy>??JYYqwwqr{#F 	*$)clj-HA-M-aggY6Fzl S#WF3%q2
 	
 		*,s|z/IJ r   )ro   r   r   torch.serializationr   torch.utils._python_dispatchr   torchao.utilsr   opsatenc10d_functional_c10d_functionalfloat8_e4m3fnr   rR   r   r   
implementsrW   defaultr^   _to_copylerpScalarr   all_gather_into_tensorwait_tensordetachrj   slicerT   r   r   <module>r      s      0 D +yy~~))++99-- $ $C $5
% 5
p $**,,- .( $--//0@ 1@ $))**+! ,! $))++,7 -7
  	..66//77##++$$,,
	
	 $..001C 2C
 $**++, -8 - !r   