
    hFL                        % S SK JrJrJrJr  S SKrS SKJr  S SKJ	r	J
r
  S SKJrJr  \R                  R                  r\R                  R                   r\R                  R"                  r0 r\\\4   \S'      S%S\R(                  S\R(                  S	\R(                  S
\R(                  S\R*                  S\\R(                     S\\R(                     S\S\R(                  4S jjrS rS r\" \R4                  R6                  \R8                  R6                  \R:                  R6                  \R<                  R(                  \R>                  R@                  \RB                  R6                  /5      S&S j5       r"\" \RF                  R6                  /5      S&S j5       r$\" \RJ                  R6                  \RL                  RN                  /5      S&S j5       r(\" \RR                  R6                  /5      S&S j5       r*\" \RV                  R(                  /5      S&S j5       r,\" \RZ                  R6                  /5      S&S j5       r.\" \R^                  R`                  /5      S&S j5       r1S\	S\	4S jr2\" \Rf                  R6                  \Rh                  R6                  /5      S&S j5       r5\" \Rl                  R6                  /5      S&S j5       r7\" \Rp                  R6                  /5      S&S j5       r9\" \Rt                  R6                  /5      S&S  j5       r;\" \Rx                  R6                  \Rx                  R6                  /5      S&S! j5       r=\" \R|                  R6                  \R|                  R6                  /5      S&S" j5       r?\" \R                  R6                  /5      S&S# j5       rA\" \R                  R6                  /5      S&S$ j5       rCg)'    )AnyDictOptionalTupleN)tree_map)Float8TrainingTensorchoose_scaled_mm_config)is_row_majorpad_tensor_for_matmulFLOAT8_OPS_TABLEa_dataa_scaleb_datab_scaleoutput_dtypeoutput_scalebiasuse_fast_accumreturnc                    UR                  5       nUR                  5       n	Sn
UR                  U R                  S   S4:H  =(       a    UR                  SUR                  S   4:H  nU(       a-  U(       d&  X-  n
UR                  S5      nUR                  S5      n	UnU[        R                  [        R
                  4;   a  U(       a  [        R                  nSnU[        R
                  :X  a  UnSn[        R                  " U UUU	UUUUS9nU
b  X-  nUb  X-  nU[        R                  [        R
                  4;   a  U(       a  UR                  U5      nU$ )z
This is the unwrapped version of addmm_float8, which does not take in Float8TrainingTensors
as inputs. This is used to standardize the logic between subclassed and non subclassed
versions of the linear module.
Nr       )scale_ascale_br   scale_result	out_dtyper   )	
reciprocalshapenew_onestorchfloat16float32bfloat16
_scaled_mmto)r   r   r   r   r   r   r   r   a_inverse_scaleb_inverse_scalepost_inverse_scaleis_rowwise_scaling
orig_dtype	post_biasoutputs                  S/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/float8_ops.pyaddmm_float8_unwrappedr.      sM    ((*O((*O 6<<?A*>> 7==	QU D
 . ->)2226)2226 JemmU]]338J~~Iu}}$	!%	F %$emmU]]338J:&M    c                 J    [        UR                  5      S;   d
   U  S35       eg )N)r   r   z+ with axiswise scaling is not supported yet)lenr   )aten_opscales     r-   _assert_tensorwise_scaler4   ^   s1     	EKKF"? =>	? 	#r/   c                    ^  U 4S jnU$ )z(Register aten ops to the float8 op tablec                    > T H8  nU[         ;   a"  [        SU S[         U   R                   35      eU [         U'   M:     U $ )Nz
Float8 op z is already registered to )r   RuntimeError__name__)funcopaten_opss     r-   	decoratorimplements.<locals>.decoratori   sV    B%%" $>?OPR?S?\?\>]^  $(R   r/   r   )r;   r<   s   ` r-   
implementsr>   f   s     r/   c                     [        XS   R                  5        U " US   R                  /USS  Q70 UD6n[        UUS   R                  US   R                  US   R
                  US   R                  5      $ Nr   r   )r4   _scale_datar   _orig_dtype_linear_mm_config_gemm_input_role)r2   argskwargsnew_datas       r-   float8_desugar_oprI   u   sx     W1gnn5tAw}}:tABx:6:HQQQ!!Q   r/   c                     U " US   R                   /USS  Q70 UD6nU " US   R                  /USS  Q70 UD6n[        UUUS   R                  US   R                  US   R
                  5      $ r@   )rB   rA   r   rC   rD   rE   )r2   rF   rG   rH   	new_scales        r-    float8_desugar_data_and_scale_oprL      s     tAw}}:tABx:6:HQ<ab<V<IQQ!!Q   r/   c                    U " US   R                   /USS  Q70 UD6nUS   R                  R                  S:  a  U " US   R                  /USS  Q70 UD6nOUS   R                  nU [        R                  R
                  :X  a  [        XS   R                  5        US   R                  nUnUb  US:X  a  US:H    OUS:H    [        UUUS   R                  US   R                  US   R                  U5      $ )Nr   r   )rB   rA   ndimaten	transposeintr4   _axiswise_dimr   rC   rD   rE   )r2   rF   rG   rH   rK   old_axiswise_dimnew_axiswise_dims          r-   float8_transposerV      s     tAw}}:tABx:6:HAw~~QDGNN@T!"X@@	GNN	$..$$$ q'..9Aw,,'#q "!QQ!!Q   r/   c                 .   US   US   pCU[        UR                  R                  5      :X  am  U " US   R                  /USS  Q70 UD6n[        UUS   R                  US   R
                  US   R                  US   R                  US   R                  5      $ [        US   R                  R                  5      S:  a  [        XU5      $ UR                  n[        U5      S:X  a  US:X  ag  U " UR                  U40 UD6nSUS   /nU " UR                  U40 UD6n[        UUUR
                  UR                  UR                  UR                  5      $ US:X  d  U[        UR                  5      S-
  :X  a_  U " UR                  U40 UD6nUS   S/nU " UR                  U40 UD6nSn	[        UUUR
                  UR                  UR                  U	5      $ [        U  SUR                   SUR                  R                   SUR                   SU S	3
5      e)
Nr   r      rN   z# with axiswise scaling and t.shape z t._scale.shape z t._axiswise_dim z new_shape z is not supported yet.)listrB   r   r   rA   rC   rD   rE   rS   r1   rI   AssertionError)
r2   rF   rG   t	new_shaperH   axiswise_dimnew_scale_shaperK   rU   s
             r-   float8_viewr_      sI   7DGy D''47==>48>v>#GNNGG%%G$$G!!
 	
 47>> 1$ 77 ??L
9~1qww	<V<H )B-0O/DVDI'##""  R<CL14D#Eqww	<V<H(|Q/O/DVDI!'##""   )6qwwi?OPQPXPXP^P^O__pqr  rA  rA  qB  BM  NW  MX  Xn  	o r/   c                    ^ U " TS   R                   /TSS  Q70 UD6n[        U TS   R                  5        U4S jn[        XC5      n[	        U5      $ )Nr   r   c                    > [        U TS   R                  TS   R                  TS   R                  TS   R                  5      $ )Nr   )r   rA   rC   rD   rE   )datarF   s    r-   make_float8!float8_split.<locals>.make_float8   sE    #GNNGG%%G$$
 	
r/   )rB   r4   rA   maprY   )r2   rF   rG   new_data_tensorsrc   outs    `    r-   float8_splitrh      sS    tAw}}BtABxB6BWd1gnn5
 k
,C9r/   c                    US   nUS   R                   nUS   R                  nUS   R                  nUS   R                  R                  nUS   R
                  n/ n	U H  n
[        U
[        5      (       d   S5       eU
R                   U:X  d   S5       eU
R                  UL d   S5       eU
R                  UL d   S5       eU
R                  R                  U:X  d   S5       eU
R
                  UL d   S5       e[        X
R                  5        U	R                  U
R                  R                  [        R                  5      5        M     U " U	/USS  Q70 UD6nUR                  U5      n[        XXFU5      $ )	Nr   z7Expecting all chunks to be of type Float8TrainingTensorz,Expecting all chunks to be of the same dtypezCExpecting all chunks to have thee same scale as a result of a splitzGExpecting all chunks to have thee same mm config as a result of a splitzCExpecting all chunks to be of the same dtype as a result of a splitzLExpecting all chunks to have the same gemm_input_role as a result of a splitr   )rC   rA   rD   rB   dtyperE   
isinstancer   r4   appendviewr    uint8)r2   rF   rG   chunked_tensorsr*   r3   	mm_config	fp8_dtypegemm_input_role
chunk_datachunkrH   s               r-   
float8_catru     s   377O #//JA%%E"44I"((..I%a(99OJ %!566 	
E	
6   J. 	
:	
. ||u$ 	
Q	
$ &&)3 	
U	
3 {{  I- 	
Q	
- %%8 	
Z	
8 	!,,7%++**5;;78) !, z7DH77H}}Y'HXXr/   c                 t    [        XS   R                  5        S n[        X15      n[        X25      nU " U0 UD6$ )a  Be careful with this function, this is a "fallback" op that
casts the output of the op to the original precision. And performs the op.

We currently need this to support the backward for admmm bias.
"addmm" -> out
"hp_gradBias" <-"sum" <- "identity" <- gradOut <- "hp_gradOut"
r   c                 P    [        U [        5      (       a  U R                  5       $ U $ N)rk   r   to_original_precision)xs    r-   unwrap!float8_cast_up_op.<locals>.unwrap8  s$    a-..**,,r/   )r4   rA   r   )r2   rF   rG   r{   new_args
new_kwargss         r-   float8_cast_up_opr   -  s?     W1gnn5
 %H&)JH+
++r/   abc                     U R                   nU R                  nUR                   n[        U R                  U R                  UR                  UR                  5      nUR
                  (       a  U R                   R                  S5      UR                   R                  S5      :X  d?   SU R                   R                  S5       SUR                   R                  S5       35       e[        USS9n[        USS9n[        UR                  5       5      (       d  UR                  5       n[        UR                  5       5      (       a,  UR                  5       R                  5       R                  5       nUR                  nU R                  c<  UR                  b/  UR                  UR                  S   5      R                  SS5      nOHU R                  b;  UR                  c.  UR                  UR                  S   5      R                  SS5      nX#XF4$ )Nr   r   z"Inner dims must match for mm, got z and )dimsrN   )rB   rA   r	   rE   rD   pad_inner_dimsizer   r
   stride
contiguousr[   rS   repeatr   reshape)r   r   r   r   r   scaled_mm_configr   s          r-   preprocess_addmmr   B  s   WWFhhGWWF.					 %%ww||A!'',,q/1 	
0a0Aqww||TUFWX	
1 'vA6&vA6((""$FMMO$$&&(**,hhG 	1??#>..a199"a@	
	$)@..a199!R@F++r/   c                    US   nUS   n[        U[        5      (       a  [        U[        5      (       d)   SR                  [        U5      [        U5      5      5       e[	        X45      u  pVpxUR
                  n	[        UR                  UR                  UR                  UR                  5      n
U
R                  (       ap  [        R                  " UR                  R                  5       UR                  -  UR                  R                  5       UR                  -  5      R                  U	5      $ [!        UUUUU	S S U
R"                  S9nU$ )Nr   r   zFExpecting  both Float8TrainingTensor for mm inputs but found {} and {}r   r   r   )rk   r   formattyper   rC   r	   rE   rD   emulater    mmrB   floatrA   r%   r.   r   )r2   rF   rG   r   r   r   r   r   r   r   r   
tensor_outs               r-   	float8_mmr   k  s+   QAQAa-..:	4 4 OVVQa 
 (8'=$FV==L.					 xx!((2AGGMMOahh4NORR
 	
 ('66	J r/   c                    [        US   [        R                  5      (       a0  [        US   [        5      (       a  [        US   [        5      (       d   eUS   nUS   nUS   n[	        XE5      u  pgpUR
                  n
UR                  U
:X  d   S5       e[        UR                  UR                  UR                  UR                  5      nUR                  (       at  [        R                  " UR                  R                  5       UR                  -  UR                  R                  5       UR                  -  5      R                  U
5      nX-   $ [!        UUUU	U
S UUR"                  S9nU$ )Nr   r   rX   z"bias dtype must match output dtyper   )rk   r    Tensorr   r   rC   rj   r	   rE   rD   r   r   rB   r   rA   r%   r.   r   )r2   rF   rG   r   r   r   r   r   r   r   r   r   rg   r   s                 r-   float8_addmmr     sL    	47ELL))tAw 455tAw 455	6 7DQAQA'7'=$FV==L::%K'KK%.					 hhqww}}1177==?QXX3MNQQ
 z''66	J r/   c                 p    [        XS   R                  5        US   R                  US   R                  :H  $ r@   )r4   rA   r   r2   rF   rG   s      r-   float8_is_same_sizer     s-    W1gnn57==DGMM))r/   c           	      r   [        US   [        5      (       d   e[        U5      S:X  a  SU;   d   S5       eUS   [        R                  [        R
                  1;   d   S5       e[        US   R                  US   R                  US   US   R                  US   R                  US   R                  5      $ )z{This gets called when running matmul under autocast
when the input is a Float8TrainingTensor, presenting as a fp32
tensor.
r   r   rj   z%Only support dtype kwarg for autocastzKOnly support floating point conversion for autocast w/ Float8TrainingTensor)rk   r   r1   r    r!   r#   rB   rA   rD   rE   rS   r   s      r-   autocast_to_copyr     s     d1g34444v;!6 1 /1 '?  U UU   QQwQ!!Q  Q r/   c                 P   [        XS   R                  5        US   n[        U[        5      (       d   S[	        U5       35       eUR
                  nUR                  5       nU " U/USS Q70 UD6n[        UUR                  UR                  UR                  UR                  5      $ )z#
override funcol with FP8 handling
r   z9expecting a Float8TrainingTensor for allgather but found r   N)
r4   rA   rk   r   r   rB   r   rC   rD   rE   r2   rF   rG   	fp8_inputfp8_datafp8_outs         r-   allgather_fp8r     s     W1gnn5QIi!566 
CDOCTU6 H""$Hh4ab4V4G##"" r/   c                    [        XS   R                  5        US   n[        U[        5      (       d   eUR                  nU " U/USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ r@   )r4   rA   rk   r   rB   rC   rD   rE   r   s         r-   wait_tensor_fp8r     s    W1gnn5QIi!56666Hh4ab4V4G##"" r/   c                    US   nUS   n[        U[        5      (       d   e[        U[        5      (       d   e[        X1S   R                  5        UR                  UR                  :X  d   eUR                  UR                  :X  d   eUR
                  UR
                  :X  d   eUR                  nUR                  nU " XQS   U/USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ )Nr   rX   r      )	rk   r   r4   rA   rj   rC   rB   rD   rE   )r2   rF   rG   fp8_self
fp8_valuesr   fp8_values_datar   s           r-   index_put_fp8r     s    AwHaJh 45555j"67777XAw~~6??j/////>>Z-----:#9#9999~~H &&OhQN48NvNG""!! r/   c                    US   nUS   n[        U[        5      (       dI  [        U[        5      (       a4  UR                  5       n[        XR                  5        U " X5/USS  Q70 UD6$ [        U[        5      (       Ga?  [        U[        5      (       Ga)  [        XR                  5        UR
                  UR
                  :X  d   S5       eUR                  UR                  :X  d   S5       eUR                  UR                  :X  d   S5       eUR                  R                  UR                  R                  :X  d   S5       eUR                  UR                  :X  d   S5       eU " UR                  UR                  /USS  Q70 UD6n[        UUR                  UR
                  UR                  UR                  5      $ [        S	5      e)
Nr   r   rX   z<Expecting both Float8TrainingTensors to be of the same dtypez<Expecting both Float8TrainingTensors to have thee same scalez@Expecting both Float8TrainingTensors to have thee same mm configz=Expecting both Float8TrainingTensors to be of the same dtypetzEExpecting both Float8TrainingTensors to have the same gemm_input_rolez7Unsupported semantics for copy_ in Float8TrainingTensor)rk   r   ry   r4   rA   rC   rD   rB   rj   rE   r7   )r2   rF   rG   selfsrcsrc_hpr   s          r-   copy_fp8r     s    7D
q'Cd011j!7 7 **, **5t9d12h9&99	D.	/	/J!5 5 	!**53??2 	
J	
2 {{cjj( 	
J	
( %%)>)>> 	
N	
> zz399??2 	
K	
2 $$(<(<< 	
S	
< $**ciiE$qr(EfE#KK""!!
 	
 TUUr/   )NNFrx   )Dtypingr   r   r   r   r    torch.utils._pytreer   %torchao.float8.float8_training_tensorr   r	   torchao.float8.float8_utilsr
   r   opsrP   c10d_functional_c10d_functionalr   __annotations__r   rj   boolr.   r4   r>   _unsafe_viewdefault
as_stridedcloneslicefill_Scalarr   rI   detachrL   r[   rQ   rR   rV   rm   r_   splitrh   catru   sumdim_IntListr   r   r   matmulr   addmmr   is_same_sizer   _to_copyr   all_gather_into_tensorr   wait_tensorr   
index_put_r   copy_r   r   r/   r-   <module>r      s   . -  ( Lyy~~))++99-- #% $sCx. % ,0#' ?LL?\\? LL? \\	?
 ++? 5<<(? 5<<
 ? ? \\?D? !!





				 
	
	 8 TYY 3 !3l TZZ  !$ TXX!Y  !YH TXX!!"#, $,(&,, &,1E &,R TWW__dkk1123 4D TZZ !! "!H T&&'(* )*
 T]]""#$ %. ..66//77, _((002B2N2N2V2VWX Y  T__$$%& ', TZZ !-V "-Vr/   