
    hj;                        S SK JrJrJr  S SKrS SKrSrSrSSSSS	S
.SSSSSS	S.SSSSS	S
.SSSSS.S.rS+S\\	   4S jjr
Sr S,S\	S\\	   S\\	   S\\\R                  \4      4S  jjr S+S!\R                  S"\R                  S#\R                  S\\	   S\R                  4
S$ jjrS!\R                  S"\R                  S#\R                  S\\	   S\\	   S\\	   4S% jr S+S\\	   S\\	   S&\S\\	   4S' jjr S,S\	S\\	   S\\\R                  \4      4S( jjr S+S\\	   S\\	   4S) jjrS!\R                  S"\R                  S#\R                  S\\	   S\\	   4
S* jrg)-    )ListOptionalUnionN      g vCg 
`Cg   .YvBg(\?gq=
ףp?)bf16_peak_topsfp8_peak_topspeak_mem_bw_bytes_secpct_achievable_gemm_topspct_achievable_mem_bwg sCg s/Cg s?Cg   B)r   r	   fp4_peak_topsr
   r   r   g ֒Cg ֒"Cg   xHBg  @Bg  @Bg  @Cg  BwC)r   r	   r   r
   )zNVIDIA H100zNVIDIA B200zAMD Instinct MI300XzNVIDIA GeForce RTX 5090gpu_namec                 X    U c  [         R                  R                  S5      n [        U    $ )Nr   )torchcudaget_device_namegpu_name_to_specs)r   s    a/home/james-whalen/.local/lib/python3.13/site-packages/torchao/testing/training/roofline_utils.py	get_specsr   D   s'    ::--a0X&&    g>tensor_rolefloat8_recipe_namemx_recipe_namereturnc                 V   X-  nSnUS:X  ae  US:X  a/  U(       a  Sn	O	[         U-  n	[         U-  [        U-  -   n
U
nU	SX/nGOU(       a  Sn	O	[         U-  n	[         U-  S[        -  U-  -   n
U	SU
/nGOeUS:X  a  US:X  aL  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  S-  n[         U-  [        U-  -   n
XU
/nGOU(       a  S[        U-  -   [         U-  -   n	O[         U-  [        U-  -   [         U-  -   n	[         U-  [        U-  -   nX/nOUS:X  ak  US;   a+  U(       a  S[        U-  -   n	O[         U-  [        U-  -   n	U	/nOUS:X  a-  [         U-  [        U-  -   n	Sn[         U-  [        U-  -   n
XU
/nOQ S	5       eUS
;   d   S	5       eU(       a  S[        U-  -   n	O[         U-  [        U-  -   n	[         U-  [        U-  -   nX/nU Vs/ s H  nXS   -  U S   -  PM     nnU Vs/ s H  n[        R                  " U[        5      PM      nnU$ s  snf s  snf )a  
Calculates the roofline estimate of casting one of the gemm inputs
(input, weight or grad_output) to float8 in fwd+bwd.

Inputs: dim0 and dim1 (shape), tensor_role (input|weight|grad_output), recipe names
Outputs: list of read/write traffic overhead in seconds, one for each kernel
N
tensorwiseweightr   r   rowwiserowwise_with_gw_hp)inputgrad_outputunsupportedmxfp8_emulatedmxfp8_cublasmxfp8_cublas_rceilr
   r   BYTES_PER_EL_BF16BYTES_PER_EL_FLOAT8sympyMaxKERNEL_LAUNCH_OVERHEAD_SEC)specsdim0dim1r   r   r   fuse_with_prevnumel	res_byteskernel_1_rwkernel_3_rwkernel_4_rwkernel_2_rwxres_ss                  r    get_tensor_memory_traffic_ovhd_sr9   P   s   " KEI\)("  0%7+e36IE6QQK%K$aBI  0%7+e3a:M6MPU6UUK$a5I	y	((" "5"==/%7:MPU:UU+e3a7K+e36IE6QQK$;?I  +e336G%6OO 
 &-)E12'%/0 
 ,e36IE6QQK$2I	3	322 "5"==/%7:MPU:UU$IH$
 ,e36IE6QQKK+e36IE6QQK$;?I'-'5  "
 
 	 		 
 1E99K+e36IE6QQK'%/2E2MM .	
 A 	
)**U3J-KK 
  @EEu!UYYq45uEEL Fs   H!8%H&MKNc                    [        U5      nSU -  U-  U-  nU[        R                  L a  US   nO1U[        R                  [        R                  4;   a  US   nO S5       eXx-  US   -  n	X-  X-  -   n
X-  nUbB  US;   d   S5       eU[        R                  [        R                  4;   d   S5       eSnX-  nX-   n
U[        R                  L a  U
[
        -  U[
        -  -   nO?U[        R                  [        R                  4;   a  U
[        -  U[
        -  -   nO S5       eXS   -  US	   -  n[        R                  " X[        5      $ )
Nr   r   r	   r"   r   r#       r
   r   )
r   r   bfloat16float8_e4m3fnfloat8_e5m2r(   r)   r*   r+   r,   )r:   r;   r<   dtyper   r   r-   gemm_ops	peak_topscompute_gemm_time_s	num_reads
num_writes
block_sizenum_scale_readsbytes_rwmem_gemm_time_ss                   r   get_individual_gemm_time_sympyrL      s{    hE1uqy1}H*+		5&&(9(9:	:/*	#m#u".7Q1RR IJ! "
 
 	 		 

 ,,e.?.?@@O-O@
#1/	00:@Q3QQ	5&&(9(9:	:22ZBS5SS#m#u011E:Q4RR  99(;UVVr   c                     UUUpnUS:X  a  [         R                  n	[        XX'XV5      n
[        XXXV5      n[        XX)XV5      nX-   U-   nU$ )Nr   )r   r?   rL   )r:   r;   r<   rB   r   r   r   gemm_dtype_inputgemm_dtype_grad_inputgemm_dtype_grad_weightgemm_output_time_sgemm_grad_input_time_sgemm_grad_weight_time_stotals                 r   get_gemm_time_sympyrU     sz     	 .D
 11!&7	a> <	a =	a 7:QQELr   enable_fusion_modelingc           
          [        U5      n[        UU USUUUS9n[        UUUSUUSS9n	[        UU USUUUS9n
[        / UQU	QU
Q5      nU$ )Nr    )r   r   r   r0   r   Fr!   )r   r9   sum)r:   r;   r<   r   r   rV   r   r-   fwd_fp8_input_memfwd_fp8_weight_memgi_fp8_grad_output_memress               r   get_float8_mem_sympyr]   -  s     hE 9		-%- :		-% >		!-%- P!P$6P9OP
QCJr   c                 $   US:X  d   S5       eUSL d   S5       eX-  nSnUS:X  d   e[         U-  [        U-  -   nU/nU V	s/ s H  n	XS   -  U S   -  PM     n
n	U
 V	s/ s H  n	[        R                  " U	[        5      PM      n
n	U
$ s  sn	f s  sn	f )zw
Inference version of `get_tensor_memory_traffic_ovhd_s`.
The only thing happening here is we quantize the activation.
r   r"   FNr    r
   r   r'   )r-   r.   r/   r   r   r0   r1   r2   r3   r7   r8   s              r   *get_inference_tensor_memory_traffic_ovhd_sr_   c  s     *9M9*U"1M1" KEI'!!! $e+.AE.IIKI A 	
)**U3J-KK 
  @EEu!UYYq45uEEL Fs   B%Bc           	      R    [        U5      n[        UU USUSS9n[        / UQ5      nU$ )Nr    F)r   r   r0   )r   r_   rX   )r:   r;   r<   r   r   r-   rY   r\   s           r   get_inference_float8_mem_sympyra     sF     hE C		- "!"
#CJr   c                 B    US:X  d
  Ub   S5       e[        XX#S U5      nU$ )Nr   r"   )rL   )r:   r;   r<   rB   r   r   rQ   s          r   get_inference_gemm_time_sympyrc     s6     *.@.HW-WH 8ahWr   )N)F)typingr   r   r   r*   r   r)   r(   r   strr   r,   Symbolfloatr9   rL   rU   boolr]   r_   ra   rc    r   r   <module>rj      s   ) (    
 ! !'$( "& " "( %) "&$ " !' %) "& # !) U2 j' ' +  F 	F
 !F SMF 
%e#
$%F^ #/W||/W||/W ||/W sm/W \\/Wd|||| ||
 ! SM smR #3 !	3
 SM3 !3 sm3x % 	%
 !% 
%e#
$%%Z # !	
 sm,|||| ||
 ! smr   