
    h'              
       r   S SK r S SKJrJrJrJrJr  S SKrS SKJ	r	  S SK
Js  Jr  S SKJr  S SKJr  S SKJrJrJrJr  S SKJr  \R2                  " 5       S\	R4                  SS4S	 j5       r\R8                  R:                  R<                  R>                  \R8                  R:                  R@                  R>                  \R8                  R:                  RB                  RD                  \R8                  R:                  RF                  R>                  \R8                  R:                  RH                  R>                  \R8                  R:                  RJ                  R>                  \R8                  R:                  RL                  R>                  \R8                  R:                  RN                  R>                  \R8                  R:                  RP                  RD                  \R8                  R:                  RR                  R>                  1
r* " S
 S\RD                  5      r+\RX                  R[                  \+/5        g)    N)AnyListOptionalSetTuple)suggest_memory_format)hp_tensor_to_float8_dynamic)Float8TrainingTensorGemmInputRoleLinearMMConfighp_tensor_and_scale_to_float8)EPSmodulereturnc                    SSK Jn  SSKJn  [        R
                  R                  S5        U R                  5        Vs/ s H_  n[        X25      (       d  M  [        UR                  U5      (       d  M2  [        UR                  R                  [        5      (       d  M]  UPMa     nnU Vs/ s H  oUR                  PM     nnU Vs1 s H#  nUR                  R                  R                  iM%     nnU(       d  gUu  n[        R                  " U[         R"                  S9n	[        R$                  " U	5      n
[        R&                  " U
[(        5      n
U
R*                  nU
R-                  [        R.                  5      n
[        R0                  " U5      R2                  U
-  nU[        R4                  L aA  [        R&                  " U[        R0                  " [        R4                  5      R2                  S9nUR7                  5       R-                  [        R8                  5      n[;        U5       H"  u  pX   UR                  R                  l        M$     gs  snf s  snf s  snf )a)  
Calculate scale dynamically for all float8 parameters.
This should be run after the optimizer step. It performs a single all-reduce to compute the
scales for all float8 weights.
Example usage:
    model(input).sum().backward()
    optim.step()
    precompute_float8_dynamic_scale_for_fsdp(model)
r   DTensor)Float8Linearz7torchao.float8.precompute_float8_dynamic_scale_for_fsdpN)ord)max)torch.distributed._tensorr   torchao.float8.float8_linearr   torch_C_log_api_usage_oncemodules
isinstanceweight_local_tensor!WeightWithDynamicFloat8CastTensorconfigcast_config_weighttarget_dtype_foreach_normmathinfstackclampr   dtypetofloat64finfor   float16to_localfloat32	enumerate_precomputed_scale)r   r   r   mfloat8_linearsfloat8_linearweightstarget_dtypesr#   max_weightsamax_tensororigin_dtypescale_tensorlocal_scale_tensoris                  S/home/james-whalen/.local/lib/python3.13/site-packages/torchao/float8/fsdp_utils.py(precompute_float8_dynamic_scale_for_fsdpr>      s    29	HH  A !*!Aa& 	
 qxx) 	
 qxx--/PQ	 	
!  * IWW}22GW ,'+M 	//<<+  '
 #O\ %%g488<K++k*K ++k3/K $$L../K;;|,00;>Lu}}${{<U[[5O5S5ST%..033EMMB%n5@R@U**= 6A* X's#   II3'II*I*Ic                      \ rS rSr\ SS\R                  S\S\R                  S\	\R                     4S jj5       r
 SS\R                  S\S\R                  S\	\R                     4S jjr\SS	 j5       rS
 r\S 5       rS rS rSS.S\\R                  S4   S\S\R                  S\	\R                     4S jjrSrg)r       Ntensorlinear_mm_configr)   precomputed_scalec                     [         R                  R                  U UR                  5       UR	                  5       UR                  5       [        U5      UR                  UR                  UR                  UR                  5       UR                  S9
$ )N)stridesstorage_offsetmemory_formatr)   layoutdevice
pin_memoryrequires_grad)r   Tensor_make_wrapper_subclasssizestriderF   r   r)   rH   rI   	is_pinnedrK   )clsrA   rB   r)   rC   s        r=   __new__)WeightWithDynamicFloat8CastTensor.__new__   ss     ||22KKMMMO!002/7,,===='') .. 3 
 	
    c                 4    Xl         X l        X0l        X@l        g N)_tensor_linear_mm_config_dtyper1   )selfrA   rB   r)   rC   s        r=   __init__*WeightWithDynamicFloat8CastTensor.__init__   s     !1 #4rT   c                   ^^ U[         R                  R                  R                  R                  :X  a4  [        US   R                  US   R                  US   R                  5      $ S mS mUU4S jn[        R                  " [
        XSU=(       d    0 45      u  p4U" U0 UD6nU[        ;  a  U$ [        R                  " [         R                  UU4S jU5      $ )Nr   c                    > Tc  U R                   mOU R                   T:X  d   eTc  U R                  mU R                  $ U R                  T:X  d   eU R                  $ rV   )rX   rY   rW   )tr)   	mm_configs    r=   unwrapDWeightWithDynamicFloat8CastTensor.__torch_dispatch__.<locals>.unwrap   s`     //	**i777} 99 xx5(((99rT   c                    > [        U TT5      $ rV   )r    )xr)   r`   s    r=   <lambda>FWeightWithDynamicFloat8CastTensor.__torch_dispatch__.<locals>.<lambda>   s    79eLrT   )r   opsatendetachdefaultr    rW   rX   rY   pytreetree_map_only_ops_to_preserve_subclassrL   )	rQ   functypesargskwargsra   outr)   r`   s	          @@r=   __torch_dispatch__4WeightWithDynamicFloat8CastTensor.__torch_dispatch__   s    599>>((0004Qa!:!:DGNN  /3	'+	 ++-vfl7K
 D#F#00J##LLL
 	
rT   c                     S/nU R                   (       a  UR                  S5        XR                  U R                  S.4$ )NrW   r1   )r`   r)   )r1   appendrX   rY   )rZ   tensorss     r=   __tensor_flatten__4WeightWithDynamicFloat8CastTensor.__tensor_flatten__   s7    +""NN/0&<&<t{{SSSrT   c           
      F    [        U S   US   US   [        U SS 5      5      $ )NrW   r`   r)   r1   )r    getattr)inner_tensorsflatten_spec
outer_sizeouter_strides       r=   __tensor_unflatten__6WeightWithDynamicFloat8CastTensor.__tensor_unflatten__   s4    0)$%!M#7>	
 	
rT   c                 V    SU R                    SU R                   SU R                   S3$ )Nz)WeightWithDynamicFloat8CastTensor(tensor=z, linear_mm_config=z, dtype=))rW   rX   rY   )rZ   s    r=   __repr__*WeightWithDynamicFloat8CastTensor.__repr__   sN    :4<<.H[\`\r\r[ss{  }A  }H  }H  |I  IJ  K  	KrT   c           	      P   U R                   bF  [        U R                  U R                   U R                  U R                  [
        R                  5      nO:[        U R                  U R                  U R                  S[
        R                  US9nUR                  4UR                  44$ )NT)reduce_amaxgemm_input_roledevice_mesh)
r1   r   rW   rY   rX   r   WEIGHTr	   _data_scale)rZ   meshfloat8_training_tensors      r=   fsdp_pre_all_gather5WeightWithDynamicFloat8CastTensor.fsdp_pre_all_gather   s    "".%B''&&$$&" &A&&  - 4 4 &" ',,.1G1N1N0PPPrT   )rr   all_gather_outputs.metadataparam_dtyperr   c                H   Uu  nUu  nUbp  SSK Jn  [        U[        5      (       a  Xdl        g [        XG5      (       a0  [        UR
                  [        5      (       a  XdR
                  l        g [        SU 35      e[        UUUU R                  [        R                  S9U44$ )Nr   r   z[out must be a Float8TrainingTensor or DTensor(_local_tensor=Float8TrainingTensor), but got )r   )
r   r   r   r
   r   r   RuntimeErrorrX   r   r   )rZ   r   r   r   rr   datascaler   s           r=   fsdp_post_all_gather6WeightWithDynamicFloat8CastTensor.fsdp_post_all_gather   s     %?9#344"
  C))j!!#7/ / ,1!!(
  #qruqvw  $"")00
 7 	rT   )rY   rX   r1   rW   rV   )__name__
__module____qualname____firstlineno__staticmethodr   rL   r   r)   r   rR   r[   classmethodrs   rx   r   r   r   r   r   r   __static_attributes__ rT   r=   r    r       s    59

 )
 {{	

 $ELL1
 
2 5944 )4 {{	4
 $ELL14 
 
BT 
 
KQ4 '+!%,,"34  [[	 ell# rT   r    ).r%   typingr   r   r   r   r   r   torch.nnnntorch.utils._pytreeutils_pytreerk   torch._prims_commonr   #torchao.float8.float8_scaling_utilsr	   %torchao.float8.float8_training_tensorr
   r   r   r   torchao.float8.float8_utilsr   no_gradModuler>   rg   rh   
empty_likerj   	new_zerosslicerL   copy_view
as_strided_to_copy_pin_memorysplitclonerm   r    serializationadd_safe_globalsr   rT   r=   <module>r      s    2 2   $ $ 5  , 3VRYY 3V4 3V 3Vt 
IINN%%	IINN$$	IINN	IINN  	IINN	IINN%%	IINN##	IINN&&	IINN	IINN   `K K^    $ $&G%H IrT   