
    h2                     `   S SK Jr  S SKrS SKJr  S SKJr  S SKJr  SSKJ	r	  SSK
Jr  SS	KJr  SS
KJr   " S S\5      rS\S\S\S\S\S\\   S\S\S\S\S\S\S\4S jr " S S\5      r " S S\5      r " S S \5      r " S! S"\5      r " S# S$\5      r " S% S&\5      r " S' S(\5      rg))    )OptionalN)Tensor)DTensor)	Optimizer   )_fp32_to_bf16_sr)OptimState4bit)OptimState8bit)OptimStateFp8c                      ^  \ rS rSr  SU 4S jjrS\SS4U 4S jjrU 4S jr\S\	S	\
S
\4S j5       rS\	S	\
4S jr\R                  " 5       SS j5       rSrU =r$ )	_AdamBase   returnNc                  > SU::  d  [        SR                  U5      5      eSU::  d  [        SR                  U5      5      eSUS   s=::  a  S:  d  O  [        SR                  US   5      5      eSUS   s=::  a  S:  d  O  [        SR                  US   5      5      e[        UUUUUS	9n
[        TU ]  X5        Xpl        Xl        Xl        g )
N        zInvalid learning rate: {}zInvalid epsilon value: {}r   g      ?z%Invalid beta parameter at index 0: {}r   z%Invalid beta parameter at index 1: {})lrbetasepsweight_decayamsgrad)
ValueErrorformatdictsuper__init__
block_sizebf16_stochastic_roundis_adamw)selfparamsr   r   r   r   r   r   r   r   defaults	__class__s              L/home/james-whalen/.local/lib/python3.13/site-packages/torchao/optim/adam.pyr   _AdamBase.__init__   s     by8??CDDcz8??DEEeAh$$DKKERSHUVVeAh$$DKKERSHUVV%
 	*$%:"     param_groupc                    > [         TU ]  U5        U R                  S   n[        US   [        5      (       d*  [
        R                  " US   [
        R                  S9US'   g g )Nr   )dtype)r   add_param_groupparam_groups
isinstancer   torchtensorfloat32)r   r&   groupr"   s      r#   r*   _AdamBase.add_param_group5   sT    , !!"%%+v..,,uT{%--HE$K /r%   c                 n   > [         TU ]  U5        U R                   H  nUR                  SS5        M     g )Nr   F)r   __setstate__r+   
setdefault)r   stater0   r"   s      r#   r3   _AdamBase.__setstate__=   s0    U#&&EY. 'r%   psignedr   c                     [         eN)NotImplementedErrorr7   r8   r   s      r#   _subclass_zeros_AdamBase._subclass_zerosC   s    !!r%   c           	         [        U[        5      (       a  UR                  5       OUnUR                  5       S:  a>  UR                  5       U R                  -  S:X  a  U R                  X2U R                  5      nO[        R                  " U5      n[        U[        5      (       aE  [        R                  " UUR                  UR                  SUR                  UR                  5       S9nUR                  UR                  5      nU$ )Ni   r   F)local_tensordevice_mesh
placements	run_checkshapestride)r,   r   to_localnumelr   r=   r-   
zeros_like
from_localrA   rB   rD   rE   todevice)r   r7   r8   local_pouts        r#   _new_buffer_AdamBase._new_bufferG   s    ",Q"8"8!**,a ==?d"w}}'HA'M&&wHC""7+C a!!$$ MM<<ggxxzC ffQXX
r%   c                 V   S nUb%  [         R                  " 5          U" 5       nS S S 5        [         R                  R                  R	                  5          U R
                   GH  nUS    GH  nUR                  c  M  UR                  nUR                  (       a  [        S5      eU R                  U   n[        U5      S:X  ab  [         R                  " S5      US'   U R                  US5      US'   U R                  US5      US	'   US
   (       a  U R                  US5      US'   US==   S-  ss'   [        US   [        5      (       d  [        S5      e[         R                  " [         SSS9" UR#                  5       UUS   US   US	   UR%                  SS 5      US   US   S   US   S   US   US   U R&                  U R(                  =(       a    UR*                  [         R,                  L 5        GM     GM     S S S 5        U$ ! , (       d  f       GN= f! , (       d  f       U$ = f)Nr    z Sparse gradient is not supportedr   r   stepTexp_avgF
exp_avg_sqr   max_exp_avg_sqr   r   zulr was changed to a non-Tensor object. If you want to update lr, please use optim.param_groups[0]['lr'].fill_(new_lr))	fullgraphdynamicr   r   r   )r-   enable_grad_dynamoutilsdisable_cache_limitr+   grad	is_sparseRuntimeErrorr5   lenr.   rN   r,   r   compilesingle_param_adamdetachgetr   r   r)   bfloat16)r   closurelossr0   r7   r[   r5   s          r#   rQ   _AdamBase.stepe   s   ""$y %
 ]]  446**xAvv~ 66D~~*+MNN JJqME 5zQ(-S(9f+/+;+;At+Di(.2.>.>q%.Hl+ +6:6F6Fq%6PE"23&MQ&M%eDk6::*H  MM"3tUS
fi(l+		"2D9dgq)gq)n-e22Pqww%..7P= ) + 7` k %$
 76` s   HF*H
H
H()r   r   r   r   Nr:   )__name__
__module____qualname____firstlineno__r   r   r*   r3   staticmethodr   boolintr=   rN   r-   no_gradrQ   __static_attributes____classcell__r"   s   @r#   r   r      s    ! 
!BI4 ID I/ "6 "4 "S " "V T < ]]_8 8r%   r   r7   r[   rQ   rR   rS   rT   r   beta1beta2r   r   IS_ADAMWBF16_STOCHASTIC_ROUNDc                    U R                  5       nUR                  5       nU(       a  XU	-  U-  -
  nOXU-  -   nSXr-  -
  nSX-  -
  nUR                  5       R                  USU-
  5      nUR                  5       R                  UR                  5       SU-
  5      nUR                  U5        UR                  U5        Ub[  [        R
                  " UR                  5       U5      nUR                  U5        UR                  5       UR                  5       -  U
-   nO$UR                  5       UR                  5       -  U
-   nXUU-  -  U-  -
  nU(       a  U R                  [        U5      5        g U R                  U5        g )Nr   )floatlerpsquarecopy_r-   maximumsqrtr   )r7   r[   rQ   rR   rS   rT   r   rs   rt   r   r   ru   rv   p_f32grad_f32bias_correction1bias_correction2exp_avg_f32exp_avg_sq_f32max_exp_avg_sq_f32denoms                        r#   r`   r`      s]     GGIEzz|H\)E11U225;5; --/&&xU;K%%',,X__->E	JNMM+^$!"]]>+?+?+A>R/0#((*-=-B-B-DDK$$&)9)>)>)@@CG+(889EAAE	 '(	r%   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )Adam8bit   F   r   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NFr   r   r   ztorchao.optim.Adam8bitr   r   r-   _C_log_api_usage_once
r   r    r   r   r   r   r   r   r   r"   s
            r#   r   Adam8bit.__init__   G     	!"7 	 
	
 	$$%=>r%   r7   r8   r   c                 Z    [         R                  " U R                  XU R                  5      $ r:   r
   zerosrD   rK   r<   s      r#   r=   Adam8bit._subclass_zeros       ##AGGVJJr%    MbP?g?g+?:0yE>r   Frg   rh   ri   rj   rk   r   rl   r   rm   rn   r=   rp   rq   rr   s   @r#   r   r      c     ? #? 
? ?2 K6 K4 KS K Kr%   r   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )Adam4bit   F   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NFr   ztorchao.optim.Adam4bitr   r   s
            r#   r   Adam4bit.__init__   r   r%   r7   r8   r   c                 Z    [         R                  " U R                  XU R                  5      $ r:   r	   r   rD   rK   r<   s      r#   r=   Adam4bit._subclass_zeros  r   r%   r   r   rg   r   rr   s   @r#   r   r      r   r%   r   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )AdamFp8i  Fr   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NFr   ztorchao.optim.AdamFp8r   r   s
            r#   r   AdamFp8.__init__  sG     	!"7 	 
	
 	$$%<=r%   r7   r8   r   c                 X    [         R                  " U R                  X R                  5      $ r:   r   r   rD   rK   r<   s      r#   r=   AdamFp8._subclass_zeros,      ""177JAAr%   r   r   rg   r   rr   s   @r#   r   r     sc     > #> 
> >2 B6 B4 BS B Br%   r   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )	AdamW8biti1  Fr   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NTr   ztorchao.optim.AdamW8bitr   r   s
            r#   r   AdamW8bit.__init__2  G     	!"7 	 
	
 	$$%>?r%   r7   r8   r   c                 Z    [         R                  " U R                  XU R                  5      $ r:   r   r<   s      r#   r=   AdamW8bit._subclass_zerosK  r   r%   r   r   r   r   g{Gz?Frg   r   rr   s   @r#   r   r   1  g     @ #@ 
@ @2 K6 K4 KS K Kr%   r   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )	AdamW4bitiP  Fr   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NTr   ztorchao.optim.AdamW4bitr   r   s
            r#   r   AdamW4bit.__init__Q  r   r%   r7   r8   r   c                 Z    [         R                  " U R                  XU R                  5      $ r:   r   r<   s      r#   r=   AdamW4bit._subclass_zerosj  r   r%   r   r   rg   r   rr   s   @r#   r   r   P  r   r%   r   c                   f   ^  \ rS rSr     SSSS. SU 4S jjjjr\S\S\S\4S	 j5       r	S
r
U =r$ )AdamWFp8io  Fr   r   c                n   > [         T	U ]  UUUUUUUUSS9	  [        R                  R	                  S5        g )NTr   ztorchao.optim.AdamWFp8r   r   s
            r#   r   AdamWFp8.__init__p  sG     	!"7 	 
	
 	$$%=>r%   r7   r8   r   c                 X    [         R                  " U R                  X R                  5      $ r:   r   r<   s      r#   r=   AdamWFp8._subclass_zeros  r   r%   r   r   rg   r   rr   s   @r#   r   r   o  sc     ? #? 
? ?2 B6 B4 BS B Br%   r   c                   D   ^  \ rS rSr     SSS. SU 4S jjjjrSrU =r$ )_AdamWi  F)r   c                B   > [         TU ]  UUUUUU[        S5      USS9	  g)zAdamW optimizer that supports quantized training (parameter is quantized). This optimizer should
only be used with torchao's quantized training.infTr   N)r   r   rx   )	r   r    r   r   r   r   r   r   r"   s	           r#   r   _AdamW.__init__  s6     	U|"7 	 
	
r%   r   r   rg   )rh   ri   rj   rk   r   rp   rq   rr   s   @r#   r   r     s2     
 $
 

 
r%   r   )typingr   r-   r   torch.distributed._tensorr   torch.optimr   quant_utilsr   subclass_4bitr	   subclass_8bitr
   subclass_fp8r   r   rx   rm   r`   r   r   r   r   r   r   r   r   r%   r#   <module>r      s      - ! ) ) ) 'K	 K`..
. . 	.
 . V$. 	. . . . 
. .  .bKy K>Ky K>Bi B>K	 K>K	 K>By B>
Y 
r%   