
    h;                     J    S SK Jr  S SKrS SKJrJr  S SKJr   " S S\5      rg)    )TypeN)	OptimizerParamsT)get_available_devicesc                       \ rS rSr\R
                  R                  4SSS.S\S\\	   S\
S\S	S
4
S jjjr\R                  " 5       SS j5       rSS jr\S 5       rS rS rSrg
)CPUOffloadOptimizer   Fi   )offload_gradientsminimal_sizeparamsoptimizer_classr
   r   returnNc                  ^ ^ U[         R                  R                  L a  SU;  a  UR                  SS9  [	        U5      n[        U5      S:X  a  [        S5      e[        US   [        5      (       d  SU0/nUT l	        ST l
        / T l        [        5       T l        [        5       T l        [        5       S   T l        T R                  S	;   d   S
5       e[!        [         T R                  5      R#                  5       T l        [        5       T l        UU 4S jnU GH  nUR)                  S5      n/ n	U H  n
U
R*                  (       d  M  U
R-                  5       T R                  :  a  U	R/                  U
5        MG  [         R0                  " U
SSS9n[         R0                  " USS9Ul        UR5                  U
R7                  5       SS9  UT R                  U
'   U
R9                  U5        U" SU0UE/40 UD6T R                  U
'   M     [        U	5      S:  d  M  T R                  R/                  SU	0UE5        GM      [        T R                  5      S:  a  U" T R                  40 UD6T l
        gg)am  Offload optimizer to CPU for single-GPU training. This will reduce GPU memory by the size of optimizer state.
Optimizer step will be done on CPU.

Args
    params: a list of parameters or parameter groups.
    optimizer_class: constructor of the base optimizer. Defaults to :class:`torch.optim.AdamW`.
    offload_gradients: free GPU gradients once they are moved to CPU. Not compatible with gradient accumulation.
    minimal_size: tensors smaller than this are kept on the GPU, to avoid excessively many small transfers.
    kwargs: other keyword arguments to be passed to the base optimizer e.g. `lr`, `weight_decay`.
fusedT)r   r   z%optimizer got an empty parameter listr   N)cudaxpuz.CPU Offload currently only supports CUDA & XPUc                   > U R                   Gb)  TR                  U    nTR                  R                  [	        [
        TR                  5      R                  5       5        [	        [
        TR                  5      R                  TR                  5         UR                   R                  U R                   SS9  S S S 5        U TR                  ;   a  TR                  U 	 TR                  R                  5       TR                  U '   T(       a-  U R                   R                  TR                  5        S U l         g g g ! , (       d  f       N= fNTnon_blocking)gradparam_d2h_mapstreamwait_streamgetattrtorchdevicecurrent_streamcopy_queuerecord_eventrecord_stream)p_devicep_hostr
   selfs     S/home/james-whalen/.local/lib/python3.13/site-packages/torchao/optim/cpu_offload.pybackward_hook3CPUOffloadOptimizer.__init__.<locals>.backward_hookA   s    }}(++H5 ''t{{(C(R(R(TUUDKK077DKK%%hmm$%G E tzz)

8,'+{{'?'?'A

8$ %MM//<$(HM %! )
 EDs   %D::
Ecpu)r   
pin_memory)r+   r   )r   optimAdamWupdatelistlen
ValueError
isinstancedictr   d_optd_param_groupsr   
optim_dictr   r   r   Streamr   r!   poprequires_gradnumelappend
empty_liker   r    detach"register_post_accumulate_grad_hook)r&   r   r   r
   r   kwargsparam_groupsr(   param_groupretained_paramsr$   r%   s   `  `        r'   __init__CPUOffloadOptimizer.__init__   s<   ( ekk///G64IMMM%F||!DEE,q/400%|45L )
 !V&+-b1{{ 
 
 	< <	< 
 eT[[188: V
	)* (K __X.F O"-->>#d&7&77#**84 ))(5TR#..v$GX__.TB/5""8,;;MJ,;6+67-;A-)! #( ?#a'##**Ho+U+UV3 (6 t""#a'()<)<GGDJ (    c                 N   S nUb  U" 5       nU R                   b  U R                   R                  5         U R                  R                  5        H  u  p4UR	                  5         U R
                  U   R                  5         U R                  U   n[        [        U R                  5      R                  U R                  5         UR                  USS9  S S S 5        M     U R                  R	                  5         U R                  R                  5         U$ ! , (       d  f       M  = fr   )r4   stepr!   itemssynchronizer6   r   r   r   r   r   r    clear)r&   closurelossr$   grad_d2h_eventr%   s         r'   rG   CPUOffloadOptimizer.stept   s    9D ::!JJOO(,

(8(8(:$H&&(OOH%**,
 ''1F,33DKK@vD9 A@ ); 	!

 A@s   D
D$	c                     U(       d   eU R                   R                  5        H
  nS Ul        M     U R                  b  U R                  R	                  US9  g g )N)set_to_none)r   keysr   r4   	zero_grad)r&   rP   r$   s      r'   rR   CPUOffloadOptimizer.zero_grad   sR    { **//1H HM 2 ::!JJ  [ 9 "rE   c                 h    [        S U R                  R                  5        5       U R                  S9$ )Nc              3   8   #    U  H  oR                   v   M     g 7fN)r@   ).0r,   s     r'   	<genexpr>3CPUOffloadOptimizer.param_groups.<locals>.<genexpr>   s     F-EE-Es   )start)sumr6   valuesr5   )r&   s    r'   r@    CPUOffloadOptimizer.param_groups   s/     FT__-C-C-EF%%
 	
rE   c                     SU R                   R                  5        Vs/ s H  oR                  5       PM     sn0nU R                  (       a  U R                  R                  5       US'   U$ s  snf )N	offloaded	on-device)r6   r\   
state_dictr4   )r&   r,   ra   s      r'   ra   CPUOffloadOptimizer.state_dict   s_    $//:P:P:RS:R**,:RS

 ::&*jj&;&;&=J{#	 Ts   A+c                    [        U R                  R                  5       US   5       H  u  p#UR                  U5        M     U R                  (       a  U R                  R                  US   5        g SU;   a  [        S5      eg )Nr_   r`   zPloaded state dict has a 'on-device' parameter group not present in the optimizer)zipr6   r\   load_state_dictr4   r1   )r&   ra   r,   optim_state_dicts       r'   re   #CPUOffloadOptimizer.load_state_dict   sy    '*OO""$j&=(
#E !!"23(

 ::JJ&&z+'>?J&b  'rE   )r4   r5   r   r   r6   r   r!   r   rV   )T)__name__
__module____qualname____firstlineno__r   r,   r-   r   r   r   boolintrC   no_gradrG   rR   propertyr@   ra   re   __static_attributes__ rE   r'   r   r      s     ,1;;+<+<_H
 #( _H_H i_H
  _H _H 
_HB ]]_ 2: 
 
rE   r   )	typingr   r   torch.optim.optimizerr   r   torchao.utilsr   r   rq   rE   r'   <module>ru      s!      4 /b) brE   