import logging
from collections import abc, defaultdict
from collections.abc import Iterable
from typing import Any, overload

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup


logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> dict[str, Any]:
    # Fresh bookkeeping entry for an optimizer: not yet unscaled/stepped, no inf recorded.
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        "mtia",
        "xpu",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        if not _is_supported_device(master_tensor):
            raise AssertionError(
                f"Expected supported device, got {master_tensor.device}"
            )
        self.master = master_tensor
        self._per_device_tensors: dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
ShardedGradScaler helps perform gradient scaling in a shard aware manner. It extends
functionality from GradScaler:
* Supports Pytorch DDP and FSDP implementations
* Support CPU offloaded tensors (as used in fully sharded data parallel[FSDP])
* Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
* Sync inf/nan for scaled gradient tensors on any torch.device (where tensors are placed) across
nodes

Example::

    # Creates a ShardedGradScaler once at the beginning of training.
    scaler = ShardedGradScaler()

    for epoch in epochs:
        for input, target in data:
            optimizer.zero_grad()
            output = model(input)
            loss = loss_fn(output, target)

            # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(loss).backward()

            # scaler.step() first unscales gradients of the optimizer's params.
            # If gradients don't contain infs/NaNs, optimizer.step() is then called,
            # otherwise, optimizer.step() is skipped.
            scaler.step(optimizer)

            # Updates the scale for next iteration.
            scaler.update()

See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.
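
A minimal construction sketch alongside an FSDP-wrapped model (illustrative only;
``MyModel`` and the optimizer choice are placeholders, not part of this module)::

    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

    model = FSDP(MyModel().cuda())
    optimizer = torch.optim.AdamW(model.parameters())
    scaler = ShardedGradScaler()  # then use scale/step/update as in the example above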

Args:
    init_scale (float, optional, default=2.**16):  Initial scale factor.
    growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
        :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
    backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
        :meth:`update` if inf/NaN gradients occur in an iteration.
    growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
        that must occur for the scale to be multiplied by ``growth_factor``.
    enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
        invokes the underlying ``optimizer.step()``, and other methods become no-ops.
        Default: ``True``
    process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
        process group for sharding
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: ProcessGroup | None = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
        ...

    @overload
    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]:
        ...

    @overload
    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]:
        ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
        ...

    def scale(
        self, outputs: torch.Tensor | Iterable[torch.Tensor]
    ) -> torch.Tensor | Iterable[torch.Tensor]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            if not _is_supported_device(outputs):
                raise AssertionError(f"Expected supported device, got {outputs.device}")
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            if self._scale is None:
                raise AssertionError("Expected _scale to be initialized, got None")
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Return the scaled loss in the dtype of the original output so the FSDP
            # mixed-precision loss dtypes (fp16, bf16) are preserved.
            return scaled_output.type(outputs.dtype)

        stash: list[_GeneralMultiDeviceReplicator] = []

        def apply_scale(
            val: torch.Tensor | Iterable[torch.Tensor],
        ) -> torch.Tensor | Iterable[torch.Tensor]:
            if isinstance(val, torch.Tensor):
                if not _is_supported_device(val):
                    raise AssertionError(
                        f"Expected supported device, got {val.device}"
                    )
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    if self._scale is None:
                        raise AssertionError(
                            "Expected _scale to be initialized, got None"
                        )
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                # Keep the original dtype, as in the single-tensor case above.
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # Bucket gradients by device and dtype so that the fused
        # torch._amp_foreach_non_finite_check_and_unscale_ can run once per bucket.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if not allow_fp16 and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # Coalescing scaled fp16 values can overflow, so coalesce in
                        # fp32 and check the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

        # Some ranks may hold no (non-zero sized) parameter shards; still materialize a
        # found_inf tensor on the scale's device for those ranks.
        if not per_device_found_inf._per_device_tensors:
            if self._scale is None:
                raise AssertionError("Expected _scale to be initialized, got None")
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so carry out the
        # reciprocal in FP64.
        if self._scale is None:
            raise AssertionError("Expected _scale to be initialized, got None")
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                # CPU-offloaded grads: move the flag to the scaler's device so it can
                # participate in the all-reduce, then copy the result back below.
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )
        for work in works:
            work.wait()
        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        if self._scale is None or self._growth_tracker is None:
            raise AssertionError(
                "Expected _scale and _growth_tracker to be initialized, got None"
            )

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: float | torch.Tensor | None = None) -> None:
        """
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
    new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
.. warning::
    :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = (
                    "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
                    "torch.FloatTensor with requires_grad=False."
                )
                if new_scale.device.type != self._device:
                    raise AssertionError(reason)
                if new_scale.numel() != 1:
                    raise AssertionError(reason)
                if new_scale.requires_grad:
                    raise AssertionError(reason)
                self._scale.copy_(new_scale)
        else:
            # Consume shared inf/nan data collected from optimizers to update the scale.
            # If all found_inf tensors are on the same device as self._scale, this
            # operation is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            if len(found_infs) == 0:
                raise AssertionError("No inf checks were recorded prior to update.")

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            if _scale.device.type == "cpu":
                self._amp_update_scale_cpu_(found_inf_combined)
            else:
                torch._amp_update_scale_(
                    self._scale,
                    self._growth_tracker,
                    found_inf_combined,
                    self._growth_factor,
                    self._backoff_factor,
                    self._growth_interval,
                )

        # To prepare for the next iteration, clear the data collected from optimizers
        # this iteration.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)