
    oi              
          S SK r S SKrS SKJrJrJrJrJrJ	r	  S SK
r
S SKrS SKrSSKJrJr  SSKJrJrJrJrJrJr  / SQrSrSrS	r\ R6                  r\" \5      \" S
5      :  aA  \ R:                  R<                  R>                  r \ R:                  R<                  RB                  r"O2\ R<                  R?                  SS9r \ R<                  RC                  SS9r"  S2S\#S\\\$\#4      S\\#   4S jjr%  S2S\#S\\\$\#4      S\\#   4S jjr&   S3S\S\\\$\#4      S\\'   SS4S jjr(  " S S\ RR                  RT                  5      r+  " S S\ RR                  RT                  5      r, \ RZ                  SS.S j5       r. S r/ S r0 S r1 S r2 S S K3J4r4J5r5J6r6J7r7J8r8J9r9J:r:J;r;  SS.S4S! jjr< \S";   a  \ R:                  Rz                  r>O\S#:X  a  \ R~                  Rz                  r>/ q@SqAS5S$ jrB  " S% S&\ RR                  RT                  5      rC S S'K3JDrDJErEJFrFJGrG  \ RZ                  S\G\ES(S).S\\'   S*\/ \	\D\D4   4   S+\$S,\'4S- jj5       rH S5S. jrI S/ rJ S0 rK \ RZ                  SS.S1 j5       rLg)6    N)UnionOptionalListAnyCallableTuple   )
_get_dtypeVersion)is_hipget_device_typeDEVICE_TYPEDEVICE_TYPE_TORCHDEVICE_COUNTALLOW_PREQUANTIZED_MODELS) calculate_n_gradient_checkpointsprepare_n_gradient_checkpoints'Unsloth_Offloaded_Gradient_Checkpointer%unsloth_offloaded_gradient_checkpoint$patch_unsloth_gradient_checkpointing&unpatch_unsloth_gradient_checkpointingUnsloth_Gradient_Checkpointerunsloth_gradient_checkpointpatch_gradient_checkpointingunpatch_gradient_checkpointing*patch_unsloth_smart_gradient_checkpointing,unpatch_unsloth_smart_gradient_checkpointing,reset_unsloth_gradient_checkpointing_buffers         z2.4.0cudadevice_typen_layersmethodreturnc                    [        U 5      [        L a  U S:  d   eUc  SnUS:X  a  [        U S-  5      nOE[        U5      [        L a(  US:  a"  [        [        R                  " X-  5      5      nO[	        S5      eX-  n[        R
                  " X#[        S9nX-  n[        U5       H  nXBS-
  U-
  ==   S-  ss'   M     [        R                  " S[        R                  " U5      45      nUR                  5       nU$ )Nr   sqrtg      ?z3method must be 'sqrt' or an int >0 and <= n_layers.dtyper	   )
typeintnpceil
ValueErrorfullrangehstackcumsumtolist)r%   r&   n_checkpointssizesizes	leftoversk
boundariess           \/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/gradient_checkpointing.py!_calculate_n_gradient_checkpointsr=   B   s     >S X\21~vHcM*	f	!BGGH$567NOO$DGGM5E(I9Aoa A%  Aryy/01J""$J    layers_per_checkpointc                     [        U 5      [        L a  U S:  d   eUb  US:X  a  g [        X5      nUS   S:X  a	  US   U :X  d   e[        U5      S:X  a  [	        U5      U :X  d   e[
        R                  " U5      R                  5       S:  d   eU$ )Nr   r	   )r,   r-   r=   minmaxr.   diff)r%   r?   r;   s      r<   r   r   ]   s     >S X\21$(=(B28SJa=A*R.H"<=<z?aC
Ox$?@?77:""$)*)r>   modeluse_reentrantc                 &   Sn[        U S5      (       a  U nO8[        U S5      (       a'  [        U R                  S5      (       a  U R                  nUc  [        S5      e USL a  Sn [        UR                  5      n[        XA5      nXSl        X#l        g)a  
Calculates where to place the gradient checkpoints given n_layers.

Args:
    model: Any LlamaModel with layers.
    layers_per_checkpoint (`Union[str, int]`, *optional*):
        Can either be `sqrt` or an integer for how many layers per checkpoint you want.
        The more, the less memory usage, but can be slower. Default is `sqrt`.
        Choose 1 for Pytorch gradient checkpointing. 2 to wrap 2 layers in 1 module etc.
    use_reentrant (`bool`, *optional*):
        https://github.com/pytorch/pytorch/blob/main/torch/utils/checkpoint.py#L354
        Optimal gradient checkpointing algorithm `use_reentrant=False` which will
        be the default in future Pytorch versions doesn't seem to work??
NlayersrE   zX`model` or `model.model` does not have attribute `layers`. Are you sure this is a model?FT)hasattrrE   	TypeErrorlenrH   r   "_gradient_checkpointing_boundaries%_gradient_checkpointing_use_reentrant)rE   r?   rF   _modelr%   r;   s         r<   r   r   o   s    & Fuh		 	 5;;))[[F~rss6==!H1(RJ3=-3@0r>   c                   N    \ rS rSrSr\\S 5       5       r \\S 5       5       r	Sr
g)r      z
All Unsloth Zoo code licensed under LGPLv3
Saves VRAM by smartly offloading to RAM.
Tiny hit to performance, since we mask the movement via non blocking calls.
c                     UR                   U l         UR                  SSS9n[        R                  " 5          U" U/UQ76 nS S S 5        U R	                  U5        Xl        X0l        W$ ! , (       d  f       N-= f)NcpuTnon_blocking)devicetotorchno_gradsave_for_backwardforward_functionargs)ctxrZ   hidden_statesr[   saved_hidden_statesoutputs         r<   forward/Unsloth_Offloaded_Gradient_Checkpointer.forward   sk     #))
+..uT.J]]_%m;d;F 12/ _s   
A((
A6c                    U R                   u  nUR                  U R                  SS9R                  5       nUR	                  S5        [
        R                  " 5          U R                  " U/U R                  Q76 u  nS S S 5        [
        R                  R                  WU5        S UR                  4S[        U R                  5      -  -   $ ! , (       d  f       NU= f)NTrS   N)saved_tensorsrV   rU   detachrequires_grad_rW   enable_gradrZ   r[   autogradbackwardgradrK   r\   dYr]   r_   s       r<   ri   0Unsloth_Offloaded_Gradient_Checkpointer.backward   s     ,,%((D(IPPR$$T* ,,]FSXXFIV !+m((*WS]-BBB ! s    C
C N__name__
__module____qualname____firstlineno____doc__staticmethodtorch_amp_custom_fwdr`   torch_amp_custom_bwdri   __static_attributes__rn   r>   r<   r   r      sI    
    	C  C 	r>   r   c                   N    \ rS rSrSr\\S 5       5       r \\S 5       5       r	Sr
g)r      z^
All Unsloth Zoo code licensed under LGPLv3
Same as normal gradient checkpointing but cleaner
c                     [         R                  " 5          U" U/UQ76 nS S S 5        U R                  U5        Xl        X0l        W$ ! , (       d  f       N-= frc   )rW   rX   rY   rZ   r[   )r\   rZ   r]   r[   r_   s        r<   r`   %Unsloth_Gradient_Checkpointer.forward   sG     ]]_%m;d;F m,/ _s   
A
Ac                    U R                   u  nUR                  5       nUR                  S5        [        R                  " 5          U R
                  " U/U R                  Q76 u  nS S S 5        [        R                  R                  WU5        S UR                  4S[        U R                  5      -  -   $ ! , (       d  f       NU= f)NTrc   )rd   re   rf   rW   rg   rZ   r[   rh   ri   rj   rK   rk   s       r<   ri   &Unsloth_Gradient_Checkpointer.backward   s     ,,%,,.$$T* ,,]FSXXFIV !+m((*WS]-BBB ! s    B44
Crn   Nro   rn   r>   r<   r   r      sI        	C  C 	r>   r   )rF   c                0    [         R                  " U /UQ76 $ rc   )r   applyfunctionrF   r[   kwargss       r<   r   r      s    (..x?$??r>   c                     [        S5        SS Kn U R                  R                  R                  R                  S:X  a  g U R                  R                  R                  U R                  R                  l        [        U R                  R                  l        SS Kn[        UR                  l        S[        R                  S'   g )NzDUnsloth: Patched gradient checkpointing for long context finetuning.r   r   1UNSLOTH_PATCHED)printtorch.utilsutils
checkpointrp   _old_checkpointr   transformers.modeling_utilsmodeling_utilsosenvironrW   transformerss     r<   r   r      s    	
PQ{{((115\\^d-2[[-C-C-N-NEKK*(MEKK%&-RL*$'BJJ !r>   c                     [        S5        SS Kn U R                  R                  R                  R                  S:X  a  g U R                  R                  R                  U R                  R                  l        [        U R                  R                  l        SS Kn[        UR                  l        S[        R                  S'   g )Nz(Unsloth: Patched gradient checkpointing.r   r   r   r   )r   r   r   r   rp   r   r   r   r   r   r   r   s     r<   r   r      s    	
45{{((115RRTZ-2[[-C-C-N-NEKK*(CEKK%&-HL*$'BJJ !r>   c                      SS K n [        U R                  R                  S5      (       aO  U R                  R                  R                  U R                  R                  l        U R                  R                  ?g Nr   r   r   rI   r   r   r   rW   s    r<   r   r      R    u{{%%'899,1KK,B,B,R,R)KK""2r>   c                      SS K n [        U R                  R                  S5      (       aO  U R                  R                  R                  U R                  R                  l        U R                  R                  ?g r   r   r   s    r<   r   r     r   r>   )check_backward_validity_infer_device_type_get_autocast_kwargs_get_device_moduleget_device_statesdetach_variable
contextlibDefaultDeviceTypec                    Uc  [         R                  " 5       nUS:X  a  g[        U5      n[        X5       H0  u  pEUR	                  U5         UR                  U5        SSS5        M2     g! , (       d  f       MD  = f)a  Sets random number generator states for the specified devices.

Args:
    devices: Device ids to set states for.
    states: States to set.
    device_type: ``device_type`` of the devices to set states for. Default
        is the device returned by a call to ``DefaultDeviceType.get_device_type()``,
        which is ``cuda`` if not changed by calling ``DefaultDeviceType::set_device_type()``.
Nmeta)r   r   r   ziprU   set_rng_state)devicesstatesr$   device_modulerU   states         r<   set_device_statesr     si     '779f&{3MW-!!&)''. *) .))s   A++
A:	r"   hipxpuc                    / q SqU cp  [        S:X  a&  [        R                  R                  5       u  pUS:  nO[        S:X  a  SnO[        S:X  a  SnW(       a  [        R                  O[        R                  n  [        S5       H/  n[        R                  " SU S	SS
9n[         R                  U5        M1      [        S;   a  [        R                  R                  5       O[        R                  R                  5       n [        [        U5       Vs/ s H"  n[        R                  " SU [         SU 3S9PM$     sn5      qSq[        [        U5       Vs/ s HI  n[        S:X  a  [        R                  R'                  5       O[        R                  R'                  5       PMK     sn5      q[        S;   aZ  [        [        U5       Vs/ s H9  n[        R                  R+                  [        R,                  " SU 35      5      PM;     sn5      qOc[        S:X  aY  [        [        U5       Vs/ s H9  n[        R                  R1                  [        R,                  " SU 35      5      PM;     sn5      q[        R2                  " U 5      R4                  S-  nSU-  qSqSqSqSqg s  snf ! [          a=  n[#        S5        [#        S5        [#        S5        [#        S5        [#        S5        e S nAff = fs  snf s  snf s  snf )Nr   r"      r   Tr   r!   r   rR   r+   rU   
pin_memoryr   r    :)r+   rU   z==========
z?Unsloth: Your setup does not support `PYTORCH_CUDA_ALLOC_CONF`
zDPlease set `import os; os.environ['PYTORCH_CUDA_ALLOC_CONF'] = '';`
z#Then re-run Unsloth from the start.zcuda:zxpu:i    ) CPU_BUFFERS	CPU_INDEXr   rW   r"   get_device_capabilitybfloat16float16r2   emptyappenddevice_countr   tupler   GPU_BUFFERS	Exceptionr   BACKWARD_PASSStreamEXTRA_STREAMSdefault_streamrU   MAIN_STREAMScurrent_streamfinfobitsMINIMUM_SIZEUSE_UNSLOTH_GCLAST_GC_INDEX
FIRST_PASSCURRENT_GC_INDEX)	r+   major_versionminor_versionSUPPORTS_BFLOAT16ixn_gpusen_bytess	            r<   )initialize_unsloth_gradient_checkpointingr   @  s    KI}& +0::+K+K+M(M!.!!3E! $E! $"33ZKK%%dS1  	 +6*HUZZ$$&eiiNdNdNfFrwx~r  ArmnU[[UPaObbcdecfMghr  A  B Motu{o|}o|jk2Cv2M5::,,.SXS\S\ScScSeeo|}~Mo%]bci]jk]jXYuzz88aSk9RS]jkl		[`ag[hi[hVWuyy77tA3Z8PQ[hij kk% %%*G"g-LN MJ3 A mPQUV34m ~kis?   ?J) )J$:J) AK3A K8,A K=$J) )
K038K++K0c                   6    \ rS rSr\S 5       r \S 5       rSrg)UnslothCheckpointFunctioni  c                    Xl         X l        [        U6 U l        [	        U R                  5      u  U l        U l        U(       ad  [        R                  " 5       U l	        SU l
        [        U R                  5      n[        USS5      (       a  SU l
        [        U6 u  U l        U l        / U l        / U l        / nSU l        Sn['        U5       GH  u  px[        R(                  " U5      (       GaT  US:X  Ga  UR*                  (       Ga  [,        (       a	  [.        S-  q [0        S-  qSU l        UR3                  5       n	U	[4        :  Ga  [0        [.        :w  d  [,        (       Ga  SnUR6                  n
U
R8                  n[:        U   n[<        U   n[>        U   n[@        (       a  Sq Sq! [B        [E        [F        5      :  a5  [        RH                  " XRJ                  SSS9n[F        RM                  U5         [F        [B           nURN                  nXR3                  5       :  a  URQ                  U	5        XR3                  5       :  a  URQ                  U	5        US U	 RS                  U5      nURU                  U5        [W        U5         URY                  USS9  S S S 5        U	U[B        XU4U l-        [B        S-  q!URM                  S 5        [\        (       a  [_        S	5        Sq.OS
U l-        URM                  U5        OURM                  U5         U R"                  RM                  U5        U R                   RM                  S 5        OU R                   RM                  U5        GM      U R$                  (       a  U R`                  " U6   [        Rb                  " 5          U" U6 nS S S 5        U(       a  WRU                  W5        W$ ! , (       d  f       GN2= f! , (       d  f       N:= f)NF_initializedTr   r	   rR   r   rS   z5Unsloth: Will smartly offload gradients to save VRAM!)NNNNNN)2run_functionpreserve_rng_stater   r$   r   device_autocast_kwargscpu_autocast_kwargsrW   get_rng_statefwd_cpu_statehad_device_in_fwdr   getattrr   fwd_devicesfwd_device_statesinputstensor_indices_requires_gradient	enumerate	is_tensorrequires_gradr   r   r   numelr   rU   indexr   r   r   r   r   rK   r   r   r+   r   shaperesize_viewwait_streamtorch_gpu_streamcopy__saved_metadatar   r   rY   rX   )r\   r   r   r[   r   tensor_inputsuse_gpu_bufferr   argnew_sizerU   device_index
GPU_BUFFERMAIN_STREAMEXTRA_STREAMr   r   outputss                     r<   r`   !UnslothCheckpointFunction.forward  sB   
 (!3,d3>ROO?
;"C$;  % 3 3 5C
 %*C!.s?M}ne<<(,%9JD9Q6!6 
!&oFAs##6c/// "z &*$)$-1C*"yy{H  ,.5E5V[e[e)- "%'-||'2\'B
'3\'B'4\'B )=,1M()I %K(88 %HiiRWfj kA'..q1'	2 #		#ggi/81D#&6&6&88*:L:LX:VixL--e4 %00=-l;GGCG= < 08	<fr.t+!Q	%,,T2 *>!"YZ-2N.S+%,,S1!((-""))!,

!!$'

!!#&W &X 	!!3#8#8-#H]]_"D)G  ;22<@= <;4 _s   N?O?
O	
Oc           	      	   U R                   (       d  g [        R                  R                  5       (       d  [	        S5      e[        U R                  5      nU R                  nU R                  nU R                  u  pVpxpUbl  [        U   S U R                  U5      n[        U   S U R                  U5      nU
R                  U	5        [        U
5         UR                  USS9  S S S 5        O[!        U5      S:w  a
  US   X#S   '    [#        USS  SS9 H  u  pXM   X.'   M      SqSqSq/ nU R*                  (       a  U R,                  (       a  U R.                  n[        R0                  R3                  XR*                  U R4                  S9   U R*                  (       aZ  [        R6                  " U R8                  5        U R,                  (       a)  [;        U R.                  U R<                  U R4                  S	9  [        R>                  RA                  U R4                  5      (       a6  [        R>                  RB                  " SS
U R4                  0U RD                  D6O[F        RH                  " 5       n/ nU Hg  n[K        U[        RL                  5      (       d  URO                  U5        M5  URQ                  5       nURR                  Ul)        URO                  U5        Mi      Ub7  U	R                  U
5        WRQ                  5       nURU                  S5        UUS'    [        RV                  " 5          U   [        R>                  RB                  " S0 U RX                  D6   U RZ                  " U6 nS S S 5        S S S 5        S S S 5         S S S 5         [K        W[        RL                  5      (       a  U4n/ n/ n[]        [!        U5      5       H`  n[        R^                  " UU   5      (       d  M#  UU   RR                  (       d  M9  URO                  UU   5        URO                  X   5        Mb      [!        U5      S:X  a  O [        R                  Ra                  UU5         [c        S W 5       5      n[]        [!        U5      5       H  nS UU'   S X-'   M      SU-   $ ! , (       d  f       GN= f! , (       d  f       GNG= f! , (       d  f       GNQ= f! , (       d  f       GN[= f! , (       d  f       GNd= f)NzWhen use_reentrant=True, torch.utils.checkpoint is incompatible with .grad() or passing an `inputs` parameter to .backward(). To resolve this error, you can either set use_reentrant=False, or call .backward() without passing the `inputs` argument.TrS   r   r	   )startF)r   enabledr$   r#   r$   c              3   |   #    U  H2  n[        U[        R                  5      (       a  UR                  OS v   M4     g 7frc   )
isinstancerW   Tensorrj   ).0inps     r<   	<genexpr>5UnslothCheckpointFunction.backward.<locals>.<genexpr>g  s0      
& #355CHH4?&s   :<)NNrn   )rR   )2r   rW   rh   _is_checkpoint_validRuntimeErrorlistr   r   rd   r   r   r   r   r   r   r   rK   r   r   r   r   r   r   r   randomfork_rngr$   r   r   r   r   ampis_autocast_availableautocastr   r   nullcontextr   r   r   re   r   rf   rg   r   r   r2   r   ri   r   )r\   r[   r   r   tensorsr   r   r   r   r   r   bufferr   r   idxrng_devicesdevice_autocast_ctxdetached_inputsr  r   outputs_with_gradargs_with_gradgradss                          r<   ri   "UnslothCheckpointFunction.backward  s5    %%d~~2244N  cjj!++##NQNaNaK+  .y9>>uEFI&y166u=A $$[1!,/Qt4 0/ >"a',3AJa()  qr 2A>FA!*FK ? 

 !!c&;&;//K\\"")?)?S__ # 
 %%##C$5$56((%coos7L7LZ]ZiZij 00AA #())"4"4 #OO#/2/I/I#GQG]G]G_  
 !O!#u||44#**3/JJL"%"3"3&&q)   $''5MMO  &%&"""$&9599;M;M;oWZWnWn;o**O< <p&9$C
D 	gu||,,jG s7|$Awqz**wqz/G/G/G!((4%%dg. % 	 !Q& NN##$5~F 
&
 

 s?+,A!%OAFI - 	e##} 0/r <p;o&9&9$$?
 
sa   RF%S%1S4+SR/	/S7S?S%
R,/
R>9S
SS
S"	S%%
S4rn   N)rp   rq   rr   rs   ru   r`   ri   rx   rn   r>   r<   r   r     s8    r rf 	 w$ w$p 	r>   r   )ContextManager_DEFAULT_DETERMINISM_MODE'_checkpoint_without_reentrant_generatornoop_context_fnF)rF   
context_fndeterminism_checkdebugr  r  r  c                   Uc  [         R                  " SSS9  SnUR                  SS5      nU(       a+  U(       a$  [        SSR	                  S	 U 5       5      -   5      eU(       a0  U[
        Ld  US
La  [        S5      e[        R                  " X/UQ76 $ [        XX#U/UQ70 UD6n[        U5        U " U0 UD6n	 [        U5        g! [         a    U	s $ f = f)a  Checkpoint a model or part of the model.

Activation checkpointing is a technique that trades compute for memory.
Instead of keeping tensors needed for backward alive until they are used in
gradient computation during backward, forward computation in checkpointed
regions omits saving tensors for backward and recomputes them during the
backward pass. Activation checkpointing can be applied to any part of a
model.

There are currently two checkpointing implementations available, determined
by the :attr:`use_reentrant` parameter. It is recommended that you use
``use_reentrant=False``. Please refer the note below for a discussion of
their differences.

.. warning::

    If the :attr:`function` invocation during the backward pass differs
    from the forward pass, e.g., due to a global variable, the checkpointed
    version may not be equivalent, potentially causing an
    error being raised or leading to silently incorrect gradients.

.. warning::

    The ``use_reentrant`` parameter should be passed explicitly. In version
    2.4 we will raise an exception if ``use_reentrant`` is not passed.
    If you are using the ``use_reentrant=True`` variant, please refer to the
    note below for important considerations and potential limitations.

.. note::

    The reentrant variant of checkpoint (``use_reentrant=True``) and
    the non-reentrant variant of checkpoint (``use_reentrant=False``)
    differ in the following ways:

    * Non-reentrant checkpoint stops recomputation as soon as all needed
      intermediate activations have been recomputed. This feature is enabled
      by default, but can be disabled with :func:`set_checkpoint_early_stop`.
      Reentrant checkpoint always recomputes :attr:`function` in its
      entirety during the backward pass.

    * The reentrant variant does not record the autograd graph during the
      forward pass, as it runs with the forward pass under
      :func:`torch.no_grad`. The non-reentrant version does record the
      autograd graph, allowing one to perform backward on the graph within
      checkpointed regions.

    * The reentrant checkpoint only supports the
      :func:`torch.autograd.backward` API for the backward pass without its
      `inputs` argument, while the non-reentrant version supports all ways
      of performing the backward pass.

    * At least one input and output must have ``requires_grad=True`` for the
      reentrant variant. If this condition is unmet, the checkpointed part
      of the model will not have gradients. The non-reentrant version does
      not have this requirement.

    * The reentrant version does not consider tensors in nested structures
      (e.g., custom objects, lists, dicts, etc) as participating in
      autograd, while the non-reentrant version does.

    * The reentrant checkpoint does not support checkpointed regions with
      detached tensors from the computational graph, whereas the
      non-reentrant version does. For the reentrant variant, if the
      checkpointed segment contains tensors detached using ``detach()`` or
      with :func:`torch.no_grad`, the backward pass will raise an error.
      This is because ``checkpoint`` makes all the outputs require gradients
      and this causes issues when a tensor is defined to have no gradient in
      the model. To avoid this, detach the tensors outside of the
      ``checkpoint`` function.

Args:
    function: describes what to run in the forward pass of the model or
        part of the model. It should also know how to handle the inputs
        passed as the tuple. For example, in LSTM, if user passes
        ``(activation, hidden)``, :attr:`function` should correctly use the
        first input as ``activation`` and the second input as ``hidden``
    preserve_rng_state(bool, optional):  Omit stashing and restoring
        the RNG state during each checkpoint. Note that under torch.compile,
        this flag doesn't take effect and we always preserve RNG state.
        Default: ``True``
    use_reentrant(bool):
        specify whether to use the activation checkpoint variant that
        requires reentrant autograd. This parameter should be passed
        explicitly. In version 2.5 we will raise an exception if
        ``use_reentrant`` is not passed. If ``use_reentrant=False``,
        ``checkpoint`` will use an implementation that does not require
        reentrant autograd. This allows ``checkpoint`` to support additional
        functionality, such as working as expected with
        ``torch.autograd.grad`` and support for keyword arguments input into
        the checkpointed function.
    context_fn(Callable, optional): A callable returning a tuple of two
        context managers. The function and its recomputation will be run
        under the first and second context managers respectively.
        This argument is only supported if ``use_reentrant=False``.
    determinism_check(str, optional): A string specifying the determinism
        check to perform. By default it is set to ``"default"`` which
        compares the shapes, dtypes, and devices of the recomputed tensors
        against those the saved tensors. To turn off this check, specify
        ``"none"``. Currently these are the only two supported values.
        Please open an issue if you would like to see more determinism
        checks. This argument is only supported if ``use_reentrant=False``,
        if ``use_reentrant=True``, the determinism check is always disabled.
    debug(bool, optional): If ``True``, error messages will also include
        a trace of the operators ran during the original forward computation
        as well as the recomputation. This argument is only supported if
        ``use_reentrant=False``.
    args: tuple containing inputs to the :attr:`function`

Returns:
    Output of running :attr:`function` on :attr:`*args`
Nae  torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.   )
stacklevelTr   zUnexpected keyword arguments: ,c              3   $   #    U  H  ov   M     g 7frc   rn   )r   r   s     r<   r  %unsloth_checkpoint.<locals>.<genexpr>  s     7Nvvs   FzKPassing `context_fn` or `debug` is only supported when use_reentrant=False.)warningswarnpopr0   joinr  r   r   r  nextStopIteration)
r   rF   r  r  r  r[   r   preservegenrets
             r<   unsloth_checkpointr-  |  s    r C 	
  zz.5H-,sxx7Nv7N/NN
 	
 _,U0B'  )..xIDII5
u
GK
OU
 	S	''	I 	J	s   ;C CCc                 t   [         R                  R                  R                  R                  S:w  ao  [        U 5        [         R                  R                  R                  [         R                  R                  l        [        [         R                  R                  l        [         R                  R                  R                  R                  S:w  ae  [         R                  R                  R                  [         R                  R                  l        [        [         R                  R                  l        g g )Nr   r-  )
rW   r   r   CheckpointFunctionrp   r   _old_CheckpointFunctionr   r   r-  r*   s    r<   r   r     s    {{0099=XX1%89>9O9O9b9b64M1{{((115II161G1G1R1R.,>) Jr>   c                     [         R                  R                  R                  R                  S:X  Ga  [        [         R                  R                  S5      (       GaX  [         R                  R                  R                  [         R                  R                  l        [        [        [        5      5       HT  n [        [        U    S5      (       a  [        U    R                  S5        [        [        5      [        L d  MK  S [        U '   MV     [        [        [        5      5       HT  n [        [        U    S5      (       a  [        U    R                  S5        [        [        5      [        L d  MK  S [        U '   MV     S q	S q[         R                  R                  5         [         R"                  " 5         [         R                  R                  R                  R                  S:X  al  [        [         R                  R                  S5      (       aB  [         R                  R                  R$                  [         R                  R                  l        g g g )Nr   r0  r   r   r-  r   )rW   r   r   r/  rp   rI   r0  r2   rK   r   r   r,   r  r   r"   empty_cachegccollectr   r   s    r<   r   r   +  s~   11::>YY&&(ABB49KK4J4J4b4b1 s;'(A{1~y11;q>3I3I!3LK D(4+a. ) s;'(A{1~y11;q>3I3I!3LK D(4+a. ) 

 


))226JJ&&(9::,1KK,B,B,R,R) 	; 	Kr>   c                  8   [         b  [        c  g[        [         5      S:X  a  g[        [        [         5      5       H  n U [        :  aD  [         U    b8  [        [         U    S5      (       a  [         U    R                  [        5        MM  MO  MQ  [         U    b0  [        [         U    S5      (       a  [         U    R                  S5        S[         U '   M      [        [         5      [        :  a  [         [        S2	  [        [        [        5      5       HE  n [        U    c  M  [        [        U    S5      (       d  M)  [        U    R                  [        5        MG      Sq	Sq
SqSqSqSq[        R                   R#                  5         [$        R&                  " 5         g)ax  
All Unsloth Zoo code licensed under LGPLv3

Resets CPU_BUFFERS and GPU_BUFFERS to their initial sizes after training.

This function should be called after trainer.train() completes to free up
memory that was allocated during training while keeping the buffers ready
for another potential training run. Unlike unpatch_unsloth_smart_gradient_checkpointing,
this does NOT destroy the buffers or unpatch the checkpointing - it just resets
them to their initial state.

Usage:
    trainer.train()
    reset_unsloth_gradient_checkpointing_buffers()  # Free memory, stay ready
    # Can run trainer.train() again without re-initializing
Nr   r   T)r   r   rK   r2   INITIAL_CPU_BUFFER_COUNTrI   r   INITIAL_CPU_BUFFER_SIZEINITIAL_GPU_BUFFER_SIZEr   r   r   r   r   r   rW   r"   r2  r3  r4  r5  s    r<   r   r   E  sV   6 k1
;1 3{#$''1~)gk!ni.P.PA&&'>? /Q) 1~)gk!ni.P.PA&&q)!KN % 	 ;22012 3{#$q>%'+a.)*L*LN""#:; % 	 IMMJN 
JJJJLr>   c                    [        [        5      S:X  a  [        US   R                  5        [        R
                  " U /UQ76 $ )Nr   )rK   r   r   r+   r   r   r   s       r<   r   r     s7     ;11$q'--@$**8;d;;r>   )r)   )r)   T)r'   Nrc   )MrW   numpyr.   typingr   r   r   r   r   r   r   r$  r3  r   r
   r   r$   r   r   r   r   r   r   __all__r8  r9  r7  __version__torch_versionr"   r	  
custom_fwdrv   
custom_bwdrw   r-   strr=   r   boolr   rh   Functionr   r   _disable_dynamor   r   r   r   r   torch.utils.checkpointr   r   r   r   r   r   r   r   r   streamr   r   r   r   r   r   r  r  r  r  r-  r   r   r   r   rn   r>   r<   <module>rH     sA  "   > > 	  	 & & % (  !!
=GG,, ::>>44 ::>>44 99//f/E 99//f/E 
 ,2c3h( 
#Y0 
 9?$U38_5 
#Y 
 9?-1$A$A$U38_5$A %TN$A 
	$AJ 	enn.E.E 	< 	ENN$;$; 	6  AE @ @ ( ( 	 	 
 
 
 7; /$  /!zz((Eyy''	<z r	 7 7 r	f    %)FU6] D>] U>>#ABBC	]
 ] ] ]| 	? S. BF  KO < <
 r>   