
    6bi                     t    S r SSKJs  Jr  SSKJr  SSKJr  \" SSS/S9 " S S	\R                  5      5       r
g)
zSGD optimizer implementation.    N)optimizer_v2)keras_exportzkeras.optimizers.legacy.SGDzkeras.optimizers.SGD)v1c                   z   ^  \ rS rSrSrSr    SU 4S jjrS rU 4S jrSS jr	U 4S jr
SS	 jrU 4S
 jrSrU =r$ )SGD   a	  Gradient descent (with momentum) optimizer.

Update rule for parameter `w` with gradient `g` when `momentum=0`:

```python
w = w - learning_rate * g
```

Update rule when `momentum` is larger than 0:

```python
velocity = momentum * velocity - learning_rate * g
w = w + velocity
```

When `nesterov=True`, this rule becomes:

```python
velocity = momentum * velocity - learning_rate * g
w = w + momentum * velocity - learning_rate * g
```

Args:
  learning_rate: A `Tensor`, floating point value, or a schedule that is a
    `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
    that takes no arguments and returns the actual value to use. The
    learning rate. Defaults to `0.01`.
  momentum: float hyperparameter >= 0 that accelerates gradient descent in
    the relevant direction and dampens oscillations. Vanilla gradient
    descent means no momentum. Defaults to `0.`.
  nesterov: boolean. Whether to apply Nesterov momentum.
    Defaults to `False`.
  name: Optional name prefix for the operations created when applying
    gradients.  Defaults to `"SGD"`.
  **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
    `clipnorm`, `global_clipnorm`.
    If `clipvalue` (float) is set, the gradient of each weight
    is clipped to be no higher than this value.
    If `clipnorm` (float) is set, the gradient of each weight
    is individually clipped so that its norm is no higher than this value.
    If `global_clipnorm` (float) is set the gradient of all weights is
    clipped so that their global norm is no higher than this value.

Usage:

>>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
>>> var = tf.Variable(1.0)
>>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var1) = var1
>>> step_count = opt.minimize(loss, [var]).numpy()
>>> # Step is `- learning_rate * grad`
>>> var.numpy()
0.9

>>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9)
>>> var = tf.Variable(1.0)
>>> val0 = var.value()
>>> loss = lambda: (var ** 2)/2.0         # d(loss)/d(var1) = var1
>>> # First step is `- learning_rate * grad`
>>> step_count = opt.minimize(loss, [var]).numpy()
>>> val1 = var.value()
>>> (val0 - val1).numpy()
0.1
>>> # On later steps, step-size increases because of momentum
>>> step_count = opt.minimize(loss, [var]).numpy()
>>> val2 = var.value()
>>> (val1 - val2).numpy()
0.18

Reference:
    - For `nesterov=True`, See [Sutskever et al., 2013](
      https://github.com/mlresearch/v28/blob/gh-pages/sutskever13.pdf).
Tc                   > [         TU ]  " U40 UD6  U R                  SUR                  SU5      5        U R                  SU R                  5        SU l        [        U[        R                  5      (       d  [        U5      (       d  US:  a  SU l        [        U[        [        45      (       a'  US:  d  US:  a  [        SU S	[        U5       S
35      eU R                  SU5        X0l        g )Nlearning_ratelrdecayFr   T   z6`momentum` must be between [0, 1]. Received: momentum=z
 (of type z).momentum)super__init__
_set_hyperget_initial_decay	_momentum
isinstancetfTensorcallableintfloat
ValueErrortypenesterov)selfr
   r   r   namekwargs	__class__s         i/home/james-whalen/.local/lib/python3.13/site-packages/tf_keras/src/optimizers/legacy/gradient_descent.pyr   SGD.__init__j   s     	((D-)HI!4!45x++!!!|!DNhe--qLHqL$:ZX/?rC  	
H-     c                 ^    U R                   (       a  U H  nU R                  US5        M     g g Nr   )r   add_slot)r   var_listvars      r"   _create_slotsSGD._create_slots   s&    >>c:.   r$   c                    > [         TU ]  XU5        [        R                  " U R	                  SU5      5      X1U4   S'   g r&   )r   _prepare_localr   identity
_get_hyper)r   
var_device	var_dtypeapply_stater!   s       r"   r-   SGD._prepare_local   s;    zkB;=;;OOJ	2<
+,Z8r$   c           
          UR                   UR                  R                  pTU=(       d    0 R                  XE45      =(       d    U R	                  XE5      nU R
                  (       ac  U R                  US5      n[        R                  R                  UR                  UR                  US   UUS   U R                  U R                  S9$ [        R                  R                  UR                  US   UU R                  S9$ )Nr   lr_t)r)   accumr   gradr   use_lockinguse_nesterov)r)   alphadeltar8   )devicedtype
base_dtyper   _fallback_apply_stater   get_slotr   raw_opsResourceApplyKerasMomentumhandle_use_lockingr   ResourceApplyGradientDescent)r   r7   r)   r2   r0   r1   coefficientsmomentum_vars           r"   _resource_apply_denseSGD._resource_apply_dense   s     #

CII,@,@I#)r..#
 ?''
> 	 >>==j9L::88JJ"))'%j1 --!]] 9   ::::JJ"6* --	 ;  r$   c                 `  > U R                   (       a  [        TU ]  " XU40 UD6$ UR                  UR                  R
                  peUR                  S0 5      R                  XV45      =(       d    U R                  XV5      n[        R                  R                  UR                  UU* US   -  S9$ )Nr2   r5   )resourceindicesupdates)r   r   (_resource_apply_sparse_duplicate_indicesr<   r=   r>   r   r?   r   rA   ResourceScatterAddrC   )	r   r7   r)   rL   r    r0   r1   rF   r!   s	           r"   rN   ,SGD._resource_apply_sparse_duplicate_indices   s     >>7C7&,  %(JJ		0D0D	!::mR8<<' C++JB  ::00V 44 1  r$   c                 r   UR                   UR                  R                  peU=(       d    0 R                  XV45      =(       d    U R	                  XV5      nU R                  US5      n[        R                  R                  UR                  UR                  US   UUUS   U R                  U R                  S9$ )Nr   r5   )r)   r6   r   r7   rL   r   r8   r9   )r<   r=   r>   r   r?   r@   r   rA    ResourceSparseApplyKerasMomentumrC   rD   r   )	r   r7   r)   rL   r2   r0   r1   rF   rG   s	            r"   _resource_apply_sparseSGD._resource_apply_sparse   s     #

CII,@,@I#)r..#
 ?''
> 	 }}S*5zz::

%%F#!*-)) ; 	
 		
r$   c                    > [         TU ]  5       nUR                  U R                  S5      U R                  U R                  S5      U R
                  S.5        U$ )Nr
   r   )r
   r   r   r   )r   
get_configupdate_serialize_hyperparameterr   r   )r   configr!   s     r"   rV   SGD.get_config   sY    #%!%!?!?#" ,, :::F MM		
 r$   )r   r   )g{Gz?g        Fr   )N)__name__
__module____qualname____firstlineno____doc___HAS_AGGREGATE_GRADr   r*   r-   rH   rN   rS   rV   __static_attributes____classcell__)r!   s   @r"   r   r      sH    
GR  !</

2&
& r$   r   )r_   tensorflow.compat.v2compatv2r   tf_keras.src.optimizers.legacyr    tensorflow.python.util.tf_exportr   OptimizerV2r    r$   r"   <module>rj      sO    $ " ! 7 : ! =>@,
"
" @	@r$   