
    ΅i                     X   S SK Jr  S SKrS SKJr  S SKJrJr  S SKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJr  / S
Qr " S S\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      r " S S\\5      r " S S\5      rg)    )AnyN)Tensor)
functionalinit)	ParameterUninitializedBufferUninitializedParameter   )SyncBatchNorm)LazyModuleMixin)Module)BatchNorm1dLazyBatchNorm1dBatchNorm2dLazyBatchNorm2dBatchNorm3dLazyBatchNorm3dr   c                      ^  \ rS rSr% SrSr/ SQr\\S'   \	\S'   \	S-  \S'   \
\S	'   \
\S
'         SS\S\	S\	S-  S	\
S
\
SS4U 4S jjjrSS jrSS jrS rS r  SU 4S jjrSrU =r$ )	_NormBase   z,Common base of _InstanceNorm and _BatchNorm.   )track_running_statsmomentumepsnum_featuresaffiner   r   Nr   r   r   returnc                   > XgS.n[         TU ]  5         Xl        X l        X0l        X@l        XPl        U R
                  (       aK  [        [        R                  " U40 UD65      U l
        [        [        R                  " U40 UD65      U l        O$U R                  SS 5        U R                  SS 5        U R                  (       a  U R                  S[        R                  " U40 UD65        U R                  S[        R                  " U40 UD65        U   U   U R                  S[        R                   "  SS[        R"                  0UR%                  5        V	V
s0 s H  u  pU	S:w  d  M  X_M     sn
n	D65        U   O6U R                  SS 5        U R                  SS 5        U R                  SS 5        U R'                  5         g s  sn
n	f )	Ndevicedtypeweightbiasrunning_meanrunning_varnum_batches_trackedr!   r   )super__init__r   r   r   r   r   r   torchemptyr"   r#   register_parameterregister_bufferzerosonestensorlongitemsreset_parameters)selfr   r   r   r   r   r    r!   factory_kwargskv	__class__s              T/home/james-whalen/.local/lib/python3.13/site-packages/torch/nn/modules/batchnorm.pyr)   _NormBase.__init__&   s    %+;( #6 ;;#EKK$O$OPDK!%++l"Mn"MNDI##Hd3##FD1##  L KN K   uzz,I.I   %** )7(<(<(>O(>!w,tqt(>O	   6  5  !6= Ps   G'Gc                     U R                   (       aP  U R                  R                  5         U R                  R	                  S5        U R
                  R                  5         g g )Nr
   )r   r$   zero_r%   fill_r&   r4   s    r9   reset_running_stats_NormBase.reset_running_statsV   sJ    ## ##%""1%$$**, $    c                     U R                  5         U R                  (       aA  [        R                  " U R                  5        [        R
                  " U R                  5        g g N)r?   r   r   ones_r"   zeros_r#   r>   s    r9   r3   _NormBase.reset_parameters^   s:      ";;JJt{{#KK		" rA   c                     [         erC   )NotImplementedErrorr4   inputs     r9   _check_input_dim_NormBase._check_input_dimd   s    !!rA   c                 :    SR                   " S0 U R                  D6$ )Nzj{num_features}, eps={eps}, momentum={momentum}, affine={affine}, track_running_stats={track_running_stats} )format__dict__r>   s    r9   
extra_repr_NormBase.extra_reprg   s)    88>? PAEP	
rA   c           	      t  > UR                  SS 5      nUb  US:  a  U R                  (       av  US-   n	X;  al  U R                  b:  U R                  R                  [        R                  " S5      :w  a  U R                  O"[        R
                  " S[        R                  S9X'   [        T
U ]!  UUUUUUU5        g )Nversionr   r&   metar   )r!   )	getr   r&   r    r*   r0   r1   r(   _load_from_state_dict)r4   
state_dictprefixlocal_metadatastrictmissing_keysunexpected_keys
error_msgsrT   num_batches_tracked_keyr8   s             r9   rW   _NormBase._load_from_state_dictm   s     !$$Y5Ow{0H0H '-/D&D#&8 //;00775<<;OO ,, auzz:	 3 	%	
rA   )r   r#   r   r   r   r   r"   h㈵>皙?TTNNr   N)__name__
__module____qualname____firstlineno____doc___version__constants__int__annotations__floatboolr)   r?   r3   rK   rQ   rW   __static_attributes____classcell__r8   s   @r9   r   r      s    6HXM	JdlL !$$(. .  .  $,	. 
 .  ".  
.  . `-#"
 
 
 
  
rA   r   c                   l   ^  \ rS rSr      SS\S\S\S-  S\S\SS4U 4S	 jjjrS
\S\4S jr	Sr
U =r$ )
_BatchNorm   Nr   r   r   r   r   r   c                 4   > XgS.n[         T	U ]  " XX4U40 UD6  g Nr   )r(   r)   )
r4   r   r   r   r   r   r    r!   r5   r8   s
            r9   r)   _BatchNorm.__init__   s*     %+;x1D	
HV	
rA   rJ   c           
         U R                  U5        U R                  c  SnOU R                  nU R                  (       ak  U R                  (       aZ  U R                  bM  U R                  R                  S5        U R                  c  S[        U R                  5      -  nOU R                  n U R                  (       a  SnO#U R                  S L =(       a    U R                  S L n [        R                  " UU R                  (       a  U R                  (       a  U R                  OS U R                  (       a  U R                  (       a  U R                  OS U R                  U R                  UUU R                  5      $ )N        r
         ?T)rK   r   trainingr   r&   add_rn   r$   r%   F
batch_normr"   r#   r   )r4   rJ   exponential_average_factorbn_trainings       r9   forward_BatchNorm.forward   s'   e$
 == ),&)-&==T55''3((--a0==(14uT=U=U7V1V.15.	 ==K,,4T4;K;Kt;SK	
 || }}(@(@ !!$(MMT5M5MDSWKKII&HH
 	
rA   rN   ra   )re   rf   rg   rh   rl   rn   ro   r)   r   r   rp   rq   rr   s   @r9   rt   rt      sw     !$$(

 
 $,	

 
 "
 

 
0
V 0
 0
 0
rA   rt   c                   n   ^  \ rS rSr% \\S'   \\S'         S S	U 4S jjjrS	U 4S jjrS	S jrSr	U =r
$ )
_LazyNormBase   r"   r#   c           
        > XVS.n[         T
U ]  " SUUSS40 UD6  X0l        X@l        U R                  (       a   [	        S0 UD6U l        [	        S0 UD6U l        U R                  (       ax  [        S0 UD6U l        [        S0 UD6U l	        [        R                  "  SS[        R                  0UR                  5        VV	s0 s H  u  pUS:w  d  M  X_M     sn	nD6U l        g g s  sn	nf )Nr   r   Fr!   rN   r'   )r(   r)   r   r   r	   r"   r#   r   r$   r%   r*   r0   r1   r2   r&   )r4   r   r   r   r   r    r!   r5   r6   r7   r8   s             r9   r)   _LazyNormBase.__init__   s     %+; 		
 		
 #6 ;;0B>BDK.@@DI## 3 En ED2D^DD',||(jj( %3$8$8$:K$:DAa7l414$:K	(D$ $ Ls   ?C"C"c                 p   > U R                  5       (       d   U R                  S:w  a  [        TU ]  5         g g g )Nr   )has_uninitialized_paramsr   r(   r3   )r4   r8   s    r9   r3   _LazyNormBase.reset_parameters   s3    ,,..43D3D3IG$& 4J.rA   c                    U R                  5       (       Ga3  UR                  S   U l        U R                  (       a  [	        U R
                  [        5      (       d  [        S5      e[	        U R                  [        5      (       d  [        S5      eU R
                  R                  U R                  45        U R                  R                  U R                  45        U R                  (       aL  U R                  R                  U R                  45        U R                  R                  U R                  45        U R                  5         g g )Nr
   z-self.weight must be an UninitializedParameterz+self.bias must be an UninitializedParameter)r   shaper   r   
isinstancer"   r	   AssertionErrorr#   materializer   r$   r%   r3   rI   s     r9   initialize_parameters#_LazyNormBase.initialize_parameters  s    ((** %AD{{!$++/EFF(G  "$))-CDD()VWW''):):(<=		%%t'8'8&:;''!!--&&(   ,,&&( !!#% +rA   )r   r#   r&   r   r$   r%   r   r"   ra   rd   )re   rf   rg   rh   r	   rm   r)   r3   r   rp   rq   rr   s   @r9   r   r      sG    ""
    & 
& &P'
$ $rA   r   c                   "    \ rS rSrSrSS jrSrg)r   i  a  Applies Batch Normalization over a 2D or 3D input.

Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the number of features or channels of the input). By default, the
elements of :math:`\gamma` are set to 1 and the elements of :math:`\beta` are set to 0.
At train time in the forward pass, the variance is calculated via the biased estimator,
equivalent to ``torch.var(input, correction=0)``. However, the value stored in the
moving average of the variance is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, L)` slices, it's common terminology to call this Temporal Batch Normalization.

Args:
    num_features: number of features or channels :math:`C` of the input
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C)` or :math:`(N, C, L)`, where :math:`N` is the batch size,
      :math:`C` is the number of features or channels, and :math:`L` is the sequence length
    - Output: :math:`(N, C)` or :math:`(N, C, L)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm1d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm1d(100, affine=False)
    >>> input = torch.randn(20, 100)
    >>> output = m(input)
Nc                     UR                  5       S:w  a2  UR                  5       S:w  a  [        SUR                  5        S35      eg g Nr      zexpected 2D or 3D input (got D input)dim
ValueErrorrI   s     r9   rK   BatchNorm1d._check_input_dimb  @    99;!		q 0<UYY[MRSS !1rA   rN   rd   re   rf   rg   rh   ri   rK   rp   rN   rA   r9   r   r     s    DLTrA   r   c                   &    \ rS rSrSr\rSS jrSrg)r   ig  a  A :class:`torch.nn.BatchNorm1d` module with lazy initialization.

Lazy initialization based on the ``num_features`` argument of the :class:`BatchNorm1d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
Nc                     UR                  5       S:w  a2  UR                  5       S:w  a  [        SUR                  5        S35      eg g r   r   rI   s     r9   rK    LazyBatchNorm1d._check_input_dim  r   rA   rN   rd   )	re   rf   rg   rh   ri   r   cls_to_becomerK   rp   rN   rA   r9   r   r   g  s    4  MTrA   r   c                   "    \ rS rSrSrSS jrSrg)r   i  a  Applies Batch Normalization over a 4D input.

4D is a mini-batch of 2D inputs
with additional channel dimension. Method described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
standard-deviation is calculated via the biased estimator, equivalent to
``torch.var(input, correction=0)``. However, the value stored in the moving average of the
standard-deviation is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, H, W)` slices, it's common terminology to call this Spatial Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C, H, W)`
    - Output: :math:`(N, C, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm2d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm2d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45)
    >>> output = m(input)
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg N   zexpected 4D input (got r   r   rI   s     r9   rK   BatchNorm2d._check_input_dim  0    99;!6uyy{m8LMM rA   rN   rd   r   rN   rA   r9   r   r         ENNrA   r   c                   &    \ rS rSrSr\rSS jrSrg)r   i  a  A :class:`torch.nn.BatchNorm2d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm2d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg r   r   rI   s     r9   rK    LazyBatchNorm2d._check_input_dim  r   rA   rN   rd   )	re   rf   rg   rh   ri   r   r   rK   rp   rN   rA   r9   r   r         4  MNrA   r   c                   "    \ rS rSrSrSS jrSrg)r   i  a  Applies Batch Normalization over a 5D input.

5D is a mini-batch of 3D inputs with additional channel dimension as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over
the mini-batches and :math:`\gamma` and :math:`\beta` are learnable parameter vectors
of size `C` (where `C` is the input size). By default, the elements of :math:`\gamma` are set
to 1 and the elements of :math:`\beta` are set to 0. At train time in the forward pass, the
standard-deviation is calculated via the biased estimator, equivalent to
``torch.var(input, correction=0)``. However, the value stored in the moving average of the
standard-deviation is calculated via the unbiased  estimator, equivalent to
``torch.var(input, correction=1)``.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done over the `C` dimension, computing statistics
on `(N, D, H, W)` slices, it's common terminology to call this Volumetric Batch Normalization
or Spatio-temporal Batch Normalization.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, D, H, W)`
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``

Shape:
    - Input: :math:`(N, C, D, H, W)`
    - Output: :math:`(N, C, D, H, W)` (same shape as input)

Examples::

    >>> # With Learnable Parameters
    >>> m = nn.BatchNorm3d(100)
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm3d(100, affine=False)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg N   zexpected 5D input (got r   r   rI   s     r9   rK   BatchNorm3d._check_input_dim@  r   rA   rN   rd   r   rN   rA   r9   r   r     r   rA   r   c                   &    \ rS rSrSr\rSS jrSrg)r   iE  a  A :class:`torch.nn.BatchNorm3d` module with lazy initialization.

Lazy initialization is done for the ``num_features`` argument of the :class:`BatchNorm3d` that is inferred
from the ``input.size(1)``.
The attributes that will be lazily initialized are `weight`, `bias`,
`running_mean` and `running_var`.

Check the :class:`torch.nn.modules.lazy.LazyModuleMixin` for further documentation
on lazy modules and their limitations.

Args:
    eps: a value added to the denominator for numerical stability.
        Default: 1e-5
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
Nc                 f    UR                  5       S:w  a  [        SUR                  5        S35      eg r   r   rI   s     r9   rK    LazyBatchNorm3d._check_input_dimb  r   rA   rN   rd   )	re   rf   rg   rh   ri   r   r   rK   rp   rN   rA   r9   r   r   E  r   rA   r   c                      ^  \ rS rSrSr       SS\S\S\S-  S\S\S	\S-  S
S4U 4S jjjr	SS jr
SS jrS\S
\4S jr\SS j5       rSrU =r$ )r   ig  a  Applies Batch Normalization over a N-Dimensional input.

The N-D input is a mini-batch of [N-2]D inputs with additional channel dimension) as described in the paper
`Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift <https://arxiv.org/abs/1502.03167>`__ .

.. math::

    y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

The mean and standard-deviation are calculated per-dimension over all
mini-batches of the same process groups. :math:`\gamma` and :math:`\beta`
are learnable parameter vectors of size `C` (where `C` is the input size).
By default, the elements of :math:`\gamma` are sampled from
:math:`\mathcal{U}(0, 1)` and the elements of :math:`\beta` are set to 0.
The standard-deviation is calculated via the biased estimator, equivalent to
`torch.var(input, correction=0)`.

Also by default, during training this layer keeps running estimates of its
computed mean and variance, which are then used for normalization during
evaluation. The running estimates are kept with a default :attr:`momentum`
of 0.1.

If :attr:`track_running_stats` is set to ``False``, this layer then does not
keep running estimates, and batch statistics are instead used during
evaluation time as well.

.. note::
    This :attr:`momentum` argument is different from one used in optimizer
    classes and the conventional notion of momentum. Mathematically, the
    update rule for running statistics here is
    :math:`\hat{x}_\text{new} = (1 - \text{momentum}) \times \hat{x} + \text{momentum} \times x_t`,
    where :math:`\hat{x}` is the estimated statistic and :math:`x_t` is the
    new observed value.

Because the Batch Normalization is done for each channel in the ``C`` dimension, computing
statistics on ``(N, +)`` slices, it's common terminology to call this Volumetric Batch
Normalization or Spatio-temporal Batch Normalization.

Currently :class:`SyncBatchNorm` only supports
:class:`~torch.nn.DistributedDataParallel` (DDP) with single GPU per process. Use
:meth:`torch.nn.SyncBatchNorm.convert_sync_batchnorm()` to convert
:attr:`BatchNorm*D` layer to :class:`SyncBatchNorm` before wrapping
Network with DDP.

Args:
    num_features: :math:`C` from an expected input of size
        :math:`(N, C, +)`
    eps: a value added to the denominator for numerical stability.
        Default: ``1e-5``
    momentum: the value used for the running_mean and running_var
        computation. Can be set to ``None`` for cumulative moving average
        (i.e. simple average). Default: 0.1
    affine: a boolean value that when set to ``True``, this module has
        learnable affine parameters. Default: ``True``
    track_running_stats: a boolean value that when set to ``True``, this
        module tracks the running mean and variance, and when set to ``False``,
        this module does not track such statistics, and initializes statistics
        buffers :attr:`running_mean` and :attr:`running_var` as ``None``.
        When these buffers are ``None``, this module always uses batch statistics.
        in both training and eval modes. Default: ``True``
    process_group: synchronization of stats happen within each process group
        individually. Default behavior is synchronization across the whole
        world

Shape:
    - Input: :math:`(N, C, +)`
    - Output: :math:`(N, C, +)` (same shape as input)

.. note::
    Synchronization of batchnorm statistics occurs only while training, i.e.
    synchronization is disabled when ``model.eval()`` is set or if
    ``self.training`` is otherwise ``False``.

Examples::

    >>> # xdoctest: +SKIP
    >>> # With Learnable Parameters
    >>> m = nn.SyncBatchNorm(100)
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> # Without Learnable Parameters
    >>> m = nn.BatchNorm3d(100, affine=False, process_group=process_group)
    >>> input = torch.randn(20, 100, 35, 45, 10)
    >>> output = m(input)

    >>> # network is nn.BatchNorm layer
    >>> sync_bn_network = nn.SyncBatchNorm.convert_sync_batchnorm(network, process_group)
    >>> # only single gpu per process is currently supported
    >>> ddp_sync_bn_network = torch.nn.parallel.DistributedDataParallel(
    >>>                         sync_bn_network,
    >>>                         device_ids=[args.local_rank],
    >>>                         output_device=args.local_rank)
Nr   r   r   r   r   process_groupr   c	                 @   > XxS.n	[         T
U ]  " XX4U40 U	D6  X`l        g rw   )r(   r)   r   )r4   r   r   r   r   r   r   r    r!   r5   r8   s             r9   r)   SyncBatchNorm.__init__  s2     %+;x1D	
HV	
 +rA   c                 f    UR                  5       S:  a  [        SUR                  5        S35      eg )Nr   z expected at least 2D input (got r   r   rI   s     r9   rK   SyncBatchNorm._check_input_dim  s/    99;??		}HUVV rA   c                 D    UR                  S5      S:X  a  [        S5      eg )Nr
   r   z9SyncBatchNorm number of input channels should be non-zero)sizer   rI   s     r9   _check_non_zero_input_channels,SyncBatchNorm._check_non_zero_input_channels  s'    ::a=AK  rA   rJ   c                 F   U R                  U5        U R                  U5        U R                  c  SnOU R                  nU R                  (       a{  U R                  (       aj  U R
                  c  [        S5      eU R
                  R                  S5        U R                  c  SU R
                  R                  5       -  nOU R                  n U R                  (       a  SnO#U R                  SL =(       a    U R                  SL n U R                  (       a  U R                  (       a  U R                  OSnU R                  (       a  U R                  (       a  U R                  OSnU=(       aV    U R                  =(       aC    [        R                  R                  5       =(       a    [        R                  R                  5       nU(       a  UR                  R                   SSS	[        R"                  R%                  5       4;  a*  ['        S
[        R"                  R%                  5        35      e[        R                  R(                  R*                  nU R,                  (       a  U R,                  n[        R                  R/                  U5      nUS:  nU(       d;  [0        R2                  " UUUU R4                  U R6                  UUU R8                  5      $ U(       d  [        S5      e[:        R<                  " UU R4                  U R6                  UUU R8                  UWW5	      $ )z
Runs the forward pass.
Nrz   z$num_batches_tracked must not be Noner
   r{   Tcudahpuxpuz;SyncBatchNorm expected input tensor to be on GPU or XPU or zbn_training must be True)rK   r   r   r|   r   r&   r   r}   itemr$   r%   r*   distributedis_availableis_initializedr    type_C_get_privateuse1_backend_namer   groupWORLDr   get_world_sizer~   r   r"   r#   r   sync_batch_normapply)	r4   rJ   r   r   r$   r%   	need_syncr   
world_sizes	            r9   r   SyncBatchNorm.forward  s    	e$++E2
 == ),&)-&==T55''/$%KLL$$))!,}}$-043K3K3P3P3R-R*-1]]*	 ==K,,4T4;K;Kt;SK	 &*]]d6N6NDTX 	 %)MMT5M5MDSW 	  33!!..03 !!002	 	 ||  668	)  !Qxx==?@B 
 "--3399M!! $ 2 2**99-HJ"QI <<		*	 	 $%?@@"((		*
 
rA   c                 6   Un[        U[        R                  R                  R                  R
                  5      (       Ga  [        R                  R                  UR                  UR                  UR                  UR                  UR                  U5      nUR                  (       a@  [        R                  " 5          UR                  Ul        UR                  Ul        SSS5        UR                  Ul        UR                   Ul        UR"                  Ul        UR$                  Ul        ['        US5      (       a  UR(                  Ul        UR+                  5        H%  u  pEUR-                  X@R/                  XR5      5        M'     AU$ ! , (       d  f       N= f)a  Converts all :attr:`BatchNorm*D` layers in the model to :class:`torch.nn.SyncBatchNorm` layers.

Args:
    module (nn.Module): module containing one or more :attr:`BatchNorm*D` layers
    process_group (optional): process group to scope synchronization,
        default is the whole world

Returns:
    The original :attr:`module` with the converted :class:`torch.nn.SyncBatchNorm`
    layers. If the original :attr:`module` is a :attr:`BatchNorm*D` layer,
    a new :class:`torch.nn.SyncBatchNorm` layer object will be returned
    instead.

Example::

    >>> # Network with nn.BatchNorm layer
    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> module = torch.nn.Sequential(
    >>>            torch.nn.Linear(20, 100),
    >>>            torch.nn.BatchNorm1d(100),
    >>>          ).cuda()
    >>> # creating process group (optional)
    >>> # ranks is a list of int identifying rank ids.
    >>> ranks = list(range(8))
    >>> r1, r2 = ranks[:4], ranks[4:]
    >>> # Note: every rank calls into new_group for every
    >>> # process group created, even if that rank is not
    >>> # part of the group.
    >>> # xdoctest: +SKIP("distributed")
    >>> process_groups = [torch.distributed.new_group(pids) for pids in [r1, r2]]
    >>> process_group = process_groups[0 if dist.get_rank() <= 3 else 1]
    >>> sync_bn_module = torch.nn.SyncBatchNorm.convert_sync_batchnorm(module, process_group)

Nqconfig)r   r*   nnmodules	batchnormrt   r   r   r   r   r   r   no_gradr"   r#   r$   r%   r&   r|   hasattrr   named_children
add_moduleconvert_sync_batchnorm)clsmoduler   module_outputnamechilds         r9   r   $SyncBatchNorm.convert_sync_batchnormL  s/   H fehh..88CCDD!HH22##

**M }}]]_+1==M()/M& % *0)<)<M&(.(:(:M%060J0JM-%+__M"vy))(.%!002KD$$00F 3  %_s   =#F


F)r   )rb   rc   TTNNNrd   rC   )re   rf   rg   rh   ri   rl   rn   ro   r   r)   rK   r   r   r   classmethodr   rp   rq   rr   s   @r9   r   r   g  s    dR !$$($(++ + $,	+
 + "+ Tz+ 
+ +"WaV a aF < <rA   r   )typingr   r*   r   torch.nnr   r~   r   torch.nn.parameterr   r   r	   
_functionsr   r   lazyr   r   r   __all__r   rt   r   r   r   r   r   r   r   rN   rA   r9   <module>r      s       * U U 8 ! t
 t
n@
 @
FE$OY E$PIT* ITXTmZ TDJN* JNZNmZ NDJN* JNZNmZ NDbJ brA   