
    biV                        S SK r S SKJrJr  S SKrS SKJr  S SKJr  / r	    SS\R
                  S\R
                  S\R
                  S\
S	\S
\S\S\R
                  4S jjr " S S\R                  R                  5      r " S S\R                  R                  5      r " S S\R                  R                  5      r " S S\R                  R                  5      rg)    N)OptionalUnion)Tensor)
functionalpsd_spsd_nreference_vectorsolutiondiagonal_loadingdiag_epsepsreturnc                     US:X  a  [         R                  " XX$XV5      nU$ US:X  a  [         R                  " U 5      nO[         R                  " XX$US9n[         R                  " XX$XV5      nU$ )ax  Compute the MVDR beamforming weights with ``solution`` argument.

Args:
    psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
        Tensor with dimensions `(..., freq, channel, channel)`.
    psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
        Tensor with dimensions `(..., freq, channel, channel)`.
    reference_vector (torch.Tensor): one-hot reference channel matrix.
    solution (str, optional): Solution to compute the MVDR beamforming weights.
        Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
    diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
        (Default: ``True``)
    diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
        It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
    eps (float, optional): Value to add to the denominator in the beamforming weight formula.
        (Default: ``1e-8``)

Returns:
    torch.Tensor: the mvdr beamforming weight matrix
ref_channelstv_evd)r   r   )Fmvdr_weights_soudenrtf_evd	rtf_powermvdr_weights_rtf)	r   r   r	   r
   r   r   r   beamform_vectorstvs	            ^/home/james-whalen/.local/lib/python3.13/site-packages/torchaudio/transforms/_multi_channel.py_get_mvdr_vectorr      sr    : = //>Nbjp  y ))E"C++e,<jrsC,,S9I]ek    c                      ^  \ rS rSrSrSS\S\S\4U 4S jjjrSS\R                  S\
\R                     4S	 jjrS
rU =r$ )PSD7   a  Compute cross-channel power spectral density (PSD) matrix.

.. devices:: CPU CUDA

.. properties:: Autograd TorchScript

Args:
    multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
    normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``)
    eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``)

multi_mask	normalizer   c                 F   > [         TU ]  5         Xl        X l        X0l        g N)super__init__r   r    r   )selfr   r    r   	__class__s       r   r$   PSD.__init__D   s    $"r   specgrammaskc                     Ub   U R                   (       a  UR                  SS9n[        R                  " XU R                  U R
                  5      nU$ )a'  
Args:
    specgram (torch.Tensor): Multi-channel complex-valued spectrum.
        Tensor with dimensions `(..., channel, freq, time)`.
    mask (torch.Tensor or None, optional): Time-Frequency mask for normalization.
        Tensor with dimensions `(..., freq, time)` if multi_mask is ``False`` or
        with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
        (Default: ``None``)

Returns:
    torch.Tensor: The complex-valued PSD matrix of the input spectrum.
        Tensor with dimensions `(..., freq, channel, channel)`
dim)r   meanr   psdr    r   )r%   r(   r)   r/   s       r   forwardPSD.forwardJ   s@     yyRy(eeHDNNDHH=
r   )r   r   r    )FTgV瞯<r"   )__name__
__module____qualname____firstlineno____doc__boolfloatr$   torchr   r   r0   __static_attributes____classcell__r&   s   @r   r   r   7   sL    
4 D e   HU\\4J  r   r   c                   (  ^  \ rS rSrSr      SS\S\S\S\S\S\4U 4S	 jjjr	    SS
\
R                  S\
R                  S\
R                  S\
R                  S\
R                  S\S\S\S\S\
R                  4S jjrS
\
R                  S\
R                  S\
R                  4S jrS\
R                  S\
R                  S\
R                  4S jr SS\
R                  S\
R                  S\\
R                     S\
R                  4S jjrSrU =r$ )MVDRa   a(  Minimum Variance Distortionless Response (MVDR) module that performs MVDR beamforming with Time-Frequency masks.

.. devices:: CPU CUDA

.. properties:: Autograd TorchScript

Based on https://github.com/espnet/espnet/blob/master/espnet2/enh/layers/beamformer.py

We provide three solutions of MVDR beamforming. One is based on *reference channel selection*
:cite:`souden2009optimal` (``solution=ref_channel``).

.. math::
    \textbf{w}_{\text{MVDR}}(f) =        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}        {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u}

where :math:`\bf{\Phi}_{\textbf{SS}}` and :math:`\bf{\Phi}_{\textbf{NN}}` are the covariance        matrices of speech and noise, respectively. :math:`\bf{u}` is an one-hot vector to determine the         reference channel.

The other two solutions are based on the steering vector (``solution=stv_evd`` or ``solution=stv_power``).

.. math::
    \textbf{w}_{\text{MVDR}}(f) =        \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}        {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}

where :math:`\bm{v}` is the acoustic transfer function or the steering vector.        :math:`.^{\mathsf{H}}` denotes the Hermitian Conjugate operation.

We apply either *eigenvalue decomposition*
:cite:`higuchi2016robust` or the *power method* :cite:`mises1929praktische` to get the
steering vector from the PSD matrix of speech.

After estimating the beamforming weight, the enhanced Short-time Fourier Transform (STFT) is obtained by

.. math::
    \hat{\bf{S}} = {\bf{w}^\mathsf{H}}{\bf{Y}}, {\bf{w}} \in \mathbb{C}^{M \times F}

where :math:`\bf{Y}` and :math:`\hat{\bf{S}}` are the STFT of the multi-channel noisy speech and        the single-channel enhanced speech, respectively.

For online streaming audio, we provide a *recursive method* :cite:`higuchi2017online` to update the
PSD matrices of speech and noise, respectively.

Args:
    ref_channel (int, optional): Reference channel for beamforming. (Default: ``0``)
    solution (str, optional): Solution to compute the MVDR beamforming weights.
        Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
    multi_mask (bool, optional): If ``True``, only accepts multi-channel Time-Frequency masks. (Default: ``False``)
    diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to the covariance matrix
        of the noise. (Default: ``True``)
    diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
        It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
    online (bool, optional): If ``True``, updates the MVDR beamforming weights based on
        the previous covarience matrices. (Default: ``False``)

Note:
    To improve the numerical stability, the input spectrogram will be converted to double precision
    (``torch.complex128`` or ``torch.cdouble``) dtype for internal computation. The output spectrogram
    is converted to the dtype of the input spectrogram to be compatible with other modules.

Note:
    If you use ``stv_evd`` solution, the gradient of the same input may not be identical if the
    eigenvalues of the PSD matrix are not distinct (i.e. some eigenvalues are close or identical).
r   r
   r   diag_loadingr   onlinec                 
  > [         TU ]  5         US;  a  [        SR                  U5      5      eXl        X l        X0l        X@l        XPl        X`l	        [        U5      U l        [        R                  " S5      n[        R                  " S5      n[        R                  " S5      n	[        R                  " S5      n
U R                  SU5        U R                  SU5        U R                  SU	5        U R                  SU
5        g )N)r   r   	stv_powerzK`solution` must be one of ["ref_channel", "stv_evd", "stv_power"]. Given {}   r   r   
mask_sum_s
mask_sum_n)r#   r$   
ValueErrorformatr   r
   r   r@   r   rA   r   r/   r9   zerosregister_buffer)r%   r   r
   r   r@   r   rA   r   r   rE   rF   r&   s              r   r$   MVDR.__init__   s     	 
 

 ]ddemn  ' $( z?#kk!n#kk!n#(;;q>
#(;;q>
We,We,\:6\:6r   r   r   mask_smask_nr	   r   r   r   c
           	         U R                   (       a  UR                  SS9nUR                  SS9nU R                  R                  S:X  aB  Xl        X l        UR                  SS9U l        UR                  SS9U l        [        XXVXxU	5      $ U R                  X5      nU R                  X$5      nXl        X l        U R                  UR                  SS9-   U l        U R                  UR                  SS9-   U l        [        XXVXxU	5      $ )a>  Recursively update the MVDR beamforming vector.

Args:
    psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
        Tensor with dimensions `(..., freq, channel, channel)`.
    psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
        Tensor with dimensions `(..., freq, channel, channel)`.
    mask_s (torch.Tensor): Time-Frequency mask of the target speech.
        Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
        or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
    mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
        Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
        or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
    reference_vector (torch.Tensor): One-hot reference channel matrix.
    solution (str, optional): Solution to compute the MVDR beamforming weights.
        Options: [``ref_channel``, ``stv_evd``, ``stv_power``]. (Default: ``ref_channel``)
    diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
        (Default: ``True``)
    diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
        It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
    eps (float, optional): Value to add to the denominator in the beamforming weight formula.
        (Default: ``1e-8``)

Returns:
    torch.Tensor: The MVDR beamforming weight matrix.
r+   r,   rD   )r   r.   r   ndimr   sumrE   rF   r   _get_updated_psd_speech_get_updated_psd_noise)
r%   r   r   rL   rM   r	   r
   r   r   r   s
             r   _get_updated_mvdr_vectorMVDR._get_updated_mvdr_vector   s    L ??[[R[(F[[R[(F::??aJJ$jjRj0DO$jjRj0DO#E2BN^jmnn00?E//>EJJ"oo

r
0BBDO"oo

r
0BBDO#E2BN^jmnnr   c                     U R                   U R                   UR                  SS9-   -  nSU R                   UR                  SS9-   -  nU R                  US   -  XS   -  -   nU$ )a  Update psd of speech recursively.

Args:
    psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
        Tensor with dimensions `(..., freq, channel, channel)`.
    mask_s (torch.Tensor): Time-Frequency mask of the target speech.
        Tensor with dimensions `(..., freq, time)`.

Returns:
    torch.Tensor: The updated PSD matrix of target speech.
rO   r,   rD   .NN)rE   rQ   r   )r%   r   rL   	numeratordenominators        r   rR   MVDR._get_updated_psd_speech  i     OOt9K'KL	4??VZZBZ-??@

Y77%oB^:^^r   c                     U R                   U R                   UR                  SS9-   -  nSU R                   UR                  SS9-   -  nU R                  US   -  XS   -  -   nU$ )a  Update psd of noise recursively.

Args:
    psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
        Tensor with dimensions `(..., freq, channel, channel)`.
    mask_n (torch.Tensor or None, optional): Time-Frequency mask of the noise.
        Tensor with dimensions `(..., freq, time)`.

Returns:
    torch.Tensor:  The updated PSD matrix of noise.
rO   r,   rD   rW   )rF   rQ   r   )r%   r   rM   rX   rY   s        r   rS   MVDR._get_updated_psd_noise  r[   r   r(   c           
         UR                   nUR                  S:  a  [        SUR                   35      eUR	                  5       (       d  [        SUR                    35      eUR                   [
        R                  :X  a  UR                  5       nUc  [        R                  " S5        SU-
  nU R                  X5      nU R                  X5      n[
        R                  " UR                  5       SS UR                  [
        R                  S9nUS	U R                  4   R                  S5        U R                   (       a4  U R#                  XVX#XpR$                  U R&                  U R(                  5      nO,[+        XVXpR$                  U R&                  U R(                  5      n[,        R.                  " X5      n	U	R1                  U5      $ )
a  Perform MVDR beamforming.

Args:
    specgram (torch.Tensor): Multi-channel complex-valued spectrum.
        Tensor with dimensions `(..., channel, freq, time)`
    mask_s (torch.Tensor): Time-Frequency mask of target speech.
        Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
        or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
    mask_n (torch.Tensor or None, optional): Time-Frequency mask of noise.
        Tensor with dimensions `(..., freq, time)` if multi_mask is ``False``
        or with dimensions `(..., channel, freq, time)` if multi_mask is ``True``.
        (Default: None)

Returns:
    torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
   z?Expected at least 3D tensor (..., channel, freq, time). Found: ziThe type of ``specgram`` tensor must be ``torch.cfloat`` or ``torch.cdouble``.                    Found: Nz=``mask_n`` is not provided, use ``1 - mask_s`` as ``mask_n``.rD   )devicedtype.)rb   rP   rG   shape
is_complexr9   cfloatcdoublewarningswarnr/   rI   sizera   r   fill_rA   rT   r
   r@   r   r   r   apply_beamformingto)
r%   r(   rL   rM   rb   r   r   uw_mvdrspecgram_enhanceds
             r   r0   MVDR.forward#  sx   & ==1^_g_m_m^nopp""$$$NN+-  >>U\\)'')H>MMYZZF**KK,X__EMMZ	#t
 &&q);;22fa@Q@QSWS`S`F &eA}}dFWFWY]YfYfgF//A ##E**r   )r   r@   rF   rE   r   rA   r/   r   r   r   r
   )r   r   FTHz>Fr   Trq   :0yE>r"   )r2   r3   r4   r5   r6   intstrr7   r8   r$   r9   r   rT   rR   rS   r   r0   r:   r;   r<   s   @r   r>   r>   a   s   AJ % !!7!7 !7 	!7
 !7 !7 !7 !7T &!%7o||7o ||7o 	7o
 7o  ,,7o 7o 7o 7o 7o 
7orU\\ 5<< TYT`T` "ELL %,, SXS_S_ $ ^b1+1+.3ll1+DLU\\DZ1+	1+ 1+r   r>   c                   V    \ rS rSrSr   SS\S\S\S\\\4   S\S\	S	\	S
\4S jjr
Srg)RTFMVDRiW  a  Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise.

.. devices:: CPU CUDA

.. properties:: Autograd TorchScript

Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the relative transfer function (RTF) matrix
or the steering vector of target speech :math:`\bm{v}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:

.. math::
    \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)

where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin,
:math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation.

The beamforming weight is computed by:

.. math::
    \textbf{w}_{\text{MVDR}}(f) =
    \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}}
    {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}
r(   rtfr   reference_channelr   r   r   r   c                 b    [         R                  " X#XEXg5      n[         R                  " X5      n	U	$ )a  
Args:
    specgram (torch.Tensor): Multi-channel complex-valued spectrum.
        Tensor with dimensions `(..., channel, freq, time)`
    rtf (torch.Tensor): The complex-valued RTF vector of target speech.
        Tensor with dimensions `(..., freq, channel)`.
    psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
        Tensor with dimensions `(..., freq, channel, channel)`.
    reference_channel (int or torch.Tensor): Specifies the reference channel.
        If the dtype is ``int``, it represents the reference channel index.
        If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
        is one-hot.
    diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
        (Default: ``True``)
    diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
        It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
    eps (float, optional): Value to add to the denominator in the beamforming weight formula.
        (Default: ``1e-8``)

Returns:
    torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
)r   r   rk   )
r%   r(   rx   r   ry   r   r   r   rn   spectrum_enhanceds
             r   r0   RTFMVDR.forwardr  s2    @ ##C0AU]c//A  r    NTrq   rs   )r2   r3   r4   r5   r6   r   r   rt   r7   r8   r0   r:   r}   r   r   rw   rw   W  st    @ "&"!"! "! 	"!
 !f-"! "! "! "! 
"! "!r   rw   c                   j    \ rS rSrSr   SS\S\S\S\\\4   S\S\	S	\	S
\
R
                  4S jjrSrg)
SoudenMVDRi  aG  Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) module
based on the method proposed by *Souden et, al.* :cite:`souden2009optimal`.

.. devices:: CPU CUDA

.. properties:: Autograd TorchScript

Given the multi-channel complex-valued spectrum :math:`\textbf{Y}`, the power spectral density (PSD) matrix
of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and
a one-hot vector that represents the reference channel :math:`\bf{u}`, the module computes the single-channel
complex-valued spectrum of the enhanced speech :math:`\hat{\textbf{S}}`. The formula is defined as:

.. math::
    \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f)

where :math:`\textbf{w}_{\text{bf}}(f)` is the MVDR beamforming weight for the :math:`f`-th frequency bin.

The beamforming weight is computed by:

.. math::
    \textbf{w}_{\text{MVDR}}(f) =
    \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)}
    {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u}
r(   r   r   ry   r   r   r   r   c                 b    [         R                  " X#XEXg5      n[         R                  " X5      n	U	$ )a  
Args:
    specgram (torch.Tensor): Multi-channel complex-valued spectrum.
        Tensor with dimensions `(..., channel, freq, time)`.
    psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech.
        Tensor with dimensions `(..., freq, channel, channel)`.
    psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise.
        Tensor with dimensions `(..., freq, channel, channel)`.
    reference_channel (int or torch.Tensor): Specifies the reference channel.
        If the dtype is ``int``, it represents the reference channel index.
        If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension
        is one-hot.
    diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``.
        (Default: ``True``)
    diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading.
        It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``)
    eps (float, optional): Value to add to the denominator in the beamforming weight formula.
        (Default: ``1e-8``)

Returns:
    torch.Tensor: Single-channel complex-valued enhanced spectrum with dimensions `(..., freq, time)`.
)r   r   rk   )
r%   r(   r   r   ry   r   r   r   rn   r{   s
             r   r0   SoudenMVDR.forward  s2    @ &&u5FZbh//A  r   r}   Nr~   )r2   r3   r4   r5   r6   r   r   rt   r7   r8   r9   r0   r:   r}   r   r   r   r     sw    > "&"!"! "! 	"!
 !f-"! "! "! "! 
"! "!r   r   rr   )rg   typingr   r   r9   r   
torchaudior   r   __all__ru   r7   r8   r   nnModuler   r>   rw   r   r}   r   r   <module>r      s     "   &  "!&<<&<<& ll& 	&
 & & 
& \\&R'%((// 'Ts+588?? s+l=!ehhoo =!@<! <!r   