
    11ir^                        S r SSKrSSKJr  SSKJr  SSKrSSKJ	r	   " S S\
5      r " S S	\5      rS
 rS rSSSS.S\\-  S\S\S-  S\R                   \   \R"                  -  4S jjrSSS\R&                  \R&                  4S\R"                  S\S\S\S\\-  S\S\R"                  4S jjrS\R&                  SS\R&                  S.S\R"                  S\S\\-  S\S\S\S\\R"                  \R"                  4   4S jjrSSSSSS \R0                  S!.S\R"                  S-  S"\R"                  S-  S\S\S#\S\R"                  4S$ jjrS%S&S&S'.S(\R"                  S\S\S)\S*\S+\S\R"                  4S, jjrSS-SS.S\\-  S\S\S-  S\R                   \   \R"                  -  4S. jjrSSS/.S\S0\S-  S\R:                  \   \R"                  -  4S1 jjrg)2a+  
Copyright (c) 2013--2023, librosa development team.

Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.


***This file extracted from librosa package since we use only the trim() function and librosa requires many dependencies***

Reference:
    - https://gist.github.com/evq/82e95a363eeeb75d15dd62abc1eb1bde
    - https://github.com/librosa/librosa/blob/894942673d55aa2206df1296b6c4c50827c7f1d6/librosa/effects.py#L612
    N)Callable)Any)
as_stridedc                       \ rS rSrSrSrg)LibrosaError   z The root librosa exception class N__name__
__module____qualname____firstlineno____doc____static_attributes__r	       J/home/james-whalen/.local/lib/python3.13/site-packages/kokoro_onnx/trim.pyr   r      s    *r   r   c                       \ rS rSrSrSrg)ParameterError   z%Exception class for mal-formed inputsr	   Nr
   r	   r   r   r   r      s    /r   r   c                 @    U R                   S-  U R                  S-  -   $ )z*Efficiently compute abs2 on complex inputs   )realimag)xs    r   _cabs2r   '   s    6619qvvqy  r   c                     [         R                  " U 5      (       a!  [        U 5      nUc  U$ UR                  U5      $ [         R                  " XS9$ )a  Compute the squared magnitude of a real or complex array.

This function is equivalent to calling `np.abs(x)**2` but it
is slightly more efficient.

Parameters
----------
x : np.ndarray or scalar, real or complex typed
    The input data, either real (float32, float64) or complex (complex64, complex128) typed
dtype : np.dtype, optional
    The data type of the output array.
    If not provided, it will be inferred from `x`

Returns
-------
p : np.ndarray or scale, real
    squared magnitude of `x`

Examples
--------
>>> librosa.util.abs2(3 + 4j)
25.0

>>> librosa.util.abs2((0.5j)**np.arange(8))
array([1.000e+00, 2.500e-01, 6.250e-02, 1.562e-02, 3.906e-03, 9.766e-04,
   2.441e-04, 6.104e-05])
dtype)npiscomplexobjr   astypesquare)r   r   ys      r   abs2r$   ,   sE    8 
q1I=H88E?" yy((r   g      ?gh㈵>g      T@refamintop_dbr&   r'   r(   returnc                   [         R                  " U 5      n [         R                  " U R                  [         R                  5      (       a  [
        R                  " SSS9  [         R                  " U 5      n[        U5      (       a	  U" U5      nO[         R                  " U5      n[        U[         R                  5      (       a  UOSn[         R                  " XFS9n[        XuS-  US-  US9nU$ )aO  Convert an amplitude spectrogram to dB-scaled spectrogram.

This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
but is provided for convenience.

Parameters
----------
S : np.ndarray
    input amplitude

ref : scalar or callable
    If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
    ``20 * log10(S / ref)``.
    Zeros in the output correspond to positions where ``S == ref``.

    If callable, the reference value is computed as ``ref(S)``.

amin : float > 0 [scalar]
    minimum threshold for ``S`` and ``ref``

top_db : float >= 0 [scalar]
    threshold the output at ``top_db`` below the peak:
    ``max(20 * log10(S/ref)) - top_db``

Returns
-------
S_db : np.ndarray
    ``S`` measured in dB

See Also
--------
power_to_db, db_to_amplitude

Notes
-----
This function caches at level 30.
zamplitude_to_db was called on complex input so phase information will be discarded. To suppress this warning, call amplitude_to_db(np.abs(S)) instead.r   
stacklevelN)outr%   )r   asarray
issubdtyper   complexfloatingwarningswarnabscallable
isinstancendarrayr"   power_to_db)	Sr&   r'   r(   	magnitude	ref_value	out_arraypowerdbs	            r   amplitude_to_dbr>   T   s    X 	

1A	}}QWWb00117 		
 q	I}}	N	FF3K	'	2::>>	DIIIi/E qLtQwvVBIr   i   i   <   r#   frame_length
hop_length	aggregatec           	      (   [        XUS9n[        USSSS24   USS9nUR                  S:  aa  [        R                  " XW[        UR                  S-
  5      5      n[        R                  " U[        [        UR                  S-
  5      5      S9nXs* :  $ )a  Frame-wise non-silent indicator for audio input.

This is a helper function for `trim` and `split`.

Parameters
----------
y : np.ndarray
    Audio signal, mono or stereo

frame_length : int > 0
    The number of samples per frame

hop_length : int > 0
    The number of samples between frames

top_db : number
    The threshold (in decibels) below reference to consider as
    silence.
    You can also use a negative value for `top_db` to treat any value
    below `ref + |top_db|` as silent.  This will only make sense if
    `ref` is not `np.max`.

ref : callable or float
    The reference amplitude

aggregate : callable [default: np.max]
    Function to aggregate dB measurements across channels (if y.ndim > 1)

    Note: for multiple leading axes, this is performed using ``np.apply_over_axes``.

Returns
-------
non_silent : np.ndarray, shape=(m,), dtype=bool
    Indicator of non-silent frames
)r#   r@   rA   .r   N)r&   r(      )axis)rmsr>   ndimr   apply_over_axesrangesqueezetuple)r#   r@   rA   r(   r&   rB   mser=   s           r   _signal_to_frame_nonsilentrM      s    X 
DC %Sa^TJB 
ww{	uRWWq[/AB ZZuRWWq['9!:;<r   )r(   r&   r@   rA   rB   c          
      >   [        U UUUUUS9n[        R                  " U5      nUR                  S:  aG  [	        [        US   US95      n[        U R                  S   [	        [        US   S-   US95      5      n	OSu  pU SX24   [        R                  " X/5      4$ )a  Trim leading and trailing silence from an audio signal.

Silence is defined as segments of the audio signal that are `top_db`
decibels (or more) quieter than a reference level, `ref`.
By default, `ref` is set to the signal's maximum RMS value.
It's important to note that if the entire signal maintains a uniform
RMS value, there will be no segments considered quieter than the maximum,
leading to no trimming.
This implies that a completely silent signal will remain untrimmed with the default `ref` setting.
In these situations, an explicit value for `ref` (in decibels) should be used instead.

Parameters
----------
y : np.ndarray, shape=(..., n)
    Audio signal. Multi-channel is supported.
top_db : number
    The threshold (in decibels) below reference to consider as
    silence.
    You can also use a negative value for `top_db` to treat any value
    below `ref + |top_db|` as silent.  This will only make sense if
    `ref` is not `np.max`.
ref : number or callable
    The reference amplitude.  By default, it uses `np.max` and compares
    to the peak amplitude in the signal.
frame_length : int > 0
    The number of samples per analysis frame
hop_length : int > 0
    The number of samples between analysis frames
aggregate : callable [default: np.max]
    Function to aggregate across channels (if y.ndim > 1)

Returns
-------
y_trimmed : np.ndarray, shape=(..., m)
    The trimmed signal
index : np.ndarray, shape=(2,)
    the interval of ``y`` corresponding to the non-silent region:
    ``y_trimmed = y[index[0]:index[1]]`` (for mono) or
    ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo).

Examples
--------
>>> # Load some audio
>>> y, sr = librosa.load(librosa.ex('choice'))
>>> # Trim the beginning and ending silence
>>> yt, index = librosa.effects.trim(y)
>>> # Print the durations
>>> print(librosa.get_duration(y, sr=sr), librosa.get_duration(yt, sr=sr))
25.025986394557822 25.007891156462584
)r@   rA   r&   r(   rB   r   )rA   rD   r   r   .)	rM   r   flatnonzerosizeintframes_to_samplesminshaper.   )
r#   r(   r&   r@   rA   rB   
non_silentnonzerostartends
             r   trimr[      s    v ,	!J nnZ(G||a %gajZHIGGBK!'"+/jIJ
 
 S%)^bjj%666r   Tconstant)r#   r8   r@   rA   centerpad_moder   r8   r]   c           
         U b  U(       aY  [        U R                  5       Vs/ s H  nSPM     nn[        US-  5      [        US-  5      4US'   [        R                  " XUS9n [        XUS9n	[        R                  " [        XS9SS	S
9n
OUb  UR                  S   US-  S-   :w  aJ  [        SUR                  S    SUR                  S   S-  S-
   SUR                  S   S-  S-
   SU 35      e[        XS9n	U	SSSS24==   S-  ss'   US-  S:X  a  U	SSSS24==   S-  ss'   S[        R                  " U	SS	S
9-  US-  -  n
O[        S5      e[        R                  " U
5      nU$ s  snf )a  Compute root-mean-square (RMS) value for each frame, either from the
audio samples ``y`` or from a spectrogram ``S``.

Computing the RMS value from audio samples is faster as it doesn't require
a STFT calculation. However, using a spectrogram will give a more accurate
representation of energy over time because its frames can be windowed,
thus prefer using ``S`` if it's already available.

Parameters
----------
y : np.ndarray [shape=(..., n)] or None
    (optional) audio time series. Required if ``S`` is not input.
    Multi-channel is supported.
S : np.ndarray [shape=(..., d, t)] or None
    (optional) spectrogram magnitude. Required if ``y`` is not input.
frame_length : int > 0 [scalar]
    length of analysis frame (in samples) for energy calculation
hop_length : int > 0 [scalar]
    hop length for STFT. See `librosa.stft` for details.
center : bool
    If `True` and operating on time-domain input (``y``), pad the signal
    by ``frame_length//2`` on either side.
    If operating on spectrogram input, this has no effect.
pad_mode : str
    Padding mode for centered analysis.  See `numpy.pad` for valid
    values.
dtype : np.dtype, optional
    Data type of the output array.  Defaults to float32.

Returns
-------
rms : np.ndarray [shape=(..., 1, t)]
    RMS value for each frame

Examples
--------
>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> librosa.feature.rms(y=y)
array([[1.248e-01, 1.259e-01, ..., 1.845e-05, 1.796e-05]],
      dtype=float32)

Or from spectrogram input

>>> S, phase = librosa.magphase(librosa.stft(y))
>>> rms = librosa.feature.rms(S=S)

>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=2, sharex=True)
>>> times = librosa.times_like(rms)
>>> ax[0].semilogy(times, rms[0], label='RMS Energy')
>>> ax[0].set(xticks=[])
>>> ax[0].legend()
>>> ax[0].label_outer()
>>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
...                          y_axis='log', x_axis='time', ax=ax[1])
>>> ax[1].set(title='log Power spectrogram')

Use a STFT window of constant ones and no frame centering to get consistent
results with the RMS computed from the audio samples ``y``

>>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0]
>>> librosa.feature.rms(S=S)
>>> plt.show()

NrP   r   rO   )mode)r@   rA   r   T)rE   keepdimsrD   zSince S.shape[-2] is z!, frame_length is expected to be z or z; found .r   g      ?z Either `y` or `S` must be input.)rI   rG   rS   r   padframemeanr$   rV   r   sumsqrt)r#   r8   r@   rA   r]   r^   r   _paddingr   r<   
rms_results               r   rF   rF   *  s   V 	}',QVV}5}!v}G5|q013|q7H3IJGBKq1A!:F Q,2E	
772;,!+a// '} 5223''"+/A2E1Fd177SU;YZ?]^K^J_ `%(    	
#q!)!q c2qjMS M BFF1255aG?@@WWU^J? 6s   E+rO   F)rE   	writeablesubokr   rE   rk   rl   c                V   [         R                  " U SUS9n U R                  U   U:  a   [        SU R                  U   S SUS 35      eUS:  a  [        SUS 35      eU R                  [        U R                  U   /5      -   n[        U R                  5      nXs==   US-
  -  ss'   [        U5      [        U/5      -   n[        XXUS9n	US	:  a  US-
  n
OUS-   n
[         R                  " U	S
U
5      n	[        S5      /U	R                  -  n[        S	SU5      X'   U	[        U5         $ )ax  Slice a data array into (overlapping) frames.

This implementation uses low-level stride manipulation to avoid
making a copy of the data.  The resulting frame representation
is a new view of the same input data.

For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]``
can be framed with frame length 3 and hop length 2 in two ways.
The first (``axis=-1``), results in the array ``x_frames``::

    [[0, 2, 4],
     [1, 3, 5],
     [2, 4, 6]]

where each column ``x_frames[:, i]`` contains a contiguous slice of
the input ``x[i * hop_length : i * hop_length + frame_length]``.

The second way (``axis=0``) results in the array ``x_frames``::

    [[0, 1, 2],
     [2, 3, 4],
     [4, 5, 6]]

where each row ``x_frames[i]`` contains a contiguous slice of the input.

This generalizes to higher dimensional inputs, as shown in the examples below.
In general, the framing operation increments by 1 the number of dimensions,
adding a new "frame axis" either before the framing axis (if ``axis < 0``)
or after the framing axis (if ``axis >= 0``).

Parameters
----------
x : np.ndarray
    Array to frame
frame_length : int > 0 [scalar]
    Length of the frame
hop_length : int > 0 [scalar]
    Number of steps to advance between frames
axis : int
    The axis along which to frame.
writeable : bool
    If ``False``, then the framed view of ``x`` is read-only.
    If ``True``, then the framed view is read-write.  Note that writing to the framed view
    will also write to the input array ``x`` in this case.
subok : bool
    If True, sub-classes will be passed-through, otherwise the returned array will be
    forced to be a base-class array (default).

Returns
-------
x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES, ...)]
    A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension)::

        x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length]

    If ``axis=0`` (framing on the first dimension), then::

        x_frames[j] = x[j * hop_length : j * hop_length + frame_length]

Raises
------
ParameterError
    If ``x.shape[axis] < frame_length``, there is not enough data to fill one frame.

    If ``hop_length < 1``, frames cannot advance.

See Also
--------
numpy.lib.stride_tricks.as_strided

Examples
--------
Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame

>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
>>> frames
array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06],
       [-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05],
       ...,
       [ 7.960e-02, -2.335e-01, ..., -6.815e-06,  1.266e-05],
       [ 9.568e-02, -1.252e-01, ...,  7.397e-06, -1.921e-05]],
      dtype=float32)
>>> y.shape
(117601,)

>>> frames.shape
(2048, 1806)

Or frame along the first axis instead of the last:

>>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0)
>>> frames.shape
(1806, 2048)

Frame a stereo signal:

>>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), mono=False)
>>> y.shape
(2, 117601)
>>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
(2, 2048, 1806)

Carve an STFT into fixed-length patches of 32 frames with 50% overlap

>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> S = np.abs(librosa.stft(y))
>>> S.shape
(1025, 230)
>>> S_patch = librosa.util.frame(S, frame_length=32, hop_length=16)
>>> S_patch.shape
(1025, 32, 13)
>>> # The first patch contains the first 32 frames of S
>>> np.allclose(S_patch[:, :, 0], S[:, :32])
True
>>> # The second patch contains frames 16 to 16+32=48, and so on
>>> np.allclose(S_patch[:, :, 1], S[:, 16:48])
True
F)copyrl   zInput is too short (n=dz) for frame_length=rD   zInvalid hop_length: )stridesrV   rl   rk   r   rO   N)r   arrayrV   r   rp   rK   listr   moveaxisslicerG   )r   r@   rA   rE   rk   rl   out_stridesx_shape_trimmed	out_shapexwtarget_axisslicess               r   rd   rd     s=   F 	e,Awwt}|#$QWWT]1$55HVWHXY
 	
 A~3Jq>BCC ))eQYYt_$566K 177mO\A--o&~)>>I		i	
B axQhQh	R[	)B Dk]RWW$FD*-FLeFmr   g|=c                   [         R                  " U 5      n US::  a  [        S5      e[         R                  " U R                  [         R
                  5      (       a,  [        R                  " SSS9  [         R                  " U 5      nOU n[        U5      (       a	  U" U5      nO[         R                  " U5      nS[         R                  " [         R                  " X$5      5      -  nUS[         R                  " [         R                  " X%5      5      -  -  nUb8  US:  a  [        S5      e[         R                  " XfR                  5       U-
  5      nU$ )a	  Convert a power spectrogram (amplitude squared) to decibel (dB) units

This computes the scaling ``10 * log10(S / ref)`` in a numerically
stable way.

Parameters
----------
S : np.ndarray
    input power

ref : scalar or callable
    If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::

        10 * log10(S / ref)

    Zeros in the output correspond to positions where ``S == ref``.

    If callable, the reference value is computed as ``ref(S)``.

amin : float > 0 [scalar]
    minimum threshold for ``abs(S)`` and ``ref``

top_db : float >= 0 [scalar]
    threshold the output at ``top_db`` below the peak:
    ``max(10 * log10(S/ref)) - top_db``

Returns
-------
S_db : np.ndarray
    ``S_db ~= 10 * log10(S) - 10 * log10(ref)``

See Also
--------
perceptual_weighting
db_to_power
amplitude_to_db
db_to_amplitude

Notes
-----
This function caches at level 30.

Examples
--------
Get a power spectrogram from a waveform ``y``

>>> y, sr = librosa.load(librosa.ex('trumpet'))
>>> S = np.abs(librosa.stft(y))
>>> librosa.power_to_db(S**2)
array([[-41.809, -41.809, ..., -41.809, -41.809],
       [-41.809, -41.809, ..., -41.809, -41.809],
       ...,
       [-41.809, -41.809, ..., -41.809, -41.809],
       [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)

Compute dB relative to peak power

>>> librosa.power_to_db(S**2, ref=np.max)
array([[-80., -80., ..., -80., -80.],
       [-80., -80., ..., -80., -80.],
       ...,
       [-80., -80., ..., -80., -80.],
       [-80., -80., ..., -80., -80.]], dtype=float32)

Or compare to median power

>>> librosa.power_to_db(S**2, ref=np.median)
array([[16.578, 16.578, ..., 16.578, 16.578],
       [16.578, 16.578, ..., 16.578, 16.578],
       ...,
       [16.578, 16.578, ..., 16.578, 16.578],
       [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)

And plot the results

>>> import matplotlib.pyplot as plt
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
>>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
...                                   ax=ax[0])
>>> ax[0].set(title='Power spectrogram')
>>> ax[0].label_outer()
>>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
...                                  sr=sr, y_axis='log', x_axis='time', ax=ax[1])
>>> ax[1].set(title='Log-Power spectrogram')
>>> fig.colorbar(imgpow, ax=ax[0])
>>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
r   zamin must be strictly positivezpower_to_db was called on complex input so phase information will be discarded. To suppress this warning, call power_to_db(np.abs(D)**2) instead.r   r+   g      $@ztop_db must be non-negative)r   r.   r   r/   r   r0   r1   r2   r3   r4   log10maximummax)r8   r&   r'   r(   r9   r:   log_specs          r   r7   r7   ?  s    | 	

1Aqy=>>	}}QWWb00116 		
 FF1I		}}	N	FF3K	"((2::d+F"GGHrxx

4 ;<<<HA: !>??::h(?@Or   )rA   n_fftr   c                    SnUb  [        US-  5      n[        R                  " U 5      U-  U-   R                  [         5      $ )a  Convert frame indices to audio sample indices.

Parameters
----------
frames : number or np.ndarray [shape=(n,)]
    frame index or vector of frame indices
hop_length : int > 0 [scalar]
    number of samples between successive frames
n_fft : None or int > 0 [scalar]
    Optional: length of the FFT window.
    If given, time conversion will include an offset of ``n_fft // 2``
    to counteract windowing effects when using a non-centered STFT.

Returns
-------
times : number or np.ndarray
    time (in samples) of each given frame number::

        times[i] = frames[i] * hop_length

See Also
--------
frames_to_time : convert frame indices to time values
samples_to_frames : convert sample indices to frame indices

Examples
--------
>>> y, sr = librosa.load(librosa.ex('choice'))
>>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
>>> beat_samples = librosa.frames_to_samples(beats, sr=sr)
r   r   )rS   r   
asanyarrayr!   )framesrA   r   offsets       r   rT   rT     sA    J FUaZMM&!J.7??DDr   )r   r1   collections.abcr   typingr   numpyr   numpy.lib.stride_tricksr   	Exceptionr   r   r   r$   floatfloatingr6   r>   r~   rS   rM   rK   r[   float32boolrF   rd   r7   integerrT   r	   r   r   <module>r      s    $   .	9 		\ 	!
%)V  B 
	B 	B
 DLB [[

"BN FF&&8	zz88 8 	8
 
E	8 8 ZZ8| FF&&S7	zzS7 S7 
		S7
 S7 S7 S7 2::rzz!"S7p  
**l	zzDl 
zzDl 	l
 l l ZZlh c	zzc c 	c
 c c c ZZcR  | 
	| 	|
 DL| [[

"|D 	)E )E :	)E
 ZZ_rzz!)Er   