
    cCi`Y                         S r SSKJrJrJr  SSKrSSKJrJ	r	J
r
Jr  SSKJr  SSKJr  SSKJrJrJr  \R(                  " \5      r " S	 S
\5      rS
/rg)z)Feature extractor class for UnivNetModel.    )AnyOptionalUnionN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc            )         ^  \ rS rSrSr/ SQr                     S/S\S\S\S\S\S	\S
\S\	S\
\   S\S\S\
\   S\S\S\S\S\S\S\S\4(U 4S jjjrS rS rS\R                  S\R                  4S jr S0S\S\
\R"                  R$                     S\R                  4S  jjrS0S\\R                     4S! jjr            S1S"\\R                  \\   \\R                     \\\      4   S\
\   S#\\\	\4   S$\
\   S%\S&\
\   S'\S\
\R"                  R$                     S(\S)\
\   S\
\	   S*\
\   S+\
\\	\4      S\4S, jjrS\\	\4   4U 4S- jjrS.rU =r$ )2UnivNetFeatureExtractor   a  
Constructs a UnivNet feature extractor.

This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
STFT implementation follows that of TacoTron 2 and Hifi-GAN.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 24000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value to pad with when applying the padding strategy defined by the `padding` argument to
        [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
        `__call__` will also use this padding value.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
        performance for some models.
    num_mel_bins (`int`, *optional*, defaults to 100):
        The number of mel-frequency bins in the extracted spectrogram features. This should match
        `UnivNetModel.config.num_mel_bins`.
    hop_length (`int`, *optional*, defaults to 256):
        The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
        that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
        the `hop_length` in ms.
    win_length (`int`, *optional*, defaults to 1024):
        The direct number of samples for each sliding window. Note that this is different from other audio feature
        extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
    win_function (`str`, *optional*, defaults to `"hann_window"`):
        Name for the window function used for windowing, must be accessible via `torch.{win_function}`
    filter_length (`int`, *optional*, defaults to 1024):
        The number of FFT components to use. If `None`, this is determined using
        `transformers.audio_utils.optimal_fft_length`.
    max_length_s (`int`, *optional*, defaults to 10):
        The maximum input length of the model in seconds. This is used to pad the audio.
    fmin (`float`, *optional*, defaults to 0.0):
        Minimum mel frequency in Hz.
    fmax (`float`, *optional*):
        Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
    mel_floor (`float`, *optional*, defaults to 1e-09):
        Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
        different than in [`transformers.audio_utils.spectrogram`].
    center (`bool`, *optional*, defaults to `False`):
        Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
        `t` will start at time `t * hop_length`.
    compression_factor (`float`, *optional*, defaults to 1.0):
        The multiplicative compression factor for dynamic range compression during spectral normalization.
    compression_clip_val (`float`, *optional*, defaults to 1e-05):
        The clip value applied to the waveform before applying dynamic range compression during spectral
        normalization.
    normalize_min (`float`, *optional*, defaults to -11.512925148010254):
        The min value used for Tacotron 2-style linear normalization. The default is the original value from the
        Tacotron 2 implementation.
    normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
        The max value used for Tacotron 2-style linear normalization. The default is the original value from the
        Tacotron 2 implementation.
    model_in_channels (`int`, *optional*, defaults to 64):
        The number of input channels to the [`UnivNetModel`] model. This should match
        `UnivNetModel.config.model_in_channels`.
    pad_end_length (`int`, *optional*, defaults to 10):
        If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
        number of appended samples will be `pad_end_length * hop_length`.
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
)input_featuresnoise_sequencepadding_maskfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfilter_lengthmax_length_sfminfmax	mel_floorcentercompression_factorcompression_clip_valnormalize_minnormalize_maxmodel_in_channelspad_end_lengthc           
        > [         TU ]  " SUUUUS.UD6  X@l        XPl        X`l        Xpl        Xl        Xl        Xl        Uc  [        U5      S-  nXl
        Xl        Xl        X-  U l        U R                  c  [        U R
                  5      U l        OU R                  U l        U R                  S-  S-   U l        [#        U R
                  U R                  SS9U l        ['        U R                   U R                  U R                  U R                  U R(                  SSS9U l        Xl        Xl        UU l        UU l        UU l        UU l        UU l        g )	N)r   r   r   return_attention_mask      T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__r   r   r   r   r   r   r    floatr!   r"   r   num_max_samplesr   n_fftn_freqsr
   windowr   r   mel_filtersr#   r$   r%   r&   r'   r(   r)   )selfr   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r+   kwargs	__class__s                          p/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/univnet/feature_extraction_univnet.pyr:    UnivNetFeatureExtractor.__init__e   sL   2 	 	
%''"7		

 	
 )($$(*	<'!+D	"(+;%+DOO<DJ++DJ

a1,%DOO$J[J[fjk*#|| --)))),,
 "4$8!**!2,    c                 ^    SXR                   -
  U R                  U R                   -
  -  -  S-
  $ )Nr,   r-   r&   r'   rA   r	   s     rD   	normalize!UnivNetFeatureExtractor.normalize   s2    [#5#55$:L:LtOaOa:abcfgggrF   c                 `    U R                   U R                  U R                   -
  US-   S-  -  -   $ )Nr-   r,   rH   rI   s     rD   denormalize#UnivNetFeatureExtractor.denormalize   s6    !!T%7%7$:L:L%LR]`aRaefQf$gggrF   waveformreturnc                    [         R                  " U[        U R                  U R                  -
  S-  5      [        U R                  U R                  -
  S-  5      4SS9n[        UU R                  U R                  U R                  U R                  SU R                  SSS9	n[         R                  " [         R                  " U5      S-  [         R                  " U5      S-  -   U R                  -   5      n[         R                  " U R                  R                  U5      n[         R                  " [         R                   " X@R"                  SS9U R$                  -  5      nUR                  $ )a  
Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
`int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.

Args:
    waveform (`np.ndarray` of shape `(length,)`):
        The input waveform. This must be a single real-valued, mono waveform.

Returns:
    `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
r,   reflect)modeN)r?   frame_lengthr   
fft_lengthpowerr#   r@   r"   )a_mina_max)nppadintr=   r   r	   r?   r#   sqrtrealimagr"   matmulr@   Tlogclipr%   r$   )rA   rO   complex_spectrogramamplitude_spectrogrammel_spectrogramlog_mel_spectrograms         rD   re   'UnivNetFeatureExtractor.mel_spectrogram   s-    66$**t.!34c4::;W[\:\6]^
 *;;zz;;

 !#GG'(A-8K0LPQ0QQTXTbTbb!
 ))D$4$4$6$68MN !ffGGO+D+DDQTXTkTkk

 #$$$rF   noise_length	generatorc                     Uc  [         R                  R                  5       nXR                  4nUR	                  U[         R
                  S9nU$ )aq  
Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
[`UnivNetModel.forward`].

Args:
    spectrogram_length (`int`):
        The length (dim 0) of the generated noise.
    model_in_channels (`int`, *optional*, defaults to `None`):
        The number of features (dim 1) of the generated noise. This should correspond to the
        `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
        `self.config.model_in_channels`.
    generator (`numpy.random.Generator`, *optional*, defaults to `None`)
        An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
        new generator with fresh entropy will be created.

Returns:
    `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
    model_in_channels)`.
dtype)rY   randomdefault_rngr(   standard_normalfloat32)rA   rh   ri   noise_shapenoises        rD   generate_noise&UnivNetFeatureExtractor.generate_noise   sH    0 		--/I#%;%;<))+RZZ)HrF   c                     U Vs/ s H.  o3R                  5       R                  SSS9R                  5       PM0     nnUb$  [        U5       VVs/ s H  u  pCUSX$    PM     nnnU$ s  snf s  snnf )aw  
Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
audio waveform arrays and not a single tensor/array because in general the waveforms will have different
lengths after removing padding.

Args:
    waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        The batched output waveforms from the [`UnivNetModel`].
    waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
        The batched lengths of each waveform before padding.

Returns:
    `list[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
cpuT)devicecopyN)detachtonumpy	enumerate)rA   	waveformswaveform_lengthsrO   is        rD   batch_decode$UnivNetFeatureExtractor.batch_decode  s|      \ee[dx__&))T)BHHJ[d	e'LUV_L`aL`[Q"7$4$78L`Ia f bs   5A%A*
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_noisepad_end
pad_lengthr+   return_tensorsc                 .	   Ub  UOU R                   nUbP  X R                  :w  a@  [        SU R                  R                   SU R                   SU R                   SU S3	5      eO-[
        R                  SU R                  R                   S35        [        U[        R                  5      =(       a    [        UR                  5      S:  nU(       a'  [        UR                  5      S	:  a  [        S
U  35      eU=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a4  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                   [        R                   " [        R"                  5      L a  UR%                  [        R                  5      nU(       d$  [        R                  " U[        R                  S9/nU	(       aO  U
b  U
OU R&                  n
U Vs/ s H1  n[        R(                  " USXR*                  -  4U R,                  S9PM3     nn[/        SU05      nU R)                  UUUb  UOU R0                  UUUS9nUR3                  S5      nU Vs/ s H  nU R5                  U5      PM     nn[        US   [        5      (       a8  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snUS'   O3U Vs/ s H"  nUR%                  [        R                  5      PM$     snUS'   UR3                  S5      nUb7  U Vs/ s H&  n[        R                  " U[        R6                  S9PM(     snUS'   U(       a7  US    Vs/ s H"  nU R9                  UR                  S   U5      PM$     nnUUS'   U(       a(  US    Vs/ s H  nU R;                  U5      PM     snUS'   Ub  UR=                  U5      nU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf s  snf )a'  
Main method to featurize and prepare for the model one or several sequence(s).

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
        pipeline.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
        Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
        padding index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).

        If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    truncation (`bool`, *optional*, defaults to `True`):
        Activates truncation to cut input sequences longer than `max_length` to `max_length`.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_noise (`bool`, *optional*, defaults to `True`):
        Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
    generator (`numpy.random.Generator`, *optional*, defaults to `None`):
        An optional `numpy.random.Generator` random number generator to use when generating noise.
    pad_end (`bool`, *optional*, defaults to `False`):
        Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
        generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
        padding will be done before the padding strategy specified in `padding` is performed.
    pad_length (`int`, *optional*, defaults to `None`):
        If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
        will default to `self.config.pad_end_length`.
    do_normalize (`bool`, *optional*):
        Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
        the performance for some models. If not set, this will default to `self.config.do_normalize`.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.np.array` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r-   r,   z2Only mono-channel audio is supported for input to r   rk   )constant_valuesr   )r   r   r   r   r+   attention_maskr   r   )r   r   
ValueErrorrC   __name__loggerwarning
isinstancerY   ndarraylenshapelisttupleasarrayrp   rl   float64astyper)   rZ   r   r   r   r<   getre   int32rs   rJ   convert_to_tensors)rA   r   r   r   r   r   r   r   ri   r   r   r   r+   r   is_batched_numpy
is_batchedspeechrO   batched_speechpadded_inputsr   mel_spectrogramsmelr   arrayr	   rr   s                              rD   __call__ UnivNetFeatureExtractor.__call__  s2   X (4'?|TEVEV$ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  3 NNVW[WeWeWnWnVo p\ \
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 MWXZ6"**V2::>ZJXJJz2::$F$FJbjjAJ
BJJ//J4D4DQSQ[Q[H\4\#**2::6J **ZrzzBCJ '1'=4CVCVJ !+ *H x!Z//%A!BTXTfTfg *  
 &'7&DE%/%;zAUAU!1"7 ! 
 '**+;<KYZ>xD00:>ZnQ'..]m/n]mVY

3bjj0Q]m/nN+,Rb/cRb3

2::0FRb/cN+, '**+;<%]k-l]kTYbjjbhh.O]k-lN>* $22B#C#CK ##K$5$5a$8)D#C   05N+,?MN^?_0?_{+?_0N+, %+>>~NNy Y( [ 0o/c
 .m0s0   -Q/8Q4Q9:-Q>2)R9-R:)R8Rc                 P   > [         TU ]  5       n/ SQnU H  nX1;   d  M
  X	 M     U$ )N)r?   r@   r=   r>   r<   )r9   to_dict)rA   outputnamesr/   rC   s       rD   r   UnivNetFeatureExtractor.to_dict  s2    " QD~L  rF   )r#   r%   r$   r   r   r!   r    r   r   r@   r"   r(   r=   r>   r'   r&   r<   r   r)   r   r   r?   )r-   i]          Fd         hann_windowr   
   r   Ng&.>Fg      ?gh㈵>g    'g    ă@@   r   T)N)NTNTNTNFNNNN) r   
__module____qualname____firstlineno____doc__model_input_namesr[   r;   boolstrr   r:   rJ   rM   rY   r   re   rm   	Generatorrs   r   r   r   r   r   r   r   dictr   r   __static_attributes____classcell__)rC   s   @rD   r   r      s   CJ M """)'+ $$'&*21!# "-J-J- J- 	J-
 J- J- J- J- J-  }J- J- J- uoJ- J- J-  "!J-" $#J-$ %J-& 'J-( )J-* +J- J-Xhh.%

 .%rzz .%f 48 BII//0 
	@RZZ@P 4 (,59$(,0!37$(&*04;?_"**d5k4

3CT$u+EVVW_  }_ tS/12	_
 SM_ _ %SM_ _ BII//0_ _ SM_ sm_  (~_ !sJ!78_ 
_B	c3h 	 	rF   r   )r   typingr   r   r   r{   rY   audio_utilsr   r   r	   r
   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerr   r   r   __all__r8   rF   rD   <module>r      sQ    0 ' '  \ \ I 4 9 9 
		H	%k6 k\ %
%rF   