
    bCi*                         S r SSKJrJr  SSKrSSKJrJrJ	r	  SSK
Jr  SSKJr  SSKJrJr  \R"                  " \5      r " S	 S
\5      rS
/rg)z"
Feature extractor class for CLVP
    )OptionalUnionN   )mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                   @  ^  \ rS rSrSrSS/r         SU 4S jjrS\R                  S\R                  4S jr	       SS
\
\R                  \\   \\R                     \\\      4   S\\   S\S\\   S\\
\\4      S\\   S\\   S	\\   S\4S jjrSrU =r$ )ClvpFeatureExtractor!   a  
Constructs a CLVP feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

Args:
    feature_size (`int`, *optional*, defaults to 80):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 22050):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    default_audio_length (`int`, *optional*, defaults to 6):
        The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
        automatically be set to default_audio_length * `self.sampling_rate`.
    hop_length (`int`, *optional*, defaults to 256):
        Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
    chunk_length (`int`, *optional*, defaults to 30):
        The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
        sequences.
    n_fft (`int`, *optional*, defaults to 1024):
        Size of the Fourier transform.
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
    mel_norms (`list` of length `feature_size`, *optional*):
        If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
        mel-filter.
    return_attention_mask (`bool`, *optional*, defaults to `False`):
        Whether to return the attention mask. If left to the default, it will return the attention mask.

        [What are attention masks?](../glossary#attention-mask)
input_featuresattention_maskc
           
         > [         TU ]  " S	UUUU	S.U
D6  X`l        X@l        XPl        XR-  U l        U R
                  U-  U l        X l        X0l        Xl	        [        SUS-  -   USSUSSS9U l        g )
N)feature_sizesampling_ratepadding_valuereturn_attention_mask              g     @@slaneyhtk)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   default_audio_length	mel_normsr   mel_filters)selfr   r   r*   r&   r'   r%   r   r+   r   kwargs	__class__s              j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/clvp/feature_extraction_clvp.pyr$   ClvpFeatureExtractor.__init__G   s     	 	
%''"7		

 	
 
$(%5!^^z9*$8!"* EQJ/( '
    waveformreturnc           
      J   [        U[        U R                  S5      U R                  U R                  SU R                  SS9n[
        R                  " [
        R                  " USSS95      nU R                  b*  U[
        R                  " U R                  5      SS2S4   -  nU$ )z
This method first computes the log-mel spectrogram of the provided audio then applies normalization along the
each mel-filterbank, if `mel_norms` is provided.
hanng       @N)frame_lengthr&   powerr,   log_melgh㈵>)a_mina_max)
r   r   r%   r&   r,   nplogclipr+   array)r-   r3   log_specs      r0   _np_extract_fbank_features/ClvpFeatureExtractor._np_extract_fbank_featuresm   s    
 DJJ/((
 66"''($dCD>>%"((4>>":1d7"CCHr2   
max_length
raw_speechr   
truncationpad_to_multiple_ofreturn_tensorsr   paddingc	                    UbP  X R                   :w  a@  [        SU R                  R                   SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R                   S35        [        U[        R                  5      =(       a    [        UR                  5      S:  n
U
(       a'  [        UR                  5      S	:  a  [        S
U  35      eU
=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a?  U Vs/ s H1  n[        R                  " U/[        R                  S9R                  PM3     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                   [        R                   " [        R"                  5      L a  UR%                  [        R                  5      nU(       d"  [        R                  " U/5      R                  /n['        SU05      nUc  U R(                  U R                   -  OUnU R+                  UUUUUUS9nUR-                  S5      R/                  S	SS5      nUS    Vs/ s H1  nU R1                  U5      R%                  [        R                  5      PM3     nn[        US   [        5      (       a+  U Vs/ s H  n[        R                  " U5      PM     snUS'   OXS'   UR3                  U5      $ s  snf s  snf s  snf )a  
`ClvpFeatureExtractor` is used to extract various voice specific properties such as the pitch and tone of the
voice, speaking speed, and even speaking defects like a lisp or stuttering from a sample voice or `raw_speech`.

First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
seconds long and then the log-mel spectrogram is extracted from it.

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
        pipeline.
    truncation (`bool`, *optional*, default to `True`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether to return the attention mask. If left to the default, it will return the attention mask.

        [What are attention masks?](../glossary#attention-mask)
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values / vectors.
    max_length (`int`, *optional*):
        The maximum input length of the inputs.
z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   )dtyper   )rH   rC   rE   rF   r   )r   
ValueErrorr/   __name__loggerwarning
isinstancer<   ndarraylenshapelisttupleasarrayfloat32TrK   float64astyper
   r*   padget	transposerA   convert_to_tensors)r-   rD   r   rE   rF   rG   r   rH   rC   r.   is_batched_numpy
is_batchedspeechbatched_speechpadded_inputsr   r3   features                     r0   __call__ClvpFeatureExtractor.__call__   s   f $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  3 NNVW[WeWeWnWnVo p\ \
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 Q[\Q[v"**fXRZZ@BBQ[J\JJz2::$F$FJbjjAJ
BJJ//J4D4DQSQ[Q[H\4\#**2::6J **j\2445J%'7&DEGQGYT..1C1CC_i
!!1"7 ! 
 '**+;<FFq!QO ZhhiYj
YjXD++H5<<RZZHYj 	 
 nQ'..R`.aR`wrzz'/BR`.aM*+.<*+//??G ]4

 /bs   8M8M M")	r'   r*   r&   r,   r+   r%   r(   r)   r   )	P   i"V           i   r   NF)NTNNTrC   N)rM   
__module____qualname____firstlineno____doc__model_input_namesr$   r<   rQ   rA   r   rT   floatr   intboolstrr   r
   re   __static_attributes____classcell__)r/   s   @r0   r   r   !   s.   !F *+;< #$
L2:: "** 2 (,,0;?04!-$(k@"**d5k4

3CT$u+EVVWk@  }k@ 	k@
 %SMk@ !sJ!78k@  (~k@ #k@ SMk@ 
k@ k@r2   r   )rn   typingr   r   numpyr<   audio_utilsr   r   r   !feature_extraction_sequence_utilsr	   feature_extraction_utilsr
   utilsr   r   
get_loggerrM   rN   r   __all__r"   r2   r0   <module>r~      sO     #  H H I 4 ( 
		H	%M@3 M@` "
"r2   