
    cCiE                         S r SSKrSSKJrJrJr  SSKrSSKJ	r	J
r
JrJr  SSKJr  SSKJr  SSKJrJrJr  \R*                  " \5      r " S	 S
\5      rS
/rg)z%Feature extractor class for SpeechT5.    N)AnyOptionalUnion   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   @  ^  \ rS rSrSrSS/r              S%S\S\S\S\S	\S
\S\S\	S\S\S\S\S\S\4U 4S jjjr
\ S&S\\R                     S\\R                     S\S\\R                     4S jj5       rS\R                  S\R                  4S jr         S'S\\\R                  \\   \\R                     \\\      4      S\\\R                  \\   \\R                     \\\      4      S\\\	\4   S\\   S\S\\   S\\   S\\\	\4      S\\   S\4S jjr       S(S \\R                  \\   \\R                     \\\      4   S!\S\\\	\4   S\\   S\S\\   S\\   S\\\	\4      S\4S" jjrS\\	\4   4U 4S# jjrS$rU =r$ ))SpeechT5FeatureExtractor   ai	  
Constructs a SpeechT5 feature extractor.

This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
the SpeechT5 speech encoder prenet.

This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
prenet.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance for some models.
    num_mel_bins (`int`, *optional*, defaults to 80):
        The number of mel-frequency bins in the extracted spectrogram features.
    hop_length (`int`, *optional*, defaults to 16):
        Number of ms between windows. Otherwise referred to as "shift" in many papers.
    win_length (`int`, *optional*, defaults to 64):
        Number of ms per window.
    win_function (`str`, *optional*, defaults to `"hann_window"`):
        Name for the window function used for windowing, must be accessible via `torch.{win_function}`
    frame_signal_scale (`float`, *optional*, defaults to 1.0):
        Constant multiplied in creating the frames before applying DFT. This argument is deprecated.
    fmin (`float`, *optional*, defaults to 80):
        Minimum mel frequency in Hz.
    fmax (`float`, *optional*, defaults to 7600):
        Maximum mel frequency in Hz.
    mel_floor (`float`, *optional*, defaults to 1e-10):
        Minimum value of mel frequency banks.
    reduction_factor (`int`, *optional*, defaults to 2):
        Spectrogram length reduction factor. This argument is deprecated.
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
input_valuesattention_maskfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionframe_signal_scalefminfmax	mel_floorreduction_factorreturn_attention_maskc           
        > [         TU ]  " SXUS.UD6  X@l        Xl        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xr-  S-  U l        Xb-  S-  U l        [        U R                  5      U l        U R                   S-  S-   U l        [%        U R                  U R                  SS9U l        [)        U R"                  U R                  U R                  U R                  U R*                  SSS9U l        U	S	:w  a  [.        R0                  " S
[2        5        US:w  a  [.        R0                  " S[2        5        g g )N)r   r   r   i        T)window_lengthnameperiodicslaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale      ?zeThe argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformersg       @zcThe argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers )super__init__r   r"   r   r   r   r   r   r   r   r    r!   sample_sizesample_strider   n_fftn_freqsr
   windowr   r   mel_filterswarningswarnFutureWarning)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   kwargs	__class__s                   r/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/speecht5/feature_extraction_speecht5.pyr3   !SpeechT5FeatureExtractor.__init__N   s5   $ 	wl_lwpvw(%:"($$("4		" 0%5='74?'(8(89


a1,%D4D4D4K\K\gkl*#|| --)))),,
 $MMw s"MMu #    returnc                    Ub  [         R                  " U[         R                  5      n/ n[        XR	                  S5      5       Hl  u  pEXDSU R                  5       -
  [         R                  " USU R                  5       S-   5      -  nXVR                  S   :  a  X&US& UR                  U5        Mn     U$ U  Vs/ s H=  owUR                  5       -
  [         R                  " UR                  5       S-   5      -  PM?     nnU$ s  snf )zK
Every array in the list is normalized to have zero mean and unit variance
NgHz>r   )
nparrayint32zipsummeansqrtvarshapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slicexs           r@   zero_mean_unit_var_norm0SpeechT5FeatureExtractor.zero_mean_unit_var_norm   s     %XXnbhh?N"$"%l4F4Fr4J"K &)=)=)? ?2776RYSY?K^K^K`cgKgChh..q11,9)#**<8 #L #" Vb"bUaPQLBGGAEEGdN4K#KUa"b"" #cs   ;ADone_waveformc                     [        UU R                  U R                  U R                  U R                  U R
                  U R                  SS9nUR                  $ )zJ
Extracts log-mel filterbank features for one waveform array (unbatched).
log10)r8   frame_lengthr   
fft_lengthr9   r    log_mel)r	   r8   r4   r5   r6   r9   r    T)r=   rW   log_mel_specs      r@   _extract_mel_features.SpeechT5FeatureExtractor._extract_mel_features   sP     #;;))))zz((nn	
 ~~rB   audioaudio_targetpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc
                    Uc  Uc  [        S5      eU	b<  XR                  :w  a,  [        SU  SU R                   SU R                   SU	 S3	5      eO-[        R                  SU R                  R
                   S	35        Ub  U R                  " US
UUUUUU40 U
D6nOSnUb?  U R                  " USUUUUUU40 U
D6nUc  U$ US   US'   UR                  S5      nUb  XS'   U$ )a  
Main method to featurize and prepare for the model one or several sequence(s).

Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
spectrogram features.

Args:
    audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
        The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
        be mono channel audio, not stereo, i.e. single float per timestep.
    audio_target (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`, *optional*):
        The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
        list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
        spectrogram features.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    truncation (`bool`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
        to pass `sampling_rate` at the forward call to prevent silent errors.
Nz9You must provide either `audio` or `audio_target` values.z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.FTr   labelsr   decoder_attention_mask)
ValueErrorr   loggerwarningr?   __name___process_audioget)r=   ra   rb   rc   rd   re   rf   r"   rg   r   r>   inputsinputs_targetrk   s                 r@   __call__!SpeechT5FeatureExtractor.__call__   sb   ~ =\1XYY$ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 (("%
 
F F# //"%
 
M ~$$#0#@x )6):):;K)L&)57M34rB   speech	is_targetc	           	      
	   [        U[        R                  5      =(       a    [        UR                  5      S:  n
U
(       a'  [        UR                  5      S:  a  [        SU  35      eU
=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a5  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                  [        R                  " [        R                  5      L a  UR                  [        R                  5      nU(       d  U/nU R                  nU(       a?  U Vs/ s H  oR                  U5      PM     nn[        SU05      nU R                   U l        O[        SU05      nU R"                  " U4UUUUUS.U	D6nXl        US   n[        US   [        R                  5      (       d9  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snUS'   GO[        U[        R                  5      (       d  [        US   [        R                  5      (       ah  US   R                  [        R                  " [        R                  5      L a4  U Vs/ s H"  nUR                  [        R                  5      PM$     snUS'   Or[        U[        R                  5      (       aS  UR                  [        R                  " [        R                  5      L a"  UR                  [        R                  5      US'   UR%                  S5      nUb7  U Vs/ s H&  n[        R                  " U[        R&                  S9PM(     snUS'   U(       dV  U R(                  (       aE  U R+                  X4S	9[,        R.                  La  UOS nU R1                  US   UU R2                  S
9US'   Ub  UR5                  U5      nU$ s  snf s  snf s  snf s  snf s  snf )Nr%   r$   z2Only mono-channel audio is supported for input to r   )dtyper   )rc   rd   re   rf   r"   r   )rd   )r   r   )
isinstancerF   ndarraylenrN   rl   listtupleasarrayfloat32ry   float64astyper   r_   r   r   padrq   rH   r   _get_padding_strategiesr   
DO_NOT_PADrU   r   convert_to_tensors)r=   rv   rw   rc   rd   re   rf   r"   rg   r>   is_batched_numpy
is_batchedfeature_size_hackwaveformfeaturesencoded_inputspadded_inputsr   rG   r   s                       r@   rp   'SpeechT5FeatureExtractor._process_audio)  s    &fbjj9Sc&,,>ORS>SFLL 1A 5QRVQWXYY% 
ve}-d:fQi"**V[]aIb3c 	 IOPvbjjrzz:PFJvrzz$B$BZZbjj9F

++@T0T]]2::.F XF !-- MSTV228<VHT)>8*DEN $ 1 1D)>6*BCN
!!1"7
 
 . %^4,q/2::66^j,k^jUZRZZRZZ-P^j,kM.)<44<?BJJ77Q%%"**)==S_,`S_%U\\"**-ES_,`M.)bjj11l6H6HBHHUWU_U_L`6`,8,?,?

,KM.) '**+;<%^l.m^lUZrzz%rxx/P^l.mM*+ T.. ///OWfWqWqq  
 -1,H,Hn-n\`\n\n -I -M.) %)<<^LMC Q U* -l -a /ns   -Q,Q1-Q6)Q;-R c                 P   > [         TU ]  5       n/ SQnU H  nX1;   d  M
  X	 M     U$ )N)r8   r9   r4   r5   r6   r7   )r2   to_dict)r=   outputnamesr'   r?   s       r@   r    SpeechT5FeatureExtractor.to_dict  s2    " ^D~L  rB   )r   r   r   r   r   r   r9   r    r6   r7   r   r!   r"   r4   r5   r   r   r8   )r%   i>          FP      @   hann_windowr0   r   i  g|=r$   T)r   )	NNFNFNNNN)FFNFNNN)ro   
__module____qualname____firstlineno____doc__model_input_namesintfloatboolstrr3   staticmethodr}   rF   r{   rU   r_   r   r   r   r   r   rt   rp   dictr   r   __static_attributes____classcell__)r?   s   @r@   r   r      sN   *X ()9: """)$'  !&*:: : 	:
 : : : : : ": : : : :  $: :x  be#2::&#8<RZZ8H#Y^#	bjj	# #*jj 
* `dfj5:$( ,004;?'+sbjj$u+tBJJ7GdSXkIZZ[\s uRZZed2::>NPTUYZ_U`Pa%abcs tS/12	s
 SMs s %SMs  (~s !sJ!78s  }s 
sp  5:$( ,004;?Ubjj$u+tBJJ/?d5kARRSU U tS/12	U
 SMU U %SMU  (~U !sJ!78U 
Un	c3h 	 	rB   r   )r   r:   typingr   r   r   numpyrF   audio_utilsr   r   r	   r
   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r   r   
get_loggerro   rm   r   __all__r1   rB   r@   <module>r      sT    ,  ' '  \ \ I 4 9 9 
		H	%j7 jZ &
&rB   