
    cCi\A                         S r SSKJrJr  SSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJrJr  \" 5       (       a  SSKr\R(                  " \5      r " S
 S\5      rS/rg)z%
Feature extractor class for Whisper
    )OptionalUnionN   )is_torch_available)mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                     ^  \ rS rSrSrS/r        SU 4S jjrS\R                  S\	S\R                  4S jr
SS	\R                  S\	S\R                  4S
 jjr\ SS\\R                     S\\R                     S\S\\R                     4S jj5       r          SS\\R                  \\   \\R                     \\\      4   S\S\\   S\\\	\4      S\\   S\\	   S\\   S\\   S\\   S\\	   S\\   S\4S jjrSrU =r$ )WhisperFeatureExtractor$   a/  
Constructs a Whisper feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
Fourier Transform` which should match pytorch's `torch.stft` equivalent.

Args:
    feature_size (`int`, *optional*, defaults to 80):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    hop_length (`int`, *optional*, defaults to 160):
        Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
    chunk_length (`int`, *optional*, defaults to 30):
        The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
        sequences.
    n_fft (`int`, *optional*, defaults to 400):
        Size of the Fourier transform.
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
    dither (`float`, *optional*, defaults to 0.0):
        Adds dithering. In other words, adds a small Gaussian noise to each frame.
        E.g. use 0.0001 to add dithering with a normal distribution centered
        around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
        The value 0.0 means no dithering.
        Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
        the high log_mel_fbank values for signals with hard-zero sections,
        when VAD cutoff is present in the signal.
input_featuresc	           
         > [         T
U ]  " SUUUUS.U	D6  XPl        X0l        X@l        XB-  U l        U R
                  U-  U l        X l        Xpl        [        SUS-  -   USSUSSS9U l
        g )	N)feature_sizesampling_ratepadding_valuereturn_attention_mask              g     @@slaney)num_frequency_binsnum_mel_filtersmin_frequencymax_frequencyr   norm	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesnb_max_framesr   ditherr   mel_filters)selfr   r   r%   r&   r$   r   r)   r   kwargs	__class__s             p/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/whisper/feature_extraction_whisper.pyr#    WhisperFeatureExtractor.__init__H   s     	 	
%''"7		

 	
 
$(%5!^^z9** 5A:~( '
    waveform_batchdevicereturnc                    US:w  a  [        SU S35      e/ nU H  n[        U[        U R                  S5      U R                  U R                  SU R
                  U R                  SS9nUSS2SS	24   n[        R                  " XUR                  5       S
-
  5      nUS-   S-  nUR                  U5        M     [        R                  " U5      nU$ )z
Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
implementation with 1e-5 tolerance.
cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/hanng       @log10)frame_lengthr%   powerr)   r*   log_melN       @      @)
ValueErrorr   r	   r$   r%   r)   r*   npmaximummaxappendarray)r+   r1   r2   log_spec_batchwaveformlog_specs         r.   _np_extract_fbank_features2WhisperFeatureExtractor._np_extract_fbank_featuresl   s    
 U?vh 'q q 
 &H"

F3!ZZ??{{ ,,	H  3B3'Hzz(LLNS,@AH 3#-H!!(+ ' .1r0   rE   c                 "   [         R                  " U5      R                  U[         R                  5      n[         R                  " U R
                  US9nU R                  S:w  aC  XR                  [         R                  " UR                  UR                  UR                  S9-  -  n[         R                  " XR
                  U R                  USS9nUSSS24   R                  5       S	-  n[         R                  " U R                  5      R                  U[         R                  5      nUR                  U-  n[         R                   " US
S9R#                  5       nUR%                  5       S	:X  a>  UR'                  S	SS9S   R'                  SSS9S   n	[         R(                  " XS-
  5      nO'[         R(                  " XR'                  5       S-
  5      nUS-   S-  nUS:w  a  UR+                  5       R-                  5       nUR/                  5       $ )z
Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
yielding results similar to cpu computing with 1e-5 tolerance.
)r2   r   )dtyper2   T)windowreturn_complex.Nr;   r   g|=)min)dimkeepdimr   r   r<   r=   r5   )torch
from_numpytofloat32hann_windowr$   r)   randnshaperJ   r2   stftr%   absr*   Tclampr7   rN   rA   r@   detachr5   numpy)
r+   rE   r2   rK   rW   
magnitudesr*   mel_specrF   max_vals
             r.   _torch_extract_fbank_features5WhisperFeatureExtractor._torch_extract_fbank_features   s   
 ##H-00G""4::f=
 ;;#ekk(.._g_n_n&oooHzz(JJ_cd#ss(^'')Q.
&&t'7'78;;FEMMR==:-;;xU399;<<>Qllq$l7:>>1d>STUVG}}X}=H}}X||~/CDHsNc)U?(,,.H~~r0   input_valuesattention_maskr   c                    Ub  [         R                  " U[         R                  5      n/ n[        XR	                  S5      5       Hl  u  pEXDSU R                  5       -
  [         R                  " USU R                  5       S-   5      -  nXVR                  S   :  a  X&US& UR                  U5        Mn     U$ U  Vs/ s H=  owUR                  5       -
  [         R                  " UR                  5       S-   5      -  PM?     nnU$ s  snf )zK
Every array in the list is normalized to have zero mean and unit variance
Nr;   gHz>r   )
r?   rC   int32zipsummeansqrtvarrV   rB   )rb   rc   r   normed_input_valuesvectorlengthnormed_slicexs           r.   zero_mean_unit_var_norm/WhisperFeatureExtractor.zero_mean_unit_var_norm   s     %XXnbhh?N"$"%l4F4Fr4J"K &)=)=)? ?2776RYSY?K^K^K`cgKgChh..q11,9)#**<8 #L #" Vb"bUaPQLBGGAEEGdN4K#KUa"b"" #cs   ;AD
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizereturn_token_timestampsc                    UbP  XR                   :w  a@  [        SU R                  R                   SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R                   S35        [        U[        R                  5      =(       a    [        UR                  5      S	:  nU(       a'  [        UR                  5      S
:  a  [        SU  35      eU=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       a?  U Vs/ s H1  n[        R                  " U/[        R                  S9R                  PM3     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                   [        R                   " [        R"                  5      L a  UR%                  [        R                  5      nU(       d"  [        R                  " U/5      R                  /n['        SU05      nU R)                  UUU(       a  UOU R*                  UUU=(       d    U	S9nU	(       a?  U R-                  US   US   U R.                  S9US'   [        R0                  " US   SS9US'   UR3                  S5      R5                  S
SS	5      n[7        5       (       a  U R8                  OU R:                  nU" US   U
5      n[        US   [        5      (       a8  U Vs/ s H&  n[        R                  " U[        R                  S9PM(     snUS'   OUUS'   U(       aL  US   SS2SSU R<                  24   nUS   R                  S	   U R<                  -  S:w  a  USS2SS24   nUUS'   UbY  [        R?                  SU R                  R                   S35        U Vs/ s H  n[        U5      U R<                  -  PM     snUS'   Ub  URA                  U5      nU$ s  snf s  snf s  snf )a  
Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
the STFT computation if available, otherwise a slower NumPy based one.

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    truncation (`bool`, *optional*, default to `True`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*, defaults to None):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

        <Tip>

        For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
        bugs.

        </Tip>

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
        pipeline.
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values / vectors.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance of the model.
    device (`str`, *optional*, defaults to `'cpu'`):
        Specifies the device for computation of the log-mel spectrogram of audio signals in the
        `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
    return_token_timestamps (`bool`, *optional*, defaults to `None`):
        Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

        Whether or not to return the number of frames of the input raw_speech.
        These num_frames can be used by the model to compute word level timestamps.
Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   )rJ   r   )rw   rr   rt   ru   r   rc   )rc   r   )axisr;   z,`return_token_timestamps` is deprecated for z~ and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
num_frames)!r   r>   r-   __name__loggerwarning
isinstancer?   ndarraylenrV   listtupleasarrayrS   rY   rJ   float64astyper   padr'   rp   r   stackget	transposer   r`   rG   r%   warning_onceconvert_to_tensors)r+   rs   rt   ru   rv   r   rw   rr   r   rx   r2   ry   r,   is_batched_numpy
is_batchedspeechbatched_speechpadded_inputsr   extract_fbank_featuresfeaturerescaled_attention_maskraw_speech_is                          r.   __call__ WhisperFeatureExtractor.__call__   s   H $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  3 NNVW[WeWeWnWnVo p\ \
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	 Q[\Q[v"**fXRZZ@BBQ[J\JJz2::$F$FJbjjAJ
BJJ//J4D4DQSQ[Q[H\4\#**2::6J **j\2445J%'7&DE %/zT^^!1"7"G< ! 
 .2.J.J./,-=>"00 /K /M*+
 /1hh}EU7V]^._M*+ '**+;<FFq!QO 3E2F2FD..DLkLk 	 0q0A6JnQ'..dr.sdrY`rzz'/Tdr.sM*+ /=M*+ &34D&EaI[DOOI[F[&\#
 -.44Q7$//IQN*A!SbS&*I'.EM*+".>t~~?V?V>W  XV  W eo*odnT`3|+<+Odn*oM,'%)<<^LME ]R /t( +ps   8Q-Q"Q)r&   r)   r%   r*   r$   r'   r(   r   )P   i>        i  r   r   F)r5   )r   )
TNNNrr   NNNr5   N)r~   
__module____qualname____firstlineno____doc__model_input_namesr#   r?   r   strrG   r`   staticmethodr   floatrp   r   boolr   intr   r   r   __static_attributes____classcell__)r-   s   @r.   r   r   $   s   B ** #"
H S UWU_U_ < bjj  #  Z\ZdZd  >  be#2::&#8<RZZ8H#Y^#	bjj	# #0  ,0;?04!-$('+'+ %26["**d5k4

3CT$u+EVVW[ [ %SM	[
 !sJ!78[  (~[ #[ SM[  }[ tn[ [ "*$[ 
[ [r0   r   )r   typingr   r   r\   r?    r   audio_utilsr   r   r	   !feature_extraction_sequence_utilsr
   feature_extraction_utilsr   utilsr   r   rP   
get_loggerr~   r   r   __all__r!   r0   r.   <module>r      s\    #  " H H I 4 ( 			H	%w6 wt	 %
%r0   