
    cCi[3                         S SK JrJr  S SKrS SKrSSKJr  SSKJ	r	  SSK
JrJrJr  SSKJr  \" 5       (       a  S SKrSrS	r\R&                  " \5      r\" S
S9 " S S\5      5       rS/rg)    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)
TensorTypeis_librosa_availablelogging)requiresgh㈵>g      p>)torchlibrosa)backendsc                   0  ^  \ rS rSrSrSS/r       SU 4S jjrSS jr          SS\\	R                  \\   \\	R                     \\\      4   S\S	\\   S
\\\\4      S\\   S\\   S\\   S\\   S\\   S\\   S\\   S\4S jjrSrU =r$ )ParakeetFeatureExtractor%   aB  
Constructs a Parakeet feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
Fourier Transform` which should match pytorch's `torch.stft` equivalent.

Args:
    feature_size (`int`, *optional*, defaults to 80):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    hop_length (`int`, *optional*, defaults to 160):
        Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
    n_fft (`int`, *optional*, defaults to 512):
        Size of the Fourier transform.
    win_length (`int`, *optional*, defaults to 400):
        The window length for the STFT computation.
    preemphasis (`float`, *optional*, defaults to 0.97):
        A preemphasis filter coefficient. 0.0 means no preemphasis filter.
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
input_featuresattention_maskc           	        > [         T
U ]  " SXUS.UD6  X0l        X@l        XPl        X`l        [        R                  R                  X$USUS-  SS9n	[        R                  " U	5      R                  [        R                  5      U l        g )N)feature_sizesampling_ratepadding_value           slaney)srn_fftn_melsfminfmaxnorm )super__init__
hop_lengthr   
win_lengthpreemphasisr   filtersmelr   
from_numpytofloat32mel_filters)selfr   r   r$   r   r%   r&   r   kwargsr,   	__class__s             r/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/parakeet/feature_extraction_parakeet.pyr#   !ParakeetFeatureExtractor.__init__C   s     	wl_lwpvw$
$& oo)),S}_`O`go * 
 !++K8;;EMMJ    c           
      
   [         R                  " U R                  SUS9n[         R                  " UU R                  U R
                  U R                  USSS9n[         R                  " U5      n[         R                  " UR                  S5      R                  S5      5      nUR                  S5      nU R                  R                  U5      nXe-  n[         R                  " U[        -   5      nUR                  SSS	5      nU$ )
NF)periodicdeviceTconstant)r$   r%   windowreturn_complexpad_moder   r      )r   hann_windowr%   stftr   r$   view_as_realsqrtpowsumr,   r*   logLOG_ZERO_GUARD_VALUEpermute)r-   waveformr5   r7   r=   
magnitudesr,   mel_specs           r0   _torch_extract_fbank_features6ParakeetFeatureExtractor._torch_extract_fbank_featurese   s    ""4??U6RzzJJ
 ''-
ZZ
q 1 5 5b 9:
^^A&
 &&))&1+99X(<<= ##Aq!,r2   
raw_speech
truncationpad_to_multiple_ofreturn_tensorsreturn_attention_maskpadding
max_lengthr   do_normalizer5   return_token_timestampsreturnc                 
   UbP  XR                   :w  a@  [        SU R                  R                   SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R                   S35        [        U[        R                  5      (       a  [        R                  " U5      nOc[        U[        [        45      (       aH  [        US	   [        R                  5      (       a&  U Vs/ s H  n[        R                  " U5      PM     nn[        U[        R                  5      =(       a    [        UR                  5      S
:  nU(       aW  [        UR                  5      S:  a>  [        R                  SU R                  R                   S35        UR!                  S5      n[        U[        [        45      nU(       ab  U H\  n[        UR                  5      S
:  d  M  [        R                  SU R                  R                   S35        UR!                  S5      nM^     U(       d  U(       a6  U Vs/ s H(  oSS2S4   R#                  [        R$                  5      PM*     nnO'USS2S4   R#                  [        R$                  5      /nU Vs/ s H  n[        U5      PM     nn['        UUS.5      nU R)                  UUUUUSS9nUR*                  R-                  S5      nU R.                  b  [        R0                  " UR                  S
   UR2                  S9R5                  S	5      UR6                  R5                  S
5      :  n[        R8                  " USS2SS
24   USS2S
S24   U R.                  USS2SS24   -  -
  /S
S9nUR;                  U) S5      nU R=                  UU
5      n[        R>                  " UR6                  U R@                  S-  S-  -   U R@                  -
  U RB                  5      n[        R0                  " UR                  S
   U
S9SSS24   USS2S4   :  nUR5                  S5      nUU-  nURE                  S
S9UR5                  S5      -  nUR5                  S
5      nUU-
  S-  U-  RE                  S
S9US
-
  R5                  S5      -  n[        RF                  " U5      R5                  S
5      nUU-
  U[H        -   -  nUU-  n['        UUS.US9$ s  snf s  snf s  snf )a  
Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
the STFT computation if available, otherwise a slower NumPy based one.

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    truncation (`bool`, *optional*, default to `True`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*, defaults to None):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

        <Tip>

        For Parakeet models, `attention_mask` should always be passed for batched inference, to avoid subtle
        bugs.

        </Tip>

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
        pipeline.
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values / vectors.
    do_normalize (`bool`, *optional*, defaults to `False`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance of the model.
    device (`str`, *optional*, defaults to `'cpu'`):
        Specifies the device for computation of the log-mel spectrogram of audio signals in the
        `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
    return_token_timestamps (`bool`, *optional*, defaults to `None`):
        Deprecated. Use `return_attention_mask` instead from which the number of frames can be inferred.

        Whether or not to return the number of frames of the input raw_speech.
        These num_frames can be used by the model to compute word level timestamps.
Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r;   r   z2Only mono-channel audio is supported for input to z;. We will take the mean of the channels to convert to mono.r:   )r   audio_lengthspt)rO   rP   rK   rL   rM   )r5   )dimr   )r   r   )datatensor_type)%r   
ValueErrorr/   __name__loggerwarning
isinstancenpndarrayr   tensorlisttupleTensorlenshapemeanr*   r+   r   padr   squeezer&   aranger5   	unsqueezerV   catmasked_fillrH   floor_divider   r$   rA   r?   EPSILON)r-   rJ   rK   rL   rM   rN   rO   rP   r   rQ   r5   rR   r.   speechis_batched_torchis_batched_sequencerV   batched_speechpadded_inputsr   timemaskfeatures_lengthsr   maskinput_features_maskedrh   variancestds                               r0   __call__!ParakeetFeatureExtractor.__call__   s   H $ 2 22 I$..JaJaIb c))-););(< =))-););(<Im_TUW  3 NNVW[WeWeWnWnVo p\ \ j"**--j1J
T5M22z*Q-QSQ[Q[7\7\=GHZ6%,,v.ZJH%j%,,?]C
HXHXDY\]D]J$4$4 5 9NNDT^^E\E\D] ^L L $,J(dE]C$v||$q(NNLT^^MdMdLe fT T $[[_F % 2JTU*D/,,U]];*JUJ$QW-00?@J3=>:V:>%Vc&de!!1 ! 
 '55==bA '||N$8$8$;NDYDYZdd++55a89H #YY2A2&q!"u(=@P@PSabcehfhehbhSi@i(ijpqN ,77	3GN;;NFS --''$**/A*==

JDOO
 n&:&:1&=fMdTUgVYijkmqjqYrr ''+ . 5$((Q(/2B2L2LR2PP~~a *T1a7$>CCCJN^abNbMmMmnpMqqjj",,Q/(4/C'MB$"0"0 '
 	
A I* V ?s   8 S7/S<T)r$   r,   r   r&   r%   )P   i>     i   i  g
ףp=
?r   )cpu)
FNNNlongestNNNr   N)r\   
__module____qualname____firstlineno____doc__model_input_namesr#   rH   r   r`   ra   rc   floatboolr   intstrr   r   r|   __static_attributes____classcell__)r/   s   @r0   r   r   %   s=   4 *+;<  KD> !,0;?04!*$('+'+ %26[
"**d5k4

3CT$u+EVVW[
 [
 %SM	[

 !sJ!78[
  (~[
 #[
 SM[
  }[
 tn[
 [
 "*$[
 
[
 [
r2   r   )typingr   r   numpyr`   r   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   utils.import_utilsr   r   rp   rC   
get_loggerr\   r]   r   __all__r!   r2   r0   <module>r      s|    #   I 4 > > *    
		H	% 
'(v
7 v
 )v
r &
&r2   