
    cCi;                     ,   S SK r S SKJr  S SKJrJr  S SKrSSKJ	r	  SSK
Jr  SSKJrJrJr  \R                   " \5      r SS\S	\S
\S\S\S\S\\   S\R,                  4S jjrS\R,                  S\S\S\S\R,                  4
S jr " S S\	5      rS/rg)    N)Sequence)OptionalUnion   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingn_freqsf_minf_maxn_melssample_rate
fft_lengthnormreturnc                    Ub  US:w  a  [        S5      e[        R                  " U [        R                  S9XE-  -  nS[        R
                  " SUS-  -   5      -  nS[        R
                  " SUS-  -   5      -  n	[        R                  " XUS-   5      n
SS	U
S-  -  S-
  -  nUS
S USS -
  n[        R                  " US5      [        R                  " US
5      -
  n[        R                  " S
[        R                  S9nSUSS2SS24   -  USS -  nUSS2SS24   US
S -  n[        R                  " U[        R                  " UU5      5      nUb1  US:X  a+  SUSUS-    USU -
  -  nU[        R                  " US5      -  nU$ )aa  Create a frequency bin conversion matrix (NumPy version).

Args:
    n_freqs (int): Number of frequencies to highlight/apply
    f_min (float): Minimum frequency (Hz)
    f_max (float): Maximum frequency (Hz)
    n_mels (int): Number of mel filterbanks
    sample_rate (int): Sample rate of the audio waveform
    fft_length (int): FFT length
    norm (Optional[str]): If 'slaney', divide the triangular mel weights by
      the width of the mel band (area normalization). (Default: ``None``)

Returns:
    np.ndarray: Triangular filter banks (fb matrix) of size (``n_freqs``,
    ``n_mels``)
    meaning number of frequencies to highlight/apply to x the number of
    filterbanks.
    Each column is a filterbank so that assuming there is a matrix A of
    size (..., ``n_freqs``), the applied result would be
    ``A @ create_fb_matrix_numpy(A.shape[-1], ...)``.
Nslaneyz$norm must be one of None or 'slaney'dtypeg     F@      ?g     @   
      r   g      g       @)
ValueErrornparangefloat32mathlog10linspaceexpand_dimszerosmaximumminimum)r   r   r   r   r   r   r   	all_freqsm_minm_maxm_ptsf_ptsf_diffslopeszerodown_slopes	up_slopesfbenorms                      p/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/gemma3n/feature_extraction_gemma3n.pycreate_fb_matrixr6      s   > DH,?@@ 		'48PQI TZZuu} 566ETZZuu} 566EKKfqj1EREFN+c12E12Ys#F^^E1%y!(DDF88ARZZ(D&CRC.(F3BK7Kq!"uqr
*I	D"**[)<	=BDH,uQ!,uWf~=>
bnnUA&&I    array	dimensionsizestepc                    U R                   S:w  a  [        S5      eUS:w  a  XR                   S-
  :w  a  [        S5      eU R                  u  pEXR-
  U-  S-   nUS::  a"  [        R                  " USU4U R
                  S9$ XFU4nU R                  S   U R                  S   U-  U R                  S   4n[        R                  R                  R                  XUS9$ )	zNA basic NumPy equivalent of PyTorch's unfold for 2D arrays along the last dim.r   zFThis unfold implementation currently supports 2D arrays (batch, time).r   r   zFThis unfold implementation only supports unfolding the last dimension.r   r   )shapestrides)
ndimr   r=   r   r&   r   r>   libstride_tricks
as_strided)	r8   r9   r:   r;   
batch_sizeoriginal_length
num_framesoutput_shapeoutput_stridess	            r5   _unfoldrH   [   s    zzQabbB9

Q6abb"'++J!(T1A5JQxxQ-U[[AAD1LmmA&a(84(?qAQRN66**5n*]]r7   c            #         ^  \ rS rSrSrSS/r                S"S\S\S\S\S	\S
\S\S\S\S\S\S\S\S\S\	\
\      S\	\
\      4 U 4S jjjrS\R                  S\R                  S\\R                  \R                  4   4S jr      S#S\\R                  \\   \\R                     \\\      4   S\\\\4   S\	\   S\S\	\   S\	\\\4      S\	\   S\4S  jjrS!rU =r$ )$Gemma3nAudioFeatureExtractorn   a	  An audio feature extractor Universal Speech Models https://huggingface.co/papers/2303.01037.

Args:
    feature_size (`int`, *optional*, defaults to 128):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        Padding value used to pad the audio. Should correspond to silences.
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether to return the attention mask for the generated MEL spectrograms.
    frame_length_ms (`float`, *optional*, defaults to 32.0):
        The length of a frame in milliseconds.
    hop_length_ms (`float`, *optional*, defaults to 10.0):
        Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
    min_frequency (`float`, *optional*, defaults to 125.0):
        The minimum frequency (in Hz) for the Mel filterbank.
    max_frequency (`float`, *optional*, defaults to 7600.0):
        The maximum frequency (in Hz) for the Mel filterbank.
    preemphasis (`float`, *optional*, defaults to 0.97):
        The preemphasis coefficient.
    preemphasis_htk_flavor (`bool`, *optional*, defaults to `True`):
        Whether to use HTK-style preemphasis.
    fft_overdrive (`bool`, *optional*, defaults to `True`):
        Whether to use FFT overdrive.
    dither (`float`, *optional*, defaults to 0.0):
        Adds dithering. In other words, adds a small Gaussian noise to each frame.
        E.g. use 0.0001 to add dithering with a normal distribution centered
        around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
        The value 0.0 means no dithering.
        Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
        the high log_mel_fbank values for signals with hard-zero sections,
        when VAD cutoff is present in the signal.
    input_scale_factor (`float`, *optional*, defaults to 1.0):
        Scaling factor applied to the input waveform.
    mel_floor (`float`, *optional*, defaults to 1e-05):
        Minimum value for Mel spectrograms to avoid log(0).
    per_bin_mean (`Optional[Sequence[float]]`, *optional*):
        Mean values for per-bin normalization.
    per_bin_stddev (`Optional[Sequence[float]]`, *optional*):
        Standard deviation values for per-bin normalization.
input_featuresinput_features_maskfeature_sizesampling_ratepadding_valuereturn_attention_maskframe_length_mshop_length_msmin_frequencymax_frequencypreemphasispreemphasis_htk_flavorfft_overdriveditherinput_scale_factor	mel_floorper_bin_meanper_bin_stddevc           
      H  > [         TU ]  " SUUUUS.UD6  Xpl        Xl        Xl        Xl        Xl        Xl        Xl        [        [        X%-  S-  5      5      U l        [        [        X&-  S-  5      5      U l        [        R                  " U[        R                  S9U l        S["        R$                  " ["        R&                  " U R                  5      5      -  nU R                  (       a  US-  nUU l        [        R*                  " U R                  [        R,                  S9nSS[        R.                  " S[        R0                  -  U-  U R                  -  5      -
  -  nUR3                  [        R,                  5      U l        [7        U R(                  S-  S-   UUUU R8                  S US9U l        Ub-  [        R                  " U5      R=                  SSU5      U l        OS U l        Ub-  [        R                  " U5      R=                  SSU5      U l         g S U l         g )	N)rN   rO   rP   rQ   g     @@r   r   g      ?r   )r   r   r   r   r   r   r    )!super__init__rT   rU   rV   rW   rX   rY   rZ   introundframe_length
hop_lengthr   r8   float64r[   r"   ceillog2r   r    r!   cospiastypewindowr6   rO   mel_filtersreshaper\   r]   )selfrN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   kwargsr   hann_arangerl   	__class__s                        r5   ra   %Gemma3nAudioFeatureExtractor.__init__   s   ( 	 	
%''"7		

 	
 +*&&<#*"4m&E&N OPeM$AF$JKL)2::>$))DIId.?.?$@AA
!OJ$ii 1 1DBFF1ruu9{#:T=N=N#NOOPmmBJJ/+OOq(1,**!
 # " 6 > >q!\ RD $D%"$((>":"B"B1a"VD"&Dr7   waveformattention_maskr   c                    UR                   S:X  a  [        R                  " USS9nU R                  S:  aO  XR                  [        R                  R
                  " UR                  6 R                  UR                  5      -  -   nU R                  S:w  a  XR                  -  nU R                  S-   n[        USX0R                  S9nU R                  S:  a  U R                  (       aP  USS	S24   SU R                  -
  -  nUSSS24   U R                  USS	S
24   -  -
  n[        R                  " XV/SS9nO*USSS	24   U R                  USS	S24   -  -
  nO	USS	S24   nXpR                   -  n[        R"                  R%                  XpR&                  SS9n[        R(                  " U5      n	[        R*                  " XR,                  5      n
[        R.                  " [        R0                  " XR2                  5      5      nU R4                  b  XR4                  -
  nU R6                  b  XR6                  -  nUR9                  S5      nUS	S	U R                  2   R                  [:        5      nXS	UR                  S    4$ ) r   r   )axis        r   r   )r9   r:   r;   .Nr   )nrx   )r?   r   r%   rY   randomrandnr=   rk   r   rZ   rd   rH   re   rV   rW   concatenaterl   fftrfftr   absmatmulrm   logr'   r[   r\   r]   squeezebool)ro   rt   ru   frame_size_for_unfoldframes_to_processfirst_in_framerest_in_frameframesstftmagnitude_specmel_speclog_mel_specmel_spectrogrammasks                 r5   _extract_spectrogram1Gemma3nAudioFeatureExtractor._extract_spectrogram   sJ   ==A~~hQ7H;;++		0P0W0WX`XfXf0g"ggH""c)"9"99H $ 1 1A 5 $HAV]l]lmc!**!237!;sTEUEU?U!V 1#qt) <t?O?ORcdgiljlildlRm?m m(GbQ*373d6F6FIZ[^`cac`c[cId6dd&sCRCx0F++%vv{{6__2{>99^-=-=>vvbjj>>BC('*;*;;L*'*=*==L&..q100188>%?'<'<Q'? @@@r7   
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_tensorsc           	         [        U[        R                  5      =(       a    [        UR                  5      S:  n	[        U[
        5      =(       a#    [        US   [        R                  [
        45      n
U	=(       d    U
nU(       a2  U Vs/ s H$  n[        R                  " U/5      R                  PM&     nnO<U(       d5  [        U[        R                  5      (       d  [        R                  " U5      nU(       d  [        R                  " U/5      /nU R                  [        SU05      UUUUUS9n/ n/ n[        UR                  UR                  5       Hd  u  nnU R                  UR                  U5      u  nnUR                  UR                  [        R                   5      5        UR                  U5        Mf     [        XS.US9$ s  snf )a|  Creates a batch of MEL spectrograms from the provided raw speech.

This implementation uses a different algorithm for windowing and preemphasis compared to the built-in
`transformers.audio_utils.spectrogram()` function that _will_ result in different outputs. Consider this
carefully when selecting an audio feature extractor, especially with pre-trained models.

Args:
    raw_speech:
        The audio for which MEL spectrograms are created.
    padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `"longest"`):
        The padding strategy to use for batches of audio with different lengths.
    max_length (`int`, *optional*, defaults to 480000):
        If provided, defines the maximum length of the audio to allow. Audio longer than this will be
        truncated if `truncation=True`.
    truncation (`bool`, *optional*, defaults to `True`):
        Whether or not to truncate audio above `max_length`.
    pad_to_multiple_of (`int`, *optional*, defaults to 128):
        When padding, pad to a multiple of this value. The default value is defined for optimal TPU support.
    return_tensors (`Union[str, TensorType]`, *optional*, defaults to `None`):
        The type of tensors to return (e.g., NumPy, Torch, JAX, TensorFlow).
    return_attention_mask (`bool`, *optional*, defaults to `True`):
        Whether to return the attention mask for the generated MEL spectrograms.
r   r   rL   )r   r   r   r   rQ   )rL   rM   )tensor_type)
isinstancer   ndarraylenr=   r   asarrayTpadr   ziprL   ru   r   appendrk   r!   )ro   r   r   r   r   r   r   rQ   rp   is_batched_numpyis_batched_sequence
is_batchedrsbatched_speechprepared_speechprepared_speech_maskspeechr   s                     r5   __call__%Gemma3nAudioFeatureExtractor.__call__  s   F &j"**=[#jFVFVBWZ[B[(X>t:jYZm^`^h^hjr]sCt%<)<
7ABz"**bT*,,zJBJJz2::$F$FJ/J**j\23J*J78!!1"7 " 
 ! = =~?\?\]LFD44VXXtDLFD""6==#<= ''- ^
 .\&
 	
/ Cs   
+G
)rY   r   rX   rd   re   rZ   rU   rm   r[   rT   r\   r]   rV   rW   rl   )   i>  ry   Tg      @@g      $@g     @_@g     @g
ףp=
?TTry   r   gh㈵>NN)longesti S Tr   NT)__name__
__module____qualname____firstlineno____doc__model_input_namesrb   floatr   r   r   ra   r   r   tupler   r   liststrr	   r
   r   r   __static_attributes____classcell__)rr   s   @r5   rJ   rJ   n   s   )V *+@A  #"&*!%#$%!'+"$'2648#B'B' B' 	B'
  $B' B' B' B' B' B' !%B' B' B' "B' B'  x/!B'" !%1#B' B'H+ARZZ +A +AX]^`^h^hjljtjt^tXu +A` 6?$+,/;?04B
"**d5k4

3CT$u+EVVWB
 tS/12B
 SM	B

 B
 %SMB
 !sJ!78B
  (~B
 
B
 B
r7   rJ   )N)r"   collections.abcr   typingr   r   numpyr   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr	   r
   r   
get_loggerr   loggerrb   r   r   r   r6   rH   rJ   __all__r_   r7   r5   <module>r      s      $ "  I 4 9 9 
		H	% ::: : 	:
 : : 3-: ZZ:z^2:: ^# ^S ^ ^

 ^&a
#; a
H *
*r7   