
    cCiY-                         S r SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
JrJrJr  \R                  " \5      r " S S	\5      rS	/rg)
z&
Feature extractor class for Wav2Vec2
    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   |  ^  \ rS rSrSrSS/r     SU 4S jjr\ SS\\	R                     S\\	R                     S\S\\	R                     4S jj5       r       SS	\\	R                  \\   \\	R                     \\\      4   S
\\\\4   S\\   S\S\\   S\\   S\\\\4      S\\   S\4S jjrSrU =r$ )Wav2Vec2FeatureExtractor   a  
Constructs a Wav2Vec2 feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values.
    do_normalize (`bool`, *optional*, defaults to `True`):
        Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
        improve the performance for some models, *e.g.*,
        [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
    return_attention_mask (`bool`, *optional*, defaults to `False`):
        Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.

        <Tip>

        Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
        [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
        `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
        should be passed.

        For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
        [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
        passed for batched inference.

        </Tip>input_valuesattention_maskc                 D   > [         TU ]  " SXUS.UD6  X@l        XPl        g )N)feature_sizesampling_ratepadding_value )super__init__return_attention_maskdo_normalize)selfr   r   r   r   r   kwargs	__class__s          r/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.pyr   !Wav2Vec2FeatureExtractor.__init__C   s*     	wl_lwpvw%:"(    r   returnc                    Ub  [         R                  " U[         R                  5      n/ n[        XR	                  S5      5       Hl  u  pEXDSU R                  5       -
  [         R                  " USU R                  5       S-   5      -  nXVR                  S   :  a  X&US& UR                  U5        Mn     U$ U  Vs/ s H=  owUR                  5       -
  [         R                  " UR                  5       S-   5      -  PM?     nnU$ s  snf )zK
Every array in the list is normalized to have zero mean and unit variance
NgHz>r   )
nparrayint32zipsummeansqrtvarshapeappend)r   r   r   normed_input_valuesvectorlengthnormed_slicexs           r   zero_mean_unit_var_norm0Wav2Vec2FeatureExtractor.zero_mean_unit_var_normP   s     %XXnbhh?N"$"%l4F4Fr4J"K &)=)=)? ?2776RYSY?K^K^K`cgKgChh..q11,9)#**<8 #L #" Vb"bUaPQLBGGAEEGdN4K#KUa"b"" #cs   ;AD
raw_speechpadding
max_length
truncationpad_to_multiple_ofr   return_tensorsr   c	                    Ub<  XR                   :w  a,  [        SU  SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R
                   S35        [        U[        R                  5      =(       a    [        UR                  5      S	:  n
U
(       a'  [        UR                  5      S
:  a  [        SU  35      eU
=(       dE    [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      nU(       d  U/n[        SU05      nU R                  UUUUUUS9nUS   n[        US   [        R                  5      (       d9  U Vs/ s H&  n[        R                  " U[        R                   S9PM(     snUS'   GO[        U[        R                  5      (       d  [        US   [        R                  5      (       ag  US   R"                  [        R"                  " [        R$                  5      L a3  U Vs/ s H!  oR'                  [        R                   5      PM#     snUS'   Or[        U[        R                  5      (       aS  UR"                  [        R"                  " [        R$                  5      L a"  UR'                  [        R                   5      US'   UR)                  S5      nUb7  U Vs/ s H&  n[        R                  " U[        R*                  S9PM(     snUS'   U R,                  (       aE  U R/                  X#S9[0        R2                  La  UOSnU R5                  US   UU R6                  S9US'   Ub  UR9                  U5      nU$ s  snf s  snf s  snf )ap  
Main method to featurize and prepare for the model one or several sequence(s).

Args:
    raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
        stereo, i.e. single float per timestep.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    truncation (`bool`):
        Activates truncation to cut input sequences longer than *max_length* to *max_length*.
    pad_to_multiple_of (`int`, *optional*):
        If set will pad the sequence to a multiple of the provided value.

        This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
        `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
    return_attention_mask (`bool`, *optional*):
        Whether to return the attention mask. If left to the default, will return the attention mask according
        to the specific feature_extractor's default.

        [What are attention masks?](../glossary#attention-mask)

        <Tip>

        Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
        [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
        `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
        `attention_mask` should be passed.

        For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
        [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
        be passed for batched inference.

        </Tip>

    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors.
    padding_value (`float`, *optional*, defaults to 0.0):
Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.      z2Only mono-channel audio is supported for input to r   r   )r4   r5   r6   r7   r   )dtyper   )r5   )r   r   )r   
ValueErrorloggerwarningr   __name__
isinstancer"   ndarraylenr*   listtupler   padasarrayfloat32r=   float64astypegetr$   r   _get_padding_strategiesr   
DO_NOT_PADr1   r   convert_to_tensors)r   r3   r4   r5   r6   r7   r   r8   r   r   is_batched_numpy
is_batchedencoded_inputspadded_inputsr   r#   r   s                    r   __call__!Wav2Vec2FeatureExtractor.__call__f   s;   L $ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 &j"**=[#jFVFVBWZ[B[J$4$4 5 9QRVQWXYY% 
zD%=1lz*Q-RTR\R\^ceiQj7k 	
 $J &~z&BC!!1"7 ! 
 %^4,q/2::66^j,k^jUZRZZRZZ-P^j,kM.)<44<?BJJ77Q%%"**)==S_,`S_%\\"**-ES_,`M.)bjj11l6H6HBHHUWU_U_L`6`,8,?,?

,KM.) '**+;<%^l.m^lUZrzz%rxx/P^l.mM*+  ///OWfWqWqq  
 -1,H,Hn-n\`\n\n -I -M.) %)<<^LM; -l -a /ns   8-M>&(N-N)r   r   )r;   i>          FT)rV   )FNFNNNN)rA   
__module____qualname____firstlineno____doc__model_input_namesr   staticmethodrE   r"   rC   floatr1   r   boolstrr   r   intr	   r   rT   __static_attributes____classcell__)r   s   @r   r   r      sU   B ()9: #) ad#2::&#8<RZZ8H#Y^#	bjj	# #0 6;$( ,004;?'+J"**d5k4

3CT$u+EVVWJ tS/12J SM	J
 J %SMJ  (~J !sJ!78J  }J 
J Jr   r   )rZ   typingr   r   numpyr"   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   
get_loggerrA   r?   r   __all__r   r   r   <module>rj      sJ    #  I 4 9 9 
		H	%Q7 Qh &
&r   