
    cCi&                         S r SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
JrJrJr  \R                  " \5      r " S S	\5      rS	/rg)
z$Feature extractor class for EnCodec.    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   X  ^  \ rS rSrSrSS/r     SS\S\S\S\\   S	\\   4
U 4S
 jjjr	\
S\\   4S j5       r\
S\\   4S j5       r     SS\\R                  \\   \\R                     \\\      4   S\\\\\4      S\\   S\\   S\\\\4      S\\   S\4S jjrSrU =r$ )EncodecFeatureExtractor   a  
Constructs an EnCodec feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Instantiating a feature extractor with the defaults will yield a similar configuration to that of the
[facebook/encodec_24khz](https://huggingface.co/facebook/encodec_24khz) architecture.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
    sampling_rate (`int`, *optional*, defaults to 24000):
        The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used to fill the padding values.
    chunk_length_s (`float`, *optional*):
        If defined the audio is pre-processed into chunks of lengths `chunk_length_s` and then encoded.
    overlap (`float`, *optional*):
        Defines the overlap between each chunk. It is used to compute the `chunk_stride` using the following
        formulae : `int((1.0 - self.overlap) * self.chunk_length)`.
input_valuespadding_maskfeature_sizesampling_ratepadding_valuechunk_length_soverlapc                 D   > [         TU ]  " SXUS.UD6  X@l        XPl        g )N)r   r   r    )super__init__r   r   )selfr   r   r   r   r   kwargs	__class__s          p/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/encodec/feature_extraction_encodec.pyr    EncodecFeatureExtractor.__init__7   s)     	wl_lwpvw,    returnc                 b    U R                   c  g [        U R                   U R                  -  5      $ )N)r   intr   r   s    r   chunk_length$EncodecFeatureExtractor.chunk_lengthE   s-    &t**T-?-??@@r   c                     U R                   b  U R                  c  g [        S[        SU R                  -
  U R                  -  5      5      $ )N   g      ?)r   r   maxr!   r#   r"   s    r   chunk_stride$EncodecFeatureExtractor.chunk_strideM   s@    &$,,*>q#sT\\1T5F5FFGHHr   	raw_audiopadding
truncation
max_lengthreturn_tensorsc                    Ub<  X`R                   :w  a,  [        SU  SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R
                   S35        U(       a  U(       a  [        S	5      eUc  S
n[        [        U[        [        45      =(       a(    [        US   [        R                  [        [        45      5      nU(       a>  U Vs/ s H0  n[        R                  " U[        R                  S9R                  PM2     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                  [        R                  " [        R                   5      L a  UR#                  [        R                  5      nU(       d!  [        R                  " U5      R                  /n[%        U5       H  u  pU
R&                  S:  a  [        SU
R(                   35      eU R*                  S:X  a,  U
R&                  S:w  a  [        SU
R(                  S    S35      eU R*                  S:X  d  M{  U
R(                  S   S:w  d  M  [        SU
R(                  S    S35      e   Sn[-        SU05      nU R.                  b  U R0                  b  Uc  U(       a]  [3        S U 5       5      n[5        [        R6                  " X@R.                  -  5      5      nUS-
  U R.                  -  U R0                  -   nOhU(       a_  [9        S U 5       5      n[5        [        R:                  " X@R.                  -  5      5      nUS-
  U R.                  -  U R0                  -   nSnOUnUc.  U R=                  UUUUUS9nU(       a  UR?                  S5      US'   / nUR?                  S5       H3  n
U R*                  S:X  a  U
S   n
URA                  U
R                  5        M5     XS'   Ub  URC                  U5      nU$ s  snf )a  
Main method to featurize and prepare for the model one or several sequence(s).

Args:
    raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
        `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
        (`feature_size = 2`).
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    truncation (`bool`, *optional*, defaults to `False`):
        Activates truncation to cut input sequences longer than `max_length` to `max_length`.
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors.
Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.zABoth padding and truncation were set. Make sure you only set one.Tr   )dtype   z6Expected input shape (channels, length) but got shape r&   z$Expected mono audio but example has z	 channelsz&Expected stereo audio but example has r   c              3   >   #    U  H  oR                   S    v   M     g7fr   Nshape.0arrays     r   	<genexpr>3EncodecFeatureExtractor.__call__.<locals>.<genexpr>         GYEQY   c              3   >   #    U  H  oR                   S    v   M     g7fr5   r6   r8   s     r   r;   r<      r=   r>   r-   )r-   r,   r+   return_attention_maskattention_maskr   ).N)"r   
ValueErrorloggerwarningr   __name__bool
isinstancelisttuplenpndarrayasarrayfloat32Tr1   float64astype	enumeratendimr7   r   r   r(   r#   minr!   floorr'   ceilpadpopappendconvert_to_tensors)r   r*   r+   r,   r-   r.   r   
is_batchedaudioidxexamplepadded_inputsr   nb_steps                 r   __call__ EncodecFeatureExtractor.__call__T   s   T $ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 z`aa_Gy4-0jj1PRPZPZ\acgOh6i

 LUVI5E<>>IIVIJy"**$E$E

9BJJ?I	2::..9??bhhrzzFZ3Z!((4I I.001I &i0LC||a #YZaZgZgYh!ijj  A%',,!*; #GVXHYGZZc!dee  A%'--*;q*@ #I'--XZJ[I\\e!fgg 1 #^Y$?@(T->->-JzOa  GY GG
bhhz4E4E'EFG%kT->->>ARARR
  GY GG
bggj3D3D&DEF%kT->->>ARARR
& ,   HH%%&- % M 0=0A0ABR0Sn-$((8G  A%!),		* 9
 )5n%%)<<^LMq Ws   $7Q)r   r   )r&   i]  g        NN)NFNNN)rE   
__module____qualname____firstlineno____doc__model_input_namesr!   floatr   r   propertyr#   r(   r   rJ   rK   rH   rF   strr   r	   r   r`   __static_attributes____classcell__)r   s   @r   r   r      si   . (8 ""*.#'  	
 ! %  Ahsm A A Ihsm I I @D%*$(;?'+zT%[$rzz2BDeDUUVz %c? :;<z TN	z
 SMz !sJ!78z  }z 
z zr   r   )re   typingr   r   numpyrJ   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   
get_loggerrE   rC   r   __all__r   r   r   <module>rs      sH    + "  I 4 9 9 
		H	%q6 qh %
%r   