
    cCi5!                         S r SSKJrJr  SSKrSSKJr  SSKJ	r	  SSK
JrJrJr  \R                  " \5      r " S S	\5      rS	/rg)
zFeature extractor class for Dia    )OptionalUnionN   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                   
  ^  \ rS rSrSrSS/r    SS\S\S\S\4U 4S	 jjjr     SS
\	\
R                  \\   \\
R                     \\\      4   S\\	\\\4      S\\   S\\   S\\	\\4      S\\   S\4S jjrSrU =r$ )DiaFeatureExtractor   a
  
Constructs an Dia feature extractor.

This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
most of the main methods. Users should refer to this superclass for more information regarding those methods.

Args:
    feature_size (`int`, *optional*, defaults to 1):
        The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
    padding_value (`float`, *optional*, defaults to 0.0):
        The value that is used for padding.
    hop_length (`int`, *optional*, defaults to 512):
        Overlap length between successive windows.
input_valuesn_quantizersfeature_sizesampling_ratepadding_value
hop_lengthc                 8   > [         TU ]  " SXUS.UD6  X@l        g )N)r   r   r    )super__init__r   )selfr   r   r   r   kwargs	__class__s         h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/dia/feature_extraction_dia.pyr   DiaFeatureExtractor.__init__1   s#     	wl_lwpvw$    	raw_audiopadding
truncation
max_lengthreturn_tensorsreturnc                    Ub<  X`R                   :w  a,  [        SU  SU R                    SU R                    SU S3	5      eO-[        R                  SU R                  R
                   S35        U(       a  U(       a  [        S5      eUc  S	n[        [        U[        [        45      =(       a(    [        US
   [        R                  [        [        45      5      nU(       a>  U Vs/ s H0  n[        R                  " U[        R                  S9R                  PM2     nnOU(       dC  [        U[        R                  5      (       d$  [        R                  " U[        R                  S9nOo[        U[        R                  5      (       aP  UR                  [        R                  " [        R                   5      L a  UR#                  [        R                  5      nU(       d!  [        R                  " U5      R                  /n[%        U5       HB  u  pU R&                  S:X  d  M  U
R(                  S:X  d  M)  [        R*                  " U
S5      X'   MD     [%        U5       H  u  pU
R(                  S:  a  [        SU
R,                   35      eU R&                  S:X  a,  U
R(                  S:w  a  [        SU
R,                  S    S35      eU R&                  S:X  d  M{  U
R(                  S:w  d  M  [        SU
R,                  S    S35      e   [/        SU05      nU R&                  nSU l        U R1                  UUUUS	U R2                  S9nUR5                  S5      US'   / nUR5                  S5       H3  n
U R&                  S:X  a  U
S   n
UR7                  U
R                  5        M5     XS'   Ub  UR9                  U5      nXl        U$ s  snf )a  
Main method to featurize and prepare for the model one or several sequence(s).

Args:
    raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
        The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
        values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
        `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
        (`feature_size = 2`).
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
        Select a strategy to pad the returned sequences (according to the model's padding side and padding
        index) among:

        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
          sequence if provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
          acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
          lengths).
    truncation (`bool`, *optional*, defaults to `False`):
        Activates truncation to cut input sequences longer than `max_length` to `max_length`.
    max_length (`int`, *optional*):
        Maximum length of the returned list and optionally padding length (see above).
    return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
        If set, will return tensors instead of list of python integers. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    sampling_rate (`int`, *optional*):
        The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
        `sampling_rate` at the forward call to prevent silent errors.
z3The model corresponding to this feature extractor: z& was trained using a sampling rate of zB. Please make sure that the provided audio input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.zABoth padding and truncation were set. Make sure you only set one.Tr   )dtype   z6Expected input shape (channels, length) but got shape    z$Expected mono audio but example has z	 channelsz&Expected stereo audio but example has r   )r!   r    r   return_attention_maskpad_to_multiple_ofattention_maskpadding_mask).N)r   
ValueErrorloggerwarningr   __name__bool
isinstancelisttuplenpndarrayasarrayfloat32Tr&   float64astype	enumerater   ndimmeanshaper   padr   popappendconvert_to_tensors)r   r   r   r    r!   r"   r   
is_batchedaudioidxexampler   original_feature_sizepadded_inputss                 r   __call__DiaFeatureExtractor.__call__<   se   T $ 2 22 I$ P**+ ,**+9]O1F  3 NNVW[WeWeWnWnVo p\ \
 z`aa_Gy4-0jj1PRPZPZ\acgOh6i

 LUVI5E<>>IIVIJy"**$E$E

9BJJ?I	2::..9??bhhrzzFZ3Z!((4I I.001I &i0LC  A%',,!*;!#"!5	 1
 &i0LC||a #YZaZgZgYh!ijj  A%',,!*; #GVXHYGZZc!dee  A%',,!*; #I'--XZJ[I\\e!fgg 1 $^Y$?@ !% 1 1 !!"&# ! 
 )6(9(9:J(Kn%$((8G  A%!),		* 9
 )5n%%)<<^LM 2m Ws   $7O)r   r   )r)   i>  g        i   )NFNNN)r1   
__module____qualname____firstlineno____doc__model_input_namesintfloatr   r   r6   r7   r4   r   r2   strr   r	   r   rK   __static_attributes____classcell__)r   s   @r   r   r      s   " (8 ""	%	% 	% 		%
 	% 	% @D%*$(;?'+xT%[$rzz2BDeDUUVx %c? :;<x TN	x
 SMx !sJ!78x  }x 
x xr   r   )rP   typingr   r   numpyr6   !feature_extraction_sequence_utilsr   feature_extraction_utilsr   utilsr   r	   r
   
get_loggerr1   r/   r   __all__r   r   r   <module>r^      sH    & "  I 4 9 9 
		H	%W2 Wt !
!r   