
    hG6                         S SK r S SKJrJr  S SKJr  S SKJr  S SKJ	r	J
r
JrJrJr  S SKrS SKrSSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  \	(       a  S SKJr  SSK J!r!  \ " S S5      5       r"SSS\#4S jr$g)    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)no_op_if_value_is_nullstring_to_dictAudioDecoder   )FeatureTypec                   *   \ rS rSr% SrSr\\   \S'   Sr	\
\S'   Sr\\   \S'   \" SSS	9r\\   \S
'   Sr\\   \S'   \R$                  " \R&                  " 5       \R(                  " 5       S.5      r\\   \S'   \" S SSS9r\\S'   S rS\\\\\S4   S\4S jr SS\S\\\\\\
S4   4      SS4S jjrS\S\\S4   4   4S jrS\\R@                  \RB                  4   S\RB                  4S jr"SS\RB                  S\RB                  4S jjr#Sr$g)Audio   a  Audio [`Feature`] to extract audio data from an audio file.

Input: The Audio feature accepts as input:
- A `str`: Absolute path to the audio file (i.e. random access is allowed).
- A `pathlib.Path`: path to the audio file (i.e. random access is allowed).
- A `dict` with the keys:

    - `path`: String with relative path of the audio file to the archive file.
    - `bytes`: Bytes content of the audio file.

  This is useful for parquet or webdataset files which embed audio files.

- A `dict` with the keys:

    - `array`: Array containing the audio sample
    - `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.

- A `torchcodec.decoders.AudioDecoder`: torchcodec audio decoder object.

Output: The Audio features output data as `torchcodec.decoders.AudioDecoder` objects, with additional keys:

- `array`: Array containing the audio sample
- `sampling_rate`: Integer corresponding to the sampling rate of the audio sample.

Args:
    sampling_rate (`int`, *optional*):
        Target sampling rate. If `None`, the native sampling rate is used.
    mono (`bool`, defaults to `True`):
        Whether to convert the audio signal to mono by averaging samples across
        channels.
    decode (`bool`, defaults to `True`):
        Whether to decode the audio data. If `False`,
        returns the underlying dictionary in the format `{"path": audio_path, "bytes": audio_bytes}`.
    stream_index (`int`, *optional*):
        The streaming index to use from the file. If `None` defaults to the "best" index.

Example:

```py
>>> from datasets import load_dataset, Audio
>>> ds = load_dataset("PolyAI/minds14", name="en-US", split="train")
>>> ds = ds.cast_column("audio", Audio(sampling_rate=44100))
>>> ds[0]["audio"]
<datasets.features._torchcodec.AudioDecoder object at 0x11642b6a0>
>>> audio = ds[0]["audio"]
>>> audio.get_samples_played_in_range(0, 10)
AudioSamples:
    data (shape): torch.Size([2, 110592])
    pts_seconds: 0.0
    duration_seconds: 2.507755102040816
    sample_rate: 44100
```
Nsampling_rateTdecodestream_indexF)defaultrepriddictdtypebytespathpa_type)r   initr   _typec                     U R                   $ N)r&   )selfs    Q/home/james-whalen/.local/lib/python3.13/site-packages/datasets/features/audio.py__call__Audio.__call__X   s    ||    valuer   returnc                     SSK nSSKJn  Uc  [	        S5      e[
        R                  (       a  SSKJn  OSn[        U[        5      (       a  SUS.$ [        U[        5      (       a  S[        UR                  5       5      S.$ [        U[        [        45      (       a  USS.$ Ub  [        X5      (       a  [        U5      $ SU;   ad  [!        5       nU" UR#                  US   R%                  [&        R(                  5      5      US	   S
9R+                  USS9  UR-                  5       SS.$ UR/                  S5      GbL  [0        R2                  R5                  US   5      (       Ga$  US   R7                  S5      (       a  UR/                  S	5      c  [9        S5      eUR/                  S5      (       aG  [&        R:                  " US   [&        R<                  S9R%                  [&        R(                  5      S-  nO9[&        R>                  " US   SSS9R%                  [&        R(                  5      S-  n[!        5       nU" UR#                  U5      US	   S
9R+                  USS9  UR-                  5       SS.$ SUR/                  S5      S.$ UR/                  S5      c  UR/                  S5      b#  UR/                  S5      UR/                  S5      S.$ [	        SU S35      e! [         a  n[        S5      UeSnAff = f)zEncode example into a format for Arrow.

Args:
    value (`str`, `bytes`,`bytearray`,`dict`, `AudioDecoder`):
        Data passed as input to Audio feature.

Returns:
    `dict`
r   NAudioEncoder<To support encoding audio data, please install 'torchcodec'.zvalue must be providedr   r#   arrayr   sample_ratewavformatr%   pcmzBTo use PCM files, please specify a 'sampling_rate' in Audio objectr$   )r"   i  hr)r"   modezUAn audio sample should have one of 'path' or 'bytes' but they are missing or None in .) torchtorchcodec.encodersr4   ImportError
ValueErrorr   TORCHCODEC_AVAILABLEtorchcodec.decodersr   
isinstancestrr   absoluter$   	bytearrayencode_torchcodec_audior   
from_numpyastypenpfloat32to_file_likegetvaluegetosr%   isfileendswithKeyError
frombufferint16memmap)r+   r0   rA   r4   errr   bufferbytes_values           r,   encode_exampleAudio.encode_example[   s   	g8 =566&&8  LeS!!!511t$$!3u~~/?+@AAy122"D11%*U*I*I*511YF  w!6!6rzz!BCQVWfQgl6%l0#__.==YYv*rww~~eFm/L/LV}%%e,,99_-5"#ghh99W%%"$--gbhh"O"V"VWYWaWa"bej"jK"$))E&M3"O"V"VWYWaWa"bej"jK U--k:oH^_ll5 m  "(!2DAA!%uyy/@AAYYw+uyy/@/L"YYw/69JKKghmgnnop g  	g\]cff	gs   
K   
K;*K66K;token_per_repo_idc                 "   [         R                  (       a  SSKJn  O[	        S5      eU R
                  (       d  [        S5      eUS   b
  US   US   4OUS   S4u  pEUc  Uc  [        SU S	35      eUc,  [        U5      (       a  U" X@R                  U R                  S
9nOUc  U=(       d    0 nUR                  S5      S   nUR                  [         R                  5      (       a  [         R                  O[         R                  n[!        Xx5      n	U	b  UR#                  U	S   5      OSn
[%        U
S9n['        USUS9nU" XR                  U R                  S
9nOU" XPR                  U R                  S
9nXES.Ul        XFR*                  l        U$ )a  Decode example audio file into audio data.

Args:
    value (`dict`):
        A dictionary with keys:

        - `path`: String with relative audio file path.
        - `bytes`: Bytes of the audio file.
    token_per_repo_id (`dict`, *optional*):
        To access and decode
        audio files from private repositories on the Hub, you can pass
        a dictionary repo_id (`str`) -> token (`bool` or `str`)

Returns:
    `torchcodec.decoders.AudioDecoder`
r   r   z<To support decoding audio data, please install 'torchcodec'.zMDecoding is disabled for this feature. Please use Audio(decode=True) instead.r$   Nr%   zJAn audio sample should have one of 'path' or 'bytes' but both are None in r@   )r   r8   ::repo_idtokenrbdownload_config)r%   r$   )r   rE   _torchcodecr   rC   r   RuntimeErrorrD   r   r   r   split
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   rR   r   r   _hf_encodedmetadatar%   )r+   r0   r_   r   r%   r$   audio
source_urlpatternsource_url_fieldsre   rh   fs                r,   decode_exampleAudio.decode_example   s   & &&1\]]{{noo9>w9SuV}eGn5Z_`fZgimYn<EMijoippqrss=]400 4E4ESWSeSefE] 1 7RD)"-J+5+@+@ASAS+T+T''Z`ZvZv  !/z CK\Kh%))*;I*FGnrE,59OdD/BA 1B1BPTPbPbcE !5F5FTXTfTfgE%):"r/   r   c                 h    SSK Jn  U R                  (       a  [        S5      eU" S5      U" S5      S.$ )z[If in the decodable state, raise an error, otherwise flatten the feature into a dictionary.r   )Valuez'Cannot flatten a decoded Audio feature.binarystringr#   )featuresrz   r   rD   )r+   rz   s     r,   flattenAudio.flatten   s0    #;;FGG8_(O
 	
r/   storagec                    [         R                  R                  UR                  5      (       ag  [         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X!/SS/UR                  5       S9nGO^[         R                  R                  UR                  5      (       ag  [         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X/SS/UR                  5       S9nGO[         R                  R                  UR                  5      (       aq  UR                  R                  S5      (       aQ  [         R                  " UR                  SS9 Vs/ s H   oDb  [        5       R!                  U5      OSPM"     sn5      nGO*[         R                  R                  UR                  5      (       a  UR                  R#                  S5      S	:  a  UR%                  S5      nO5[         R                  " S/[        U5      -  [         R                  " 5       S9nUR                  R#                  S5      S	:  a  UR%                  S5      nO5[         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X#/SS/UR                  5       S9n['        XR(                  5      $ s  snf )
ar  Cast an Arrow array to the Audio arrow storage type.
The Arrow types that can be converted to the Audio pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.binary()` - it must contain the audio bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
- `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter

Args:
    storage (`Union[pa.StringArray, pa.StructArray]`):
        PyArrow array to cast.

Returns:
    `pa.StructArray`: Array in the Audio arrow storage type, that is
        `pa.struct({"bytes": pa.binary(), "path": pa.string()})`
Ntyper$   r%   maskr6   F)zero_copy_onlyr   )patypes	is_stringr   r6   lenr{   StructArrayfrom_arraysis_null	is_binaryr|   	is_structget_all_field_indicesto_numpyr   r]   get_field_indexr   r   r&   )r+   r   bytes_array
path_arrayxs        r,   cast_storageAudio.cast_storage   s1   $ 88gll++((D6CL#8ryy{KKnn00+1G'SYIZahapapar0sGXX--4&3w<"7biikJJnn00'1FRXHY`g`o`o`q0rGXX--',,2T2TU\2]2]hhOVO_O_otO_OuvOu!m''*EOuvG XX--||++G49%mmG4 hhvG'<299;O||++F3q8$]]62
XXtfs7|&;"))+N
nn00+1JWV\L]dkdsdsdu0vG'<<00 ws   'L	c           	      x  ^ Tc  0 m[         U4S j5       n[        R                  " UR                  5        Vs/ s H  nUb  US   c  U" US   5      OUS   OSPM      sn[        R                  " 5       S9n[        R                  " UR                  S5      R                  5        Vs/ s H&  ofb  [        R                  R                  U5      OSPM(     sn[        R                  " 5       S9n[        R                  R                  XW/SS/UR                  5       S9n[        XR                  5      $ s  snf s  snf )a   Embed audio files into the Arrow array.

Args:
    storage (`pa.StructArray`):
        PyArrow array to embed.

Returns:
    `pa.StructArray`: Array in the Audio arrow storage type, that is
        `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
Nc                 z  > U R                  S5      S   nUR                  [        R                  5      (       a  [        R                  O[        R
                  n[        X5      nUb  TR                  US   5      OS n[        US9n[        U SUS9 nUR                  5       sS S S 5        $ ! , (       d  f       g = f)Nra   rb   rc   rd   rf   rg   )rk   rl   r   rm   rn   ro   r   rR   r   r   read)r%   rs   rt   ru   re   rh   rv   r_   s          r,   path_to_bytes*Audio.embed_storage.<locals>.path_to_bytes  s    D)"-J+5+@+@ASAS+T+T''Z`ZvZv  !/z CK\Kh%))*;I*FGnrE,59OtT?Cqvvx DCCs   B,,
B:r$   r%   r   r   )r   r   r6   	to_pylistr{   r   rS   r%   basenamer|   r   r   r   r   r&   )r+   r   r_   r   r   r   r%   r   s     `     r,   embed_storageAudio.embed_storage  s*    $ "			  
 		  hh !**,,A UVTaQwZ-?qy)QwZgkk, 
 XXNUmm\bNcNmNmNopNod'7RWWd#TANop

 ..,,k-FRXHY`k`s`s`u,v'<<00 qs   %D2'-D7 r*   )%__name__
__module____qualname____firstlineno____doc__r   r
   int__annotations__r   boolr   r   r    rH   r"   r	   r   structr{   r|   r&   r   r(   r-   r   r$   rJ   r!   r]   rw   r~   StringArrayr   r   r   __static_attributes__r   r/   r,   r   r      sh   4l $(M8C='FD"&L(3-&d7B7!E8C=!YYbiik'RSGXc]SwU?E3?BE#ui~*U$V B[_ BJ ]a33.6tCsDRVAW<W7X.Y3	3j	
}d33E.FFG 	
&1E".."..*H$I &1bnn &1P&1R^^ &1PRP^P^ &1 &1r/   r   rr   r   r1   c                 N   [        U S5      (       a  U R                  $  SSKJn  U R                  5       n[        5       nU" UR                  R                  5       UR                  S9R                  USS9  UR                  5       S S.$ ! [         a  n[	        S5      UeS nAff = f)	Nrp   r   r3   r5   r7   r9   r:   r#   )hasattrrp   rB   r4   rC   get_all_samplesr   datacpur8   rP   rQ   )rr   r4   rZ   samplesr[   s        r,   rK   rK   0  s    um$$   	g8 '')W\\%%'W5H5HIVVW]fkVl*D99  	g\]cff	gs   B	 	
B$BB$)%rS   dataclassesr   r   ior   pathlibr   typingr   r   r	   r
   r   numpyrN   pyarrowr    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r   rF   r   r}   r   r   r!   rK   r   r/   r,   <module>r      sg    	 (   @ @    5  3 C 0% U1 U1 U1p:> :d :r/   