
    cCit                         S SK Jr  S SKJrJr  S SKrS SKrSSKJ	r	J
r
  SSKJr  SSKJrJr  \
R                   " \5      r\	" \" S	S	S
95       " S S\5      5       rg)    )UserDict)AnyUnionN   )add_end_docstringslogging   )ffmpeg_read)Pipelinebuild_pipeline_init_argsT)has_feature_extractorhas_tokenizerc            	          ^  \ rS rSrSrSrSrSrSrU 4S jr	S\
\R                  \\\4   S\S\\\\4      4U 4S	 jjrS
 rSS jrS rS rSrU =r$ )#ZeroShotAudioClassificationPipeline    a  
Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
provide an audio and a set of `candidate_labels`.

<Tip warning={true}>

The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage.

</Tip>

Example:
```python
>>> from transformers import pipeline
>>> from datasets import load_dataset

>>> dataset = load_dataset("ashraq/esc50")
>>> audio = next(iter(dataset["train"]["audio"]))["array"]
>>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
>>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vacuum cleaner"])
[{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vacuum cleaner'}]
```


Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
`"zero-shot-audio-classification"`. See the list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
FTc                 x   > [         TU ]  " S0 UD6  U R                  S:w  a  [        SU R                   S35      eg )NptzThe z is only available in PyTorch. )super__init__	framework
ValueError	__class__)selfkwargsr   s     o/home/james-whalen/.local/lib/python3.13/site-packages/transformers/pipelines/zero_shot_audio_classification.pyr   ,ZeroShotAudioClassificationPipeline.__init__D   s>    "6">>T!tDNN#33QRSS "    audiosr   returnc                 &   > [         TU ]  " U40 UD6$ )ah  
Assign labels to the audio(s) passed as inputs.

Args:
    audios (`str`, `list[str]`, `np.array` or `list[np.array]`):
        The pipeline handles three types of inputs:
        - A string containing a http link pointing to an audio
        - A string containing a local path to an audio
        - An audio loaded in numpy
    candidate_labels (`list[str]`):
        The candidate labels for this audio. They will be formatted using *hypothesis_template*.
    hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
        The format used in conjunction with *candidate_labels* to attempt the audio classification by
        replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
        already formatted.
Return:
    A list of dictionaries containing one entry per proposed label. Each dictionary contains the
    following keys:
    - **label** (`str`) -- One of the suggested *candidate_labels*.
    - **score** (`float`) -- The score attributed by the model to that label. It is a value between
        0 and 1, computed as the `softmax` of `logits_per_audio`.
)r   __call__)r   r   r   r   s      r   r"   ,ZeroShotAudioClassificationPipeline.__call__K   s    . w1&11r   c                 H    0 nSU;   a  US   US'   SU;   a  US   US'   U0 0 4$ )Ncandidate_labelshypothesis_templater   )r   r   preprocess_paramss      r   _sanitize_parameters8ZeroShotAudioClassificationPipeline._sanitize_parametersd   sI    '4:;M4N01 F*7=>S7T34 "b((r   c                 p   [        U[        5      (       ar  UR                  S5      (       d  UR                  S5      (       a!  [        R                  " U5      R
                  nO%[        US5       nUR                  5       nS S S 5        [        U[        5      (       a  [        XR                  R                  5      n[        U[        R                  5      (       d  [        S5      e[        UR                   5      S:w  a  [#        S5      eU R                  U/U R                  R                  SS9nU R$                  S:X  a  UR'                  U R(                  5      nX%S	'   U Vs/ s H  ocR+                  U5      PM     nnU R-                  XpR$                  S
S9nU/US'   U$ ! , (       d  f       GN(= fs  snf )Nzhttp://zhttps://rbz"We expect a numpy ndarray as inputr	   zNWe expect a single channel audio input for ZeroShotAudioClassificationPipeliner   )sampling_ratereturn_tensorsr%   T)r-   paddingtext_inputs)
isinstancestr
startswithrequestsgetcontentopenreadbytesr
   feature_extractorr,   npndarray	TypeErrorlenshaper   r   todtypeformat	tokenizer)	r   audior%   r&   finputsx	sequencesr/   s	            r   
preprocess.ZeroShotAudioClassificationPipeline.preprocessm   sj   eS!!	**e.>.>z.J.J !U+33%&!FFHE ' eU##'='='K'KLE%,,@AAu{{q mnn''G4#9#9#G#GX\ ( 
 >>T!YYtzz*F%5!"<LM<Lq//2<L	MnnY~~W[n\!,}) '&" Ns   /F!#F3!
F0c                     UR                  S5      nUR                  S5      n[        US   [        5      (       a  US   nOUS   S   nU R                  " S0 UDUD6nUUR                  S.nU$ )Nr%   r/   r   )r%   logitsr   )popr0   r   modellogits_per_audio)r   model_inputsr%   r/   outputsmodel_outputss         r   _forward,ZeroShotAudioClassificationPipeline._forward   s    '++,>?"&&}5k!nh//%a.K &a.+K**;{;l; !1..
 r   c                    UR                  S5      nUS   S   nU R                  S:X  a   UR                  SS9nUR                  5       nO[	        S5      e[        [        XR5      S S9 VVs/ s H	  u  pgXgS	.PM     nnnU$ s  snnf )
Nr%   rK   r   r   )dimz`tf` framework not supported.c                     U S   * $ )Nr   r   )rF   s    r   <lambda>AZeroShotAudioClassificationPipeline.postprocess.<locals>.<lambda>   s    _`ab_c^cr   )key)scorelabel)rL   r   softmaxtolistr   sortedzip)	r   rQ   r%   rK   probsscoresrZ   candidate_labelresults	            r   postprocess/ZeroShotAudioClassificationPipeline.postprocess   s    (,,-?@x(+>>T!NNqN)E\\^F<== +1V1NTc*d
*d& 6*d 	 
 	
s   ,Br   )NzThis is a sound of {}.)__name__
__module____qualname____firstlineno____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r:   r;   r8   r1   dictr   listr"   r(   rH   rR   rd   __static_attributes____classcell__)r   s   @r   r   r       s~    : O!"OT2uRZZT%AB 2c 2VZ[_`ceh`h[iVj 22):" r   r   )collectionsr   typingr   r   numpyr:   r3   utilsr   r   audio_classificationr
   baser   r   
get_loggerrf   loggerr   r   r   r   <module>r{      s_    !    . 4 
		H	% ,4W[\]H( H ^Hr   