
    cCi+                         S SK JrJrJr  SSKJr  SSKJr  SSKJ	r	  \" 5       (       a  S SK
r
SSKJr  SS	KJr  S
r " S S\	5      rg)    )AnyUnionoverload   )GenerationConfig)is_torch_available   )PipelineN)%MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING)SpeechT5HifiGanzmicrosoft/speecht5_hifiganc            
       *  ^  \ rS rSrSrSrSrSrSrSrSr	\
" SS9rSSSS.U 4S	 jjrS
 rS r\S\S\S\\\4   4S j5       r\S\\   S\S\\\\4      4S j5       rS\\\\   4   S\\\\4   \\\\4      4   4U 4S jjr   SS jrS rSrU =r$ )TextToAudioPipeline   a|  
Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
pipeline generates an audio file from an input text and optional other conditional inputs.

Unless the model you're using explicitly sets these generation parameters in its configuration files
(`generation_config.json`), the following default values will be used:
- max_new_tokens: 256

Example:

```python
>>> from transformers import pipeline

>>> pipe = pipeline(model="suno/bark-small")
>>> output = pipe("Hey it's HuggingFace on the phone!")

>>> audio = output["audio"]
>>> sampling_rate = output["sampling_rate"]
```

Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

<Tip>

You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
[`TextToAudioPipeline.__call__.generate_kwargs`].

Example:

```python
>>> from transformers import pipeline

>>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

>>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
>>> generate_kwargs = {
...     "do_sample": True,
...     "temperature": 0.7,
...     "max_new_tokens": 35,
... }

>>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
```

</Tip>

This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
`"text-to-audio"`.

See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
TF   )max_new_tokensN)vocodersampling_rateno_processorc                
  > [         T	U ]  " U0 UD6  X0l        U R                  S:X  a  [	        S5      eS U l        U R                  R                  [        R                  " 5       ;   aG  Uc=  [        R                  " [        5      R                  U R                  R                  5      OUU l        X l        U R
                  b%  U R
                  R                   R                  U l        U R                  c  U R                  R                   nU R                  R"                  R%                  SS 5      nUb  UR'                  UR)                  5       5        S HL  n[+        XhS 5      nUb  X l        M  [+        USS 5      c  M*  [+        UR,                  US 5      nUc  MF  X l        MN     U R                  cT  U R                  (       dB  [/        U R0                  S5      (       a&  U R0                  R2                  R                  U l        g g g g )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   codec_configfeature_extractor)super__init__r   	framework
ValueErrorr   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattrr   hasattr	processorr   )
selfr   r   r   argskwargsr&   
gen_configsampling_rate_namer    s
            ^/home/james-whalen/.local/lib/python3.13/site-packages/transformers/pipelines/text_to_audio.pyr   TextToAudioPipeline.__init__a   s   $)&) )>>T!TUU::#H#O#O#QQ ?  //0BCFFtzzGXGXY L +<<#!%!4!4!B!BD% ZZ&&F,,001DdKJ%j0023&F" 'D I ,)6&V^T:F$+F,?,?ASUY$ZM$0-:* 'G %d.?.?GDNN\oDpDp!%!A!A!O!OD Eq.?%    c                 h   [        U[        5      (       a  U/nU R                  R                  R                  S:X  a?  U R
                  R                  R                  SS5      SSSSS.nUR                  U5        UnU R                  (       a  U R                  OU R                  nU" U40 UDSS	0D6nU$ )
Nbarkmax_input_semantic_lengthr   FT
max_length)r9   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)
isinstancestrr   r&   
model_typer   semantic_configr(   r)   r   	tokenizerr-   )r.   textr0   
new_kwargspreprocessoroutputs         r3   
preprocessTextToAudioPipeline.preprocess   s    dC  6D::''61 #44DDHHIdfij&+)-).'J f%F)-):):t~~dBfBTBr5   c                    U R                  X R                  S9nUS   nUS   nU R                  R                  5       (       a^  U R                  X@R                  S9nSU;  a  U R                  US'   UR                  U5        U R                  R                  " S0 UDUD6nOC[        U5      (       a  [        SUR                  5        35      eU R                  " S0 UDUD6S   nU R                  b  U R                  U5      nU$ )N)r%   forward_paramsgenerate_kwargsr   zYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r    )_ensure_tensor_on_devicer%   r   can_generater   r)   generatelenr   keysr   )r.   model_inputsr0   rL   rM   rH   s         r3   _forwardTextToAudioPipeline._forward   s   ..vkk.J 01 !23::""$$";;OT_T_;`O #/97;7M7M 34 !!/2ZZ((J<J>JF?## KKZK_K_KaJbd 
 ZZA,A.A!DF<<#\\&)Fr5   text_inputsrL   returnc                     g NrN   r.   rW   rL   s      r3   __call__TextToAudioPipeline.__call__   s    SVr5   c                     g rZ   rN   r[   s      r3   r\   r]      s    _br5   c                 &   > [         TU ]  " U40 UD6$ )a  
Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

Args:
    text_inputs (`str` or `list[str]`):
        The text(s) to generate.
    forward_params (`dict`, *optional*):
        Parameters passed to the model generation/forward method. `forward_params` are always passed to the
        underlying model.
    generate_kwargs (`dict`, *optional*):
        The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
        complete overview of generate, check the [following
        guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
        only passed to the underlying model if the latter is a generative model.

Return:
    A `dict` or a list of `dict`: The dictionaries have two keys:

    - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
    - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
)r   r\   )r.   rW   rL   r    s      r3   r\   r]      s    0 w>~>>r5   c                     [        U SS 5      b  U R                  US'   [        U SS 5      b  U R                  US'   U R                  US'   U(       a  UO0 U(       a  UO0 S.nUc  0 n0 nXU4$ )Nassistant_modelassistant_tokenizerrD   )rL   rM   )r+   ra   rD   rb   )r.   preprocess_paramsrL   rM   paramspostprocess_paramss         r3   _sanitize_parameters(TextToAudioPipeline._sanitize_parameters   s     4*D1=151E1EO-.4.5A+/>>OK(595M5MO12 1?nB2Ar

 $ " *<<<r5   c                 T   0 nU R                   R                  R                  S:X  a  SnOSnU R                  (       a8  [	        U[
        5      (       a  X   nO9[	        U[        5      (       a  US   nOUnOU R                  R                  U5      n[	        U[        5      (       a@  U Vs/ s H.  oUR                  S[        R                  S9R                  5       PM0     snUS'   O/UR                  S[        R                  S9R                  5       US'   U R                  US'   U$ s  snf )Ncsmaudiowaveformr   cpu)r%   dtyper   )r   r&   rB   r   r@   dicttupler-   decodelistr$   torchfloatnumpyr   )r.   rj   output_dictwaveform_keyrk   els         r3   postprocessTextToAudioPipeline.postprocess   s    ::''50"L%L %&& .E5)) 8  ~~,,U3HeT""]e#f]eWYEEekkE$J$P$P$R]e#fK #+;;e5;;;#O#U#U#WK '+'9'9O$ $gs   )5D%)r   r   r   )NNN)__name__
__module____qualname____firstlineno____doc___load_processor_pipeline_calls_generate_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configr   rI   rU   r   rA   r   rn   r\   rq   r   rf   rx   __static_attributes____classcell__)r    s   @r3   r   r      s   2j O#O!#O "2" '+$T (P (PT0B VCV3V4S>V VbDIbbdSVX[S[nI]b b? d3i0?	tCH~tDcN33	4?8 	=. r5   r   )typingr   r   r   
generationr   utilsr   baser
   rr   models.auto.modeling_autor   !models.speecht5.modeling_speecht5r   r#   r   rN   r5   r3   <module>r      s>    ( ' ) &  QC1 w( wr5   