
    +hg                     0   S SK r S SKJrJrJrJrJrJr  S SKr	S SK
r
S SKJs  Jr  S SKJrJrJrJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  S	S
KJ r J!r!J"r"J#r#  \" 5       (       a  S SK$J%s  J&r'  Sr(OSr(\RR                  " \*5      r+Sr, " S S\!\"\#5      r-g)    N)AnyCallableDictListOptionalUnion)ClapTextModelWithProjectionRobertaTokenizerRobertaTokenizerFastSpeechT5HifiGan   )AutoencoderKLUNet2DConditionModel)KarrasDiffusionSchedulers)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )AudioPipelineOutputDeprecatedPipelineMixinDiffusionPipelineStableDiffusionMixinTFaj  
    Examples:
        ```py
        >>> from diffusers import AudioLDMPipeline
        >>> import torch
        >>> import scipy

        >>> repo_id = "cvssp/audioldm-s-full-v2"
        >>> pipe = AudioLDMPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
        >>> pipe = pipe.to("cuda")

        >>> prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
        >>> audio = pipe(prompt, num_inference_steps=10, audio_length_in_s=5.0).audios[0]

        >>> # save the audio sample as a .wav file
        >>> scipy.io.wavfile.write("techno.wav", rate=16000, data=audio)
        ```
c            $         ^  \ rS rSrSrSrSrS\S\S\	\
\4   S\S	\S
\4U 4S jjr   S%S\\R$                     S\\R$                     4S jjrS rS rS r   S%S jrS&S jr\R2                  " 5       \" \5                      S'S\	\\\   4   S\\   S\S\S\\	\\\   4      S\\   S\S\\	\R@                  \\R@                     4      S\\R$                     S\\R$                     S\\R$                     S\!S\\"\\\R$                  /S4      S \\   S!\\#\\$4      S"\\   4 S# jj5       5       r%S$r&U =r'$ )(AudioLDMPipeline<   a  
Pipeline for text-to-audio generation using AudioLDM.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Args:
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
    text_encoder ([`~transformers.ClapTextModelWithProjection`]):
        Frozen text-encoder (`ClapTextModelWithProjection`, specifically the
        [laion/clap-htsat-unfused](https://huggingface.co/laion/clap-htsat-unfused) variant.
    tokenizer ([`PreTrainedTokenizer`]):
        A [`~transformers.RobertaTokenizer`] to tokenize text.
    unet ([`UNet2DConditionModel`]):
        A `UNet2DConditionModel` to denoise the encoded audio latents.
    scheduler ([`SchedulerMixin`]):
        A scheduler to be used in combination with `unet` to denoise the encoded audio latents. Can be one of
        [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    vocoder ([`~transformers.SpeechT5HifiGan`]):
        Vocoder of class `SpeechT5HifiGan`.
z0.33.1ztext_encoder->unet->vaevaetext_encoder	tokenizerunet	schedulervocoderc           	         > [         TU ]  5         U R                  UUUUUUS9  [        U SS 5      (       a5  S[	        U R
                  R                  R                  5      S-
  -  U l        g SU l        g )N)r   r   r   r    r!   r"   r   r         )	super__init__register_modulesgetattrlenr   configblock_out_channelsvae_scale_factor)selfr   r   r   r    r!   r"   	__class__s          h/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/audioldm/pipeline_audioldm.pyr'   AudioLDMPipeline.__init__W   sx     	% 	 	
 W^^bdikoVpVpc$((//*L*L&MPQ&Q Rvw    Nprompt_embedsnegative_prompt_embedsc                 0   Ub  [        U[        5      (       a  SnO3Ub!  [        U[        5      (       a  [        U5      nOUR                  S   nUGcN  U R                  USU R
                  R                  SSS9n	U	R                  n
U	R                  nU R                  USSS	9R                  nUR                  S
   U
R                  S
   :  a  [        R                  " X5      (       dj  U R
                  R                  USS2U R
                  R                  S-
  S
24   5      n[        R                  SU R
                  R                   SU 35        U R                  U
R                  U5      UR                  U5      S9nUR                   n["        R$                  " US
S9nUR                  U R                  R&                  US9nUR                  u  nnUR)                  SU5      nUR+                  X-  U5      nU(       Ga  UGc  Uc  S/U-  nO[-        U5      [-        U5      La$  [/        S[-        U5       S[-        U5       S35      e[        U[        5      (       a  U/nO2U[        U5      :w  a!  [1        SU S[        U5       SU SU S3	5      eUnUR                  S   nU R                  USUSSS9nUR                  R                  U5      nUR                  R                  U5      nU R                  UUS9nUR                   n["        R$                  " US
S9nU(       ap  UR                  S   nUR                  U R                  R&                  US9nUR)                  SU5      nUR+                  X-  U5      n[        R2                  " Xv/5      nU$ )a  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    device (`torch.device`):
        torch device
    num_waveforms_per_prompt (`int`):
        number of waveforms that should be generated per prompt
    do_classifier_free_guidance (`bool`):
        whether to use classifier free guidance or not
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the audio generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
Nr$   r   
max_lengthTpt)paddingr6   
truncationreturn_tensorslongest)r8   r:   z\The following part of your input was truncated because CLAP can only handle sequences up to z	 tokens: )attention_mask)dim)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancestrlistr*   shaper   model_max_length	input_idsr=   torchequalbatch_decodeloggerwarningr   totext_embedsF	normalizer?   repeatviewtype	TypeError
ValueErrorcat)r.   promptr@   num_waveforms_per_promptdo_classifier_free_guidancenegative_promptr3   r4   
batch_sizetext_inputstext_input_idsr=   untruncated_idsremoved_textbs_embedseq_lenuncond_tokensr6   uncond_inputuncond_input_idss                       r0   _encode_promptAudioLDMPipeline._encode_promptl   s   D *VS"9"9JJvt$<$<VJ&,,Q/J ..$>>::# ) K )22N(77N"nnVYW[n\ffO$$R(N,@,@,DDU[[N N  $~~::#At~~'F'F'JR'O$OP  778	,Q
 !--!!&)-008 . M *55MKK2>M%((t/@/@/F/Fv(V
 	
 &,,Q0HI%**8+NPWX '+A+I&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0&,,Q/J>>$%# * L  ,5588@)88;;FCN%)%6%6 - &7 &" &<%G%G"%&[[1GR%P"&,2215G%;%>%>TEVEVE\E\ek%>%l"%;%B%B1F^%_"%;%@%@Afho%p"
 "II'=&MNMr2   c                     SU R                   R                  R                  -  U-  nU R                   R                  U5      R                  nU$ )Nr$   )r   r+   scaling_factordecodesample)r.   latentsmel_spectrograms      r0   decode_latentsAudioLDMPipeline.decode_latents   s=    dhhoo444w>((//'299r2   c                     UR                  5       S:X  a  UR                  S5      nU R                  U5      nUR                  5       R	                  5       nU$ )N   r$   )r>   squeezer"   cpufloat)r.   rm   waveforms      r0   mel_spectrogram_to_waveform,AudioLDMPipeline.mel_spectrogram_to_waveform   sJ     A%-55a8O<<0<<>'')r2   c                 n   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   n0 nU(       a  X$S'   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   nU(       a  XS'   U$ )Neta	generator)setinspect	signaturer!   step
parameterskeys)r.   rz   ry   accepts_etaextra_step_kwargsaccepts_generators         r0   prepare_extra_step_kwargs*AudioLDMPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r2   c                 6   X0R                   -  nX(:  a  [        SU SU S35      eU R                  R                  R                  U R                   -  S:w  a:  [        SU R                  R                  R                   SU R                    S35      eUb  Ub6  [        U[        5      (       a  US::  a  [        SU S[        U5       S35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[        5      (       d  [        S[        U5       35      eUb  Ub  [        SU SU S35      eUbE  UbA  UR                  UR                  :w  a&  [        SUR                   SUR                   S35      eg g g )NzH`audio_length_in_s` has to be a positive value greater than or equal to z	, but is rB   r   zwThe number of frequency bins in the vocoder's log-mel spectrogram has to be divisible by the VAE scale factor, but got z bins and a scale factor of z5`callback_steps` has to be a positive integer but is z	 of type zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r-   rV   r"   r+   model_in_dimrC   intrT   rD   rE   rF   )	r.   rX   audio_length_in_svocoder_upsample_factorcallback_stepsr[   r3   r4   min_audio_length_in_ss	            r0   check_inputsAudioLDMPipeline.check_inputs  s     !8:O:O O4Z[pZq r'(+ 
 <<++d.C.CCqH--1\\-@-@-M-M,NNj((),  "&
>30O0OSaefSfGGW X(), 
 -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  C *L$r2   c                    UU[        U5      U R                  -  [        U R                  R                  R                  5      U R                  -  4n[        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      eUc  [        XXTS9nOUR                  U5      nXpR                  R                  -  nU$ )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rz   r@   r?   )r   r-   r"   r+   r   rC   rE   r*   rV   r   rN   r!   init_noise_sigma)	r.   r\   num_channels_latentsheightr?   r@   rz   rl   rF   s	            r0   prepare_latents AudioLDMPipeline.prepare_latentsR  s     K4000##001T5J5JJ	
 i&&3y>Z+GA#i.AQ R&<'gi 
 ?"5fZGjj(G NN;;;r2   rX   r   num_inference_stepsguidance_scaler[   rY   ry   rz   rl   return_dictcallbackr   cross_attention_kwargsoutput_typec           
         [         R                  " U R                  R                  R                  5      U R                  R                  R
                  -  nUc0  U R                  R                  R                  U R                  -  U-  n[        UU-  5      n[        X R                  R                  R
                  -  5      nUU R                  -  S:w  a[  [        [         R                  " UU R                  -  5      5      U R                  -  n[        R                  SU SUU-   SU S35        U R                  UUUUUU
U5        Ub  [        U[        5      (       a  SnO3Ub!  [        U[         5      (       a  [#        U5      nOU
R$                  S   nU R&                  nUS:  nU R)                  UUUUUU
US	9n
U R*                  R-                  UUS
9  U R*                  R.                  nU R                  R                  R0                  nU R3                  UU-  UUU
R4                  UUU	5      n	U R7                  X5      n[#        U5      X0R*                  R8                  -  -
  nU R;                  US9 n[=        U5       GHP  u  nnU(       a  [>        R@                  " U	/S-  5      OU	nU R*                  RC                  UU5      nU R                  UUSU
US9RD                  nU(       a  URG                  S5      u  n n!U UU!U -
  -  -   nU R*                  RH                  " UUU	40 UD6RJ                  n	U[#        U5      S-
  :X  d)  US-   U:  a`  US-   U R*                  R8                  -  S:X  a@  URM                  5         Ub-  UU-  S:X  a$  U[O        U R*                  SS5      -  n"U" U"UU	5        [P        (       d  GM;  [R        RT                  " 5         GMS     SSS5        U RW                  U	5      n#U RY                  U#5      n$U$SS2SU24   n$US:X  a  U$R[                  5       n$U(       d  U$4$ []        U$S9$ ! , (       d  f       Nd= f)u  
The call function to the pipeline for generation.

Args:
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
    audio_length_in_s (`int`, *optional*, defaults to 5.12):
        The length of the generated audio sample in seconds.
    num_inference_steps (`int`, *optional*, defaults to 10):
        The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
        expense of slower inference.
    guidance_scale (`float`, *optional*, defaults to 2.5):
        A higher guidance scale value encourages the model to generate audio that is closely linked to the text
        `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
        pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
    num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
        The number of waveforms to generate per prompt.
    eta (`float`, *optional*, defaults to 0.0):
        Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
        applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor is generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
        provided, text embeddings are generated from the `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
        not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
    callback (`Callable`, *optional*):
        A function that calls every `callback_steps` steps during inference. The function is called with the
        following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
    callback_steps (`int`, *optional*, defaults to 1):
        The frequency at which the `callback` function is called. If not specified, the callback is called at
        every step.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
        [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    output_type (`str`, *optional*, defaults to `"np"`):
        The output format of the generated image. Choose between `"np"` to return a NumPy `np.ndarray` or
        `"pt"` to return a PyTorch `torch.Tensor` object.

Examples:

Returns:
    [`~pipelines.AudioPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.AudioPipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated audio.
Nr   zAudio length in seconds z is increased to z; so that it can be handled by the model. It will be cut to z after the denoising process.r$   g      ?)r3   r4   )r@   )totalr   )encoder_hidden_statesclass_labelsr   ordernp)audios)/r   prodr"   r+   upsample_ratessampling_rater    sample_sizer-   r   ceilrL   infor   rC   rD   rE   r*   rF   _execution_devicerf   r!   set_timesteps	timestepsin_channelsr   r?   r   r   progress_bar	enumeraterI   rW   scale_model_inputrk   chunkr~   prev_sampleupdater)   XLA_AVAILABLExm	mark_steprn   rv   numpyr   )%r.   rX   r   r   r   r[   rY   ry   rz   rl   r3   r4   r   r   r   r   r   r   r   original_waveform_lengthr\   r@   rZ   r   r   r   num_warmup_stepsr   itlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textstep_idxrm   audios%                                        r0   __call__AudioLDMPipeline.__call__h  s<   ^ #%''$,,*=*=*L*L"MPTP\P\PcPcPqPq"q$ $		 0 0 < <t?T?T TWn n&)@@A#&'8<<;N;N;\;\'\#] D)))Q.$*?*?!?@ADDYDYYFKK*+<*==NvXoOoNp qMM^L_ `%& 	#"	
 *VS"9"9JJvt$<$<VJ&,,Q/J'' '5s&:# ++$''#9 , 
 	$$%8$HNN,,	  $yy//;;&&11 
 !::9J y>,?..BVBV,VV%89\!),1A\UYYy1}%=bi"%)^^%E%EFXZ[%\" "YY&*.!.+A '  &  /9C9I9I!9L6%!2^YjGj5k!kJ ..--j!WZHYZff I**A9I/IqSTuX\XfXfXlXlNlpqNq '')+N0Ba0G#$(K#K 1g6 =LLN= - :D --g600Aa22223$KKME8O"%00] :9s   3EP2:P22
Q )r-   )NNN)N)NN
   g      @Nr$   g        NNNNTNr$   Nr   )(__name__
__module____qualname____firstlineno____doc___last_supported_versionmodel_cpu_offload_seqr   r	   r   r
   r   r   r   r   r'   r   rI   Tensorrf   rn   rv   r   r   r   no_gradr   EXAMPLE_DOC_STRINGrD   r   rt   r   	Generatorboolr   r   r   r   __static_attributes____classcell__)r/   s   @r0   r   r   <   s^   . '5xx 2x )+??@	x
 #x -x !x6 049=I  -I !) 6IV
!. #8v, ]]_12 )--1#% #;?23MQ*.049= GK();?%)#D1c49n%D1 $E?D1 !	D1
 D1 "%T#Y"78D1 #+3-D1 D1 E%//43H"HIJD1 %,,'D1  -D1 !) 6D1 D1 8S#u||$<d$BCDD1 !D1  !)c3h 8!D1" c]#D1 3 D1r2   r   ).r|   typingr   r   r   r   r   r   r   r   rI   torch.nn.functionalnn
functionalrP   transformersr	   r
   r   r   modelsr   r   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rL   r   r    r2   r0   <module>r      s     = =     m m 9 3 O O - r r ))MM			H	% (r1.0ACW r1r2   