
    +h:                        S SK r S SKJrJrJrJrJrJrJr  S SK	r
S SKrS SKrS SKJs  Jr  S SKJrJrJrJr  SSKJrJr  SSKJrJrJrJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SS	K%J&r&  SS
K'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.J/r/J0r0J1r1  SSK2J3r3J4r4  SSK5J6r6  SSK7J8r8  SSK9J:r:J;r;  SSK<J=r=  \-" 5       (       a  S SK>J?s  J@rA  SrBOSrB\.R                  " \D5      rESrF SS\R                  S\\R                     S\I4S jjrJ " S S\:\;\\\\8\5	      rKg)    N)AnyCallableDictListOptionalTupleUnion)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInputVaeImageProcessor)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)SparseControlNetModel)adjust_lora_scale_text_encoder)MotionAdapter)KarrasDiffusionSchedulers)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)is_compiled_modulerandn_tensor)VideoProcessor   )FreeInitMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFa  
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import AnimateDiffSparseControlNetPipeline
        >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
        >>> from diffusers.utils import export_to_gif, load_image

        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
        >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
        >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
        >>> vae_id = "stabilityai/sd-vae-ft-mse"
        >>> device = "cuda"

        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
        >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
        >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
        >>> scheduler = DPMSolverMultistepScheduler.from_pretrained(
        ...     model_id,
        ...     subfolder="scheduler",
        ...     beta_schedule="linear",
        ...     algorithm_type="dpmsolver++",
        ...     use_karras_sigmas=True,
        ... )
        >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ...     scheduler=scheduler,
        ...     torch_dtype=torch.float16,
        ... ).to(device)
        >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
        >>> pipe.fuse_lora(lora_scale=1.0)

        >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
        >>> negative_prompt = "low quality, worst quality, letterboxed"

        >>> image_files = [
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
        ... ]
        >>> condition_frame_indices = [0, 8, 15]
        >>> conditioning_frames = [load_image(img_file) for img_file in image_files]

        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_inference_steps=25,
        ...     conditioning_frames=conditioning_frames,
        ...     controlnet_conditioning_scale=1.0,
        ...     controlnet_frame_indices=condition_frame_indices,
        ...     generator=torch.Generator().manual_seed(1337),
        ... ).frames[0]
        >>> export_to_gif(video, "output.gif")
        ```
encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr0   r1   moder3   AttributeError)r,   r-   r.   s      y/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.pyretrieve_latentsr8   w   s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSS    c            7       <  ^  \ rS rSrSrSr/ SQr/ SQr  SIS\S\	S	\
S
\\\4   S\S\S\S\S\4U 4S jjjr     SJS\\R,                     S\\R,                     S\\   S\\   4S jjrSKS jrS rS rS r        SLS\4S jjrS r SKS jr S r!S \R,                  S!\S"\S#\RD                  S$\RF                  S%\$\R,                  \R,                  4   4S& jr%\&S' 5       r'\&S( 5       r(\&S) 5       r)\&S* 5       r*\&S+ 5       r+\RX                  " 5       \-" \.5      SSSS,S-S.SS/S0SSSSSSSS1S2SSS3/S4SSS5/4S6\\\/\0\/   4      S7\\   S8\\   S!\S9\S:\S;\\\/\0\/   4      S<\S=\S>\\\Rb                  \0\Rb                     4      S5\\R,                     S\\R,                     S\\R,                     S?\\2   S@\\0\R,                        S \\0\2      SA\/SB\3SC\\4\/\54      S\\\0\   4   S"\0\   SD\3S\\   SE\\6\\\4/S4      SF\0\/   42SG jj5       5       r7SHr8U =r9$ )M#AnimateDiffSparseControlNetPipeline   a  
Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
to Text-to-Video Diffusion Models](https://huggingface.co/papers/2311.16933).

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

The pipeline also inherits the following loading methods:
    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
    - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

Args:
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
    text_encoder ([`CLIPTextModel`]):
        Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
    tokenizer (`CLIPTokenizer`):
        A [`~transformers.CLIPTokenizer`] to tokenize text.
    unet ([`UNet2DConditionModel`]):
        A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
    motion_adapter ([`MotionAdapter`]):
        A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
    scheduler ([`SchedulerMixin`]):
        A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
        [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r3   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetr?   
controlnet	schedulerr=   r>   c
                   > [         T
U ]  5         [        U[        5      (       a  [        R
                  " XE5      nU R                  UUUUUUUUU	S9	  [        U SS 5      (       a/  S[        U R                  R                  R                  5      S-
  -  OSU l        [        SU R                  S9U l        [        U R                  SSS	9U l        g )
N)	rB   rC   rD   rE   r?   rF   rG   r=   r>   rB   r&   r*      F)	do_resizevae_scale_factorT)rK   do_convert_rgbdo_normalize)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrlenrB   configblock_out_channelsrK   r%   video_processorr   control_image_processor)selfrB   rC   rD   rE   r?   rF   rG   r=   r>   	__class__s             r7   rO   ,AnimateDiffSparseControlNetPipeline.__init__   s     	d011"..tDD%)!/' 	 
	
 W^^bdikoVpVpc$((//*L*L&MPQ&Q Rvw-PTPePef'8!224V[(
$r9   r@   rA   
lora_scale	clip_skipc
                 
   UbS  [        U [        5      (       a>  Xl        [        (       d  [	        U R
                  U5        O[        U R
                  U5        Ub  [        U[        5      (       a  Sn
O3Ub!  [        U[        5      (       a  [        U5      n
OUR                  S   n
UGc  [        U [        5      (       a  U R                  XR                  5      nU R                  USU R                  R                  SSS9nUR                  nU R                  USSS	9R                  nUR                  S
   UR                  S
   :  a  [         R"                  " X5      (       dj  U R                  R%                  USS2U R                  R                  S-
  S
24   5      n[&        R)                  SU R                  R                   SU 35        [+        U R
                  R,                  S5      (       aA  U R
                  R,                  R.                  (       a  UR0                  R3                  U5      nOSnU	c%  U R                  UR3                  U5      US9nUS   nOQU R                  UR3                  U5      USS9nUS
   U	S-   *    nU R
                  R4                  R7                  U5      nU R
                  b  U R
                  R8                  nO0U R:                  b  U R:                  R8                  nOUR8                  nUR3                  UUS9nUR                  u  nnnUR=                  SUS5      nUR?                  UU-  US
5      nU(       Ga  UGc|  Uc  S/U
-  nOUb;  [A        U5      [A        U5      La$  [C        S[A        U5       S[A        U5       S35      e[        U[        5      (       a  U/nO2U
[        U5      :w  a!  [E        SU S[        U5       SU SU
 S3	5      eUn[        U [        5      (       a  U R                  UU R                  5      nUR                  S   nU R                  USUSSS9n[+        U R
                  R,                  S5      (       aA  U R
                  R,                  R.                  (       a  UR0                  R3                  U5      nOSnU R                  UR                  R3                  U5      US9nUS   nU(       aG  UR                  S   nUR3                  UUS9nUR=                  SUS5      nUR?                  X-  US
5      nU R
                  b6  [        U [        5      (       a!  [        (       a  [G        U R
                  U5        Xg4$ )a,  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    device: (`torch.device`):
        torch device
    num_images_per_prompt (`int`):
        number of images that should be generated per prompt
    do_classifier_free_guidance (`bool`):
        whether to use classifier free guidance or not
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    lora_scale (`float`, *optional*):
        A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
Nr*   r   
max_lengthTpt)paddingr_   
truncationreturn_tensorslongest)ra   rc   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rg   output_hidden_statesdtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$rP   r   _lora_scaler   r   rC   r!   strlistrT   shaper   maybe_convert_promptrD   model_max_length	input_idstorchequalbatch_decodeloggerwarningr4   rU   rf   rg   to
text_modelfinal_layer_normrj   rE   repeatviewtype	TypeError
ValueErrorr"   )rY   promptrk   num_images_per_promptdo_classifier_free_guidancenegative_promptr@   rA   r\   r]   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrg   prompt_embeds_dtypebs_embedseq_len_uncond_tokensr_   uncond_inputs                          r7   encode_prompt1AnimateDiffSparseControlNetPipeline.encode_prompt   sQ   V !j7U&V&V) $#.t/@/@*M!$"3"3Z@*VS"9"9JJvt$<$<VJ&,,Q/J $ ;<<226>>J..$>>::# ) K )22N"nnVYW[n\ffO$$R(N,@,@,DDU[[N N  $~~::#At~~'F'F'JR'O$OP  778	,Q
 t((//1EFF4K\K\KcKcKvKv!,!;!;!>!>v!F!%  $ 1 1.2C2CF2K\j 1 k -a 0 $ 1 1"%%f-ncg !2 ! !.b 1IM2B C
 !% 1 1 < < M Mm \("&"3"3"9"9YY""&))//"/"5"5%((/B6(R,22'1%,,Q0EqI%**86K+KWVXY '+A+I&!#z 1#VD<Q(QUVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0 $ ;<< $ 9 9- X&,,Q/J>>$%# * L t((//1EFF4K\K\KcKcKvKv!-!<!<!?!?!G!%%)%6%6&&))&1- &7 &" &<A%>"&,2215G%;%>%>EXag%>%h"%;%B%B1F[]^%_"%;%@%@Acelnp%q"($ >??DTDT#D$5$5zB44r9   c                 d   [        U R                  R                  5       5      R                  n[	        U[
        R                  5      (       d  U R                  USS9R                  nUR                  X%S9nU(       aq  U R                  USS9R                  S   nUR                  USS9nU R                  [
        R                  " U5      SS9R                  S   nUR                  USS9nXg4$ U R                  U5      R                  nUR                  USS9n[
        R                  " U5      n	X4$ )	Nr`   )rc   )rk   rj   T)rh   r   dim)nextr>   
parametersrj   rP   ru   Tensorr=   pixel_valuesrz   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rY   imagerk   r   rh   rj   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedss
             r7   encode_image0AnimateDiffSparseControlNetPipeline.encode_image  s?   T''2245;;%..**5*FSSE4&*&8&8UY&8&Z&h&hik&l#&=&O&OPekl&O&m#-1-?-?  'd .@ .mB. * .L-]-]%1 .^ .* +JJ--e4AAL'99:OUV9WL"'"2"2<"@44r9   c                 
   / nU(       a  / nUGc&  [        U[        5      (       d  U/n[        U5      [        U R                  R                  R
                  5      :w  aB  [        S[        U5       S[        U R                  R                  R
                  5       S35      e[        XR                  R                  R
                  5       Hh  u  p[        U	[        5      (       + n
U R                  XSU
5      u  pUR                  US S S 24   5        U(       d  MP  WR                  US S S 24   5        Mj     OEU H?  nU(       a$  UR                  S5      u  pWR                  U5        UR                  U5        MA     / n[        U5       Hw  u  p[        R                  " U/U-  SS9nU(       a2  [        R                  " WU   /U-  SS9n[        R                  " X/SS9nUR                  US9nUR                  U5        My     U$ )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r*   r&   r   r   rk   )rP   rp   rT   rE   encoder_hid_projimage_projection_layersr   zipr   r   appendchunk	enumerateru   catrz   )rY   ip_adapter_imageip_adapter_image_embedsrk   r   r   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsis                 r7   prepare_ip_adapter_image_embedsCAnimateDiffSparseControlNetPipeline.prepare_ip_adapter_image_embeds  s9    &$&!"*.55$4#5 #$DII,F,F,^,^(__ abefvbwax  yE  FI  JN  JS  JS  Jd  Jd  J|  J|  F}  E~  ~K  L  >A ))"<"<"T"T>9' +55E*W&W#DHDUDU+Q8KEA# ##$7a$@A..)001MdTUg1VW> (?#.H[HaHabcHdE0)001MN##$78	 (? #%&/&="A"'))-@,ADY,Y_`"a*/4yy:OPQ:R9SVk9kqr/s,&+ii1M0cij&k#"5"8"8"8"G#**+>? '> '&r9   c                    SU R                   R                  R                  -  U-  nUR                  u  p#pEnUR	                  SSSSS5      R                  X$-  X5U5      nU R                   R                  U5      R                  nUS S S 24   R                  X$S4UR                  SS  -   5      R	                  SSSSS5      nUR                  5       nU$ )Nr*   r   r&   r      re   )	rB   rU   scaling_factorrq   permutereshapedecoder1   float)	rY   r3   r   channels
num_framesheightwidthr   videos	            r7   decode_latents2AnimateDiffSparseControlNetPipeline.decode_latents  s    dhhoo444w>:A--7
j%//!Q1a0889PRZdij(//dAg&&
'CekkRSRTo'UV^^_`bcefhiklmr9   c                 n   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   n0 nU(       a  X$S'   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   nU(       a  XS'   U$ )Netar-   )setinspect	signaturerG   stepr   keys)rY   r-   r   accepts_etaextra_step_kwargsaccepts_generators         r7   prepare_extra_step_kwargs=AnimateDiffSparseControlNetPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r9         ?controlnet_conditioning_scalec           
      *  ^  US-  S:w  d	  US-  S:w  a  [        SU SU S35      eU	bW  [        U 4S jU	 5       5      (       d=  [        ST R                   SU	 Vs/ s H  oT R                  ;  d  M  UPM     sn 35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUb  Ub  [        SU SU S35      eUbC  Ub@  UR                  UR                  :w  a&  [        SUR                   SUR                   S35      eUb  Ub  [        S5      eUb[  [        U[
        5      (       d  [        S[        U5       35      eUS   R                  S;  a  [        SUS   R                   S35      e[        [        S5      =(       a8    [        T R                  [        R                  R                  R                  5      n[        T R                  [         5      (       d0  U(       am  [        T R                  R"                  [         5      (       aD  [        U
[
        5      (       a  U
 H  nT R%                  XU5        M     OT R%                  XU5        O e[        T R                  [         5      (       d0  U(       aJ  [        T R                  R"                  [         5      (       a!  [        U[&        5      (       d  [)        S5      eg  es  snf )NrI   r   z7`height` and `width` have to be divisible by 8 but are z and rm   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fN)_callback_tensor_inputs).0krY   s     r7   	<genexpr>CAnimateDiffSparseControlNetPipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is Dscaled_dot_product_attentionzLFor single controlnet: `controlnet_conditioning_scale` must be type `float`.)r   allr   rP   ro   rp   r   rq   ndimr4   FrF   ru   _dynamo
eval_frameOptimizedModuler   	_orig_modcheck_imager   r   )rY   r   r   r   r   r@   rA   r   r   "callback_on_step_end_tensor_inputsr   r   r   is_compiledimage_s   `              r7   check_inputs0AnimateDiffSparseControlNetPipeline.check_inputs  s    A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  ',C,O ^  #.5t<< PQUVmQnPop  )+00> \]tuv]w]|]|\}}~  a!?@ 
ZOOU]]55EEF
 t(=>>4??446KLL%&&#F$$V]C $   >5 t(=>>4??446KLL;UCC noo D 5O pHs   L1Lc                    [        U[        R                  R                  5      n[        U[        R                  5      n[        U[
        R                  5      n[        U[        5      =(       a'    [        US   [        R                  R                  5      n[        U[        5      =(       a    [        US   [        R                  5      n[        U[        5      =(       a    [        US   [
        R                  5      n	U(       d:  U(       d3  U(       d,  U(       d%  U(       d  U	(       d  [        S[        U5       35      eU(       a  Sn
O[        U5      n
Ub  [        U[        5      (       a  SnO6Ub!  [        U[        5      (       a  [        U5      nOUb  UR                  S   nU
S:w  a  U
W:w  a  [        SU
 SU 35      eg g )Nr   zimage must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is r*   zdIf image batch size is not 1, image batch size must be same as prompt batch size. image batch size: z, prompt batch size: )rP   PILImageru   r   npndarrayrp   r   r   rT   ro   rq   r   )rY   r   r   r@   image_is_pilimage_is_tensorimage_is_npimage_is_pil_listimage_is_tensor_listimage_is_np_listimage_batch_sizeprompt_batch_sizes               r7   r   /AnimateDiffSparseControlNetPipeline.check_imageK  s   !%9$UELL9 

3&ud3]
58SYY__8])%6]:eAhPUP\P\;]%eT2Wz%(BJJ7W #%($ f  gk  lq  gr  fs  t   "5z*VS"9"9 !Jvt$<$< #F& - 3 3A 6q %59J%Jv  xH  wI  I^  _p  ^q  r  &K r9   c
                 0   UUUX@R                   -  XPR                   -  4n
[        U[        5      (       a*  [        U5      U:w  a  [	        S[        U5       SU S35      eU	c  [        XXvS9n	OU	R                  U5      n	XR                  R                  -  n	U	$ )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r-   rk   rj   )	rK   rP   rp   rT   r   r$   rz   rG   init_noise_sigma)rY   r   num_channels_latentsr   r   r   rj   rk   r-   r3   rq   s              r7   prepare_latents3AnimateDiffSparseControlNetPipeline.prepare_latentsq  s      +++***
 i&&3y>Z+GA#i.AQ R&<'gi 
 ?"5fZGjj(G NN;;;r9   c                 p   U R                   R                  XUS9nUR                  S5      R                  XE5      nUR                  u  pxpnUR                  5       S:  a  UR                  5       S::  d   eU R                  R                  (       a  UR                  Xx-  XU5      nSU-  S-
  n[        U R                  R                  U5      5      U R                  R                  R                  -  n
U
R                  XxSX0R                  -  X R                  -  5      n
OUn
U
R!                  SSSSS5      n
U
$ )N)r   r   r   r*   r&   r   r   )rX   
preprocess	unsqueezerz   rq   minmaxrF   "use_simplified_condition_embeddingr   r8   rB   encoderU   r   rK   r   )rY   r   r   r   rk   rj   controlnet_imagesr   r   r   conditioning_framess              r7   prepare_image1AnimateDiffSparseControlNetPipeline.prepare_image  s0   ,,77TY7Z!OOA.11&@:K:Q:Q7
% !$$&!+0A0E0E0G10LLL??== 1 9 9*:QS[ej k !$5 5 9"2488??CT3U"VY]YaYaYhYhYwYw"w"5"="=65J5J+JEUjUjLj# #4199!Q1aH""r9   r  r   controlnet_frame_indicesrk   rj   returnc                     UR                   S   [        U5      :  d   eUR                   u  pgpn
[        R                  " XgX)U
4XTS9n[        R                  " USX)U
4XTS9nUS S 2S S 2S [        U5      24   US S 2S S 2U4'   SUS S 2S S 2U4'   X4$ )Nr&   ri   r*   )rq   rT   ru   zeros)rY   r  r   r
  rk   rj   r   r   r   r   r   controlnet_condcontrolnet_cond_masks                r7   #prepare_sparse_control_conditioningGAnimateDiffSparseControlNetPipeline.prepare_sparse_control_conditioning  s     #((+s3K/LLLL1D1J1J.
a++zZQV&W_dt${{J:u+U]br:MaQRTsVYZrVsTsNs:t1667?@Q#;;<44r9   c                     U R                   $ r   _guidance_scalerY   s    r7   guidance_scale2AnimateDiffSparseControlNetPipeline.guidance_scale  s    ###r9   c                     U R                   $ r   )
_clip_skipr  s    r7   r]   -AnimateDiffSparseControlNetPipeline.clip_skip  s    r9   c                      U R                   S:  $ )Nr*   r  r  s    r7   r   ?AnimateDiffSparseControlNetPipeline.do_classifier_free_guidance  s    ##a''r9   c                     U R                   $ r   )_cross_attention_kwargsr  s    r7   cross_attention_kwargs:AnimateDiffSparseControlNetPipeline.cross_attention_kwargs  s    +++r9   c                     U R                   $ r   )_num_timestepsr  s    r7   num_timesteps1AnimateDiffSparseControlNetPipeline.num_timesteps  s    """r9      2   g      @r*   g        pilTr   Fr3   r   r   r   num_inference_stepsr  r   num_videos_per_promptr   r-   r   r   output_typereturn_dictr  
guess_modecallback_on_step_endr   c                    [        U R                  5      (       a  U R                  R                  OU R                  nU=(       d-    U R                  R                  R
                  U R                  -  nU=(       d-    U R                  R                  R
                  U R                  -  nSnU R                  UUUUUUUUUUUS9  X`l        UU l	        UU l
        Ub  [        U[        5      (       a  SnO3Ub!  [        U[        5      (       a  [        U5      nOUR                  S   nU R                   n[        U["        5      (       a  UR                  R$                  O"UR&                  S   R                  R$                  nU=(       d    UnU R(                  b  U R(                  R+                  SS5      OSnU R-                  UUUU R.                  UUUUU R0                  S9	u  pU R.                  (       a  [2        R4                  " X/5      nUR7                  USS9nUc  Ub"  U R9                  UUUUU-  U R.                  5      nU R;                  UX2UUR<                  5      nU R?                  UUUUUR<                  5      u  n n!U R@                  RC                  UUS9  U R@                  RD                  n"U R                  R                  RF                  n#U RI                  UU-  U#UUUUR<                  UU
U5	      nU RK                  X5      n$Uc  Ub  S	W0OSn%U RL                  (       a  U RN                  OSn&[Q        U&5       GH  n'U RL                  (       a#  U RS                  UU'UUUR<                  U
5      u  nn"[        U"5      U l*        [        U"5      XPR@                  RV                  -  -
  n(U RY                  U RT                  S
9 n)[[        U"5       GH  u  n*n+U R.                  (       a  [2        R4                  " U/S-  5      OUn,U R@                  R]                  U,U+5      n,U(       aD  U R.                  (       a3  Un-U R@                  R]                  U-U+5      n-UR_                  S5      S   n.OU,n-Un.U R                  U-U+U.U U!UUSS9u  n/n0U R                  U,U+UUU%U/U0S9R`                  n1U R.                  (       a  U1R_                  S5      u  n2n3U2UU3U2-
  -  -   n1U R@                  Rb                  " U1U+U40 U$D6Rd                  nUb\  0 n4U H  n5[g        5       U5   U4U5'   M     U" U U*U+U45      n6U6Ri                  SU5      nU6Ri                  SU5      nU6Ri                  SU5      nU*[        U"5      S-
  :X  d)  U*S-   U(:  a0  U*S-   U R@                  RV                  -  S:X  a  U)Rk                  5         [l        (       d  GM  [n        Rp                  " 5         GM      SSS5        GM     US:X  a  Un7O+U Rs                  U5      n8U Rt                  Rw                  U8US9n7U Ry                  5         U(       d  U74$ [{        U7S9$ ! , (       d  f       GM  = f)u  
The call function to the pipeline for generation.

Args:
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
    height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The height in pixels of the generated video.
    width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The width in pixels of the generated video.
    num_frames (`int`, *optional*, defaults to 16):
        The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
        amounts to 2 seconds of video.
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
        expense of slower inference.
    guidance_scale (`float`, *optional*, defaults to 7.5):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide what to not include in image generation. If not defined, you need to
        pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
    eta (`float`, *optional*, defaults to 0.0):
        Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
        applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
        `(batch_size, num_channel, num_frames, height, width)`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
        provided, text embeddings are generated from the `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
        not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
    ip_adapter_image: (`PipelineImageInput`, *optional*):
        Optional image input to work with IP Adapters.
    ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
        provided, embeddings are computed from the `ip_adapter_image` input argument.
    conditioning_frames (`List[PipelineImageInput]`, *optional*):
        The SparseControlNet input to provide guidance to the `unet` for generation.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
        of a plain tuple.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
        [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
        The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
        to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
        the corresponding scale as a list.
    controlnet_frame_indices (`List[int]`):
        The indices where the conditioning frames must be applied for generation. Multiple frames can be
        provided to guide the model to generate similar structure outputs, where the `unet` can
        "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected
        structure. Must have the same length as `conditioning_frames`.
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
    callback_on_step_end (`Callable`, *optional*):
        A function that calls at the end of each denoising steps during the inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.

Examples:

Returns:
    [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
        returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
r*   )r   r   r   r   r@   rA   r   r   r   r   r   Nr   scale)r@   rA   r\   r]   )repeatsr   r   r   )totalr&   F)encoder_hidden_statesr  conditioning_maskconditioning_scaler,  r+  )r2  r  added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr3   r@   rA   latent)r   r*  )frames)>r#   rF   r   rE   rU   sample_sizerK   r   r  r  r  rP   ro   rp   rT   rq   _execution_devicer   global_pool_conditionsnetsr  getr   r   r]   ru   r   r   r   r  rj   r  rG   set_timesteps	timestepsin_channelsr   r   free_init_enabled_free_init_num_itersrange_apply_free_initr"  orderprogress_barr   scale_model_inputr   r1   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   rW   postprocess_videomaybe_free_model_hooksr+   )9rY   r   r   r   r   r(  r  r   r)  r   r-   r3   r@   rA   r   r   r  r*  r+  r  r   r
  r,  r]   r-  r   rF   r   rk   r<  text_encoder_lora_scaler   r  r  r@  r   r   r5  num_free_init_itersfree_init_iternum_warmup_stepsrG  r   tlatent_model_inputcontrol_model_inputcontrolnet_prompt_embedsdown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensors9                                                            r7   __call__,AnimateDiffSparseControlNetPipeline.__call__  sl   f 3ET__2U2UT__..[_[j[j
 O499++77$:O:OOM))558M8MM ! 	+'#9-$;/Q%*G 	 	
  .#'=$ *VS"9"9JJvt$<$<VJ&,,Q/J'' *&;<< 44#**AA 	
  9#9
 ?C>Y>Y>eD''++GT:ko 	  150B0B!,,'#9.nn 1C 
1
- ++!II'=&MNM%77
PQ7R '+B+N?? '2200L #001DeU[]g]m]mn040X0X-EvzO_O_1
--
 	$$%8$HNN,,	  $yy//;;&&.. 

 !::9J
  +/F/R \* 	 <@;Q;Qd77WX#$78N%%%)%:%:^-@&'--Yb&" #&i.D"9~0CnnFZFZ0ZZ "")<)<"=%i0DAqEIEeEeG9q=)Akr&)-)I)IJ\^_)`&!d&F&F.5+.2nn.N.NObde.f+3@3F3Fq3I!3L0.@+3@0CG??+.F(7*>+H#-$) DS 	D@*,@ "&*.;/E*;8N6J "+ " f  77=G=M=Ma=P:)?%6?]nKn9o%o
 #nn11*a^L]^jjG+7*,!CA17!OA. "D+?aO+\("2"6"6y'"J(8(<(<_m(\1A1E1EF^`v1w. C	NQ..AE=M3MSTWXSX\`\j\j\p\pRptuRu$++-$}w 1 >= 9R ("E..w7L((::[f:gE 	##%8O(66W >=s   G/X79X77
Y	)r  r  r  rn   r"  rX   rK   rW   )NN)NNNNNr   )NNNNNNNr   ):__name__
__module____qualname____firstlineno____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r	   r   r   r   r   r   r
   r   rO   r   ru   r   r   intr   r   r   r   r   r   r   r   r  rk   rj   r   r  propertyr  r]   r   r  r#  no_gradr    EXAMPLE_DOC_STRINGro   r   	Generatorr   boolr   r   r   rb  __static_attributes____classcell__)rZ   s   @r7   r;   r;      sU   : EST 157;

 $
 !	

 (/9:
 &
 *
 -
 .
 5
 
P 049=&*#'t5  -t5 !) 6t5 UOt5 C=t5n52+'\
!, # $+//2\ (-\~#N nr2#*5"\\5 5 #&	5
 5 {{5 
u||U\\)	*5$ $ $   ( ( , , # # ]]_12 37 $##% #;?%&MQ*.049=9=@DBF  ;?CF/0c #'KO9B5u7sDI~./u7 u7 }	u7
 u7 !u7 u7 "%T#Y"78u7  #u7 u7 E%//43H"HIJu7 %,,'u7  -u7 !) 6u7 ##56u7  "*$u||*<!=!u7" &d+=&>?#u7$ %u7& 'u7( !)c3h 8)u7* (-UDK-?'@+u7, #'s)-u7. /u70 C=1u72 'xc40@$0F'GH3u74 -1I5u7 3 u7r9   r;   )Nr1   )Lr   typingr   r   r   r   r   r   r	   numpyr   r   ru   torch.nn.functionalnn
functionalr   transformersr
   r   r   r   image_processorr   r   loadersr   r   r   r   modelsr   r   r   r   (models.controlnets.controlnet_sparsectrlr   models.lorar   models.unets.unet_motion_modelr   
schedulersr   utilsr   r   r   r    r!   r"   utils.torch_utilsr#   r$   rW   r%   free_init_utilsr'   pipeline_utilsr(   r)   pipeline_outputr+   torch_xla.core.xla_modelcore	xla_modelrN  rM  
get_loggerrd  rx   rn  r   ro  ro   r8   r;    r9   r7   <module>r     s     D D D  
    h h D w w [ [ M 9 ; 3  B - + D 6 ))MM			H	%; @ ck
TLL
T-5eoo-F
T\_
T{7"{7r9   