
    +h"                     Z   S SK r S SKJrJrJrJrJrJr  S SKrS SK	J
r
JrJrJr  SSKJr  SSKJrJrJrJr  SSKJrJrJrJr  SSKJr  SS	KJr  SS
KJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0  SSK1J2r2  SSK3J4r4J5r5  SSK6J7r7  \'" 5       (       a  S SK8J9s  J:r;  Sr<OSr<\(Rz                  " \>5      r?Sr@ S"S\R                  S\\R                     S\C4S jjrD    S#S\\E   S\\\C\R                  4      S\\\E      S\\\G      4S jjrH " S  S!\4\5\\\\0\2\5
      rIg)$    N)AnyCallableDictListOptionalUnion)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFa  
    Examples:
        ```py
        >>> import imageio
        >>> import requests
        >>> import torch
        >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
        >>> from diffusers.utils import export_to_gif
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> adapter = MotionAdapter.from_pretrained(
        ...     "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
        ... )
        >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
        ... ).to("cuda")
        >>> pipe.scheduler = DDIMScheduler(
        ...     beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace"
        ... )


        >>> def load_video(file_path: str):
        ...     images = []

        ...     if file_path.startswith(("http://", "https://")):
        ...         # If the file_path is a URL
        ...         response = requests.get(file_path)
        ...         response.raise_for_status()
        ...         content = BytesIO(response.content)
        ...         vid = imageio.get_reader(content)
        ...     else:
        ...         # Assuming it's a local file path
        ...         vid = imageio.get_reader(file_path)

        ...     for frame in vid:
        ...         pil_image = Image.fromarray(frame)
        ...         images.append(pil_image)

        ...     return images


        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        >>> output = pipe(
        ...     video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
        ... )
        >>> frames = output.frames[0]
        >>> export_to_gif(frames, "animation.gif")
        ```
encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrr1   r2   moder4   AttributeError)r-   r.   r/   s      z/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.pyretrieve_latentsr9   l   s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSS    num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`List[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`List[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr=   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r=   r<   r>   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r>   r<   r<    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r=   len)	schedulerr;   r<   r=   r>   kwargsaccepts_timestepsaccept_sigmass           r8   retrieve_timestepsrN   z   s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r:   c            6         ^  \ rS rSrSrSr/ SQr/ SQr  SLS\S\	S	\
S
\\\4   S\S\\\\\\\4   S\S\4U 4S jjjr     SMS\\R4                     S\\R4                     S\\   S\\   4S jjrSNS jrS rSOS\S\R4                  4S jjr SOS\4S jjr!S r"        SPS jr#S r$            SQS!\\R4                     S"\S#\S$\S%\S&\\   S'\\RJ                     S(\\RL                     S)\\\RN                  \(\RN                     4      S*\\R4                     S\S+\)S\R4                  4S, jjr*\+S- 5       r,\+S. 5       r-\+S/ 5       r.\+S0 5       r/\+S1 5       r0\+S2 5       r1\Rd                  " 5       SSSSS3S SSS4S5SSS6SSSSSSS7S8SSSS*/S4S!\(\(\3      S9\\\4\(\4   4      S"\\   S#\\   S:\S;\)S<\\(\      S=\\(\      S>\S?\S@\\\4\(\4   4      SA\\   SB\S)\\\RN                  \(\RN                     4      S*\\R4                     S\\R4                     S\\R4                     SC\\3   SD\\(\R4                        SE\\4   SF\)SG\\5\4\64      S\\   SH\\7\\\5/S4      SI\(\4   S\44SJ jj5       r8SKr9U =r:$ )RAnimateDiffVideoToVideoPipeline   a  
Pipeline for video-to-video generation.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

The pipeline also inherits the following loading methods:
    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
    - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

Args:
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
    text_encoder ([`CLIPTextModel`]):
        Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
    tokenizer (`CLIPTokenizer`):
        A [`~transformers.CLIPTokenizer`] to tokenize text.
    unet ([`UNet2DConditionModel`]):
        A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
    motion_adapter ([`MotionAdapter`]):
        A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
    scheduler ([`SchedulerMixin`]):
        A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
        [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r4   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetrT   rJ   rR   rS   c	                 d  > [         T	U ]  5         [        U[        5      (       a  [        R
                  " XE5      nU R                  UUUUUUUUS9  [        U SS 5      (       a/  S[        U R                  R                  R                  5      S-
  -  OSU l        [        U R                  S9U l        g )N)rW   rX   rY   rZ   rT   rJ   rR   rS   rW   r&   r+      )vae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrrI   rW   configblock_out_channelsr]   r%   video_processor)
selfrW   rX   rY   rZ   rT   rJ   rR   rS   rH   s
            r8   r_   (AnimateDiffVideoToVideoPipeline.__init__   s    $ 	d011"..tDD%)/' 	 		
 W^^bdikoVpVpc$((//*L*L&MPQ&Q Rvw-t?T?TUr:   rU   rV   
lora_scale	clip_skipc
                 
   UbS  [        U [        5      (       a>  Xl        [        (       d  [	        U R
                  U5        O[        U R
                  U5        Ub  [        U[        [        45      (       a  Sn
O3Ub!  [        U[        5      (       a  [        U5      n
OUR                  S   n
UGc  [        U [        5      (       a  U R                  XR                  5      nU R                  USU R                  R                  SSS9nUR                   nU R                  USSS	9R                   nUR                  S
   UR                  S
   :  a  ["        R$                  " X5      (       dj  U R                  R'                  USS2U R                  R                  S-
  S
24   5      n[(        R+                  SU R                  R                   SU 35        [-        U R
                  R.                  S5      (       aA  U R
                  R.                  R0                  (       a  UR2                  R5                  U5      nOSnU	c%  U R                  UR5                  U5      US9nUS   nOQU R                  UR5                  U5      USS9nUS
   U	S-   *    nU R
                  R6                  R9                  U5      nU R
                  b  U R
                  R:                  nO0U R<                  b  U R<                  R:                  nOUR:                  nUR5                  UUS9nUR                  u  nnnUR?                  SUS5      nURA                  UU-  US
5      nU(       Ga  UGc|  Uc  S/U
-  nOUb;  [C        U5      [C        U5      La$  [E        S[C        U5       S[C        U5       S35      e[        U[        5      (       a  U/nO2U
[        U5      :w  a!  [G        SU S[        U5       SU SU
 S3	5      eUn[        U [        5      (       a  U R                  UU R                  5      nUR                  S   nU R                  USUSSS9n[-        U R
                  R.                  S5      (       aA  U R
                  R.                  R0                  (       a  UR2                  R5                  U5      nOSnU R                  UR                   R5                  U5      US9nUS   nU(       aG  UR                  S   nUR5                  UUS9nUR?                  SUS5      nURA                  X-  US
5      nU R
                  b6  [        U [        5      (       a!  [        (       a  [I        U R
                  U5        Xg4$ )a,  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    device: (`torch.device`):
        torch device
    num_images_per_prompt (`int`):
        number of images that should be generated per prompt
    do_classifier_free_guidance (`bool`):
        whether to use classifier free guidance or not
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    lora_scale (`float`, *optional*):
        A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
Nr+   r   
max_lengthTpt)paddingrl   
truncationreturn_tensorslongest)rn   rp   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rt   output_hidden_states)dtyper<    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)%r`   r   _lora_scaler   r   rX   r"   strdictlistrI   shaper   maybe_convert_promptrY   model_max_length	input_idstorchequalbatch_decodeloggerwarningr5   rd   rs   rt   to
text_modelfinal_layer_normrv   rZ   repeatviewtype	TypeErrorrA   r#   )rg   promptr<   num_images_per_promptdo_classifier_free_guidancenegative_promptrU   rV   ri   rj   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrt   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrl   uncond_inputs                          r8   encode_prompt-AnimateDiffVideoToVideoPipeline.encode_prompt  sU   V !j7U&V&V) $#.t/@/@*M!$"3"3Z@*Vc4["A"AJJvt$<$<VJ&,,Q/J $ ;<<226>>J..$>>::# ) K )22N"nnVYW[n\ffO$$R(N,@,@,DDU[[N N  $~~::#At~~'F'F'JR'O$OP  778	,Q
 t((//1EFF4K\K\KcKcKvKv!,!;!;!>!>v!F!%  $ 1 1.2C2CF2K\j 1 k -a 0 $ 1 1"%%f-ncg !2 ! !.b 1IM2B C
 !% 1 1 < < M Mm \("&"3"3"9"9YY""&))//"/"5"5%((/B6(R,22'1%,,Q0EqI%**86K+KWVXY '+A+I&!#z 1#VD<Q(QUVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0 $ ;<< $ 9 9- X&,,Q/J>>$%# * L t((//1EFF4K\K\KcKcKvKv!-!<!<!?!?!G!%%)%6%6&&))&1- &7 &" &<A%>"&,2215G%;%>%>EXag%>%h"%;%B%B1F[]^%_"%;%@%@Acelnp%q"($ >??DTDT#D$5$5zB44r:   c                 d   [        U R                  R                  5       5      R                  n[	        U[
        R                  5      (       d  U R                  USS9R                  nUR                  X%S9nU(       aq  U R                  USS9R                  S   nUR                  USS9nU R                  [
        R                  " U5      SS9R                  S   nUR                  USS9nXg4$ U R                  U5      R                  nUR                  USS9n[
        R                  " U5      n	X4$ )	Nrm   )rp   r<   rv   T)ru   r   dim)nextrS   rF   rv   r`   r   TensorrR   pixel_valuesr   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rg   imager<   r   ru   rv   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedss
             r8   encode_image,AnimateDiffVideoToVideoPipeline.encode_image  s?   T''2245;;%..**5*FSSE4&*&8&8UY&8&Z&h&hik&l#&=&O&OPekl&O&m#-1-?-?  'd .@ .mB. * .L-]-]%1 .^ .* +JJ--e4AAL'99:OUV9WL"'"2"2<"@44r:   c                 
   / nU(       a  / nUGc&  [        U[        5      (       d  U/n[        U5      [        U R                  R                  R
                  5      :w  aB  [        S[        U5       S[        U R                  R                  R
                  5       S35      e[        XR                  R                  R
                  5       Hh  u  p[        U	[        5      (       + n
U R                  XSU
5      u  pUR                  US S S 24   5        U(       d  MP  WR                  US S S 24   5        Mj     OEU H?  nU(       a$  UR                  S5      u  pWR                  U5        UR                  U5        MA     / n[        U5       Hw  u  p[        R                  " U/U-  SS9nU(       a2  [        R                  " WU   /U-  SS9n[        R                  " X/SS9nUR                  US9nUR                  U5        My     U$ )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r+   r&   r   r   )r<   )r`   r|   rI   rZ   encoder_hid_projimage_projection_layersrA   zipr   r   appendchunk	enumerater   catr   )rg   ip_adapter_imageip_adapter_image_embedsr<   r   r   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsis                 r8   prepare_ip_adapter_image_embeds?AnimateDiffVideoToVideoPipeline.prepare_ip_adapter_image_embeds  s9    &$&!"*.55$4#5 #$DII,F,F,^,^(__ abefvbwax  yE  FI  JN  JS  JS  Jd  Jd  J|  J|  F}  E~  ~K  L  >A ))"<"<"T"T>9' +55E*W&W#DHDUDU+Q8KEA# ##$7a$@A..)001MdTUg1VW> (?#.H[HaHabcHdE0)001MN##$78	 (? #%&/&="A"'))-@,ADY,Y_`"a*/4yy:OPQ:R9SVk9kqr/s,&+ii1M0cij&k#"5"8"8"8"G#**+>? '> '&r:      decode_chunk_sizereturnc                     / n[        S[        U5      U5       H=  nXXS-    n[        U R                  R	                  U5      US9nUR                  U5        M?     [        R                  " U5      $ )Nr   )r.   )rangerI   r9   rW   encoder   r   r   )rg   videor.   r   r4   r   batch_videos          r8   encode_video,AnimateDiffVideoToVideoPipeline.encode_video  sc    q#e*&78AA$9:K*488??;+GS\]KNN;' 9 yy!!r:   c                 F   SU R                   R                  R                  -  U-  nUR                  u  p4pVnUR	                  SSSSS5      R                  X5-  XFU5      n/ n[        SUR                  S   U5       H?  n	XX-    n
U R                   R                  U
5      R                  n
UR                  U
5        MA     [        R                  " U5      nUS S S 24   R                  X5S4UR                  SS  -   5      R	                  SSSSS5      nUR                  5       nU$ )Nr+   r   r&   r      rr   )rW   rd   scaling_factorr}   permutereshaper   decoder2   r   r   r   float)rg   r4   r   r   channels
num_framesheightwidthr   r   batch_latentss              r8   decode_latents.AnimateDiffVideoToVideoPipeline.decode_latents  s   dhhoo444w>:A--7
j%//!Q1a0889PRZdijq'--*,=>A#(=>M HHOOM:AAMLL' ?
 		% dAg&&
'CekkRSRTo'UV^^_`bcefhiklmr:   c                 n   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   n0 nU(       a  X$S'   S[        [        R                  " U R                  R                  5      R
                  R                  5       5      ;   nU(       a  XS'   U$ )Netar.   )rB   rC   rD   rJ   steprF   rG   )rg   r.   r   accepts_etaextra_step_kwargsaccepts_generators         r8   prepare_extra_step_kwargs9AnimateDiffVideoToVideoPipeline.prepare_extra_step_kwargs  s     s7#4#4T^^5H5H#I#T#T#Y#Y#[\\'*e$ (3w/@/@ATAT/U/`/`/e/e/g+hh-6k*  r:   c           
        ^  US:  d  US:  a  [        SU 35      eUS-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        S	T R                   S
U Vs/ s H  oT R                  ;  d  M  UPM     sn 35      eUb  Ub  [        SU SU S35      eUc  Uc  [        S5      eUb7  [        U[        [
        [        45      (       d  [        S[        U5       35      eUb  U	b  [        SU SU	 S35      eUbC  U	b@  UR                  U	R                  :w  a&  [        SUR                   SU	R                   S35      eUb  Ub  [        S5      eU
b  Ub  [        S5      eUb\  [        U[
        5      (       d  [        S[        U5       35      eUS   R                  S;  a  [        SUS   R                   S35      eg g s  snf )Nr   r+   z2The value of strength should in [0.0, 1.0] but is r\   z7`height` and `width` have to be divisible by 8 but are z and rx   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fN)_callback_tensor_inputs).0krg   s     r8   	<genexpr>?AnimateDiffVideoToVideoPipeline.check_inputs.<locals>.<genexpr>A  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z3Only one of `video` or `latents` should be providedzProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)
rA   allr   r`   rz   r|   r{   r   r}   ndim)rg   r   strengthr   r   r   r4   r   rU   rV   r   r   "callback_on_step_end_tensor_inputsr   s   `             r8   check_inputs,AnimateDiffVideoToVideoPipeline.check_inputs,  s    a<8a<QRZQ[\]]A:?eai1nVW]V^^cdicjjklmm-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  
6Ct;L(M(MYZ^_eZfYghii&+A+M9/9J K*++]_ 
 $)?)K""&<&B&BB --:-@-@,A B.445Q8  !4RSS',C,O ^  #.5t<< PQUVmQnPop  )+00> \]tuv]w]|]|\}}~  ? /K pHs   4GGc                     [        [        X-  5      U5      n[        X-
  S5      nX&U R                  R                  -  S  nX!U-
  4$ )Nr   )minintmaxrJ   order)rg   r;   r=   r   r<   init_timestept_starts          r8   get_timesteps-AnimateDiffVideoToVideoPipeline.get_timestepst  sP    C 3 >?ATU)91=(<(<<>?	777r:   r+   Fr   r   r   num_channels_latentsr   timesteprv   r<   r.   r4   	add_noisec                    U
c  UR                   S   OU
R                   S   nUUUX R                  -  X0R                  -  4n[        U	[        5      (       a*  [	        U	5      U:w  a  [        S[	        U	5       SU S35      eU
GcZ  U R                  R                  R                  (       a7  UR                  5       nU R                  R                  [        R                  S9  [        U	[        5      (       a@  [        U5       Vs/ s H)  nU R                  X   X   U5      R                  S5      PM+     nnO1U Vs/ s H$  nU R                  UX5      R                  S5      PM&     nn[        R                   " USS9nU R                  R                  R                  (       a  U R                  R                  U5        UR                  U5      nU R                  R                  R"                  U-  nUUR                   S   :  a7  UUR                   S   -  S:X  a!  S	U S
UR                   S    S3n[        U5      eUUR                   S   :  a5  UUR                   S   -  S:w  a  [        SUR                   S    SU S35      e[        R                   " U/SS9n[%        UR                   XUS9nU R&                  R)                  UUU5      R+                  SSSSS5      n
U
$ XR                   :w  a  [        SU< SU
R                   < 35      eU
R                  XS9n
U(       a'  [%        XXS9nU R&                  R)                  U
UU5      n
U
$ s  snf s  snf )Nr+   r&   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rv   r   r   zYou have passed z# text prompts (`prompt`), but only zp initial images (`image`). Please make sure to update your script to pass as many initial images as text promptsz'Cannot duplicate `image` of batch size z to z text prompts.)r.   r<   rv   r   r   z!`latents` expected to have shape=z, but found latents.shape=)r}   r]   r`   r|   rI   rA   rW   rd   force_upcastr   r   r   float32r   r   	unsqueezer   r   r$   rJ   r   r   )rg   r   r   r   r   r   r   rv   r<   r.   r4   r   r   r   r}   r   init_latentsviderror_messagenoises                       r8   prepare_latents/AnimateDiffVideoToVideoPipeline.prepare_latents}  sF    (/U[[^GMM!<L
 +++***
 i&&3y>Z+GA#i.AQ R&<'gi 
 ?xx++%--0)T** #:. . %%eh	>OPZZ[\].   
 nssmrfi 1 1#y T ^ ^_` amrs 99\q9L xx++E"'??51L88??99LHLL..q11j<CUCUVWCX6X\]6] 'zl2UVbVhVhijVkUl m   !//l0033
\EWEWXYEZ8Z^_8_ =l>P>PQR>S=TTXYcXddrs   %yy,Q? !3!3y_deEnn..|UHMUUVWYZ\]_`bcdG  % #EuhFaSZS`S`Rb!cddjjj5G$U\..227E8LW 
  ts   0M	9+Mc                     U R                   $ r   _guidance_scalerg   s    r8   guidance_scale.AnimateDiffVideoToVideoPipeline.guidance_scale  s    ###r:   c                     U R                   $ r   )
_clip_skipr  s    r8   rj   )AnimateDiffVideoToVideoPipeline.clip_skip      r:   c                      U R                   S:  $ )Nr+   r	  r  s    r8   r   ;AnimateDiffVideoToVideoPipeline.do_classifier_free_guidance  s    ##a''r:   c                     U R                   $ r   )_cross_attention_kwargsr  s    r8   cross_attention_kwargs6AnimateDiffVideoToVideoPipeline.cross_attention_kwargs  s    +++r:   c                     U R                   $ r   )_num_timestepsr  s    r8   num_timesteps-AnimateDiffVideoToVideoPipeline.num_timesteps  s    """r:   c                     U R                   $ r   )
_interruptr  s    r8   	interrupt)AnimateDiffVideoToVideoPipeline.interrupt  r  r:   2   g      @g?g        pilTr   r;   enforce_inference_stepsr=   r>   r  r   r   num_videos_per_promptr   r   r   output_typereturn_dictr  callback_on_step_endr   c                    U=(       d-    U R                   R                  R                  U R                  -  nU=(       d-    U R                   R                  R                  U R                  -  nSnU R	                  UU
UUUUUUUUUUS9  Xl        UU l        UU l        SU l        Ub  [        U[        [        45      (       a  SnO3Ub!  [        U[        5      (       a  [        U5      nOUR                  S   nU R                  nU R                   nU(       dG  [#        U R$                  UUXx5      u  puU R'                  XWU
U5      u  puUSS R)                  UU-  5      nOD[+        XZ-  5      n[#        U R$                  UUXx5      u  nnXu* S nUSS R)                  UU-  5      nUc?  U R,                  R/                  XUS9nUR1                  SSSSS	5      nUR3                  UUS
9nU R                   R                  R4                  n U R7                  UUUU UU-  UUUUUUUS9nU R8                  b  U R8                  R;                  SS5      OSn!UR                  S   n"U R<                  (       a0  U R?                  UU"UUU R@                  UUUU!U RB                  S9
u  nnOgU RE                  UUUU R@                  UUUU!U RB                  S9	u  nnU R@                  (       a  [F        RH                  " UU/5      nURK                  U"SS9nUc  Ub"  U RM                  UUUUU-  U R@                  5      n#U RO                  X5      n$Uc  Ub  SW#0OSn%U RP                  (       a  U RR                  OSn&[U        U&5       GHm  n'U RP                  (       aB  U RW                  UU'UUUR                   U5      u  p[        U5      nU R'                  XWU
U5      u  pu[        U5      U l,        [        U5      XPR$                  RZ                  -  -
  n(U R]                  U RX                  S9 n)[_        U5       GH  u  n*n+U R`                  (       a  M  U R@                  (       a  [F        RH                  " U/S-  5      OUn,U R$                  Rc                  U,U+5      n,U R                  U,U+UU R8                  U%S9Rd                  n-U R@                  (       a  U-Rg                  S5      u  n.n/U.U	U/U.-
  -  -   n-U R$                  Rh                  " U-U+U40 U$D6Rj                  nUb\  0 n0U H  n1[m        5       U1   U0U1'   M     U" U U*U+U05      n2U2Ro                  SU5      nU2Ro                  SU5      nU2Ro                  SU5      nU*[        U5      S-
  :X  d)  U*S-   U(:  a0  U*S-   U R$                  RZ                  -  S:X  a  U)Rq                  5         [r        (       d  GM  [t        Rv                  " 5         GM     SSS5        GMp     US:X  a  UnO,U Ry                  UU5      n3U R,                  R{                  U3US9nU R}                  5         U(       d  U4$ [        US9$ ! , (       d  f       GM  = f)u+  
The call function to the pipeline for generation.

Args:
    video (`List[PipelineImageInput]`):
        The input video to condition the generation on. Must be a list of images/frames of the video.
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
    height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The height in pixels of the generated video.
    width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The width in pixels of the generated video.
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
        expense of slower inference.
    timesteps (`List[int]`, *optional*):
        Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
        in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
        passed will be used. Must be in descending order.
    sigmas (`List[float]`, *optional*):
        Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
        their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
        will be used.
    strength (`float`, *optional*, defaults to 0.8):
        Higher strength leads to more differences between original video and generated video.
    guidance_scale (`float`, *optional*, defaults to 7.5):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide what to not include in image generation. If not defined, you need to
        pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
    eta (`float`, *optional*, defaults to 0.0):
        Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
        applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
        `(batch_size, num_channel, num_frames, height, width)`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
        provided, text embeddings are generated from the `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
        not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
    ip_adapter_image: (`PipelineImageInput`, *optional*):
        Optional image input to work with IP Adapters.
    ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
        Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
        IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
        contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
        provided, embeddings are computed from the `ip_adapter_image` input argument.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
        [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
    callback_on_step_end (`Callable`, *optional*):
        A function that calls at the end of each denoising steps during the inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.
    decode_chunk_size (`int`, defaults to `16`):
        The number of frames to decode at a time when calling `decode_latents` method.

Examples:

Returns:
    [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
        returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
r+   )r   r   r   r   r   rU   rV   r   r4   r   r   r   FNr   )r   r   r&   r   r   r   )r   r   r   r   r   r   rv   r<   r.   r4   r   r   scale)
r   r   r<   r#  r   r   rU   rV   ri   rj   )rU   rV   ri   rj   )repeatsr   r   )total)encoder_hidden_statesr  added_cond_kwargsr4   rU   rV   latent)r   r$  )frames)@rZ   rd   sample_sizer]   r   r
  r  r  r  r`   rz   r{   r|   rI   r}   _execution_devicerv   rN   rJ   r   r   r   rf   preprocess_videor   r   in_channelsr  r  getfree_noise_enabled_encode_prompt_free_noiser   rj   r   r   r   r   r   r   free_init_enabled_free_init_num_itersr   _apply_free_initr  r   progress_barr   r  scale_model_inputr2   r   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   postprocess_videomaybe_free_model_hooksr,   )4rg   r   r   r   r   r;   r"  r=   r>   r  r   r   r#  r   r.   r4   rU   rV   r   r   r$  r%  r  rj   r&  r   r   r   r<   rv   latent_timestepdenoising_inference_stepsr   text_encoder_lora_scaler   r   r   r,  num_free_init_itersfree_init_iternum_warmup_stepsr9  r   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsvideo_tensors4                                                       r8   __call__(AnimateDiffVideoToVideoPipeline.__call__  s   h O499++77$:O:OOM))558M8MM ! 	+'#9-$;/Q 	 	
  .#'=$ *Vc4["A"AJJvt$<$<VJ&,,Q/J''

 '-? 3VY.*I .2-?-?@S`hjp-q*I'm22:@U3UVO(+,?,J(K%3E 96940I0 ""6"78I'm22:@U3UVO ?((99%V[9\EMM!Q1a0EHHF%H8E#yy//;;&&!5!$99$/- ' 
" ?C>Y>Y>eD''++GT:ko 	  ]]1%
""484R4R%&;,0,L,L /+'=2.. 5S 51M1 594F4F%00+'=2.. 5G 
51M1 // %		+A=*Q R);;JTU;VM '+B+N?? '2200L !::9J
  +/F/R \* 	 <@;Q;Qd77WX#$78N%%%)%:%:^-@&'--Yb&" '*)n#151C1CDWdlnt1u.	"%i.D"9~0CnnFZFZ0ZZ "")<)<"=%i0DAq~~  FJEeEeG9q=)Akr&)-)I)IJ\^_)`& "&*.;/3/J/J*; "+ " f  77=G=M=Ma=P:)?%6?]nKn9o%o
 #nn11*a^L]^jjG+7*,!CA17!OA. "D+?aO+\("2"6"6y'"J(8(<(<_m(\1A1E1EF^`v1w. C	NQ..AE=M3MSTWXSX\`\j\j\p\pRptuRu$++-$}Q 1 >= 9r ("E..w8IJL((::[f:gE 	##%8O(66q >=s   F"X<=X<<
Y	)r  r  r
  r  ry   r  r]   rf   )NN)NNNNNr   )r   )NNNNNNNN)N@   rT  r   r+   NNNNNr   F);__name__
__module____qualname____firstlineno____doc__model_cpu_offload_seq_optional_componentsr   r   r
   r   r   r   r   r   r   r   r   r   r   r   r	   r   r_   r   r   r   r   r   r   r   r   r   r   r   r   r   rv   r<   	Generatorr   boolr  propertyr  rj   r   r  r  r  no_gradr   rz   r   r   r   rR  __static_attributes____classcell__)rH   s   @r8   rP   rP      s1   8 EST" 157;!!V!V $!V !	!V
 (/9:!V &!V  "+')
!V .!V  5!!V !VR 049=&*#'t5  -t5 !) 6t5 UOt5 C=t5n52+'Z" "U\\ " &!. # $+/FP8 )-$%"&'+)-MQ*.!#P%P P 	P
 "P P 3-P $P &P E%//43H"HIJP %,,'P P P 
Pd $ $   ( ( , , # #   ]]_ 1526 $##%(-)-(, #;?/0MQ*.049=9=@D%* ;?#'KO9B!#7~7D+,-~7 sDI~./~7 	~7
 }~7 !~7 "&~7 DI&~7 e%~7 ~7 ~7 "%T#Y"78~7  (}~7 ~7 E%//43H"HIJ~7  %,,'!~7"  -#~7$ !) 6%~7& ##56'~7( "*$u||*<!=)~7* c]+~7, -~7. !)c3h 8/~70 C=1~72 'xc40@$0F'GH3~74 -1I5~76 7~7 ~7r:   rP   )Nr2   )NNNN)JrC   typingr   r   r   r   r   r   r   transformersr	   r
   r   r   image_processorr   loadersr   r   r   r   modelsr   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   r   r   r   r   r   utilsr   r    r!   r"   r#   utils.torch_utilsr$   rf   r%   free_init_utilsr'   free_noise_utilsr(   pipeline_utilsr)   r*   pipeline_outputr,   torch_xla.core.xla_modelcore	xla_modelr@  r?  
get_loggerrU  r   EXAMPLE_DOC_STRINGr   r\  rz   r9   r   r<   r   rN   rP   r@   r:   r8   <module>ru     sC    = =  h h 1 w w [ [ 9 ;  o n - - + 8 D 6 ))MM			H	%3 p ck
TLL
T-5eoo-F
T\_
T  *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*vt7"t7r:   