
    +h                     8   S SK r S SKJrJrJrJrJrJr  S SKr	S SK
r
S SKJrJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'  \" 5       (       a  S SK(J)s  J*r+  Sr,OSr,\RZ                  " \.5      r/Sr0    S&S\1S\1S\2S\24S jjr3    S'S\\1   S\\\4\
Rj                  4      S\\\1      S\\\2      4S jjr6 S(S\
Rn                  S \\
Rp                     S!\44S" jjr9S)S# jr: " S$ S%\%\\5      r;g)*    N)AnyCallableDictListOptionalUnion)T5EncoderModelT5TokenizerFast   )MultiPipelineCallbacksPipelineCallback)PipelineImageInput)FromSingleFileMixinLTXVideoLoraLoaderMixin)AutoencoderKLLTXVideo)LTXVideoTransformer3DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )LTXPipelineOutputTFaS  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTXImageToVideoPipeline
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = LTXImageToVideoPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
        ... )
        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. Flames engulf the structure, with smoke billowing into the air. Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> video = pipe(
        ...     image=image,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=704,
        ...     height=480,
        ...     num_frames=161,
        ...     num_inference_steps=50,
        ... ).frames[0]
        >>> export_to_video(video, "output.mp4", fps=24)
        ```
base_seq_lenmax_seq_len
base_shift	max_shiftc                 4    XC-
  X!-
  -  nX5U-  -
  nX-  U-   nU$ N )image_seq_lenr   r   r   r    mbmus           j/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/ltx/pipeline_ltx_image2video.pycalculate_shiftr)   K   s3     
	K$>?A%%A		Q	BI    num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`List[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`List[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr-   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r-   r,   r.   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r.   r,   r,   r#   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r-   len)	schedulerr+   r,   r-   r.   kwargsaccepts_timestepsaccept_sigmass           r(   retrieve_timestepsr=   Y   s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r*   encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrrB   rC   moderE   AttributeError)r>   r?   r@   s      r(   retrieve_latentsrI      s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSSr*   c                     UR                  [        [        SUR                  5      5      SS9nU R                  [        [        SU R                  5      5      SS9nXU-  -  nX%-  SU-
  U -  -   n U $ )a  
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891).

Args:
    noise_cfg (`torch.Tensor`):
        The predicted noise tensor for the guided diffusion process.
    noise_pred_text (`torch.Tensor`):
        The predicted noise tensor for the text-guided diffusion process.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        A rescale factor applied to the noise predictions.

Returns:
    noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
r   T)dimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaleds         r(   rescale_noise_cfgrW      s{    " ""tE!_5I5I,J'KUY"ZHmmU1inn%= >mMG#''9: 6!>N:NR[9[[Ir*   c            7         ^  \ rS rSrSrSr/ r/ SQrS\S\	S\
S\S	\4
U 4S
 jjr     SMS\\\\   4   S\S\S\\R(                     S\\R*                     4
S jjr          SNS\\\\   4   S\\\\\   4      S\S\S\\R0                     S\\R0                     S\\R0                     S\\R0                     S\S\\R(                     S\\R*                     4S jjr     SOS jr\SPS\R0                  S\S\S \R0                  4S! jj5       r\ SPS\R0                  S"\S#\S$\S\S\S \R0                  4S% jj5       r\ SQS\R0                  S&\R0                  S'\R0                  S(\S \R0                  4
S) jj5       r\ SQS\R0                  S&\R0                  S'\R0                  S(\S \R0                  4
S* jj5       r           SRS.\\R0                     S/\S0\S#\S$\S"\S\\R*                     S\\R(                     S1\\RB                     S\\R0                     S \R0                  4S2 jjr"\#S3 5       r$\#S4 5       r%\#S5 5       r&\#S6 5       r'\#S7 5       r(\#S8 5       r)\#S9 5       r*\RV                  " 5       \," \-5      SSSS+S,S-S:S;SS<S=SSSSSSSS=SS>SSSS/S4S.\.S\\\\   4   S\\\\\   4      S#\S$\S"\S?\S@\SA\\   SB\SC\S\\   S1\\\RB                  \\RB                     4      S\\R0                     S\\R0                     S\\R0                     S\\R0                     S\\R0                     SD\\\\   4   SE\\\\\   4      SF\\   SG\SH\\/\\04      SI\\1\\\//S4      SJ\\   S\44SK jj5       5       r2SLr3U =r4$ )SLTXImageToVideoPipeline   ag  
Pipeline for image-to-video generation.

Reference: https://github.com/Lightricks/LTX-Video

Args:
    transformer ([`LTXVideoTransformer3DModel`]):
        Conditional Transformer architecture to denoise the encoded video latents.
    scheduler ([`FlowMatchEulerDiscreteScheduler`]):
        A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    vae ([`AutoencoderKLLTXVideo`]):
        Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
    text_encoder ([`T5EncoderModel`]):
        [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
        the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
    tokenizer (`CLIPTokenizer`):
        Tokenizer of class
        [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
    tokenizer (`T5TokenizerFast`):
        Second Tokenizer of class
        [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
ztext_encoder->transformer->vae)rE   prompt_embedsnegative_prompt_embedsr9   vaetext_encoder	tokenizertransformerc                 v  > [         TU ]  5         U R                  UUUUUS9  [        U SS 5      b  U R                  R
                  OSU l        [        U SS 5      b  U R                  R                  OSU l        [        U SS 5      b   U R                  R                  R                  OSU l        [        U S5      b   U R                  R                  R                  OSU l        [        U R                  S9U l        [        U SS 5      b  U R"                  R$                  OS	U l        S
U l        SU l        SU l        g )N)r]   r^   r_   r`   r9   r]          r`   r   )vae_scale_factorr_           y   )super__init__register_modulesgetattrr]   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior`   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizer   video_processorr_   model_max_lengthtokenizer_max_lengthdefault_heightdefault_widthdefault_frames)selfr9   r]   r^   r_   r`   r7   s         r(   rj    LTXImageToVideoPipeline.__init__   s5    	%# 	 	
 3:$t2L2XDHH..^` 	* 4;43M3YDHH//_` 	+ 3:$t2T2`D##..fg 	+ 5<D-4P4\D##00bc 	,  .t?a?ab/6t[$/O/[DNN++ad 	! " !r*   Nr   re   promptnum_videos_per_promptmax_sequence_lengthr,   dtypec           	         U=(       d    U R                   nU=(       d    U R                  R                  n[        U[        5      (       a  U/OUn[        U5      nU R                  USUSSSS9nUR                  nUR                  n	U	R                  5       R                  U5      n	U R                  USSS9R                  n
U
R                  S   UR                  S   :  a]  [        R                  " X5      (       dB  U R                  R                  U
S S 2US-
  S24   5      n[        R!                  S	U S
U 35        U R                  UR                  U5      5      S   nUR                  XTS9nUR                  u  pnUR#                  SUS5      nUR%                  Xb-  US5      nU	R%                  US5      n	U	R#                  US5      n	X4$ )N
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensorslongest)r   r   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )r   r,   )_execution_devicer^   r   
isinstancestrr8   r_   	input_idsattention_maskbooltoshapetorchequalbatch_decodeloggerwarningrepeatview)r|   r~   r   r   r,   r   
batch_sizetext_inputstext_input_idsprompt_attention_maskuntruncated_idsremoved_textr[   _seq_lens                  r(   _get_t5_prompt_embeds-LTXImageToVideoPipeline._get_t5_prompt_embeds  s    14110**00'44&&[
nn *# % 
 %.. + : : 5 : : < ? ? G..SW.Xbb  $(<(<R(@@UcIuIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 )).*;*;F*CDQG%((u(D &++A%,,Q0EqI%**:+MwXZ[ 5 : ::r J 5 < <=RTU V33r*   Tnegative_promptdo_classifier_free_guidancer[   r\   r   negative_prompt_attention_maskc                 <   U
=(       d    U R                   n
[        U[        5      (       a  U/OUnUb  [        U5      nOUR                  S   nUc  U R                  UUU	U
US9u  pWU(       a  Uc  U=(       d    Sn[        U[        5      (       a  X/-  OUnUb;  [        U5      [        U5      La$  [        S[        U5       S[        U5       S35      eU[        U5      :w  a!  [        SU S[        U5       S	U SU S
3	5      eU R                  UUU	U
US9u  phXWXh4$ )ab  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
        Whether to use classifier free guidance or not.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    device: (`torch.device`, *optional*):
        torch device
    dtype: (`torch.dtype`, *optional*):
        torch dtype
r   )r~   r   r   r,   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r   r   r   r8   r   r   type	TypeErrorr0   )r|   r~   r   r   r   r[   r\   r   r   r   r,   r   r   s                r(   encode_prompt%LTXImageToVideoPipeline.encode_prompt2  sy   P 1411'44&&VJ&,,Q/J 373M3M&;$7 4N 40M '+A+I-3O@J?\_@`@`j+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  FJE_E_&&;$7 F` FB" 5Kkkr*   c	           
      :  ^  US-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        ST R                   SU V	s/ s H  oT R                  ;  d  M  U	PM     sn	 35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUb  Uc  [        S5      eUb  Uc  [        S5      eUb  Ub  UR                  UR                  :w  a&  [        SUR                   SUR                   S35      eUR                  UR                  :w  a&  [        SUR                   SUR                   S35      eg g g s  sn	f )Nrb   r   z8`height` and `width` have to be divisible by 32 but are z and r   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fr"   )_callback_tensor_inputs).0kr|   s     r(   	<genexpr>7LTXImageToVideoPipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but got: `prompt_attention_mask` z% != `negative_prompt_attention_mask` )r0   allr   r   r   rN   r   r   )
r|   r~   heightwidth"callback_on_step_end_tensor_inputsr[   r\   r   r   r   s
   `         r(   check_inputs$LTXImageToVideoPipeline.check_inputs  sK    B;!urzQWX^W__dejdkklmnn-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa$)>)Fdee!-2P2Xvww$)?)K""&<&B&BB --:-@-@,A B.445Q8 
 %**.L.R.RR 55J5P5P4Q R6<<=Q@  S *L$+ pHs   F1FrE   rr   rt   returnc           
          U R                   u  p4pVnXR-  nXa-  n	Xq-  n
U R                  USUUU	UU
U5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      n U $ )
Nr   r   r         r   r         )r   reshapepermuteflatten)rE   rr   rt   r   num_channels
num_framesr   r   post_patch_num_framespost_patch_heightpost_patch_widths              r(   _pack_latents%LTXImageToVideoPipeline._pack_latents  s     ?Fmm;
*e * :"0 .//!	
 //!Q1aAq9AA!QGOOPQSTUr*   r   r   r   c           
          U R                  S5      nU R                  XaX#SXTU5      n U R                  SSSSSSSS	5      R                  SS	5      R                  SS5      R                  SS5      n U $ )
Nr   r   r   r   r   r   r   r   r   )sizer   r   r   )rE   r   r   r   rr   rt   r   s          r(   _unpack_latents'LTXImageToVideoPipeline._unpack_latents  st     \\!_
//*&\gqr//!Q1aAq9AA!QGOOPQSTU]]^_abcr*   latents_meanlatents_stdscaling_factorc                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-
  U-  U-  n U $ Nr   r   r   r   r,   r   rE   r   r   r   s       r(   _normalize_latents*LTXImageToVideoPipeline._normalize_latents  su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X)^;kIr*   c                     UR                  SSSSS5      R                  U R                  U R                  5      nUR                  SSSSS5      R                  U R                  U R                  5      nX-  U-  U-   n U $ r   r   r   s       r(   _denormalize_latents,LTXImageToVideoPipeline._denormalize_latents  su     $((B1a8;;GNNGMMZ!&&q"aA699'..'--X'.8<Gr*   rf   rg      imager   num_channels_latentsr?   c           
          X@R                   -  nXPR                   -  nUS-
  U R                  -  S-   nX#XdU4nUSXdU4nU
b  U
R                  U5      nSUS S 2S S 2S4'   U R                  XR                  U R
                  5      R                  S5      nU
R                  S:w  d  U
R                  S S UR                  :w  a*  [        SU
R                   SUR                  U4-    S	35      eU
R                  XS
9U4$ [        U	[        5      (       a  [        U	5      U:w  a  [        S[        U	5       SU S35      e[        U5       Vs/ s HJ  n[        U R                   R#                  X   R%                  S5      R%                  S5      5      X   5      PML     nnOSU Vs/ s HF  n[        U R                   R#                  UR%                  S5      R%                  S5      5      U	5      PMH     nn[&        R(                  " USS9R                  U5      nU R+                  XR                   R,                  U R                   R.                  5      nUR1                  SSUSS5      n[&        R2                  " XUS
9nSUS S 2S S 2S4'   [5        XXS9nX-  USU-
  -  -   n
U R                  XR                  U R
                  5      R                  S5      nU R                  XR                  U R
                  5      n
X4$ s  snf s  snf )Nr         ?r   r   r   r   z$Provided `latents` tensor has shape z, but the expected shape is r   r,   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.rK   r?   r,   r   )rn   rp   	new_zerosr   rs   ru   squeezerP   r   r0   r   r   rN   r8   rO   rI   r]   encode	unsqueezer   catr   r   r   r   zerosr   )r|   r   r   r   r   r   r   r   r,   r?   rE   r   
mask_shapeconditioning_maskiinit_latentsimgnoises                     r(   prepare_latents'LTXImageToVideoPipeline.prepare_latents  s*    ===;;; 1n)L)LLqP
:uM !Z?
 ' 1 1* =),aAg& $ 2 2!#F#FHlHl!gbk  ||q GMM"1$59J9P9P$P :7==/Iefwf}f}  BV  AX  gX  fY  YZ  [  ::V:9;LLLi&&9~+ Ec)nEU V  *|+km  z**A !1C1CA1F1P1PQR1S!TV_Vbc*  L hmgl`c q1A1K1KA1N!OQZ[gl   yy1588?..|XX=R=RTXT\T\ThThi#**1aQB!KK
O%(!Q'"UT2UaBS>S5TT ..BBDDhDh

'"+ 	 $$88$:^:^
 ))3
s   AK6AK;c                     U R                   $ r"   _guidance_scaler|   s    r(   guidance_scale&LTXImageToVideoPipeline.guidance_scale8  s    ###r*   c                     U R                   $ r"   )_guidance_rescaler   s    r(   rS   (LTXImageToVideoPipeline.guidance_rescale<      %%%r*   c                      U R                   S:  $ )Nr   r   r   s    r(   r   3LTXImageToVideoPipeline.do_classifier_free_guidance@  s    ##c))r*   c                     U R                   $ r"   )_num_timestepsr   s    r(   num_timesteps%LTXImageToVideoPipeline.num_timestepsD  s    """r*   c                     U R                   $ r"   )_current_timestepr   s    r(   current_timestep(LTXImageToVideoPipeline.current_timestepH  r   r*   c                     U R                   $ r"   )_attention_kwargsr   s    r(   attention_kwargs(LTXImageToVideoPipeline.attention_kwargsL  r   r*   c                     U R                   $ r"   )
_interruptr   s    r(   	interrupt!LTXImageToVideoPipeline.interruptP  s    r*      2   r           pil
frame_rater+   r-   r   rS   decode_timestepdecode_noise_scaleoutput_typereturn_dictr  callback_on_step_endr   c                    [        U[        [        45      (       a  UR                  nU R	                  UUUUUUUUS9  Xl        Xl        UU l        SU l        SU l	        Ub  [        U[        5      (       a  SnO3Ub!  [        U[        5      (       a  [        U5      nOUR                  S   nU R                  nU R                  UUU R                   UUUUUUUS9
u  nnnnU R                   (       a.  ["        R$                  " UU/SS9n["        R$                  " UU/SS9nUc4  U R&                  R)                  XUS9nUR+                  UUR,                  S	9nU R.                  R0                  R2                  nU R5                  UUU-  UUUU["        R6                  UUU5
      u  nnU R                   (       a  ["        R$                  " UU/5      nUS-
  U R8                  -  S-   nX@R:                  -  n XPR:                  -  n!UU -  U!-  n"[<        R>                  " S
SU-  U5      n#[A        U"U RB                  R0                  RE                  SS5      U RB                  R0                  RE                  SS5      U RB                  R0                  RE                  SS5      U RB                  R0                  RE                  SS5      5      n$[G        U RB                  UUU	U#U$S9u  p[I        [        U	5      XRB                  RJ                  -  -
  S5      n%[        U	5      U l&        U R8                  U-  U R:                  U R:                  4n&U RO                  US9 n'[Q        U	5       GH  u  n(n)U RR                  (       a  M  U)U l	        U R                   (       a  ["        R$                  " U/S-  5      OUn*U*R+                  UR,                  5      n*U)RU                  U*R                  S   5      n+U+RW                  S5      SU-
  -  n+U R.                  RY                  S5         U R/                  U*UU+UUU U!U&USS9
S   n,SSS5        W,R[                  5       n,U R                   (       ab  U,R]                  S5      u  n-n.U-U R^                  U.U--
  -  -   n,U+R]                  S5      u  n+n/U R`                  S:  a  [c        U,U.U R`                  S9n,U Re                  U,UU U!U Rf                  U Rh                  5      n,U Re                  UUU U!U Rf                  U Rh                  5      nU,SS2SS2SS24   n,USS2SS2SS24   n0U RB                  Rk                  U,U)U0SS9S   n1["        R$                  " USS2SS2SS24   U1/SS9nU Rm                  XRf                  U Rh                  5      nUbJ  0 n2U H  n3[o        5       U3   U2U3'   M     U" U U(U)U25      n4U4Rq                  SU5      nU4Rq                  SU5      nU([        U	5      S-
  :X  d)  U(S-   U%:  a0  U(S-   U RB                  RJ                  -  S:X  a  U'Rs                  5         [t        (       d  GM  [v        Rx                  " 5         GM     SSS5        US:X  a  Un5GOU Re                  UUU U!U Rf                  U Rh                  5      nU R{                  XR|                  R~                  U R|                  R                  U R|                  R0                  R                  5      nUR+                  UR,                  5      nU R|                  R0                  R                  (       d  Sn+O["        R                  " UR                  UUUR,                  S9n6[        U[        5      (       d  U/U-  nUc  UnO[        U[        5      (       d  U/U-  n["        R                  " UUUR,                  S	9n+["        R                  " UUUR,                  S	9SS2SSSS4   nSU-
  U-  UU6-  -   nU R|                  R                  UU+SS9S   n5U R&                  R                  U5US9n5U R                  5         U(       d  U54$ [        U5S 9$ ! , (       d  f       GN= f! , (       d  f       GN= f)!u  
Function invoked when calling the pipeline for generation.

Args:
    image (`PipelineImageInput`):
        The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
        instead.
    height (`int`, defaults to `512`):
        The height in pixels of the generated image. This is set to 480 by default for the best results.
    width (`int`, defaults to `704`):
        The width in pixels of the generated image. This is set to 848 by default for the best results.
    num_frames (`int`, defaults to `161`):
        The number of video frames to generate
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    timesteps (`List[int]`, *optional*):
        Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
        in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
        passed will be used. Must be in descending order.
    guidance_scale (`float`, defaults to `3 `):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    guidance_rescale (`float`, *optional*, defaults to 0.0):
        Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
        Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
        [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
        Guidance rescale factor should fix overexposure when using zero terminal SNR.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        The number of videos to generate per prompt.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will be generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    prompt_attention_mask (`torch.Tensor`, *optional*):
        Pre-generated attention mask for text embeddings.
    negative_prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
        provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
    negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
        Pre-generated attention mask for negative text embeddings.
    decode_timestep (`float`, defaults to `0.0`):
        The timestep at which generated video is decoded.
    decode_noise_scale (`float`, defaults to `None`):
        The interpolation factor between random noise and denoised latents at the decode timestep.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated image. Choose between
        [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ltx.LTXPipelineOutput`] instead of a plain tuple.
    attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    callback_on_step_end (`Callable`, *optional*):
        A function that is called at the end of each denoising step during inference. The function is called
        with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
        callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
        `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.
    max_sequence_length (`int`, *optional*, defaults to `128`):
        Maximum sequence length to use with the `prompt`.

Examples:

Returns:
    [`~pipelines.ltx.LTXPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ltx.LTXPipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated images.
)r~   r   r   r   r[   r\   r   r   FNr   r   )
r~   r   r   r   r[   r\   r   r   r   r,   r   )r   r   r   r   base_image_seq_len   max_image_seq_len   r         ?r    ffffff?)r.   r'   )totalr   r   cond_uncond)
hidden_statesencoder_hidden_statestimestepencoder_attention_maskr   r   r   rope_interpolation_scaler  r  )rS   )r  rE   r[   latentr   )r  )frames)Ir   r   r   tensor_inputsr   r   r   r  r
  r  r   rN   r8   r   r   r   r   r   r   rv   
preprocessr   r   r`   rq   in_channelsr   float32rp   rn   nplinspacer)   r9   getr=   maxorderr   progress_bar	enumerater  expandr   cache_contextfloatchunkr   rS   rW   r   rs   ru   stepr   localspopupdateXLA_AVAILABLExm	mark_stepr   r]   r   r   r   timestep_conditioningrandntensordecodepostprocess_videomaybe_free_model_hooksr   )7r|   r   r~   r   r   r   r   r  r+   r-   r   rS   r   r?   rE   r[   r   r\   r   r  r  r  r  r  r  r   r   r   r,   r   r   latent_num_frameslatent_heightlatent_widthvideo_sequence_lengthr.   r'   num_warmup_stepsr$  r0  r   tlatent_model_inputr"  
noise_prednoise_pred_uncondrR   r   noise_latentspred_latentscallback_kwargsr   callback_outputsvideor   s7                                                          r(   __call__ LTXImageToVideoPipeline.__call__T  s[   h *-=?U,VWW1E1S1S. 	/Q'#9"7+I 	 		
  .!1!1!% *VS"9"9JJvt$<$<VJ&,,Q/J'' +(,(H(H"7'#9"7+I 3  
	
!"* ++!II'=}&MSTUM$)II/MOd.ekl$m! ?((33EPU3VEHHF-2E2EHFE#//66BB%)%9%9.. MM&
"" ++ %		+<>O*P Q (!^0S0SSVWW"D"DD B BB 1M AL PS!&9"9;NO!NN!!%%&:C@NN!!%%&94@NN!!%%lC8NN!!%%k48
 *<NN*
&	 s9~0CnnFZFZ0ZZ\]^!)n //*<....$
  %89\!),1>>)*&AEAaAaUYYy1}%=gn"%7%:%:=;N;N%O" 88$6$<$<Q$?@#--b1Q9J5JK%%33MB!%!1!1&8.;!)/D#4,*1I)9$) "2 " "J C (--/
339C9I9I!9L6%!2T5H5HO^oLo5p!pJ"*.."3KHa,,q0%6&$J_J_&

 "11%! 7788
 ..%! 7788 (1ab1
 '1ab 1#~~22:q-]b2cdef))WQ2A2X%6$E1M,,@@$BfBf (3&(O?-3Xa[* @';D!Q'X$.229gFG$4$8$8-$XM I**A9I/IqSTuX\XfXfXlXlNlpqNq '') =LLNa - :f ("E**!3344G //..0D0DdhhooFdFdG jj!4!45G88??88GMMYv]d]j]jk!/488'6&7*&DO%-)8&#$6==*<)=
)J& <<gmm\%*\\2DV[b[h[h%itT4-&" 11W<?QTY?YYHHOOGX5OI!LE((::5k:ZE 	##%8O .._ CB :9s,   Caa.Ha7a
aa
a$)r  r  r   r   r
  r   r{   ry   rz   rx   rs   ru   rn   rp   rv   )Nr   re   NN)
NTr   NNNNre   NN)NNNNN)r   r   )r   )
Nr   re   rf   rg   r   NNNN)5__name__
__module____qualname____firstlineno____doc__model_cpu_offload_seq_optional_componentsr   r   r   r	   r
   r   rj   r   r   r   intr   r   r,   r   r   r   Tensorr   r   staticmethodr   r   r4  r   r   	Generatorr   propertyr   rS   r   r   r  r  r  no_gradr   EXAMPLE_DOC_STRINGr   r   r   r   rQ  __static_attributes____classcell__)r7   s   @r(   rY   rY      s   . =T&"2&" #&" %	&"
 #&" 0&"T )-%&#&)-'+.4c49n%.4  #.4 !	.4
 &.4 $.4h <@,0%&049=8<AE#&)-'+Qlc49n%Ql "%T#Y"78Ql &*	Ql
  #Ql  -Ql !) 6Ql  (5Ql )1(>Ql !Ql &Ql $Qlr ,0#"'+3j u||  PS \a\h\h  ,  st		+.	8;	DG	UX	lo			 	  or-2\\HMfk	   or-2\\HMfk	  )-$''+)-/3*.@*%@* @* "	@*
 @* @* @* $@* &@* EOO,@* %,,'@* 
@*D $ $ & & * * # # & & & &   ]]_12 %)(,;?#%# !"%/0MQ*.048<9=AE58BF%* 59KO9B#&7Z/!Z/ c49n%Z/ "%T#Y"78	Z/
 Z/ Z/ Z/ Z/ !Z/ 9Z/ Z/  Z/  (}Z/ E%//43H"HIJZ/ %,,'Z/   -!Z/"  (5#Z/$ !) 6%Z/& )1(>'Z/( ud5k12)Z/* %U5$u++=%>?+Z/, c]-Z/. /Z/0 #4S>21Z/2 'xc40@$0F'GH3Z/4 -1I5Z/6 !7Z/ 3 Z/r*   rY   )r  r  r  r  )NNNN)NrC   )r  )<r2   typingr   r   r   r   r   r   numpyr+  r   transformersr	   r
   	callbacksr   r   image_processorr   loadersr   r   models.autoencodersr   models.transformersr   
schedulersr   utilsr   r   r   utils.torch_utilsr   rv   r   pipeline_utilsr   pipeline_outputr   torch_xla.core.xla_modelcore	xla_modelr;  r:  
get_loggerrS  r   r`  rZ  r4  r)   r   r,   r=   r[  r]  rI   rW   rY   r#   r*   r(   <module>rt     s\    = =   8 A 1 C 8 = 9 O O - - . . ))MM			H	% B 

 
 	

 
  *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*z ck
TLL
T-5eoo-F
T\_
T4s//1DF] s/r*   