
    +h/                     &   S SK r S SKJrJrJrJrJr  S SKrS SK	r	S SK
JrJr  SSKJrJr  SSKJr  SSKJrJr  SSKJr  SS	KJrJrJrJr  SS
KJr  SSKJr  SSK J!r!  SSK"J#r#  \" 5       (       a  S SK$J%r%  O
 " S S5      r%\" 5       (       a  S SK&J's  J(r)  Sr*OSr*\RV                  " \,5      r-Sr.    S!S\\/   S\\\0\	Rb                  4      S\\\/      S\\\2      4S jjr3 S"S\	Rh                  S\\	Rj                     S\04S jjr6 " S S \!5      r7g)#    N)CallableDictListOptionalUnion)T5EncoderModelT5TokenizerFast   )MultiPipelineCallbacksPipelineCallback)PipelineImageInput)AutoencoderKLCosmosCosmosTransformer3DModel)EDMEulerScheduler)is_cosmos_guardrail_availableis_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )CosmosPipelineOutput)CosmosSafetyCheckerc                       \ rS rSrS rSrg)r   %   c                     [        S5      e)Nz|`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`.)ImportError)selfargskwargss      p/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/cosmos/pipeline_cosmos_video2world.py__init__CosmosSafetyChecker.__init__&   s     O      N)__name__
__module____qualname____firstlineno__r$   __static_attributes__r'   r&   r#   r   r   %   s    	r&   r   TFaA	  
    Examples:
        Image conditioning:

        ```python
        >>> import torch
        >>> from diffusers import CosmosVideoToWorldPipeline
        >>> from diffusers.utils import export_to_video, load_image

        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
        >>> pipe.to("cuda")

        >>> prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day."
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
        ... )

        >>> video = pipe(image=image, prompt=prompt).frames[0]
        >>> export_to_video(video, "output.mp4", fps=30)
        ```

        Video conditioning:

        ```python
        >>> import torch
        >>> from diffusers import CosmosVideoToWorldPipeline
        >>> from diffusers.utils import export_to_video, load_video

        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
        >>> pipe.transformer = torch.compile(pipe.transformer)
        >>> pipe.to("cuda")

        >>> prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
        ... )[
        ...     :21
        ... ]  # This example uses only the first 21 frames

        >>> video = pipe(video=video, prompt=prompt).frames[0]
        >>> export_to_video(video, "output.mp4", fps=30)
        ```
num_inference_stepsdevice	timestepssigmasc                    Ub  Ub  [        S5      eUb  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
X2S.UD6  U R                  n[        U5      nX14$ Ub  S[        [        R                  " U R                  5      R
                  R                  5       5      ;   nU(       d  [        SU R                   S35      eU R                  " S
XBS.UD6  U R                  n[        U5      nX14$ U R                  " U4S	U0UD6  U R                  nX14$ )a  
Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

Args:
    scheduler (`SchedulerMixin`):
        The scheduler to get timesteps from.
    num_inference_steps (`int`):
        The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
        must be `None`.
    device (`str` or `torch.device`, *optional*):
        The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
    timesteps (`List[int]`, *optional*):
        Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
        `num_inference_steps` and `sigmas` must be `None`.
    sigmas (`List[float]`, *optional*):
        Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
        `num_inference_steps` and `timesteps` must be `None`.

Returns:
    `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
    second element is the number of inference steps.
zYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr/   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r/   r.   r0   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r0   r.   r.   r'   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r/   len)	schedulerr-   r.   r/   r0   r"   accepts_timestepsaccept_sigmass           r#   retrieve_timestepsr>   f   s}   > !3tuu'3w/@/@AXAX/Y/d/d/i/i/k+ll .y/B/B.C Da b  	M)MfM''	!)n )) 
	 C(9(9):Q:Q(R(](](b(b(d$ee.y/B/B.C D_ `  	GvGG''	!)n )) 	 3MFMfM''	))r&   encoder_output	generatorsample_modec                    [        U S5      (       a!  US:X  a  U R                  R                  U5      $ [        U S5      (       a   US:X  a  U R                  R                  5       $ [        U S5      (       a  U R                  $ [        S5      e)Nlatent_distsampleargmaxlatentsz3Could not access latents of provided encoder_output)hasattrrC   rD   moderF   AttributeError)r?   r@   rA   s      r#   retrieve_latentsrJ      s}     ~}--+2I))00;;		/	/K84K))..00		+	+%%%RSSr&   c            0         ^  \ rS rSrSrSr/ SQrS/r SAS\S\	S	\
S
\S\S\4U 4S jjjr    SBS\\\\   4   S\S\\R*                     S\\R,                     4S jjr        SCS\\\\   4   S\\\\\   4      S\S\S\\R2                     S\\R2                     S\S\\R*                     S\\R,                     4S jjr         SDS\R2                  S \S!S"S#\S$\S%\S\S&\S\\R,                     S\\R*                     S'\\\R6                  \\R6                     4      S(\\R2                     S)\R2                  4S* jjr    SES+ jr\S, 5       r\S- 5       r \S. 5       r!\S/ 5       r"\S0 5       r#\RH                  " 5       \%" \&5      SSSSSSSS1S2SS3S4SSSSSS5SSS(/S4S6\'S\\'   S\\\\   4   S\\\\\   4      S#\S$\S%\S7\S8\(S&\S9\(S:\S\\   S'\\\R6                  \\R6                     4      S(\\R2                     S\\R2                     S\\R2                     S;\\   S<\S=\\\)\\\*/S4   \+\,4      S>\\   S\4,S? jj5       5       r-S@r.U =r/$ )FCosmosVideoToWorldPipeline   a  
Pipeline for image-to-world and video-to-world generation using [Cosmos
Predict-1](https://github.com/nvidia-cosmos/cosmos-predict1).

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Args:
    text_encoder ([`T5EncoderModel`]):
        Frozen text-encoder. Cosmos uses
        [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
        [t5-11b](https://huggingface.co/google-t5/t5-11b) variant.
    tokenizer (`T5TokenizerFast`):
        Tokenizer of class
        [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
    transformer ([`CosmosTransformer3DModel`]):
        Conditional Transformer to denoise the encoded image latents.
    scheduler ([`FlowMatchEulerDiscreteScheduler`]):
        A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
    vae ([`AutoencoderKLCosmos`]):
        Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
ztext_encoder->transformer->vae)rF   prompt_embedsnegative_prompt_embedssafety_checkerNtext_encoder	tokenizertransformervaer;   c           	      x  > [         TU ]  5         Uc
  [        5       nU R                  UUUUUUS9  [	        U SS 5      (       a   U R
                  R                  R                  OSU l        [	        U SS 5      (       a   U R
                  R                  R                  OSU l
        [        U R                  S9U l        g )N)rT   rQ   rR   rS   r;   rP   rT      )vae_scale_factor)superr$   r   register_modulesgetattrrT   configtemporal_compression_ratiovae_scale_factor_temporalspatial_compression_ratiovae_scale_factor_spatialr   video_processor)r    rQ   rR   rS   rT   r;   rP   r9   s          r#   r$   #CosmosVideoToWorldPipeline.__init__   s     	!02N%#) 	 	
 ;B$t:T:TDHHOO66Z[ 	& V]]achjnUoUo(Q(Quv%-t?\?\]r&      promptmax_sequence_lengthr.   dtypec           
      H   U=(       d    U R                   nU=(       d    U R                  R                  n[        U[        5      (       a  U/OUnU R                  USUSSSSS9nUR                  nUR                  R                  5       R                  U5      nU R                  USSS9R                  nUR                  S   UR                  S   :  a]  [        R                  " Xh5      (       dB  U R
                  R                  US S 2US	-
  S24   5      n	[        R                  S
U SU	 35        U R                  UR                  U5      US9R                   n
U
R                  XCS9n
UR#                  S	S9R%                  5       n['        U5       H  u  pSXUS 24'   M     U
$ )N
max_lengthTptF)paddingrg   
truncationreturn_tensorsreturn_lengthreturn_offsets_mappinglongest)ri   rk   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: )attention_mask)re   r.   dimr   )_execution_devicerQ   re   
isinstancestrrR   	input_idsrp   booltoshapetorchequalbatch_decodeloggerwarninglast_hidden_statesumcpu	enumerate)r    rc   rd   r.   re   text_inputstext_input_idsprompt_attention_maskuntruncated_idsremoved_textrN   lengthsilengths                 r#   _get_t5_prompt_embeds0CosmosVideoToWorldPipeline._get_t5_prompt_embeds   s    14110**00'44&&nn *#( % 
 %.. + : : ? ? A D DV L..SW.Xbb  $(<(<R(@@UcIuIu>>66qJ]`aJadfJfGf7ghLNN'(	,A
 ))f%6K * 


 	 &((u(D'+++2668"7+IA()MVW*% , r&   Tr   negative_promptdo_classifier_free_guidancenum_videos_per_promptrN   rO   c
                    U=(       d    U R                   n[        U[        5      (       a  U/OUnUb  [        U5      n
OUR                  S   n
UcG  U R                  XXS9nUR                  u  pnUR                  SUS5      nUR                  X-  US5      nU(       a  Uc  U=(       d    Sn[        U[        5      (       a  X/-  OUnUb;  [        U5      [        U5      La$  [        S[        U5       S[        U5       S35      eU
[        U5      :w  a!  [        S	U S
[        U5       SU S
U
 S3	5      eU R                  X'XS9nUR                  u  pnUR                  SUS5      nUR                  X-  US5      nXV4$ )ab  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
        Whether to use classifier free guidance or not.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    device: (`torch.device`, *optional*):
        torch device
    dtype: (`torch.dtype`, *optional*):
        torch dtype
r   )rc   rd   r.   re   r   ro    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rs   rt   ru   r:   ry   r   repeatviewtype	TypeErrorr2   )r    rc   r   r   r   rN   rO   rd   r.   re   
batch_size_seq_lens                r#   encode_prompt(CosmosVideoToWorldPipeline.encode_prompt  s   L 1411'44&&VJ&,,Q/J  66v 7 M
 *//MA)004I1MM)..z/QSZ\^_M&+A+I-3O@J?\_@`@`j+<<fuO!d6l$:O&OUVZ[jVkUl mV~Q(  s?33 )/)::J3K_J` ax/
| <33  &*%?%?&X^ &@ &"
 388MA%;%B%B1F[]^%_"%;%@%@Acelnp%q"44r&        y   Fvideor   num_channels_latents   heightwidth
num_framesinput_frames_guidancer@   rF   returnc           
      B	   [        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      eUR	                  S5      nX:  a%  US-
  U R
                  -  S-   nUS S 2S S 2U* S 24   nOUS-
  U R
                  -  S-   nXm-
  nUR                  UR	                  S5      UR	                  S5      XR	                  S5      UR	                  S5      5      n[        R                  " UU/SS	9n[        U[        5      (       aR  [        U5       Vs/ s H;  n[        U R                  R                  UU   R                  S5      5      UU   S
9PM=     nnODU Vs/ s H7  n[        U R                  R                  UR                  S5      5      U5      PM9     nn[        R                  " USS	9R                  U	5      nU R                  R                  R                   GbL  U R                  R                  R                   U R                  R                  R"                  nn[        R$                  " U5      R'                  SU R                  R                  R(                  SSS5      S S 2S S 2S UR	                  S5      24   R                  U5      n[        R$                  " U5      R'                  SU R                  R                  R(                  SSS5      S S 2S S 2S UR	                  S5      24   R                  U5      nUU-
  U R*                  R                  R,                  -  U-  nO#UU R*                  R                  R,                  -  nUS-
  U R
                  -  S-   nX@R.                  -  nXPR.                  -  nX#UUU4nUc  [1        UXU	S9nOUR                  XS9nXR*                  R                  R2                  -  nUSUUU4nUR5                  U5      nUR                  U5      nUR                  SSUR	                  S5      SS5      nSUS S 2S S 2S U24'   UU-  SU-
  U-  -   nS =nn U(       aI  UR                  SSUR	                  S5      SS5      nSUS S 2S S 2S U24'   Un U(       d  UU-  SU-
  U-  -   n UUUUUU 4$ s  snf s  snf )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r   r   r   r
      rq   )r@   ro   r@   r.   re   r.   re         ?)rt   listr:   r2   sizer]   	new_zerosrz   catrangerJ   rT   encode	unsqueezerx   r[   latents_meanlatents_stdtensorr   latent_channelsr;   
sigma_datar_   r   	sigma_maxnew_ones)!r    r   r   r   r   r   r   r   r   re   r.   r@   rF   num_cond_framesnum_cond_latent_framesnum_padding_framesri   r   init_latentsvidr   r   num_latent_frameslatent_heightlatent_widthry   padding_shapeones_paddingzeros_paddingcond_indicator	cond_maskuncond_indicatoruncond_masks!                                    r#   prepare_latents*CosmosVideoToWorldPipeline.prepare_latentsh  s    i&&3y>Z+GA#i.AQ R&<'gi 
  **Q-(&01n9W9W%WZ[%["!Q,-E&5&9d>\>\%\_`%`"!+!=ooejjmUZZ]DVXbXbcdXeglgqgqrsgtuGIIug.A6Ei&& z**A !q1C1CA1F!GS\]^S_`*  L
 gllfk_b,TXX__S]]1=M-NPYZfkLlyy1588?88??''3(,(D(DdhhooFaFa+L\*a88"aCAqJ`LL]L]^_L`J`D`bL!  [)a88"aCAqJ`LL]L]^_L`J`D`bL! 
 )<74>>;P;P;[;[[^iiL'$..*?*?*J*JJL'!^0N0NNQRR"?"?? = ==3DmUab?"5ITYZGjjj<GNN11;;;#Q(9=,W''6))-8 **1aa!QG8;q!44445"\1Q5G=4XX	)--;&&00Aw||A1M>AQ#:$:#::;'K(.=EUAUYf@ffn6F	S^^^k
 ms   AR&>Rc           
      ,  ^  US-  S:w  d	  US-  S:w  a  [        SU SU S35      eUbW  [        U 4S jU 5       5      (       d=  [        ST R                   SU Vs/ s H  oT R                  ;  d  M  UPM     sn 35      eUb  Ub  [        S	U S
U S35      eUc  Uc  [        S5      eUbA  [        U[        5      (       d,  [        U[
        5      (       d  [        S[        U5       35      eUc  Uc  [        S5      eUb  Ub  [        S5      eg g s  snf )Nr   r   z8`height` and `width` have to be divisible by 16 but are z and r   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fN)_callback_tensor_inputs).0kr    s     r#   	<genexpr>:CosmosVideoToWorldPipeline.check_inputs.<locals>.<genexpr>  s      F
7Y!---7Ys   z2`callback_on_step_end_tensor_inputs` has to be in z, but found zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z-Either `image` or `video` has to be provided.z2Only one of `image` or `video` has to be provided.)r2   allr   rt   ru   r   r   )	r    rc   r   r   rN   "callback_on_step_end_tensor_inputsimager   r   s	   `        r#   check_inputs'CosmosVideoToWorldPipeline.check_inputs  s    B;!urzQWX^W__dejdkklmnn-9# F
7YF
 C
 C
 DTEaEaDbbn  |^  pH  |^vw  ko  kG  kG  bGpq  |^  pH  oI  J  -";08N}o ^0 0  ^ 5w  FC)@)@TZ\`IaIaQRVW]R^Q_`aa=U]LMM!2QRR "3# pHs   D1Dc                     U R                   $ r   _guidance_scaler    s    r#   guidance_scale)CosmosVideoToWorldPipeline.guidance_scale  s    ###r&   c                      U R                   S:  $ )Nr   r   r   s    r#   r   6CosmosVideoToWorldPipeline.do_classifier_free_guidance  s    ##c))r&   c                     U R                   $ r   )_num_timestepsr   s    r#   num_timesteps(CosmosVideoToWorldPipeline.num_timesteps  s    """r&   c                     U R                   $ r   )_current_timestepr   s    r#   current_timestep+CosmosVideoToWorldPipeline.current_timestep  s    %%%r&   c                     U R                   $ r   )
_interruptr   s    r#   	interrupt$CosmosVideoToWorldPipeline.interrupt  s    r&   $   g      @gMbP?   pilr   r-   r   augment_sigmafpsoutput_typereturn_dictcallback_on_step_endr   c                    U R                   c  [        SU R                   S35      e[        U[        [
        45      (       a  UR                  nU R                  X5UUUX5        Xl        SU l	        SU l
        U R                  nU R                   b  U R                   R                  U5        UbR  [        U[        5      (       a  U/OUnU H2  nU R                   R                  U5      (       a  M%  [        SU S35      e   U R                   R                  S5        Ub  [        U[        5      (       a  SnO3Ub!  [        U[        5      (       a  [!        U5      nOUR"                  S	   nU R%                  UUU R&                  UUUUUS
9u  nn[)        U R*                  UU5      u  nnU R,                  R.                  nU R0                  R.                  nUb,  U R2                  R5                  XU5      R7                  S5      nOU R2                  R9                  X%U5      nUR                  UUS9nU R0                  R:                  R<                  S-
  nU R?                  UUU-  UUUUU R&                  U
[@        RB                  UUU5      u  nnn n!n"n#U"R                  U5      n"U R&                  (       a  U#R                  U5      n#[@        RD                  " U/U[@        RB                  S9nURG                  SSXVUS9n$[!        U5      XR*                  RH                  -  -
  n%[!        U5      U l%        U RM                  US9 n&[O        U5       GH  u  n'n(U RP                  (       a  M  U(U l	        U(RS                  UR"                  S	   5      R                  U5      n)U R*                  RT                  U'   n*UU*:  n+U R*                  RW                  U5      n,U R*                  RW                  U*5      n-U+(       a  U S	-  OU n.[Y        UR"                  UU[@        RB                  S9n/UU/USS2SSSS4   -  -   n0U0U,-  U--  n0U.U0-  SU.-
  U-  -   n0U R*                  R[                  U0U(5      n0U0R                  U5      n0U R1                  U0U)UUU"U$SS9S	   n1Un2U R&                  (       a  U+(       a  U!S	-  OU!n3[Y        UR"                  UU[@        RB                  S9n4UU4USS2SSSS4   -  -   n5U5U,-  U--  n5U3U5-  SU3-
  U-  -   n5U R*                  R[                  U5U(5      n5U5R                  U5      n5U R1                  U5U)UUU#U$SS9S	   n6[@        R\                  " U6U1/5      n1[@        R\                  " U2U2/5      n2U R*                  R_                  U1U(U2SS9S   n1U R*                  =R`                  S-  sl0        U R&                  (       aE  U1Rc                  SS	S9u  n6n7W3U-  SU3-
  U6-  -   n6U.U-  SU.-
  U7-  -   n7U7U Rd                  U7U6-
  -  -   n1OU.U-  SU.-
  U1-  -   n1U R*                  R_                  U1U(USU1S9S	   nUb\  0 n8U H  n9[g        5       U9   U8U9'   M     U" U U'U(U85      n:U:Ri                  SU5      nU:Ri                  SU5      nU:Ri                  SU5      nU'[!        U5      S-
  :X  d)  U'S-   U%:  a0  U'S-   U R*                  RH                  -  S	:X  a  U&Rk                  5         [l        (       d  GM  [n        Rp                  " 5         GM     SSS5        SU l	        US:X  Gd  U R,                  R:                  Rr                  GbL  U R,                  R:                  Rr                  U R,                  R:                  Rt                  n<n;[@        RD                  " U;5      Rw                  SU R,                  R:                  Rx                  SSS5      SS2SS2SUR{                  S5      24   R                  U5      n;[@        RD                  " U<5      Rw                  SU R,                  R:                  Rx                  SSS5      SS2SS2SUR{                  S5      24   R                  U5      n<UU<-  U R*                  R:                  R|                  -  U;-   nO"XR*                  R:                  R|                  -  nU R,                  R                  UR                  U5      SS9S	   nU R                   Gb)  U R                   R                  U5        U R2                  R                  USS9nUS-  R                  [        R                  5      n/ n=U H/  n>U R                   R                  U>5      n>U=R                  U>5        M1     [        R                  " U=5      R                  [        RB                  5      S-  S-  S-
  n[@        R                  " U5      R                  S	SSSS5      nU R2                  R                  UUS9nU R                   R                  S5        OU R2                  R                  UUS9nOUnU R                  5         U(       d  U4$ [        US9$ ! , (       d  f       GNP= f) a  
The call function to the pipeline for generation.

Args:
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
        instead.
    height (`int`, defaults to `720`):
        The height in pixels of the generated image.
    width (`int`, defaults to `1280`):
        The width in pixels of the generated image.
    num_frames (`int`, defaults to `121`):
        The number of frames in the generated video.
    num_inference_steps (`int`, defaults to `36`):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    guidance_scale (`float`, defaults to `7.0`):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`.
    fps (`int`, defaults to `30`):
        The frames per second of the generated video.
    num_videos_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor is generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.FloatTensor`, *optional*):
        Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
        provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated image. Choose between `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
    callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
        A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
        each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
        DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
        list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
    callback_on_step_end_tensor_inputs (`List`, *optional*):
        The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
        will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
        `._callback_tensor_inputs` attribute of your pipeline class.

Examples:

Returns:
    [`~CosmosPipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
        the first element is a list with the generated images and the second element is a list of `bool`s
        indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
Nz)You have disabled the safety checker for z. This is in violation of the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). Please ensure that you are compliant with the license agreement.Fz5Cosmos Guardrail detected unsafe text in the prompt: zR. Please ensure that the prompt abides by the NVIDIA Open Model License Agreement.r   r   r   )rc   r   r   r   rN   rO   r.   rd   r   r   )re   )totalr   )hidden_statestimestepencoder_hidden_statesr   condition_maskpadding_maskr   )r   rq   )r   pred_original_samplerF   rN   rO   latentro   np)r      g     o@r   r
   )frames)KrP   r2   r9   rt   r   r   tensor_inputsr   r   r   r   rs   rx   ru   check_text_safetyr   r:   ry   r   r   r>   r;   rT   re   rS   r`   
preprocessr   preprocess_videor[   in_channelsr   rz   float32r   r   orderr   progress_barr   r   expandr0   _get_conditioning_c_inr   scale_model_inputr   step_step_indexchunkr   localspopupdateXLA_AVAILABLExm	mark_stepr   r   r   r   r   r   decodepostprocess_videoastyper   uint8check_video_safetyappendstack
from_numpypermutemaybe_free_model_hooksr   )?r    r   r   rc   r   r   r   r   r-   r   r   r   r   r   r@   rF   rN   rO   r   r   r   r   rd   r.   prompt_listpr   r/   	vae_dtypetransformer_dtyper   conditioning_latentsr   r   r   r   r   num_warmup_stepsr  r   tr   current_sigmais_augment_sigma_greaterc_in_augmentc_in_originalcurrent_cond_indicator
cond_noisecond_latent
noise_predrD   current_uncond_indicatoruncond_noiseuncond_latentnoise_pred_uncondnoise_pred_condcallback_kwargsr   callback_outputsr   r   video_batchr   s?                                                                  r#   __call__#CosmosVideoToWorldPipeline.__call__  s\
   t &;DNN;K LS T  *-=?U,VWW1E1S1S. 	&%@bdiq-!%''*""6*!*4VS*A*Avhv$A..@@CC(STUSV WX Y  % ""5) *VS"9"9JJvt$<$<VJ&,,Q/J +(,(H(H"7'#9 3  	
	
" *<DNNL_ag)h&	& HHNN	 ,,22((33E5ISSTUVE((99%OEi8#//66BBQFbfbvbv.. ,,!MMc
_%~7GT_ LL!23	++%..):;Km_V5==Y((AvDU(V y>,?..BVBV,VV!)n%89\!),1>>)*&88GMM!$45889JK $ 5 5a 8+8M+I(#~~DD]S $ E Em T?W!);]k&)'--9U[chcpcpq
2Z-PQSWY]_ceiPiBj5jj)L8=H4{BaJ`F`dkEkk"nn>>{AN)nn->?!--"-%*7#,!- % .  
 !33G_/?!/Ceu,#/[ainiviv#wL$8<-XY[_aegkmqXqJr;r$rM$1L$@=$PM$<}$LPQTlPlpwOw$wM$(NN$D$D]TU$VM$1$4$45F$GM(,(8(8&3!).D'2%1$) )9 ) )% "',=z+J!KJ"YY'78F "^^00QTY0Z[\]
**a/*339C9I9I!QR9I9S6%03GG77;LLM &
 /1EEMcIcgvHvv $ "143F3F/\mJm3n!nJ /1EEMcIcgqHqq 
 ..--7T^ .  (3&(O?-3Xa[* @';D!Q'X$.229gFG$4$8$8-$XM-=-A-ABZ\r-s* I**A9I/IqSTuX\XfXfXlXlNlpqNq '') =LLNy - :~ "&h&xx++7,0HHOO,H,H$((//JeJekLL.T!TXX__<<b!QG1N_PWP\P\]^P_N_H_aR[  LL-T!TXX__<<b!QG1N_PWP\P\]^P_N_H_aR[ 
 "K/$..2G2G2R2RRUaa!NN$9$9$D$DDHHOOGJJy$9uOMaPE"".##&&v.,,>>uRV>W,,RXX6  C--@@EC&&s+ ! -44RZZ@5H1LqP((/771aAF,,>>uR]>^##&&u-,,>>uR]>^E 	##%8O#511U :9s   ;N$i$i
i)r   r   r   r   r_   r]   r`   r   )Nrb   NN)NTr   NNrb   NN)	r   r   r   TFNNNNNNNN)0r(   r)   r*   r+   __doc__model_cpu_offload_seqr   _optional_componentsr   r	   r   r   r   r   r$   r   ru   r   intr   rz   r.   re   r   rw   Tensorr   	Generatorr   r   propertyr   r   r   r   r   no_gradr   EXAMPLE_DOC_STRINGr   floatr   r   r   r   r5  r,   __classcell__)r9   s   @r#   rL   rL      s   . =T,- /3^$^ #^ .	^
 !^ %^ ,^ ^@ )-#&)-'+(c49n%( !( &	(
 $(\ <@,0%&049=#&)-'+Q5c49n%Q5 "%T#Y"78Q5 &*	Q5
  #Q5  -Q5 !) 6Q5 !Q5 &Q5 $Q5p ,0&+'+)-MQ*.V_||V_ V_ !	V_
 V_ V_ V_ &*V_  $V_ $V_ &V_ E%//43H"HIJV_ %,,'V_ 
V_z +/#SJ $ $ * * # # & &   ]]_12 %)*.(,;?#% #&+$/0MQ*.049=%*  9B#&32!2 &'2 c49n%	2
 "%T#Y"782 2 2 2 !2 2  $2 2 2  (}2 E%//43H"HIJ2  %,,'!2"  -#2$ !) 6%2& c]'2( )2* '(Cd+T124DF\\]
+20 -1I122 !32 3 2r&   rL   r7  )NrD   )8r4   typingr   r   r   r   r   numpyr   rz   transformersr   r	   	callbacksr   r   image_processorr   modelsr   r   
schedulersr   utilsr   r   r   r   utils.torch_utilsr   r`   r   pipeline_utilsr   pipeline_outputr   cosmos_guardrailr   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr(   r}   r@  r;  ru   r.   rA  r>   r<  r=  rJ   rL   r'   r&   r#   <module>rS     s*    8 8   8 A 1 C + n n - - . 1 !""4  ))MM			H	%, d *.15%)$(8*!#8* U3,-.8* S	"	8*
 T%[!8*z ck
TLL
T-5eoo-F
T\_
TK
2!2 K
2r&   