
    +hE                        S SK JrJrJrJr  S SKrS SKJr  SSKJ	r	J
r
  SSKJrJr  SSKJrJrJr  SSKJr  S	S
KJrJr  SSKJr  \" 5       (       a  S SKJs  Jr  SrOSr\R<                  " \5      r Sr!SS jr" " S S\5      r#g)    )CallableListOptionalUnionN)XLMRobertaTokenizer   )UNet2DConditionModelVQModel)DDIMSchedulerDDPMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPTFav  
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
c                 n    XS-  -  nXS-  -  S:w  a  US-  nXS-  -  nXS-  -  S:w  a  US-  nX2-  XB-  4$ )Nr   r   r    )hwscale_factornew_hnew_ws        j/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky.pyget_new_h_wr   K   s[    q E?a
q E?a
!555    c            !       "  ^  \ rS rSrSrSrS\S\S\S\	\
\4   S\4
U 4S	 jjrS
 r SS jr\R"                  " 5       \" \5                  SS\	\\\   4   S\	\R,                  \\R,                     4   S\	\R,                  \\R,                     4   S\\	\\\   4      S\S\S\S\S\S\\	\R4                  \\R4                     4      S\\R,                     S\\   S\\\\\R,                  /S4      S\S\4S jj5       5       rSrU =r$ ) KandinskyPipelineU   a  
Pipeline for text-to-image generation using Kandinsky

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    text_encoder ([`MultilingualCLIP`]):
        Frozen text-encoder.
    tokenizer ([`XLMRobertaTokenizer`]):
        Tokenizer of class
    scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
        A scheduler to be used in combination with `unet` to generate image latents.
    unet ([`UNet2DConditionModel`]):
        Conditional U-Net architecture to denoise the image embedding.
    movq ([`VQModel`]):
        MoVQ Decoder to generate the image from the latents.
ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                    > [         TU ]  5         U R                  UUUUUS9  S[        U R                  R
                  R                  5      S-
  -  U l        g )N)r#   r$   r%   r&   r'   r   r   )super__init__register_moduleslenr'   configblock_out_channelsmovq_scale_factor)selfr#   r$   r%   r&   r'   	__class__s         r   r*   KandinskyPipeline.__init__k   s\     	% 	 	
 "#s499+;+;+N+N'ORS'S!Tr   c                     Uc  [        XX2S9nO<UR                  U:w  a  [        SUR                   SU 35      eUR                  U5      nXVR                  -  nU$ )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r0   r7   r6   r5   r4   latentsr&   s          r   prepare_latents!KandinskyPipeline.prepare_latents   s`    ?"5fZG}}% #A'--P[\a[b!cddjj(G666r   Nc                 ^   [        U[        5      (       a  [        U5      OSnU R                  USSSSSSS9nUR                  nU R                  USSS9R                  n	U	R
                  S	   UR
                  S	   :  a  [        R                  " X5      (       dj  U R                  R                  U	S S 2U R                  R                  S-
  S	24   5      n
[        R                  S
U R                  R                   SU
 35        UR                  U5      nUR                  R                  U5      nU R                  XS9u  pUR                  USS9nUR                  USS9nUR                  USS9nU(       Ga  Uc  S/U-  nO[!        U5      [!        U5      La$  [#        S[!        U5       S[!        U5       S35      e[        U[$        5      (       a  U/nO2U[        U5      :w  a!  ['        SU S[        U5       SU SU S3	5      eUnU R                  USSSSSSS9nUR                  R                  U5      nUR                  R                  U5      nU R                  UUS9u  nnUR
                  S   nUR)                  SU5      nUR+                  Xc-  U5      nUR
                  S   nUR)                  SUS5      nUR+                  Xc-  US	5      nUR                  USS9n[        R,                  " UU/5      n[        R,                  " UU/5      n[        R,                  " UU/5      nXU4$ )Nr   
max_lengthTM   pt)padding
truncationr?   return_attention_maskadd_special_tokensreturn_tensorslongest)rB   rF   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rB   r?   rC   rD   rE   rF   )
isinstancelistr,   r$   rI   r7   torchequalbatch_decodemodel_max_lengthloggerwarningr9   rJ   r#   repeat_interleavetype	TypeErrorstrr8   repeatviewcat)r0   promptr5   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                        r   _encode_prompt KandinskyPipeline._encode_prompt   s    %/vt$<$<S[!
nn "&# % 
 %....SW.Xbb  $(<(<R(@@UcIuIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M
 (**62..11&9	484E4E$ 5F 5
1 &778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$&*#'# * L %1$:$:$=$=f$E!+::==fEHLHYHY/@P IZ IE"$E -2215G%;%B%B1F[%\"%;%@%@Acel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V2GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3Y"?@I)CCr   r^   image_embedsnegative_image_embedsra   heightwidthnum_inference_stepsguidance_scaler_   r4   r;   output_typecallbackcallback_stepsreturn_dictc           	      |   [        U[        5      (       a  SnO8[        U[        5      (       a  [        U5      nO[	        S[        U5       35      eU R                  nUU	-  nUS:  nU R                  UUU	UU5      u  nnn[        U[        5      (       a  [        R                  " USS9n[        U[        5      (       a  [        R                  " USS9nU(       aN  UR                  U	SS9nUR                  U	SS9n[        R                  " X2/SS9R                  UR                  US9nU R                  R                  UUS9  U R                  R                  nU R                   R"                  R$                  n['        XVU R(                  5      u  pVU R+                  UUXV4UR                  UU
UU R                  5      n[-        U R/                  U5      5       GH  u  nnU(       a  [        R                  " U/S-  5      OUnUUS	.nU R!                  UUUUS
S9S   nU(       aj  UR1                  UR2                  S   SS9u  nnUR5                  S5      u  nnUR5                  S5      u  nn UUUU-
  -  -   n[        R                  " UU /SS9n[7        U R                  R"                  S5      (       a$  U R                  R"                  R8                  S;   d   UR1                  UR2                  S   SS9u  nnU R                  R;                  UUUU
S9R<                  nUb-  UU-  S:X  a$  U[?        U R                  SS5      -  n!U" U!UU5        [@        (       d  GM  [B        RD                  " 5         GM     U RF                  RI                  USS9S   n"U RK                  5         US;  a  [	        SU 35      eUS;   aX  U"S-  S-   n"U"RM                  SS5      n"U"RO                  5       RQ                  SSSS5      RS                  5       RU                  5       n"US:X  a  U RW                  U"5      n"U(       d  U"4$ [Y        U"S9$ )at  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`str` or `List[str]`):
        The prompt or prompts to guide the image generation.
    image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
        The clip image embeddings for text prompt, that will be used to condition the image generation.
    negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
        The clip image embeddings for negative text prompt, will be used to condition the image generation.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
        if `guidance_scale` is less than `1`).
    height (`int`, *optional*, defaults to 512):
        The height in pixels of the generated image.
    width (`int`, *optional*, defaults to 512):
        The width in pixels of the generated image.
    num_inference_steps (`int`, *optional*, defaults to 100):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    guidance_scale (`float`, *optional*, defaults to 4.0):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will ge generated by sampling using the supplied random `generator`.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
        (`np.array`) or `"pt"` (`torch.Tensor`).
    callback (`Callable`, *optional*):
        A function that calls every `callback_steps` steps during inference. The function is called with the
        following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
    callback_steps (`int`, *optional*, defaults to 1):
        The frequency at which the `callback` function is called. If not specified, the callback is called at
        every step.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

Examples:

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`
r   z2`prompt` has to be of type `str` or `list` but is g      ?r   rK   )r6   r5   )r5   r   )text_embedsrs   F)sampletimestepencoder_hidden_statesadded_cond_kwargsr|   variance_type)learnedlearned_range)r4   orderT)force_not_quantizer   )rA   nppilzIOnly the output types `pt`, `pil` and `np` are supported not output_type=)r   r   g      ?r   r   )images)-rO   rZ   rP   r,   r8   rX   _execution_devicerq   rQ   r]   rW   r9   r6   r&   set_timesteps	timestepsr%   r-   in_channelsr   r/   r<   	enumerateprogress_barsplitr7   chunkhasattrr   stepprev_samplegetattrXLA_AVAILABLExm	mark_stepr'   decodemaybe_free_model_hooksclampcpupermutefloatnumpynumpy_to_pilr   )#r0   r^   rs   rt   ra   ru   rv   rw   rx   r_   r4   r;   ry   rz   r{   r|   rb   r5   r`   rh   ri   _timesteps_tensornum_channels_latentsitlatent_model_inputr   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idximages#                                      r   __call__KandinskyPipeline.__call__   sE   R fc""J%%VJQRVW]R^Q_`aa''"77
&4s&:#7;7J7JF13NP_8
411 lD)) 99\q9L+T22$)II.C$K!&'99:OUV9WL$9$K$KLagh$K$i! 99&;%JPQRUU#))& V L 	$$%8$H>>33#yy//;;#F43I3IJ &&-v=&,,NN
 d//0@ABDAq=XG9q=!9^e0=| \)&@"3! #  J +,6,<,<W]]1=MST,<,U)
M5?5E5Ea5H2!?(5(;(;A(>%%.?UfCf1gg
"YY
4F'GQO
 --??NN))77;WW * 0 0q1Aq 0 I
A nn))#	 * 
 k  #N(:a(? CC1g.}O CT 		  T B8L##%11hithuvww-'CK#%EKK1%EIIK''1a399;AACE%%%e,E8O"%00r   )r/   )N)N   r   d   g      @r   NNr   Nr   T) __name__
__module____qualname____firstlineno____doc__model_cpu_offload_seqr   r   r	   r   r   r   r
   r*   r<   rq   rQ   no_gradr   EXAMPLE_DOC_STRINGrZ   r   Tensorr   intr   	Generatorr   boolr   __static_attributes____classcell__)r1   s   @r   r!   r!   U   s   & 7U&U 'U #	U
 56U U(	" dDL ]]_12 <@#& #%&MQ*.%*GK !q1c49n%q1 ELL$u||*<<=q1  %U\\43E%EF	q1
 "%T#Y"78q1 q1 q1 !q1 q1  #q1 E%//43H"HIJq1 %,,'q1 c]q1 8S#u||$<d$BCDq1 q1  !q1 3 q1r   r!   )   )$typingr   r   r   r   rQ   transformersr   modelsr	   r
   
schedulersr   r   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   r#   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rU   r   r   r!   r   r   r   <module>r      sy    3 2  4 6 
 . C * ))MM			H	% >6N1) N1r   