
    +h]                     2   S SK Jr  S SKJrJrJr  S SKrS SKr	S SK
r
S SKJrJrJrJr  SSKJr  SSKJr  SSKJrJrJrJr  SS	KJr  S
SKJr  \" 5       (       a  S SKJs  Jr   Sr!OSr!\RD                  " \#5      r$Sr%Sr&\ " S S\5      5       r' " S S\5      r(g)    )	dataclass)ListOptionalUnionN)CLIPImageProcessorCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjection   )PriorTransformer)UnCLIPScheduler)
BaseOutputis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineTFav  
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
a  
    Examples:
        ```py
        >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline
        >>> from diffusers.utils import load_image
        >>> import PIL

        >>> import torch
        >>> from torchvision import transforms

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
        ... )
        >>> pipe_prior.to("cuda")

        >>> img1 = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        ...     "/kandinsky/cat.png"
        ... )

        >>> img2 = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        ...     "/kandinsky/starry_night.jpeg"
        ... )

        >>> images_texts = ["a cat", img1, img2]
        >>> weights = [0.3, 0.3, 0.4]
        >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights)

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     "",
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=zero_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=150,
        ... ).images[0]

        >>> image.save("starry_cat.png")
        ```
c                       \ rS rSr% Sr\\R                  \R                  4   \
S'   \\R                  \R                  4   \
S'   Srg)KandinskyPriorPipelineOutputy   z
Output class for KandinskyPriorPipeline.

Args:
    image_embeds (`torch.Tensor`)
        clip image embeddings for text prompt
    negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`)
        clip image embeddings for unconditional tokens
image_embedsnegative_image_embeds N)__name__
__module____qualname____firstlineno____doc__r   torchTensornpndarray__annotations____static_attributes__r       p/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.pyr   r   y   s9     bjj011 rzz!9::r&   r   c                     ^  \ rS rSrSrS/rSrS\S\S\	S\
S\S	\4U 4S
 jjr\R                  " 5       \" \5              SS\\\\R,                  R,                  \R.                  4      S\\   S\S\S\\\R6                  \\R6                     4      S\\R.                     S\\   S\S\4S jj5       5       rS rSS jr SS jr\R                  " 5       \" \ 5              S S\\\\   4   S\\\\\   4      S\S\S\\\R6                  \\R6                     4      S\\R.                     S\S\\   S\!4S jj5       5       r"Sr#U =r$$ )!KandinskyPriorPipeline   aq  
Pipeline for generating image prior for Kandinsky

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    prior ([`PriorTransformer`]):
        The canonical unCLIP prior to approximate the image embedding from the text embedding.
    image_encoder ([`CLIPVisionModelWithProjection`]):
        Frozen image-encoder.
    text_encoder ([`CLIPTextModelWithProjection`]):
        Frozen text-encoder.
    tokenizer (`CLIPTokenizer`):
        Tokenizer of class
        [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
    scheduler ([`UnCLIPScheduler`]):
        A scheduler to be used in combination with `prior` to generate image embedding.
priorztext_encoder->priorimage_encodertext_encoder	tokenizer	schedulerimage_processorc           	      J   > [         TU ]  5         U R                  UUUUUUS9  g )N)r+   r-   r.   r/   r,   r0   )super__init__register_modules)selfr+   r,   r-   r.   r/   r0   	__class__s          r'   r3   KandinskyPriorPipeline.__init__   s5     	%'+ 	 	
r&   images_and_promptsweightsnum_images_per_promptnum_inference_steps	generatorlatentsnegative_prior_promptnegative_promptguidance_scalec                    U
=(       d    U R                   n
[        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      e/ n[        X5       GH!  u  p[	        U[
        5      (       a  U " UUUUUUU	S9R                  nO[	        U[        R                  R                  [        R                  45      (       a  [	        U[        R                  R                  5      (       aN  U R                  USS9R                  S   R                  S5      R                  U R                  R                   U
S9nU R                  U5      S	   nO[        S
[#        U5       35      eUR%                  X-  5        GM$     [        R&                  " U5      R)                  SSS9nU " UUUUUUU	S9nUS:X  a  UR*                  OUR                  n[-        UUS9$ )a  
Function invoked when using the prior pipeline for interpolation.

Args:
    images_and_prompts (`List[Union[str, PIL.Image.Image, torch.Tensor]]`):
        list of prompts and images to guide the image generation.
    weights: (`List[float]`):
        list of weights for each condition in `images_and_prompts`
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will ge generated by sampling using the supplied random `generator`.
    negative_prior_prompt (`str`, *optional*):
        The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if
        `guidance_scale` is less than `1`).
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if
        `guidance_scale` is less than `1`).
    guidance_scale (`float`, *optional*, defaults to 4.0):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.

Examples:

Returns:
    [`KandinskyPriorPipelineOutput`] or `tuple`
z`images_and_prompts` contains z items and `weights` contains z, items - they should be lists of same length)r;   r:   r<   r=   r?   r@   pt)return_tensorsr   )dtypedevicer   zq`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor`  but is T)dimkeepdim r   r   )rE   len
ValueErrorzip
isinstancestrr   PILImager    r!   r0   pixel_values	unsqueezetor,   rD   typeappendcatsumr   r   )r5   r8   r9   r:   r;   r<   r=   r>   r?   r@   rE   image_embeddingscondweight	image_embout_zerozero_image_embs                    r'   interpolate"KandinskyPriorPipeline.interpolate   s   l &4;;!"c'l205G1H0IIghklshtgu  vb  c   2<LD$$$ (;*?'#$9#1 ,  D399??ELL"ABBdCIIOO44,,T$,G%a)"1$"4"4":":6J	  !..t4^D	 ! H  IM  NR  IS  HT  U  ##I$679 =< II./3343H	 3"71)
 <Kb;P77V^VkVk+Zhiir&   c                     Uc  [        XX2S9nO<UR                  U:w  a  [        SUR                   SU 35      eUR                  U5      nXVR                  -  nU$ )N)r<   rE   rD   zUnexpected latents shape, got z, expected )r   shaperK   rS   init_noise_sigma)r5   ra   rD   rE   r<   r=   r/   s          r'   prepare_latents&KandinskyPriorPipeline.prepare_latents!  s`    ?"5fZG}}% #A'--P[\a[b!cddjj(G666r&   c                 h   U=(       d    U R                   n[        R                  " SSU R                  R                  R
                  U R                  R                  R
                  5      R                  X R                  R                  S9nU R                  U5      S   nUR                  US5      nU$ )N   r   )rE   rD   r   )	rE   r    zerosr,   config
image_sizerS   rD   repeat)r5   
batch_sizerE   zero_imgr]   s        r'   get_zero_embed%KandinskyPriorPipeline.get_zero_embed,  s    &4;;;;q!T%7%7%>%>%I%I4K]K]KdKdKoKopss!3!3!9!9 t 
 ++H5nE'..z1=r&   c                 p   [        U[        5      (       a  [        U5      OSnU R                  USU R                  R                  SSS9nUR
                  nUR                  R                  5       R                  U5      n	U R                  USSS9R
                  n
U
R                  S   UR                  S   :  a  [        R                  " X5      (       d  U R                  R                  U
S S 2U R                  R                  S-
  S24   5      n[        R                  S	U R                  R                   S
U 35        US S 2S U R                  R                  24   nU R                  UR                  U5      5      nUR                   nUR"                  nUR%                  USS9nUR%                  USS9nU	R%                  USS9n	U(       Ga  Uc  S/U-  nO['        U5      ['        U5      La$  [)        S['        U5       S['        U5       S35      e[        U[*        5      (       a  U/nO2U[        U5      :w  a!  [-        SU S[        U5       SU SU S3	5      eUnU R                  USU R                  R                  SSS9nUR                  R                  5       R                  U5      nU R                  UR
                  R                  U5      5      nUR                   nUR"                  nUR                  S   nUR/                  SU5      nUR1                  Xc-  U5      nUR                  S   nUR/                  SUS5      nUR1                  Xc-  US5      nUR%                  USS9n[        R2                  " UU/5      n[        R2                  " UU/5      n[        R2                  " UU	/5      n	XU	4$ )Nrf   
max_lengthTrB   )paddingrp   
truncationrC   longest)rq   rC   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: r   )rF   rH   z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rM   listrJ   r.   model_max_length	input_idsattention_maskboolrS   ra   r    equalbatch_decodeloggerwarningr-   text_embedslast_hidden_staterepeat_interleaverT   	TypeErrorrN   rK   rj   viewrV   )r5   promptrE   r:   do_classifier_free_guidancer?   rk   text_inputstext_input_ids	text_maskuntruncated_idsremoved_texttext_encoder_outputprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                         r'   _encode_prompt%KandinskyPriorPipeline._encode_prompt5  s    %/vt$<$<S[!
nn ~~66 % 
 %....33588@	..SW.Xbb  $(<(<R(@@UcIuIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M ,A/P1P1P/P,PQN"//0A0A&0IJ+77%8%J%J"%778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$>>::# * L  ,::??ADDVL9=9J9J<KaKaKdKdekKl9m6%O%[%["0Z0l0l- -2215G%;%B%B1F[%\"%;%@%@Acel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V2GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3Y"?@I)CCr&   r   output_typereturn_dictc
           	         [        U[        5      (       a  U/nO,[        U[        5      (       d  [        S[	        U5       35      e[        U[        5      (       a  U/nO/[        U[        5      (       d  Ub  [        S[	        U5       35      eUb	  X-   nSU-  nU R
                  n
[        U5      nX-  nUS:  nU R                  XX<U5      u  pnU R                  R                  XJS9  U R                  R                  nU R                  R                  R                  nU R                  UU4UR                  U
UUU R                  5      n[!        U R#                  U5      5       H  u  nnU(       a  [$        R&                  " U/S-  5      OUnU R                  UUUUUS9R(                  nU(       a  UR+                  S5      u  nnUUUU-
  -  -   nUS-   UR,                  S	   :X  a  SnOUUS-      nU R                  R/                  UUUUUS
9R0                  n[2        (       d  M  [4        R6                  " 5         M     U R                  R9                  U5      nUnUc8  U R;                  UR,                  S	   UR<                  S9nU R?                  5         OLUR+                  S5      u  nn[A        U S5      (       a'  U RB                  b  U RD                  RG                  5         US;  a  [        SU 35      eUS:X  a<  URI                  5       RK                  5       nURI                  5       RK                  5       nU	(       d  UU4$ [M        UUS9$ )a  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`str` or `List[str]`):
        The prompt or prompts to guide the image generation.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
        if `guidance_scale` is less than `1`).
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will ge generated by sampling using the supplied random `generator`.
    guidance_scale (`float`, *optional*, defaults to 4.0):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    output_type (`str`, *optional*, defaults to `"pt"`):
        The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"`
        (`torch.Tensor`).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

Examples:

Returns:
    [`KandinskyPriorPipelineOutput`] or `tuple`
z2`prompt` has to be of type `str` or `list` but is Nz;`negative_prompt` has to be of type `str` or `list` but is r   g      ?)rE   )timestepproj_embeddingencoder_hidden_statesry   rf   r   )r   sampler<   prev_timestepfinal_offload_hook)rB   r"   zBOnly the output types `pt` and `np` are supported not output_type=r"   rI   )'rM   rN   rv   rK   rT   _execution_devicerJ   r   r/   set_timesteps	timestepsr+   rh   embedding_dimrc   rD   	enumerateprogress_barr    rV   predicted_image_embeddingchunkra   stepprev_sampleXLA_AVAILABLExm	mark_steppost_process_latentsrm   rE   maybe_free_model_hookshasattrr   
prior_hookoffloadcpunumpyr   )r5   r   r?   r:   r;   r<   r=   r@   r   r   rE   rk   r   r   r   r   prior_timesteps_tensorr   itlatent_model_inputr    predicted_image_embedding_uncondpredicted_image_embedding_textr   rX   zero_embedss                              r'   __call__KandinskyPriorPipeline.__call__  sd   j fc""XFFD))QRVW]R^Q_`aaos++./OOT227RZ[_`o[pZqrss &-F/1O''[
7
&4s&:#?C?R?R1P_@
<9
 	$$%8$H!%!9!9

))77&&'NN
 d//0FGHDAq=XG9q=!9^e(,

",&@( )3 ) (' & +SlSrSrstSuP02P,L~25UUP -) 1u.44Q77 $ 6q1u =nn)))#+ *  k  }A ID **11':" "--gmmA.>w~~-VK''),<,B,B1,E)kt122t7N7N7Z'')l*abmanopp$/335;;=%//+113K$k22+9Ialmmr&   r   )rf      NNNrH         @N)rf   N)N)Nrf   r   NNr   rB   T)%r   r   r   r   r   _exclude_from_cpu_offloadmodel_cpu_offload_seqr   r
   r   r	   r   r   r3   r    no_gradr   EXAMPLE_INTERPOLATE_DOC_STRINGr   r   rN   rO   rP   r!   floatintr   	Generatorr^   rc   rm   r   EXAMPLE_DOC_STRINGrz   r   r%   __classcell__)r6   s   @r'   r)   r)      sb   ( ")	1

 5
 2	

 !
 #
 ,
( ]]_=>
 &'#%MQ*./3! #gj sCIIOOU\\'I!JKgj egj  #	gj
 !gj E%//43H"HIJgj %,,'gj  (}gj gj gj ? gjT	 _DB ]]_12 <@%&#%MQ*. #%) Wnc49n%Wn "%T#Y"78Wn  #	Wn
 !Wn E%//43H"HIJWn %,,'Wn Wn c]Wn Wn 3 Wnr&   r)   ))dataclassesr   typingr   r   r   r   r"   	PIL.ImagerO   r    transformersr   r   r	   r
   modelsr   
schedulersr   utilsr   r   r   r   utils.torch_utilsr   pipeline_utilsr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   r}   r   r   r   r)   r   r&   r'   <module>r      s    " ( (    v v & )  . . ))MM			H	% <+" \ ;: ; ;fn. fnr&   