
    +hU                        S SK JrJrJrJr  S SKrS SKrS SKJ	r	  SSK
Jr  SSKJrJr  SSKJr  SSKJrJrJr  SS	KJr  S
SKJrJr  SSKJr  \" 5       (       a  S SKJs  Jr  Sr OSr \RB                  " \"5      r#Sr$SS jr% " S S\5      r&g)    )CallableListOptionalUnionN)XLMRobertaTokenizer   )VaeImageProcessor)UNet2DConditionModelVQModel)DDIMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPTFa  
    Examples:
        ```py
        >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
        ... )
        >>> pipe_prior.to("cuda")

        >>> prompt = "A red cartoon frog, 4k"
        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)

        >>> pipe = KandinskyImg2ImgPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        ... )
        >>> pipe.to("cuda")

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        ...     "/kandinsky/frog.png"
        ... )

        >>> image = pipe(
        ...     prompt,
        ...     image=init_image,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=zero_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ...     strength=0.2,
        ... ).images

        >>> image[0].save("red_frog.png")
        ```
c                 n    XS-  -  nXS-  -  S:w  a  US-  nXS-  -  nXS-  -  S:w  a  US-  nX2-  XB-  4$ )Nr   r   r    )hwscale_factornew_hnew_ws        r/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.pyget_new_h_wr   W   s[    q E?a
q E?a
!555    c            #         ^  \ rS rSrSrSrS\S\S\S\	S\
4
U 4S	 jjrS
 rS r S%S jrS\R                   S\R                   S\R"                  S\R                   4S jr\R&                  " 5       \" \5                  S&S\\\\   4   S\\R                   \R4                  R4                  \\R                      \\R4                  R4                     4   S\R                   S\R                   S\\\\\   4      S\S\S\S\S\S\S\\\R<                  \\R<                     4      S\\   S \\\\\R                   /S4      S!\S"\ 4 S# jj5       5       r!S$r"U =r#$ )'KandinskyImg2ImgPipelinea   a  
Pipeline for image-to-image generation using Kandinsky

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    text_encoder ([`MultilingualCLIP`]):
        Frozen text-encoder.
    tokenizer ([`XLMRobertaTokenizer`]):
        Tokenizer of class
    scheduler ([`DDIMScheduler`]):
        A scheduler to be used in combination with `unet` to generate image latents.
    unet ([`UNet2DConditionModel`]):
        Conditional U-Net architecture to denoise the image embedding.
    movq ([`VQModel`]):
        MoVQ image encoder and decoder
ztext_encoder->unet->movqtext_encodermovq	tokenizerunet	schedulerc                 v  > [         TU ]  5         U R                  UUUUUS9  [        U SS 5      (       a/  S[	        U R
                  R                  R                  5      S-
  -  OSU l        [        U SS 5      (       a   U R
                  R                  R                  OSn[        U R                  USSS9U l        g )	N)r#   r%   r&   r'   r$   r$   r   r         bicubic)vae_scale_factorvae_latent_channelsresamplereducing_gap)super__init__register_modulesgetattrlenr$   configblock_out_channelsmovq_scale_factorlatent_channelsr	   image_processor)selfr#   r$   r%   r&   r'   movq_latent_channels	__class__s          r   r1   !KandinskyImg2ImgPipeline.__init__w   s     	% 	 	
 DK4QWY]C^C^A#dii&&99:Q>?de 	 DK4QWY]C^C^tyy//??de0!33 4	 
r   c                     [        [        X-  5      U5      n[        X-
  S5      nU R                  R                  US  nXaU-
  4$ )Nr   )minintmaxr'   	timesteps)r:   num_inference_stepsstrengthdeviceinit_timestept_startrB   s          r   get_timesteps&KandinskyImg2ImgPipeline.get_timesteps   sL    C 3 >?ATU)91=NN,,WX6	777r   c                    Uc  [        X6XTS9nO<UR                  U:w  a  [        SUR                   SU 35      eUR                  U5      nXR                  -  nUR                  n[        X6XTS9nU R                  XU5      nU$ )N)	generatorrE   dtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma	add_noise)	r:   latentslatent_timesteprM   rL   rE   rK   r'   noises	            r   prepare_latents(KandinskyImg2ImgPipeline.prepare_latents   s    ?"5fZG}}% #A'--P[\a[b!cddjj(G666UT..Ar   Nc                 ^   [        U[        5      (       a  [        U5      OSnU R                  USSSSSSS9nUR                  nU R                  USSS9R                  n	U	R
                  S	   UR
                  S	   :  a  [        R                  " X5      (       dj  U R                  R                  U	S S 2U R                  R                  S-
  S	24   5      n
[        R                  S
U R                  R                   SU
 35        UR                  U5      nUR                  R                  U5      nU R                  XS9u  pUR                  USS9nUR                  USS9nUR                  USS9nU(       Ga  Uc  S/U-  nO[!        U5      [!        U5      La$  [#        S[!        U5       S[!        U5       S35      e[        U[$        5      (       a  U/nO2U[        U5      :w  a!  ['        SU S[        U5       SU SU S3	5      eUnU R                  USSSSSSS9nUR                  R                  U5      nUR                  R                  U5      nU R                  UUS9u  nnUR
                  S   nUR)                  SU5      nUR+                  Xc-  U5      nUR
                  S   nUR)                  SUS5      nUR+                  Xc-  US	5      nUR                  USS9n[        R,                  " UU/5      n[        R,                  " UU/5      n[        R,                  " UU/5      nXU4$ )Nr   
max_lengthM   Tpt)paddingrX   
truncationreturn_attention_maskadd_special_tokensreturn_tensorslongest)r[   r_   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancelistr4   r%   rb   rM   torchequalbatch_decodemodel_max_lengthloggerwarningrO   rc   r#   repeat_interleavetype	TypeErrorstrrN   repeatviewcat)r:   promptrE   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                        r   _encode_prompt'KandinskyImg2ImgPipeline._encode_prompt   s    %/vt$<$<S[!
nn "&# % 
 %....SW.Xbb  $(<(<R(@@UcIuIu>>66q$..JiJilmJmprJrGr7stLNNNN334Il^M
 (**62..11&9	484E4E$ 5F 5
1 &778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&&!#z 1fT/%::UVZ[jVkUl mV~Q(  OS11!0 1s?33 )/)::J3K_J` ax/
| <33  !0>>$&*#'# * L %1$:$:$=$=f$E!+::==fEHLHYHY/@P IZ IE"$E -2215G%;%B%B1F[%\"%;%@%@Acel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V2GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3Y"?@I)CCr   original_samplesrT   rB   returnc                 "   [         R                  " SSS[         R                  S9nSU-
  n[         R                  " USS9nUR	                  UR
                  UR                  S9nUR	                  UR
                  5      nXc   S	-  nUR                  5       n[        UR                  5      [        UR                  5      :  a?  UR                  S
5      n[        UR                  5      [        UR                  5      :  a  M?  SXc   -
  S	-  nUR                  5       n[        UR                  5      [        UR                  5      :  a?  UR                  S
5      n[        UR                  5      [        UR                  5      :  a  M?  Xq-  X-  -   n	U	$ )Ng-C6?g{Gz?i  )rL         ?r   rd   )rE   rL   g      ?ra   r   )rj   linspacefloat32cumprodrO   rE   rL   flattenr4   rM   	unsqueeze)
r:   r   rT   rB   betasalphasalphas_cumprodsqrt_alpha_prodsqrt_one_minus_alpha_prodnoisy_sampless
             r   rQ   "KandinskyImg2ImgPipeline.add_noise  sh    vtTGuv15'**2B2I2IQaQgQg*hLL!1!8!89	(3s:)113/''(3/?/E/E+FF-77;O /''(3/?/E/E+FF &')B%Bs$J!$=$E$E$G!+112S9I9O9O5PP(A(K(KB(O% +112S9I9O9O5PP (:=V=^^r   rw   imageimage_embedsnegative_image_embedsrz   heightwidthrC   rD   guidance_scalerx   rK   output_typecallbackcallback_stepsreturn_dictc           
      B
   [        U[        5      (       a  SnO8[        U[        5      (       a  [        U5      nO[	        S[        U5       35      eU R                  nUU-  nU
S:  nU R                  UUUUU5      u  nnn[        U[        5      (       a  [        R                  " USS9n[        U[        5      (       a  [        R                  " USS9nU(       aN  UR                  USS9nUR                  USS9n[        R                  " XC/SS9R                  UR                  US9n[        U[        5      (       d  U/n[        S U 5       5      (       d)  [	        SU Vs/ s H  n[        U5      PM     sn S	35      e[        R                  " U Vs/ s H  nU R                  R                  UXv5      PM!     snSS9nUR                  UR                  US9nU R                   R#                  U5      S
   nUR                  USS9nU R$                  R'                  UUS9  U R)                  XU5      u  nn[+        U R$                  R,                  R.                  U	-  5      S-
  n[        R0                  " U/U-  UR                  US9nU R2                  R,                  R4                  n[7        XgU R8                  5      u  pgU R;                  UUUUXg4UR                  UUU R$                  5      n[=        U R?                  U5      5       GH  u  nnU(       a  [        R                  " U/S-  5      OUnUUS.nU R3                  UUUUSS9S   nU(       aj  URA                  URB                  S   SS9u  nn URE                  S5      u  n!n"U RE                  S5      u  nn#U!U
U"U!-
  -  -   n[        R                  " UU#/SS9n[G        U R$                  R,                  S5      (       a$  U R$                  R,                  RH                  S;   d   URA                  URB                  S   SS9u  nnU R$                  RK                  UUUUS9RL                  nUb-  UU-  S:X  a$  U[O        U R$                  SS5      -  n$U" U$UU5        [P        (       d  GM  [R        RT                  " 5         GM     U R                   RW                  USS9S   nU RY                  5         US;  a  [	        SU 35      eU R                  R[                  X-5      nU(       d  U4$ []        US9$ s  snf s  snf )a  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`str` or `List[str]`):
        The prompt or prompts to guide the image generation.
    image (`torch.Tensor`, `PIL.Image.Image`):
        `Image`, or tensor representing an image batch, that will be used as the starting point for the
        process.
    image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
        The clip image embeddings for text prompt, that will be used to condition the image generation.
    negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
        The clip image embeddings for negative text prompt, will be used to condition the image generation.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
        if `guidance_scale` is less than `1`).
    height (`int`, *optional*, defaults to 512):
        The height in pixels of the generated image.
    width (`int`, *optional*, defaults to 512):
        The width in pixels of the generated image.
    num_inference_steps (`int`, *optional*, defaults to 100):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    strength (`float`, *optional*, defaults to 0.3):
        Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
        will be used as a starting point, adding more noise to it the larger the `strength`. The number of
        denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
        be maximum and the denoising process will run for the full number of iterations specified in
        `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
    guidance_scale (`float`, *optional*, defaults to 4.0):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
        (`np.array`) or `"pt"` (`torch.Tensor`).
    callback (`Callable`, *optional*):
        A function that calls every `callback_steps` steps during inference. The function is called with the
        following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
    callback_steps (`int`, *optional*, defaults to 1):
        The frequency at which the `callback` function is called. If not specified, the callback is called at
        every step.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

Examples:

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`
r   z2`prompt` has to be of type `str` or `list` but is r   r   rd   )rL   rE   c              3      #    U  H9  n[        U[        R                  R                  [        R                  45      v   M;     g 7fN)rh   PILImagerj   Tensor).0is     r   	<genexpr>4KandinskyImg2ImgPipeline.__call__.<locals>.<genexpr>  s+     Q5a:a#))//5<<!@AA5s   AAzInput is in incorrect format: z:. Currently, we only support  PIL image and pytorch tensorrR   )rE   r   )text_embedsr   F)sampletimestepencoder_hidden_statesadded_cond_kwargsr   variance_type)learnedlearned_range)rK   orderT)force_not_quantizer   )rZ   nppilzIOnly the output types `pt`, `pil` and `np` are supported not output_type=)images)/rh   rs   ri   r4   rN   rq   _execution_devicer   rj   rv   rp   rO   rL   allr9   
preprocessr$   encoder'   set_timestepsrH   r@   r5   num_train_timestepstensorr&   in_channelsr   r7   rU   	enumerateprogress_barsplitrM   chunkhasattrr   stepprev_sampler3   XLA_AVAILABLExm	mark_stepdecodemaybe_free_model_hookspostprocessr   )%r:   rw   r   r   r   rz   r   r   rC   rD   r   rx   rK   r   r   r   r   r{   rE   ry   r   r   _r   rR   timesteps_tensorrS   num_channels_latentstlatent_model_inputr   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idxs%                                        r   __call__!KandinskyImg2ImgPipeline.__call__-  s1   ^ fc""J%%VJQRVW]R^Q_`aa''"77
&4s&:# 8<7J7JF13NP_8
411 lD)) 99\q9L+T22$)II.C$K!&'99:OUV9WL$9$K$KLagh$K$i! 99&;%JPQRUU#))& V L
 %&&GEQ5QQQ051I5a$q'51I0J  KE  F  		V[\V[QR4//::1eLV[\bcd}226B))""5))4++,Aq+I 	$$%8$H040B0BCVbh0i-- dnn33GG(RSVWW,,'8:'EM]McMclrs#yy//;;#F43I3IJ &&-v=&,,NN
 d//0@ABDAq=XG9q=!9^e0=| \)&@"3! #  J +,6,<,<W]]1=MST,<,U)
M5?5E5Ea5H2!?(5(;(;A(>%%.?UfCf1gg
"YY
4F'GQO
 --??NN))77;WW * 0 0q1Aq 0 I
A nn))#	 * 
 k  #N(:a(? CC1g.}O CT 		  T B8L##%11hithuvww$$00D8O"%00s 2J ]s   "T
&T)r9   r7   r   )N   r   d   g333333?g      @r   Nr   Nr   T)$__name__
__module____qualname____firstlineno____doc__model_cpu_offload_seqr   r   r   r
   r   r1   rH   rU   r   rj   r   	IntTensorrQ   no_gradr   EXAMPLE_DOC_STRINGr   rs   r   r   r   r   r@   float	Generatorr   boolr   __static_attributes____classcell__)r<   s   @r   r!   r!   a   s'   & 7
&
 
 '	

 #
 !
88, dDN,, || ??	
 
4 ]]_12 <@#& #%&MQ%*GK #J1c49n%J1 U\\399??D4FSYY__H]]^J1 ll	J1
  %||J1 "%T#Y"78J1 J1 J1 !J1 J1 J1  #J1 E%//43H"HIJJ1 c]J1 8S#u||$<d$BCDJ1  !J1" #J1 3 J1r   r!   )r)   )'typingr   r   r   r   	PIL.Imager   rj   transformersr   r9   r	   modelsr
   r   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   r#   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rn   r   r   r!   r   r   r   <module>r      s    3 2   1 3 ' 
 . C * ))MM			H	%& R6X10 X1r   