
import inspect
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
from transformers.models.clip.modeling_clip import CLIPTextModelOutput

from ...image_processor import VaeImageProcessor
from ...loaders import StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel
from ...models.embeddings import get_timestep_embedding
from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
    USE_PEFT_BACKEND,
    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
    scale_lora_layers,
    unscale_lora_layers,
)
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput, StableDiffusionMixin
from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableUnCLIPPipeline

        >>> pipe = StableUnCLIPPipeline.from_pretrained(
        ...     "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
        ... )  # TODO update model path
        >>> pipe = pipe.to("cuda")

        >>> prompt = "a photo of an astronaut riding a horse on mars"
        >>> images = pipe(prompt).images
        >>> images[0].save("astronaut_horse.png")
        ```
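
        When GPU memory is tight, the submodules can instead be offloaded to the CPU between
        forward passes (a minimal sketch; `enable_model_cpu_offload` requires the `accelerate`
        package):

        ```py
        >>> pipe = StableUnCLIPPipeline.from_pretrained(
        ...     "fusing/stable-unclip-2-1-l", torch_dtype=torch.float16
        ... )  # TODO update model path
        >>> pipe.enable_model_cpu_offload()
        >>> images = pipe("a photo of an astronaut riding a horse on mars").images
        ```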
"""


class StableUnCLIPPipeline(
    DiffusionPipeline, StableDiffusionMixin, TextualInversionLoaderMixin, StableDiffusionLoraLoaderMixin
):
    """
Pipeline for text-to-image generation using stable unCLIP.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

The pipeline also inherits the following loading methods:
    - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
    - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
    - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

Args:
    prior_tokenizer ([`CLIPTokenizer`]):
        A [`CLIPTokenizer`].
    prior_text_encoder ([`CLIPTextModelWithProjection`]):
        Frozen [`CLIPTextModelWithProjection`] text-encoder.
    prior ([`PriorTransformer`]):
        The canonical unCLIP prior to approximate the image embedding from the text embedding.
    prior_scheduler ([`KarrasDiffusionSchedulers`]):
        Scheduler used in the prior denoising process.
    image_normalizer ([`StableUnCLIPImageNormalizer`]):
        Used to normalize the predicted image embeddings before the noise is applied and un-normalize the image
        embeddings after the noise has been applied.
    image_noising_scheduler ([`KarrasDiffusionSchedulers`]):
        Noise schedule for adding noise to the predicted image embeddings. The amount of noise to add is determined
        by the `noise_level`.
    tokenizer ([`CLIPTokenizer`]):
        A [`CLIPTokenizer`].
    text_encoder ([`CLIPTextModel`]):
        Frozen [`CLIPTextModel`] text-encoder.
    unet ([`UNet2DConditionModel`]):
        A [`UNet2DConditionModel`] to denoise the encoded image latents.
    scheduler ([`KarrasDiffusionSchedulers`]):
        A scheduler to be used in combination with `unet` to denoise the encoded image latents.
    vae ([`AutoencoderKL`]):
        Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
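
At a high level, generation happens in two stages: the prior maps the CLIP text embedding to a CLIP
image embedding, which is then noised according to `noise_level` and consumed, together with the text
conditioning, by the UNet while it denoises the image latents.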
priorimage_normalizerz+text_encoder->prior_text_encoder->unet->vaeprior_tokenizerprior_text_encoderprior_schedulerimage_noising_scheduler	tokenizertext_encoderunet	schedulervaec                   > [         TU ]  5         U R                  UUUUUUUUU	U
US9  [        U SS 5      (       a/  S[	        U R
                  R                  R                  5      S-
  -  OSU l        [        U R                  S9U l
        g )N)r+   r,   r)   r-   r*   r.   r/   r0   r1   r2   r3   r3   r    r$      )vae_scale_factor)super__init__register_modulesgetattrlenr3   configblock_out_channelsr6   r   image_processor)selfr+   r,   r)   r-   r*   r.   r/   r0   r1   r2   r3   	__class__s               u/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.pyr8   StableUnCLIPPipeline.__init__   s    $ 	+1+-$;% 	 	
 W^^bdikoVpVpc$((//*L*L&MPQ&Q Rvw0$BWBWX    Ntext_model_outputtext_attention_maskc                    UGc  [        U[        5      (       a  [        U5      OSnU R                  USU R                  R                  SSS9nUR
                  n	UR                  R                  5       R                  U5      n
U R                  USSS9R
                  nUR                  S   U	R                  S   :  a  [        R                  " X5      (       d  U R                  R                  US S 2U R                  R                  S-
  S24   5      n[        R                  S	U R                  R                   S
U 35        U	S S 2S U R                  R                  24   n	U R                  U	R                  U5      5      nUR                   nUR"                  nOUS   R                  S   nUS   US   pUn
UR%                  USS9nUR%                  USS9nU
R%                  USS9n
U(       Ga\  S/U-  nU R                  USU R                  R                  SSS9nUR                  R                  5       R                  U5      nU R                  UR
                  R                  U5      5      nUR                   nUR"                  nUR                  S   nUR'                  SU5      nUR)                  Xs-  U5      nUR                  S   nUR'                  SUS5      nUR)                  Xs-  US5      nUR%                  USS9n[        R*                  " UU/5      n[        R*                  " UU/5      n[        R*                  " UU
/5      n
XU
4$ )Nr$   
max_lengthTptpaddingrG   
truncationreturn_tensorslongestrJ   rL   \The following part of your input was truncated because CLIP can only handle sequences up to 	 tokens: r   )dim )
isinstancelistr;   r+   model_max_length	input_idsattention_maskbooltoshapetorchequalbatch_decodeloggerwarningr,   text_embedslast_hidden_staterepeat_interleaverepeatviewcat)r?   promptdevicenum_images_per_promptdo_classifier_free_guidancerD   rE   
batch_sizetext_inputstext_input_ids	text_maskuntruncated_idsremoved_textprior_text_encoder_outputprompt_embedstext_enc_hid_statesuncond_tokensuncond_inputuncond_text_mask0negative_prompt_embeds_prior_text_encoder_outputnegative_prompt_embedsuncond_text_enc_hid_statesseq_lens                          rA   _encode_prior_prompt)StableUnCLIPPipeline._encode_prior_prompt   s    $(264(@(@VaJ..$//@@# / K )22N#22779<<VDI"2269]a2bllO$$R(N,@,@,DDU[[N N  $33@@#At';';'L'Lq'PSU'U$UV  ,,==>i~W "03ZT5I5I5Z5Z3Z0Z![(,(?(?@Q@QRX@Y(Z%5AAM";"M"M +1-33A6J1B11EGXYZG[.+I%778MST7U1CCDY_`Ca//0E1/M	&D:-M//$//@@# 0 L  ,::??ADDVL?C?V?V&&))&1@< &V%a%a")Y)k)k& -2215G%;%B%B1F[%\"%;%@%@Acel%m"066q9G)C)J)J1Ncef)g&)C)H)H2GR*&  0AABW]^A_ "II'=}&MNM"'))-GI\,]"^		#3Y"?@I9<<rC   rr   rx   
lora_scalec	                     Sn
[        SSU
SS9  U R                  " S	UUUUUUUUS.U	D6n[        R                  " US   US   /5      nU$ )
Nz`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple.z_encode_prompt()1.0.0Fstandard_warn)rg   rh   ri   rj   negative_promptrr   rx   r}   r$   r    )r   encode_promptr\   rf   )r?   rg   rh   ri   rj   r   rr   rx   r}   kwargsdeprecation_messageprompt_embeds_tuples               rA   _encode_prompt#StableUnCLIPPipeline._encode_prompt  s{     a$g/BRWX"00 

"7(C+'#9!

 

 		#6q#9;Nq;Q"RSrC   	clip_skipc
                 
   UbS  [        U [        5      (       a>  Xl        [        (       d  [	        U R
                  U5        O[        U R
                  U5        Ub  [        U[        5      (       a  Sn
O3Ub!  [        U[        5      (       a  [        U5      n
OUR                  S   n
UGc  [        U [        5      (       a  U R                  XR                  5      nU R                  USU R                  R                  SSS9nUR                  nU R                  USSS	9R                  nUR                  S
   UR                  S
   :  a  [         R"                  " X5      (       dj  U R                  R%                  USS2U R                  R                  S-
  S
24   5      n[&        R)                  SU R                  R                   SU 35        [+        U R
                  R,                  S5      (       aA  U R
                  R,                  R.                  (       a  UR0                  R3                  U5      nOSnU	c%  U R                  UR3                  U5      US9nUS   nOQU R                  UR3                  U5      USS9nUS
   U	S-   *    nU R
                  R4                  R7                  U5      nU R
                  b  U R
                  R8                  nO0U R:                  b  U R:                  R8                  nOUR8                  nUR3                  UUS9nUR                  u  nnnUR=                  SUS5      nUR?                  UU-  US
5      nU(       Ga  UGc|  Uc  S/U
-  nOUb;  [A        U5      [A        U5      La$  [C        S[A        U5       S[A        U5       S35      e[        U[        5      (       a  U/nO2U
[        U5      :w  a!  [E        SU S[        U5       SU SU
 S3	5      eUn[        U [        5      (       a  U R                  UU R                  5      nUR                  S   nU R                  USUSSS9n[+        U R
                  R,                  S5      (       aA  U R
                  R,                  R.                  (       a  UR0                  R3                  U5      nOSnU R                  UR                  R3                  U5      US9nUS   nU(       aG  UR                  S   nUR3                  UUS9nUR=                  SUS5      nUR?                  X-  US
5      nU R
                  b6  [        U [        5      (       a!  [        (       a  [G        U R
                  U5        Xg4$ )a,  
Encodes the prompt into text encoder hidden states.

Args:
    prompt (`str` or `List[str]`, *optional*):
        prompt to be encoded
    device (`torch.device`):
        torch device
    num_images_per_prompt (`int`):
        number of images that should be generated per prompt
    do_classifier_free_guidance (`bool`):
        whether to use classifier free guidance or not
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts not to guide the image generation. If not defined, one has to pass
        `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
        less than `1`).
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
        provided, text embeddings will be generated from `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
        weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
        argument.
    lora_scale (`float`, *optional*):
        A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
        """
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

            # dynamically adjust the LoRA scale
            if not USE_PEFT_BACKEND:
                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
            else:
                scale_lora_layers(self.text_encoder, lora_scale)

        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        if prompt_embeds is None:
            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
                text_input_ids, untruncated_ids
            ):
                removed_text = self.tokenizer.batch_decode(
                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
                )
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = text_inputs.attention_mask.to(device)
            else:
                attention_mask = None

            if clip_skip is None:
                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
                prompt_embeds = prompt_embeds[0]
            else:
                prompt_embeds = self.text_encoder(
                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
                )
                # `clip_skip=n` selects hidden state `-(n + 1)` from the encoder, so
                # `clip_skip=1` is the penultimate layer.
                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
                # We also need to re-apply the text model's final LayerNorm here, since only
                # `last_hidden_state` normally passes through it.
                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)

        if self.text_encoder is not None:
            prompt_embeds_dtype = self.text_encoder.dtype
        elif self.unet is not None:
            prompt_embeds_dtype = self.unet.dtype
        else:
            prompt_embeds_dtype = prompt_embeds.dtype

        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)

        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance and negative_prompt_embeds is None:
            uncond_tokens: List[str]
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            # textual inversion: process multi-vector tokens if necessary
            if isinstance(self, TextualInversionLoaderMixin):
                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)

            max_length = prompt_embeds.shape[1]
            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            )

            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
                attention_mask = uncond_input.attention_mask.to(device)
            else:
                attention_mask = None

            negative_prompt_embeds = self.text_encoder(
                uncond_input.input_ids.to(device),
                attention_mask=attention_mask,
            )
            negative_prompt_embeds = negative_prompt_embeds[0]

        if do_classifier_free_guidance:
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

        if self.text_encoder is not None:
            if isinstance(self, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
                # Retrieve the original scale by scaling back the LoRA layers
                unscale_lora_layers(self.text_encoder, lora_scale)

        return prompt_embeds, negative_prompt_embeds

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
    def decode_latents(self, latents):
        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)

        latents = 1 / self.vae.config.scaling_factor * latents
        image = self.vae.decode(latents, return_dict=False)[0]
        image = (image / 2 + 0.5).clamp(0, 1)
        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
        return image

    def prepare_prior_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the prior scheduler step, since not all prior schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in the DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.prior_scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in the DDIM paper: https://huggingface.co/papers/2010.02502
        # and should be between [0, 1]

        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        # check if the scheduler accepts generator
        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
        if accepts_generator:
            extra_step_kwargs["generator"] = generator
        return extra_step_kwargs

    def check_inputs(
        self,
        prompt,
        height,
        width,
        callback_steps,
        noise_level,
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if (callback_steps is None) or (
            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
        ):
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if prompt is not None and prompt_embeds is not None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Please make sure to define only one of the two."
            )

        if prompt is None and prompt_embeds is None:
            raise ValueError(
                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
            )

        if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and negative_prompt_embeds is not None:
            raise ValueError(
                "Provide either `negative_prompt` or `negative_prompt_embeds`. Cannot leave both `negative_prompt`"
                " and `negative_prompt_embeds` undefined."
            )

        if prompt is not None and negative_prompt is not None:
            if type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )

        if prompt_embeds is not None and negative_prompt_embeds is not None:
            if prompt_embeds.shape != negative_prompt_embeds.shape:
                raise ValueError(
                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
                    f" {negative_prompt_embeds.shape}."
                )

        if noise_level < 0 or noise_level >= self.image_noising_scheduler.config.num_train_timesteps:
            raise ValueError(
                f"`noise_level` must be between 0 and {self.image_noising_scheduler.config.num_train_timesteps - 1},"
                " inclusive."
            )

    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
        if latents is None:
            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        else:
            if latents.shape != shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
            latents = latents.to(device)

        latents = latents * scheduler.init_noise_sigma
        return latents

    def noise_image_embeddings(
        self,
        image_embeds: torch.Tensor,
        noise_level: int,
        noise: Optional[torch.Tensor] = None,
        generator: Optional[torch.Generator] = None,
    ):
        """
Add noise to the image embeddings. The amount of noise is controlled by a `noise_level` input. A higher
`noise_level` increases the variance in the final un-noised images.

The noise is applied in two ways:
1. A noise schedule is applied directly to the embeddings.
2. A vector of sinusoidal time embeddings is appended to the output.

In both cases, the amount of noise is controlled by the same `noise_level`.

The embeddings are normalized before the noise is applied and un-normalized after the noise is applied.
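
For intuition: assuming 768-dimensional CLIP image embeddings, the returned tensor is
1536-dimensional, i.e. the noised embedding concatenated with a sinusoidal embedding of
`noise_level` itself. The timestep embedding always matches the width of `image_embeds`,
so the output is twice that width.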
        """
        if noise is None:
            noise = randn_tensor(
                image_embeds.shape, generator=generator, device=image_embeds.device, dtype=image_embeds.dtype
            )

        noise_level = torch.tensor([noise_level] * image_embeds.shape[0], device=image_embeds.device)

        self.image_normalizer.to(image_embeds.device)
        image_embeds = self.image_normalizer.scale(image_embeds)

        image_embeds = self.image_noising_scheduler.add_noise(image_embeds, timesteps=noise_level, noise=noise)

        image_embeds = self.image_normalizer.unscale(image_embeds)

        noise_level = get_timestep_embedding(
            timesteps=noise_level, embedding_dim=image_embeds.shape[-1], flip_sin_to_cos=True, downscale_freq_shift=0
        )

        # `get_timestep_embedding` does not contain any weights and will always return f32 tensors,
        # but we might actually be running in fp16, so we need to cast here.
        noise_level = noise_level.to(image_embeds.dtype)

        image_embeds = torch.cat((image_embeds, noise_level), 1)

        return image_embeds

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 20,
        guidance_scale: float = 10.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        noise_level: int = 0,
        # prior args
        prior_num_inference_steps: int = 25,
        prior_guidance_scale: float = 4.0,
        prior_latents: Optional[torch.Tensor] = None,
        clip_skip: Optional[int] = None,
    ):
        r"""
The call function to the pipeline for generation.

Args:
    prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
    height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The height in pixels of the generated image.
    width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
        The width in pixels of the generated image.
    num_inference_steps (`int`, *optional*, defaults to 20):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    guidance_scale (`float`, *optional*, defaults to 10.0):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    negative_prompt (`str` or `List[str]`, *optional*):
        The prompt or prompts to guide what to not include in image generation. If not defined, you need to
        pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    eta (`float`, *optional*, defaults to 0.0):
        Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
        applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor is generated by sampling using the supplied random `generator`.
    prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
        provided, text embeddings are generated from the `prompt` input argument.
    negative_prompt_embeds (`torch.Tensor`, *optional*):
        Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
        not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated image. Choose between `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
    callback (`Callable`, *optional*):
        A function that calls every `callback_steps` steps during inference. The function is called with the
        following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
    callback_steps (`int`, *optional*, defaults to 1):
        The frequency at which the `callback` function is called. If not specified, the callback is called at
        every step.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
        [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    noise_level (`int`, *optional*, defaults to `0`):
        The amount of noise to add to the image embeddings. A higher `noise_level` increases the variance in
        the final un-noised images. See [`StableUnCLIPPipeline.noise_image_embeddings`] for more details.
    prior_num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps in the prior denoising process. More denoising steps usually lead to a
        higher quality image at the expense of slower inference.
    prior_guidance_scale (`float`, *optional*, defaults to 4.0):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `prior_guidance_scale > 1`.
    prior_latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
        embedding generation in the prior denoising process. Can be used to tweak the same generation with
        different prompts. If not provided, a latents tensor is generated by sampling using the supplied random
        `generator`.
    clip_skip (`int`, *optional*):
        Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
        the output of the pre-final layer will be used for computing the prompt embeddings.
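
The two guidance scales act independently: `prior_guidance_scale` steers the text-to-image-embedding
prior, while `guidance_scale` steers the latent decoder. Setting either to `1` disables
classifier-free guidance for that stage.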
Examples:

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`:
        [`~ pipeline_utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning
        a tuple, the first element is a list with the generated images.
)rg   r   r   r   r   r   rr   rx   Nr$   r   g      ?)rg   rh   ri   rj   r   r    )timestepproj_embeddingencoder_hidden_statesrX   )r   sampler   Fr   )	rg   rh   ri   rj   r   rr   rx   r}   r   )r   r   r   )r[   r   rh   r   r   r2   )r   class_labelsr   r   orderlatentr   )r   )images)0r1   r<   sample_sizer6   r   rT   r   rU   r;   r[   _execution_devicer{   r-   set_timestepsr   r)   r   r   r   r   	enumerateprogress_barr\   rf   scale_model_inputpredicted_image_embeddingchunkr   post_process_latentsgetr   r   
zeros_liker2   in_channelsr   r   r:   XLA_AVAILABLExm	mark_stepr3   r   r   r>   postprocessmaybe_free_model_hooksr"   )2r?   rg   r   r   r   r   r   ri   r   r   r   rr   rx   r   r   r   r   r   r   r   r   r   r   rk   rh   !prior_do_classifier_free_guidanceprior_prompt_embeds prior_text_encoder_hidden_statesprior_text_maskprior_timesteps_tensorr   prior_extra_step_kwargsitlatent_model_inputr    predicted_image_embedding_uncondpredicted_image_embedding_textr   rj   text_encoder_lora_scaler   num_channels_latentsr[   r   
noise_prednoise_pred_uncondnoise_pred_textstep_idxr   s2                                                     rA   __call__StableUnCLIPPipeline.__call__  s   P O499++77$:O:OOM))558M8MM 	)#+'#9 	 		
 *VS"9"9JJvt$<$<VJ&,,Q/J"77
''
 -A3,F) RVQjQj"7(I	 Rk R
N= 	**+DV*T!%!5!5!?!? 

))77,,'%%  
 #'"F"Fy"V d//0FGHDAqCdM?Q+>!?jw!%!5!5!G!GHZ\]!^(,

"2&F. )3 ) (' & 1SlSrSrstSuP02P,LOc25UUP -) !0055)$ *	
 " M #N(:a(?A}-9 I< 

77F$ '5s&:# :P9["&&w5ae 	  150B0B"7(C+'#9. 1C 
1
- '!II'=&MNM 22%# 3 
 '%*%5%5l%C"
 !99&<l%KLL 	$$%8$HNN,,	  $yy//;; K4000J$///	
 &&%%nn ' 
 !::9J d//	:;DAq=XG9q=!9^e!%!A!ABTVW!X "&3)'=! #  J +5?5E5Ea5H2!?.?UfCf1gg
 nn))*aiDUichijklG#N(:a(? CC1g.}7 <: h&HHOOGhhoo.L.L$LZ_O`abcEE$$00K0P 	##%8O"%00rC   )r   r>   r6   )NN)NNNN)NNNNN)NNN)NNN   g      $@Nr$   g        NNNNpilTNr$   Nr      g      @NN)/__name__
__module____qualname____firstlineno____doc___exclude_from_cpu_offloadmodel_cpu_offload_seqr   __annotations__r   r   r   r%   r
   r   r   r8   r   r	   r   r   r\   Tensorr{   r   r   r   r   r   r   r   r   r   	Generatorr   no_gradr   EXAMPLE_DOC_STRINGr   r   rY   r   r   r   r  __static_attributes____classcell__)r@   s   @rA   r'   r'   F   s&   $L ")*< =I #"33.. 2166 
((	#Y '#Y 8	#Y
  #Y 3#Y 6#Y ";#Y !#Y $#Y ##Y -#Y  !#YX JN6:[= $E*=u*D$EF[= &ell3[=H 049=&*  - !) 6 UON 049=&*#'t5  -t5 !) 6t5 UOt5 C=t5n	!$!0 #:z	 )-/3,ll, , %	,
 EOO,,\ ]]_12 37 $##% $;?/0/3*.049=%* GK;?)+&)04#'3q1 sDI~./q1 	q1
 }q1 !q1 q1 "%T#Y"78q1  (}q1 q1 EOO,q1 %,,'q1  -q1 !) 6q1 c]q1  !q1" 8S#u||$<d$BCD#q1$ %q1& !)c3h 8'q1( )q1, $'-q1. $/q10  -1q12 C=3q1 3 q1rC   r'   )9r   typingr   r   r   r   r   r   r	   r\   transformersr
   r   r   &transformers.models.clip.modeling_clipr   r>   r   loadersr   r   modelsr   r   r   models.embeddingsr   models.lorar   
schedulersr   utilsr   r   r   r   r   r   r   utils.torch_utilsr   pipeline_utilsr!   r"   r#   stable_unclip_image_normalizerr%   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr  r_   r  r'   r   rC   rA   <module>r2     s     D D D  R R F 0 R K K 7 9 3   . Y Y G ))MM			H	% $r1+-HJhr1rC   