
    +hK                        S SK r S SKJrJrJr  S SKrS SKrS SKJ	r
  S SKJrJrJrJr  SSKJrJr  SSKJr  SSKJrJr  SS	KJr  S
SKJrJrJr  SSKJr  \" 5       (       a  S SK J!s  J"r#  Sr$OSr$\RJ                  " \&5      r' " S S\\5      r(g)    N)ListOptionalUnion)
functional)CLIPImageProcessorCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjection   )UNet2DConditionModelUNet2DModel)UnCLIPScheduler)is_torch_xla_availablelogging)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput   )UnCLIPTextProjModelTFc                   `  ^  \ rS rSr% SrSr\\S'   \\S'   \	\S'   \
\S'   \\S'   \\S	'   \\S
'   \\S'   \\S'   \\S'   SrS\S\	S\
S\S\S	\S
\S\S\S\4U 4S jjrS rS rS S\\R*                     4S jjr\R.                  " 5                  S!S\\\R4                  R4                  \\R4                  R4                     \R*                  4      S\S\S\S\\R:                     S\\R*                     S\\R*                     S\\R*                     S\S\\   S\ 4S jj5       r!Sr"U =r#$ )"UnCLIPImageVariationPipeline.   aR  
Pipeline to generate image variations from an input image using UnCLIP.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Args:
    text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
        Frozen text-encoder.
    tokenizer ([`~transformers.CLIPTokenizer`]):
        A `CLIPTokenizer` to tokenize text.
    feature_extractor ([`~transformers.CLIPImageProcessor`]):
        Model that extracts features from generated images to be used as inputs for the `image_encoder`.
    image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
        Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
    text_proj ([`UnCLIPTextProjModel`]):
        Utility class to prepare and combine the embeddings before they are passed to the decoder.
    decoder ([`UNet2DConditionModel`]):
        The decoder to invert the image embedding into an image.
    super_res_first ([`UNet2DModel`]):
        Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
    super_res_last ([`UNet2DModel`]):
        Super resolution UNet. Used in the last step of the super resolution diffusion process.
    decoder_scheduler ([`UnCLIPScheduler`]):
        Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
    super_res_scheduler ([`UnCLIPScheduler`]):
        Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).
z0.33.1decoder	text_projtext_encoder	tokenizerfeature_extractorimage_encodersuper_res_firstsuper_res_lastdecoder_schedulersuper_res_schedulerzPtext_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_lastc                 R   > [         TU ]  5         U R                  UUUUUUUUU	U
S9
  g )N)
r   r   r   r   r   r    r!   r"   r#   r$   )super__init__register_modules)selfr   r   r   r   r   r    r!   r"   r#   r$   	__class__s              t/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/unclip/pipeline_unclip_image_variation.pyr'   %UnCLIPImageVariationPipeline.__init__Z   sA     	%/'+)/ 3 	 	
    c                     Uc  [        XX2S9nO<UR                  U:w  a  [        SUR                   SU 35      eUR                  U5      nXVR                  -  nU$ )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r)   r2   r1   r0   r/   latents	schedulers          r+   prepare_latents,UnCLIPImageVariationPipeline.prepare_latentsw   s`    ?"5fZG}}% #A'--P[\a[b!cddjj(G666r-   c                    [        U[        5      (       a  [        U5      OSnU R                  USU R                  R                  SS9nUR
                  nUR                  R                  5       R                  U5      nU R                  UR                  U5      5      n	U	R                  n
U	R                  nU
R                  USS9n
UR                  USS9nUR                  USS9nU(       GaV  S/U-  nUR                  S   nU R                  USUS	SS
9nUR                  R                  5       R                  U5      nU R                  UR
                  R                  U5      5      nUR                  nUR                  nUR                  S   nUR                  SU5      nUR                  XS-  U5      nUR                  S   nUR                  SUS5      nUR                  XS-  US5      nUR                  USS9n[         R"                  " UU
/5      n
[         R"                  " UU/5      n[         R"                  " X/5      nXU4$ )Nr   
max_lengthpt)paddingr;   return_tensorsr   dim T)r=   r;   
truncationr>   )
isinstancelistlenr   model_max_length	input_idsattention_maskboolr4   r   text_embedslast_hidden_staterepeat_interleaver2   repeatviewtorchcat)r)   promptr0   num_images_per_promptdo_classifier_free_guidance
batch_sizetext_inputstext_input_ids	text_masktext_encoder_outputprompt_embedstext_encoder_hidden_statesuncond_tokensr;   uncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lens                       r+   _encode_prompt+UnCLIPImageVariationPipeline._encode_prompt   si   $.vt$<$<S[!
 nn ~~66	 % 
 %....33588@	"//0A0A&0IJ+77%8%J%J"%778MST7U%?%Q%QRgmn%Q%o"//0E1/M	&D:-M'--b1J>>$%# * L  ,::??ADDVL9=9J9J<KaKaKdKdekKl9m6%O%[%["0Z0l0l- -2215G%;%B%B1F[%\"%;%@%@Acel%m"7==a@G0Q0X0XYZ\qst0u-0Q0V0V2GR1-  0AABW]^A_ "II'=}&MNM).4UWq3r)s&		#3"?@I)CCr-   image_embeddingsc                 L   [        U R                  R                  5       5      R                  nUcc  [	        U[
        R                  5      (       d  U R                  USS9R                  nUR                  X%S9nU R                  U5      R                  nUR                  USS9nU$ )Nr<   )imagesr>   )r0   r1   r   r?   )nextr    
parametersr1   rD   rP   Tensorr   pixel_valuesr4   image_embedsrM   )r)   imager0   rS   re   r1   s         r+   _encode_image*UnCLIPImageVariationPipeline._encode_image   s    T''2245;;#eU\\22..eD.Q^^HHFH8E#11%8EE+==>SYZ=[r-   rm   rS   decoder_num_inference_stepssuper_res_num_inference_stepsr/   decoder_latentssuper_res_latentsdecoder_guidance_scaleoutput_typereturn_dictc           	      P   Ub]  [        U[        R                  R                  5      (       a  SnO@[        U[        5      (       a  [	        U5      nOUR
                  S   nOUR
                  S   nS/U-  nU R                  nX-  nU	S:  nU R                  XX/5      u  nnnU R                  XX(5      nU R                  UUUUS9u  nnUR                  S:X  ak  UR                  [        R                  5      n[        R                  " UU R                  R                  S4SS9nUR                  [        R                   5      nO,[        R                  " UU R                  R                  S4S	S9nU R"                  R%                  X>S
9  U R"                  R&                  nU R(                  R*                  R,                  nU R(                  R*                  R.                  nU R(                  R*                  R.                  nUc.  U R1                  UUUU4UR2                  UUUU R"                  5      n[5        U R7                  U5      5       GH  u  nnU(       a  [        R8                  " U/S-  5      OUnU R)                  UUUUUS9R:                  nU(       av  UR=                  S5      u  nnUR?                  UR
                  S   SS9u  nnUR?                  UR
                  S   SS9u  nn UU	UU-
  -  -   n[        R8                  " UU /SS9nUS-   UR
                  S   :X  a  Sn!OUUS-      n!U R"                  RA                  UUUU!US9RB                  nGM     URE                  SS5      nUn"U RF                  R%                  XNS
9  U RF                  R&                  n#U RH                  R*                  R,                  S-  n$U RH                  R*                  R.                  nU RH                  R*                  R.                  nUc.  U R1                  UU$UU4U"R2                  UUUU RF                  5      nUR                  S:X  a  [        RJ                  " U"UU/S9n%OV0 n&S[L        RN                  " [        RJ                  5      RP                  ;   a  S	U&S'   [        RJ                  " U"4UU/SSS.U&D6n%[5        U R7                  U#5      5       H  u  nnUU#R
                  S   S-
  :X  a  U RR                  n'OU RH                  n'[        R8                  " UU%/SS9nU'" UUS9R:                  nUS-   U#R
                  S   :X  a  Sn!OU#US-      n!U RF                  RA                  UUUU!US9RB                  n[T        (       d  M  [V        RX                  " 5         M     UnU R[                  5         US-  S-   nURE                  SS5      nUR]                  5       R_                  SSSS5      Ra                  5       Rc                  5       nU
S:X  a  U Re                  U5      nU(       d  U4$ [g        US9$ )a
  
The call function to the pipeline for generation.

Args:
    image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.Tensor`):
        `Image` or tensor representing an image batch to be used as the starting point. If you provide a
        tensor, it needs to be compatible with the [`CLIPImageProcessor`]
        [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
        Can be left as `None` only when `image_embeddings` are passed.
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    decoder_num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
        image at the expense of slower inference.
    super_res_num_inference_steps (`int`, *optional*, defaults to 7):
        The number of denoising steps for super resolution. More denoising steps usually lead to a higher
        quality image at the expense of slower inference.
    generator (`torch.Generator`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
        Pre-generated noisy latents to be used as inputs for the decoder.
    super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
        Pre-generated noisy latents to be used as inputs for the decoder.
    decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    image_embeddings (`torch.Tensor`, *optional*):
        Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings
        can be passed for tasks like image interpolations. `image` can be left as `None`.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated image. Choose between `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated images.
Nr   r   rA   g      ?)re   rZ   r[   rT   mps)valueT)r0   r   )sampletimestepencoder_hidden_statesclass_labelsrI   r?   )prev_timestepr/   rB   )size	antialiasbicubicF)r   modealign_corners)rz   r{   g      ?r   pil)rg   )4rD   PILImagerE   rF   r2   _execution_devicerc   rn   r   typerP   intFpadclip_extra_context_tokensrJ   r#   set_timesteps	timestepsr   configin_channelssample_sizer8   r1   	enumerateprogress_barrQ   rz   chunksplitstepprev_sampleclampr$   r!   interpolateinspect	signatureri   r"   XLA_AVAILABLExm	mark_stepmaybe_free_model_hookscpupermutefloatnumpynumpy_to_pilr   )(r)   rm   rS   rp   rq   r/   rr   rs   re   rt   ru   rv   rU   rR   r0   rT   rZ   r[   rX   additive_clip_time_embeddingsdecoder_text_maskdecoder_timesteps_tensornum_channels_latentsheightwidthitlatent_model_input
noise_prednoise_pred_uncondnoise_pred_text_predicted_variancer~   image_smallsuper_res_timesteps_tensorchannelsimage_upscaledinterpolate_antialiasunets(                                           r+   __call__%UnCLIPImageVariationPipeline.__call__   s5   n %11
E4(( Z
"[[^
)//2J
"''7
&<s&B#?C?R?R1@
<19  --e=Re EINN-''A(C	 ES E
A"$A ;;% "uyy1I !i$..2Z2Z\]1^fg h 1 6 6uzz B !i$..2Z2Z\]1^fj k,,-H,X#'#9#9#C#C #||22>>$$00##//""22165A*00&&O d//0HIJDAqE`O+<q+@!Afu)&@:0 &  f  +5?5E5Ea5H2!?'8'>'>?Q?W?WXY?Z`a'>'b$!16E6K6KL^LdLdefLgmn6K6o3!3.1G?]nKn1oo
"YY
4F'GQO
1u066q99 $ 8Q ? #4499AmW` : k 3 K: *//A6% 	  ../L.\%)%=%=%G%G"''..::a?%%,,88$$++77$ $ 4 4Xvu5!!!((! ;;%]];fe_MN$&!g//>III59%k2]]#)5/	QVZoN d//0JKLDAq .44Q7!;;**++!&,=~+NTU!V) f 
 1u288;; $ :1q5 A !% 8 8 = =A0Yb !> !k  }5 M8 " 	##% c!Aq!		##Aq!Q/557==?%%%e,E8O"%00r-    )N)Nr         NNNNg       @r   T)$__name__
__module____qualname____firstlineno____doc___last_supported_versionr   __annotations__r   r   r	   r   r
   r   r   model_cpu_offload_seqr'   r8   rc   r   rP   rj   rn   no_gradr   r   r   r   r   	Generatorr   strrJ   r   __static_attributes____classcell__)r*   s   @r+   r   r   .   s   : '!!""--))00  &&((n
%
 2
 !	

 '
 .
 5
 %
 $
 +
 -
:	=D~ T\]b]i]iTj   ]]_ X\%&+--./3264837(+%* _1ciiootCIIOO/DellRST_1  #_1 &)	_1
 (+_1 EOO,_1 "%,,/_1 $ELL1_1 #5<<0_1 !&_1 c]_1 _1 _1r-   r   ))r   typingr   r   r   	PIL.Imager   rP   torch.nnr   r   transformersr   r   r	   r
   modelsr   r   
schedulersr   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r-   r+   <module>r      sw     ( (   $  8 ) 4 - \ \ * ))MM			H	%A1#:<M A1r-   