
    +hW                         S SK r S SKJrJrJrJr  S SKrS SKJr	  S SK
JrJr  S SKJr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJrJrJr  SSKJr  \" 5       (       a  S SK J!s  J"r#  Sr$OSr$\RJ                  " \&5      r' " S S\\5      r(g)    N)ListOptionalTupleUnion)
functional)CLIPTextModelWithProjectionCLIPTokenizer)CLIPTextModelOutput   )PriorTransformerUNet2DConditionModelUNet2DModel)UnCLIPScheduler)is_torch_xla_availablelogging)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput   )UnCLIPTextProjModelTFc                    x  ^  \ rS rSr% SrSrS/r\\S'   \	\S'   \
\S'   \\S'   \\S'   \\S	'   \\S
'   \\S'   \\S'   \\S'   SrS\S\	S\S\S\
S	\S
\S\S\S\4U 4S jjrS r  S#S\\\\4      S\\R.                     4S jjr\R2                  " 5                      S$S\\\\\   4      S\S\S\S\S\\\R:                  \\R:                     4      S\\R.                     S\\R.                     S\\R.                     S\\\\4      S\\R.                     S\S\S\\   S \4S! jj5       r S"r!U =r"$ )%UnCLIPPipeline)   a  
Pipeline for text-to-image generation using unCLIP.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
implemented for all pipelines (downloading, saving, running on a particular device, etc.).

Args:
    text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
        Frozen text-encoder.
    tokenizer ([`~transformers.CLIPTokenizer`]):
        A `CLIPTokenizer` to tokenize text.
    prior ([`PriorTransformer`]):
        The canonical unCLIP prior to approximate the image embedding from the text embedding.
    text_proj ([`UnCLIPTextProjModel`]):
        Utility class to prepare and combine the embeddings before they are passed to the decoder.
    decoder ([`UNet2DConditionModel`]):
        The decoder to invert the image embedding into an image.
    super_res_first ([`UNet2DModel`]):
        Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
    super_res_last ([`UNet2DModel`]):
        Super resolution UNet. Used in the last step of the super resolution diffusion process.
    prior_scheduler ([`UnCLIPScheduler`]):
        Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
    decoder_scheduler ([`UnCLIPScheduler`]):
        Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
    super_res_scheduler ([`UnCLIPScheduler`]):
        Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).

z0.33.1priordecoder	text_projtext_encoder	tokenizersuper_res_firstsuper_res_lastprior_schedulerdecoder_schedulersuper_res_schedulerzAtext_encoder->text_proj->decoder->super_res_first->super_res_lastc                 R   > [         TU ]  5         U R                  UUUUUUUUU	U
S9
  g )N)
r   r   r   r    r   r!   r"   r#   r$   r%   )super__init__register_modules)selfr   r   r   r    r   r!   r"   r#   r$   r%   	__class__s              d/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/unclip/pipeline_unclip.pyr(   UnCLIPPipeline.__init__Y   sA     	%+)+/ 3 	 	
    c                     Uc  [        XX2S9nO<UR                  U:w  a  [        SUR                   SU 35      eUR                  U5      nXVR                  -  nU$ )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r*   r3   r2   r1   r0   latents	schedulers          r,   prepare_latentsUnCLIPPipeline.prepare_latentsu   s`    ?"5fZG}}% #A'--P[\a[b!cddjj(G666r.   text_model_outputtext_attention_maskc                    UGc  [        U[        5      (       a  [        U5      OSnU R                  USU R                  R                  SSS9nUR
                  n	UR                  R                  5       R                  U5      n
U R                  USSS9R
                  nUR                  S   U	R                  S   :  a  [        R                  " X5      (       d  U R                  R                  US S 2U R                  R                  S-
  S24   5      n[        R                  S	U R                  R                   S
U 35        U	S S 2S U R                  R                  24   n	U R                  U	R                  U5      5      nUR                   nUR"                  nOUS   R                  S   nUS   US   pUn
UR%                  USS9nUR%                  USS9nU
R%                  USS9n
U(       Ga\  S/U-  nU R                  USU R                  R                  SSS9nUR                  R                  5       R                  U5      nU R                  UR
                  R                  U5      5      nUR                   nUR"                  nUR                  S   nUR'                  SU5      nUR)                  Xs-  U5      nUR                  S   nUR'                  SUS5      nUR)                  Xs-  US5      nUR%                  USS9n[        R*                  " UU/5      n[        R*                  " UU/5      n[        R*                  " UU
/5      n
XU
4$ )Nr   
max_lengthTpt)paddingr>   
truncationreturn_tensorslongest)r@   rB   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: r   dim )
isinstancelistlenr    model_max_length	input_idsattention_maskboolr5   r3   torchequalbatch_decodeloggerwarningr   text_embedslast_hidden_staterepeat_interleaverepeatviewcat)r*   promptr1   num_images_per_promptdo_classifier_free_guidancer;   r<   
batch_sizetext_inputstext_input_ids	text_maskuntruncated_idsremoved_texttext_encoder_outputprompt_embedstext_enc_hid_statesuncond_tokensuncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embedsuncond_text_enc_hid_statesseq_lens                          r,   _encode_promptUnCLIPPipeline._encode_prompt   sr    $(264(@(@VaJ..$>>::# ) K )22N#22779<<VDI"nnVYW[n\ffO$$R(N,@,@,DDU[[N N  $~~::#At~~'F'F'JR'O$OP  778	,Q "03TT^^5T5T3T0T!U"&"3"3N4E4Ef4M"N/;;M"5"G"G +1-33A6J1B11EGXYZG[.+I%778MST7U1CCDY_`Ca//0E1/M	&D:-M>>$>>::# * L  ,::??ADDVL9=9J9J<KaKaKdKdekKl9m6%O%[%[")S)e)e& -2215G%;%B%B1F[%\"%;%@%@Acel%m"066q9G)C)J)J1Ncef)g&)C)H)H2GR*&  0AABW]^A_ "II'=}&MNM"'))-GI\,]"^		#3Y"?@I9<<r.   rZ   r[   prior_num_inference_stepsdecoder_num_inference_stepssuper_res_num_inference_stepsr0   prior_latentsdecoder_latentssuper_res_latentsprior_guidance_scaledecoder_guidance_scaleoutput_typereturn_dictc           	         UbP  [        U[        5      (       a  SnOJ[        U[        5      (       a  [        U5      nO)[	        S[        U5       35      eU
S   R                  S   nU R                  nUU-  nUS:  =(       d    US:  nU R                  UUUUX5      u  nnnU R                  R                  UUS9  U R                  R                  nU R                  R                  R                  nU R                  UU4UR                   UUUU R                  5      n[#        U R%                  U5      5       H  u  nnU(       a  [&        R(                  " U/S-  5      OUnU R                  UUUUUS9R*                  nU(       a  UR-                  S5      u  nnUUUU-
  -  -   nUS-   UR                  S   :X  a  SnOUUS-      nU R                  R/                  UUUUUS	9R0                  nM     U R                  R3                  U5      nUnU R5                  UUUUS
9u  nn UR
                  S:X  ak  UR                  [&        R6                  5      n[8        R:                  " UU R4                  R<                  S4SS9n!U!R                  [&        R>                  5      n!O,[8        R:                  " UU R4                  R<                  S4SS9n!U R@                  R                  UUS9  U R@                  R                  n"U RB                  R                  RD                  n#U RB                  R                  RF                  n$U RB                  R                  RF                  n%U R                  UU#U$U%4UR                   UUUU R@                  5      n[#        U R%                  U"5      5       GH  u  nnU(       a  [&        R(                  " U/S-  5      OUnU RC                  UUUU U!S9RH                  n&U(       av  U&R-                  S5      u  n'n(U'RK                  UR                  S   SS9u  n'n)U(RK                  UR                  S   SS9u  n(n*U'UU(U'-
  -  -   n&[&        R(                  " U&U*/SS9n&US-   U"R                  S   :X  a  SnOU"US-      nU R@                  R/                  U&UUUUS9R0                  nGM     URM                  SS5      nUn+U RN                  R                  UUS9  U RN                  R                  n,U RP                  R                  RD                  S-  n-U RP                  R                  RF                  n$U RP                  R                  RF                  n%U R                  UU-U$U%4U+R                   UUU	U RN                  5      n	UR
                  S:X  a  [8        RR                  " U+U$U%/S9n.OV0 n/S[T        RV                  " [8        RR                  5      RX                  ;   a  SU/S'   [8        RR                  " U+4U$U%/SSS.U/D6n.[#        U R%                  U,5      5       H  u  nnUU,R                  S   S-
  :X  a  U RZ                  n0OU RP                  n0[&        R(                  " U	U./SS9nU0" UUS9RH                  n&US-   U,R                  S   :X  a  SnOU,US-      nU RN                  R/                  U&UU	UUS9R0                  n	[\        (       d  M  [^        R`                  " 5         M     U	n1U Rc                  5         U1S-  S-   n1U1RM                  SS5      n1U1Re                  5       Rg                  SSSS5      Ri                  5       Rk                  5       n1US:X  a  U Rm                  U15      n1U(       d  U14$ [o        U1S9$ )a  
The call function to the pipeline for generation.

Args:
    prompt (`str` or `List[str]`):
        The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
        and `text_attention_mask` is passed.
    num_images_per_prompt (`int`, *optional*, defaults to 1):
        The number of images to generate per prompt.
    prior_num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
        image at the expense of slower inference.
    decoder_num_inference_steps (`int`, *optional*, defaults to 25):
        The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
        image at the expense of slower inference.
    super_res_num_inference_steps (`int`, *optional*, defaults to 7):
        The number of denoising steps for super resolution. More denoising steps usually lead to a higher
        quality image at the expense of slower inference.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
        generation deterministic.
    prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
        Pre-generated noisy latents to be used as inputs for the prior.
    decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
        Pre-generated noisy latents to be used as inputs for the decoder.
    super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
        Pre-generated noisy latents to be used as inputs for the decoder.
    prior_guidance_scale (`float`, *optional*, defaults to 4.0):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
        A higher guidance scale value encourages the model to generate images closely linked to the text
        `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
    text_model_output (`CLIPTextModelOutput`, *optional*):
        Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
        outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
        `text_attention_mask` in this case. `prompt` can the be left `None`.
    text_attention_mask (`torch.Tensor`, *optional*):
        Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
        masks are necessary when passing `text_model_output`.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generated image. Choose between `PIL.Image` or `np.array`.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`:
        If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
        returned where the first element is a list with the generated images.
Nr   z2`prompt` has to be of type `str` or `list` but is r   g      ?)r1   r   )timestepproj_embeddingencoder_hidden_statesrM   )rz   sampler0   prev_timestep)image_embeddingsrd   text_encoder_hidden_statesr\   mps)valueT)r}   rz   r|   class_labelsrM   rE   )r~   r0   rD   )size	antialiasbicubicF)r   modealign_corners)r}   rz   g      ?r   pil)images)8rH   strrI   rJ   r4   typer3   _execution_devicerm   r#   set_timesteps	timestepsr   configembedding_dimr9   r2   	enumerateprogress_barrO   rY   predicted_image_embeddingchunkstepprev_samplepost_process_latentsr   intFpadclip_extra_context_tokensrN   r$   r   in_channelssample_sizer}   splitclampr%   r!   interpolateinspect	signature
parametersr"   XLA_AVAILABLExm	mark_stepmaybe_free_model_hookscpupermutefloatnumpynumpy_to_pilr   )2r*   rZ   r[   ro   rp   rq   r0   rr   rs   rt   r;   r<   ru   rv   rw   rx   r]   r1   r\   rd   re   r`   prior_timesteps_tensorr   itlatent_model_inputr    predicted_image_embedding_uncondpredicted_image_embedding_textr~   r   additive_clip_time_embeddingsdecoder_text_maskdecoder_timesteps_tensornum_channels_latentsheightwidth
noise_prednoise_pred_uncondnoise_pred_text_predicted_varianceimage_smallsuper_res_timesteps_tensorchannelsimage_upscaledinterpolate_antialiasunetimages2                                                     r,   __call__UnCLIPPipeline.__call__   s   J &#&&
FD)) [
 #UVZ[aVbUc!dee*1-33A6J''"77
&:S&@&`DZ]`D`#8<8K8KF13NPa9
5*I 	**+DV*T!%!5!5!?!?

))77,,'  
 d//0FGHDAqC^M?Q+>!?dq(,

",&9( )3 ) (' & +SlSrSrstSuP02P,LOc25UUP -) 1u.44Q77 $ 6q1u = 0055)$#+ 6  k / I> 

77F( >B^^-'':(C	 >L >
:: ;;% "uyy1I !i$..2Z2Z\]1^fg h 1 6 6uzz B !i$..2Z2Z\]1^fj k,,-HQW,X#'#9#9#C#C #||22>>$$00##//..-vu=%%""
 d//0HIJDAqE`O+<q+@!Afu)&9:0 &  f  +5?5E5Ea5H2!?'8'>'>?Q?W?WXY?Z`a'>'b$!16E6K6KL^LdLdefLgmn6K6o3!3.1G?]nKn1oo
"YY
4F'GQO
1u066q99 $ 8Q ? #4499AmW` : k 3 K: *//A6% 	  ../LU[.\%)%=%=%G%G"''..::a?%%,,88$$++77 00651$$
 ;;%]];fe_MN$&!g//>III59%k2]]#)5/	QVZoN d//0JKLDAq .44Q7!;;**++!&,=~+NTU!V) f 
 1u288;; $ :1q5 A !% 8 8 = =A0Yb !> !k  }5 M8 " 	##% c!Aq!		##Aq!Q/557==?%%%e,E8O"%00r.    )NN)Nr      r      NNNNNNg      @g       @r   T)#__name__
__module____qualname____firstlineno____doc___last_supported_version_exclude_from_cpu_offloadr   __annotations__r   r   r   r	   r   r   model_cpu_offload_seqr(   r9   r   r   r
   r   rO   Tensorrm   no_gradr   r   r   	Generatorr   rN   r   __static_attributes____classcell__)r+   s   @r,   r   r   )   sZ   < '!(	!!""--  $$&&((_

 &
 2	

 !
 '
 %
 $
 )
 +
 -
8	" JN6:Y= $E*=u*D$EFY= &ell3Y=v ]]_ 37%&)++--.MQ042648IM6:&)(+%* !\1sDI~./\1  #\1 $'	\1
 &)\1 (+\1 E%//43H"HIJ\1  -\1 "%,,/\1 $ELL1\1 $E*=u*D$EF\1 &ell3\1 $\1 !&\1 c]\1  !\1 \1r.   r   ))r   typingr   r   r   r   rO   torch.nnr   r   transformersr   r	   &transformers.models.clip.modeling_clipr
   modelsr   r   r   
schedulersr   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rR   r   r   r.   r,   <module>r      sq     / /  $ C F I I ) 4 - \ \ * ))MM			H	%O1,.? O1r.   