
    +hE                         S SK JrJrJr  S SKrS SKrS SKJr  SSK	J
r
JrJr  SSKJr  SSKJrJrJr  SSKJr  S	S
KJr  S	SKJr  S	SKJr  S	SKJrJrJr  \" 5       (       a  S SKJ s  J!r"  Sr#OSr#\RH                  " \%5      r&Sr' " S S\\5      r(g)    )ListOptionalUnionN)CLIPTokenizer   )AutoencoderKLControlNetModelUNet2DConditionModel)PNDMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )BlipImageProcessor)Blip2QFormerModel)ContextCLIPTextModel)DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutputTFa  
    Examples:
        ```py
        >>> from diffusers.pipelines import BlipDiffusionControlNetPipeline
        >>> from diffusers.utils import load_image
        >>> from controlnet_aux import CannyDetector
        >>> import torch

        >>> blip_diffusion_pipe = BlipDiffusionControlNetPipeline.from_pretrained(
        ...     "Salesforce/blipdiffusion-controlnet", torch_dtype=torch.float16
        ... ).to("cuda")

        >>> style_subject = "flower"
        >>> tgt_subject = "teapot"
        >>> text_prompt = "on a marble table"

        >>> cldm_cond_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg"
        ... ).resize((512, 512))
        >>> canny = CannyDetector()
        >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil")
        >>> style_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/flower.jpg"
        ... )
        >>> guidance_scale = 7.5
        >>> num_inference_steps = 50
        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"


        >>> output = blip_diffusion_pipe(
        ...     text_prompt,
        ...     style_image,
        ...     cldm_cond_image,
        ...     style_subject,
        ...     tgt_subject,
        ...     guidance_scale=guidance_scale,
        ...     num_inference_steps=num_inference_steps,
        ...     neg_prompt=negative_prompt,
        ...     height=512,
        ...     width=512,
        ... ).images
        >>> output[0].save("image.png")
        ```
c            #         ^  \ rS rSrSrSrSr   S(S\S\S\	S\
S	\S
\S\S\S\S\\   S\\   4U 4S jjjrS rS)S jrS*S jrS*S jr S+S jr\R2                  " 5       \" \5                 S,S\\   S\R<                  R<                  S\R<                  R<                  S\\   S\\   S\\R@                     S\S\S\S\S \\!\RD                  \\RD                     4      S!\\   S"\S#\S$\\   S%\#4 S& jj5       5       r$S'r%U =r&$ )-BlipDiffusionControlNetPipeline[   a  
Pipeline for Canny Edge based Controlled subject-driven generation using Blip Diffusion.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    tokenizer ([`CLIPTokenizer`]):
        Tokenizer for the text encoder
    text_encoder ([`ContextCLIPTextModel`]):
        Text encoder to encode the text prompt
    vae ([`AutoencoderKL`]):
        VAE model to map the latents to the image
    unet ([`UNet2DConditionModel`]):
        Conditional U-Net architecture to denoise the image embedding.
    scheduler ([`PNDMScheduler`]):
         A scheduler to be used in combination with `unet` to generate image latents.
    qformer ([`Blip2QFormerModel`]):
        QFormer model to get multi-modal embeddings from the text and image.
    controlnet ([`ControlNetModel`]):
        ControlNet model to get the conditioning image embedding.
    image_processor ([`BlipImageProcessor`]):
        Image Processor to preprocess and postprocess the image.
    ctx_begin_pos (int, `optional`, defaults to 2):
        Position of the context token in the text encoder.
z0.33.1z qformer->text_encoder->unet->vae	tokenizertext_encodervaeunet	schedulerqformer
controlnetimage_processorctx_begin_posmeanstdc                 n   > [         TU ]  5         U R                  UUUUUUUUS9  U R                  XUS9  g )N)r   r   r   r   r   r   r    r!   )r"   r#   r$   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   r    r!   r"   r#   r$   	__class__s               {/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.pyr'   (BlipDiffusionControlNetPipeline.__init__z   sP     	%!+ 	 		
 	mCP    c                 "    U R                  XSS9$ )NF)image_input
text_inputreturn_dict)r   )r*   input_imagesrc_subjects      r,   get_query_embeddings4BlipDiffusionControlNetPipeline.get_query_embeddings   s    ||Y^|__r.   c           
          / n[        X5       HJ  u  pgSU SUR                  5        3nUR                  SR                  U/[	        X4-  5      -  5      5        ML     U$ )Nza  z, )zipstripappendjoinint)r*   promptstgt_subjectsprompt_strengthprompt_repsrvprompttgt_subjects           r,   _build_prompt-BlipDiffusionControlNetPipeline._build_prompt   s]    #&w#=F+a'78FIIdii3/L+M MNO $>
 	r.   c	                     XX44n	[        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      eUc  [	        XXeS9nOUR                  XeS9nXR                  R                  -  nU$ )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatordevicedtyperI   rJ   )
isinstancelistlen
ValueErrorr   tor   init_noise_sigma)
r*   
batch_sizenum_channelsheightwidthrJ   rI   rH   latentsshapes
             r,   prepare_latents/BlipDiffusionControlNetPipeline.prepare_latents   s    69i&&3y>Z+GA#i.AQ R&<'gi 
 ?"5fZGjjj<G NN;;;r.   c                    U=(       d    U R                   nU R                  R                  R                  R                  nX@R
                  R                  R                  -  nU R                  USSUSS9R                  U5      nUR                  S   nU R                  R                  /U-  nU R                  UR                  UUS9S   nU$ )N
max_lengthTpt)padding
truncationr[   return_tensorsr   )	input_idsctx_embeddingsr"   )_execution_devicer   
text_modelconfigmax_position_embeddingsr   num_query_tokensr   rP   rW   r"   r`   )	r*   query_embedsrC   rI   max_lentokenized_promptrR   r"   text_embeddingss	            r,   encode_prompt-BlipDiffusionControlNetPipeline.encode_prompt   s    1411 ##..55MM<<&&777>>  * 
 "V* 	 "''*
223j@++&00'' , 
 	 r.   c	           	         U R                   R                  UX#S.SSSSS9S   R                  U5      nUR                  S   n	U	S:X  a  Un
OUn
UR	                  U
SS	9nUR                  XgS
9nU(       a  [
        R                  " U/S-  5      nU$ )N)rU   rT   TFr\   )size
do_rescaledo_center_cropdo_normalizer_   pixel_valuesr      )dimrK   r   )r!   
preprocessrP   rW   repeat_interleavetorchcat)r*   imagerU   rT   rR   num_images_per_promptrI   rJ   do_classifier_free_guidanceimage_batch_size	repeat_bys              r,   prepare_control_image5BlipDiffusionControlNetPipeline.prepare_control_image   s     $$// 3  0 
  "V* 	 !;;q>q "I .I''	q'94&IIugk*Er.   rC   reference_imagecondtioning_imagesource_subject_categorytarget_subject_categoryrV   guidance_scalerT   rU   num_inference_stepsrH   
neg_promptr@   rA   output_typer2   c                 b   U R                   nU R                  R                  X R                  R                  U R                  R
                  SS9S   nUR                  U5      n[        U[        5      (       a  U/n[        U[        5      (       a  U/n[        U[        5      (       a  U/n[        U5      nU R                  UUUUS9nU R                  X$5      nU R                  UUU5      nUS:  nU(       a  U R                  R                  R                  R                  nU R!                  U/U-  SUSS9nU R                  UR"                  R                  U5      SS	9S
   n[$        R&                  " UU/5      nS[        U R(                  R                  R*                  5      S-
  -  nU R-                  UU R(                  R                  R.                  UU-  U	U-  UUU R(                  R0                  US9n0 nU R2                  R4                  " U
40 UD6  U R7                  UU	UUSUU R8                  R0                  US9n[;        U R=                  U R2                  R>                  5      5       H  u  nnUS:  nU(       a  [$        R&                  " U/S-  5      OUnU R9                  UUUUSS9u  nn U R)                  UUUUU S9S   n!U(       a  U!RA                  S5      u  n"n#U"UU#U"-
  -  -   n!U R2                  RC                  U!UU5      S   n[D        (       d  M  [F        RH                  " 5         M     U RJ                  RM                  X`RJ                  R                  RN                  -  SS9S
   n$U R                  RQ                  U$US9n$U RS                  5         U(       d  U$4$ [U        U$S9$ )a
  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`List[str]`):
        The prompt or prompts to guide the image generation.
    reference_image (`PIL.Image.Image`):
        The reference image to condition the generation on.
    condtioning_image (`PIL.Image.Image`):
        The conditioning canny edge image to condition the generation on.
    source_subject_category (`List[str]`):
        The source subject category.
    target_subject_category (`List[str]`):
        The target subject category.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will ge generated by random sampling.
    guidance_scale (`float`, *optional*, defaults to 7.5):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    height (`int`, *optional*, defaults to 512):
        The height of the generated image.
    width (`int`, *optional*, defaults to 512):
        The width of the generated image.
    seed (`int`, *optional*, defaults to 42):
        The seed to use for random generation.
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    neg_prompt (`str`, *optional*, defaults to ""):
        The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
        if `guidance_scale` is less than `1`).
    prompt_strength (`float`, *optional*, defaults to 1.0):
        The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
        to amplify the prompt.
    prompt_reps (`int`, *optional*, defaults to 20):
        The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
Examples:

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`
r\   )
image_mean	image_stdr_   rr   )r>   r?   r@   rA         ?r[   )r]   r[   r_   N)r`   ra   r   r   rs   )rR   rS   rT   rU   rH   rV   rJ   rI   )ry   rU   rT   rR   rz   rI   rJ   r{   F)encoder_hidden_statescontrolnet_condr2   )timestepr   down_block_additional_residualsmid_block_additional_residualsampleprev_sample)r2   )r   )images)+rb   r!   ru   rd   r#   r$   rP   rL   strrN   rE   r5   rk   r   rc   re   r   r`   rw   rx   r   block_out_channelsrX   in_channelsrJ   r   set_timestepsr~   r    	enumerateprogress_bar	timestepschunkstepXLA_AVAILABLExm	mark_stepr   decodescaling_factorpostprocessmaybe_free_model_hooksr   )%r*   rC   r   r   r   r   rV   r   rT   rU   r   rH   r   r@   rA   r   r2   rI   rR   rg   rj   r{   r[   uncond_inputuncond_embeddingsscale_down_factorextra_set_kwargs
cond_imageitlatent_model_inputdown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textry   s%                                        r,   __call__(BlipDiffusionControlNetPipeline.__call__   s   L ''..99(8(8DKKOOdh : 

 *,,V4fc""XF-s33'>&?#-s33'>&?#[
##0+#	 $ 
 00Z,,\66J&4s&:#&**55<<TTJ>>z)$%#	 * L !% 1 1&0033F;# !2 ! ! $ii):O(LMO#dii&6&6&I&I"JQ"NO&&!))55..,,))// ' 	
 $$%8M<LM//#!"#//''(C 0 	

 d//0H0HIJDAq*83*>'=XG9q=!9^e;???"&5 *! <K <8"$8 "&50F.B #  J +5?5E5Ea5H2!?.?UfCf1gg
nn)) 	G }C KF ((//*H*H HV[\]^_$$00K0P 	##%8O"%00r.    )r   NN)r      )N)F)Ng      @   r   2   N r   r   pilT)'__name__
__module____qualname____firstlineno____doc___last_supported_versionmodel_cpu_offload_seqr   r   r   r
   r   r   r	   r   r=   r   floatr'   r5   rE   rX   rk   r~   rw   no_gradr   EXAMPLE_DOC_STRINGr   PILImager   Tensorr   	Generatorboolr   __static_attributes____classcell__)r+   s   @r,   r   r   [   s   6 '>  Q Q +Q 	Q
 #Q !Q #Q $Q ,Q Q 5kQ %[Q Q8`"H %*"H ]]_12 +/ ##%MQ$&!$%* #u1S	u1 u1 99??	u1
 "&cu1 "&cu1 %,,'u1 u1 u1 u1 !u1 E%//43H"HIJu1 SMu1 u1 u1  c]!u1" #u1 3 u1r.   r   ))typingr   r   r   	PIL.Imager   rw   transformersr   modelsr   r	   r
   
schedulersr   utilsr   r   r   utils.torch_utilsr   $blip_diffusion.blip_image_processingr   blip_diffusion.modeling_blip2r    blip_diffusion.modeling_ctx_clipr   pipeline_utilsr   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r   r.   r,   <module>r      s    ) (   & J J ' 
 . E = C \ \ ))MM			H	%+ \P1&=?P P1r.   