
    +h;                        S SK JrJrJr  S SKrS SKrS SKJr  SSK	J
r
Jr  SSKJr  SSKJrJrJr  SSKJr  S	S
KJrJrJr  SSKJr  SSKJr  SSKJr  \" 5       (       a  S SKJs  J r!  Sr"OSr"\RF                  " \$5      r%Sr& " S S\\5      r'g)    )ListOptionalUnionN)CLIPTokenizer   )AutoencoderKLUNet2DConditionModel)PNDMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput   )BlipImageProcessor)Blip2QFormerModel)ContextCLIPTextModelTFah  
    Examples:
        ```py
        >>> from diffusers.pipelines import BlipDiffusionPipeline
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained(
        ...     "Salesforce/blipdiffusion", torch_dtype=torch.float16
        ... ).to("cuda")


        >>> cond_subject = "dog"
        >>> tgt_subject = "dog"
        >>> text_prompt_input = "swimming underwater"

        >>> cond_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg"
        ... )
        >>> guidance_scale = 7.5
        >>> num_inference_steps = 25
        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"


        >>> output = blip_diffusion_pipe(
        ...     text_prompt_input,
        ...     cond_image,
        ...     cond_subject,
        ...     tgt_subject,
        ...     guidance_scale=guidance_scale,
        ...     num_inference_steps=num_inference_steps,
        ...     neg_prompt=negative_prompt,
        ...     height=512,
        ...     width=512,
        ... ).images
        >>> output[0].save("image.png")
        ```
c            !         ^  \ rS rSrSrSrSr   S%S\S\S\	S\
S	\S
\S\S\S\\   S\\   4U 4S jjjrS rS&S jrS'S jrS'S jr\R.                  " 5       \" \5                 S(S\\   S\R8                  R8                  S\\   S\\   S\\R<                     S\S\S\S\S\\\R@                  \\R@                     4      S\\   S\S \S!\\   S"\!4S# jj5       5       r"S$r#U =r$$ ))BlipDiffusionPipelineT   aX  
Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion.

This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

Args:
    tokenizer ([`CLIPTokenizer`]):
        Tokenizer for the text encoder
    text_encoder ([`ContextCLIPTextModel`]):
        Text encoder to encode the text prompt
    vae ([`AutoencoderKL`]):
        VAE model to map the latents to the image
    unet ([`UNet2DConditionModel`]):
        Conditional U-Net architecture to denoise the image embedding.
    scheduler ([`PNDMScheduler`]):
         A scheduler to be used in combination with `unet` to generate image latents.
    qformer ([`Blip2QFormerModel`]):
        QFormer model to get multi-modal embeddings from the text and image.
    image_processor ([`BlipImageProcessor`]):
        Image Processor to preprocess and postprocess the image.
    ctx_begin_pos (int, `optional`, defaults to 2):
        Position of the context token in the text encoder.
z0.33.1z qformer->text_encoder->unet->vae	tokenizertext_encodervaeunet	schedulerqformerimage_processorctx_begin_posmeanstdc           
      l   > [         TU ]  5         U R                  UUUUUUUS9  U R                  XU
S9  g )N)r   r   r   r   r   r   r    )r!   r"   r#   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   r    r!   r"   r#   	__class__s              t/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.pyr&   BlipDiffusionPipeline.__init__q   sM     	%+ 	 	
 	mCP    c                 "    U R                  XSS9$ )NF)image_input
text_inputreturn_dict)r   )r)   input_imagesrc_subjects      r+   get_query_embeddings*BlipDiffusionPipeline.get_query_embeddings   s    ||Y^|__r-   c           
          / n[        X5       HJ  u  pgSU SUR                  5        3nUR                  SR                  U/[	        X4-  5      -  5      5        ML     U$ )Nza  z, )zipstripappendjoinint)r)   promptstgt_subjectsprompt_strengthprompt_repsrvprompttgt_subjects           r+   _build_prompt#BlipDiffusionPipeline._build_prompt   s]    #&w#=F+a'78FIIdii3/L+M MNO $>
 	r-   c	                     XX44n	[        U[        5      (       a*  [        U5      U:w  a  [        S[        U5       SU S35      eUc  [	        XXeS9nOUR                  XeS9nXR                  R                  -  nU$ )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatordevicedtype)rH   rI   )
isinstancelistlen
ValueErrorr   tor   init_noise_sigma)
r)   
batch_sizenum_channelsheightwidthrI   rH   rG   latentsshapes
             r+   prepare_latents%BlipDiffusionPipeline.prepare_latents   s    69i&&3y>Z+GA#i.AQ R&<'gi 
 ?"5fZGjjj<G NN;;;r-   c                    U=(       d    U R                   nU R                  R                  R                  R                  nX@R
                  R                  R                  -  nU R                  USSUSS9R                  U5      nUR                  S   nU R                  R                  /U-  nU R                  UR                  UUS9S   nU$ )N
max_lengthTpt)padding
truncationrY   return_tensorsr   )	input_idsctx_embeddingsr!   )_execution_devicer   
text_modelconfigmax_position_embeddingsr   num_query_tokensr   rN   rU   r!   r^   )	r)   query_embedsrB   rH   max_lentokenized_promptrP   r!   text_embeddingss	            r+   encode_prompt#BlipDiffusionPipeline.encode_prompt   s    1411 ##..55MM<<&&777>>  * 
 "V* 	 "''*
223j@++&00'' , 
 	 r-   rB   reference_imagesource_subject_categorytarget_subject_categoryrT   guidance_scalerR   rS   num_inference_stepsrG   
neg_promptr?   r@   output_typer1   c                    U R                   nU R                  R                  X R                  R                  U R                  R
                  SS9S   nUR                  U5      n[        U[        5      (       a  U/n[        U[        5      (       a  U/n[        U[        5      (       a  U/n[        U5      nU R                  UUUUS9nU R                  X#5      nU R                  UUU5      nUS:  nU(       a  U R                  R                  R                  R                  nU R!                  U/U-  SUSS9nU R                  UR"                  R                  U5      SS	9S
   n[$        R&                  " UU/5      nS[        U R(                  R                  R*                  5      S-
  -  nU R-                  UU R(                  R                  R.                  UU-  UU-  U
UU R(                  R0                  US9n0 nU R2                  R4                  " U	40 UD6  [7        U R9                  U R2                  R:                  5      5       H  u  nnUS:  nU(       a  [$        R&                  " U/S-  5      OUnU R)                  UUUSSS9S   nU(       a  UR=                  S5      u  nnUUUU-
  -  -   nU R2                  R?                  UUU5      S   n[@        (       d  M  [B        RD                  " 5         M     U RF                  RI                  XPRF                  R                  RJ                  -  SS9S
   n U R                  RM                  U US9n U RO                  5         U(       d  U 4$ [Q        U S9$ )a/  
Function invoked when calling the pipeline for generation.

Args:
    prompt (`List[str]`):
        The prompt or prompts to guide the image generation.
    reference_image (`PIL.Image.Image`):
        The reference image to condition the generation on.
    source_subject_category (`List[str]`):
        The source subject category.
    target_subject_category (`List[str]`):
        The target subject category.
    latents (`torch.Tensor`, *optional*):
        Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
        generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
        tensor will ge generated by random sampling.
    guidance_scale (`float`, *optional*, defaults to 7.5):
        Guidance scale as defined in [Classifier-Free Diffusion
        Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
        of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
        `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
        the text `prompt`, usually at the expense of lower image quality.
    height (`int`, *optional*, defaults to 512):
        The height of the generated image.
    width (`int`, *optional*, defaults to 512):
        The width of the generated image.
    num_inference_steps (`int`, *optional*, defaults to 50):
        The number of denoising steps. More denoising steps usually lead to a higher quality image at the
        expense of slower inference.
    generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
        One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
        to make generation deterministic.
    neg_prompt (`str`, *optional*, defaults to ""):
        The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
        if `guidance_scale` is less than `1`).
    prompt_strength (`float`, *optional*, defaults to 1.0):
        The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
        to amplify the prompt.
    prompt_reps (`int`, *optional*, defaults to 20):
        The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
    output_type (`str`, *optional*, defaults to `"pil"`):
        The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
        (`np.array`) or `"pt"` (`torch.Tensor`).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Examples:

Returns:
    [`~pipelines.ImagePipelineOutput`] or `tuple`
rZ   )
image_mean	image_stdr]   pixel_values)r=   r>   r?   r@         ?rY   )r[   rY   r]   N)r^   r_   r   r   r   )rP   rQ   rR   rS   rG   rT   rI   rH   )timestepencoder_hidden_statesdown_block_additional_residualsmid_block_additional_residualsampleprev_sampleF)r1   )rq   )images))r`   r    
preprocessrb   r"   r#   rN   rJ   strrL   rD   r4   ri   r   ra   rc   r   r^   torchcatr   block_out_channelsrV   in_channelsrI   r   set_timesteps	enumerateprogress_bar	timestepschunkstepXLA_AVAILABLExm	mark_stepr   decodescaling_factorpostprocessmaybe_free_model_hooksr   )!r)   rB   rk   rl   rm   rT   rn   rR   rS   ro   rG   rp   r?   r@   rq   r1   rH   rP   re   rh   do_classifier_free_guidancerY   uncond_inputuncond_embeddingsscale_down_factorextra_set_kwargsitlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textimages!                                    r+   __call__BlipDiffusionPipeline.__call__   sw   L ''..99(8(8DKKOOdh : 

 *,,V4fc""XF-s33'>&?#-s33'>&?#[
##0+#	 $ 
 00Z,,\66J&4s&:#&**55<<TTJ>>z)$%#	 * L !% 1 1&0033F;# !2 ! ! $ii):O(LMO#dii&6&6&I&I"JQ"NO&&!))55..,,))// ' 	
 $$%8M<LMd//0H0HIJDAq*83*>'=XG9q=!9^e"&504.2 #  J +5?5E5Ea5H2!?.?UfCf1gg
nn)) 	G }5 K8 ((//*H*H HV[\]^_$$00K0P 	##%8O"%00r-    )r   NN)rv      )N)Ng      @   r   2   N rv   r   pilT)%__name__
__module____qualname____firstlineno____doc___last_supported_versionmodel_cpu_offload_seqr   r   r   r	   r
   r   r   r<   r   floatr&   r4   rD   rV   ri   r   no_gradr   EXAMPLE_DOC_STRINGr   PILImager   Tensorr   	Generatorboolr   __static_attributes____classcell__)r*   s   @r+   r   r   T   s   2 '>  Q Q +Q 	Q
 #Q !Q #Q ,Q Q 5kQ %[Q Q4`"4 ]]_12 +/ ##%MQ$&!$%* !c1S	c1 c1 "&c	c1
 "&cc1 %,,'c1 c1 c1 c1 !c1 E%//43H"HIJc1 SMc1 c1 c1 c]c1  !c1 3 c1r-   r   )(typingr   r   r   	PIL.Imager   r   transformersr   modelsr   r	   
schedulersr
   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   r   blip_image_processingr   modeling_blip2r   modeling_ctx_clipr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r   r-   r+   <module>r      s    ) (   & 9 ' 
 . \ \ 5 - 3 ))MM			H	%% PU135F U1r-   