
    +h1M                        S SK Jr  S SKJrJrJrJr  S SKrS SKJ	s  J
r  S SKJ	r	  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJr  SSKJr  SSKJ r   SSK!J"r"J#r#J$r$  \ " S S\5      5       r% " S S\\5      r&g)    )	dataclass)DictOptionalTupleUnionN)nn   )ConfigMixinregister_to_config)ConsistencyDecoderScheduler)
BaseOutput)apply_forward_hook)randn_tensor   )ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORSAttentionProcessorAttnAddedKVProcessorAttnProcessor)
ModelMixin)UNet2DModel   )DecoderOutputDiagonalGaussianDistributionEncoderc                   $    \ rS rSr% SrS\S'   Srg)ConsistencyDecoderVAEOutput&   a  
Output of encoding method.

Args:
    latent_dist (`DiagonalGaussianDistribution`):
        Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
        `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
r   latent_dist N)__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r        o/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/autoencoders/consistency_decoder_vae.pyr   r   &   s     0/r(   r   c            4         ^  \ rS rSrSrSr\                        S9S\S\S\S\	S\
\S	4   S
\S\
\	S	4   S\S\S\S\S\S\
\S	4   S\
\	S	4   S\S\S\S\S\S\S\S\	S\	S\
\	S	4   40U 4S jjj5       rS:S\4S jjrS  rS! rS" r\S#\\	\4   4S$ j5       rS%\\\\	\4   4   4S& jrS' r\ S:S(\R4                  S)\S#\\\
\   4   4S* jj5       r\   S;S+\R4                  S,\\R>                     S)\S-\S#\\ \
\R4                     4   4
S. jj5       r!S/\R4                  S0\R4                  S1\S#\R4                  4S2 jr"S/\R4                  S0\R4                  S1\S#\R4                  4S3 jr#S:S(\R4                  S)\S#\\\
4   4S4 jjr$   S<S5\R4                  S6\S)\S,\\R>                     S#\\ \
\R4                     4   4
S7 jjr%S8r&U =r'$ )=ConsistencyDecoderVAE4   a  
The consistency decoder used with DALL-E 3.

Examples:
    ```py
    >>> import torch
    >>> from diffusers import StableDiffusionPipeline, ConsistencyDecoderVAE

    >>> vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
    >>> pipe = StableDiffusionPipeline.from_pretrained(
    ...     "stable-diffusion-v1-5/stable-diffusion-v1-5", vae=vae, torch_dtype=torch.float16
    ... ).to("cuda")

    >>> image = pipe("horse", generator=torch.manual_seed(0)).images[0]
    >>> image
    ```
Fscaling_factorlatent_channelssample_sizeencoder_act_fnencoder_block_out_channels.encoder_double_zencoder_down_block_typesencoder_in_channelsencoder_layers_per_blockencoder_norm_num_groupsencoder_out_channelsdecoder_add_attentiondecoder_block_out_channelsdecoder_down_block_typesdecoder_downsample_paddingdecoder_in_channelsdecoder_layers_per_blockdecoder_norm_epsdecoder_norm_num_groupsdecoder_num_train_timestepsdecoder_out_channelsdecoder_resnet_time_scale_shiftdecoder_time_embedding_typedecoder_up_block_typesc                 h  > [         TU ]  5         [        UUUUUU	U
US9U l        [	        UUUUUUUUUUUUUS9U l        [        5       U l        U R                  US9  U R                  SS9  U R                  S[        R                  " / SQ5      S S S 2S S 4   SS9  U R                  S	[        R                  " / S
Q5      S S S 2S S 4   SS9  [        R                  " SU-  SU-  S5      U l        SU l        SU l        U R"                  R$                  U l        [)        U R"                  R$                  [*        [,        45      (       a  U R"                  R$                  S   OU R"                  R$                  n[/        US[1        U R"                  R2                  5      S-
  -  -  5      U l        SU l        g )N)act_fnblock_out_channelsdouble_zdown_block_typesin_channelslayers_per_blocknorm_num_groupsout_channels)add_attentionrG   rI   downsample_paddingrJ   rK   norm_epsrL   num_train_timestepsrM   resnet_time_scale_shifttime_embedding_typeup_block_types)rG   F)force_upcastmeans)gg:?gyD?glL?gN3^)
persistentstds)g4?gn=?gr	^?gr` ?r   r   r   g      ?)super__init__r   encoderr   decoder_unetr   decoder_schedulerr   register_buffertorchtensorr   Conv2d
quant_convuse_slicing
use_tilingconfigr/   tile_sample_min_size
isinstancelisttupleintlenrG   tile_latent_min_sizetile_overlap_factor)selfr-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   	__class__s                            r)   rZ   ConsistencyDecoderVAE.__init__I   s   V 	!9%5+53-	
 (/959+5%3 ;-$C ;1
 "=!>3MNU3LLIJ4QRTXZ^K^_ 	 	

 	ELL!OPQUWXZ^`dQderw 	 	
 ))A$7_9LaP  %)KK$;$;! $++11D%=AA KK##A&(( 	
 %(qSA_A_=`cd=d7e(f$g!#' r(   rd   c                     Xl         g)z
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
N)rd   )rn   rd   s     r)   enable_tiling#ConsistencyDecoderVAE.enable_tiling   s	     %r(   c                 &    U R                  S5        g)z
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
decoding in one step.
FN)rr   rn   s    r)   disable_tiling$ConsistencyDecoderVAE.disable_tiling   s    
 	5!r(   c                     SU l         g)z
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
TNrc   ru   s    r)   enable_slicing$ConsistencyDecoderVAE.enable_slicing   s    
  r(   c                     SU l         g)z
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
decoding in one step.
FNry   ru   s    r)   disable_slicing%ConsistencyDecoderVAE.disable_slicing   s    
 !r(   returnc                    ^ 0 nS[         S[        R                  R                  S[        [         [
        4   4U4S jjmU R                  5        H  u  p#T" X#U5        M     U$ )z
Returns:
    `dict` of attention processors: A dictionary containing all attention processors used in the model with
    indexed by its weight name.
namemodule
processorsc                    > [        US5      (       a  UR                  5       X  S3'   UR                  5        H  u  p4T" U  SU 3XB5        M     U$ )Nget_processor
.processor.)hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorss        r)   r   JConsistencyDecoderVAE.attn_processors.<locals>.fn_recursive_add_processors   sZ    v//282F2F2H
V:./#)#8#8#:+tfAhZ,@%T $; r(   )strr_   r   Moduler   r   r   )rn   r   r   r   r   s       @r)   attn_processors%ConsistencyDecoderVAE.attn_processors   sb     
	c 	588?? 	X\]`bt]tXu 	 !//1LD'jA 2 r(   	processorc           	      d  ^ [        U R                  R                  5       5      n[        U[        5      (       a-  [        U5      U:w  a  [        S[        U5       SU SU S35      eS[        S[        R                  R                  4U4S jjmU R                  5        H  u  p4T" X4U5        M     g)	a  
Sets the attention processor to use to compute attention.

Parameters:
    processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
        The instantiated processor class or a dictionary of processor classes that will be set as the processor
        for **all** `Attention` layers.

        If `processor` is a dict, the key needs to define the path to the corresponding cross attention
        processor. This is strongly recommended when setting trainable attention processors.

z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                 
  > [        US5      (       aJ  [        U[        5      (       d  UR                  U5        O#UR                  UR	                  U  S35      5        UR                  5        H  u  p4T" U  SU 3XB5        M     g )Nset_processorr   r   )r   rg   dictr   popr   )r   r   r   r   r   fn_recursive_attn_processors        r)   r   MConsistencyDecoderVAE.set_attn_processor.<locals>.fn_recursive_attn_processor   ss    v//!)T22((3(($z7J)KL#)#8#8#:+tfAhZ,@%S $;r(   N)rk   r   keysrg   r   
ValueErrorr   r_   r   r   r   )rn   r   countr   r   r   s        @r)   set_attn_processor(ConsistencyDecoderVAE.set_attn_processor   s     D((--/0i&&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1LD'i@ 2r(   c           	      ~   [        S U R                  R                  5        5       5      (       a  [        5       nOr[        S U R                  R                  5        5       5      (       a  [	        5       nO8[        S[        [        U R                  R                  5       5      5       35      eU R                  U5        g)zU
Disables custom attention processors and sets the default attention implementation.
c              3   F   #    U  H  oR                   [        ;   v   M     g 7fN)ro   r   .0procs     r)   	<genexpr>CConsistencyDecoderVAE.set_default_attn_processor.<locals>.<genexpr>  s     iKh4~~!>>Kh   !c              3   F   #    U  H  oR                   [        ;   v   M     g 7fr   )ro   r   r   s     r)   r   r     s     hJg$#==Jgr   zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr   valuesr   r   r   nextiterr   )rn   r   s     r)   set_default_attn_processor0ConsistencyDecoderVAE.set_default_attn_processor  s     i4K_K_KfKfKhiii,.Ih$J^J^JeJeJghhh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r(   xreturn_dictc                    U R                   (       aI  UR                  S   U R                  :  d  UR                  S   U R                  :  a  U R                  XS9$ U R                  (       aY  UR                  S   S:  aF  UR                  S5       Vs/ s H  o0R                  U5      PM     nn[        R                  " U5      nOU R                  U5      nU R                  U5      n[        U5      nU(       d  U4$ [        US9$ s  snf )a  
Encode a batch of images into latents.

Args:
    x (`torch.Tensor`): Input batch of images.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
        instead of a plain tuple.

Returns:
        The latent representations of the encoded images. If `return_dict` is True, a
        [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] is returned, otherwise a
        plain `tuple` is returned.
)r   r   r   r   )rd   shaperf   tiled_encoderc   splitr[   r_   catrb   r   r   )rn   r   r   x_sliceencoded_sliceshmoments	posteriors           r)   encodeConsistencyDecoderVAE.encode  s    $ ??d.G.G G177SU;Y]YrYrKr$$Q$@@
QCD771:N:ll73:NN		.)AQA//!$09	<*yAA Os   Dz	generatornum_inference_stepsc                    XR                   R                  -  U R                  -
  U R                  -  nS[	        U R                   R
                  5      S-
  -  n[        R                  " USUS9nUR                  u  pgpU R                  R                  X@R                  S9  U R                  R                  [        USX4X!R                  UR                  S9-  n
U R                  R                   H  n[         R"                  " U R                  R%                  X5      U/SS9nU R'                  X5      R(                  S	S	2S	S2S	S	2S	S	24   nU R                  R+                  XX5      R,                  nUn
M     U
nU(       d  U4$ [/        US
9$ )a  
Decodes the input latent vector `z` using the consistency decoder VAE model.

Args:
    z (torch.Tensor): The input latent vector.
    generator (Optional[torch.Generator]): The random number generator. Default is None.
    return_dict (bool): Whether to return the output as a dictionary. Default is True.
    num_inference_steps (int): The number of inference steps. Default is 2.

Returns:
    Union[DecoderOutput, Tuple[torch.Tensor]]: The decoded output.

r   r   nearest)modescale_factor)devicer	   )r   dtyper   dimNsample)re   r-   rV   rX   rk   rG   Finterpolater   r]   set_timestepsr   init_noise_sigmar   r   	timestepsr_   concatscale_model_inputr\   r   stepprev_sampler   )rn   r   r   r   r   r   
batch_size_heightwidthx_ttmodel_inputmodel_outputr   x_0s                   r)   decodeConsistencyDecoderVAE.decode:  s_   * +++djj8DIIES!?!?@1DEMM!),G'(ww$
v,,-@,U$$55F*iwwWXW_W_9
 
 ''11A,,(>(>(P(PQT(XZ['\bcdK,,[<CCArr1aKPL0055lsVbbKC	 2 6MC((r(   abblend_extentc                     [        UR                  S   UR                  S   U5      n[        U5       H@  nUS S 2S S 2U* U-   S S 24   SXC-  -
  -  US S 2S S 2US S 24   XC-  -  -   US S 2S S 2US S 24'   MB     U$ )Nr   r   minr   range)rn   r   r   r   ys        r)   blend_vConsistencyDecoderVAE.blend_vj  s    1771:qwwqz<@|$Aa\MA$5q89QAQ=QRUVWXZ[]^`aWaUbfgfvUwwAaAqjM %r(   c                     [        UR                  S   UR                  S   U5      n[        U5       H@  nUS S 2S S 2S S 2U* U-   4   SXC-  -
  -  US S 2S S 2S S 2U4   XC-  -  -   US S 2S S 2S S 2U4'   MB     U$ )Nr	   r   r   )rn   r   r   r   r   s        r)   blend_hConsistencyDecoderVAE.blend_hq  s    1771:qwwqz<@|$AaA}q'889QAQ=QRUVWXZ[]^`aWaUbfgfvUwwAaAqjM %r(   c           
         [        U R                  SU R                  -
  -  5      n[        U R                  U R                  -  5      nU R                  U-
  n/ n[	        SUR
                  S   U5       H  n/ n[	        SUR
                  S   U5       H_  n	USS2SS2XwU R                  -   2XU R                  -   24   n
U R                  U
5      n
U R                  U
5      n
UR                  U
5        Ma     UR                  U5        M     / n[        U5       H  u  px/ n[        U5       Hb  u  pUS:  a  U R                  XgS-
     U	   X5      n
U	S:  a  U R                  XS-
     X5      n
UR                  U
SS2SS2SU2SU24   5        Md     UR                  [        R                  " USS95        M     [        R                  " USS9n[        U5      nU(       d  U4$ [        US9$ )a*  Encode a batch of images using a tiled encoder.

When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
output, but they should be much less noticeable.

Args:
    x (`torch.Tensor`): Input batch of images.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
        instead of a plain tuple.

Returns:
    [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`] or `tuple`:
        If return_dict is True, a [`~models.autoencoders.consistency_decoder_vae.ConsistencyDecoderVAEOutput`]
        is returned, otherwise a plain `tuple` is returned.
r   r   r   r	   Nr   r   )rj   rf   rm   rl   r   r   r[   rb   append	enumerater   r   r_   r   r   r   )rn   r   r   overlap_sizer   	row_limitrowsirowjtileresult_rows
result_rowr   r   s                  r)   r   "ConsistencyDecoderVAE.tiled_encodew  s   ( 444D<T<T8TUV444t7O7OOP--<	 q!''!*l3AC1aggaj,7Aqt'@'@#@@!$JcJcFcBccd||D)t,

4 	 8
 KK 4 oFAJ$S> q5<<UAKDq5<<E
DGD!!$q!ZiZ)'C"DE * uyy;< & ))KQ/09	<*yAAr(   r   sample_posteriorc                     UnU R                  U5      R                  nU(       a  UR                  US9nOUR                  5       nU R	                  XtS9R                  nU(       d  U4$ [        US9$ )a'  
Args:
    sample (`torch.Tensor`): Input sample.
    sample_posterior (`bool`, *optional*, defaults to `False`):
        Whether to sample from the posterior.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
    generator (`torch.Generator`, *optional*, defaults to `None`):
        Generator to use for sampling.

Returns:
    [`DecoderOutput`] or `tuple`:
        If return_dict is True, a [`DecoderOutput`] is returned, otherwise a plain `tuple` is returned.
)r   r   )r   r   r   r   r   r   )	rn   r   r   r   r   r   r   r   decs	            r)   forwardConsistencyDecoderVAE.forward  si    * KKN..	  9 5A Akk!k1886MC((r(   )	r]   r\   r[   rb   rl   rm   rf   rc   rd   )g{P?       silu)         r  T)DownEncoderBlock2Dr  r  r  r	   r   r  r   F)i@  i     r  )ResnetDownsampleBlock2Dr  r  r  r      r	   gh㈵>r  r     scale_shiftlearned)ResnetUpsampleBlock2Dr  r  r  )T)NTr   )FTN)(r!   r"   r#   r$   r%   _supports_group_offloadingr   floatrj   r   r   boolrZ   rr   rv   rz   r}   propertyr   r   r   r   r   r   r   r_   Tensorr   r   r   r   	Generatorr   r   r   r   r   r   r'   __classcell__)ro   s   @r)   r+   r+   4   s   $ "' !( $6J!%5
 $%()')$%&+6L5
 +,#$()"'')+/$%/<+43
G^(^( ^( 	^(
 ^( %*#s(O^( ^( #(S/^( !^( #&^(  "%!^(" "#^($  $%^(& %*#s(O'^(( #(S/)^(4 %(5^(6 !7^(8 #&9^(:  ;^(< "%=^(> &)?^(@ "A^(B *-C^(D &)E^(F !&c3hG^( ^(B% %" ! c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF+ 37 B B,0 B	*E2N,OO	P B  BD  04 #$,)<<,) EOO,,) 	,)
 !,) 
}eELL11	2,) ,)^ %,, c ell  %,, c ell 5Bell 5B 5BOjlqOqIr 5Bt "' /3 ) )  ) 	 )
 EOO, ) 
}eELL11	2 )  )r(   r+   )'dataclassesr   typingr   r   r   r   r_   torch.nn.functionalr   
functionalr   configuration_utilsr
   r   
schedulersr   utilsr   utils.accelerate_utilsr   utils.torch_utilsr   attention_processorr   r   r   r   r   modeling_utilsr   unets.unet_2dr   vaer   r   r   r   r+   r    r(   r)   <module>r"     sn    " / /     B 5  8 -  ( ' E E 
0* 
0 
0Z)J Z)r(   