
    +hM                        S SK JrJrJrJr  S SKrS SKJr  S SKJs  J	r
  S SKrSSKJrJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  \R<                  " \5      r Sr! " S S\RD                  5      r# " S S\RH                  5      r% " S S\RL                  5      r' " S S\RH                  5      r( " S S\RH                  5      r) " S S\RH                  5      r* " S S\RH                  5      r+ " S S\RH                  5      r, " S S\RH                  5      r- " S  S!\RH                  5      r. " S" S#\\\5      r/g)$    )ListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixin)logging)apply_forward_hook   )get_activation)AutoencoderKLOutput)
ModelMixin   )DecoderOutputDiagonalGaussianDistributionc                      ^  \ rS rSrSr  SS\S\S\\\\\\4   4   S\\\\\\4   4   S\\\\\\4   4   SS	4U 4S
 jjjrSU 4S jjr	Sr
U =r$ )QwenImageCausalConv3d+   aM  
A custom 3D causal convolution layer with feature caching support.

This layer extends the standard Conv3D layer by ensuring causality in the time dimension and handling feature
caching for efficient inference.

Args:
    in_channels (int): Number of channels in the input image
    out_channels (int): Number of channels produced by the convolution
    kernel_size (int or tuple): Size of the convolving kernel
    stride (int or tuple, optional): Stride of the convolution. Default: 1
    padding (int or tuple, optional): Zero-padding added to all three sides of the input. Default: 0
in_channelsout_channelskernel_sizestridepaddingreturnNc                    > [         TU ]  UUUUUS9  U R                  S   U R                  S   U R                  S   U R                  S   SU R                  S   -  S4U l        SU l        g )N)r   r   r   r   r   r   r   r   r   r   r   )super__init__r   _padding)selfr   r   r   r   r   	__class__s         p/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/autoencoders/autoencoder_kl_qwenimage.pyr    QwenImageCausalConv3d.__init__:   sx     	#%# 	 	
 a$,,q/4<<?DLLYZO]^aeamamnoap]prst     c                 :  > [        U R                  5      nUb^  U R                  S   S:  aK  UR                  UR                  5      n[        R
                  " X!/SS9nUS==   UR                  S   -  ss'   [        R                  " X5      n[        TU ])  U5      $ )N   r   r   dim)listr!   todevicetorchcatshapeFpadr   forward)r"   xcache_xr   r#   s       r$   r3   QwenImageCausalConv3d.forwardN   s    t}}%4==#3a#7jj*G		7,A.AAJ'--**JEE!wq!!r&   )r!   r   )r   r   N)__name__
__module____qualname____firstlineno____doc__intr   r   r    r3   __static_attributes____classcell__r#   s   @r$   r   r   +   s    & 4545!! ! 3c3m 445	!
 c5c3//0! sE#sC-001! 
! !(" "r&   r   c                   N   ^  \ rS rSrSrSS\S\S\S\SS4
U 4S	 jjjrS
 rSr	U =r
$ )QwenImageRMS_normX   a  
A custom RMS normalization layer.

Args:
    dim (int): The number of dimensions to normalize over.
    channel_first (bool, optional): Whether the input tensor has channels as the first dimension.
        Default is True.
    images (bool, optional): Whether the input represents image data. Default is True.
    bias (bool, optional): Whether to include a learnable bias term. Default is False.
r*   channel_firstimagesbiasr   Nc                 R  > [         TU ]  5         U(       d  SOSnU(       a  U/UQ7OU4nX l        US-  U l        [        R
                  " [        R                  " U5      5      U l        U(       a0  [        R
                  " [        R                  " U5      5      U l
        g SU l
        g )N)r   r   r   )r   r   g      ?        )r   r    rD   scalenn	Parameterr.   onesgammazerosrF   )r"   r*   rD   rE   rF   broadcastable_dimsr0   r#   s          r$   r    QwenImageRMS_norm.__init__d   sw    .4Y&.;*)*#*#X
\\%**U"34
8<BLLU!34	#	r&   c                     [         R                  " XR                  (       a  SOSS9U R                  -  U R                  -  U R
                  -   $ )Nr   r)   )r1   	normalizerD   rI   rM   rF   )r"   r4   s     r$   r3   QwenImageRMS_norm.forwardn   s<    {{1(:(:1DtzzQTXT^T^^aeajajjjr&   )rF   rD   rM   rI   TTF)r8   r9   r:   r;   r<   r=   boolr    r3   r>   r?   r@   s   @r$   rB   rB   X   sJ    	FC F FT FX\ Fim F Fk kr&   rB   c                   ,   ^  \ rS rSrSrU 4S jrSrU =r$ )QwenImageUpsampler   z
Perform upsampling while ensuring the output tensor has the same data type as the input.

Args:
    x (torch.Tensor): Input tensor to be upsampled.

Returns:
    torch.Tensor: Upsampled tensor with the same data type as the input.
c                 \   > [         TU ]  UR                  5       5      R                  U5      $ r7   )r   r3   floattype_as)r"   r4   r#   s     r$   r3   QwenImageUpsample.forward}   s#    wqwwy)11!44r&    )r8   r9   r:   r;   r<   r3   r>   r?   r@   s   @r$   rX   rX   r   s    5 5r&   rX   c                   L   ^  \ rS rSrSrS\S\SS4U 4S jjrSS/4S	 jrS
r	U =r
$ )QwenImageResample   aP  
A custom resampling module for 2D and 3D data.

Args:
    dim (int): The number of input/output channels.
    mode (str): The resampling mode. Must be one of:
        - 'none': No resampling (identity operation).
        - 'upsample2d': 2D upsampling with nearest-exact interpolation and convolution.
        - 'upsample3d': 3D upsampling with nearest-exact interpolation, convolution, and causal 3D convolution.
        - 'downsample2d': 2D downsampling with zero-padding and convolution.
        - 'downsample3d': 3D downsampling with zero-padding, convolution, and causal 3D convolution.
r*   moder   Nc           
        > [         TU ]  5         Xl        X l        US:X  a<  [        R
                  " [        SSS9[        R                  " XS-  SSS95      U l        g US	:X  aO  [        R
                  " [        SSS9[        R                  " XS-  SSS95      U l        [        XS-  S
SS9U l
        g US:X  aE  [        R
                  " [        R                  " S5      [        R                  " XSSS95      U l        g US:X  aV  [        R
                  " [        R                  " S5      [        R                  " XSSS95      U l        [        XS
SSS9U l
        g [        R                  " 5       U l        g )N
upsample2d)       @re   znearest-exact)scale_factorrb   r   r   r   r   
upsample3d)r   r   r   )r   r   r   downsample2d)r   r   r   r   )r   r   )r   downsample3d)r   r   r   r   )r   r   )r   r    r*   rb   rJ   
SequentialrX   Conv2dresampler   	time_conv	ZeroPad2dIdentity)r"   r*   rb   r#   s      r$   r    QwenImageResample.__init__   s'   	 <MM!zP		#axA6DM \!MM!zP		#axA6DM 33aT]^DN^#MM",,|*DbiiPSZ[djFklDM^#MM",,|*DbiiPSZ[djFklDM23YybklDN KKMDMr&   r   c                 l   UR                  5       u  pEpgnU R                  S:X  Ga  UGb  US   n	X)   c  SX)'   US==   S-  ss'   GOUS S 2S S 2[        * S 2S S 2S S 24   R                  5       n
U
R                  S   S:  a^  X)   bY  X)   S:w  aQ  [
        R                  " X)   S S 2S S 2SS S 2S S 24   R                  S5      R                  U
R                  5      U
/SS9n
U
R                  S   S:  aQ  X)   bL  X)   S:X  aD  [
        R                  " [
        R                  " U
5      R                  U
R                  5      U
/SS9n
X)   S:X  a  U R                  U5      nOU R                  XU	   5      nXU	'   US==   S-  ss'   UR                  USXVXx5      n[
        R                  " US S 2SS S 2S S 2S S 2S S 24   US S 2SS S 2S S 2S S 2S S 24   4S5      nUR                  XEUS-  Xx5      nUR                  S   nUR                  SSSSS	5      R                  XF-  XWU5      nU R                  U5      nUR!                  XFUR                  S5      UR                  S5      UR                  S5      5      R                  SSSSS	5      nU R                  S
:X  a  Ub  US   n	X)   c!  UR                  5       X)'   US==   S-  ss'   U$ US S 2S S 2SS 2S S 2S S 24   R                  5       n
U R                  [
        R                  " X)   S S 2S S 2SS 2S S 2S S 24   U/S5      5      nXU	'   US==   S-  ss'   U$ )Nrh   r   Repr   r   rR   r)   r   r(   rj   )sizerb   CACHE_Tcloner0   r.   r/   	unsqueezer,   r-   
zeros_likern   reshapestackpermuterm   view)r"   r4   
feat_cachefeat_idxbcthwidxr5   s              r$   r3   QwenImageResample.forward   s   aA99$%qk?*&+JOQK1$K1whiA 56<<>G}}Q'!+
0KPZP_chPh"'))'_Q2q!^<FFqILLW^^\^eflm# }}Q'!+
0KPZP_chPh"'))U-=-=g-F-I-I'..-Y[b,cij"k!%/ NN1- NN1o>&-sOQK1$K		!Qa3AQq!Q1a'7%8!Aq!Q1<L:M$NPQRA		!Aq4AGGAJIIaAq!$,,QUA!<MM!FF1AFF1Iqvvay9AA!Q1aP99&%qk?*&'ggiJOQK1$K 	  1bc1a0668Guyy*/!QQPQ/2RTU1VXY'Z[A&-sOQK1$Kr&   )r*   rb   rm   rn   )r8   r9   r:   r;   r<   r=   strr    r3   r>   r?   r@   s   @r$   r`   r`      s5    *C *s *t *6 %)A3 + +r&   r`   c                   \   ^  \ rS rSrSr  SS\S\S\S\SS4
U 4S	 jjjrSS
/4S jr	Sr
U =r$ )QwenImageResidualBlock   a)  
A custom residual block module.

Args:
    in_dim (int): Number of input channels.
    out_dim (int): Number of output channels.
    dropout (float, optional): Dropout rate for the dropout layer. Default is 0.0.
    non_linearity (str, optional): Type of non-linearity to use. Default is "silu".
in_dimout_dimdropoutnon_linearityr   Nc                 n  > [         TU ]  5         Xl        X l        [	        U5      U l        [        USS9U l        [        XSSS9U l	        [        USS9U l
        [        R                  " U5      U l        [        X"SSS9U l        X:w  a  [        XS5      U l        g [        R                  " 5       U l        g )NFrE   r   r   rg   )r   r    r   r   r   nonlinearityrB   norm1r   conv1norm2rJ   Dropoutr   conv2rp   conv_shortcut)r"   r   r   r   r   r#   s        r$   r    QwenImageResidualBlock.__init__   s     	*=9 've<
*6AqI
&wu=
zz'**7QJ
JPJ[26AFacalalanr&   r   c           	         U R                  U5      nU R                  U5      nU R                  U5      nUb  US   nUS S 2S S 2[        * S 2S S 2S S 24   R	                  5       nUR
                  S   S:  aV  X%   bQ  [        R                  " X%   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXbU'   US==   S-  ss'   OU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nUb  US   nUS S 2S S 2[        * S 2S S 2S S 24   R	                  5       nUR
                  S   S:  aV  X%   bQ  [        R                  " X%   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXbU'   US==   S-  ss'   X-   $ U R                  U5      nX-   $ Nr   r   rR   r)   r   )r   r   r   ru   rv   r0   r.   r/   rw   r,   r-   r   r   r   r   )r"   r4   r}   r~   r   r   r5   s          r$   r3   QwenImageResidualBlock.forward   s   q! JJqMa !1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuv

1o.A%sOQK1K

1A JJqMa  LLO!1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuv

1o.A%sOQK1K
 u 

1A ur&   )	r   r   r   r   r   r   r   r   r   )rH   silur8   r9   r:   r;   r<   r=   r[   r   r    r3   r>   r?   r@   s   @r$   r   r      sd     #oo o 	o
 o 
o o( %)A3 ( (r&   r   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )QwenImageAttentionBlocki"  zm
Causal self-attention with a single head.

Args:
    dim (int): The number of channels in the input tensor.
c                    > [         TU ]  5         Xl        [        U5      U l        [
        R                  " XS-  S5      U l        [
        R                  " XS5      U l        g )Nr   r   )	r   r    r*   rB   normrJ   rl   to_qkvproj)r"   r*   r#   s     r$   r     QwenImageAttentionBlock.__init__*  sI     &c*	ii1Wa0IIc*	r&   c                 p   UnUR                  5       u  p4pVnUR                  SSSSS5      R                  X5-  XFU5      nU R                  U5      nU R	                  U5      nUR                  X5-  SUS-  S5      nUR                  SSSS5      R                  5       nUR                  SSS9u  pn[        R                  " XU5      nUR                  S5      R                  SSS5      R                  X5-  XFU5      nU R                  U5      nUR                  X5XFU5      nUR                  SSSSS5      nX-   $ )Nr   r   r   r   r(   rR   r)   )rt   r{   ry   r   r   
contiguouschunkr1   scaled_dot_product_attentionsqueezer   r|   )r"   r4   identity
batch_sizechannelstimeheightwidthqkvqkvs               r$   r3   QwenImageAttentionBlock.forward3  s0   45FFH1
dEIIaAq!$,,Z->RWXIIaL kk!nkk*+Q1bAkk!Q1%002))A2)&a **13IIaL  Aq)11*2CXW\] IIaL FF:Xu=IIaAq!$|r&   )r*   r   r   r   )	r8   r9   r:   r;   r<   r    r3   r>   r?   r@   s   @r$   r   r   "  s    + r&   r   c            	       T   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrSS	/4S
 jr	Sr
U =r$ )QwenImageMidBlockiO  z
Middle block for QwenImageVAE encoder and decoder.

Args:
    dim (int): Number of input/output channels.
    dropout (float): Dropout rate.
    non_linearity (str): Type of non-linearity to use.
r*   r   r   
num_layersc           	      T  > [         TU ]  5         Xl        [        XX#5      /n/ n[	        U5       H8  nUR                  [        U5      5        UR                  [        XX#5      5        M:     [        R                  " U5      U l	        [        R                  " U5      U l
        SU l        g )NF)r   r    r*   r   rangeappendr   rJ   
ModuleList
attentionsresnetsgradient_checkpointing)	r"   r*   r   r   r   r   r   _r#   s	           r$   r    QwenImageMidBlock.__init__Y  s     *#GKL
z"A5c:;NN1#GST # --
3}}W-&+#r&   Nr   c                     U R                   S   " XU5      n[        U R                  U R                   SS  5       H  u  pEUb  U" U5      nU" XU5      nM     U$ Nr   r   )r   zipr   )r"   r4   r}   r~   attnresnets         r$   r3   QwenImageMidBlock.forwardh  s]    LLOA84  ab1ABLDGqh/A	 C r&   )r   r*   r   r   )rH   r   r   r   r@   s   @r$   r   r   O  sB    ,C ,% ,c ,`c , , %)A3  r&   r   c                   `   ^  \ rS rSrSrSS/ SQS/ / SQSS	4S
\4U 4S jjjrSS/4S jrSrU =r	$ )QwenImageEncoder3div  aC  
A 3D encoder module.

Args:
    dim (int): The base number of channels in the first layer.
    z_dim (int): The dimensionality of the latent space.
    dim_mult (list of int): Multipliers for the number of channels in each block.
    num_res_blocks (int): Number of residual blocks in each block.
    attn_scales (list of float): Scales at which to apply attention mechanisms.
    temperal_downsample (list of bool): Whether to downsample temporally in each block.
    dropout (float): Dropout rate for the dropout layers.
    non_linearity (str): Type of non-linearity to use.
   r(   r   r   r(   r(   r   rU   rH   r   r   c	           	      6  > [         TU ]  5         Xl        X l        X0l        X@l        XPl        X`l        [        U5      U l	        S/U-    V	s/ s H  oU	-  PM	     n
n	Sn[        SU
S   SSS9U l        [        R                  " / 5      U l        [        [!        U
S S U
SS  5      5       H  u  nu  p[#        U5       HS  nU R                  R%                  ['        XU5      5        X;   a$  U R                  R%                  [)        U5      5        UnMU     U[+        U5      S-
  :w  d  M~  Xl   (       a  SOSnU R                  R%                  [-        UUS	95        US
-  nM     [/        WXxSS9U l        [3        USS9U l        [        XSSS9U l        SU l        g s  sn	f )Nr         ?r   r   rg   rR   rj   ri   rb   re   r   Fr   )r   r    r*   z_dimdim_multnum_res_blocksattn_scalestemperal_downsampler   r   r   conv_inrJ   r   down_blocks	enumerater   r   r   r   r   lenr`   r   	mid_blockrB   norm_outconv_outr   )r"   r*   r   r   r   r   r   r   r   udimsrI   ir   r   r   rb   r#   s                    r$   r    QwenImageEncoder3d.__init__  s    	
 ,&#6 *=9 #$x0Aa0 -QQAF ==,$-c$s)T!"X.F$G A >*  ''(>vPW(XY'$$++,CG,LM 	 + CMA%%)<)?~^  ''(9'(MN %H +7GWXY *'%@-gaK&+#; 1s   FNr   c           	         Ub  US   nUS S 2S S 2[         * S 2S S 2S S 24   R                  5       nUR                  S   S:  aV  X$   bQ  [        R                  " X$   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXRU'   US==   S-  ss'   OU R                  U5      nU R                   H  nUb  U" XU5      nM  U" U5      nM     U R                  XU5      nU R                  U5      nU R                  U5      nUb  US   nUS S 2S S 2[         * S 2S S 2S S 24   R                  5       nUR                  S   S:  aV  X$   bQ  [        R                  " X$   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXRU'   US==   S-  ss'   U$ U R                  U5      nU$ r   )ru   rv   r0   r.   r/   rw   r,   r-   r   r   r   r   r   r   )r"   r4   r}   r~   r   r5   layers          r$   r3   QwenImageEncoder3d.forward  s   !1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuvQ30A%sOQK1KQA %%E%!2!H	 & NN1(3 MM!a !1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuvaC1A%sOQK1K  a Ar&   )r   r   r   r*   r   r   r   r   r   r   r   r   r   
r8   r9   r:   r;   r<   r   r    r3   r>   r?   r@   s   @r$   r   r   v  sJ      /#2, 2, 2,h %)A3 % %r&   r   c                   h   ^  \ rS rSrSr   SS\S\S\S\S\\   S	\4U 4S
 jjjr	SS/4S jr
SrU =r$ )QwenImageUpBlocki  an  
A block that handles upsampling for the QwenImageVAE decoder.

Args:
    in_dim (int): Input dimension
    out_dim (int): Output dimension
    num_res_blocks (int): Number of residual blocks
    dropout (float): Dropout rate
    upsample_mode (str, optional): Mode for upsampling ('upsample2d' or 'upsample3d')
    non_linearity (str): Type of non-linearity to use
Nr   r   r   r   upsample_moder   c           	      D  > [         T
U ]  5         Xl        X l        / nUn[	        US-   5       H   n	UR                  [        XXF5      5        UnM"     [        R                  " U5      U l	        S U l
        Ub#  [        R                  " [        X%S9/5      U l
        SU l        g )Nr   r   F)r   r    r   r   r   r   r   rJ   r   r   
upsamplersr`   r   )r"   r   r   r   r   r   r   r   current_dimr   r#   s             r$   r    QwenImageUpBlock.__init__  s     	 ~)*ANN1+_`!K + }}W- $ mm->w-[,\]DO&+#r&   r   c                     U R                    H  nUb  U" XU5      nM  U" U5      nM     U R                  b0  Ub  U R                  S   " XU5      nU$ U R                  S   " U5      nU$ )a	  
Forward pass through the upsampling block.

Args:
    x (torch.Tensor): Input tensor
    feat_cache (list, optional): Feature cache for causal convolutions
    feat_idx (list, optional): Feature index for cache management

Returns:
    torch.Tensor: Output tensor
r   )r   r   )r"   r4   r}   r~   r   s        r$   r3   QwenImageUpBlock.forward  su     llF%1(31I	 # ??&%OOA&qh?  OOA&q)r&   )r   r   r   r   r   )rH   Nr   )r8   r9   r:   r;   r<   r=   r[   r   r   r    r3   r>   r?   r@   s   @r$   r   r     sn    
" '+#,, , 	,
 ,  }, , ,< %)A3  r&   r   c                   `   ^  \ rS rSrSrSS/ SQS/ / SQSS	4S
\4U 4S jjjrSS/4S jrSrU =r	$ )QwenImageDecoder3di&  a?  
A 3D decoder module.

Args:
    dim (int): The base number of channels in the first layer.
    z_dim (int): The dimensionality of the latent space.
    dim_mult (list of int): Multipliers for the number of channels in each block.
    num_res_blocks (int): Number of residual blocks in each block.
    attn_scales (list of float): Scales at which to apply attention mechanisms.
    temperal_upsample (list of bool): Whether to upsample temporally in each block.
    dropout (float): Dropout rate for the dropout layers.
    non_linearity (str): Type of non-linearity to use.
r   r(   r   r   FTTrH   r   r   c	           
        > [         TU ]  5         Xl        X l        X0l        X@l        XPl        X`l        [        U5      U l	        US   /US S S2   -    V	s/ s H  oU	-  PM	     n
n	SS[        U5      S-
  -  -  n[        X*S   SSS9U l        [        U
S   XxSS9U l        [        R                   " / 5      U l        [%        ['        U
S S U
SS  5      5       Hg  u  nu  pUS:  a  US-  nS nU[        U5      S-
  :w  a  Xl   (       a  S	OS
n[)        UUUUUUS9nU R"                  R+                  U5        Uc  Mb  US-  nMi     [-        WSS9U l        [        USSSS9U l        SU l        g s  sn	f )NrR   r   r   r   r   r   rg   r   rh   rd   )r   r   r   r   r   r   re   Fr   )r   r    r*   r   r   r   r   temperal_upsampler   r   r   r   r   r   r   rJ   r   	up_blocksr   r   r   r   rB   r   r   r   )r"   r*   r   r   r   r   r   r   r   r   r   rI   r   r   r   r   up_blockr#   s                    r$   r    QwenImageDecoder3d.__init__5  s    	
 ,&!2*=9 #+2,(4R4.!@A!@Aa!@AaCMA-.. -UGQJ +47GWXY r*$-c$s)T!"X.F$G A 1u1 !MCMA%%0A0D, (-++H NN!!(+ (/ %H4 *'%@-gq!QG&+#Q Bs   E(Nr   c           	         Ub  US   nUS S 2S S 2[         * S 2S S 2S S 24   R                  5       nUR                  S   S:  aV  X$   bQ  [        R                  " X$   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXRU'   US==   S-  ss'   OU R                  U5      nU R                  XU5      nU R                   H  nU" XU5      nM     U R                  U5      nU R                  U5      nUb  US   nUS S 2S S 2[         * S 2S S 2S S 24   R                  5       nUR                  S   S:  aV  X$   bQ  [        R                  " X$   S S 2S S 2SS S 2S S 24   R                  S5      R                  UR                  5      U/SS9nU R                  XU   5      nXRU'   US==   S-  ss'   U$ U R                  U5      nU$ r   )ru   rv   r0   r.   r/   rw   r,   r-   r   r   r   r   r   r   )r"   r4   r}   r~   r   r5   r   s          r$   r3   QwenImageDecoder3d.forwardu  s   !1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuvQ30A%sOQK1KQA NN1(3 H1A ' MM!a !1+C1whiA-.446G}}Q!#
(C))Z_Q2q!^%D%N%Nq%Q%T%TU\UcUc%dfm$ntuvaC1A%sOQK1K  a Ar&   )r   r   r   r*   r   r   r   r   r   r   r   r   r   r   r@   s   @r$   r   r   &  sJ      -#>, >, >,@ %)A3 # #r&   r   c                     ^  \ rS rSrSrSr\SS/ SQS/ / SQS	/ S
Q/ SQ4	S\S\S\\   S\S\	\
   S\	\   S\
S\	\
   S\	\
   SS4U 4S jjj5       r    S4S\\   S\\   S\\
   S\\
   SS4
S jjrS5S jrS5S jrS5S jrS  rS!\R(                  4S" jr\ S6S!\R(                  S#\S\\\\   4   4S$ jj5       rS6S%\R(                  S#\4S& jjr\S6S%\R(                  S#\S\\\R(                  4   4S' jj5       rS(\R(                  S)\R(                  S*\S\R(                  4S+ jrS(\R(                  S)\R(                  S*\S\R(                  4S, jrS!\R(                  S\4S- jr S6S%\R(                  S#\S\\\R(                  4   4S. jjr!   S7S/\R(                  S0\S#\S1\\RD                     S\\\R(                  4   4
S2 jjr#S3r$U =r%$ )8AutoencoderKLQwenImagei  a  
A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.

This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
for all models (such as downloading or saving).
F`      r   r   r   rH   )gy):gMOg^)gQ?gtVƿgZӼ?gBfjÿgU0*?gL
F%u?gMg&?gz6>׿gF%uȿg[ AcgMJ?gW2ıҿ)g_L@gNё\C?gQ@g?@g9#J{?g|a2U?gHPs@g0* @gJ{/L&
@gJY8@g]C@g(?gK46?gS:?go_Ι@g-?base_dimr   r   r   r   r   r   latents_meanlatents_stdr   Nc
           	        > [         T
U ]  5         X l        X`l        US S S2   U l        [        XS-  X4XPR                  U5      U l        [        US-  US-  S5      U l        [        X"S5      U l	        [        XX4XPR                  U5      U l        S[        U R                  5      -  U l        SU l        SU l        SU l        SU l        SU l        SU l        U R                  b*  [)        S U R                  R+                  5        5       5      OSU R                  b*  [)        S	 U R                  R+                  5        5       5      OSS
.U l        g )NrR   r   r   F      c              3   B   #    U  H  n[        U[        5      v   M     g 7fr7   
isinstancer   .0ms     r$   	<genexpr>2AutoencoderKLQwenImage.__init__.<locals>.<genexpr>       `I_A:a)>??I_   r   c              3   B   #    U  H  n[        U[        5      v   M     g 7fr7   r   r  s     r$   r  r    r  r  )decoderencoder)r   r    r   r   r   r   r  r   
quant_convpost_quant_convr   r
  r   spatial_compression_ratiouse_slicing
use_tilingtile_sample_min_heighttile_sample_min_widthtile_sample_stride_heighttile_sample_stride_widthsummodules_cached_conv_counts)r"   r   r   r   r   r   r   r   r   r   r#   s             r$   r    AutoencoderKLQwenImage.__init__  sB    	
#6 !4TrT!:)ai;H`H`bi
 0	519aH4U1E)X{DZDZ\c
 *+c$2J2J.K)K& !
   '*#%(" *-&(+%
 ||' `I]I]I_``||' `I]I]I_``$
 r&   r  r  r  r  c                     SU l         U=(       d    U R                  U l        U=(       d    U R                  U l        U=(       d    U R                  U l        U=(       d    U R                  U l        g)a  
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.

Args:
    tile_sample_min_height (`int`, *optional*):
        The minimum height required for a sample to be separated into tiles across the height dimension.
    tile_sample_min_width (`int`, *optional*):
        The minimum width required for a sample to be separated into tiles across the width dimension.
    tile_sample_stride_height (`int`, *optional*):
        The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
        no tiling artifacts produced across the height dimension.
    tile_sample_stride_width (`int`, *optional*):
        The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
        artifacts produced across the width dimension.
TN)r  r  r  r  r  )r"   r  r  r  r  s        r$   enable_tiling$AutoencoderKLQwenImage.enable_tiling  sW    0 &<&[@[@[#%:%Xd>X>X")B)ddFdFd&(@(aDDaDa%r&   c                     SU l         g)z
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
decoding in one step.
FN)r  r"   s    r$   disable_tiling%AutoencoderKLQwenImage.disable_tiling  s    
  r&   c                     SU l         g)z
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
TNr  r  s    r$   enable_slicing%AutoencoderKLQwenImage.enable_slicing  s    
  r&   c                     SU l         g)z
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
decoding in one step.
FNr!  r  s    r$   disable_slicing&AutoencoderKLQwenImage.disable_slicing  s    
 !r&   c                     S nU" U R                   5      U l        S/U l        S /U R                  -  U l        U" U R                  5      U l        S/U l        S /U R
                  -  U l        g )Nc                 p    SnU R                  5        H  n[        U[        5      (       d  M  US-  nM!     U$ r   )r  r  r   )modelcountr  s      r$   _count_conv3d9AutoencoderKLQwenImage.clear_cache.<locals>._count_conv3d  s5    E]]_a!677QJE % Lr&   r   )r
  	_conv_num	_conv_idx	_feat_mapr  _enc_conv_num_enc_conv_idx_enc_feat_map)r"   r+  s     r$   clear_cache"AutoencoderKLQwenImage.clear_cache  sd    	 't||4$..0*4<<8S"Vd&8&88r&   r4   c           
         UR                   u    p#pEU R                  (       a/  XPR                  :  d  X@R                  :  a  U R	                  U5      $ U R                  5         SUS-
  S-  -   n[        U5       H  nS/U l        US:X  a9  U R                  US S 2S S 2S S2S S 2S S 24   U R                  U R                  S9nMJ  U R                  US S 2S S 2SSUS-
  -  -   SSU-  -   2S S 2S S 24   U R                  U R                  S9n	[        R                  " WU	/S5      nM     U R                  W5      n
U R                  5         U
$ )Nr   r(   r   r}   r~   r   )r0   r  r  r  tiled_encoder3  r   r1  r  r2  r.   r/   r  )r"   r4   r   	num_framer   r   iter_r   outout_encs              r$   _encodeAutoencoderKLQwenImage._encode$  s:   )*&1??(B(B BfOjOjFj$$Q''Y]q((uA"#DAvll1Q2A2q!^#4ASAS^b^p^plq||aAQUOa!a%i7A=>#11!// $ 
 iidQ/  ooc"
r&   return_dictc                 >   U R                   (       aY  UR                  S   S:  aF  UR                  S5       Vs/ s H  o0R                  U5      PM     nn[        R
                  " U5      nOU R                  U5      n[        U5      nU(       d  U4$ [        US9$ s  snf )a  
Encode a batch of images into latents.

Args:
    x (`torch.Tensor`): Input batch of images.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

Returns:
        The latent representations of the encoded videos. If `return_dict` is True, a
        [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
r   r   )latent_dist)r  r0   splitr=  r.   r/   r   r   )r"   r4   r?  x_sliceencoded_slicesr   	posteriors          r$   encodeAutoencoderKLQwenImage.encode<  s      
QCD771:N:ll73:NN		.)AQA03	<"y99 Os   Bzc           
         UR                   u    p4pVU R                  U R                  -  nU R                  U R                  -  nU R                  (       a  Xh:  d  XW:  a  U R                  XS9$ U R                  5         U R                  U5      n	[        U5       H  n
S/U l	        U
S:X  a;  U R                  U	S S 2S S 2XS-   2S S 2S S 24   U R                  U R                  S9nML  U R                  U	S S 2S S 2XS-   2S S 2S S 24   U R                  U R                  S9n[        R                  " WU/S5      nM     [        R                  " WSSS9nU R                  5         U(       d  U4$ [        US	9$ )
Nr?  r   r   r6  r   g      r   )minmaxsample)r0   r  r  r  r  tiled_decoder3  r  r   r.  r
  r/  r.   r/   clampr   )r"   rH  r?  r   r8  r   r   tile_latent_min_heighttile_latent_min_widthr4   r   r:  r;  s                r$   _decodeAutoencoderKLQwenImage._decodeW  sS   )*&1!%!<!<@^@^!^ $ : :d>\>\ \?? =A`$$Q$@@  #y!ASDNAvll1Q11u9a%:#;aeaoaolp||AaAAIq!&;$<bfbpbp|qiidQ/ " kk#4S16MC((r&   c                 P   U R                   (       ac  UR                  S   S:  aP  UR                  S5       Vs/ s H  o0R                  U5      R                  PM     nn[
        R                  " U5      nOU R                  U5      R                  nU(       d  U4$ [        US9$ s  snf )a  
Decode a batch of images.

Args:
    z (`torch.Tensor`): Input batch of latent vectors.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

Returns:
    [`~models.vae.DecoderOutput`] or `tuple`:
        If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
        returned.
r   r   rM  )r  r0   rB  rS  rN  r.   r/   r   )r"   rH  r?  z_slicedecoded_slicesdecodeds         r$   decodeAutoencoderKLQwenImage.decodep  s     
QJK''RS*U*wll73::*NUii/Gll1o,,G:G,, Vs   $B#ar   blend_extentc           	         [        UR                  S   UR                  S   U5      n[        U5       HI  nUS S 2S S 2S S 2U* U-   S S 24   SXC-  -
  -  US S 2S S 2S S 2US S 24   XC-  -  -   US S 2S S 2S S 2US S 24'   MK     U$ )Nr   rK  r0   r   )r"   r[  r   r\  ys        r$   blend_vAutoencoderKLQwenImage.blend_v  s    1772;\B|$A Aq<-!*;Q!>?1qGWCWX[\]^`acdfgij]j[k \  AaAq!m % r&   c                    [        UR                  S   UR                  S   U5      n[        U5       HI  nUS S 2S S 2S S 2S S 2U* U-   4   SXC-  -
  -  US S 2S S 2S S 2S S 2U4   XC-  -  -   US S 2S S 2S S 2S S 2U4'   MK     U$ )NrR   r   r_  )r"   r[  r   r\  r4   s        r$   blend_hAutoencoderKLQwenImage.blend_h  s    1772;\B|$A Aq!l]Q->!>?1qGWCWX[\]^`acdfgij]j[k \  AaAq!m % r&   c                    UR                   u    p#pEX@R                  -  nXPR                  -  nU R                  U R                  -  nU R                  U R                  -  n	U R                  U R                  -  n
U R
                  U R                  -  nX-
  nX-
  n/ n[        SX@R                  5       GHH  n/ n[        SXPR
                  5       GH  nU R                  5         / nSUS-
  S-  -   n[        U5       H  nS/U l        US:X  a.  USS2SS2SS2XU R                  -   2UUU R                  -   24   nO<USS2SS2SSUS-
  -  -   SSU-  -   2XU R                  -   2UUU R                  -   24   nU R                  UU R                  U R                  S9nU R                  U5      nUR                  U5        M     UR                  [        R                  " USS95        GM     UR                  U5        GMK     U R                  5         / n[        U5       H  u  nn/ n[        U5       Hi  u  nnUS:  a  U R!                  XS-
     U   UU5      nUS:  a  U R#                  UUS-
     UU5      nUR                  USS2SS2SS2SU
2SU24   5        Mk     UR                  [        R                  " USS95        M     [        R                  " US	S9SS2SS2SS2SU2SU24   nU$ )
zEncode a batch of images using a tiled encoder.

Args:
    x (`torch.Tensor`): Input batch of videos.

Returns:
    `torch.Tensor`:
        The latent representation of the encoded videos.
r   r   r(   Nr6  r   r)   rR   r   )r0   r  r  r  r  r  r   r3  r1  r  r2  r  r   r.   r/   r   ra  rd  )r"   r4   r   
num_framesr   r   latent_heightlatent_widthrQ  rR  tile_latent_stride_heighttile_latent_stride_widthblend_heightblend_widthrowsr   rowjr   frame_ranger   tileresult_rows
result_rowr<  s                            r$   r7  #AutoencoderKLQwenImage.tiled_encode  s    +,'''1&"@"@@ > >>!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b -I+F q&"@"@AAC1e%B%BC  ":>a"77{+A*+D&Av Arr143N3N/N+NPQTUX\XrXrTrPr!rs QUOa!a%i7D$?$? ??D$>$> >>	@   <<9K9KVZVhVh<iD??40DKK% , 

599Tq12' D( KK- B. 	oFAsJ$S>4 q5<<UAlKDq5<<AE
D+FD!!$q!Q0J1J0JLeMeLe'e"fg * uyy<= & ii+Aq!^m^]l],RS
r&   c                    UR                   u    p4pVXPR                  -  nX`R                  -  nU R                  U R                  -  n	U R                  U R                  -  n
U R                  U R                  -  nU R
                  U R                  -  nU R                  U R                  -
  nU R                  U R
                  -
  n/ n[        SX[5       H  n/ n[        SXl5       H  nU R                  5         / n[        U5       Ho  nS/U l        USS2SS2UUS-   2UUU	-   2UUU
-   24   nU R                  U5      nU R                  UU R                  U R                  S9nUR                  U5        Mq     UR                  [        R                  " USS95        M     UR                  U5        M     U R                  5         / n[        U5       H  u  nn/ n[        U5       H~  u  nnUS:  a  U R!                  UUS-
     U   UU5      nUS:  a  U R#                  UUS-
     UU5      nUR                  USS2SS2SS2SU R                  2SU R
                  24   5        M     UR                  [        R                  " USS95        M     [        R                  " USS9SS2SS2SS2SU2SU24   nU(       d  U4$ [%        US	9$ )
a  
Decode a batch of images using a tiled decoder.

Args:
    z (`torch.Tensor`): Input batch of latent vectors.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

Returns:
    [`~models.vae.DecoderOutput`] or `tuple`:
        If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
        returned.
r   Nr   r6  r   r)   rR   r   rM  )r0   r  r  r  r  r  r   r3  r.  r  r
  r/  r   r.   r/   r   ra  rd  r   )r"   rH  r?  r   rg  r   r   sample_heightsample_widthrQ  rR  rj  rk  rl  rm  rn  r   ro  rp  r   r   rr  rX  rs  rt  decs                             r$   rO  #AutoencoderKLQwenImage.tiled_decode  s    +,'''1&!?!??===!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b 22T5S5SS0043P3PP q&<AC1e>  "z*A&'SDNQ1q1u9a!6L2L.LaRSVkRkNkklD//5D"ll4DNNUYUcUcldGKK( + 

599Tq12 ? KK = 	oFAsJ$S>4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q0P$2P2P0PRqTXTqTqRq'q"rs * uyy<= & ii+Aq!^m^]l],RS6MC((r&   rN  sample_posterior	generatorc                     UnU R                  U5      R                  nU(       a  UR                  US9nOUR                  5       nU R	                  XsS9nU$ )z
Args:
    sample (`torch.Tensor`): Input sample.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
)r|  rJ  )rF  rA  rN  rb   rY  )	r"   rN  r{  r?  r|  r4   rE  rH  ry  s	            r$   r3   AutoencoderKLQwenImage.forward  sS     KKN..	  9 5A Akk!k5
r&   )r  r.  r-  r1  r0  r2  r/  r
  r  r  r  r  r   r   r  r  r  r  r  r  r   )NNNN)r   N)T)FTN)&r8   r9   r:   r;   r<    _supports_gradient_checkpointingr	   r=   r   r   r[   rV   r    r   r  r  r"  r%  r3  r.   Tensorr=  r   r   r   r   rF  rS  r   rY  ra  rd  r7  rO  	Generatorr3   r>   r?   r@   s   @r$   r   r     s    (-$  +#%*= %o $d8
8
 8
 *	8

 8
 %[8
 "$Z8
 8
 5k8
 %[8
 
8
 8
x 15/35948b (b  (}b $,E?	b
 #+5/b 
b<  !9  0 37::,0:	"E*F$GG	H: :4) )D )2 - -4 -5X]XdXdIdCe - -0 %,, c ell  %,, c ell @ell @/B @D=)ell =) =)}^c^j^jOjIk =)D "' /3  	
 EOO, 
}ell*	+ r&   r   )0typingr   r   r   r   r.   torch.nnrJ   torch.nn.functional
functionalr1   torch.utils.checkpointconfiguration_utilsr   r	   loadersr
   utilsr   utils.accelerate_utilsr   activationsr   modeling_outputsr   modeling_utilsr   vaer   r   
get_loggerr8   loggerru   Conv3dr   ModulerB   UpsamplerX   r`   r   r   r   r   r   r   r   r^   r&   r$   <module>r     s  * 0 /      B -  8 ( 2 ' < 
		H	%
*"BII *"Zk		 k45 5T		 TnGRYY GT*bii *Z$		 $Nh hVBryy BJr rjSZ6L Sr&   