
    +h                        S SK JrJrJr  S SKrS SKrS SKJr  S SK	Js  J
r  S SKrSSKJrJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  \R>                  " \ 5      r! S)S\"S\"S\RF                  S\RH                  S\"S\RJ                  4S jjr& " S S\RN                  5      r( " S S\RN                  5      r) " S S\RN                  5      r* " S S\RN                  5      r+ " S S\RN                  5      r, " S S \RN                  5      r- " S! S"\RN                  5      r. " S# S$\RN                  5      r/ " S% S&\RN                  5      r0 " S' S(\\5      r1g)*    )OptionalTupleUnionN   )ConfigMixinregister_to_config)logging)apply_forward_hook   )get_activation)	Attention)AutoencoderKLOutput)
ModelMixin   )DecoderOutputDiagonalGaussianDistribution
num_framesheight_widthdtypedevice
batch_sizereturnc                 R   [         R                  " SU S-   [         R                  US9nUR                  U5      n[         R                  " XfSS9u  px[         R
                  " Xx:*  S[        S5      * 5      R                  US9n	Ub"  U	R                  S5      R                  USS5      n	U	$ )	Nr   )r   r   xy)indexingr   inf)r   )
torcharangeint32repeat_interleavemeshgridwherefloatto	unsqueezeexpand)
r   r   r   r   r   indicesindices_blocksxymasks
             t/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/autoencoders/autoencoder_kl_hunyuan_video.pyprepare_causal_attention_maskr.   $   s     ll1j1nEKKOG..|<N>>.4HDA;;qvq5<-033%3@D~~a ''
B;K    c                      ^  \ rS rSr      SS\S\S\\\\\\4   4   S\\\\\\4   4   S\\\\\\4   4   S\\\\\\4   4   S\S	\S
S4U 4S jjjr	S\
R                  S
\
R                  4S jrSrU =r$ )HunyuanVideoCausalConv3d1   in_channelsout_channelskernel_sizestridepaddingdilationbiaspad_moder   Nc	           
         > [         T	U ]  5         [        U[        5      (       a  X3U4OUnXl        US   S-  US   S-  US   S-  US   S-  US   S-
  S4U l        [        R                  " XX4XVUS9U l        g )Nr   r   r   r9   )	super__init__
isinstanceintr:   time_causal_paddingnnConv3dconv)
selfr3   r4   r5   r6   r7   r8   r9   r:   	__class__s
            r-   r>   !HunyuanVideoCausalConv3d.__init__2   s     	AKKY\A]A]{=cn NaNaNaNaNQ$
  IIkgfjk	r/   hidden_statesc                 v    [         R                  " XR                  U R                  S9nU R	                  U5      $ )N)mode)FpadrA   r:   rD   rE   rH   s     r-   forward HunyuanVideoCausalConv3d.forwardM   s+    m-E-EDMMZyy''r/   )rD   r:   rA   )r   r   r   r   T	replicate)__name__
__module____qualname____firstlineno__r@   r   r   boolstrr>   r   TensorrN   __static_attributes____classcell__rF   s   @r-   r1   r1   1   s    
 9:344556#ll l 3c3m 445	l
 c5c3//0l sE#sC-001l U3S=112l l l 
l l6(U\\ (ell ( (r/   r1   c                      ^  \ rS rSr     SS\S\\   S\S\S\S\\\\4   S	S4U 4S
 jjjr	S\
R                  S	\
R                  4S jrSrU =r$ )HunyuanVideoUpsampleCausal3DR   Nr3   r4   r5   r6   r9   upsample_factorr   c                 d   > [         TU ]  5         U=(       d    UnX`l        [        XX4US9U l        g Nr<   )r=   r>   r^   r1   rD   )rE   r3   r4   r5   r6   r9   r^   rF   s          r-   r>   %HunyuanVideoUpsampleCausal3D.__init__S   s1     	#2{.,[bfg	r/   rH   c                    UR                  S5      nUR                  SUS-
  4SS9u  p4[        R                  " UR	                  S5      U R
                  SS  SS9R                  S5      nUS:  aF  UR                  5       n[        R                  " X@R
                  SS9n[        R                  " X44SS9nOUnU R                  U5      nU$ )Nr   r   dimnearest)scale_factorrJ   )sizesplitrK   interpolatesqueezer^   r&   
contiguousr   catrD   )rE   rH   r   first_frameother_framess        r-   rN   $HunyuanVideoUpsampleCausal3D.forwardc   s    "''*
$1$7$7JN8KQR$7$S!mm"1E1Eab1IPY

)A, 	 > (224L==DXDX_hiL!II{&AqIM'M		-0r/   )rD   r^   )Nr   r   Tr   r   r   )rQ   rR   rS   rT   r@   r   rU   r   r$   r>   r   rW   rN   rX   rY   rZ   s   @r-   r\   r\   R   s     '+6?hh smh 	h
 h h ueU23h 
h h U\\ ell  r/   r\   c                      ^  \ rS rSr     SS\S\\   S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr
SrU =r$ )HunyuanVideoDownsampleCausal3D|   Nchannelsr4   r7   r5   r9   r   c           	      X   > [         TU ]  5         U=(       d    Un[        XXFX5S9U l        g r`   )r=   r>   r1   rD   )rE   rt   r4   r7   r5   r9   r6   rF   s          r-   r>   'HunyuanVideoDownsampleCausal3D.__init__}   s+     	#/x,X[Zam	r/   rH   c                 (    U R                  U5      nU$ NrD   rM   s     r-   rN   &HunyuanVideoDownsampleCausal3D.forward   s    		-0r/   ry   )Nr   r   Tr   )rQ   rR   rS   rT   r@   r   rU   r>   r   rW   rN   rX   rY   rZ   s   @r-   rr   rr   |   s     '+nn smn 	n
 n n 
n nU\\ ell  r/   rr   c                      ^  \ rS rSr     SS\S\\   S\S\S\S\S	S4U 4S
 jjjrS\	R                  S	\	R                  4S jrSrU =r$ )HunyuanVideoResnetBlockCausal3D   Nr3   r4   dropoutgroupsepsnon_linearityr   c                   > [         TU ]  5         U=(       d    Un[        U5      U l        [        R
                  " XAUSS9U l        [        XSSS5      U l        [        R
                  " XBUSS9U l	        [        R                  " U5      U l        [        X"SSS5      U l        S U l        X:w  a  [        XSSS5      U l        g g )NT)r   affiner   r   r   )r=   r>   r   nonlinearityrB   	GroupNormnorm1r1   conv1norm2Dropoutr~   conv2conv_shortcut)rE   r3   r4   r~   r   r   r   rF   s          r-   r>   (HunyuanVideoResnetBlockCausal3D.__init__   s     	#2{*=9\\&3tL
-kAqQ
\\&CM
zz'*-l!QPQR
!&!9+UVXY[\!]D 'r/   rH   c                 \   UR                  5       nUnU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  b  U R                  U5      nX-   nU$ rx   )rk   r   r   r   r   r~   r   r   )rE   rH   residuals      r-   rN   'HunyuanVideoResnetBlockCausal3D.forward   s    %002 

=1))-8

=1

=1))-8]3

=1)))(3H%0r/   )r   r   r   r~   r   r   r   )N            ư>swish)rQ   rR   rS   rT   r@   r   r$   rV   r>   r   rW   rN   rX   rY   rZ   s   @r-   r|   r|      s     '+$^^ sm^ 	^
 ^ ^ ^ 
^ ^2U\\ ell  r/   r|   c                      ^  \ rS rSr       SS\S\S\S\S\S\S\S	\S
S4U 4S jjjrS\	R                  S
\	R                  4S jrSrU =r$ )HunyuanVideoMidBlock3D   r3   r~   
num_layers
resnet_epsresnet_act_fnresnet_groupsadd_attentionattention_head_dimr   Nc	                   > [         TU ]  5         Ub  UO[        US-  S5      nXpl        [	        UUUUUUS9/n	/ n
[        U5       He  nU R                  (       a#  U
R                  [        UX-  UUUSSSSS9	5        OU
R                  S 5        U	R                  [	        UUUUUUS95        Mg     [        R                  " U
5      U l
        [        R                  " U	5      U l        SU l        g )N   r   r3   r4   r   r   r~   r   T)headsdim_headr   norm_num_groupsresidual_connectionr9   upcast_softmax_from_deprecated_attn_blockF)r=   r>   minr   r|   rangeappendr   rB   
ModuleList
attentionsresnetsgradient_checkpointing)rE   r3   r~   r   r   r   r   r   r   r   r   _rF   s               r-   r>   HunyuanVideoMidBlock3D.__init__   s    	)6)BK[\L\^`Ha* ,'($+	
 
z"A!!!!#)?!3&(5,0!'+48
 !!$'NN/ +!,"(#"/	% #: --
3}}W-&+#r/   rH   c           	         [         R                  " 5       (       a  U R                  (       a  U R                  U R                  S   U5      n[        U R                  U R                  SS  5       H  u  p#Ub  UR                  u  pEpgnUR                  SSSSS5      R                  SS5      n[        XgU-  UR                  UR                  US9n	U" XS9nUR                  SXgU45      R                  SSSSS5      nU R                  X15      nM     U$ U R                  S   " U5      n[        U R                  U R                  SS  5       H  u  p#Ub  UR                  u  pEpgnUR                  SSSSS5      R                  SS5      n[        XgU-  UR                  UR                  US9n	U" XS9nUR                  SXgU45      R                  SSSSS5      nU" U5      nM     U$ )Nr   r   r   r   r   )r   )attention_mask)r   is_grad_enabledr   _gradient_checkpointing_funcr   zipr   shapepermuteflattenr.   r   r   	unflatten)
rE   rH   attnresnetr   num_channelsr   heightwidthr   s
             r-   rN   HunyuanVideoMidBlock3D.forward   s     ""t'B'B ==dll1o}]M #DOOT\\!"5E F#JWJ]J]GJj%$1$9$9!Q1a$H$P$PQRTU$VM%B"UNM4G4GI]I]jt&N %)$VM$1$;$;A
TY?Z$[$c$cdeghjkmnpq$rM $ A A& X !G6  !LLOM:M #DOOT\\!"5E F#JWJ]J]GJj%$1$9$9!Q1a$H$P$PQRTU$VM%B"UNM4G4GI]I]jt&N %)$VM$1$;$;A
TY?Z$[$c$cdeghjkmnpq$rM &} 5 !G r/   )r   r   r   r   )r   r   r   r   r   Tr   rQ   rR   rS   rT   r@   r$   rV   rU   r>   r   rW   rN   rX   rY   rZ   s   @r-   r   r      s      $""#<,<, <, 	<,
 <, <, <, <,  <, 
<, <,|U\\ ell  r/   r   c                      ^  \ rS rSr        SS\S\S\S\S\S\S\S	\S
\S\SS4U 4S jjjrS\	R                  S\	R                  4S jrSrU =r$ )HunyuanVideoDownBlock3Di  r3   r4   r~   r   r   r   r   add_downsampledownsample_stridedownsample_paddingr   Nc                 F  > [         TU ]  5         / n[        U5       H*  nUS:X  a  UOUnUR                  [	        UUUUUUS95        M,     [
        R                  " U5      U l        U(       a'  [
        R                  " [        UUU
U	S9/5      U l	        OS U l	        SU l
        g )Nr   r   )r4   r7   r6   F)r=   r>   r   r   r|   rB   r   r   rr   downsamplersr   )rE   r3   r4   r~   r   r   r   r   r   r   r   r   irF   s                r-   r>    HunyuanVideoDownBlock3D.__init__   s     	z"A)*a+\KNN/ +!-"(#"/	 # }}W- "2$%1 20		!D !%D&+#r/   rH   c                 ,   [         R                  " 5       (       a6  U R                  (       a%  U R                   H  nU R	                  X!5      nM     OU R                   H  nU" U5      nM     U R
                  b  U R
                   H  nU" U5      nM     U$ rx   )r   r   r   r   r   r   )rE   rH   r   downsamplers       r-   rN   HunyuanVideoDownBlock3D.forwardO  s      ""t'B'B,, $ A A& X ' ,, &} 5 ' (#00 +M :  1 r/   )r   r   r   )r   r   r   r   r   Tr   r   r   rZ   s   @r-   r   r     s    
  $#!""#-,-, -, 	-,
 -, -, -, -, -, -,  -, 
-, -,^U\\ ell  r/   r   c                      ^  \ rS rSr       SS\S\S\S\S\S\S\S	\S
\\\\4   SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )HunyuanVideoUpBlock3Di^  r3   r4   r~   r   r   r   r   add_upsampleupsample_scale_factorr   Nc
                 D  > [         TU ]  5         / n
[        U5       H*  nUS:X  a  UOUnU
R                  [	        UUUUUUS95        M,     [
        R                  " U
5      U l        U(       a&  [
        R                  " [        UUU	S9/5      U l	        OS U l	        SU l
        g )Nr   r   )r4   r^   F)r=   r>   r   r   r|   rB   r   r   r\   
upsamplersr   )rE   r3   r4   r~   r   r   r   r   r   r   r   r   input_channelsrF   s                r-   r>   HunyuanVideoUpBlock3D.__init___  s     	z"A,-F[NNN/ .!-"(#"/	 # }}W- mm0$%1(=DO #DO&+#r/   rH   c                 ,   [         R                  " 5       (       a6  U R                  (       a%  U R                   H  nU R	                  X!5      nM     OU R                   H  nU" U5      nM     U R
                  b  U R
                   H  nU" U5      nM     U$ rx   )r   r   r   r   r   r   )rE   rH   r   	upsamplers       r-   rN   HunyuanVideoUpBlock3D.forward  s}      ""t'B'B,, $ A A& X ' ,, &} 5 ' ??&!__	 )- 8 - r/   )r   r   r   )r   r   r   r   r   Trp   )rQ   rR   rS   rT   r@   r$   rV   rU   r   r>   r   rW   rN   rX   rY   rZ   s   @r-   r   r   ^  s    
  $!6?,,,, ,, 	,,
 ,, ,, ,, ,, ,,  %S#s]3,, 
,, ,,\U\\ ell  r/   r   c                      ^  \ rS rSrSr           SS\S\S\\S4   S\\S4   S\S	\S
\S\S\S\SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )HunyuanVideoEncoder3Di  zp
Causal encoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
r3   r4   down_block_types.block_out_channelslayers_per_blockr   act_fndouble_ztemporal_compression_ratiospatial_compression_ratior   Nc                 ,  > [         TU ]  5         [        XS   SSS9U l        S U l        [
        R                  " / 5      U l        US   n[        U5       GHB  u  pUS:w  a  [        SU 35      eUnXM   nU[        U5      S-
  :H  n[        [        R                  " U5      5      n[        [        R                  " U
5      5      nU
S:X  a:  [        UU:  5      n[        U[        U5      S-
  U-
  :  =(       a    U(       + 5      nO1U
S:X  a  [        UU:  5      n[        UU:  5      nO[        S	U
 35      eU(       a  S
OSnU(       a  SOSn[        UU-   5      n[!        UUU[        U=(       d    U5      SUUUSS9	nU R                  R#                  U5        GME     [%        US   SUUS   UU	S9U l        [
        R&                  " US   USS9U l        [
        R*                  " 5       U l        U(       a  SU-  OUn[        US   USS9U l        SU l        g )Nr   r   r   r5   r6   r   zUnsupported down_block_type: r      $Unsupported time_compression_ratio: r   r   r   r   r   r   r   )	r   r3   r4   r   r   r   r   r   r   r   r3   r   r   r   r   r   r   
num_groupsr   r   r5   F)r=   r>   r1   conv_in	mid_blockrB   r   down_blocks	enumerate
ValueErrorlenr@   nplog2rU   tupler   r   r   r   conv_norm_outSiLUconv_actconv_outr   )rE   r3   r4   r   r   r   r   r   r   mid_block_add_attentionr   r   output_channelr   down_block_typeinput_channelis_final_blocknum_spatial_downsample_layersnum_time_downsample_layersadd_spatial_downsampleadd_time_downsampledownsample_stride_HWdownsample_stride_Tr   
down_blockconv_out_channelsrF   s                             r-   r>   HunyuanVideoEncoder3D.__init__  sA   & 	/PQ=R`ajkl==,+A."+,<"=A";; #@@Q!RSS*M/2N#&8"9A"==N,/8Q0R,S)),RWW5O-P)Q&)Q.)-a2O.O)P&&*#01A58RRSj\jXj'# ,q0)-a2O.O)P&&*1/I+I&J# #GHbGc!dee-C6 *=$4 %&9<P&P Q0+)+#$:$Q>QR$-"3#$
J ##J/K #>N 0*2. 1"5)1
  \\7I"7MZiost	08A,l01CB1GIZhij&+#r/   rH   c                    U R                  U5      n[        R                  " 5       (       aR  U R                  (       aA  U R                   H  nU R                  X!5      nM     U R                  U R                  U5      nO,U R                   H  nU" U5      nM     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rx   )
r   r   r   r   r   r   r   r   r   r   )rE   rH   r  s      r-   rN   HunyuanVideoEncoder3D.forward  s    ]3  ""t'B'B"..
 $ A A* \ / !==dnnm\M"..
 *= 9 / !NN=9M**=9m4m4r/   )r   r   r   r   r   r   r   )r   r   r   r   r   r            r  r   r   siluTTr   r   )rQ   rR   rS   rT   __doc__r@   r   rV   rU   r>   r   rW   rN   rX   rY   rZ   s   @r-   r   r     s     -
 /C !! $*+)*#P,P, P,  S/	P, "#s(OP, P, P, P, P,  %(!P," $'#P,$ 
%P, P,dU\\ ell  r/   r   c                      ^  \ rS rSrSr          SS\S\S\\S4   S\\S4   S\S	\S
\S\S\4U 4S jjjrS\	R                  S\	R                  4S jrSrU =r$ )HunyuanVideoDecoder3Di	  zp
Causal decoder for 3D video-like data introduced in [Hunyuan Video](https://huggingface.co/papers/2412.03603).
r3   r4   up_block_types.r   r   r   r   time_compression_ratior   c                   > [         TU ]  5         XPl        [        XS   SSS9U l        [
        R                  " / 5      U l        [        US   SUUS   UUS9U l	        [        [        U5      5      nUS   n[        U5       GH-  u  pUS:w  a  [        S	U 35      eUnX   nU[        U5      S-
  :H  n[        [         R"                  " U
5      5      n[        [         R"                  " U	5      5      nU	S
:X  a:  [%        UU:  5      n[%        U[        U5      S-
  U-
  :  =(       a    U(       + 5      nO[        SU	 35      eU(       a  SOSnU(       a  SOSn['        UU-   5      n[)        U R                  S-   UU[%        U=(       d    U5      USUUS9nU R                  R+                  U5        UnGM0     [
        R,                  " US   USS9U l        [
        R0                  " 5       U l        [        US   USS9U l        SU l        g )Nr   r   r   r   r   r   r   r   zUnsupported up_block_type: r   r   r   r   r   r   )r   r3   r4   r   r   r   r   r   r   r   F)r=   r>   r   r1   r   rB   r   	up_blocksr   r   listreversedr   r   r   r@   r   r   rU   r   r   r   r   r   r   r   r   r   )rE   r3   r4   r  r   r   r   r   r   r  r   reversed_block_out_channelsr   r   up_block_typeprev_output_channelr   num_spatial_upsample_layersnum_time_upsample_layersadd_spatial_upsampleadd_time_upsampleupsample_scale_factor_HWupsample_scale_factor_Tr   up_blockrF   s                            r-   r>   HunyuanVideoDecoder3D.__init__  s#   $ 	 0/PR=Sabklmr* 0*2. 1"5)1
 '+84F+G&H#4Q7 ). 9A 77 #>}o!NOO"08;N#&8"9A"==N*-bgg6O.P*Q''*2773I+J'K$%*'+A0K,K'L$$(/0147OOOfXfTf%! !#GH^G_!`aa1Ev6$.?dT#$)*AD\*\$]!,0014/+!"6"K:KL&;$-	H NN!!(+"0E !:J  \\7I!7LYhnrs	01CA1Fbcd&+#r/   rH   r   c                    U R                  U5      n[        R                  " 5       (       aR  U R                  (       aA  U R	                  U R
                  U5      nU R                   H  nU R	                  X!5      nM     O,U R                  U5      nU R                   H  nU" U5      nM     U R                  U5      nU R                  U5      nU R                  U5      nU$ rx   )
r   r   r   r   r   r   r  r   r   r   )rE   rH   r   s      r-   rN   HunyuanVideoDecoder3D.forward^  s    ]3  ""t'B'B ==dnnm\M NN $ A A( Z + !NN=9M NN ( 7 + **=9m4m4r/   )r   r   r   r   r   r   r   r  )
r   r   r   r   r   r   r	  r   r   r  Tr   r   )rQ   rR   rS   rT   r  r@   r   rV   r>   r   rW   rN   rX   rY   rZ   s   @r-   r  r  	  s     +
 /C !! $&')*!N,N, N, c3h	N, "#s(ON, N, N, N, !$N,  $'!N, N,`U\\ ell  r/   r  c                     ^  \ rS rSrSrSr\             S5S\S\S\S\\	S4   S	\\	S4   S
\\   S\S\	S\S\
S\S\S\SS4U 4S jjj5       r      S6S\\   S\\   S\\   S\\
   S\\
   S\\
   SS4S jjrS7S jrS7S jrS7S jrS\R&                  S\R&                  4S  jr\ S8S\R&                  S!\S\\\\   4   4S" jj5       rS8S#\R&                  S!\S\\\R&                  4   4S$ jjr\S8S#\R&                  S!\S\\\R&                  4   4S% jj5       rS&\R&                  S'\R&                  S(\S\R&                  4S) jrS&\R&                  S'\R&                  S(\S\R&                  4S* jrS&\R&                  S'\R&                  S(\S\R&                  4S+ jrS\R&                  S\4S, jr S8S#\R&                  S!\S\\\R&                  4   4S- jjr!S\R&                  S\4S. jr"S8S#\R&                  S!\S\\\R&                  4   4S/ jjr#   S9S0\R&                  S1\S!\S2\\RH                     S\\\R&                  4   4
S3 jjr%S4r&U =r'$ ):AutoencoderKLHunyuanVideoit  aV  
A VAE model with KL loss for encoding videos into latents and decoding latent representations into videos.
Introduced in [HunyuanVideo](https://huggingface.co/papers/2412.03603).

This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
for all models (such as downloading or saving).
Tr3   r4   latent_channelsr   .r  r   r   r   r   scaling_factorr   r   r   r   Nc                   > [         TU ]  5         Xl        [        UUUUUU	USUUUS9U l        [        UUUUUU	UUUUS9
U l        [        R                  " SU-  SU-  SS9U l	        [        R                  " X3SS9U l
        Xl        Xl        SU l        SU l        SU l        SU l        SU l        SU l        S	U l        S
U l        S
U l        SU l        g )NT)r3   r4   r   r   r   r   r   r   r   r   r   )
r3   r4   r  r   r   r   r   r  r   r   r   r   r   Fr           )r=   r>   r  r   encoderr  decoderrB   rC   
quant_convpost_quant_convr   r   use_slicing
use_tilinguse_framewise_encodinguse_framewise_decodingtile_sample_min_heighttile_sample_min_widthtile_sample_min_num_framestile_sample_stride_heighttile_sample_stride_widthtile_sample_stride_num_frames)rE   r3   r4   r'  r   r  r   r   r   r   r(  r   r   r   rF   s                 r-   r>   "AutoencoderKLHunyuanVideo.__init__  s	   6 	&@#,#(-1-+$;'A&?
 -'%)1-+#=&?$;
 ))A$7_9LZ[\!yyWXY)B&*D' !
   '+#&*# '*#%("*,' *-&(+%-/*r/   r5  r6  r7  r8  r9  r:  c                 J   SU l         U=(       d    U R                  U l        U=(       d    U R                  U l        U=(       d    U R                  U l        U=(       d    U R                  U l        U=(       d    U R
                  U l        U=(       d    U R                  U l        g)a_  
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.

Args:
    tile_sample_min_height (`int`, *optional*):
        The minimum height required for a sample to be separated into tiles across the height dimension.
    tile_sample_min_width (`int`, *optional*):
        The minimum width required for a sample to be separated into tiles across the width dimension.
    tile_sample_min_num_frames (`int`, *optional*):
        The minimum number of frames required for a sample to be separated into tiles across the frame
        dimension.
    tile_sample_stride_height (`int`, *optional*):
        The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
        no tiling artifacts produced across the height dimension.
    tile_sample_stride_width (`int`, *optional*):
        The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
        artifacts produced across the width dimension.
    tile_sample_stride_num_frames (`int`, *optional*):
        The stride between two consecutive frame tiles. This is to ensure that there are no tiling artifacts
        produced across the frame dimension.
TN)r2  r5  r6  r7  r8  r9  r:  )rE   r5  r6  r7  r8  r9  r:  s          r-   enable_tiling'AutoencoderKLHunyuanVideo.enable_tiling  s~    @ &<&[@[@[#%:%Xd>X>X"*D*gHgHg')B)ddFdFd&(@(aDDaDa%-J-pdNpNp*r/   c                     SU l         g)z
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
decoding in one step.
FN)r2  rE   s    r-   disable_tiling(AutoencoderKLHunyuanVideo.disable_tiling  s    
  r/   c                     SU l         g)z
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
TNr1  r@  s    r-   enable_slicing(AutoencoderKLHunyuanVideo.enable_slicing  s    
  r/   c                     SU l         g)z
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
decoding in one step.
FNrD  r@  s    r-   disable_slicing)AutoencoderKLHunyuanVideo.disable_slicing  s    
 !r/   r*   c                 L   UR                   u  p#pEnU R                  (       a   X@R                  :  a  U R                  U5      $ U R                  (       a/  X`R
                  :  d  XPR                  :  a  U R                  U5      $ U R                  U5      nU R                  U5      nU$ rx   )
r   r3  r7  _temporal_tiled_encoder2  r6  r5  tiled_encoder-  r/  )rE   r*   r   r   r   r   r   encs           r-   _encode!AutoencoderKLHunyuanVideo._encode  s    >?gg;
*e&&:8W8W+W..q11??(B(B BfOjOjFj$$Q''LLOooa 
r/   return_dictc                 >   U R                   (       aY  UR                  S   S:  aF  UR                  S5       Vs/ s H  o0R                  U5      PM     nn[        R
                  " U5      nOU R                  U5      n[        U5      nU(       d  U4$ [        US9$ s  snf )a  
Encode a batch of images into latents.

Args:
    x (`torch.Tensor`): Input batch of images.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.

Returns:
        The latent representations of the encoded videos. If `return_dict` is True, a
        [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
r   r   )latent_dist)r1  r   rh   rN  r   rl   r   r   )rE   r*   rP  x_sliceencoded_slicesh	posteriors          r-   encode AutoencoderKLHunyuanVideo.encode!  s      
QCD771:N:ll73:NN		.)AQA03	<"y99 Os   Bzc                    UR                   u  p4pVnU R                  U R                  -  nU R                  U R                  -  n	U R                  U R
                  -  n
U R                  (       a  XZ:  a  U R                  XS9$ U R                  (       a  Xy:  d  Xh:  a  U R                  XS9$ U R                  U5      nU R                  U5      nU(       d  U4$ [        US9$ )NrP  sample)r   r5  r   r6  r7  r   r4  _temporal_tiled_decoder2  tiled_decoder0  r.  r   )rE   rY  rP  r   r   r   r   r   tile_latent_min_heighttile_latent_min_widthtile_latent_min_num_framesdecs               r-   _decode!AutoencoderKLHunyuanVideo._decode=  s    >?gg;
*e!%!<!<@^@^!^ $ : :d>\>\ \%)%D%DHgHg%g"&&:+R..q.JJ?? =A`$$Q$@@  #ll1o6MC((r/   c                 P   U R                   (       ac  UR                  S   S:  aP  UR                  S5       Vs/ s H  o0R                  U5      R                  PM     nn[
        R                  " U5      nOU R                  U5      R                  nU(       d  U4$ [        US9$ s  snf )a  
Decode a batch of images.

Args:
    z (`torch.Tensor`): Input batch of latent vectors.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

Returns:
    [`~models.vae.DecoderOutput`] or `tuple`:
        If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
        returned.
r   r   r\  )r1  r   rh   rd  r]  r   rl   r   )rE   rY  rP  z_slicedecoded_slicesdecodeds         r-   decode AutoencoderKLHunyuanVideo.decodeQ  s     
QJK''RS*U*wll73::*NUii/Gll1o,,G:G,, Vs   $B#abblend_extentc           	         [        UR                  S   UR                  S   U5      n[        U5       HI  nUS S 2S S 2S S 2U* U-   S S 24   SXC-  -
  -  US S 2S S 2S S 2US S 24   XC-  -  -   US S 2S S 2S S 2US S 24'   MK     U$ )Nr   r   r   r   )rE   rl  rm  rn  r+   s        r-   blend_v!AutoencoderKLHunyuanVideo.blend_vk  s    1772;\B|$A Aq<-!*;Q!>?1qGWCWX[\]^`acdfgij]j[k \  AaAq!m % r/   c                    [        UR                  S   UR                  S   U5      n[        U5       HI  nUS S 2S S 2S S 2S S 2U* U-   4   SXC-  -
  -  US S 2S S 2S S 2S S 2U4   XC-  -  -   US S 2S S 2S S 2S S 2U4'   MK     U$ )Nr   r   rq  rE   rl  rm  rn  r*   s        r-   blend_h!AutoencoderKLHunyuanVideo.blend_hs  s    1772;\B|$A Aq!l]Q->!>?1qGWCWX[\]^`acdfgij]j[k \  AaAq!m % r/   c           	         [        UR                  S   UR                  S   U5      n[        U5       HI  nUS S 2S S 2U* U-   S S 2S S 24   SXC-  -
  -  US S 2S S 2US S 2S S 24   XC-  -  -   US S 2S S 2US S 2S S 24'   MK     U$ )Nr   rq  ru  s        r-   blend_t!AutoencoderKLHunyuanVideo.blend_t{  s    1772;\B|$A A}q'8!Q!>?1qGWCWX[\]^`acdfgij]j[k \  AaAq!m % r/   c                 P   UR                   u  p#pEnXPR                  -  nX`R                  -  nU R                  U R                  -  n	U R                  U R                  -  n
U R                  U R                  -  nU R
                  U R                  -  nX-
  nX-
  n/ n[        SXPR                  5       H  n/ n[        SX`R
                  5       Hd  nUSS2SS2SS2UUU R                  -   2UUU R                  -   24   nU R                  U5      nU R                  U5      nUR                  U5        Mf     UR                  U5        M     / n[        U5       H  u  nn/ n[        U5       Hj  u  nnUS:  a  U R                  UUS-
     U   UU5      nUS:  a  U R                  UUS-
     UU5      nUR                  USS2SS2SS2SU2SU24   5        Ml     UR                  [        R                  " USS95        M     [        R                  " USS9SS2SS2SS2SU2SU24   nU$ )zEncode a batch of images using a tiled encoder.

Args:
    x (`torch.Tensor`): Input batch of videos.

Returns:
    `torch.Tensor`:
        The latent representation of the encoded videos.
r   Nr   r   rc   r   )r   r   r5  r6  r8  r9  r   r-  r/  r   r   rr  rv  r   rl   )rE   r*   r   r   r   r   r   latent_heightlatent_widthr`  ra  tile_latent_stride_heighttile_latent_stride_widthblend_heightblend_widthrowsr   rowjtileresult_rows
result_rowrM  s                          r-   rL  &AutoencoderKLHunyuanVideo.tiled_encode  s2    ?@gg;
*e"@"@@ > >>!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b -I+F q&"@"@AAC1e%B%BCAq!a$*E*E&E"Eq1tOiOiKiGiij||D)t,

4 	 D
 KK B oFAsJ$S>4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q0J1J0JLeMeLe'e"fg * uyy;< & ii+Aq!^m^]l],RS
r/   c                    UR                   u  p4pVnX`R                  -  nXpR                  -  n	U R                  U R                  -  n
U R                  U R                  -  nU R                  U R                  -  nU R
                  U R                  -  nU R                  U R                  -
  nU R                  U R
                  -
  n/ n[        SXl5       Hv  n/ n[        SX}5       HP  nUSS2SS2SS2UUU
-   2UUU-   24   nU R                  U5      nU R                  U5      nUR                  U5        MR     UR                  U5        Mx     / n[        U5       H  u  nn/ n[        U5       H~  u  nnUS:  a  U R                  UUS-
     U   UU5      nUS:  a  U R                  UUS-
     UU5      nUR                  USS2SS2SS2SU R                  2SU R
                  24   5        M     UR                  [        R                  " USS95        M     [        R                  " USS9SS2SS2SS2SU2SU	24   nU(       d  U4$ [        US9$ )a  
Decode a batch of images using a tiled decoder.

Args:
    z (`torch.Tensor`): Input batch of latent vectors.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.

Returns:
    [`~models.vae.DecoderOutput`] or `tuple`:
        If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
        returned.
r   Nr   r   rc   r   r\  )r   r   r5  r6  r8  r9  r   r0  r.  r   r   rr  rv  r   rl   r   )rE   rY  rP  r   r   r   r   r   sample_heightsample_widthr`  ra  r  r  r  r  r  r   r  r  r  ri  r  r  rc  s                            r-   r_  &AutoencoderKLHunyuanVideo.tiled_decode  sS    ?@gg;
*e!?!??===!%!<!<@^@^!^ $ : :d>\>\ \$($B$BdFdFd$d!#'#@#@DDbDb#b 22T5S5SS0043P3PP q&<AC1e>Aq!a*@&@"@!aJ_F_B__`++D1,,t,

7#	 ?
 KK = oFAsJ$S>4 q5<<QUAlKDq5<<AE
D+FD!!$q!Q0P$2P2P0PRqTXTqTqRq'q"rs * uyy<= & ii+Aq!^m^]l],RS6MC((r/   c           
         UR                   u  p#pEnUS-
  U R                  -  S-   nU R                  U R                  -  nU R                  U R                  -  n	X-
  n
/ n[	        SX@R                  5       H  nUS S 2S S 2XU R                  -   S-   2S S 2S S 24   nU R
                  (       a0  XPR                  :  d  X`R                  :  a  U R                  U5      nO"U R                  U5      nU R                  U5      nUS:  a  US S 2S S 2SS 2S S 2S S 24   nUR                  U5        M     / n[        U5       Hm  u  pUS:  a<  U R                  XS-
     X5      nUR                  US S 2S S 2S U	2S S 2S S 24   5        MG  UR                  US S 2S S 2S U	S-   2S S 2S S 24   5        Mo     [        R                  " USS9S S 2S S 2S U24   nU$ )Nr   r   r   rc   )r   r   r7  r:  r   r2  r5  r6  rL  r-  r/  r   r   rz  r   rl   )rE   r*   r   r   r   r   r   latent_num_framesrb  tile_latent_stride_num_framesblend_num_framesr  r   r  r  rM  s                   r-   rK  0AutoencoderKLHunyuanVideo._temporal_tiled_encode  s   >?gg;
*e'!^0O0OORSS%)%D%DHgHg%g"(,(J(JdNmNm(m%5Uq*&H&HIAQ14#B#BBQFF1LMDF-H-H$HETnTnLn((.||D)t,1uAq!"aN+JJt J 
 ~GA1u||CAJG!!$q!-K.K-KQPQ'Q"RS!!$q!-P/Lq/P-PRSUV'V"WX & ii
*1a1C2C1C+CD
r/   c           
      p   UR                   u  p4pVnUS-
  U R                  -  S-   nU R                  U R                  -  n	U R                  U R                  -  n
U R
                  U R                  -  nU R                  U R                  -  nU R
                  U R                  -
  n/ n[        SX\5       H  nUS S 2S S 2XU-   S-   2S S 2S S 24   nU R                  (       aA  UR                   S   U
:  d  UR                   S   U	:  a  U R                  USS9R                  nO"U R                  U5      nU R                  U5      nUS:  a  US S 2S S 2SS 2S S 2S S 24   nUR                  U5        M     / n[        U5       H  u  nnUS:  aG  U R                  XS-
     UU5      nUR                  US S 2S S 2S U R                  2S S 2S S 24   5        MS  UR                  US S 2S S 2S U R                  S-   2S S 2S S 24   5        M     [         R"                  " USS9S S 2S S 2S U24   nU(       d  U4$ [%        US	9$ )
Nr   r   r   rp  Tr[  r   rc   r\  )r   r   r5  r   r6  r7  r:  r   r2  r_  r]  r0  r.  r   r   rz  r   rl   r   )rE   rY  rP  r   r   r   r   r   num_sample_framesr`  ra  rb  r  r  r  r   r  ri  r  rc  s                       r-   r^  0AutoencoderKLHunyuanVideo._temporal_tiled_decode  s5   >?gg;
*e'!^t/N/NNQRR!%!<!<@^@^!^ $ : :d>\>\ \%)%D%DHgHg%g"(,(J(JdNmNm(m%::T=_=__q*DAQ1#==AA1aGHDDJJrN5J$JdjjY[n_uNu++Dd+CJJ++D1,,t,1u!!QAq.1JJw E 
 ~GAt1u||CAJ6FG!!$q!-Qt/Q/Q-QSTVW'W"XY!!$q!-Ut/Q/QTU/U-UWXZ['["\] & ii
*1a1C2C1C+CD6MC((r/   r]  sample_posterior	generatorc                     UnU R                  U5      R                  nU(       a  UR                  US9nOUR                  5       nU R	                  XsS9nU$ )a)  
Args:
    sample (`torch.Tensor`): Input sample.
    sample_posterior (`bool`, *optional*, defaults to `False`):
        Whether to sample from the posterior.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
)r  r[  )rW  rR  r]  rJ   rj  )	rE   r]  r  rP  r  r*   rV  rY  rc  s	            r-   rN   !AutoencoderKLHunyuanVideo.forward2  sS     KKN..	  9 5A Akk!k5
r/   )r.  r-  r0  r/  r   r   r5  r7  r6  r8  r:  r9  r  r4  r3  r1  r2  )r   r   r*  r  r$  r	  r   r  r   g>I?r   r   T)NNNNNN)r   N)T)FTN)(rQ   rR   rS   rT   r   _supports_gradient_checkpointingr   r@   r   rV   r$   rU   r>   r   r=  rA  rE  rH  r   rW   rN  r
   r   r   r   rW  r   rd  rj  rr  rv  rz  rL  r_  rK  r^  	GeneratorrN   rX   rY   rZ   s   @r-   r&  r&  t  s    (,$ !-
+
 *> !! ()**+(,1U0U0 U0 	U0
  S/U0 c3hU0" "#J#U0$ %U0& 'U0( )U0* +U0, $'-U0. %(/U00 "&1U02 
3U0 U0r 15/34859489=&q (&q  (}&q %-SM	&q
 $,E?&q #+5/&q (0&q 
&qP  ! %,,  37::,0:	"E*F$GG	H: :6) )D )E-Y^YeYeJeDf )( - -4 -5X]XdXdIdCe - -2 %,, c ell  %,, c ell  %,, c ell 0ell 0/B 0d8)ell 8) 8)}^c^j^jOjIk 8)t 9L >") ")4 ")SXYfhmhthtYtSu ")N "' /3  	
 EOO, 
}ell*	+ r/   r&  rx   )2typingr   r   r   numpyr   r   torch.nnrB   torch.nn.functional
functionalrK   torch.utils.checkpointconfiguration_utilsr   r   utilsr	   utils.accelerate_utilsr
   activationsr   attention_processorr   modeling_outputsr   modeling_utilsr   vaer   r   
get_loggerrQ   loggerr@   r   r   rW   r.   Moduler1   r\   rr   r|   r   r   r   r   r  r&   r/   r-   <module>r     s?   * )       B  8 ( + 2 ' < 
		H	% ei

#&
/4{{
DILL
^a

\\
(ryy (B'299 'TRYY (+bii +\^RYY ^B<bii <~<BII <~iBII iXhBII hVT
K Tr/   