
    +h*                       S SK Jr  S SKJrJrJrJrJr  S SKrS SK	J
r
  S SKJ
s  Jr  S SKrSSKJrJrJr  SSKJrJrJr  SSKJrJrJr  SSKJr  S	S
KJr  S	SKJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  S	SK*J+r+J,r,  S	SK-J.r.  S	SK/J0r0J1r1J2r2  S	SK3J4r4  S	SK5J6r6  SSK7J8r8  SSK9J:r:  \Rv                  " \<5      r=\ " S S\5      5       r> " S S\
R~                  5      r@ " S S\
R~                  5      rA " S S\
R~                  5      rB " S S\
R~                  5      rC " S S\
R~                  5      rD " S  S!\
R~                  5      rE " S" S#\
R~                  5      rF " S$ S%\.\\5      rG " S& S'\.\\\5      rHg)(    )	dataclass)AnyDictOptionalTupleUnionN   )ConfigMixin
FrozenDictregister_to_config)FromOriginalModelMixinPeftAdapterMixinUNet2DConditionLoadersMixin)
BaseOutput	deprecatelogging)apply_freeu   )BasicTransformerBlock)
ADDED_KV_ATTENTION_PROCESSORSCROSS_ATTENTION_PROCESSORS	AttentionAttentionProcessorAttnAddedKVProcessorAttnProcessorAttnProcessor2_0FusedAttnProcessor2_0IPAdapterAttnProcessorIPAdapterAttnProcessor2_0)TimestepEmbedding	Timesteps)
ModelMixin)Downsample2DResnetBlock2D
Upsample2D)DualTransformer2DModel)Transformer2DModel   )UNetMidBlock2DCrossAttn)UNet2DConditionModelc                   8    \ rS rSr% Sr\R                  \S'   Srg)UNetMotionOutput4   z
The output of [`UNetMotionOutput`].

Args:
    sample (`torch.Tensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
        The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
sample N)	__name__
__module____qualname____firstlineno____doc__torchTensor__annotations____static_attributes__r/       b/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/unets/unet_motion_model.pyr,   r,   4   s     LLr9   r,   c                    l  ^  \ rS rSrSr               SS\S\S\\   S\\   S\S\S	\S
\\   S\S\\   S\	S\S\S\\	   S\\   4U 4S jjjr
     SS\R                  S\\R                     S\\R                     S\\R                     S\S\\\	\4      S\R                  4S jjrSrU =r$ )AnimateDiffTransformer3DA   a  
A Transformer model for video-like data.

Parameters:
    num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
    attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
    in_channels (`int`, *optional*):
        The number of channels in the input and output (specify if the input is **continuous**).
    num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
    attention_bias (`bool`, *optional*):
        Configure if the `TransformerBlock` attention should contain a bias parameter.
    sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
        This is fixed during training since it is used to learn a number of position embeddings.
    activation_fn (`str`, *optional*, defaults to `"geglu"`):
        Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
        activation functions.
    norm_elementwise_affine (`bool`, *optional*):
        Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
    double_self_attention (`bool`, *optional*):
        Configure if each `TransformerBlock` should contain two self-attention layers.
    positional_embeddings: (`str`, *optional*):
        The type of positional embeddings to apply to the sequence input before passing use.
    num_positional_embeddings: (`int`, *optional*):
        The maximum length of the sequence over which to apply positional embeddings.
num_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizeactivation_fnnorm_elementwise_affinedouble_self_attentionpositional_embeddingsnum_positional_embeddingsc                   > [         TU ]  5         Xl        X l        X-  nX0l        [
        R                  " XsSSS9U l        [
        R                  " UU5      U l	        [
        R                  " [        U5       Vs/ s H  n[        UUUUUUU	UUUUS9PM     sn5      U l        [
        R                  " UU5      U l        g s  snf )Nư>T)
num_groupsnum_channelsepsaffine)rC   rE   rH   rF   rJ   rI   rK   rL   )super__init__r>   r?   r@   nn	GroupNormnormLinearproj_in
ModuleListranger   transformer_blocksproj_out)selfr>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   	inner_dim_	__class__s                     r:   rT   !AnimateDiffTransformer3D.__init__^   s    $ 	#6 "4'<	&LLO[_hlm	yyi8 #%-- z* +A &'&#(;"/#1*?,C*?.G +#
& 		)[9%s   :C hidden_statesencoder_hidden_statestimestepclass_labels
num_framescross_attention_kwargsreturnc           	         UR                   u  pxpXu-  nUnUSSS24   R                  XXU
5      nUR                  SSSSS5      nU R                  U5      nUR                  SSSSS5      R                  X-  U
-  XX5      nU R	                  US9nU R
                   H  nU" UUUUUS9nM     U R                  US9nUSSSS24   R                  XXU5      R                  SSSSS5      R                  5       nUR                  XxX5      nX-   nU$ )	a  
The [`AnimateDiffTransformer3D`] forward method.

Args:
    hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
        Input hidden_states.
    encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
        Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
        self-attention.
    timestep ( `torch.LongTensor`, *optional*):
        Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
    class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
        Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
        `AdaLayerZeroNorm`.
    num_frames (`int`, *optional*, defaults to 1):
        The number of frames to be processed per batch. This is used to reshape the hidden states.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

Returns:
    torch.Tensor:
        The output tensor.
Nr   r   r(   r	      )input)rc   rd   re   rh   rf   )shapereshapepermuterW   rY   r\   r]   
contiguous)r^   rc   rd   re   rf   rg   rh   batch_frameschannelheightwidth
batch_sizeresidualblockoutputs                  r:   forward AnimateDiffTransformer3D.forward   s=   F 0=/B/B,v!/
 %dAg.66zw`ef%--aAq!<		-0%--aAq!<DDZEX[`E`blv=9 ,,E!+&;!'=)M - M:$a-(WZGDWQ1a#Z\	 	 &--lVS)r9   )r?   r@   rW   r>   rY   r]   r\   )   X   NNr(               NFNgegluTTNN)NNNr(   N)r0   r1   r2   r3   r4   intr   floatboolstrrT   r5   r6   
LongTensorr   r   ry   r8   __classcell__ra   s   @r:   r<   r<   A   s   < $&"$%)&*!-1$%)$(,&*/337!0: 0:  0: c]	0:
 sm0: 0: 0: 0: &c]0: 0: c]0: 0: "&0:  $0:  (}0:  $,C=!0: 0:j =A/337;?E||E  ((8(89E 5++,	E
 u//0E E !)c3h 8E 
E Er9   r<   c            &       H  ^  \ rS rSr               SS\S\S\S\S\S\S\S	\S
\S\S\S\S\S\\\	\   4   S\
\   S\S\\\	\   4   S\4$U 4S jjjr  SS\R                  S\
\R                     S\S\\R                  \	\R                  S4   4   4S jjrSrU =r$ )DownBlockMotion   r@   rA   temb_channelsrC   rB   
resnet_epsresnet_time_scale_shiftresnet_act_fnresnet_groupsresnet_pre_normoutput_scale_factoradd_downsampledownsample_paddingtemporal_num_attention_headstemporal_cross_attention_dimtemporal_max_seq_length%temporal_transformer_layers_per_blocktemporal_double_self_attentionc                   > [         TU ]  5         / n/ n[        U[        5      (       a  U4U-  nO[	        U5      U:w  a  [        SU 35      e[        U[        5      (       a  U4U-  nO[	        U5      U:w  a  [        SU 35      e[        U5       H[  nUS:X  a  UOUnUR                  [        UUUUU	UUUUU
S9
5        UR                  [        UU   UUU   U	USSSUX.U   -  US95        M]     [        R                  " U5      U l        [        R                  " U5      U l        U(       a(  [        R                  " [        US	UUS
S9/5      U l        OS U l        SU l        g )Nz\`temporal_transformer_layers_per_block` must be an integer or a tuple of integers of length zS`temporal_num_attention_heads` must be an integer or a tuple of integers of length r   
r@   rA   r   rQ   groupsrC   time_embedding_normnon_linearityr   pre_normFr   
sinusoidalr>   r@   rB   rD   rE   rF   rH   rK   rL   r?   rJ   Topuse_convrA   paddingname)rS   rT   
isinstancer   len
ValueErrorr[   appendr$   r<   rU   rZ   resnetsmotion_modulesr#   downsamplersgradient_checkpointing)r^   r@   rA   r   rC   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ira   s                         r:   rT   DownBlockMotion.__init__   s   * 	 ;SAA5Z4\_i4i167:Enoynz{ 
 2C88,H+JZ+W(-.*<efpeqr  z"A)*a+\KNN +!-"/"(#(?"/(;, !!((DQ(G ,DQG$1(D#(")*6.E'3TU7V'V*H! #@ }}W- mmN; " $!%%1 2!
!D !%D&+#r9   rc   tembrg   ri   .c                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        Sn[        U R                  U R
                  5      nU HT  u  p[        R                  " 5       (       a$  U R                  (       a  U R                  XU5      nOU	" XS9nU
" XS9nXq4-   nMV     U R                  b  U R                   H	  nU" US9nM     Xq4-   nX4$ )	Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0r/   input_tensorr   rg   rc   )r   getr   zipr   r   r5   is_grad_enabledr   _gradient_checkpointing_funcr   )r^   rc   r   rg   argskwargsdeprecation_messageoutput_statesblocksresnetmotion_moduledownsamplers               r:   ry   DownBlockMotion.forward6  s     t9q=FJJw5A #Ugw(;<T\\4#6#67%+!F$$&&4+F+F $ A A&Y] ^ &M M)-OM),<<M &, (#00 +- H  1 *,<<M++r9   )r   r   r   r   )r}   r(   rN   defaultswishr~   T      ?Tr(   r(   Nr~   r(   T)Nr(   )r0   r1   r2   r3   r   r   r   r   r   r   r   rT   r5   r6   ry   r8   r   r   s   @r:   r   r      s     '0$ $%(#"#?@6:')HI/3'[,[, [, 	[,
 [, [, [, "%[, [, [, [, #[, [,  [, ',CsO&<[,  '/sm![," "%#[,$ 05S%*_/E%[,& )-'[, [,@ (,	,||, u||$, 	, 
u||U5<<#455	6, ,r9   r   c            6         ^  \ rS rSr                       S'S\S\S\S\S\S\\\\   4   S\S	\S
\S\S\	S\S\S\S\S\	S\	S\	S\	S\	S\S\
\   S\S\S\\\\   4   S\	44U 4S jjjr       S(S\R                  S\
\R                     S\
\R                     S \
\R                     S!\S"\
\R                     S#\
\\\4      S$\
\R                     4S% jjrS&rU =r$ ))CrossAttnDownBlockMotioniX  r@   rA   r   rC   rB   transformer_layers_per_blockr   r   r   r   r   r>   rE   r   r   r   dual_cross_attentionuse_linear_projectiononly_cross_attentionupcast_attentionattention_typer   r   r   r   r   c                   > [         TU ]  5         / n/ n/ nSU l        Xl        [	        U[
        5      (       a  U4U-  nO[        U5      U:w  a  [        SU 35      e[	        U[
        5      (       a  U4U-  nO[        U5      U:w  a  [        SU 35      e[        U5       H  nUS:X  a  UOUnUR                  [        UUUUU
UUU	UUS9
5        U(       d'  UR                  [        UX,-  UUU   UU
UUUUS9
5        OUR                  [        UX,-  USUU
S95        UR                  [        UUUU   U
US	S
SUUU-  US95        M     [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        U(       a(  [        R                  " ['        USUUSS9/5      U l        OS U l        S	U l        g )NTPtransformer_layers_per_block must be an integer or a list of integers of length Ytemporal_transformer_layers_per_block must be an integer or a list of integers of length r   r   r@   rB   rE   rD   r   r   r   r   r(   r@   rB   rE   rD   Fr   r   r   r   r   )rS   rT   has_cross_attentionr>   r   r   r   r   r[   r   r$   r'   r&   r<   rU   rZ   
attentionsr   r   r#   r   r   ) r^   r@   rA   r   rC   rB   r   r   r   r   r   r   r>   rE   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   s                                   r:   rT   !CrossAttnDownBlockMotion.__init__Y  s1   : 	
#' #6  2C88,H+JZ+W(-.*<bcmbno 
 ;SAA5Z4\_i4i167:Eklvkwx  z"A)*a+\KNN +!-"/"(#(?"/(;, (!!&+$;$0#?#B,?(5.C-A)9'5 !!*+$;$0#$,?(5	 !!((D ,DQG$1(D#(")*6.E'37S'S*HY #x --
3}}W- mmN; " $!%%1 2!
!D !%D&+#r9   rc   r   rd   attention_maskrg   encoder_attention_maskrh   additional_residualsc	           
      <   Ub(  UR                  SS 5      b  [        R                  S5        Sn	[        [	        U R
                  U R                  U R                  5      5      n
[        U
5       H  u  nu  pn[        R                  " 5       (       a$  U R                  (       a  U R                  XU5      nOU" XS9nU" UUUUUSS9S   nU" XS9nU[        U
5      S	-
  :X  a  Ub  X-   nX4-   n	M     U R                  b  U R                   H	  nU" US
9nM     X4-   n	X4$ )Nr   SPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r/   r   Frc   rd   rh   r   r   return_dictr   r   r(   r   )r   loggerwarninglistr   r   r   r   	enumerater5   r   r   r   r   r   )r^   rc   r   rd   r   rg   r   rh   r   r   r   r   r   attnr   r   s                   r:   ry    CrossAttnDownBlockMotion.forward  s9    "-%))'48Dtuc$,,9L9LMN09&0A,A,m$$&&4+F+F $ A A&Y] ^ &M M +&;'=-'=! M *-OM CK!O#(<(H - D),<<M+ 1B. (#00 +- H  1 *,<<M++r9   )r   r   r   r   r   r>   r   )r}   r(   r(   rN   r   r   r~   Tr(      r   r(   TFFFFr   N   r~   r(   T)NNNr(   NNNr0   r1   r2   r3   r   r   r   r   r   r   r   rT   r5   r6   r   r   ry   r8   r   r   s   @r:   r   r   X  s>    ?@ '0$ $#$#'%("##%*&+%*!&'6:,-')HI/37D,D, D, 	D,
 D, D, ',CsO&<D, D, "%D, D, D, D, !D, !D, #D,   !D," #D,$ #%D,&  $'D,( #)D,* +D,, -D,. '/sm/D,0 '*1D,2 "%3D,4 05S%*_/E5D,6 )-7D, D,R (,8<159=;?7;/,||/, u||$/,  (5	/,
 !./, /, !) 6/, !)c3h 8/, 'u||4/, /,r9   r   c            8         ^  \ rS rSr                      S*S\S\S\S\S\\   S\S\S	\\\\   4   S
\S\	S\	S\S\
S\S\S\S\
S\
S\
S\
S\
S\	S\\   S\S\S\\\\   4   44U 4S jjjr       S+S\R                  S\\R                  S4   S \\R                     S!\\R                     S"\\\	\4      S#\\   S$\\R                     S%\\R                     S&\S'\R                  4S( jjrS)rU =r$ ),CrossAttnUpBlockMotioni  r@   rA   prev_output_channelr   resolution_idxrC   rB   r   r   r   r   r   r   r>   rE   r   add_upsampler   r   r   r   r   r   r   r   r   c                   > [         T!U ]  5         / n/ n/ nSU l        Xl        [	        U[
        5      (       a  U4U-  nO)[        U5      U:w  a  [        SU S[        U5       35      e[	        U[
        5      (       a  U4U-  nO)[        U5      U:w  a  [        SU S[        U5       35      e[        U5       H  nUUS-
  :X  a  UOUnUS:X  a  UOUn UR                  [        U U-   UUU	UUU
UUUS9
5        U(       d'  UR                  [        UX.-  UUU   UUUUUUS9
5        OUR                  [        UX.-  USUUS	95        UR                  [        UUUU   UUS
SSUUU-  S9
5        M     [        R                  " U5      U l        [        R                  " U5      U l        [        R                  " U5      U l        U(       a&  [        R                  " ['        USUS9/5      U l        OS U l        S
U l        XPl        g )NTr   z, got r   r(   r   r   r   r   Fr   r   
r>   r@   rB   rD   rE   rF   rH   rK   rL   r?   r   rA   )rS   rT   r   r>   r   r   r   r   r[   r   r$   r'   r&   r<   rU   rZ   r   r   r   r%   
upsamplersr   r   )"r^   r@   rA   r   r   r   rC   rB   r   r   r   r   r   r   r>   rE   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   res_skip_channelsresnet_in_channelsra   s"                                    r:   rT   CrossAttnUpBlockMotion.__init__  s|   : 	
#' #6  2C88,H+JZ+W(-.*<bcmbnntux  zV  vW  uX  Y 
 ;SAA5Z4\_i4i167:Eklvkww}  B  Ch  i  ~j  k  z"A01Z!^0C,89Q!4LNN 25F F!-"/"(#(?"/(;, (!!&+$;$0#?#B,?(5.C-A)9'5 !!*+$;$0#$,?(5	 !!((D ,DQG$1(D#(")*6.E'37S'S[ #x --
3}}W- mmN; mmZtbn-o,pqDO"DO&+#,r9   rc   res_hidden_states_tuple.r   rd   rh   upsample_sizer   r   rg   ri   c
                 ,   Ub(  UR                  SS 5      b  [        R                  S5        [        U SS 5      =(       a5    [        U SS 5      =(       a!    [        U SS 5      =(       a    [        U SS 5      n
[	        U R
                  U R                  U R                  5      nU H  u  pnUS   nUS S nU
(       aC  [        U R                  UUU R                  U R                  U R                  U R                  S9u  p[        R                  " X/S	S
9n[        R                   " 5       (       a$  U R"                  (       a  U R%                  XU5      nOU" XS9nU" UUUUUSS9S   nU" XS9nM     U R&                  b  U R&                   H	  nU" XS9nM     U$ )Nr   r   s1s2b1b2r   r   r   r   r(   dimr   Fr   r   r   rc   output_size)r   r   r   getattrr   r   r   r   r   r   r   r   r   r   r5   catr   r   r   r   )r^   rc   r   r   rd   rh   r   r   r   rg   is_freeu_enabledr   r   r   r   res_hidden_states	upsamplers                    r:   ry   CrossAttnUpBlockMotion.forward  s    "-%))'48Dtu D$% *dD)*dD)* dD)	 	 T\\4??D4G4GH+1'F- 7 ;&=cr&B#  3>''!%wwwwwwww40 "II}&HaPM$$&&4+F+F $ A A&Y] ^ &M M +&;'=-'=! M *-OMC ,2F ??&!__	 ) a - r9   )r   r   r   r   r>   r   r   r   )Nr}   r(   r(   rN   r   r   r~   Tr(   r   r   TFFFFr   Nr   r~   r(   )NNNNNNr(   )r0   r1   r2   r3   r   r   r   r   r   r   r   rT   r5   r6   r   r   ry   r8   r   r   s   @r:   r   r     s`    )-?@ '0$ $#$#'%(!%*&+%*!&'6:,-')HI7{-{- {- !	{-
 {- !{- {- {- ',CsO&<{- {- "%{- {- {- {- !{-  !!{-" ##{-$ %{-& #'{-(  $){-* #+{-, -{-. /{-0 '/sm1{-2 '*3{-4 "%5{-6 05S%*_/E7{- {-B (,8<;?'+159=?||? "'u||S'8!9? u||$	?
  (5? !)c3h 8?  }? !.? !) 6? ? 
? ?r9   r   c            (       8  ^  \ rS rSr              SS\S\S\S\S\\   S\S\S	\S
\S\S\S\S\S\S\\   S\S\S\	\\
\   4   4$U 4S jjjr   SS\R                  S\
\R                  S4   S\\R                     S\S\R                  4
S jjrSrU =r$ )UpBlockMotioni  r@   r   rA   r   r   rC   rB   r   r   r   r   r   r   r   r   r   r   r   c                 r  > [         TU ]  5         / n/ n[        U[        5      (       a  U4U-  nO[	        U5      U:w  a  [        SU 35      e[        U5       He  nUUS-
  :X  a  UOUnUS:X  a  UOUnUR                  [        UU-   UUUUUU	U
UUS9
5        UR                  [        UUUU   UUSSSUUU-  S9
5        Mg     [        R                  " U5      U l        [        R                  " U5      U l        U(       a&  [        R                  " [        US	US
9/5      U l        OS U l        SU l        XPl        g )Nr   r(   r   r   Fr   r   r   Tr   )rS   rT   r   r   r   r   r[   r   r$   r<   rU   rZ   r   r   r%   r   r   r   )r^   r@   r   rA   r   r   rC   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   s                           r:   rT   UpBlockMotion.__init__  se   * 	 ;SAA5Z4\_i4i167:Eklvkwx  z"A01Z!^0C,89Q!4LNN 25F F!-"/"(#(?"/(;, !!((D ,DQG$1(D#(")*6.E'37S'S' #D }}W- mmN; mmZtbn-o,pqDO"DO&+#,r9   rc   r   .r   rg   ri   c                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        [        U SS 5      =(       a5    [        U SS 5      =(       a!    [        U SS 5      =(       a    [        U SS 5      n	[	        U R
                  U R                  5      n
U
 H  u  pUS	   nUS S	 nU	(       aC  [        U R                  UUU R                  U R                  U R                  U R                  S
9u  p[        R                  " X/SS9n[        R                  " 5       (       a$  U R                   (       a  U R#                  XU5      nOU" XS9nU" XS9nM     U R$                  b  U R$                   H	  nU" XS9nM     U$ )Nr   r   r   r   r   r   r   r   r   r   r(   r   r   r   r   )r   r   r   r  r   r   r   r   r   r   r   r   r   r5   r  r   r   r   r   )r^   rc   r   r   r   rg   r   r   r   r  r   r   r   r  r  s                  r:   ry   UpBlockMotion.forward   sv    t9q=FJJw5A #Ugw(;< D$% *dD)*dD)* dD)	 	 T\\4#6#67%+!F 7 ;&=cr&B#  3>''!%wwwwwwww40 "II}&HaPM$$&&4+F+F $ A A&Y] ^ &M M)-OM1 &,4 ??&!__	 ) a - r9   )r   r   r   r   r   )Nr}   r(   rN   r   r   r~   Tr   TNr   r~   r(   )NNr(   )r0   r1   r2   r3   r   r   r   r   r   r   r   rT   r5   r6   ry   r8   r   r   s   @r:   r  r    sy    )- '0$ $%(!6:,-')HI'L-L- !L- 	L-
 L- !L- L- L- L- "%L- L- L- L- #L- L-  '/sm!L-" '*#L-$ "%%L-& 05S%*_/E'L- L-d (,5||5 "'u||S'8!95 u||$	5 5 
5 5r9   r  c            .         ^  \ rS rSr                   S"S\S\S\S\S\\\\   4   S\S\S	\S
\S\	S\S\S\S\	S\	S\	S\S\S\
\   S\S\\\\   4   4*U 4S jjjr      S#S\R                  S\
\R                     S\
\R                     S\
\R                     S\
\\\4      S\
\R                     S\S\R                  4S  jjrS!rU =r$ )$UNetMidBlockCrossAttnMotioniX  r@   r   rC   rB   r   r   r   r   r   r   r>   r   rE   r   r   r   r   r   r   r   r   c                 v  > [         TU ]  5         SU l        Xl        U	b  U	O[	        US-  S5      n	[        U[        5      (       a  U4U-  nO[        U5      U:w  a  [        SU S35      e[        U[        5      (       a  U4U-  nO[        U5      U:w  a  [        SU S35      e[        UUUUU	UUUUU
S9
/n/ n/ n[        U5       H  nU(       d&  UR                  [        UX-  UUU   UU	UUUS9	5        OUR                  [        UX-  US	UU	S
95        UR                  [        UUUUU	UUUUU
S9
5        UR                  [        UUU-  UUU   U	USSUSS9
5        M     [        R                   " U5      U l        [        R                   " U5      U l        [        R                   " U5      U l        SU l        g )NTrk   r~   zT`transformer_layers_per_block` should be an integer or a list of integers of length .z]`temporal_transformer_layers_per_block` should be an integer or a list of integers of length r   )r@   rB   rE   rD   r   r   r   r(   r   Fr   r   )
r>   r?   r@   rB   rD   rE   rF   rK   rL   rH   )rS   rT   r   r>   minr   r   r   r   r$   r[   r   r'   r&   r<   rU   rZ   r   r   r   r   )r^   r@   r   rC   rB   r   r   r   r   r   r   r>   r   rE   r   r   r   r   r   r   r   r   r   r   r   r   ra   s                             r:   rT   $UNetMidBlockCrossAttnMotion.__init__Y  s.   0 	#' #6 )6)BK[\L\^`Ha 2C88,H+JZ+W(-.*<fgqfrrst 
 ;SAA5Z4\_i4i167:Eopzo{{|}  '(+$$;+$7(
 
z"A'!!&+#:$/#?#B,?(5.C)9'5
 !!*+#:$/#$,?(5	 NN +!,"/"(#(?"/(;, !!((D'26R'R +DQG$1(D#(*6.E")Q #n --
3}}W- mmN;&+#r9   rc   r   rd   r   rh   r   rg   ri   c           
         Ub(  UR                  SS 5      b  [        R                  S5        U R                  S   " XS9n[	        U R
                  U R                  SS  U R                  5      nU H|  u  pnU	" UUUUUSS9S   n[        R                  " 5       (       a;  U R                  (       a*  U R                  XS S S US 5      nU R                  XU5      nMi  U" US S S US 5      nU
" XS9nM~     U$ )Nr   r   r   r   r(   Fr   )r   r   r   r   r   r   r   r5   r   r   r   )r^   rc   r   rd   r   rh   r   rg   r   r   r   r   s               r:   ry   #UNetMidBlockCrossAttnMotion.forward  s    "-%))'48DtuQ]NT__dll12&68K8KL+1'D- +&;'=-'=! M $$&&4+F+F $ A A!$dJPT! !% A A&Y] ^ -mT4z[_ ` &M M# ,2& r9   )r   r   r   r   r>   r   )r}   r(   r(   rN   r   r   r~   Tr(   r   r   FFFr   r(   Nr~   r(   )NNNNNr(   r   r   s   @r:   r  r  X  s   
 ?@ '0$ $#$%(#'%*&+!&',-6:')HI-{,{, {, 	{,
 {, ',CsO&<{, {, "%{, {, {, {, !{, #{, !{, #{,   $!{," #{,$ %{,& '*'{,( '/sm){,* "%+{,, 05S%*_/E-{, {,@ (,8<15;?9=$||$ u||$$  (5	$
 !.$ !)c3h 8$ !) 6$ $ 
$ $r9   r  c                      ^  \ rS rSr        SS\S\S\\\\   4   S\\\\   4   S\S\\   S\	S	\S
\4U 4S jjjr
SrU =r$ )MotionModulesi  r@   layers_per_blockr   r>   rF   rE   rH   rD   max_seq_lengthc
                 d  > [         TU ]  5         [        R                  " / 5      U l        [        U[        5      (       a  U4U-  nO)[        U5      U:w  a  [        SU S[        U5       35      e[        U5       H2  n
U R                  R                  [        UX:   UUUUUX-  SU	S9
5        M4     g )NzZThe number of transformer layers per block must match the number of layers per block, got  and r   )
r@   rB   rD   rE   rH   rF   r>   r?   rK   rL   )rS   rT   rU   rZ   r   r   r   r   r   r[   r   r<   )r^   r@   r  r   r>   rF   rE   rH   rD   r  r   ra   s              r:   rT   MotionModules.__init__  s     	 mmB/2C88,H+JM]+](-.2BB'(c2N.O-PR 
 '(A&&( +;>$3(;"/#1(;'2'I*6.< )r9   )r   )r   r   r   FNr   r~   r~   )r0   r1   r2   r3   r   r   r   r   r   r   rT   r8   r   r   s   @r:   r  r    s     !"?@67$-1$! %% % ',CsO&<	%
 #3c
?3% % &c]% % % % %r9   r  c                      ^  \ rS rSr\          SS\\S4   S\\\\   4   S\\\\   \\\      4   S\S\\\\   4   S\\\\   4   S	\S
\S\S\	\   4U 4S jjj5       r
S rSrU =r$ )MotionAdapteri&  block_out_channels.motion_layers_per_block#motion_transformer_layers_per_block!motion_mid_block_layers_per_block'motion_transformer_layers_per_mid_blockmotion_num_attention_headsmotion_norm_num_groupsmotion_max_seq_lengthuse_motion_mid_blockconv_in_channelsc                 J  > [         TU ]  5         / n/ n[        U[        5      (       a  U4[	        U5      -  nO;[	        U5      [	        U5      :w  a#  [        S[	        U5       S[	        U5       35      e[        U[        5      (       a  U4[	        U5      -  n[        U[        5      (       a  U4U-  nO*[	        U5      U:w  a  [        SU S[	        U5       S35      e[        U[        5      (       a  U4[	        U5      -  nO;[	        U5      [	        U5      :w  a#  [        S[	        U5       S[	        U5       35      eU
(       a  [        R                  " XS   SS	S
9U l        OSU l        [        U5       H/  u  pX   nUR                  [        UUSSSXm   UX-   X=   S9	5        M1     U	(       a  [        US   USSSUS   UUUS9	U l        OSU l        [        [        U5      5      nUS   n[        [        U5      5      n[        [        U5      5      n[        [        U5      5      n[        U5       H6  u  pUU   nUR                  [        UUSSSUU   UUU   S	-   UU   S9	5        M8     [        R                  " U5      U l        [        R                  " U5      U l        g)a  Container to store AnimateDiff Motion Modules

Args:
    block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
    The tuple of output channels for each UNet block.
    motion_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 2):
        The number of motion layers per UNet block.
    motion_transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple[int]]`, *optional*, defaults to 1):
        The number of transformer layers to use in each motion layer in each block.
    motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
        The number of motion layers in the middle UNet block.
    motion_transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
        The number of transformer layers to use in each motion layer in the middle block.
    motion_num_attention_heads (`int` or `Tuple[int]`, *optional*, defaults to 8):
        The number of heads to use in each attention layer of the motion module.
    motion_norm_num_groups (`int`, *optional*, defaults to 32):
        The number of groups to use in each group normalization layer of the motion module.
    motion_max_seq_length (`int`, *optional*, defaults to 32):
        The maximum sequence length to use in the motion module.
    use_motion_mid_block (`bool`, *optional*, defaults to True):
        Whether to use a motion module in the middle of the UNet.
zKThe number of motion layers per block must match the number of blocks, got r  z$The number of layers per mid block (zD) must match the length of motion_transformer_layers_per_mid_block ()zgThe length of the attention head number tuple in the motion module must match the number of block, got r   r	   r(   kernel_sizer   Nr   F)	r@   rD   rE   rH   rF   r>   r  r  r   r   )rS   rT   r   r   r   r   rU   Conv2dconv_inr   r   r  	mid_blockr   reversedrZ   down_blocks	up_blocks)r^   r  r  r   r!  r"  r#  r$  r%  r&  r'  r0  r1  r   rr   output_channelreversed_block_out_channels reversed_motion_layers_per_block,reversed_motion_transformer_layers_per_block#reversed_motion_num_attention_headsra   s                       r:   rT   MotionAdapter.__init__'  s   J 		-s33'>&@3GYCZ&Z#()S1C-DD-./uS9P5Q4RT 
 93??3V2X[^_q[r2r/=sCC771723 89=^^67X6Y ZUUX  ZA  VB  UC  CDE 
 0#66*D)FM_I`)`&+,4F0GG((+,F(G'HcRdNeMfh 
 99%5!7LZ[efgDLDL#$67JA/2N .$:(,")#((B(E#8%<%?1T1W
 8   *.r2 6$(%$$>r$B4!B-T
DN "DN&*84F+G&H#4Q7+/9P0Q+R(7;HEh<i7j4.28<V3W.X+#$?@JA8;N .$:(,")#((KA(N#8%Ea%H1%L1]^_1`
 A  ==5y1r9   c                     g Nr/   )r^   r.   s     r:   ry   MotionAdapter.forward  s    r9   )r-  r0  r.  r1  )
i@  i  r   r   r   r(   r(   r(   r   r~   r~   TN)r0   r1   r2   r3   r   r   r   r   r   r   rT   ry   r8   r   r   s   @r:   r  r  &  s     /E:;YZ12JK=>&(%'%)*.A2!#s(OA2 "'sE#J!7A2 .33c
E%PS*DU3U-V	A2
 ,/A2 27sE#J1GA2 %*#uSz/$:A2 !$A2  #A2 #A2 #3-A2 A2F r9   r  c            C         ^  \ rS rSrSrSrS/r\                                SQS\\	   S\	S\	S	\
\S
4   S\
\S
4   S\
\	S
4   S\\	\
\	   4   S\	S\S\S\	S\S\	S\\	\
\	   \
\
   4   S\\\	\
\	   \
\
   4      S\\	\
\	   \
\
   4   S\\\	\
\	   \
\
   4      S\\\	\
\	   4      S\\\	\
\	   4      S\S\\	\
\	S
4   4   S\	S\\	\
\	S
4   4   S\\\	\
\	S
4   \
\
\	S
4   S
4   4      S\S \	S!\\	   S"\\   S#\\   S$\\	   S%\\	   S&\\	   4@U 4S' jjj5       r\  SRS(\S)\\   S*\4S+ jj5       rSSS- jrS)\\   S,S4S. jr    STS/\S0\S1\S2\\   S3\S,S4S4 jjr\S,\\\4   4S5 j5       rS6\\\\\4   4   4S7 jrSUS8\\	   S9\	S,S4S: jjrSSS; jrSSS< jrS=\S>\S?\S@\S,S4
SA jrSSSB jr SC r!SD r"       SVSE\#RH                  SF\\#RH                  \\	4   SG\#RH                  SH\\#RH                     SI\\#RH                     SJ\\\\%4      SK\\\\#RH                  4      SL\\
\#RH                        SM\\#RH                     SN\S,\\&\
\#RH                     4   4SO jjr'SPr(U =r)$ )WUNetMotionModeli  a)  
A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
sample shaped output.

This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
for all models (such as downloading or saving).
TrW   NrG   r@   rA   down_block_types.up_block_typesr  r  r   mid_block_scale_factoract_fnrD   norm_epsrE   r   $reverse_transformer_layers_per_blockr   -reverse_temporal_transformer_layers_per_block transformer_layers_per_mid_block)temporal_transformer_layers_per_mid_blockr   r>   r%  r#  "reverse_motion_num_attention_headsr&  mid_block_layersencoder_hid_dimencoder_hid_dim_typeaddition_embed_typeaddition_time_embed_dim%projection_class_embeddings_input_dimtime_cond_proj_dimc!                    > [         T7U ]  5         Xl        [        U5      [        U5      :w  a  [	        SU SU S35      e[        U5      [        U5      :w  a  [	        SU SU S35      e[        U[        5      (       d*  [        U5      [        U5      :w  a  [	        SU SU S35      e[        U[        5      (       a*  [        U5      [        U5      :w  a  [	        SU SU S35      e[        U[        5      (       d*  [        U5      [        U5      :w  a  [	        SU SU S35      e[        U[        5      (       a,  Uc)  U H#  n![        U![        5      (       d  M  [	        S	5      e   [        U[        5      (       a,  Uc)  U H#  n![        U![        5      (       d  M  [	        S
5      e   Sn"Sn#U"S-
  S-  n$[        R                  " X&S   U"U$S9U l
        US   S-  n%[        US   SS5      U l        US   n&[        U&U%U
U S9U l        Uc  S U l        US:X  a#  [        USS5      U l        [        UU%5      U l        [        R$                  " / 5      U l        [        R$                  " / 5      U l        [        U[        5      (       a  U4[        U5      -  n[        U[        5      (       a  U4[        U5      -  n[        U[        5      (       a  U/[        U5      -  n[        U[        5      (       a  U/[        U5      -  n[        U[        5      (       a  U/[        U5      -  n[        U[        5      (       a  U/[        U5      -  n[        U[        5      (       a  U/[        U5      -  n[        U[        5      (       a  U4[        U5      -  nUS   n'[+        U5       H  u  n(n)U'n*UU(   n'U([        U5      S-
  :H  n+U)S:X  aQ  [-        S40 SU*_SU'_SU%_SUU(   _SUU(   _SU_SU
_SU_SUU(   _SUU(   _SU_S U+(       + _S!U_S"UU(   _S#U_S$UU(   _6n,O4U)S%:X  a#  [/        U*U'U%UU(   UU
UU+(       + UUU(   UUU(   S&9n,O[	        S'5      eU R&                  R1                  U,5        M     Uc  [        US(   [        5      (       a  US(   OSnU(       a)  [3        US(   U%UU
U	US(   US(   US)UUUS(   UUUS*9U l        O"[7        US(   U%UU
U	US(   US(   US)UUUS+9U l        SU l        [        [;        U5      5      n-[        [;        U5      5      n.[        [;        U5      5      n/[        [;        U5      5      n0[        [;        U5      5      n1Uc  [        [;        U5      5      nUc  [        [;        U5      5      nU-S   n'[+        U5       GH  u  n(n2U([        U5      S-
  :H  n+U'n3U-U(   n'U-[=        U(S-   [        U5      S-
  5         n*U+(       d  Sn4U =R8                  S-  sl        OS)n4U2S,:X  aR  [?        S40 SU*_SU'_S-U3_SU%_S.U(_SU/U(   S-   _SUU(   _SU_SU
_SU_SU.U(   _SU0U(   _S/U4_S!U_S"U1U(   _S#U_S$UU(   _6n5O3U2S0:X  a"  [A        U*U3U'U%U(U/U(   S-   UU
UU4U1U(   UUU(   S19n5O[	        S25      eU R(                  R1                  U55        U'n3GM	     Ub8  [        RB                  " US   XS39U l"        [        RF                  " 5       U l$        OS U l"        S U l$        U#S-
  S-  n6[        R                  " US   UU#U6S9U l%        g )5Nz\Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: z. `up_block_types`: r  zbMust provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: z. `down_block_types`: zdMust provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: zdMust provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: z^Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: zOMust provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.ziMust provide 'reverse_temporal_transformer_layers_per_block` if using asymmetrical motion module in UNet.r	   r(   r   r   r*  rk   T)rA  cond_proj_dim	text_timer   r@   rA   r   rB   r   r   r   r   r>   rE   r   r   r   r   r   r   r   )r@   rA   r   rB   r   r   r   r   r   r   r   r   zeInvalid `down_block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`r   F)r@   r   r   r   r   rE   r>   r   r   r   rB   r   r   r   r   )r@   r   r   r   r   rE   r>   r   r   r   rB   r   r   r   r   r   r  )r@   r   rA   r   r   rB   r   r   r   r   r   r   r   z_Invalid `up_block_type` encountered. Must be one of `CrossAttnUpBlockMotion` or `UpBlockMotion`)rP   rO   rQ   r/   )&rS   rT   rG   r   r   r   r   r   rU   r,  r-  r!   	time_projr    time_embeddingencoder_hid_projadd_time_projadd_embeddingrZ   r0  r1  r   r   r   r   r  r.  r)   num_upsamplersr/  r  r   r  rV   conv_norm_outSiLUconv_actconv_out)8r^   rG   r@   rA   r>  r?  r  r  r   r@  rA  rD   rB  rE   r   rC  r   rD  rE  rF  r   r>   r%  r#  rG  r&  rH  rI  rJ  rK  rL  rM  rN  layer_number_per_blockconv_in_kernelconv_out_kernelconv_in_paddingtime_embed_dimtimestep_input_dimr2  r   down_block_typeinput_channelis_final_block
down_blockr3  reversed_num_attention_headsreversed_layers_per_blockreversed_cross_attention_dimr6  up_block_typer   r   up_blockconv_out_paddingra   s8                                                          r:   rT   UNetMotionModel.__init__  s	   \ 	&  C$77no  oA  AU  Vd  Ue  ef  g  !"c*:&;;t  vH  uI  I_  `p  _q  qr  s  -s33<O8PTWXhTi8iv  xK  wL  Lb  cs  bt  tu  v  )400S9L5MQTUeQf5fv  xK  wL  Lb  cs  bt  tu  v  *C00S9I5JcRbNc5cp  rB  qC  CY  Zj  Yk  kl  m  2D99>b>j*F&4d;;$%vww +G
 <dCC=E*O&4d;;$ D  +P )A-!3yyA.NTc

 ,A.2"#5a#8$B/2/vM_
  '$(D!+-!*+BD!!LD!23XZh!iD ==,r*)3//#6"83?O;P"P)3//#6"83?O;P"P&,, 01C8H4II2C88,H+ICP`La+a(:C@@4X3Y\_`p\q3q0;SAA5Z4[^abr^s4s1CSII=j<knq o =9 0#66*D)FM]I^)^& ,A."+,<"=A*M/2N#&8"9A"==N"<<5  -!/ #1  02	
 2Na1P  ( #) #2 )<A(> )<A(> (: (6#5 +@ 2LA1N -B  ;``a:b!
$ !$55, -!/"0/2'"("1'5#5'91KA1N,A:_`a:b
 !{  ##J/Y #>^ ,34>?[\^?_ad4e4e,R0kl -  8.r2,#$$:$7$;$7$;-%*&;+-G-K(=-M6_DN& 5.r2,#$$:$7$;$7$;-%*&;+-MDN    '+84F+G&H#'+H5H,I'J$$(2B)C$D!'+H5H,I'J$.28<V3W.X+/737A]8^3_08@<@JoAp<q94Q7 ). 9A}#&8"9A"==N"08;N7AE3GYCZ]^C^8_`M "###q(#$ 881  -!/ )< #1	
 $%  9;a? 2VVW1X  ( #) #2 )EQ(G )EQ(G ". +@ 2UUV1W  -B!" ;hhi:j#& /1( -(;!/"0#$8;a?'"("1!-1TUV1W,A:ghi:j  !u  NN!!(+"0q !:v &!#/2"D GGIDM!%D DM+a/A5		q!<_Vf
r9   unetmotion_adapterload_weightsc           	         US LnU(       Ga.  UR                  UR                  S9  [        UR                  S   5      [        UR                  S   5      :w  a  [	        S5      e[        UR                  S   [        5      (       a*  UR                  S   /[        UR                  S   5      -  nO[        UR                  S   5      n[        UR                  S   [        5      (       a*  UR                  S   /[        UR                  S   5      -  nO[        UR                  S   5      nXV:w  a  [	        S5      e[        UR                  5      nU R                  US'   / nUS    H-  n	S	U	;   a  UR                  S
5        M  UR                  S5        M/     XS'   / n
US    H-  n	S	U	;   a  U
R                  S5        M  U
R                  S5        M/     XS'   U(       a  UR                  S   US'   UR                  S   US'   UR                  S   US'   UR                  S   US'   UR                  S   US'   UR                  S   US'   UR                  S   US'   UR                  S   (       a  UR                  S   US'   UR                  S5      (       d  US   US'   U R                  U 5      u  p[        U Vs0 s H   oU;   d  X;   d  M  XR                  U5      _M"     sn5      nU R                  US'   U R                  U5      nU(       d  U$ U(       a  UR                  S   (       a  UR                  Ul        [         R"                  " UR                  R$                  UR                  R$                  S S 2SS 2S S 2S S 24   /SS9nUR                  R'                  XR                  R(                  S.5        O3UR                  R'                  UR                  R+                  5       5        UR,                  R'                  UR,                  R+                  5       5        UR.                  R'                  UR.                  R+                  5       5        [1        S UR2                  R5                  5        5       5      (       Ga0  0 nUR2                  R7                  5        H  u  nnUR9                  S5      (       a-  [;        [<        S 5      (       a  [>        O[@        nU" 5       UU'   MI  [;        [<        S 5      (       a  [B        O[D        nU" URF                  URH                  URJ                  URL                  S!9UU'   M     UR2                  R7                  5        H!  u  nnUU;  d  M  URO                  5       UU'   M#     URQ                  U5        S"UR                  l)        URT                  Ul*        [W        URX                  5       GH  u  nnURX                  U   RZ                  R'                  URZ                  R+                  5       5        [;        URX                  U   S#5      (       a@  URX                  U   R\                  R'                  UR\                  R+                  5       5        URX                  U   R^                  (       d  M  URX                  U   R^                  R'                  UR^                  R+                  5       5        GM     [W        UR`                  5       GH  u  nnUR`                  U   RZ                  R'                  URZ                  R+                  5       5        [;        UR`                  U   S#5      (       a@  UR`                  U   R\                  R'                  UR\                  R+                  5       5        UR`                  U   Rb                  (       d  M  UR`                  U   Rb                  R'                  URb                  R+                  5       5        GM     URd                  RZ                  R'                  URd                  RZ                  R+                  5       5        URd                  R\                  R'                  URd                  R\                  R+                  5       5        URf                  b3  URf                  R'                  URf                  R+                  5       5        URh                  b3  URh                  R'                  URh                  R+                  5       5        URj                  R'                  URj                  R+                  5       5        U(       a  URm                  U5        UR                  URn                  5        U$ s  snf )$N)devicer>  r  z;Incompatible Motion Adapter, got different number of blocksr  r  zEIncompatible Motion Adapter, got different number of layers per block_class_name	CrossAttnr   r   r?  r   r  r#  r%  r&  r"  rF  r   r   r'  r@   r>   r?   rk   r(   r   )weightbiasc              3   N   #    U  H  n[        U[        [        45      v   M     g 7fr9  )r   r   r   .0procs     r:   	<genexpr>.UNetMotionModel.from_unet2d.<locals>.<genexpr>\  s(      
5 t46OPQQ5s   #%zattn1.processorscaled_dot_product_attention)hidden_sizerE   r   
num_tokensip_image_projr   )8torq  r   configr   r   r   r   dictr0   r   r   _get_signature_keysr   from_configr-  r5   r  rt  load_state_dictru  
state_dictrR  rS  anyattn_processorsvaluesitemsendswithhasattrFr   r   r   r   r}  rE   r   r~  ra   set_attn_processorrJ  rT  r   r0  r   r   r   r1  r   r.  rX  rZ  r[  load_motion_modulesdtype)clsrm  rn  ro  has_motion_adapterexpanded_layers_per_block!expanded_adapter_layers_per_blockr  r0  down_blocks_typer1  expected_kwargsoptional_kwargskmodelupdated_conv_in_weight
attn_procsr   	processorattn_processor_classr   re  rj  s                          r:   from_unet2dUNetMotionModel.from_unet2d  s    ,47T[[1 4;;123s>;P;PQe;f7gg !^__ $++&893??-1[[9K-L,MPSTXT_T_`rTsPt,t),0=O1P,Q).//0IJCPP5C5J5JKd5e4fil"))*>?j 51 599N9NOh9i4j1(M !hii dkk" #} &'9 :..""#=>""#45	 !;
 &1!"	 &'7 8..  !9:  1	 !9
 $- 3A3H3HIe3fF/0.<.C.CD[.\F*+-;-B-BCY-ZF)*)7)>)>?X)YF%&BPBWBW9CF>? ?M>S>S5?F:; 4B3H3HIe3fF/0 $$%78(6(=(=>P(Q}% zz/00,23G,HF()+.+B+B3+G(vnv!oAUYZYm-Q

1-vno #}'L ."7"78J"K*22EM%*YY$$n&<&<&C&CAqr1aK&PQWX&" MM))5KUaUaUfUf*ghMM))$,,*A*A*CD''(A(A(CD,,T-@-@-K-K-MN 
,,335
 
 
 J#'#7#7#=#=#?i==!233,3A7U,V,V(\i ) (<'=Jt$ #1&DEE 23 )
 (<$-$9$9,5,I,I'oo#,#7#7	(Jt$ $@$ $)#8#8#>#>#@iz)'0':':'<Jt$ $A $$Z00?ELL-%)%:%:E"&t'7'78MAza ((889K9K9V9V9XYu((+\::!!!$//??
@U@U@`@`@bc  #000!!!$11AA*BYBYBdBdBfg 9 %T^^4KAxOOA&&66x7G7G7R7R7TUuq)<88"--==h>Q>Q>\>\>^_q!,,,"--==h>Q>Q>\>\>^_ 5 	//0F0F0Q0Q0ST""224>>3L3L3W3W3YZ)//0B0B0M0M0OP==$NN**4==+C+C+EF&&t}}'?'?'AB%%n5 	a os   <e,e,ri   c                    U R                  5        H
  nSUl        M     U R                   H-  nUR                  nUR                  5        H
  nSUl        M     M/     U R                   H-  nUR                  nUR                  5        H
  nSUl        M     M/     [        U R                  S5      (       a5  U R                  R                  nUR                  5        H
  nSUl        M     gg)zlFreeze the weights of just the UNet2DConditionModel, and leave the motion modules
unfrozen for fine tuning.
FTr   N)
parametersrequires_gradr0  r   r1  r  r.  )r^   paramre  r   rj  s        r:   freeze_unet2d_params$UNetMotionModel.freeze_unet2d_params  s    
 __&E"'E ' **J'66N'224&*# 5 +
 H%44N'224&*# 5 '
 4>>#344!^^::N'224&*# 5 5r9   c                 B   [        UR                  5       HE  u  p#U R                  U   R                  R                  UR                  R	                  5       5        MG     [        UR
                  5       HE  u  p$U R
                  U   R                  R                  UR                  R	                  5       5        MG     [        U R                  S5      (       aH  U R                  R                  R                  UR                  R                  R	                  5       5        g g )Nr   )r   r0  r   r  r  r1  r  r.  )r^   rn  r   re  rj  s        r:   r  #UNetMotionModel.load_motion_modules  s    &~'A'ABMAQ..>>z?X?X?c?c?ef C$^%=%=>KANN1,,<<X=T=T=_=_=ab ? 4>>#344NN))99.:R:R:a:a:l:l:no 5r9   save_directoryis_main_processsafe_serializationvariantpush_to_hubc           	      |   U R                  5       n0 nUR                  5        H  u  pSU	;   d  M  XU	'   M     [        U R                  S   U R                  S   U R                  S   U R                  S   U R                  S   U R                  S   S9nUR	                  U5        UR
                  " S
UUUUUS	.UD6  g )Nr   r  r  rD   r#  r%  r&  )r  r  r$  r#  r%  r&  )r  r  r  r  r  r/   )r  r  r  r  r  save_pretrained)r^   r  r  r  r  r  r   r  motion_state_dictr  vadapters               r:   save_motion_modules#UNetMotionModel.save_motion_modules  s     __&
 $$&DA1$'(!$ '  #{{+?@$(KK0B$C#';;/@#A'+{{3O'P"&++.E"F!%-C!D
 	 12 	
)+1#	
 	
r9   c                    ^ 0 nS[         S[        R                  R                  S[        [         [
        4   4U4S jjmU R                  5        H  u  p#T" X#U5        M     U$ )z
Returns:
    `dict` of attention processors: A dictionary containing all attention processors used in the model with
    indexed by its weight name.
r   module
processorsc                    > [        US5      (       a  UR                  5       X  S3'   UR                  5        H  u  p4T" U  SU 3XB5        M     U$ )Nget_processor
.processorr  )r  r  named_children)r   r  r  sub_namechildfn_recursive_add_processorss        r:   r  DUNetMotionModel.attn_processors.<locals>.fn_recursive_add_processors  sZ    v//282F2F2H
V:./#)#8#8#:+tfAhZ,@%T $; r9   )r   r5   rU   Moduler   r   r  )r^   r  r   r  r  s       @r:   r  UNetMotionModel.attn_processors  sb     
	c 	588?? 	X\]`bt]tXu 	 !//1LD'jA 2 r9   r  c           	      d  ^ [        U R                  R                  5       5      n[        U[        5      (       a-  [        U5      U:w  a  [        S[        U5       SU SU S35      eS[        S[        R                  R                  4U4S jjmU R                  5        H  u  p4T" X4U5        M     g)	a  
Sets the attention processor to use to compute attention.

Parameters:
    processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
        The instantiated processor class or a dictionary of processor classes that will be set as the processor
        for **all** `Attention` layers.

        If `processor` is a dict, the key needs to define the path to the corresponding cross attention
        processor. This is strongly recommended when setting trainable attention processors.

z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r  c                 
  > [        US5      (       aJ  [        U[        5      (       d  UR                  U5        O#UR                  UR	                  U  S35      5        UR                  5        H  u  p4T" U  SU 3XB5        M     g )Nset_processorr  r  )r  r   r  r  popr  )r   r  r  r  r  fn_recursive_attn_processors        r:   r  GUNetMotionModel.set_attn_processor.<locals>.fn_recursive_attn_processor  ss    v//!)T22((3(($z7J)KL#)#8#8#:+tfAhZ,@%S $;r9   N)r   r  keysr   r  r   r   r5   rU   r  r  )r^   r  countr   r  r  s        @r:   r  "UNetMotionModel.set_attn_processor  s     D((--/0i&&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1LD'i@ 2r9   
chunk_sizer   c                    ^ US;  a  [        SU 35      eU=(       d    SnS[        R                  R                  S[        S[        4U4S jjmU R                  5        H  nT" X1U5        M     g)	a  
Sets the attention processor to use [feed forward
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

Parameters:
    chunk_size (`int`, *optional*):
        The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
        over each tensor of dim=`dim`.
    dim (`int`, *optional*, defaults to `0`):
        The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
        or dim=1 (sequence length).
)r   r(   z-Make sure to set `dim` to either 0 or 1, not r(   r  r  r   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g Nset_chunk_feed_forward)r  r   r  r  childrenr  r  r   r  fn_recursive_feed_forwards       r:   r  JUNetMotionModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward-  =    v788---M*)%SA +r9   N)r   r5   rU   r  r   r  )r^   r  r   r  r  s       @r:   enable_forward_chunking'UNetMotionModel.enable_forward_chunking  sn     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmoF%f#> &r9   c                    ^ S[         R                  R                  S[        S[        4U4S jjmU R	                  5        H  nT" US S5        M     g )Nr  r  r   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g r  r  r  s       r:   r  KUNetMotionModel.disable_forward_chunking.<locals>.fn_recursive_feed_forward8  r  r9   r   )r5   rU   r  r   r  )r^   r  r  s     @r:   disable_forward_chunking(UNetMotionModel.disable_forward_chunking7  sH    	Behhoo 	B3 	BUX 	B mmoF%fdA6 &r9   c           	      ~   [        S U R                  R                  5        5       5      (       a  [        5       nOr[        S U R                  R                  5        5       5      (       a  [	        5       nO8[        S[        [        U R                  R                  5       5      5       35      eU R                  U5        g)zU
Disables custom attention processors and sets the default attention implementation.
c              3   F   #    U  H  oR                   [        ;   v   M     g 7fr9  )ra   r   rw  s     r:   rz  =UNetMotionModel.set_default_attn_processor.<locals>.<genexpr>G  s     iKh4~~!>>Kh   !c              3   F   #    U  H  oR                   [        ;   v   M     g 7fr9  )ra   r   rw  s     r:   rz  r  I  s     hJg$#==Jgr  zOCannot call `set_default_attn_processor` when attention processors are of type N)	allr  r  r   r   r   nextiterr  )r^   r  s     r:   set_default_attn_processor*UNetMotionModel.set_default_attn_processorC  s     i4K_K_KfKfKhiii,.Ih$J^J^JeJeJghhh%Iabfgklp  mA  mA  mH  mH  mJ  hK  cL  bM  N  		*r9   r   r   r   r   c                     [        U R                  5       H9  u  pV[        USU5        [        USU5        [        USU5        [        USU5        M;     g)a  Enables the FreeU mechanism from https://huggingface.co/papers/2309.11497.

The suffixes after the scaling factors represent the stage blocks where they are being applied.

Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.

Args:
    s1 (`float`):
        Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
        mitigate the "oversmoothing effect" in the enhanced denoising process.
    s2 (`float`):
        Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
        mitigate the "oversmoothing effect" in the enhanced denoising process.
    b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
    b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
r   r   r   r   N)r   r1  setattr)r^   r   r   r   r   r   upsample_blocks          r:   enable_freeuUNetMotionModel.enable_freeuS  sJ    $ "+4>>!:AND"-ND"-ND"-ND"-	 ";r9   c                     1 Skn[        U R                  5       H9  u  p#U H.  n[        X45      (       d  [        X4S5      c  M"  [	        X4S5        M0     M;     g)zDisables the FreeU mechanism.>   r   r   r   r   N)r   r1  r  r  r  )r^   
freeu_keysr   r  r  s        r:   disable_freeuUNetMotionModel.disable_freeul  sH    -
!*4>>!:A>--D1Q1]Nt4   ";r9   c                    SU l         U R                  R                  5        H3  u  pS[        UR                  R
                  5      ;   d  M*  [        S5      e   U R                  U l         U R                  5        H)  n[        U[        5      (       d  M  UR                  SS9  M+     U R                  [        5       5        g)u  
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
are fused. For cross-attention modules, key and value projection matrices are fused.

<Tip warning={true}>

This API is 🧪 experimental.

</Tip>
NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr  r  r   ra   r0   r   modulesr   r   fuse_projectionsr  r   )r^   r`   attn_processorr  s       r:   fuse_qkv_projections$UNetMotionModel.fuse_qkv_projectionsu  s     )-%!%!5!5!;!;!=A#n66??@@ !tuu "> )-(<(<%llnF&),,''T'2 % 	 5 78r9   c                 V    U R                   b  U R                  U R                   5        gg)um   Disables the fused QKV projection if enabled.

<Tip warning={true}>

This API is 🧪 experimental.

</Tip>

N)r  r  )r^   s    r:   unfuse_qkv_projections&UNetMotionModel.unfuse_qkv_projections  s)     ((4##D$A$AB 5r9   r.   re   rd   timestep_condr   rh   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr   c                   ^$ SU R                   -  m$SnSn[        U$4S jUR                  SS  5       5      (       a  [        R	                  S5        SnUb2  SUR                  UR                  5      -
  S	-  nUR                  S5      nUn[        R                  " U5      (       d  UR                  R                  S
:H  nUR                  R                  S:H  n[        U[        5      (       a/  U(       d  U(       a  [        R                  O[        R                  nO.U(       d  U(       a  [        R                   O[        R"                  n[        R$                  " U/UUR                  S9nO7['        UR                  5      S:X  a  US   R                  UR                  5      nUR                  S   nUR)                  UR                  S   5      nU R+                  U5      nUR                  U R                  S9nU R-                  UU5      nSnU R.                  R0                  S:X  a  SU;  a  [3        U R4                   S35      eUR7                  S5      nSU;  a  [3        U R4                   S35      eUR7                  S5      nU R9                  UR;                  5       5      nUR=                  UR                  S   S45      n[        R>                  " UU/SS9nUR                  UR                  5      nU RA                  U5      nUc  UOUU-   nURC                  USUR                  S   U-  S9nU RD                  b  U R.                  RF                  S:X  au  SU;  a  [3        U R4                   S35      eUR7                  S5      nU RE                  U5      nU Vs/ s H$  nURC                  USUR                  S   U-  S9PM&     nnUU4nURI                  SSSSS5      R=                  UR                  S   U-  S4UR                  SS -   5      nU RK                  U5      nU4nU RL                   HD  n[O        US5      (       a   URP                  (       a  U" UUUUUUS9u  nnOU" UUUS9u  nnUU-  nMF     Ub%  Sn[S        UU5       H  u  nn UU -   nUU4-  nM     UnU RT                  bC  [O        U RT                  S 5      (       a  U RU                  UUUUUUS!9nOU RU                  UUUUUS"9nU	b  X-   n[W        U RX                  5       H  u  n!n"U!['        U RX                  5      S-
  :H  n#U['        U"RZ                  5      * S nUS['        U"RZ                  5      *  nU#(       d  U(       a  US   R                  SS n[O        U"S5      (       a   U"RP                  (       a  U"" UUUUUUUUS#9nM  U"" UUUUUS$9nM     U R\                  (       a"  U R]                  U5      nU R_                  U5      nU Ra                  U5      nUSSS24   R=                  SU4UR                  SS -   5      RI                  SSSSS5      nU
(       d  U4$ [c        US%9$ s  snf )&aW  
The [`UNetMotionModel`] forward method.

Args:
    sample (`torch.Tensor`):
        The noisy input tensor with the following shape `(batch, num_frames, channel, height, width`.
    timestep (`torch.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
    encoder_hidden_states (`torch.Tensor`):
        The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
    timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
        Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
        through the `self.time_embedding` layer to obtain the timestep embeddings.
    attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
        An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
        is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
        negative values to the attention scores corresponding to "discard" tokens.
    cross_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
        A tuple of tensors that if specified are added to the residuals of down unet blocks.
    mid_block_additional_residual: (`torch.Tensor`, *optional*):
        A tensor that if specified is added to the residual of the middle unet block.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.unets.unet_motion_model.UNetMotionOutput`] instead of a plain
        tuple.

Returns:
    [`~models.unets.unet_motion_model.UNetMotionOutput`] or `tuple`:
        If `return_dict` is True, an [`~models.unets.unet_motion_model.UNetMotionOutput`] is returned,
        otherwise a `tuple` is returned where the first element is the sample tensor.
r   FNc              3   2   >#    U  H  oT-  S :g  v   M     g7f)r   Nr/   )rx  sdefault_overall_up_factors     r:   rz  *UNetMotionModel.forward.<locals>.<genexpr>  s     M;La,,1;Ls   z9Forward upsample size to force interpolation output size.Tr(   g     mpsnpu)r  rq  r   )r  rQ  text_embedsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`time_idsz has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`r   r   )r   r   r  image_embedsz has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`r	   rk   r   )rc   r   rd   r   rg   rh   )rc   r   rg   r/   r   )rd   r   rg   rh   )rd   r   rh   )rc   r   r   rd   r   r   rg   rh   )rc   r   r   r   rg   )r.   )2rW  r  rm   r   infor  r  	unsqueezer5   	is_tensorrq  typer   r   float32float64int32int64tensorr   expandrR  rS  r  rK  r   ra   r   rU  flattenrn   concatrV  repeat_interleaverT  rJ  ro   r-  r0  r  r   r   r.  r   r1  r   rX  rZ  r[  r,   )%r^   r.   re   rd   r  r   rh   r  r  r  r   forward_upsample_sizer   	timestepsis_mpsis_npur  rg   t_embembaug_embr  r  time_embeds
add_embedsr	  image_embeddown_block_res_samplesdownsample_blockres_samplesnew_down_block_res_samplesdown_block_res_sampledown_block_additional_residualr   r  rd  r  s%                                       @r:   ry   UNetMotionModel.forward  s   d %&t':':$:! !&M6<<;LMMMKKST$(! %."3"3FLL"AAXMN+55a8N 	y)) ]]''50F]]''50F(E***0F(.&u{{i[fmmTI!Q&!$**6==9I \\!_
$$V\\!_5	y)
 tzz*!!%7;;**k9$55 ~~&  '{  |  ,//>K!22 ~~&  'x  y  ),,Z8H,,X-=-=-?@K%--{/@/@/CR.HIK{K&@bIJ#syy1J((4G_c#-##JA399Q<R\C\#]  ,1Q1QUd1d%66 ~~&  'A  B  -00@L00>L $0#/K --ja[M^M^_`MadnMn-o#/   &;L$I! 1aA.66Q*8TVX7Y\b\h\hijik\l7lmf% #) $ 0 0')>??DTDhDh&6"(*?#1)+A'# '7VRUbl&m#"k1" !1 +6)+&IL&(GJE%'E )>@^(^%*/D.FF*	J &@" >>%t~~'788*?#1)+A (  *?#1+A (  )4;F "+4>>!:A~#dnn"5"99N0#n6L6L2M1M1OPK%;<Zs>CYCY?Z>Z%[" "&; 6r : @ @ D~'<==.BdBd'"(,7*?"/#1)+A	 ("(,7"/)/ ";@ ''/F]]6*Fv& a(("j)9FLL<L)LMUUVWYZ\]_`bcd9v..Ws   +Y)rV  rU  rZ  r-  rX  r[  r0  rT  r.  rW  r  rG   rS  rR  r1  ) Nrk   rk   )r   r   r   r   )r  r   r   r   r;  r   r(   r(   silur~   gh㈵>r   r(   Nr(   NNr(   Fr   r~   r   NTr(   NNNNNN)NT)ri   N)TTNF)Nr   )NNNNNNT)*r0   r1   r2   r3   r4    _supports_gradient_checkpointing _skip_layerwise_casting_patternsr   r   r   r   r   r   r   r   rT   classmethodr*   r  r  r  r  r  propertyr   r   r  r  r  r  r  r  r  r  r  r5   r6   r   r,   ry   r8   r   r   s   @r:   r=  r=    s    (,$(.x$ &*-
+
 /E34"#()!#'MN_cVWhlMQVW&+;<%'BCqu%) !)-.2-115?C,0WA
c]A
 A
 	A

  S/A
 c3hA
" "#s(O#A
$  U3Z0%A
&  'A
( !&)A
* +A
, -A
. /A
0 !1A
2 ',CsU5\,I&J3A
4 /7uS%*eTYl=Z7[.\5A
6 05S%*eEl5R/S7A
8 8@c5QT:W\]bWcFc@d7e9A
: +35eCj3I*J;A
< 4<E#uSz/<R3S=A
>  $?A
@ #3c3h#78AA
B  #CA
D %*#uS#X*>$?EA
F -5U3c3hQVW\]`be]eWfhkWkQl;l5m,nGA
H #IA
J KA
L "#MA
N 'smOA
P &c]QA
R "*#SA
T 08}UA
V %SMWA
 A
F
  37!	W"W !/W 	W Wr+0p(=2I pd p !%#'!%!!
!
 !
 !	!

 #!
 !
 
!
F c+=&=!>  0 AE2Dd3PbKbFc2c,d  AD?(3- ?S ?Y] ?:	7+ .u .% .U . .$ .2594C$ 1515;??CIM@D f/f/ eS01f/  %||	f/
  -f/ !.f/ !)c3h 8f/ $Dell):$;<f/ *2%2E)Ff/ (0'=f/ f/ 
u||!44	5f/ f/r9   r=  )Idataclassesr   typingr   r   r   r   r   r5   torch.nnrU   torch.nn.functional
functionalr  torch.utils.checkpointconfiguration_utilsr
   r   r   loadersr   r   r   utilsr   r   r   utils.torch_utilsr   	attentionr   attention_processorr   r   r   r   r   r   r   r   r   r   
embeddingsr    r!   modeling_utilsr"   r   r#   r$   r%    transformers.dual_transformer_2dr&   transformers.transformer_2dr'   unet_2d_blocksr)   unet_2d_conditionr*   
get_loggerr0   r   r,   r  r<   r   r   r   r  r  r  r  r=  r/   r9   r:   <module>r@     s:   " 4 4      N N \ \ 3 3 , -   6 ' < < E < 3 3 
		H	% 	z 	 	Tryy Tn},bii },@v,ryy v,r}RYY }@DBII DNb")) bJ&BII &RFJ-C FRS/j+/JL\ S/r9   