
    +h}                     @   S SK Jr  S SKJrJrJr  S SKrS SKJr  S SK	Js  J
r  SSKJr  SSKJr  SSKJr  SS	KJrJrJrJrJr  SS
KJr  SSKJrJrJrJrJrJ r    " S S\RB                  5      r" " S S\RB                  5      r#S\RH                  S\RH                  4S jr% " S S\RB                  5      r& " S S\RB                  5      r' " S S\RB                  5      r( " S S\RB                  5      r) " S S\RB                  5      r* " S S\RB                  5      r+g)    )partial)OptionalTupleUnionN   )	deprecate   )get_activation)SpatialNorm)Downsample1DDownsample2DFirDownsample2DKDownsample2Ddownsample_2d)AdaGroupNorm)FirUpsample2DKUpsample2D
Upsample1D
Upsample2Dupfirdn2d_nativeupsample_2dc            "          ^  \ rS rSrSrSSSSSSSS	S
SSSSSSS.S\S\\   S\S\S\S\S\\   S\S\	S\	S\S\\   S\S\S\S\\   4 U 4S jjjr
S\R                  S \R                  S!\R                  4S" jrS#rU =r$ )$ResnetBlockCondNorm2D,   a  
A Resnet block that use normalization layer that incorporate conditioning information.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
    groups_out (`int`, *optional*, default to None):
        The number of groups to use for the second normalization layer. if set to None, same as `groups`.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
    time_embedding_norm (`str`, *optional*, default to `"ada_group"` ):
        The normalization layer for time embedding `temb`. Currently only support "ada_group" or "spatial".
    kernel (`torch.Tensor`, optional, default to None): FIR filter, see
        [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
    output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
    use_in_shortcut (`bool`, *optional*, default to `True`):
        If `True`, add a 1x1 nn.conv2d layer for skip-connection.
    up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
    down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
    conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
        `conv_shortcut` output.
    conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
        If None, same as `out_channels`.
NF               ư>swish	ada_group      ?T)out_channelsconv_shortcutdropouttemb_channelsgroups
groups_outepsnon_linearitytime_embedding_normoutput_scale_factoruse_in_shortcutupdownconv_shortcut_biasconv_2d_out_channelsin_channelsr"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   c          	      $  > [         TU ]  5         Xl        Uc  UOUnX l        X0l        Xl        Xl        Xl        Xl        Uc  UnU R                  S:X  a  [        XQXhS9U l
        O9U R                  S:X  a  [        X5      U l
        O[        SU R                   35      e[        R                  " XSSSS9U l        U R                  S:X  a  [        XRXxS9U l        O9U R                  S:X  a  [        X%5      U l        O[        SU R                   35      e["        R                  R%                  U5      U l        U=(       d    Un[        R                  " UUSSSS9U l        [+        U	5      U l        S =U l        U l        U R
                  (       a  [3        USS	9U l        O"U R                  (       a  [5        USSS
S9U l        Uc  U R                  U:g  OUU l        S U l        U R6                  (       a  [        R                  " UUSSSUS9U l        g g )Nr    )r(   spatialz" unsupported time_embedding_norm:    r	   kernel_sizestridepaddingFuse_convopr:   r8   namer   r6   r7   r8   bias)super__init__r1   r"   use_conv_shortcutr-   r.   r+   r*   r   norm1r   
ValueErrornnConv2dconv1norm2torchDropoutr$   conv2r
   nonlinearityupsample
downsampler   r   r,   r#   )selfr1   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   	__class__s                    Q/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/resnet.pyrA   ResnetBlockCondNorm2D.__init__J   s   ( 	&&2&:{(!.	#6 #6 J##{2%m&RDJ%%2$[@DJA$BZBZA[\]]YY{aPQ[\]
##{2%m:WDJ%%2$\ADJA$BZBZA[\]]xx''03C|YY|-AqYZdef
*=9*..77&{UCDMYY*;PQX\]DOKZKbt//3GGhw!!#$'"D      input_tensortembreturnc                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        UnU R                  Xb5      nU R	                  U5      nU R
                  bV  UR                  S   S:  a   UR                  5       nUR                  5       nU R                  U5      nU R                  U5      nO/U R                  b"  U R                  U5      nU R                  U5      nU R                  U5      nU R                  Xb5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  b  U R                  U5      nX-   U R                  -  nU$ )Nr   scaleThe `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`.1.0.0@   )lengetr   rC   rL   rM   shape
contiguousrN   rG   rH   r$   rK   r#   r+   )rO   rT   rU   argskwargsdeprecation_messagehidden_statesoutput_tensors           rQ   forwardResnetBlockCondNorm2D.forward   sN   t9q=FJJw5A #Ugw(;<$

=7))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1

=7))-8]3

=1)--l;L%59Q9QQrS   )rG   rK   r#   r.   rN   r$   r1   rL   rC   rH   r"   r+   r*   r-   rM   rB   r,   )__name__
__module____qualname____firstlineno____doc__intr   boolfloatstrrA   rI   Tensorre   __static_attributes____classcell__rP   s   @rQ   r   r   ,   s3   B '+# $($#.%(*.#'.2%I I sm	I
 I I I I SMI I I !I #I "$I I  !I" !#I$ 'sm%I IV%ELL % %Z_ZfZf % %rS   r   c            (       *  ^  \ rS rSrSrSSSSSSSS	S
SSSSSSSSSS.S\S\\   S\S\S\S\S\\   S\S\S\	S\S\	S\\
R                     S\S\\   S\S\S\S \\   4&U 4S! jjjrS"\
R                  S#\
R                  S$\
R                  4S% jrS&rU =r$ )'ResnetBlock2D   a  
A Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
    groups_out (`int`, *optional*, default to None):
        The number of groups to use for the second normalization layer. if set to None, same as `groups`.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
    non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
    time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
        By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" for a
        stronger conditioning with scale and shift.
    kernel (`torch.Tensor`, optional, default to None): FIR filter, see
        [`~models.resnet.FirUpsample2D`] and [`~models.resnet.FirDownsample2D`].
    output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
    use_in_shortcut (`bool`, *optional*, default to `True`):
        If `True`, add a 1x1 nn.conv2d layer for skip-connection.
    up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
    down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
    conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
        `conv_shortcut` output.
    conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
        If None, same as `out_channels`.
NFr   r   r   Tr   r   defaultr!   )r"   r#   r$   r%   r&   r'   pre_normr(   r)   skip_time_actr*   kernelr+   r,   r-   r.   r/   r0   r1   r"   r#   r$   r%   r&   r'   rx   r(   r)   ry   r*   rz   r+   r,   r-   r.   r/   r0   c          	        >^ [         TU ]  5         US:X  a  [        S5      eUS:X  a  [        S5      eSU l        Xl        Uc  UOUnX l        X0l        UU l        UU l        Xl	        Xl
        Xl        Uc  Un[        R                  R                  XaU	SS9U l        [        R                   " XSSSS	9U l        Ubu  U R                  S
:X  a  [        R$                  " XR5      U l        OPU R                  S:X  a   [        R$                  " USU-  5      U l        O [        SU R                   S35      eS U l        [        R                  R                  XrU	SS9U l        [        R                  R+                  U5      U l        U=(       d    Un[        R                   " UUSSSS	9U l        [1        U
5      U l        S =U l        U l        U R                  (       aI  US:X  a  SmU4S jU l        OUS:X  a  [9        [:        R<                  SSS9U l        Ok[?        USS9U l        O[U R                  (       aJ  US:X  a  SmU4S jU l        O6US:X  a  [9        [:        R@                  SSS9U l        O[C        USSSS9U l        Uc  U R                  U:g  OUU l"        S U l#        U RD                  (       a  [        R                   " UUSSSUS9U l#        g g )Nr    zkThis class cannot be used with `time_embedding_norm==ada_group`, please use `ResnetBlockCondNorm2D` insteadr3   ziThis class cannot be used with `time_embedding_norm==spatial`, please use `ResnetBlockCondNorm2D` insteadT
num_groupsnum_channelsr(   affiner4   r	   r5   rw   scale_shiftr   zunknown time_embedding_norm :  fir)r	   r4   r4   r	   c                    > [        U TS9$ N)rz   )r   x
fir_kernels    rQ   <lambda>(ResnetBlock2D.__init__.<locals>.<lambda>%  s    +a
*KrS   sde_vpg       @nearest)scale_factormodeFr9   c                    > [        U TS9$ r   )r   r   s    rQ   r   r   -  s    M!J,OrS   )r6   r7   r;   r<   r   r>   )$r@   rA   rD   rx   r1   r"   rB   r-   r.   r+   r*   ry   rI   rE   	GroupNormrC   rF   rG   Lineartime_emb_projrH   rJ   r$   rK   r
   rL   rM   rN   r   Finterpolater   
avg_pool2dr   r,   r#   )rO   r1   r"   r#   r$   r%   r&   r'   rx   r(   r)   ry   r*   rz   r+   r,   r-   r.   r/   r0   r   rP   s                       @rQ   rA   ResnetBlock2D.__init__   s   . 	+-}  )+{  &&2&:{(!.	#6 #6 *JXX''6Y\ei'j
YY{aPQ[\]
$''94%'YY}%K"))]:%'YY}a,>N%O" #A$BZBZA[[\!]^^!%DXX'':^ajn'o
xx''03C|YY|-AqYZdef
*=9*..77)
 K8# 'Ci X *; GYY)
"O8#")!,,Aa"P".{UTU\`"aKZKbt//3GGhw!!#$'"D  rS   rT   rU   rV   c                    [        U5      S:  d  UR                  SS 5      b  Sn[        SSU5        UnU R                  U5      nU R	                  U5      nU R
                  bV  UR                  S   S:  a   UR                  5       nUR                  5       nU R                  U5      nU R                  U5      nO/U R                  b"  U R                  U5      nU R                  U5      nU R                  U5      nU R                  b>  U R                  (       d  U R	                  U5      nU R                  U5      S S 2S S 2S S 4   nU R                  S:X  a  Ub  Xb-   nU R                  U5      nOqU R                  S:X  aP  Uc  [        SU R                   35      e[        R                   " US	S
S9u  pxU R                  U5      nUS
U-   -  U-   nOU R                  U5      nU R	                  U5      nU R#                  U5      nU R%                  U5      nU R&                  b  U R'                  UR                  5       5      nX-   U R(                  -  n	U	$ )Nr   rX   rY   rZ   r[   rw   r   z9 `temb` should not be None when `time_embedding_norm` is r   r	   )dim)r\   r]   r   rC   rL   rM   r^   r_   rN   rG   r   ry   r*   rH   rD   rI   chunkr$   rK   r#   r+   )
rO   rT   rU   r`   ra   rb   rc   
time_scale
time_shiftrd   s
             rQ   re   ResnetBlock2D.forward@  s.   t9q=FJJw5A #Ugw(;<$

=1))-8==$""1%++668 - 8 8 :==6L MM-8M__(??<8L OOM:M

=1)%%((.%%d+Aq$,<=D##y0 - 4 JJ}5M%%6| OPTPhPhOij  &+[[qa%@"J JJ}5M)Q^<zIM JJ}5M))-8]3

=1)--l.E.E.GHL%59Q9QQrS   )rG   rK   r#   r.   rN   r$   r1   rL   rC   rH   r"   r+   rx   ry   r   r*   r-   rM   rB   r,   )rg   rh   ri   rj   rk   rl   r   rm   rn   ro   rI   rp   rA   re   rq   rr   rs   s   @rQ   ru   ru      sf   D '+# $($##,)-%(*.#'.2+b b sm	b
 b b b b SMb b b b b !b &b  #!b" "$#b$ %b& 'b( !)b* 'sm+b bH5ELL 5 5Z_ZfZf 5 5rS   ru   tensorrV   c                    [        U R                  5      S:X  a  U S S 2S S 2S 4   $ [        U R                  5      S:X  a  U S S 2S S 2S S S 24   $ [        U R                  5      S:X  a  U S S 2S S 2SS S 24   $ [        S[        U 5       S35      e)Nr   r4      r   z`len(tensor)`: z has to be 2, 3 or 4.)r\   r^   rD   )r   s    rQ   rearrange_dimsr   y  s    
6<<AaDj!!
6<<AaD!m$$	V\\	a	aAqj!!?3v;-7LMNNrS   c                      ^  \ rS rSrSr  SS\S\S\\\\\4   4   S\S\4
U 4S jjjr	S	\
R                  S
\
R                  4S jrSrU =r$ )Conv1dBlocki  ax  
Conv1d --> GroupNorm --> Mish

Parameters:
    inp_channels (`int`): Number of input channels.
    out_channels (`int`): Number of output channels.
    kernel_size (`int` or `tuple`): Size of the convolving kernel.
    n_groups (`int`, default `8`): Number of groups to separate the channels into.
    activation (`str`, defaults to `mish`): Name of the activation function.
inp_channelsr"   r6   n_groups
activationc                    > [         TU ]  5         [        R                  " XX3S-  S9U l        [        R
                  " XB5      U l        [        U5      U l        g )Nr   r8   )	r@   rA   rE   Conv1dconv1dr   
group_normr
   mish)rO   r   r"   r6   r   r   rP   s         rQ   rA   Conv1dBlock.__init__  sD     	iiK`aQab,,x>":.	rS   inputsrV   c                     U R                  U5      n[        U5      nU R                  U5      n[        U5      nU R                  U5      nU$ N)r   r   r   r   )rO   r   intermediate_reproutputs       rQ   re   Conv1dBlock.forward  sM     KK/*+<= OO,=>*+<=,-rS   )r   r   r   )   r   rg   rh   ri   rj   rk   rl   r   r   ro   rA   rI   rp   re   rq   rr   rs   s   @rQ   r   r     s|    	   // / 3c3h/0	/
 / / /ell u||  rS   r   c                      ^  \ rS rSrSr  SS\S\S\S\\\\\4   4   S\4
U 4S jjjr	S	\
R                  S
\
R                  S\
R                  4S jrSrU =r$ )ResidualTemporalBlock1Di  au  
Residual 1D block with temporal convolutions.

Parameters:
    inp_channels (`int`): Number of input channels.
    out_channels (`int`): Number of output channels.
    embed_dim (`int`): Embedding dimension.
    kernel_size (`int` or `tuple`): Size of the convolving kernel.
    activation (`str`, defaults `mish`): It is possible to choose the right activation function.
r   r"   	embed_dimr6   r   c                 4  > [         TU ]  5         [        XU5      U l        [        X"U5      U l        [        U5      U l        [        R                  " X25      U l	        X:w  a  [        R                  " XS5      U l        g [        R                  " 5       U l        g )Nr	   )r@   rA   r   conv_inconv_outr
   time_emb_actrE   r   time_embr   Identityresidual_conv)rO   r   r"   r   r6   r   rP   s         rQ   rA    ResidualTemporalBlock1D.__init__  s{     	"<{K#LL*:6		): 9E8TBIIl!4 	Z\ZeZeZg 	rS   r   trV   c                     U R                  U5      nU R                  U5      nU R                  U5      [        U5      -   nU R	                  U5      nX0R                  U5      -   $ )z
Args:
    inputs : [ batch_size x inp_channels x horizon ]
    t : [ batch_size x embed_dim ]

returns:
    out : [ batch_size x out_channels x horizon ]
)r   r   r   r   r   r   )rO   r   r   outs       rQ   re   ResidualTemporalBlock1D.forward  s\     a MM!ll6"^A%66mmC ''///rS   )r   r   r   r   r   )   r   r   rs   s   @rQ   r   r     s    	  45 

 
 	

 3c3h/0
 
 
&0ell 0u|| 0 0 0rS   r   c            	          ^  \ rS rSrSr   SS\S\\   S\S\4U 4S jjjrSS\	R                  S	\S
\	R                  4S jjrSrU =r$ )TemporalConvLayeri  a  
Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016

Parameters:
    in_dim (`int`): Number of input channels.
    out_dim (`int`): Number of output channels.
    dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
in_dimout_dimr$   norm_num_groupsc                 f  > [         TU ]  5         U=(       d    UnXl        X l        [        R
                  " [        R                  " XA5      [        R                  " 5       [        R                  " XSSS95      U l	        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R
                  " [        R                  " XB5      [        R                  " 5       [        R                  " U5      [        R                  " X!SSS95      U l        [        R                  R                  U R                  S   R                   5        [        R                  R                  U R                  S   R"                  5        g )Nr4   r	   r	   )r	   r   r   r   )r@   rA   r   r   rE   
Sequentialr   SiLUConv3drG   rJ   rK   conv3conv4initzeros_weightr?   )rO   r   r   r$   r   rP   s        rQ   rA   TemporalConvLayer.__init__  sb    	#V ]]LL1GGIIIfy)D


 ]]LL2GGIJJwIIgy)D	

 ]]LL2GGIJJwIIgy)D	

 ]]LL2GGIJJwIIgy)D	

 	tzz"~,,-
tzz"~**+rS   rc   
num_framesrV   c                    US S S 24   R                  SU4UR                  SS  -   5      R                  SSSSS5      nUnU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nX1-   nUR                  SSSSS5      R                  UR                  S   UR                  S   -  S4UR                  SS  -   5      nU$ )Nr   r	   r   r   r4   r   )reshaper^   permuterG   rK   r   r   )rO   rc   r   identitys       rQ   re   TemporalConvLayer.forward  s    $'"**B
+;m>Q>QRSRT>U+UV^^_`bcefhiklm 	 !

=1

=1

=1

=1 0%--aAq!<DD  #m&9&9!&<<bAMDWDWXYXZD[[
 rS   )rG   rK   r   r   r   r   )Nr   r   )r	   rg   rh   ri   rj   rk   rl   r   rn   rA   rI   rp   re   rq   rr   rs   s   @rQ   r   r     so     "&!',', #', 	',
 ', ',RU\\ s 5<<  rS   r   c            	          ^  \ rS rSrSr   SS\S\\   S\S\4U 4S jjjrS\	R                  S	\	R                  S
\	R                  4S jrSrU =r$ )TemporalResnetBlocki  a  
A Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
r1   r"   r%   r(   c                   > [         TU ]  5         Xl        Uc  UOUnX l        SnU Vs/ s H  ofS-  PM	     nn[        R
                  R                  SXSS9U l        [
        R                  " UUUSUS9U l	        Ub  [
        R                  " X25      U l        OS U l        [        R
                  R                  SX$SS9U l        [        R
                  R                  S5      U l        [
        R                  " UUUSUS9U l        [!        S	5      U l        U R                  U:g  U l        S U l        U R$                  (       a  [
        R                  " UUSSS
S9U l        g g s  snf )Nr   r   r   Tr|   r	   r5   r   silur   )r@   rA   r1   r"   rI   rE   r   rC   r   rG   r   r   rH   rJ   r$   rK   r
   rL   r,   r#   )	rO   r1   r"   r%   r(   r6   kr8   rP   s	           rQ   rA   TemporalResnetBlock.__init__*  sR    	&&2&:{(#./;a6;/XX''2Kae'f
YY#

 $!#=!GD!%DXX''2Lbf'g
xx'',YY#

 +62#//<?!!#"D  A 0s   E rT   rU   rV   c                    UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  bI  U R                  U5      nU R                  U5      S S 2S S 2S S 2S S 4   nUR	                  SSSSS5      nX2-   nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  b  U R                  U5      nX-   nU$ )Nr   r   r	   r4   r   )	rC   rL   rG   r   r   rH   r$   rK   r#   )rO   rT   rU   rc   rd   s        rQ   re   TemporalResnetBlock.forward`  s    $

=1))-8

=1)$$T*D%%d+Aq!T4,?@D<<1aA.D)0M

=1))-8]3

=1)--l;L$4rS   )rG   rK   r#   r$   r1   rL   rC   rH   r"   r   r,   )Nr   r   r   rs   s   @rQ   r   r     ss    	 '+ 44 sm4 	4
 4 4lELL    rS   r   c                      ^  \ rS rSrSr       SS\S\\   S\S\S\\   S\S	\4U 4S
 jjjr	  SS\
R                  S\\
R                     S\\
R                     4S jjrSrU =r$ )SpatioTemporalResBlocki{  a  
A SpatioTemporal Resnet block.

Parameters:
    in_channels (`int`): The number of channels in the input.
    out_channels (`int`, *optional*, default to be `None`):
        The number of output channels for the first conv2d layer. If None, same as `in_channels`.
    temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
    eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
    temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
    merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
    merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
        The merge strategy to use for the temporal mixing.
    switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
        If `True`, switch the spatial and temporal mixing.
r1   r"   r%   r(   temporal_epsmerge_factorswitch_spatial_to_temporal_mixc	                    > [         T	U ]  5         [        UUUUS9U l        [	        Ub  UOUUb  UOUUUb  UOUS9U l        [        UUUS9U l        g )N)r1   r"   r%   r(   )alphamerge_strategyr   )r@   rA   ru   spatial_res_blockr   temporal_res_blockAlphaBlender
time_mixer)
rO   r1   r"   r%   r(   r   r   r   r   rP   s
            rQ   rA   SpatioTemporalResBlock.__init__  sp     	!.#%'	"
 #6(4(@k)5)A{' , 8c	#
 ')+I
rS   rc   rU   image_only_indicatorc                    UR                   S   nU R                  X5      nUR                   u  pVpxXT-  n	US S S 24   R                  XXgU5      R                  SSSSS5      n
US S S 24   R                  XXgU5      R                  SSSSS5      nUb  UR                  XS5      nU R	                  X5      nU R                  U
UUS9nUR                  SSSSS5      R                  XVXx5      nU$ )Nr   r   r   r	   r4   r   )	x_spatial
x_temporalr   )r^   r   r   r   r   r   )rO   rc   rU   r   r   batch_frameschannelsheightwidth
batch_sizehidden_states_mixs              rQ   re   SpatioTemporalResBlock.forward  s$    *//3
..}C0=0C0C-!/
 $'"**:8UZ[ccdeghjkmnpqr 	 $'"**:8UZ[ccdeghjkmnpqr 	 <<
;D//D'$!5 ( 
 &--aAq!<DD\]ckrS   )r   r   r   )Nr   r   Ng      ?learned_with_imagesF)NN)rg   rh   ri   rj   rk   rl   r   rn   rm   rA   rI   rp   re   rq   rr   rs   s   @rQ   r   r   {  s    ( '+ (,!,/4

 sm
 	

 
 uo
 
 )-
 
H (,7;	|| u||$ 'u||4	 rS   r   c            	          ^  \ rS rSrSr/ SQr  SS\S\S\4U 4S jjjr	S\
R                  S	\S
\
R                  4S jr SS\
R                  S\
R                  S\\
R                     S
\
R                  4S jjrSrU =r$ )r   i  a  
A module to blend spatial and temporal features.

Parameters:
    alpha (`float`): The initial value of the blending factor.
    merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
        The merge strategy to use for the temporal mixing.
    switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
        If `True`, switch the spatial and temporal mixing.
)learnedfixedr   r   r   r   c                   > [         TU ]  5         X l        X0l        X R                  ;  a  [        SU R                   35      eU R                  S:X  a(  U R                  S[        R                  " U/5      5        g U R                  S:X  d  U R                  S:X  aE  U R                  S[        R                  R                  [        R                  " U/5      5      5        g [        SU R                   35      e)Nzmerge_strategy needs to be in r   
mix_factorr   r   zUnknown merge strategy )r@   rA   r   r   
strategiesrD   register_bufferrI   rp   register_parameterrE   	Parameter)rO   r   r   r   rP   s       rQ   rA   AlphaBlender.__init__  s     	,.L+0=doo=NOPP')  u||UG/DE  I-1D1DH]1]##L%((2D2DU\\SXRYEZ2[\6t7J7J6KLMMrS   r   ndimsrV   c           	      @   U R                   S:X  a  U R                  nU$ U R                   S:X  a"  [        R                  " U R                  5      nU$ U R                   S:X  a  Uc  [	        S5      e[        R
                  " UR                  5       [        R                  " SSUR                  S9[        R                  " U R                  5      S   5      nUS:X  a  US S 2S S S 2S S 4   nU$ US	:X  a  UR                  S
5      S S 2S S 4   nU$ [	        SU S35      e[        e)Nr   r   r   zMPlease provide image_only_indicator to use learned_with_images merge strategyr	   )device).Nr   r4   r   zUnexpected ndims z. Dimensions should be 3 or 5)r   r  rI   sigmoidrD   whererm   onesr	  r   NotImplementedError)rO   r   r  r   s       rQ   	get_alphaAlphaBlender.get_alpha  s    ')OOE6 3   I-MM$//2E0 -   $99#+ !pqqKK$))+

1a(<(C(CDdoo.y9E zaq$45  !b)!T4-8  !#4UG;X!YZZ &%rS   r   r   c                     U R                  X1R                  5      nUR                  UR                  5      nU R                  (       a  SU-
  nXA-  SU-
  U-  -   nU$ )Nr!   )r  ndimtodtyper   )rO   r   r   r   r   r   s         rQ   re   AlphaBlender.forward  sV     3^^D)..%KEu
 ::rS   )r   r   )r   Fr   )rg   rh   ri   rj   rk   r  rn   ro   rm   rA   rI   rp   rl   r  r   re   rq   rr   rs   s   @rQ   r   r     s    	 =J
 4/4	NN N )-	N N(ell 3 5<< F 8<	<< LL 'u||4	
 
 rS   r   ),	functoolsr   typingr   r   r   rI   torch.nnrE   torch.nn.functional
functionalr   utilsr   activationsr
   attention_processorr   downsamplingr   r   r   r   r   normalizationr   
upsamplingr   r   r   r   r   r   Moduler   ru   rp   r   r   r   r   r   r   r    rS   rQ   <module>r"     s      ) )      ' ,  ( NBII NbxBII xxO5<< OELL O "))  H,0bii ,0^D		 DNY")) YzQRYY QhN299 NrS   