
    +h                        S SK r S SKJrJr  S SKrS SKJr  SSKJrJr  SSK	J
r
  SSKJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJr  \R2                  " \5      rS rSS jr " S S\R<                  5      r " S S\R<                  5      r  " S S\R<                  5      r! " S S\R<                  5      r" " S S\
\5      r# " S S\
\5      r$g)    N)OptionalUnion)nn   )ConfigMixinregister_to_config)
ModelMixin)FeedForward)	Attention)TimestepEmbedding	Timestepsget_2d_sincos_pos_embed)Transformer2DModelOutput)AdaLayerNorm)loggingc                    S nXSU-  -
  :  d  XSU-  -   :  a  [         R                  S5        [        R                  " 5          U" X1-
  U-  5      nU" XA-
  U-  5      nU R	                  SU-  S-
  SU-  S-
  5        U R                  5         U R                  U[        R                  " S5      -  5        U R                  U5        U R                  X4S9  U sS S S 5        $ ! , (       d  f       g = f)Nc                 h    S[         R                  " U [         R                  " S5      -  5      -   S-  $ )N      ?       @)matherfsqrt)xs    g/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/pipelines/unidiffuser/modeling_uvit.pynorm_cdf(_no_grad_trunc_normal_.<locals>.norm_cdf   s(    dhhq499S>122c99       zjmean is more than 2 std from [a, b] in nn.init.trunc_normal_. The distribution of values may be incorrect.   r   )minmax)loggerwarningtorchno_graduniform_erfinv_mul_r   r   add_clamp_)tensormeanstdabr   lus           r   _no_grad_trunc_normal_r2      s    : 	1s7{1s7{ 2;	

 
 ah#%&ah#%& 	A	1q519- 	 	C$))C.()D 	!#+ 
s   BC  
C.c                     [        XX#U5      $ )a  Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean},
\text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for
generating the random values works best when :math:`a \leq \text{mean} \leq b`.

Args:
    tensor: an n-dimensional `torch.Tensor`
    mean: the mean of the normal distribution
    std: the standard deviation of the normal distribution
    a: the minimum cutoff value
    b: the maximum cutoff value
Examples:
    >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w)
)r2   )r+   r,   r-   r.   r/   s        r   trunc_normal_r4   9   s      "&::r   c                   H   ^  \ rS rSrSr         SU 4S jjrS rSrU =r$ )
PatchEmbedL   z2D Image to Patch Embeddingc
                   > [         TU ]  5         X-  X#-  -  n
Xpl        X`l        [        R
                  " XEX34X8S9U l        U(       a  [        R                  " USSS9U l        OS U l        Xl	        U R                  (       aF  [        U[        U
S-  5      SS9nU R                  SUR                  5       R                  S	5      SS
9  g g )N)kernel_sizestridebiasFgư>)elementwise_affineeps      ?pt)output_type	pos_embedr   )
persistent)super__init__flatten
layer_normr   Conv2dproj	LayerNormnormuse_pos_embedr   intregister_bufferfloat	unsqueeze)selfheightwidth
patch_sizein_channels	embed_dimrF   rE   r;   rK   num_patchesrA   	__class__s               r   rD   PatchEmbed.__init__O   s     	+0CD$II0HQ[
	 Y5dSDIDI*/	3{C?O;P^bcI  ioo.?.I.I!.LY^ _ r   c                    U R                  U5      nU R                  (       a!  UR                  S5      R                  SS5      nU R                  (       a  U R	                  U5      nU R
                  (       a  XR                  -   $ U$ )Nr   r   )rH   rE   	transposerF   rJ   rK   rA   )rP   latents     r   forwardPatchEmbed.forwardn   sb    6"<<^^A&00A6F??YYv&FNN**Mr   )rE   rF   rJ   rH   rK   )	   r^      r      FTTT)	__name__
__module____qualname____firstlineno____doc__rD   r\   __static_attributes____classcell__rW   s   @r   r6   r6   L   s3    % `>	 	r   r6   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )	SkipBlockz   dimc                    > [         TU ]  5         [        R                  " SU-  U5      U l        [        R
                  " U5      U l        g )Nr   )rC   rD   r   Linearskip_linearrI   rJ   )rP   rl   rW   s     r   rD   SkipBlock.__init__{   s7    99QWc2 LL%	r   c                 r    U R                  [        R                  " X/SS95      nU R                  U5      nU$ )Nrl   )ro   r$   catrJ   )rP   r   skips      r   r\   SkipBlock.forward   s1    UYYyb9:IIaLr   )rJ   ro   )	ra   rb   rc   rd   rL   rD   r\   rf   rg   rh   s   @r   rj   rj   z   s    &C & r   rj   c                      ^  \ rS rSrSr            SS\S\S\S\\   S\S\\   S	\S
\S\S\S\S\S\S\4U 4S jjjr	      SS jr
SrU =r$ )UTransformerBlock   a  
A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations.

Parameters:
    dim (`int`): The number of channels in the input and output.
    num_attention_heads (`int`): The number of heads to use for multi-head attention.
    attention_head_dim (`int`): The number of channels in each head.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    activation_fn (`str`, *optional*, defaults to `"geglu"`):
        Activation function to be used in feed-forward.
    num_embeds_ada_norm (:obj: `int`, *optional*):
        The number of diffusion steps used during training. See `Transformer2DModel`.
    attention_bias (:obj: `bool`, *optional*, defaults to `False`):
        Configure if the attentions should contain a bias parameter.
    only_cross_attention (`bool`, *optional*):
        Whether to use only cross-attention layers. In this case two cross attention layers are used.
    double_self_attention (`bool`, *optional*):
        Whether to use two self-attention layers. In this case no cross attention layers are used.
    upcast_attention (`bool`, *optional*):
        Whether to upcast the query and key to float32 when performing the attention calculation.
    norm_elementwise_affine (`bool`, *optional*):
        Whether to use learnable per-element affine parameters during layer normalization.
    norm_type (`str`, defaults to `"layer_norm"`):
        The layer norm implementation to use.
    pre_layer_norm (`bool`, *optional*):
        Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
        as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g.
        `pre_layer_norm = True`.
    final_dropout (`bool`, *optional*):
        Whether to use a final Dropout layer after the feedforward network.
rl   num_attention_headsattention_head_dimcross_attention_dimactivation_fnnum_embeds_ada_normattention_biasonly_cross_attentiondouble_self_attentionupcast_attentionnorm_elementwise_affine	norm_typepre_layer_normfinal_dropoutc           
        > [         TU ]  5         Xl        US L=(       a    US:H  U l        Xl        US;   a  Uc  [        SU SU S35      e[        UUUUUU	(       a  UOS US9U l        Uc  U
(       a  [        UU
(       d  UOS UUUUUS9U l        OS U l        U R                  (       a  [        X5      U l
        O[        R                  " XS9U l
        Uc  U
(       a6  U R                  (       a  [        X5      O[        R                  " XS9U l        OS U l        [        R                  " XS9U l        [        XXoS	9U l        g 
Nada_norm)r   ada_norm_zeroz`norm_type` is set to zw, but `num_embeds_ada_norm` is not defined. Please make sure to define `num_embeds_ada_norm` if setting `norm_type` to .)	query_dimheadsdim_headdropoutr;   r|   r   )r   r|   r   r   r   r;   r   )r<   )r   r}   r   rC   rD   r   use_ada_layer_normr   
ValueErrorr   attn1attn2r   norm1r   rI   norm2norm3r
   ffrP   rl   rz   r{   r   r|   r}   r~   r   r   r   r   r   r   r   r   rW   s                   r   rD   UTransformerBlock.__init__   N   $ 	$8!#6d#B"_	U_H_,55:M:U( 4KKT+UVX  %'7K 3QU-

 *.C"?T$7Z^)+#!1DJ DJ""%c?DJcVDJ*.C ** S6\\#R J DJ \\#R
c-mr   c                    U R                   (       a5  U R                  (       a  U R                  X5      nOU R                  U5      nOUnUb  UO0 nU R                  " U4U R                  (       a  UOS US.UD6n	U R                   (       d4  U R                  (       a  U R                  X5      n	OU R                  U	5      n	X-   nU R
                  b  U R                   (       a4  U R                  (       a  U R                  X5      OU R                  U5      nOUnU R
                  " U4UUS.UD6n	U R                   (       d3  U R                  (       a  U R                  X5      OU R                  U	5      n	X-   nU R                   (       a  U R                  U5      nOUnU R                  U5      n
U R                   (       d  U R                  U
5      n
X-   nU$ N)encoder_hidden_statesattention_mask	r   r   r   r   r   r   r   r   r   )rP   hidden_statesr   r   encoder_attention_masktimestepcross_attention_kwargsclass_labelsnorm_hidden_statesattn_output	ff_outputs              r   r\   UTransformerBlock.forward   s    &&%)ZZ%H"%)ZZ%>"!. <R;]!7cejj
;?;T;T"7Z^)
 %	
 ""&&"jj?"jj5#3::!"";?;R;RDJJ}7X\XbXbcpXq # &3"
 **"&;5 )	K &&CGCZCZdjj?`d`j`jkv`w'7M !%M!:!.GG./	 ""

9-I!1r   	r   r   r   r   r   r   r   r   r   )        NgegluNFFFFTrF   TFNNNNNNra   rb   rc   rd   re   rL   r   strboolrD   r\   rf   rg   rh   s   @r   rx   rx      s   L -1$-1$%*&+!&(,%##!KnKn !Kn  	Kn &c]Kn Kn &c]Kn Kn #Kn  $Kn Kn "&Kn Kn Kn  !Kn Kn` "##M Mr   rx   c                      ^  \ rS rSrSr            SS\S\S\S\\   S\S\\   S	\S
\S\S\S\S\S\S\4U 4S jjjr	      SS jr
SrU =r$ )UniDiffuserBlockiN  a  
A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the
LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser
implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104).

Parameters:
    dim (`int`): The number of channels in the input and output.
    num_attention_heads (`int`): The number of heads to use for multi-head attention.
    attention_head_dim (`int`): The number of channels in each head.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
    activation_fn (`str`, *optional*, defaults to `"geglu"`):
        Activation function to be used in feed-forward.
    num_embeds_ada_norm (:obj: `int`, *optional*):
        The number of diffusion steps used during training. See `Transformer2DModel`.
    attention_bias (:obj: `bool`, *optional*, defaults to `False`):
        Configure if the attentions should contain a bias parameter.
    only_cross_attention (`bool`, *optional*):
        Whether to use only cross-attention layers. In this case two cross attention layers are used.
    double_self_attention (`bool`, *optional*):
        Whether to use two self-attention layers. In this case no cross attention layers are used.
    upcast_attention (`bool`, *optional*):
        Whether to upcast the query and key to float() when performing the attention calculation.
    norm_elementwise_affine (`bool`, *optional*):
        Whether to use learnable per-element affine parameters during layer normalization.
    norm_type (`str`, defaults to `"layer_norm"`):
        The layer norm implementation to use.
    pre_layer_norm (`bool`, *optional*):
        Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
        as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
        (`pre_layer_norm = False`).
    final_dropout (`bool`, *optional*):
        Whether to use a final Dropout layer after the feedforward network.
rl   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   c           
        > [         TU ]  5         Xl        US L=(       a    US:H  U l        Xl        US;   a  Uc  [        SU SU S35      e[        UUUUUU	(       a  UOS US9U l        Uc  U
(       a  [        UU
(       d  UOS UUUUUS9U l        OS U l        U R                  (       a  [        X5      U l
        O[        R                  " XS9U l
        Uc  U
(       a6  U R                  (       a  [        X5      O[        R                  " XS9U l        OS U l        [        R                  " XS9U l        [        XXoS	9U l        g r   r   r   s                   r   rD   UniDiffuserBlock.__init__r  r   r   c                    U R                   (       a4  U R                  (       a  U R                  X5      nOU R                  U5      nUb  UO0 nU R                  " U4U R                  (       a  UOS US.UD6nX-   nU R                   (       d4  U R                  (       a  U R                  X5      nOU R                  U5      nU R
                  b  U R                   (       a3  U R                  (       a  U R                  X5      OU R                  U5      nU R
                  " U4UUS.UD6nX-   nU R                   (       d3  U R                  (       a  U R                  X5      OU R                  U5      nU R                   (       a  U R                  U5      nU R                  U5      n	X-   nU R                   (       d  U R                  U5      nU$ r   r   )
rP   r   r   r   r   r   r   r   r   r   s
             r   r\   UniDiffuserBlock.forward  s    && $

= C $

= 9 <R;]!7cejj
;?;T;T"7Z^)
 %	
 $3
 ""&& $

= C $

= 9::!"";?;R;RDJJ}7X\XbXbcpXq  **&;5 )	K (7M &&;?;R;RDJJ}7X\XbXbcpXq   JJ}5MGGM*	!1 "" JJ}5Mr   r   )r   Nr   NFFFFTrF   FTr   r   rh   s   @r   r   r   N  s   !P -1$-1$%*&+!&(,%$"!KnKn !Kn  	Kn &c]Kn Kn &c]Kn Kn #Kn  $Kn Kn "&Kn Kn Kn  !Kn Kn` "##M Mr   r   c            .         ^  \ rS rSrSr\                       SS\S\S\\   S\\   S\S\S	\S
\\   S\	S\\   S\\   S\\   S\
S\\   S\	S\	S\	S\
S\
S\	S\	S\	4,U 4S jjj5       r       S S\	S\	S\	4S jjrSrU =r$ )!UTransformer2DModeli  a  
Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared
to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion,
similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`]
layer and then reshaped to (b, t, d).

Parameters:
    num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
    attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
    in_channels (`int`, *optional*):
        Pass if the input is continuous. The number of channels in the input.
    out_channels (`int`, *optional*):
        The number of output channels; if `None`, defaults to `in_channels`.
    num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    norm_num_groups (`int`, *optional*, defaults to `32`):
        The number of groups to use when performing Group Normalization.
    cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
    attention_bias (`bool`, *optional*):
        Configure if the TransformerBlocks' attention should contain a bias parameter.
    sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
        Note that this is fixed at training time as it is used for learning a number of position embeddings. See
        `ImagePositionalEmbeddings`.
    num_vector_embeds (`int`, *optional*):
        Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
        Includes the class for the masked latent pixel.
    patch_size (`int`, *optional*, defaults to 2):
        The patch size to use in the patch embedding.
    activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
    num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
        The number of diffusion steps used during training. Note that this is fixed at training time as it is used
        to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
        up to but not more than steps than `num_embeds_ada_norm`.
    use_linear_projection (int, *optional*): TODO: Not used
    only_cross_attention (`bool`, *optional*):
        Whether to use only cross-attention layers. In this case two cross attention layers are used in each
        transformer block.
    upcast_attention (`bool`, *optional*):
        Whether to upcast the query and key to float() when performing the attention calculation.
    norm_type (`str`, *optional*, defaults to `"layer_norm"`):
        The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
    block_type (`str`, *optional*, defaults to `"unidiffuser"`):
        The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
        backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
        behavior in `diffusers`.)
    pre_layer_norm (`bool`, *optional*):
        Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
        as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
        (`pre_layer_norm = False`).
    norm_elementwise_affine (`bool`, *optional*):
        Whether to use learnable per-element affine parameters during layer normalization.
    use_patch_pos_embed (`bool`, *optional*):
        Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
    final_dropout (`bool`, *optional*):
        Whether to use a final Dropout layer after the feedforward network.
rz   r{   rT   out_channels
num_layersr   norm_num_groupsr|   r   sample_sizenum_vector_embedsrS   r}   r~   use_linear_projectionr   r   r   
block_typer   r   ff_final_dropoutc                   > [         TU ]  5         Xl        Xl        X l        X-  nUb  Uc   S5       eU
c   S5       eXl        Xl        Xl        [        U
U
UUUUS9U l	        US:X  a  [        nO[        n[        R                  " [        US-  5       Vs/ s H  nU" UUUUUUUU	UUUUUUS9PM     sn5      U l        U" UUUUUUUU	UUUUUUS9U l        [        R                  " [        US-  5       Vs/ s H6  n[        R"                  " [%        U5      U" UUUUUUUU	UUUUUUS9S.5      PM8     sn5      U l        Uc  UOUU l        [        R*                  " U5      U l        g s  snf s  snf )Nz0Patch input requires in_channels and patch_size.z?UTransformer2DModel over patched input must provide sample_sizerQ   rR   rS   rT   rU   rK   unidiffuserr   )r   r|   r}   r~   r   r   r   r   r   r   r   )ru   block)rC   rD   r   rz   r{   rQ   rR   rS   r6   rA   r   rx   r   
ModuleListrangetransformer_in_blockstransformer_mid_block
ModuleDictrj   transformer_out_blocksr   rI   norm_out)rP   rz   r{   rT   r   r   r   r   r|   r   r   r   rS   r}   r~   r   r   r   r   r   r   r   use_patch_pos_embedr   	inner_dim	block_clsdrW   s                              r   rD   UTransformer2DModel.__init__M  s   6 	%:"#6 "4'<	 &:+AuCuuA&i(ii& " 
$#!#-
 &(I)I%']]$ zQ/#" 0A! '&#(;"/(;#1)=%5'#1,C"2  0#&
", &/ 3' 3)!5-)$;*&
"& ')mm2 zQ/10 0A/  )%! "+%/.$+0C*70C+91E-=&/+94K*:"	. 01'
#< ,8+?K\ Y/URs   E'6=E,return_dicthidden_states_is_embedding
unpatchifyc	           	         U(       d  U(       a  [        SU SU SU S35      eU(       d  U R                  U5      n/ n	U R                   H  n
U
" UUUUUS9nU	R                  U5        M      U R	                  U5      nU R
                   H)  nUS   " XR                  5       5      nUS   " UUUUUS9nM+     U R                  U5      nU(       a  [        UR                  S   S	-  5      =pUR                  S
XU R                  U R                  U R                  4S9n[        R                  " SU5      nUR                  S
U R                  XR                  -  XR                  -  4S9nOUnU(       d  U4$ [        US9$ )a  
Args:
    hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
        When continuous, `torch.Tensor` of shape `(batch size, channel, height, width)`): Input hidden_states
    encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
        Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
        self-attention.
    timestep ( `torch.long`, *optional*):
        Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
    class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
        Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels
        conditioning.
    cross_attention_kwargs (*optional*):
        Keyword arguments to supply to the cross attention layers, if used.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
        tuple.
    hidden_states_is_embedding (`bool`, *optional*, defaults to `False`):
        Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will
        ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the
        transformer blocks.
    unpatchify (`bool`, *optional*, defaults to `True`):
        Whether to unpatchify the transformer output.

Returns:
    [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
    [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When
    returning a tuple, the first element is the sample tensor.
z!Cannot both define `unpatchify`: z and `return_dict`: z since when `unpatchify` is zy the returned output is of shape (batch_size, seq_len, hidden_dim) rather than (batch_size, num_channels, height, width).)r   r   r   r   ru   r   r   r>   rr   shapenhwpqc->nchpwq)sample)r   rA   r   appendr   r   popr   rL   r   reshaperS   r   r$   einsumr   )rP   r   r   r   r   r   r   r   r   skipsin_block	out_blockrQ   rR   outputs                  r   r\   UTransformer2DModel.forward  s   T k3J<?ST_S` a$$.< 0JJ  * NN=9M
 22H$&;!'=)M LL' 3 22=A 44I%f-mYY[IM%g.&;!'=)M 5 m4  !4!4Q!73!>??F)116$//4??DL]L]^ 2 M "LL)9=IM"**4,,f.FP_P_H_` + F #F9'v66r   )r{   rQ   r   rz   r   rS   rA   r   r   r   r   rR   )r_   X   NNr   r       NFNNr   r   NFFFrF   r   FTFF)NNNNTFT)ra   rb   rc   rd   re   r   rL   r   rN   r   r   rD   r\   rf   rg   rh   s   @r   r   r     s   7r  $&"$%)&*!-1$%)+/$%$-1&+%*!&%'$(,!!&1H0 H0  H0 c]	H0
 smH0 H0 H0 H0 &c]H0 H0 c]H0 $C=H0 SMH0 H0 &c]H0   $!H0" ##H0$ %H0& 'H0( )H0* +H0, "&-H00 1H0 H0Z ## +0f7 f7 %)f7 f7 f7r   r   c            6         ^  \ rS rSrSr\                            S'S\S\S\S\S\S\\   S	\\   S
\S\S\S\\   S\	S\\   S\\   S\\   S\
S\\   S\	S\	S\	S\
S\
S\	S\	S\	S\	44U 4S jjj5       r\R                  R                  S 5       r   S(S\R                   S \R                   S!\R                   S"\\R                   \\4   S#\\R                   \\4   S$\\\R                   \\4      4S% jjrS&rU =r$ ))UniDiffuserModeliA  a  
Transformer model for a image-text [UniDiffuser](https://huggingface.co/papers/2303.06555) model. This is a
modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the
CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details).

Parameters:
    text_dim (`int`): The hidden dimension of the CLIP text model used to embed images.
    clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed prompts.
    num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
    attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
    in_channels (`int`, *optional*):
        Pass if the input is continuous. The number of channels in the input.
    out_channels (`int`, *optional*):
        The number of output channels; if `None`, defaults to `in_channels`.
    num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    norm_num_groups (`int`, *optional*, defaults to `32`):
        The number of groups to use when performing Group Normalization.
    cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
    attention_bias (`bool`, *optional*):
        Configure if the TransformerBlocks' attention should contain a bias parameter.
    sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
        Note that this is fixed at training time as it is used for learning a number of position embeddings. See
        `ImagePositionalEmbeddings`.
    num_vector_embeds (`int`, *optional*):
        Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
        Includes the class for the masked latent pixel.
    patch_size (`int`, *optional*, defaults to 2):
        The patch size to use in the patch embedding.
    activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
    num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
        The number of diffusion steps used during training. Note that this is fixed at training time as it is used
        to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
        up to but not more than steps than `num_embeds_ada_norm`.
    use_linear_projection (int, *optional*): TODO: Not used
    only_cross_attention (`bool`, *optional*):
        Whether to use only cross-attention layers. In this case two cross attention layers are used in each
        transformer block.
    upcast_attention (`bool`, *optional*):
        Whether to upcast the query and key to float32 when performing the attention calculation.
    norm_type (`str`, *optional*, defaults to `"layer_norm"`):
        The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`.
    block_type (`str`, *optional*, defaults to `"unidiffuser"`):
        The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual
        backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard
        behavior in `diffusers`.)
    pre_layer_norm (`bool`, *optional*):
        Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"),
        as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm
        (`pre_layer_norm = False`).
    norm_elementwise_affine (`bool`, *optional*):
        Whether to use learnable per-element affine parameters during layer normalization.
    use_patch_pos_embed (`bool`, *optional*):
        Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`).
    ff_final_dropout (`bool`, *optional*):
        Whether to use a final Dropout layer after the feedforward network.
    use_data_type_embedding (`bool`, *optional*):
        Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1
        is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type`
        argument, which can either be `1` to use the weights trained on non-publically-available data or `0`
        otherwise. This argument is subsequently embedded by the data type embedding, if used.
text_dimclip_img_dimnum_text_tokensrz   r{   rT   r   r   r   r   r|   r   r   r   rS   r}   r~   r   r   r   r   r   r   r   r   use_data_type_embeddingc           	        > [         TU ]  5         XE-  U l        Uc   S5       eXl        X`l        Uc  UOUU l        Xl        U R                  U-  U R                  U-  -  U l        [        UUUUU R                  US9U l	        [        R                  " X R                  5      U l        [        R                  " XR                  5      U l        [        U R                  SSS9U l        U(       a,  [!        U R                  SU R                  -  U R                  S9O[        R"                  " 5       U l        [        U R                  SSS9U l        U(       a,  [!        U R                  SU R                  -  U R                  S9O[        R"                  " 5       U l        X0l        SU-   S	-   U R                  -   U l        [        R.                  " [0        R2                  " S	U R,                  U R                  5      5      U l        [        R6                  " U	S
9U l        [;        U R4                  SS9  UU l        U R<                  (       aa  [        R>                  " SU R                  5      U l         [        R.                  " [0        R2                  " S	S	U R                  5      5      U l!        [E        S$0 SU_SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_SU_S U_S!U_S"U_S#U_6U l#        US-  U-  n[        R                  " U R                  U5      U l$        [        R                  " U R                  U5      U l%        [        R                  " U R                  U5      U l&        g )%Nz<UniDiffuserModel over patched input must provide sample_sizer   Tr   )flip_sin_to_cosdownscale_freq_shift   )out_dimr   r   )pg{Gz?)r-   rz   r{   rT   r   r   r   r   r|   r   r   r   rS   r}   r~   r   r   r   r   r   r   r   r   r    )'rC   rD   r   r   rT   r   rS   rV   r6   
vae_img_inr   rn   clip_img_intext_inr   timestep_img_projr   Identitytimestep_img_embedtimestep_text_projtimestep_text_embedr   
num_tokens	Parameterr$   zerosrA   Dropoutpos_embed_dropr4   r   	Embeddingdata_type_token_embeddingdata_type_pos_embed_tokenr   transformervae_img_outclip_img_outtext_out)rP   r   r   r   rz   r{   rT   r   r   r   r   r|   r   r   r   rS   r}   r~   r   r   r   r   r   r   use_timestep_embeddingr   r   r   r   	patch_dimrW   s                                 r   rD   UniDiffuserModel.__init__  sk   @ 	 -A&f(ff&&&+7+?K\$ ,,
:t?O?OS]?]^
 %!#nn-
 99\>>Byy>>: "+NN !""
 & DNN"  	 #,NN !"#
 & DNN"  	   //1A58H8HHekk!T__dnn&UV jj73dnn$/ (?$''-/\\!T^^-LD*-/\\%++aDNN:[-\D* / 
 3
1
 $
 &	

 "
 
 ,
 !4
 *
 $
 0
 "
 (
 !4
 #8
  "6!
" .#
$  %
& "'
( *)
* %<+
, !4-
. ./
6  ]l2	99T^^Y?IIdnnlC		$..(;r   c                     S1$ )NrA   r   )rP   s    r   no_weight_decay UniDiffuserModel.no_weight_decay
  s
    }r   latent_image_embedsimage_embedsprompt_embedstimestep_imgtimestep_text	data_typec	                 	   UR                   S   n	U R                  U5      n
U R                  U5      nU R                  U5      nUR	                  S5      U
R	                  S5      p[
        R                  " U5      (       d/  [
        R                  " U/[
        R                  U
R                  S9nU[
        R                  " XR                  UR                  S9-  nU R                  U5      nUR                  U R                  S9nU R                  U5      nUR                  SS9n[
        R                  " U5      (       d/  [
        R                  " U/[
        R                  U
R                  S9nU[
        R                  " XR                  UR                  S9-  nU R!                  U5      nUR                  U R                  S9nU R#                  U5      nUR                  SS9nU R$                  (       a  Uc   S5       e[
        R                  " U5      (       d/  [
        R                  " U/[
        R&                  U
R                  S9nU[
        R                  " XR                  UR                  S9-  nU R)                  U5      R                  SS9n[
        R*                  " UUUUUU
/SS9nO[
        R*                  " UUXU
/SS9nU R$                  (       aO  [
        R*                  " U R,                  SS2SS2SS24   U R.                  U R,                  SS2SS2SS24   /SS9nOU R,                  nUU-   nU R1                  U5      nU R3                  UUSSUS	S
S	S9S   nU R$                  (       a  UR5                  SSSUSU4SS9u  nnnnnnOUR5                  SSUSU4SS9u  nnnnnU R7                  U5      n['        UR                   S   S-  5      =nnUR9                  SUUU R:                  U R:                  U R<                  4S9n[
        R>                  " SU5      nUR9                  SU R<                  UU R:                  -  UU R:                  -  4S9nU RA                  U5      nU RC                  U5      nUUU4$ )a  
Args:
    latent_image_embeds (`torch.Tensor` of shape `(batch size, latent channels, height, width)`):
        Latent image representation from the VAE encoder.
    image_embeds (`torch.Tensor` of shape `(batch size, 1, clip_img_dim)`):
        CLIP-embedded image representation (unsqueezed in the first dimension).
    prompt_embeds (`torch.Tensor` of shape `(batch size, seq_len, text_dim)`):
        CLIP-embedded text representation.
    timestep_img (`torch.long` or `float` or `int`):
        Current denoising step for the image.
    timestep_text (`torch.long` or `float` or `int`):
        Current denoising step for the text.
    data_type: (`torch.int` or `float` or `int`, *optional*, defaults to `1`):
        Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data,
        or `0` otherwise.
    encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
        Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
        self-attention.
    cross_attention_kwargs (*optional*):
        Keyword arguments to supply to the cross attention layers, if used.


Returns:
    `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is tbe VAE
    image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text
    embedding.
r   r   )dtypedevice)r  rs   NzBdata_type must be supplied if the model uses a data type embeddingr   FT)r   r   r   r   r   r   r   r>   rr   r   r   )"r   r   r   r   sizer$   	is_tensorr+   longr  onesr  r   tor   rO   r   r   r   rL   r   rt   rA   r   r   r  splitr  r   rS   r   r   r  r  )rP   r  r  r  r  r  r  r   r   
batch_sizevae_hidden_statesclip_hidden_statestext_hidden_statesr   num_img_tokenstimestep_img_tokentimestep_text_tokendata_type_tokenr   rA   t_img_token_outt_text_token_outdata_type_token_outr  img_clip_outimg_vae_outrQ   rR   s                               r   r\   UniDiffuserModel.forward  s   L )..q1
 !OO,?@!--l;!\\-8*<*A*A!*DFWF\F\]^F_ |,, <<ejjQbQiQijL $ejjCUCU^j^q^q&rr!33LA 0222D!445GH/99a9@ }--!LL-

SdSkSklM &

:EXEXanauau(vv"55mD 2444::4F"667JK1;;;B ''(n*nn(??9--!LL)EIIN_NfNfg	 "EJJzYbYiYi$jjI"<<YGQQVWQXO!II&'#&&% 
M "II#%8:LbstM ''		7U7A.0N0NPTP^P^_`bgbikl_lPmntuI I%	1++M: (("7#9'+ ) 	
 	 '' ##Q1oq.$QWX#Y # VcUhUhA>: Vi VRO-x{ &&{3 [..q1S899!))vudootHYHYZ * 
 ll#3[A!))t((&4??*BEDOOD[\ * 
 ((6==*L(22r   )r   r  r   r   rT   r   rV   r   r   r   rS   rA   r   r   r   r  r   r   r   r   r  r   r   r  )r`   i   M   r_   r   NNr   r   r   NFNNNr   NFFFrF   r   FFTFTF)r   NN)ra   rb   rc   rd   re   r   rL   r   rN   r   r   rD   r$   jitignorer	  Tensorr   r\   rf   rg   rh   s   @r   r   r   A  sf   =~  !#%"$%)&*!-1$%)+/$($-1&+%*!&%'$$(,!!%(-;F<F< F< 	F<
 !F<  F< c]F< smF< F< F< F< &c]F< F< c]F< $C=F<  SM!F<" #F<$ &c]%F<&  $'F<( #)F<* +F<, -F<. /F<0 1F<4 "&5F<8 9F<: "&;F< F<P YY  @A"#_3"\\_3 ll_3 ||	_3
 ELL%45_3 U\\5#56_3 E%,,s":;<_3 _3r   r   )r   r   g       r   )%r   typingr   r   r$   r   configuration_utilsr   r   modelsr	   models.attentionr
   models.attention_processorr   models.embeddingsr   r   r   models.modeling_outputsr   models.normalizationr   utilsr   
get_loggerra   r"   r2   r4   Moduler6   rj   rx   r   r   r   r   r   r   <module>r7     s     "   B   + 3 V V ? 0  
		H	%"J;&+ +\		 &|		 |B~ryy ~Jk7*k k7\	l3z; l3r   