
    +h=                        S SK JrJr  S SKrS SKJr  SSKJrJr  SSK	J
r
  SSKJr  SSKJrJrJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  \
R6                  " \5      r " S S\R<                  5      r " S S\\5      r g)    )DictUnionN   )ConfigMixinregister_to_config)logging   )FeedForward)	AttentionAttentionProcessorCogVideoXAttnProcessor2_0)&CogView3CombinedTimestepSizeEmbeddingsCogView3PlusPatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuous%CogView3PlusAdaLayerNormZeroTextImagec            	          ^  \ rS rSrSr    SS\S\S\S\4U 4S jjjrS\R                  S	\R                  S
\R                  S\R                  4S jr	Sr
U =r$ )CogView3PlusTransformerBlock"   a  
Transformer block used in [CogView](https://github.com/THUDM/CogView3) model.

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    time_embed_dim (`int`):
        The number of channels in timestep embedding.
dimnum_attention_headsattention_head_dimtime_embed_dimc                   > [         TU ]  5         [        XAS9U l        [	        UUUUSSSS[        5       S9	U l        [        R                  " USSS9U l	        [        R                  " USSS9U l
        [        XS	S
9U l        g )N)embedding_dimr   T
layer_normFư>)		query_dimheadsdim_headout_dimbiasqk_normelementwise_affineeps	processorgh㈵>)r%   r&   zgelu-approximate)r   dim_outactivation_fn)super__init__r   norm1r   r   attn1nn	LayerNormnorm2norm2_contextr
   ff)selfr   r   r   r   	__class__s        p/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_cogview3plus.pyr+   %CogView3PlusTransformerBlock.__init__1   s     	:a
%' $/1


 \\#%TJ
\\#%TR#BTU    hidden_statesencoder_hidden_statesembreturnc           
         UR                  S5      nU R                  XU5      u
  nnnnn	n
nnnnU R                  XZS9u  nnXR                  S5      U-  -   nX+R                  S5      U-  -   nU R	                  U5      nUSUS S 2S 4   -   -  US S 2S 4   -   nU R                  U5      n
U
SUS S 2S 4   -   -  US S 2S 4   -   n
[        R                  " X/SS9nU R                  U5      nXR                  S5      US S 2US 24   -  -   nX.R                  S5      US S 2S U24   -  -   nUR                  [        R                  :X  a  UR                  SS5      nUR                  [        R                  :X  a  UR                  SS5      nX4$ )N   )r8   r9   )r   i  i  )sizer,   r-   	unsqueezer0   r1   torchcatr2   dtypefloat16clip)r3   r8   r9   r:   text_seq_lengthnorm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpnorm_encoder_hidden_states
c_gate_msac_shift_mlpc_scale_mlp
c_gate_mlpattn_hidden_statesattn_encoder_hidden_states	ff_outputs                     r5   forward$CogView3PlusTransformerBlock.forwardM   s    044Q7 JJ}SA	
& :>, :D :
66 &(:(:1(=@R(RR 58L8LQ8ORl8l l "ZZ6/1yD7I3IJYWXZ^W^M__%)%7%78M%N"%?1{STVZSZG[C[%\_jklnrkr_s%s" #YY(B'W]^_GG./	%(:(:1(=	!_M]J]@^(^^ 58L8LQ8OR[\]_o`o_o\oRp8p p%--/)..vu=M &&%--7$9$>$>vu$M!33r7   )r-   r2   r,   r0   r1   )i 
  @   (      )__name__
__module____qualname____firstlineno____doc__intr+   r@   TensorrS   __static_attributes____classcell__r4   s   @r5   r   r   "   s      #%"$!VV !V  	V
 V V804||04  %||04 \\	04
 
04 04r7   r   c                     ^  \ rS rSrSrSrSS/rSS/r\           S!S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\	S\	4U 4S jjj5       r
\S\\\4   4S j5       rS\\\\\4   4   4S jr S"S\R&                  S\R&                  S\R(                  S\R&                  S\R&                  S\R&                  S\S\\R&                  \4   4S jjrS rU =r$ )#CogView3PlusTransformer2DModel   a!  
The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay
Diffusion](https://huggingface.co/papers/2403.05121).

Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    num_layers (`int`, defaults to `30`):
        The number of layers of Transformer blocks to use.
    attention_head_dim (`int`, defaults to `40`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `64`):
        The number of heads to use for multi-head attention.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `512`):
        Output dimension of timestep embeddings.
    condition_dim (`int`, defaults to `256`):
        The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
        crop_coords).
    pos_embed_max_size (`int`, defaults to `128`):
        The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
        to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
        means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
        patch_size => 128 * 8 * 2 => 2048`.
    sample_size (`int`, defaults to `128`):
        The base resolution of input latents. If height/width is not provided during generation, this value is used
        to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
Tpatch_embednormr   r   
patch_sizein_channels
num_layersr   r   out_channelstext_embed_dimr   condition_dimpos_embed_max_sizesample_sizec                   > [         TU ]  5         X`l        XT-  U l        SU	-  U l        [        UU R                  UUU
S9U l        [        UU	U R                  U R                  S9U l        [        R                  " [        U5       Vs/ s H  n[        U R                  UUUS9PM     sn5      U l        [        U R                  USSS9U l        [        R                   " U R                  X-  U R                  -  SS	9U l        SU l        g s  snf )
N   )rh   hidden_sizerg   text_hidden_sizerm   )r   rl   pooled_projection_dimtimesteps_dim)r   r   r   r   Fr   )r   conditioning_embedding_dimr%   r&   T)r#   )r*   r+   rj   	inner_dimrs   r   re   r   time_condition_embedr.   
ModuleListranger   transformer_blocksr   norm_outLinearproj_outgradient_checkpointing)r3   rg   rh   ri   r   r   rj   rk   r   rl   rm   rn   _r4   s                r5   r+   'CogView3PlusTransformer2DModel.__init__   s    	(,A &+]%:"1#!+1
 %K('"&"<"<..	%
! #%-- z* +A -(;'9#1	 +
#
 /..'5$	
 		$..*2IDL]L]2]dhi&+#'s    D
r;   c                    ^ 0 nS[         S[        R                  R                  S[        [         [
        4   4U4S jjmU R                  5        H  u  p#T" X#U5        M     U$ )z
Returns:
    `dict` of attention processors: A dictionary containing all attention processors used in the model with
    indexed by its weight name.
namemodule
processorsc                    > [        US5      (       a  UR                  5       X  S3'   UR                  5        H  u  p4T" U  SU 3XB5        M     U$ )Nget_processor
.processor.)hasattrr   named_children)r   r   r   sub_namechildfn_recursive_add_processorss        r5   r   SCogView3PlusTransformer2DModel.attn_processors.<locals>.fn_recursive_add_processors   sZ    v//282F2F2H
V:./#)#8#8#:+tfAhZ,@%T $; r7   )strr@   r.   Moduler   r   r   )r3   r   r   r   r   s       @r5   attn_processors.CogView3PlusTransformer2DModel.attn_processors   sb     
	c 	588?? 	X\]`bt]tXu 	 !//1LD'jA 2 r7   r'   c           	      d  ^ [        U R                  R                  5       5      n[        U[        5      (       a-  [        U5      U:w  a  [        S[        U5       SU SU S35      eS[        S[        R                  R                  4U4S jjmU R                  5        H  u  p4T" X4U5        M     g)	a  
Sets the attention processor to use to compute attention.

Parameters:
    processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
        The instantiated processor class or a dictionary of processor classes that will be set as the processor
        for **all** `Attention` layers.

        If `processor` is a dict, the key needs to define the path to the corresponding cross attention
        processor. This is strongly recommended when setting trainable attention processors.

z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                 
  > [        US5      (       aJ  [        U[        5      (       d  UR                  U5        O#UR                  UR	                  U  S35      5        UR                  5        H  u  p4T" U  SU 3XB5        M     g )Nset_processorr   r   )r   
isinstancedictr   popr   )r   r   r'   r   r   fn_recursive_attn_processors        r5   r   VCogView3PlusTransformer2DModel.set_attn_processor.<locals>.fn_recursive_attn_processor  ss    v//!)T22((3(($z7J)KL#)#8#8#:+tfAhZ,@%S $;r7   N)lenr   keysr   r   
ValueErrorr   r@   r.   r   r   )r3   r'   countr   r   r   s        @r5   set_attn_processor1CogView3PlusTransformer2DModel.set_attn_processor   s     D((--/0i&&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1LD'i@ 2r7   r8   r9   timesteporiginal_sizetarget_sizecrop_coordsreturn_dictc                 $   UR                   SS u  pUR                   S   n
U R                  X5      nU R                  X4XVUR                  5      nUSS2SU
24   nUSS2U
S24   n[	        U R
                  5       HR  u  p[        R                  " 5       (       a)  U R                  (       a  U R                  UUUU5      u  pMH  U" UUUS9u  pMT     U R                  X5      nU R                  U5      nU R                  R                  nX-  nX-  n	UR                  UR                   S   XU R                  X4S9n[        R                   " SU5      nUR                  UR                   S   U R                  X-  X-  4S9nU(       d  U4$ [#        US9$ )	a  
The [`CogView3PlusTransformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor`):
        Input `hidden_states` of shape `(batch size, channel, height, width)`.
    encoder_hidden_states (`torch.Tensor`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape
        `(batch_size, sequence_len, text_embed_dim)`
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    original_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    target_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    crop_coords (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]:
        The denoised latents using provided inputs as conditioning.
Nr=   )r8   r9   r:   r   )shapeznhwcpq->nchpwq)sample)r   re   rw   rB   	enumeraterz   r@   is_grad_enabledr~   _gradient_checkpointing_funcr{   r}   configrg   reshaperj   einsumr   )r3   r8   r9   r   r   r   r   r   heightwidthrE   r:   index_blockblockrg   outputs                   r5   rS   &CogView3PlusTransformer2DModel.forward  s   L &++BC0/55a8((
 ''[h[n[no -a1A/1A.A B%a)9&9:"+D,C,C"DK$$&&4+F+F7;7X7X!)	844 8="/*?844 #E m9m4 [[++
%#%-- &&q)6$:K:KZd . 
 %5}E&& &&q)4+<+<f>QSXSef ' 
 9'v66r7   )	r~   rv   r{   rj   re   rs   r}   rw   rz   )r	         rV   rU   r   i   rW      rd   rd   )T)rX   rY   rZ   r[   r\    _supports_gradient_checkpointing _skip_layerwise_casting_patterns_no_split_modulesr   r]   r+   propertyr   r   r   r   r   r   r@   r^   
LongTensorboolr   rS   r_   r`   ra   s   @r5   rc   rc      s    D (,$(5v'>$79QR "$#%"! "%9,9, 9, 	9,
  9, !9, 9, 9, 9, 9,  9, 9, 9,v c+=&=!>  0 AE2Dd3PbKbFc2c,d  AT !S7||S7  %||S7 ""	S7
 ||S7 \\S7 \\S7 S7 
u||55	6S7 S7r7   rc   )!typingr   r   r@   torch.nnr.   configuration_utilsr   r   utilsr   	attentionr
   attention_processorr   r   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrX   loggerr   r   rc    r7   r5   <module>r      s`        B  # Z Z W 7 ' Y 
		H	%[4299 [4|r7Z r7r7   