ó
    +ýòhÃ=  ã                   ó  • S SK JrJr  S SKrS SKJr  SSKJrJr  SSK	J
r
  SSKJr  SSKJrJrJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  \
R6                  " \5      r " S S\R<                  5      r " S S\\5      r g)é    )ÚDictÚUnionNé   )ÚConfigMixinÚregister_to_config)Úloggingé   )ÚFeedForward)Ú	AttentionÚAttentionProcessorÚCogVideoXAttnProcessor2_0)Ú&CogView3CombinedTimestepSizeEmbeddingsÚCogView3PlusPatchEmbed)ÚTransformer2DModelOutput)Ú
ModelMixin)ÚAdaLayerNormContinuousÚ%CogView3PlusAdaLayerNormZeroTextImagec            	       ó¶   ^ • \ rS rSrSr    SS\S\S\S\4U 4S jjjrS\R                  S	\R                  S
\R                  S\R                  4S jr	Sr
U =r$ )ÚCogView3PlusTransformerBlocké"   aœ  
Transformer block used in [CogView](https://github.com/THUDM/CogView3) model.

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    time_embed_dim (`int`):
        The number of channels in timestep embedding.
ÚdimÚnum_attention_headsÚattention_head_dimÚtime_embed_dimc                 ó  >• [         TU ]  5         [        XAS9U l        [	        UUUUSSSS[        5       S9	U l        [        R                  " USSS9U l	        [        R                  " USSS9U l
        [        XS	S
9U l        g )N)Úembedding_dimr   TÚ
layer_normFçíµ ÷Æ°>)	Ú	query_dimÚheadsÚdim_headÚout_dimÚbiasÚqk_normÚelementwise_affineÚepsÚ	processorgñhãˆµøä>)r%   r&   zgelu-approximate)r   Údim_outÚactivation_fn)ÚsuperÚ__init__r   Únorm1r   r   Úattn1ÚnnÚ	LayerNormÚnorm2Únorm2_contextr
   Úff)Úselfr   r   r   r   Ú	__class__s        €Úp/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_cogview3plus.pyr+   Ú%CogView3PlusTransformerBlock.__init__1   s€   ø€ ô 	‰ÑÔä:ÈÑaˆŒ
äØØ%Ø'ØØØ Ø$ØÜ/Ó1ñ

ˆŒ
ô —\’\ #¸%ÀTÑJˆŒ
ÜŸ\š\¨#À%ÈTÑRˆÔä #ÐBTÑUˆó    Úhidden_statesÚencoder_hidden_statesÚembÚreturnc           
      ó  • UR                  S5      nU R                  XU5      u
  nnnnn	n
nnnnU R                  XZS9u  nnXR                  S5      U-  -   nX+R                  S5      U-  -   nU R	                  U5      nUSUS S 2S 4   -   -  US S 2S 4   -   nU R                  U5      n
U
SUS S 2S 4   -   -  US S 2S 4   -   n
[        R                  " X¥/SS9nU R                  U5      nXR                  S5      US S 2US 24   -  -   nX.R                  S5      US S 2S U24   -  -   nUR                  [        R                  :X  a  UR                  SS5      nUR                  [        R                  :X  a  UR                  SS5      nX4$ )Né   )r8   r9   )r   i  ÿÿiàÿ  )Úsizer,   r-   Ú	unsqueezer0   r1   ÚtorchÚcatr2   ÚdtypeÚfloat16Úclip)r3   r8   r9   r:   Útext_seq_lengthÚnorm_hidden_statesÚgate_msaÚ	shift_mlpÚ	scale_mlpÚgate_mlpÚnorm_encoder_hidden_statesÚ
c_gate_msaÚc_shift_mlpÚc_scale_mlpÚ
c_gate_mlpÚattn_hidden_statesÚattn_encoder_hidden_statesÚ	ff_outputs                     r5   ÚforwardÚ$CogView3PlusTransformerBlock.forwardM   sá  € ð 0×4Ñ4°QÓ7ˆð J‰J}¸SÓAñ	
ØØØØØØ&ØØØØð :>¿¹Ø,ð :Dð :
Ñ6ÐÐ6ð &×(:Ñ(:¸1Ó(=Ð@RÑ(RÑRˆØ 5×8LÑ8LÈQÓ8OÐRlÑ8lÑ lÐð "ŸZ™Z¨Ó6ÐØ/°1°yÂÀDÀÑ7IÑ3IÑJÈYÒWXÐZ^ÐW^ÑM_Ñ_Ðà%)×%7Ñ%7Ð8MÓ%NÐ"Ø%?À1À{ÒSTÐVZÐSZÑG[ÑC[Ñ%\Ð_jÒklÐnrÐkrÑ_sÑ%sÐ"ô #ŸYšYÐ(BÐ'WÐ]^Ñ_ÐØ—G‘GÐ.Ó/ˆ	à%×(:Ñ(:¸1Ó(=À	Ê!È_ÑM]ÐJ]Ñ@^Ñ(^Ñ^ˆØ 5×8LÑ8LÈQÓ8OÐR[Ò\]Ð_oÐ`oÐ_oÐ\oÑRpÑ8pÑ pÐà×Ñ¤%§-¡-Ó/Ø)×.Ñ.¨v°uÓ=ˆMØ ×&Ñ&¬%¯-©-Ó7Ø$9×$>Ñ$>¸vÀuÓ$MÐ!ØÐ3Ð3r7   )r-   r2   r,   r0   r1   )i 
  é@   é(   é   )Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Úintr+   r@   ÚTensorrS   Ú__static_attributes__Ú__classcell__©r4   s   @r5   r   r   "   sŒ   ø† ñð  Ø#%Ø"$Ø!ñVàðVð !ðVð  ð	Vð
 ÷Vð Vð804à—|‘|ð04ð  %Ÿ|™|ð04ð \‰\ð	04ð
 
‰÷04ò 04r7   r   c                   óž  ^ • \ rS rSrSrSrSS/rSS/r\           S!S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\	S\	4U 4S jjj5       r
\S\\\4   4S j5       rS\\\\\4   4   4S jr S"S\R&                  S\R&                  S\R(                  S\R&                  S\R&                  S\R&                  S\S\\R&                  \4   4S jjrS rU =r$ )#ÚCogView3PlusTransformer2DModelé€   a!  
The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay
Diffusion](https://huggingface.co/papers/2403.05121).

Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    num_layers (`int`, defaults to `30`):
        The number of layers of Transformer blocks to use.
    attention_head_dim (`int`, defaults to `40`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `64`):
        The number of heads to use for multi-head attention.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `512`):
        Output dimension of timestep embeddings.
    condition_dim (`int`, defaults to `256`):
        The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
        crop_coords).
    pos_embed_max_size (`int`, defaults to `128`):
        The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
        to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
        means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
        patch_size => 128 * 8 * 2 => 2048`.
    sample_size (`int`, defaults to `128`):
        The base resolution of input latents. If height/width is not provided during generation, this value is used
        to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
TÚpatch_embedÚnormr   r   Ú
patch_sizeÚin_channelsÚ
num_layersr   r   Úout_channelsÚtext_embed_dimr   Úcondition_dimÚpos_embed_max_sizeÚsample_sizec                 ó  >• [         TU ]  5         X`l        XT-  U l        SU	-  U l        [        UU R                  UUU
S9U l        [        UU	U R                  U R                  S9U l        [        R                  " [        U5       Vs/ s H  n[        U R                  UUUS9PM     sn5      U l        [        U R                  USSS9U l        [        R                   " U R                  X-  U R                  -  SS	9U l        SU l        g s  snf )
Né   )rh   Úhidden_sizerg   Útext_hidden_sizerm   )r   rl   Úpooled_projection_dimÚtimesteps_dim)r   r   r   r   Fr   )r   Úconditioning_embedding_dimr%   r&   T)r#   )r*   r+   rj   Ú	inner_dimrs   r   re   r   Útime_condition_embedr.   Ú
ModuleListÚranger   Útransformer_blocksr   Únorm_outÚLinearÚproj_outÚgradient_checkpointing)r3   rg   rh   ri   r   r   rj   rk   r   rl   rm   rn   Ú_r4   s                €r5   r+   Ú'CogView3PlusTransformer2DModel.__init__§   s  ø€ ô 	‰ÑÔØ(ÔØ,ÑAˆŒð &+¨]Ñ%:ˆÔ"ä1Ø#ØŸ™Ø!Ø+Ø1ñ
ˆÔô %KØ(Ø'Ø"&×"<Ñ"<ØŸ.™.ñ	%
ˆÔ!ô #%§-¢-ô ˜zÔ*óò +Aô -ØŸ™Ø(;Ø'9Ø#1ô	ñ +ñó
#
ˆÔô /ØŸ.™.Ø'5Ø$Øñ	
ˆŒô Ÿ	š	 $§.¡.°*Ñ2IÈD×L]ÑL]Ñ2]ÐdhÑiˆŒà&+ˆÕ#ùò's   Â D
r;   c                 óÆ   ^• 0 nS[         S[        R                  R                  S[        [         [
        4   4U4S jjmU R                  5        H  u  p#T" X#U5        M     U$ )z–
Returns:
    `dict` of attention processors: A dictionary containing all attention processors used in the model with
    indexed by its weight name.
ÚnameÚmoduleÚ
processorsc                 ó¢   >• [        US5      (       a  UR                  5       X  S3'   UR                  5        H  u  p4T" U  SU 3XB5        M     U$ )NÚget_processorú
.processorÚ.)Úhasattrr†   Únamed_children)r‚   rƒ   r„   Úsub_nameÚchildÚfn_recursive_add_processorss        €r5   r   ÚSCogView3PlusTransformer2DModel.attn_processors.<locals>.fn_recursive_add_processorsî   sZ   ø€ Üv˜×/Ñ/Ø28×2FÑ2FÓ2H
˜V :Ð.Ñ/à#)×#8Ñ#8Ö#:‘Ù+¨t¨f°A°h°ZÐ,@À%ÖTñ $;ð Ðr7   )Ústrr@   r.   ÚModuler   r   rŠ   )r3   r„   r‚   rƒ   r   s       @r5   Úattn_processorsÚ.CogView3PlusTransformer2DModel.attn_processorsã   sb   ø€ ð ˆ
ð	¬cð 	¼5¿8¹8¿?¹?ð 	ÔX\Ô]`ÔbtÐ]tÑXu÷ 	ð !×/Ñ/Ö1‰LˆDÙ'¨°jÖAñ 2ð Ðr7   r'   c           	      ód  ^• [        U R                  R                  5       5      n[        U[        5      (       a-  [        U5      U:w  a  [        S[        U5       SU SU S35      eS[        S[        R                  R                  4U4S jjmU R                  5        H  u  p4T" X4U5        M     g)	aô  
Sets the attention processor to use to compute attention.

Parameters:
    processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
        The instantiated processor class or a dictionary of processor classes that will be set as the processor
        for **all** `Attention` layers.

        If `processor` is a dict, the key needs to define the path to the corresponding cross attention
        processor. This is strongly recommended when setting trainable attention processors.

z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r‚   rƒ   c                 ó
  >• [        US5      (       aJ  [        U[        5      (       d  UR                  U5        O#UR                  UR	                  U  S35      5        UR                  5        H  u  p4T" U  SU 3XB5        M     g )NÚset_processorr‡   rˆ   )r‰   Ú
isinstanceÚdictr•   ÚpoprŠ   )r‚   rƒ   r'   r‹   rŒ   Úfn_recursive_attn_processors        €r5   r™   ÚVCogView3PlusTransformer2DModel.set_attn_processor.<locals>.fn_recursive_attn_processor  ss   ø€ Üv˜×/Ñ/Ü! )¬T×2Ñ2Ø×(Ñ(¨Õ3à×(Ñ(¨¯©¸$¸¸zÐ7JÓ)KÔLà#)×#8Ñ#8Ö#:‘Ù+¨t¨f°A°h°ZÐ,@À%ÖSò $;r7   N)Úlenr‘   Úkeysr–   r—   Ú
ValueErrorr   r@   r.   r   rŠ   )r3   r'   Úcountr‚   rƒ   r™   s        @r5   Úset_attn_processorÚ1CogView3PlusTransformer2DModel.set_attn_processorý   s®   ø€ ô D×(Ñ(×-Ñ-Ó/Ó0ˆäi¤×&Ñ&¬3¨y«>¸UÓ+BÜØPÔQTÐU^ÓQ_ÐP`ð a0Ø05¨wÐ6QÐRWÐQXÐXkðmóð ð
	T¬cð 	T¼5¿8¹8¿?¹?÷ 	Tð !×/Ñ/Ö1‰LˆDÙ'¨°iÖ@ò 2r7   r8   r9   ÚtimestepÚoriginal_sizeÚtarget_sizeÚcrop_coordsÚreturn_dictc                 ó$  • UR                   SS u  p‰UR                   S   n
U R                  X5      nU R                  X4XVUR                  5      nUSS2SU
24   nUSS2U
S24   n[	        U R
                  5       HR  u  pÍ[        R                  " 5       (       a)  U R                  (       a  U R                  UUUU5      u  pMH  U" UUUS9u  pMT     U R                  X5      nU R                  U5      nU R                  R                  nXŽ-  nXž-  n	UR                  UR                   S   X‰U R                  Xî4S9n[        R                   " SU5      nUR                  UR                   S   U R                  XŽ-  Xž-  4S9nU(       d  U4$ [#        US9$ )	aÎ  
The [`CogView3PlusTransformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor`):
        Input `hidden_states` of shape `(batch size, channel, height, width)`.
    encoder_hidden_states (`torch.Tensor`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape
        `(batch_size, sequence_len, text_embed_dim)`
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    original_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    target_size (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    crop_coords (`torch.Tensor`):
        CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of
        [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]:
        The denoised latents using provided inputs as conditioning.
éþÿÿÿNr=   )r8   r9   r:   r   )Úshapeznhwcpq->nchpwq)Úsample)r¨   re   rw   rB   Ú	enumeraterz   r@   Úis_grad_enabledr~   Ú_gradient_checkpointing_funcr{   r}   Úconfigrg   Úreshaperj   Úeinsumr   )r3   r8   r9   r¡   r¢   r£   r¤   r¥   ÚheightÚwidthrE   r:   Úindex_blockÚblockrg   Úoutputs                   r5   rS   Ú&CogView3PlusTransformer2DModel.forward  sÃ  € ðL &×+Ñ+¨B¨CÐ0‰ˆØ/×5Ñ5°aÑ8ˆà×(Ñ(Øó
ˆð ×'Ñ'¨ÀÐ[h×[nÑ[nÓoˆà -ªaÐ1A°/Ð1AÐ.AÑ BÐØ%¢a¨Ñ)9Ð&9Ñ:ˆä"+¨D×,CÑ,CÖ"DÑˆKÜ×$Ò$×&Ñ&¨4×+F×+FØ7;×7XÑ7XØØ!Ø)Øó	8Ñ4Ñ4ñ 8=Ø"/Ø*?Øñ8Ñ4Ñ4ñ #Eð Ÿ™ mÓ9ˆØŸ™ mÓ4ˆð —[‘[×+Ñ+ˆ
ØÑ%ˆØÑ#ˆà%×-Ñ-Ø ×&Ñ& qÑ)¨6¸$×:KÑ:KÈZÐdð .ð 
ˆô ŸšÐ%5°}ÓEˆØ×&Ñ&Ø ×&Ñ& qÑ)¨4×+<Ñ+<¸fÑ>QÐSXÑSeÐfð 'ð 
ˆö Ø9Ðä'¨vÑ6Ð6r7   )	r~   rv   r{   rj   re   rs   r}   rw   rz   )r	   é   é   rV   rU   r¶   i   rW   é   rd   rd   )T)rX   rY   rZ   r[   r\   Ú _supports_gradient_checkpointingÚ _skip_layerwise_casting_patternsÚ_no_split_modulesr   r]   r+   Úpropertyr   r   r   r‘   r   rŸ   r@   r^   Ú
LongTensorÚboolr   rS   r_   r`   ra   s   @r5   rc   rc   €   s¥  ø† ñ ðD (,Ð$Ø(5°vÐ'>Ð$Ø7Ð9QÐRÐàð ØØØ"$Ø#%ØØ"Ø!Ø Ø"%Øñ9,àð9,ð ð9,ð ð	9,ð
  ð9,ð !ð9,ð ð9,ð ð9,ð ð9,ð ð9,ð  ð9,ð ÷9,ó ð9,ðv ð  cÐ+=Ð&=Ñ!>ó ó ðð0 A¨EÐ2DÀdÈ3ÐPbÐKbÑFcÐ2cÑ,dô  AðT !ñS7à—|‘|ðS7ð  %Ÿ|™|ðS7ð ×"Ñ"ð	S7ð
 —|‘|ðS7ð —\‘\ðS7ð —\‘\ðS7ð ðS7ð 
ˆu|‰|Ð5Ð5Ñ	6÷S7ó S7r7   rc   )!Útypingr   r   r@   Útorch.nnr.   Úconfiguration_utilsr   r   Úutilsr   Ú	attentionr
   Úattention_processorr   r   r   Ú
embeddingsr   r   Úmodeling_outputsr   Úmodeling_utilsr   Únormalizationr   r   Ú
get_loggerrX   Úloggerr   r   rc   © r7   r5   Ú<module>rÌ      s`   ð÷  ã Ý ç BÝ Ý #ß ZÑ Zß WÝ 7Ý 'ß Yð 
×	Ò	˜HÓ	%€ô[4 2§9¡9ô [4ô|r7 Z°õ r7r7   