
    +hv                     &   S SK JrJrJrJrJrJr  S SKrS SKJ	r	  S SK
J	s  Jr  SSKJrJr  SSKJr  SSKJrJrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&J'r'  \RP                  " \)5      r* " S S\	RV                  5      r, " S S\	RV                  5      r- " S S5      r. " S S5      r/\ " S S\	RV                  5      5       r0 " S S\	RV                  5      r1 " S S\	RV                  5      r2 " S S\$\\\5      r3g)     )AnyDictListOptionalTupleUnionN   )ConfigMixinregister_to_config)PeftAdapterMixin)USE_PEFT_BACKENDloggingscale_lora_layersunscale_lora_layers)maybe_allow_in_graph   )FeedForward)	Attention)
CacheMixin)&CogView3CombinedTimestepSizeEmbeddings)Transformer2DModelOutput)
ModelMixin)	LayerNormRMSNormc            	          ^  \ rS rSr    SS\S\S\S\4U 4S jjjrS\R                  S\R                  S	\R                  4S
 jrSr	U =r
$ )CogView4PatchEmbed%   in_channelshidden_size
patch_sizetext_hidden_sizec                    > [         TU ]  5         X0l        [        R                  " XS-  -  U5      U l        [        R                  " XB5      U l        g )Nr   )super__init__r    nnLinearproj	text_proj)selfr   r   r    r!   	__class__s        l/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_cogview4.pyr$   CogView4PatchEmbed.__init__&   s?     	$IIkM9;G	#3A    hidden_statesencoder_hidden_statesreturnc                 \   UR                   u  p4pVXPR                  -  nX`R                  -  nUR                  X4XpR                  XR                  5      nUR                  SSSSSS5      R	                  SS5      R	                  SS5      nU R                  U5      nU R                  U5      nX4$ )Nr   r         r	      )shaper    reshapepermuteflattenr'   r(   )	r)   r.   r/   
batch_sizechannelheightwidthpost_patch_heightpost_patch_widths	            r+   forwardCogView4PatchEmbed.forward3   s    -:-@-@*
V"oo5 OO3%--!2OOEUWfWf
 &--aAq!Q?GG1MUUVWYZ[		-0 $/D E33r-   )r    r'   r(   )    
  r      )__name__
__module____qualname____firstlineno__intr$   torchTensorr?   __static_attributes____classcell__r*   s   @r+   r   r   %   su      $BB B 	B
 B B4U\\ 4%,, 4[`[g[g 4 4r-   r   c            
          ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  S	\R                  S\\R                  \R                  4   4S
 jr	Sr
U =r$ )CogView4AdaLayerNormZeroB   embedding_dimdimr0   Nc                    > [         TU ]  5         [        R                  " USSS9U l        [        R                  " USSS9U l        [        R                  " USU-  SS9U l        g )NFh㈵>elementwise_affineeps   Tbias)r#   r$   r%   r   normnorm_contextr&   linear)r)   rQ   rR   r*   s      r+   r$   !CogView4AdaLayerNormZero.__init__C   sO    LLDI	LLDQiirCxdCr-   r.   r/   tembc                    UR                   nU R                  U5      R                  US9nU R                  U5      R                  US9nU R	                  U5      nUR                  SSS9u  nn	n
nnnnnnnnnUSU
R                  S5      -   -  UR                  S5      -   nUSUR                  S5      -   -  U	R                  S5      -   nUUUUUUUUUU4
$ )NdtyperX   r3   rR   )rb   r[   tor\   r]   chunk	unsqueeze)r)   r.   r/   r_   rb   norm_hidden_statesnorm_encoder_hidden_statesemb	shift_msac_shift_msa	scale_msac_scale_msagate_msa
c_gate_msa	shift_mlpc_shift_mlp	scale_mlpc_scale_mlpgate_mlp
c_gate_mlps                       r+   r?    CogView4AdaLayerNormZero.forwardJ   s    ##!YY}588u8E%)%6%67L%M%P%PW\%P%]"kk$ IIbaI 	
 +a)2E2Ea2H.HIIL_L_`aLbb :a+BWBWXYBZ>Z [^i^s^stu^v v !
 	
r-   )r]   r[   r\   )rD   rE   rF   rG   rH   r$   rI   rJ   r   r?   rK   rL   rM   s   @r+   rO   rO   B   sk    Dc D D D%
"\\%
BG,,%
V[VbVb%
	u||U\\)	*%
 %
r-   rO   c                       \ rS rSrSrS r  SS\S\R                  S\R                  S\	\R                     S	\	\
\R                  \R                  4      S
\
\R                  \R                  4   4S jjrSrg)CogView4AttnProcessorr   a  
Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary embedding on
query and key vectors, but does not include spatial normalization.

The processor supports passing an attention mask for text tokens. The attention mask should have shape (batch_size,
text_seq_length) where 1 indicates a non-padded token and 0 indicates a padded token.
c                 D    [        [        S5      (       d  [        S5      eg Nscaled_dot_product_attentionzUCogView4AttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.hasattrFImportErrorr)   s    r+   r$   CogView4AttnProcessor.__init__{   !    q899uvv :r-   Nattnr.   r/   attention_maskimage_rotary_embr0   c           	         UR                   nUR                  u  pxn	UR                  u  pzn	[        R                  " X2/SS9nUR	                  U5      nUR                  U5      nUR                  U5      nUR                  SUR                  S45      R                  SS5      nUR                  SUR                  S45      R                  SS5      nUR                  SUR                  S45      R                  SS5      nUR                  b  UR                  U5      R                  US9nUR                  b  UR                  U5      R                  US9nUbR  SSKJn  U" US S 2S S 2US 2S S 24   USS9US S 2S S 2US 2S S 24'   U" US S 2S S 2US 2S S 24   USS9US S 2S S 2US 2S S 24'   Ub  UnUR                  5       S:X  d   S	5       eUR!                  5       R                  UR"                  5      n[        R$                  " XxU
-   4UR"                  S
9nUUS S 2S U24'   UR'                  S5      nUUR                  SS5      -  nUS:  R'                  S5      R                  UR                   5      n[(        R*                  " XXSSS9nUR                  SS5      R-                  SS5      nUR/                  U5      nUR0                  S   " U5      nUR0                  S   " U5      nUR3                  XR5                  S5      U-
  /SS9u  p2X#4$ )Nr3   rc   r   ra   apply_rotary_embuse_real_unbind_dimCthe shape of text_attn_mask should be (batch_size, text_seq_length)devicer           F	attn_mask	dropout_p	is_causalr	   )rb   r5   rI   catto_qto_kto_v	unflattenheads	transposenorm_qrd   norm_k
embeddingsr   rR   floatr   onesrf   r   r|   r8   type_asto_outsplitsize)r)   r   r.   r/   r   r   rb   r9   text_seq_length	embed_dimimage_seq_lengthquerykeyvaluer   text_attn_maskmix_attn_maskattn_mask_matrixs                     r+   __call__CogView4AttnProcessor.__call__   s    &++1F1L1L.
Y2?2E2E/
i		#8"HaP 		-(ii&		-(DJJ#34>>q!DmmA

B/0::1a@DJJ#34>>q!D ;;"KK&)))6E;;"++c"%%E%2C '5/?aO,a/02BXZ0E!Q(!+, .>Aq/*A-.0@VX.C1o&)*
 %+N!%%'1,s.ss,+11366u||DN!JJ
FV4V'W`e`l`lmM1?M!-o--.)33A6M,}/F/Fq!/LL.2==a@CCEKKPN663RW
 &//15==aC%--e4 A}5A}5/</B/B003oEFA 0C 0
, 33r-    )NN)rD   rE   rF   rG   __doc__r$   r   rI   rJ   r   r   r   rK   r   r-   r+   rx   rx   r   s    w 26HL@4@4 ||@4  %||	@4
 !.@4 #5u||)C#DE@4 
u||U\\)	*@4 @4r-   rx   c                   v   \ rS rSrSrS r    SS\S\R                  S\R                  S\	\R                     S	\	\R                     S
\	\R                     S\	\
\\R                  \R                  4   \\\R                  \R                  4      4      S\\R                  \R                  4   4S jjrSrg)CogView4TrainingAttnProcessor   a  
Training Processor for implementing scaled dot-product attention for the CogView4 model. It applies a rotary
embedding on query and key vectors, but does not include spatial normalization.

This processor differs from CogView4AttnProcessor in several important ways:
1. It supports attention masking with variable sequence lengths for multi-resolution training
2. It unpacks and repacks sequences for efficient training with variable sequence lengths when batch_flag is
   provided
c                 D    [        [        S5      (       d  [        S5      eg r{   r}   r   s    r+   r$   &CogView4TrainingAttnProcessor.__init__   r   r-   Nr   r.   r/   latent_attn_maskr   
batch_flagr   r0   c           
         UR                   u  pnUR                   u  pnUR                  nUR                  nUn[        R                  " X?/SS9nUc%  [        R
                  " X4[        R                  US9nUc%  [        R
                  " X4[        R                  US9nUR                  5       S:X  d   S5       eUR                  [        R                  :X  d   S5       eUR                  5       S:X  d   S5       eUR                  [        R                  :X  d   S	5       e[        R
                  " XU-   4[        R                  US9nUUSS2SU
24'   UUSS2U
S24'   UR                  S5      R                  US
9nUUR                  SS5      -  nUGb  UR                  5       S:X  d   e[        R                  " U5      R                  5       S-   n[        R                  " USS9n
[        R                  " USS9nU
U-   n[        U5       Vs/ s H-  n[        R                  " UUU:H     5      R                  5       PM/     nn[        U5      U:X  d   eUR!                  SS5      nUR!                  SS5      nUUS:H     n[        R                  " U5      UR                   S   :X  d   e[        R"                  " UU5      n[        R$                  R&                  R(                  R+                  USSSS9nUR                   S   n[        R,                  " UUU4UUS9n[/        U5       H/  u  nn UUU:H     n!Sn"U! H  n#SU U"U"U#-   2U"U"U#-   24'   U"U#-  n"M     M1     UR                  [        R0                  S
9nUR                  S5      nUn$Uc  [        R                  " X2/SS9nOWnUR3                  U5      n%UR5                  U5      n&UR7                  U5      n'U%R9                  SUR:                  S45      R                  SS5      n%U&R9                  SUR:                  S45      R                  SS5      n&U'R9                  SUR:                  S45      R                  SS5      n'UR<                  b  UR=                  U%5      R                  US
9n%UR>                  b  UR?                  U&5      R                  US
9n&UGb=  SSK J!n(  UcM  U(" U%SS2SS2U
S2SS24   USS9U%SS2SS2U
S2SS24'   U(" U&SS2SS2U
S2SS24   USS9U&SS2SS2U
S2SS24'   OU%R                   S   W:X  d   eU&R                   S   U:X  d   e[        U5      U	:X  d   eSn)[        U5       H  nSn"XU:H     n*WUU:H     n+[E        U*U+5       Hw  u  n,n-U,U--   n.U(" U%USS2U"U,-   U"U.-   2SS24   UU)   SS9U%USS2U"U,-   U"U.-   2SS24'   U(" U&USS2U"U,-   U"U.-   2SS24   UU)   SS9U&USS2U"U,-   U"U.-   2SS24'   U"U.-  n"U)S-  n)My     M     [F        RH                  " U%U&U'U$SSS9nUR                  SS5      R!                  SS5      nURK                  U%5      nURL                  S   " U5      nURL                  S   " U5      nUc(  UR#                  XRO                  S5      U
-
  /SS9u  p2X#4$ [        R$                  R&                  R(                  RQ                  U[        RR                  " W5      SS9n/[        R                  " U/SS9n0[        R"                  " U0WRU                  5       5      n1[        U15      U	:X  d   e[E        U1U
W5       V2V,V-s/ s H   u  n2n,n-[        R"                  " U2U,U-/5      PM"     n1n,n2n-U1 V2s/ s H  n2U2S   PM
     n3n2U1 V2s/ s H  n2U2S   PM
     n/n2[        U	5       H%  nU3U   UU   UU   S:H  '   U/U   UU   UU   S:H  '   M'     UnX#4$ s  snf s  sn-n,n2f s  sn2f s  sn2f )a5  
Args:
    attn (`Attention`):
        The attention module.
    hidden_states (`torch.Tensor`):
        The input hidden states.
    encoder_hidden_states (`torch.Tensor`):
        The encoder hidden states for cross-attention.
    latent_attn_mask (`torch.Tensor`, *optional*):
        Mask for latent tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full
        attention is used for all latent tokens. Note: the shape of latent_attn_mask is (batch_size,
        num_latent_tokens).
    text_attn_mask (`torch.Tensor`, *optional*):
        Mask for text tokens where 0 indicates pad token and 1 indicates non-pad token. If None, full attention
        is used for all text tokens.
    batch_flag (`torch.Tensor`, *optional*):
        Values from 0 to n-1 indicating which samples belong to the same batch. Samples with the same
        batch_flag are packed together. Example: [0, 1, 1, 2, 2] means sample 0 forms batch0, samples 1-2 form
        batch1, and samples 3-4 form batch2. If None, no packing is used.
    image_rotary_emb (`Tuple[torch.Tensor, torch.Tensor]` or `list[Tuple[torch.Tensor, torch.Tensor]]`, *optional*):
        The rotary embedding for the image part of the input.
Returns:
    `Tuple[torch.Tensor, torch.Tensor]`: The processed hidden states for both image and text streams.
r3   rc   N)rb   r   r   r   z1the dtype of text_attn_mask should be torch.int32zGthe shape of latent_attn_mask should be (batch_size, num_latent_tokens)z3the dtype of latent_attn_mask should be torch.int32ra   r   Tr   right)batch_firstpadding_valuepadding_sider   r   r   r   Fr   r	   )lengthsr   )+r5   rb   r   rI   r   r   int32rR   rf   rd   r   maxitemsumrangelenr8   r   r%   utilsrnnpad_sequencezeros	enumerateboolr   r   r   r   r   r   r   r   r   zipr   r|   r   r   r   unpad_sequencetensortolist)4r)   r   r.   r/   r   r   r   r   kwargsr9   r   r   r   rb   r   latent_hidden_statesmixed_hidden_statesmixed_attn_maskmixed_attn_mask_inputr   packing_batch_sizelatent_seq_lengthmixed_seq_length	batch_idxmixed_seq_length_packedmixed_attn_mask_flattenmixed_hidden_states_flattenmixed_hidden_states_unpadmixed_hidden_states_packed!mixed_hidden_states_packed_paddedlidxmaskseq_lengthsoffsetlengthr   r   r   r   r   rope_idxtext_seq_length_bilatent_seq_length_bitlenllenmlenhidden_states_unpadhidden_states_flattenhidden_states_unpackhencoder_hidden_states_unpads4                                                       r+   r   &CogView4TrainingAttnProcessor.__call__   s   N 2G1L1L.
Y2?2E2E/
i%++&--,#ii)>(U[\] !"ZZ(EU[[aghN#$zz:*HPUP[P[djk !!#q(o*oo(##u{{2g4gg2##%*u,uu*%%4k6kk4  **+;;<EKKX^
 0>+O++,/??++, !0 9 9! < ? ?e ? L03H3R3RSTVW3XX !>>#q(((!&:!6!;!;!=!A $iiA>O %		*: B.1BB ]bbt\u'\uy		/*	*ABCHHJ\u $ ' ./3EEEE '6&=&=a&C#*=*E*Ea*K'(CD[_`D`(a%99-.2K2Q2QRS2TTTT */5NPg)h& 160B0B0O0O* !$	 1P 1- 277:A${{#Q*  ''78	T.zS/@A)FOPD&6/16FVO3KKLf$F * 9 ,..UZZ.@+55a8) !II'<&LRSTM >M 		-(ii&		-( DJJ#34>>q!DmmA

B/0::1a@DJJ#34>>q!D ;;"KK&)))6E;;"++c"%%E%2C '5!3C!Q 0!346F\^4aO,a/0 2B1o.124DZ\2Aq/*A-.
 {{1~);;;;yy|'9999+,
::: !34CF)8s9J)K&+<Z3=N+O( '**<>R&S
d#d{JZ!#q&4-&4-*G"JK,X602Kc1ftmftm&CQFG
 IYQ(Eq HI,X602ICFTMFTM$A1DE
 $ A 'T 5. 6633RW

 &//15==aC%--e4 A}5A}5 3@3F3F "4"4Q"7/"IJPQ 4G 40!H 33= #(((.."4"4"C"C%<=  #D # %*II.Aq$I!#(;;/DFVF]F]F_#` +,
:::
 &))=Pa%b$%bMAtT Ad|,%b ! $
 :N*N9MA1Q49M'*N1E"F1EA1Q41E"F Z(GbcfGg%c*>#+>!+CDH[\_H`$S)*:3*?1*DE	 ) 1M33E'`$
 +O"Fs   4_>'_-__r   NNNN)rD   rE   rF   rG   r   r$   r   rI   rJ   r   r   r   r   r   rK   r   r-   r+   r   r      s    w 4815-1 u4u4 ||u4  %||	u4
 #5<<0u4 !.u4 U\\*u4 #%ell23T%ell@Z:[5\\]
u4 
u||U\\)	*u4 u4r-   r   c                     ^  \ rS rSr    SS\S\S\S\SS4
U 4S jjjr    SS	\R                  S
\R                  S\\R                     S\\	\
\R                  \R                  4   \\
\R                  \R                  4      4      S\\\\R                  4      S\\\\4      S\R                  4S jjrSrU =r$ )CogView4TransformerBlocki  rR   num_attention_headsattention_head_dimtime_embed_dimr0   Nc                   > [         TU ]  5         [        XA5      U l        [	        UUUUSSSS[        5       S9	U l        [        R                  " USSS9U l	        [        R                  " USSS9U l
        [        XSS9U l        g )	NT
layer_normFrT   )		query_dimr   dim_headout_dimrZ   qk_normrV   rW   	processorrU   zgelu-approximate)rR   dim_outactivation_fn)r#   r$   rO   norm1r   rx   attn1r%   r   norm2norm2_contextr   ff)r)   rR   r   r   r   r*   s        r+   r$   !CogView4TransformerBlock.__init__  s     	 .nB
%' $+-


 \\#%TJ
\\#%TR#BTUr-   r.   r/   r_   r   r   attention_kwargsc           
      \   U R                  XU5      u
  nnn	n
nnnnnnUc  0 nU R                  " SUUUUS.UD6u  nnUUUR                  S5      -  -   nUUUR                  S5      -  -   nU R                  U5      SU
R                  S5      -   -  U	R                  S5      -   nU R	                  U5      SUR                  S5      -   -  UR                  S5      -   nU R                  U5      nU R                  U5      nUUUR                  S5      -  -   nUUUR                  S5      -  -   nX4$ )N)r.   r/   r   r   r3   r   )r   r  rf   r  r  r  )r)   r.   r/   r_   r   r   r  rg   rn   rp   rr   rt   rh   ro   rq   rs   ru   attn_hidden_statesattn_encoder_hidden_states	ff_outputff_output_contexts                        r+   r?    CogView4TransformerBlock.forward  s   . JJ}TB	
& #!9= :
,"<-)	:

 :
66 &(:X=O=OPQ=R(RR 58RU_UiUijkUl8l l "ZZ6!i>Q>QRS>T:TUXaXkXklmXnn%)%7%78M%N%%a((&
!!!$&%" GG./	 GG$>?%	H4F4Fq4I(II 58IJL`L`abLc8c c33r-   )r  r  r   r  r  )rB   @   (      r   )rD   rE   rF   rG   rH   r$   rI   rJ   r   r   r   r   r   strr   r?   rK   rL   rM   s   @r+   r   r     s!    #%"$!VV !V  	V
 V 
V V@ (, <@5914||14  %||14 u||$	14
 #%ell23T%ell@Z:[5\\]
14 !c5<<&7!8914 #4S>214 
14 14r-   r   c                      ^  \ rS rSrSS\S\S\\\4   S\SS4
U 4S jjjrS	\R                  S\\R                  \R                  4   4S
 jr
SrU =r$ )CogView4RotaryPosEmbedi  rR   r    rope_axes_dimthetar0   Nc                 R   > [         TU ]  5         Xl        X l        X0l        X@l        g )N)r#   r$   rR   r    r  r  )r)   rR   r    r  r  r*   s        r+   r$   CogView4RotaryPosEmbed.__init__  s#    $*
r-   r.   c           	         UR                   u  p#pEX@R                  -  XPR                  -  pTU R                  S-  U R                  S-  pvSU R                  [        R
                  " SUS[        R                  S9S US-   R                  5       U-  -  -  nSU R                  [        R
                  " SUS[        R                  S9S US-   R                  5       U-  -  -  n	[        R
                  " U R                  S   5      n
[        R
                  " U R                  S   5      n[        R                  " X5      n[        R                  " X5      n[        R
                  " XLR                  S9n[        R
                  " X]R                  S9nXR                  S   -  U-  nXR                  S   -  U-  nUU   nUU   nUR                  S5      nUR                  S5      nUR                  XES5      nUR                  XES5      n[        R                  " X/SS9n[        R                  " UU/SS9nUR                  XE-  S5      nUR                  5       UR!                  5       4$ )	Nr         ?r   ra   r3   r   r   rc   )r5   r    rR   r  rI   arangefloat32r   r  outerr   rf   expandr   r6   cossin)r)   r.   r9   num_channelsr;   r<   dim_hdim_w
h_inv_freq
w_inv_freqh_seqw_seqfreqs_hfreqs_wh_idxw_idxinner_h_idxinner_w_idxfreqss                      r+   r?   CogView4RotaryPosEmbed.forward$  s   2?2E2E/
&//15OO3Kxx1}dhh!muJJ5<<5!5==I.UVWZY__adiij

 JJ5<<5!5==I.UVWZY__adiij

 T//23T//23++e0++e0VNN;U>>:0033v=0033u<+&+& ##A&##A&..3..3 		7,"5		5%.b1fnb1		UYY[))r-   )rR   r    r  r  )     @)rD   rE   rF   rG   rH   r   r   r$   rI   rJ   r?   rK   rL   rM   s   @r+   r  r    sh    C S sCx Y^ mq  $*U\\ $*eELL%,,<V6W $* $*r-   r  c                      ^  \ rS rSrSr    SS\S\S\S\S\S\4U 4S	 jjjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )CogView4AdaLayerNormContinuousiK  z
CogView4-only final AdaLN: LN(x) -> Linear(cond) -> chunk -> affine. Matches Megatron: **no activation** before the
Linear on conditioning embedding.
rQ   conditioning_embedding_dimrV   rW   rZ   	norm_typec                    > [         TU ]  5         [        R                  " X!S-  US9U l        US:X  a  [        XX55      U l        g US:X  a  [        XU5      U l        g [        SU 35      e)Nr   rY   r   rms_normzunknown norm_type )	r#   r$   r%   r&   r]   r   r[   r   
ValueError)r)   rQ   r1  rV   rW   rZ   r2  r*   s          r+   r$   'CogView4AdaLayerNormContinuous.__init__Q  sj     	ii :A<MTXY$!-6HODI*$4FGDI1)=>>r-   xconditioning_embeddingr0   c                     U R                  UR                  UR                  5      5      n[        R                  " USSS9u  pEU R                  U5      SU-   S S 2S S S 24   -  US S 2S S S 24   -   nU$ )Nr   r3   rc   )r]   rd   rb   rI   re   r[   )r)   r7  r8  ri   scaleshifts         r+   r?   &CogView4AdaLayerNormContinuous.forwardc  sj    kk033AGG<={{3q1IIaLAIq$z22U1dA:5FFr-   )r]   r[   )TrT   Tr   )rD   rE   rF   rG   r   rH   r   r   r  r$   rI   rJ   r?   rK   rL   rM   s   @r+   r0  r0  K  s     $(%?? %(? !	?
 ? ? ? ?$ u|| PUP\P\  r-   r0  c                      ^  \ rS rSrSrSr/ SQr/ SQr\            S S\	S\	S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\
\	\	4   4U 4S jjj5       r    S!S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\\\\4      S\S\\R                     S\\\
\R                  \R                  4   \\
\R                  \R                  4      4      S\\R                  \4   4S jjrSrU =r$ )"CogView4Transformer2DModelik  a  
Args:
    patch_size (`int`, defaults to `2`):
        The size of the patches to use in the patch embedding layer.
    in_channels (`int`, defaults to `16`):
        The number of channels in the input.
    num_layers (`int`, defaults to `30`):
        The number of layers of Transformer blocks to use.
    attention_head_dim (`int`, defaults to `40`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `64`):
        The number of heads to use for multi-head attention.
    out_channels (`int`, defaults to `16`):
        The number of channels in the output.
    text_embed_dim (`int`, defaults to `4096`):
        Input dimension of text embeddings from the text encoder.
    time_embed_dim (`int`, defaults to `512`):
        Output dimension of timestep embeddings.
    condition_dim (`int`, defaults to `256`):
        The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size,
        crop_coords).
    pos_embed_max_size (`int`, defaults to `128`):
        The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added
        to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128
        means that the maximum supported height and width for image generation is `128 * vae_scale_factor *
        patch_size => 128 * 8 * 2 => 2048`.
    sample_size (`int`, defaults to `128`):
        The base resolution of input latents. If height/width is not provided during generation, this value is used
        to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024`
T)r   r   r   )patch_embedr[   proj_outr    r   out_channels
num_layersr   r   text_embed_dimr   condition_dimpos_embed_max_sizesample_sizer  c                   > [         TU ]  5         SU	-  nXe-  nUn[        XQUSS9U l        [	        X.X5      U l        [        UU	UUS9U l        [        R                  " [        U5       Vs/ s H  n[        XXX5      PM     sn5      U l        [        XSS9U l        [        R                  " XU-  U-  SS9U l        SU l        g s  snf )	N   r.  )r  )rQ   rD  pooled_projection_dimtimesteps_dimF)rV   TrY   )r#   r$   r  roper   r?  r   time_condition_embedr%   
ModuleListr   r   transformer_blocksr0  norm_outr&   r@  gradient_checkpointing)r)   r    r   rA  rB  r   r   rC  r   rD  rE  rF  r  rI  	inner_dim_r*   s                   r+   r$   #CogView4Transformer2DModel.__init__  s      	 !& 5'<	# ++==`gh	 .kja$J('"7#	%
! #%-- z**A )I[l*#
 7yejk		)*-D|-SZ^_&+#s   *Cr.   r/   timesteporiginal_sizetarget_sizecrop_coordsr  return_dictr   r   r0   c           
         Ub#  UR                  5       nUR                  SS5      nOSn[        (       a  [        X5        O+Ub(  UR	                  SS 5      b  [
        R                  S5        UR                  u  ppU
c  U R                  U5      n
U R                  R                  nUU-  nUU-  nU R                  X5      u  pU R                  X4XVUR                  5      n[        R                  " U5      nU R                    HX  n["        R$                  " 5       (       a,  U R&                  (       a  U R)                  UUUUU
U	U5      u  pMI  U" UUUU
U	U5      u  pMZ     U R+                  UU5      nU R-                  U5      nUR/                  UUUSUU5      nUR1                  SSSSS	S
5      R3                  SS
5      R3                  S	S5      n[        (       a  [5        X5        U(       d  U4$ [7        US9$ )Nr:  r  zVPassing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.r   r   r	   r3   r2   r   r4   )sample)copypopr   r   getloggerwarningr5   rK  configr    r?  rL  rb   r   silurN  rI   is_grad_enabledrP  _gradient_checkpointing_funcrO  r@  r6   r7   r8   r   r   )r)   r.   r/   rT  rU  rV  rW  r  rX  r   r   
lora_scaler9   r  r;   r<   pr=   r>   r_   blockoutputs                         r+   r?   "CogView4Transformer2DModel.forward  s    '/446)--gs;JJd/+0@0D0DWd0S0_l 3@2E2E/
& ##yy7 KK"""aK A:/3/?/?/e,((+\i\o\opvvd| ,,E$$&&4+F+F7;7X7X!)$"$844 8=!)$"$844 -, mT:m4 &--j:KM]_acdfgh&&q!Q1a8@@AFNNqRST19'v66r-   )rP  rO  r?  r@  rK  rL  rN  )r   rA   rA      r  r  rC   r        rk  )rj  rj  )NTNN)rD   rE   rF   rG   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rH   r   r$   rI   rJ   
LongTensorr   r   r  r   r   r   r   r   r?   rK   rL   rM   s   @r+   r>  r>  k  s   > (,$`'J$ "$#%"! "%)30,0, 0, 	0,
 0,  0, !0, 0, 0, 0,  0, 0, S#X0, 0,t 6: 15 R7||R7  %||R7 ""	R7
 ||R7 \\R7 \\R7 #4S>2R7 R7 !.R7 #%ell23T%ell@Z:[5\\]
R7 
u||55	6R7 R7r-   r>  )4typingr   r   r   r   r   r   rI   torch.nnr%   torch.nn.functional
functionalr   configuration_utilsr
   r   loadersr   r   r   r   r   r   utils.torch_utilsr   	attentionr   attention_processorr   cache_utilsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerrD   r^  Moduler   rO   rx   r   r   r  r0  r>  r   r-   r+   <module>r     s    ; :     B ' V V 5 # + $ ? 7 ' . 
		H	%4 4:-
ryy -
`M4 M4`D4 D4N N4ryy N4 N4b-*RYY -*`RYY @i7[:JJ i7r-   