
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention_processor import Attention, AttentionProcessor, SanaLinearAttnProcessor2_0
from ..embeddings import PatchEmbed, PixArtAlphaTextProjection, TimestepEmbedding, Timesteps
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormSingle, RMSNorm


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
class GLUMBConv(nn.Module):
    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        expand_ratio: float = 4,
        norm_type: Optional[str] = None,
        residual_connection: bool = True,
    ) -> None:
        super().__init__()

        hidden_channels = int(expand_ratio * in_channels)
        self.norm_type = norm_type
        self.residual_connection = residual_connection

        self.nonlinearity = nn.SiLU()
        self.conv_inverted = nn.Conv2d(in_channels, hidden_channels * 2, 1, 1, 0)
        self.conv_depth = nn.Conv2d(hidden_channels * 2, hidden_channels * 2, 3, 1, 1, groups=hidden_channels * 2)
        self.conv_point = nn.Conv2d(hidden_channels, out_channels, 1, 1, 0, bias=False)

        self.norm = None
        if norm_type == "rms_norm":
            self.norm = RMSNorm(out_channels, eps=1e-5, elementwise_affine=True, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        if self.residual_connection:
            residual = hidden_states

        hidden_states = self.conv_inverted(hidden_states)
        hidden_states = self.nonlinearity(hidden_states)

        hidden_states = self.conv_depth(hidden_states)
        hidden_states, gate = torch.chunk(hidden_states, 2, dim=1)
        hidden_states = hidden_states * self.nonlinearity(gate)

        hidden_states = self.conv_point(hidden_states)

        if self.norm_type == "rms_norm":
            # move channels to the last dimension so RMSNorm is applied across the channel dimension
            hidden_states = self.norm(hidden_states.movedim(1, -1)).movedim(-1, 1)

        if self.residual_connection:
            hidden_states = hidden_states + residual

        return hidden_states


class SanaModulatedNorm(nn.Module):
    def __init__(self, dim: int, elementwise_affine: bool = False, eps: float = 1e-6):
        super().__init__()
        self.norm = nn.LayerNorm(dim, elementwise_affine=elementwise_affine, eps=eps)

    def forward(
        self, hidden_states: torch.Tensor, temb: torch.Tensor, scale_shift_table: torch.Tensor
    ) -> torch.Tensor:
        hidden_states = self.norm(hidden_states)
        shift, scale = (scale_shift_table[None] + temb[:, None].to(scale_shift_table.device)).chunk(2, dim=1)
        hidden_states = hidden_states * (1 + scale) + shift
        return hidden_states


class SanaCombinedTimestepGuidanceEmbeddings(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

        self.guidance_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

        self.silu = nn.SiLU()
        self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)

    def forward(self, timestep: torch.Tensor, guidance: torch.Tensor = None, hidden_dtype: torch.dtype = None):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype))

        guidance_proj = self.guidance_condition_proj(guidance)
        guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=hidden_dtype))
        conditioning = timesteps_emb + guidance_emb

        return self.linear(self.silu(conditioning)), conditioning

class SanaAttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
    """

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("SanaAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            # scaled_dot_product_attention expects attention_mask shape to be
            # (batch, heads, source_length, target_length)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
        )

        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states

class SanaTransformerBlock(nn.Module):
    r"""
    Transformer block introduced in [Sana](https://huggingface.co/papers/2410.10629).
    """

    def __init__(
        self,
        dim: int = 2240,
        num_attention_heads: int = 70,
        attention_head_dim: int = 32,
        dropout: float = 0.0,
        num_cross_attention_heads: Optional[int] = 20,
        cross_attention_head_dim: Optional[int] = 112,
        cross_attention_dim: Optional[int] = 2240,
        attention_bias: bool = True,
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        attention_out_bias: bool = True,
        mlp_ratio: float = 2.5,
        qk_norm: Optional[str] = None,
    ) -> None:
        super().__init__()

        # 1. Self Attention (linear attention)
        self.norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=norm_eps)
        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            kv_heads=num_attention_heads if qk_norm is not None else None,
            qk_norm=qk_norm,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=None,
            processor=SanaLinearAttnProcessor2_0(),
        )

        # 2. Cross Attention
        if cross_attention_dim is not None:
            self.norm2 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
            self.attn2 = Attention(
                query_dim=dim,
                qk_norm=qk_norm,
                kv_heads=num_cross_attention_heads if qk_norm is not None else None,
                cross_attention_dim=cross_attention_dim,
                heads=num_cross_attention_heads,
                dim_head=cross_attention_head_dim,
                dropout=dropout,
                bias=True,
                out_bias=attention_out_bias,
                processor=SanaAttnProcessor2_0(),
            )

        # 3. Feed-forward
        self.ff = GLUMBConv(dim, dim, mlp_ratio, norm_type=None, residual_connection=False)

        self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        height: int = None,
        width: int = None,
    ) -> torch.Tensor:
        batch_size = hidden_states.shape[0]

        # 1. Modulation
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
            self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
        ).chunk(6, dim=1)

        # 2. Self Attention
        norm_hidden_states = self.norm1(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
        norm_hidden_states = norm_hidden_states.to(hidden_states.dtype)

        attn_output = self.attn1(norm_hidden_states)
        hidden_states = hidden_states + gate_msa * attn_output

        # 3. Cross Attention
        if self.attn2 is not None:
            attn_output = self.attn2(
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm2(hidden_states)
        norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        norm_hidden_states = norm_hidden_states.unflatten(1, (height, width)).permute(0, 3, 1, 2)
        ff_output = self.ff(norm_hidden_states)
        ff_output = ff_output.flatten(2, 3).permute(0, 2, 1)
        hidden_states = hidden_states + gate_mlp * ff_output

        return hidden_states


class SanaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    r"""
    A 2D Transformer model introduced in [Sana](https://huggingface.co/papers/2410.10629) family of models.

    Args:
        in_channels (`int`, defaults to `32`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `32`):
            The number of channels in the output.
        num_attention_heads (`int`, defaults to `70`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `32`):
            The number of channels in each head.
        num_layers (`int`, defaults to `20`):
            The number of layers of Transformer blocks to use.
        num_cross_attention_heads (`int`, *optional*, defaults to `20`):
            The number of heads to use for cross-attention.
        cross_attention_head_dim (`int`, *optional*, defaults to `112`):
            The number of channels in each head for cross-attention.
        cross_attention_dim (`int`, *optional*, defaults to `2240`):
            The number of channels in the cross-attention output.
        caption_channels (`int`, defaults to `2304`):
            The number of channels in the caption embeddings.
        mlp_ratio (`float`, defaults to `2.5`):
            The expansion ratio to use in the GLUMBConv layer.
        dropout (`float`, defaults to `0.0`):
            The dropout probability.
        attention_bias (`bool`, defaults to `False`):
            Whether to use bias in the attention layer.
        sample_size (`int`, defaults to `32`):
            The base size of the input latent.
        patch_size (`int`, defaults to `1`):
            The size of the patches to use in the patch embedding layer.
        norm_elementwise_affine (`bool`, defaults to `False`):
            Whether to use elementwise affinity in the normalization layer.
        norm_eps (`float`, defaults to `1e-6`):
            The epsilon value for the normalization layer.
        qk_norm (`str`, *optional*, defaults to `None`):
            The normalization to use for the query and key.
        timestep_scale (`float`, defaults to `1.0`):
            The scale to use for the timesteps.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["SanaTransformerBlock", "PatchEmbed", "SanaModulatedNorm"]
    _skip_layerwise_casting_patterns = ["patch_embed", "norm"]

    @register_to_config
    def __init__(
        self,
        in_channels: int = 32,
        out_channels: Optional[int] = 32,
        num_attention_heads: int = 70,
        attention_head_dim: int = 32,
        num_layers: int = 20,
        num_cross_attention_heads: Optional[int] = 20,
        cross_attention_head_dim: Optional[int] = 112,
        cross_attention_dim: Optional[int] = 2240,
        caption_channels: int = 2304,
        mlp_ratio: float = 2.5,
        dropout: float = 0.0,
        attention_bias: bool = False,
        sample_size: int = 32,
        patch_size: int = 1,
        norm_elementwise_affine: bool = False,
        norm_eps: float = 1e-6,
        interpolation_scale: Optional[int] = None,
        guidance_embeds: bool = False,
        guidance_embeds_scale: float = 0.1,
        qk_norm: Optional[str] = None,
        timestep_scale: float = 1.0,
    ) -> None:
        super().__init__()

        out_channels = out_channels or in_channels
        inner_dim = num_attention_heads * attention_head_dim

        # 1. Patch Embedding
        self.patch_embed = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=inner_dim,
            interpolation_scale=interpolation_scale,
            pos_embed_type="sincos" if interpolation_scale is not None else None,
        )

        # 2. Additional condition embeddings
        if guidance_embeds:
            self.time_embed = SanaCombinedTimestepGuidanceEmbeddings(inner_dim)
        else:
            self.time_embed = AdaLayerNormSingle(inner_dim)

        self.caption_projection = PixArtAlphaTextProjection(in_features=caption_channels, hidden_size=inner_dim)
        self.caption_norm = RMSNorm(inner_dim, eps=1e-5, elementwise_affine=True)

        # 3. Transformer blocks
        self.transformer_blocks = nn.ModuleList(
            [
                SanaTransformerBlock(
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
                    dropout=dropout,
                    num_cross_attention_heads=num_cross_attention_heads,
                    cross_attention_head_dim=cross_attention_head_dim,
                    cross_attention_dim=cross_attention_dim,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                    mlp_ratio=mlp_ratio,
                    qk_norm=qk_norm,
                )
                for _ in range(num_layers)
            ]
        )

        # 4. Output blocks
        self.scale_shift_table = nn.Parameter(torch.randn(2, inner_dim) / inner_dim**0.5)
        self.norm_out = SanaModulatedNorm(inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

        self.gradient_checkpointing = False

    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model with
            indexed by its weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(
            name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]
        ):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the
                processor for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: torch.Tensor,
        guidance: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples: Optional[Tuple[torch.Tensor]] = None,
        return_dict: bool = True,
    ) -> Union[Tuple[torch.Tensor, ...], Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning("Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.")

        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension:
        # (1 = keep, 0 = discard) becomes (keep = +0, discard = -10000.0)
        if attention_mask is not None and attention_mask.ndim == 2:
            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
            attention_mask = attention_mask.unsqueeze(1)

        # convert encoder_attention_mask to a bias the same way we do for attention_mask
        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)

        # 1. Input
        batch_size, num_channels, height, width = hidden_states.shape
        p = self.config.patch_size
        post_patch_height, post_patch_width = height // p, width // p

        hidden_states = self.patch_embed(hidden_states)

        if guidance is not None:
            timestep, embedded_timestep = self.time_embed(
                timestep, guidance=guidance, hidden_dtype=hidden_states.dtype
            )
        else:
            timestep, embedded_timestep = self.time_embed(
                timestep, batch_size=batch_size, hidden_dtype=hidden_states.dtype
            )

        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])

        encoder_hidden_states = self.caption_norm(encoder_hidden_states)

        # 2. Transformer blocks
        if torch.is_grad_enabled() and self.gradient_checkpointing:
            for index_block, block in enumerate(self.transformer_blocks):
                hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    timestep,
                    post_patch_height,
                    post_patch_width,
                )
                if controlnet_block_samples is not None and 0 < index_block <= len(controlnet_block_samples):
                    hidden_states = hidden_states + controlnet_block_samples[index_block - 1]
        else:
            for index_block, block in enumerate(self.transformer_blocks):
                hidden_states = block(
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    timestep,
                    post_patch_height,
                    post_patch_width,
                )
                if controlnet_block_samples is not None and 0 < index_block <= len(controlnet_block_samples):
                    hidden_states = hidden_states + controlnet_block_samples[index_block - 1]

        # 3. Normalization
        hidden_states = self.norm_out(hidden_states, embedded_timestep, self.scale_shift_table)

        hidden_states = self.proj_out(hidden_states)

        # 4. Unpatchify
        hidden_states = hidden_states.reshape(
            batch_size, post_patch_height, post_patch_width, self.config.patch_size, self.config.patch_size, -1
        )
        hidden_states = hidden_states.permute(0, 5, 1, 3, 2, 4)
        output = hidden_states.reshape(batch_size, -1, post_patch_height * p, post_patch_width * p)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
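
if __name__ == "__main__":
    # Minimal forward-pass sketch with a deliberately tiny, made-up configuration
    # (the released Sana checkpoints use the much larger defaults documented above).
    model = SanaTransformer2DModel(
        in_channels=4,
        out_channels=4,
        num_attention_heads=2,
        attention_head_dim=8,
        num_layers=2,
        num_cross_attention_heads=2,
        cross_attention_head_dim=8,
        cross_attention_dim=16,  # matches inner_dim = num_attention_heads * attention_head_dim
        caption_channels=8,
        sample_size=8,
    )
    latents = torch.randn(1, 4, 8, 8)  # packed image latents (batch, channels, height, width)
    captions = torch.randn(1, 10, 8)  # text-encoder hidden states, last dim = caption_channels
    timestep = torch.tensor([500.0])  # one diffusion timestep per batch element
    sample = model(latents, captions, timestep).sample
    print(sample.shape)  # torch.Size([1, 4, 8, 8]); patch_size=1 preserves the latent grid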