
    +h;V                        S SK r S SKJrJrJrJrJrJr  S SKrS SK	J
r
  S SKJ
s  Jr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJr  SSKJrJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)J*r*  \RV                  " \,5      r- " S S\
R\                  5      r/ " S S5      r0 " S S\
R\                  5      r1 " S S\
R\                  5      r2 " S S\&\\\5      r3g)    N)AnyDictListOptionalTupleUnion   )ConfigMixinregister_to_config)PeftAdapterMixin)FromOriginalModelMixin)USE_PEFT_BACKENDloggingscale_lora_layersunscale_lora_layers   )LuminaFeedForward)	Attention)TimestepEmbedding	Timestepsapply_rotary_embget_1d_rotary_pos_embed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                      ^  \ rS rSr    SS\S\S\S\SS4
U 4S jjjrS	\R                  S
\R                  S\R                  S\	\R                  \R                  4   4S jr
SrU =r$ )'Lumina2CombinedTimestepCaptionEmbedding%   hidden_sizecap_feat_dimfrequency_embedding_sizenorm_epsreturnNc           	         > [         TU ]  5         [        USSS9U l        [	        U[        US5      S9U l        [        R                  " [        X$S9[        R                  " X!SS95      U l        g )NTg        )num_channelsflip_sin_to_cosdownscale_freq_shift   )in_channelstime_embed_dimeps)bias)super__init__r   	time_projr   mintimestep_embeddernn
Sequentialr   Linearcaption_embedder)selfr!   r"   r#   r$   	__class__s        k/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_lumina2.pyr1   0Lumina2CombinedTimestepCaptionEmbedding.__init__&   si     	"14^a
 "30[RVAW"
 !#L/<[_1`!
    hidden_statestimestepencoder_hidden_statesc                     U R                  U5      R                  U5      nU R                  U5      nU R                  U5      nXV4$ N)r2   type_asr4   r8   )r9   r>   r?   r@   timestep_proj
time_embedcaption_embeds          r;   forward/Lumina2CombinedTimestepCaptionEmbedding.forward;   sG     x088G++M:
--.CD((r=   )r8   r2   r4   )i   i      h㈵>)__name__
__module____qualname____firstlineno__intfloatr1   torchTensorr   rG   __static_attributes____classcell__r:   s   @r;   r   r   %   s       (+

 
 #&	

 
 

 
*)"\\)5:\\)Z_ZfZf)	u||U\\)	*) )r=   r   c                       \ rS rSrSrS r   SS\S\R                  S\R                  S\	\R                     S	\	\R                     S
\	\
   S\R                  4S jjrSrg)Lumina2AttnProcessor2_0D   z
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
used in the Lumina2Transformer2DModel model. It applies normalization and RoPE on query and key vectors.
c                 D    [        [        S5      (       d  [        S5      eg )Nscaled_dot_product_attentionzPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)r9   s    r;   r1    Lumina2AttnProcessor2_0.__init__J   s!    q899pqq :r=   Nattnr>   r@   attention_maskimage_rotary_embbase_sequence_lengthr%   c                    UR                   u  pxn	UR                  U5      n
UR                  U5      nUR                  U5      nU
R                   S   nUR                   S   nXR                  -  nU
R
                  nX-  nU
R                  USUR                  U5      n
UR                  USUU5      nUR                  USUU5      nUR                  b  UR                  U
5      n
UR                  b  UR                  U5      nUb  [        XSS9n
[        XSS9nU
R                  U5      UR                  U5      pUb8  [        R                  " [        R                  " X5      5      UR                  -  nOUR                  nUR                  U-  nUS:  ah  UR                  S5      R!                  SSSUS5      R#                  SS5      nUR                  S5      R!                  SSSUS5      R#                  SS5      nUb"  UR%                  5       R                  USSS5      nU
R'                  SS5      n
UR'                  SS5      nUR'                  SS5      n[(        R*                  " XXUS9nUR'                  SS5      R-                  USUR                  U-  5      nUR/                  U
5      nUR0                  S   " U5      nUR0                  S   " U5      nU$ )	NF)use_real   r	   r   )	attn_maskscaler   )shapeto_qto_kto_vheadsdtypeviewnorm_qnorm_kr   tomathsqrtlogrh   	unsqueezerepeatflattenbool	transposer\   rZ   reshaperC   to_out)r9   r_   r>   r@   r`   ra   rb   
batch_sizesequence_length_querykeyvalue	query_dim	inner_dimhead_dimrn   kv_headssoftmax_scalen_reps                       r;   __call__ Lumina2AttnProcessor2_0.__call__N   s    *7)<)<&
Q 		-(ii-.		/0KKO	IIbM	

* (

:r4::x@hhz2x:

:r8X> ;;"KK&E;;"++c"C '$UuME"35ICXXe_cffUms  + IIdhh&UVY]YcYccM JJM 

h&A:--"))!Q5!<DDQJCOOA&--aAua@HHANE %+00277
Aq"MN1%mmAq!1%66}
 &//15==j"djj[cNcd%--e4 A}5A}5r=    )NNN)rK   rL   rM   rN   __doc__r1   r   rQ   rR   r   rO   r   rS   r   r=   r;   rW   rW   D   s    
r 2637.2GG ||G  %||	G
 !.G #5<<0G 'smG 
G Gr=   rW   c                      ^  \ rS rSr SS\S\S\S\S\S\S\S	S
4U 4S jjjr SS\R                  S\R                  S\R                  S\
\R                     S	\R                  4
S jjrSrU =r$ )Lumina2TransformerBlock   dimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multiplierr$   
modulationr%   Nc                 J  > [         TU ]  5         X-  U l        Xpl        [	        US X-  SUUSSS[        5       S9
U l        [        USU-  UUS9U l        U(       a  [        UUSS9U l
        O[        XS	9U l
        [        XS	9U l        [        XS	9U l        [        XS	9U l        g )
Nrms_normrJ   F)
r   cross_attention_dimdim_headqk_normrm   r   r.   r/   out_bias	processor   )r   r   r   r   T)embedding_dimr$   norm_elementwise_affiner-   )r0   r1   r   r   r   rW   r_   r   feed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2)	r9   r   r   r   r   r   r$   r   r:   s	           r;   r1    Lumina2TransformerBlock.__init__   s     	2$ $/%!-/
	 .#g#1	
 *!!(,DJ !3DJ 3S/
 3r=   r>   r`   ra   tembc                    U R                   (       a  U R                  X5      u  pVpxU R                  UUUUS9n	XR                  S5      R	                  5       U R                  U	5      -  -   nU R                  U R                  U5      SUR                  S5      -   -  5      n
XR                  S5      R	                  5       U R                  U
5      -  -   nU$ U R                  U5      nU R                  UUUUS9n	XR                  U	5      -   nU R                  U R                  U5      5      n
XR                  U
5      -   nU$ )N)r>   r@   r`   ra   rf   )	r   r   r_   rv   tanhr   r   r   r   )r9   r>   r`   ra   r   norm_hidden_statesgate_msa	scale_mlpgate_mlpattn_output
mlp_outputs              r;   rG   Lumina2TransformerBlock.forward   sA    ??@D

=@_=)))0&8-!1	 $ K *,>,>q,A,F,F,H4::VaKb,bbM**4>>-+HAPYPcPcdePfLf+ghJ),>,>q,A,F,F,H4>>ZdKe,eeM  "&M!:))0&8-!1	 $ K *JJ{,CCM**4>>-+HIJ)NN:,FFMr=   )r_   r   r   r   r   r   r   r   )TrB   )rK   rL   rM   rN   rO   rP   ry   r1   rQ   rR   r   rG   rS   rT   rU   s   @r;   r   r      s      -4-4 !-4 	-4
 -4 "-4 -4 -4 
-4 -4h (,||   ,,	
 u||$ 
 r=   r   c            	          ^  \ rS rSrSS\S\\   S\\   S\4U 4S jjjrS\\   S\\   S\S\\R                     4S jr	S	\R                  S\R                  4S
 jr
S\R                  S\R                  4S jrSrU =r$ )Lumina2RotaryPosEmbed   thetaaxes_dim	axes_lens
patch_sizec                    > [         TU ]  5         Xl        X l        X0l        X@l        U R                  X#U5      U l        g rB   )r0   r1   r   r   r   r   _precompute_freqs_cis	freqs_cis)r9   r   r   r   r   r:   s        r;   r1   Lumina2RotaryPosEmbed.__init__   s6    
 "$33HOr=   r%   c                 0   / n[         R                  R                  R                  5       (       a  [         R                  O[         R
                  n[        [        X5      5       H.  u  nu  px[        XxU R                  US9n	UR                  U	5        M0     U$ )N)r   freqs_dtype)rQ   backendsmpsis_availablefloat32float64	enumeratezipr   r   append)
r9   r   r   r   r   r   ideembs
             r;   r   +Lumina2RotaryPosEmbed._precompute_freqs_cis   sp    	',~~'9'9'F'F'H'Hemmemm"3x#;<IAv)!djjkZCS! = r=   idsc           
         UR                   nUR                   R                  S:X  a  UR                  S5      n/ n[        [	        U R
                  5      5       H  nU R                  U   R                  UR                   5      nUS S 2S S 2XDS-   24   R                  SSUR                  S   5      R                  [        R                  5      nUR                  [        R                  " UR                  S5      R                  UR                  S   SS5      SUS95        M     [        R                  " USS9R                  U5      $ )Nr   cpurf   rd   r   )r   indexr   )devicetyperr   rangelenr   r   rw   ri   rQ   int64r   gatherrv   cat)r9   r   r   resultr   freqsr   s          r;   _get_freqs_cis$Lumina2RotaryPosEmbed._get_freqs_cis   s    ::??e#&&-Cs4==)*ANN1%((4E1aa%i(//1ekk"oFII%++VEMM%,,uq'9'@'@QQRTU'V\]ejkl + yyR(++F33r=   r>   r`   c                 f   UR                   u  p4pVU R                  nXW-  Xg-  pX-  n
UR                  nUR                   S   nUR                  SS9R	                  5       nU Vs/ s H  oU
-   PM	     nn[        U5      n[        R                  " UUS[        R                  US9n[        [        X5      5       H  u  nu  nn[        R                  " U[        R                  US9UUS U2S4'   UUUUU2S4'   [        R                  " U[        R                  US9R                  SS5      R                  SU	5      R                  5       n[        R                  " U	[        R                  US9R                  SS5      R                  US5      R                  5       nUUUUU2S4'   UUUUU2S4'   M     U R                  U5      n[        R                  " X<UR                   S   UUR                   S9n[        R                  " X:UR                   S   UUR                   S9n[        [        X5      5       H)  u  nu  nnUUS U24   UUS U24'   UUUU24   UUS U
24'   M+     UR                  X4XX5      R#                  SSS	SS
S5      R                  S5      R                  SS5      nUUUUX4$ s  snf )Nrf   r   r	   )rn   r   r   rd   r   )r   rn   r      )ri   r   r   sumtolistmaxrQ   zerosint32r   r   arangero   rw   rx   r   rn   permute)r9   r>   r`   r}   channelsheightwidthppost_patch_heightpost_patch_widthimage_seq_lenr   encoder_seq_lenl_effective_cap_lencap_seq_lenseq_lengthsmax_seq_lenposition_idsr   seq_lenrow_idscol_idsr   cap_freqs_cisimg_freqs_ciss                            r;   rG   Lumina2RotaryPosEmbed.forward  s   .;.A.A+
fOO.4k5:+)<%%(..q1,00Q07>>@FYZFY{]2FYZ+& {{:{AU[[Y_`)237J3X)Y%A%W/4||Ku{{ci/jLL[L!+,6ALK/23 .ekk&Qb!+,	  -U[[Pa)1-	  7>LK/236=LK/23' *Z, ''5	 )<VS\SbSb
 yr':6QZQ`Q`
 *337J3X)Y%A%W-6q,;,-GM!\k\/*/8K<O9O/PM!^m^+, *Z z5FK[_WQ1aA&WQZWQ]	 	 m]IGZgge [s   #J.)r   r   r   r   r   )i,     r   r   )rK   rL   rM   rN   rO   r   r1   rQ   rR   r   r   rG   rS   rT   rU   s   @r;   r   r      s    Pc PT#Y P49 Plo P Pd3i DI VY ^bchcoco^p 
4%,, 
45<< 
4;hU\\ ;h5<< ;h ;hr=   r   c            $         ^  \ rS rSrSrSrS/rSS/r\                S"S\	S	\	S
\	S\
\	   S\	S\	S\	S\	S\	S\	S\
\   S\S\S\\	\	\	4   S\\	\	\	4   S\	SS4"U 4S jjj5       r  S#S\R                  S\R                  S\R                  S\R                  S\
\\\4      S\S\\R                  \4   4S  jjrS!rU =r$ )$Lumina2Transformer2DModeliE  a<  
Lumina2NextDiT: Diffusion model with a Transformer backbone.

Parameters:
    sample_size (`int`): The width of the latent images. This is fixed during training since
        it is used to learn a number of position embeddings.
    patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
        The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
    in_channels (`int`, *optional*, defaults to 4):
        The number of input channels for the model. Typically, this matches the number of channels in the input
        images.
    hidden_size (`int`, *optional*, defaults to 4096):
        The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
        hidden representations.
    num_layers (`int`, *optional*, default to 32):
        The number of layers in the model. This defines the depth of the neural network.
    num_attention_heads (`int`, *optional*, defaults to 32):
        The number of attention heads in each attention layer. This parameter specifies how many separate attention
        mechanisms are used.
    num_kv_heads (`int`, *optional*, defaults to 8):
        The number of key-value heads in the attention mechanism, if different from the number of attention heads.
        If None, it defaults to num_attention_heads.
    multiple_of (`int`, *optional*, defaults to 256):
        A factor that the hidden size should be a multiple of. This can help optimize certain hardware
        configurations.
    ffn_dim_multiplier (`float`, *optional*):
        A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
        the model configuration.
    norm_eps (`float`, *optional*, defaults to 1e-5):
        A small value added to the denominator for numerical stability in normalization layers.
    scaling_factor (`float`, *optional*, defaults to 1.0):
        A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
        overall scale of the model's operations.
Tr   
x_embeddernormNsample_sizer   r+   out_channelsr!   
num_layersnum_refiner_layersr   r   r   r   r$   scaling_factoraxes_dim_roper   r"   r%   c                   > [         TU ]  5         U=(       d    UU l        [        SXUS9U l        [
        R                  " X"-  U-  US9U l        [        UUUS9U l	        [
        R                  " [        U5       Vs/ s H  n[        UUU	U
UUSS9PM     sn5      U l        [
        R                  " [        U5       Vs/ s H  n[        UUU	U
UUSS9PM     sn5      U l        [
        R                  " [        U5       Vs/ s H  n[        UUU	U
UUSS9PM     sn5      U l        [!        U[#        US5      SS	SX"-  U R                  -  S
9U l        SU l        g s  snf s  snf s  snf )Ni'  )r   r   r   r   )in_featuresout_features)r!   r"   r$   T)r   Fr*   gư>)r   conditioning_embedding_dimelementwise_affiner.   r/   out_dim)r0   r1   r   r   rope_embedderr5   r7   r   r   time_caption_embed
ModuleListr   r   noise_refinercontext_refinerlayersr   r3   norm_outgradient_checkpointing)r9   r   r   r+   r   r!   r   r   r   r   r   r   r$   r   r  r   r"   r   r:   s                     r;   r1   "Lumina2Transformer2DModel.__init__m  s   ( 	(7K 3-Q[
 ))
0G+0Udop"I#,#

  ]] 12 3A (' &# 3
  "}} 12 3A (' &$ 3 
  mm z* +A (' &# +
  2%'*;'=$+d.?.??
 ',#o s   <E>E E#r>   r?   r@   encoder_attention_maskattention_kwargsreturn_dictc           
         Ub#  UR                  5       nUR                  SS5      nOSn[        (       a  [        X5        O+Ub(  UR	                  SS 5      b  [
        R                  S5        UR                  u  ppU R                  XU5      u  pU R                  X5      u  nnnnnnU R                  U5      nU R                   H  nU" X4U5      nM     U R                   H  nU" US X5      nM     [        U5      n[        [        U5      5      S:  nUR!                  UU["        R$                  S9nUR!                  UUU R&                  R(                  5      n[+        [-        UU5      5       H.  u  nu  nnSUUS U24'   UUS U24   UUS U24'   UU   UUUU24'   M0     UnU R.                   H`  n["        R0                  " 5       (       a0  U R2                  (       a  U R5                  UUU(       a  UOS X5      nMM  U" UU(       a  UOS X5      nMb     U R7                  X5      nU R&                  R8                  n/ n[+        [-        UU5      5       Hv  u  nu  nnUR;                  UU   UU R=                  U
U-  UU-  UUU R>                  5      RA                  SSS	SS
5      RC                  S
S5      RC                  SS	5      5        Mx     ["        RD                  " USS9n[        (       a  [G        X5        U(       d  U4$ [I        US9$ )Nrh         ?zVPassing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective.rf   )rn   Tr   r   r   r	   r   )sample)%copypopr   r   getloggerwarningri   r	  r  r   r  r  r   r   set	new_zerosrQ   ry   configr!   r   r   r  is_grad_enabledr  _gradient_checkpointing_funcr  r   r   ro   r   r   rx   stackr   r   )r9   r>   r?   r@   r  r  r  
lora_scaler}   r   r   r   r   context_rotary_embnoise_rotary_emb
rotary_embencoder_seq_lengthsr   layerr   use_maskr`   joint_hidden_statesr   r   r   r   outputs                               r;   rG   !Lumina2Transformer2DModel.forward  s    '/446)--gs;JJd/+0@0D0DWd0S0_l
 (5':':$
v&*&=&=mWl&m# }E	
 6 ))E$)*?Yk$l! * ''E!-7GNM ( +&s;'(1,&00[PUPZPZ0[+55j+t{{OfOfg-6s;NP[7\-])A)*.N1hwh;'7LQP`Q`P`M`7a#3O#3 34>KA>N?7#: :; .^
 ,[[E$$&&4+F+F $ A A=H.$PZ! !&mx^UY[e l ! m: KK""-6s;NP[7\-])A)MMa 9fk5A:q!T5F5FGAq!Q'AA .^ V+19'v66r=   )	r  r  r  r  r  r   r  r	  r   )   r      Ni 	     r         rI   NrJ   r  )    r1  r1  r   r*   )NT)rK   rL   rM   rN   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rO   r   rP   r   r1   rQ   rR   r   strr   ry   r   r   rG   rS   rT   rU   s   @r;   r   r   E  s   !F (,$23(4f'=$ &*"##%.2 #.:*9 #Z,Z, Z, 	Z,
 smZ, Z, Z,  Z, !Z, Z, Z, %UOZ, Z, Z, S#s]+Z,  c3'!Z," #Z,$ 
%Z, Z,D 6: Z7||Z7 ,,Z7  %||	Z7
 !&Z7 #4S>2Z7 Z7 
u||55	6Z7 Z7r=   r   )4rs   typingr   r   r   r   r   r   rQ   torch.nnr5   torch.nn.functional
functionalr\   configuration_utilsr
   r   loadersr   loaders.single_file_modelr   utilsr   r   r   r   	attentionr   attention_processorr   
embeddingsr   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerrK   r  Moduler   rW   r   r   r   r   r=   r;   <module>rF     s     : :     B ' ? V V ) + ` ` 7 ' Q Q 
		H	%)bii )>Q QhNbii NbYhBII Yhx_7
K9IKa _7r=   