
    +h8                        S SK JrJrJr  S SKrS SKJr  SSKJrJ	r	  SSK
Jr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJr  \R8                  " \5      r " S S\R>                  5      r  " S S\\5      r!g)    )AnyDictOptionalN   )ConfigMixinregister_to_config)logging   )LuminaFeedForward)	AttentionLuminaAttnProcessor2_0)&LuminaCombinedTimestepCaptionEmbeddingLuminaPatchEmbed)Transformer2DModelOutput)
ModelMixin)LuminaLayerNormContinuousLuminaRMSNormZeroRMSNormc                     ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
\S\SS4U 4S jjjr SS\	R                  S\	R                  S\	R                  S\	R                  S\	R                  S\	R                  S\\\\4      4S jjrSrU =r$ )LuminaNextDiTBlock$   a  
A LuminaNextDiTBlock for LuminaNextDiT2DModel.

Parameters:
    dim (`int`): Embedding dimension of the input features.
    num_attention_heads (`int`): Number of attention heads.
    num_kv_heads (`int`):
        Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
    multiple_of (`int`): The number of multiple of ffn layer.
    ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
    norm_eps (`float`): The eps for norm layer.
    qk_norm (`bool`): normalization for query and key.
    cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
    norm_elementwise_affine (`bool`, *optional*, defaults to True),
dimnum_attention_headsnum_kv_headsmultiple_offfn_dim_multipliernorm_epsqk_normcross_attention_dimnorm_elementwise_affinereturnNc
                 \  > [         T
U ]  5         X-  U l        [        R                  " [
        R                  " U/5      5      U l        [        US X-  U(       a  SOS UUSSS[        5       S9
U l
        [        R                  " 5       U R                  l        [        UUX-  U(       a  SOS UUSSS[        5       S9
U l        [        U[        SU-  S-  5      UUS9U l        [#        UUU	S9U l        ['        XU	S	9U l        ['        XU	S	9U l        ['        XU	S	9U l        ['        XU	S	9U l        g )
Nlayer_norm_across_headsh㈵>F)
	query_dimr   dim_headr   headskv_headsepsbiasout_bias	processor   r   )r   	inner_dimr   r   )embedding_dimr   r    )r)   elementwise_affine)super__init__head_dimnn	Parametertorchzerosgater   r   attn1Identityto_outattn2r   intfeed_forwardr   norm1r   	ffn_norm1norm2	ffn_norm2norm1_context)selfr   r   r   r   r   r   r   r   r    	__class__s             h/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/lumina_nextdit2d.pyr2   LuminaNextDiTBlock.__init__5   s8    	2LL.A-B!CD	  $/18-d%!,.

 KKM

  3/18-d%!,.

 .%#+/*#1	
 '$;


 !G^_SCZ[
 G^_$%8[rs    hidden_statesattention_maskimage_rotary_embencoder_hidden_statesencoder_masktembcross_attention_kwargsc           	         UnU R                  X5      u  ppU R                  " SU	U	UUUS.UD6nU R                  U5      nU R                  " SU	UUUSS.UD6nXR                  R                  5       R                  SSSS5      -  nX-   nUR                  S5      nU R                  R                  S   " U5      nXR                  S5      R                  5       U R                  U5      -  -   nU R                  U R                  U5      SUR                  S5      -   -  5      nXR                  S5      R                  5       U R                  U5      -  -   nU$ )a  
Perform a forward pass through the LuminaNextDiTBlock.

Parameters:
    hidden_states (`torch.Tensor`): The input of hidden_states for LuminaNextDiTBlock.
    attention_mask (`torch.Tensor): The input of hidden_states corresponse attention mask.
    image_rotary_emb (`torch.Tensor`): Precomputed cosine and sine frequencies.
    encoder_hidden_states: (`torch.Tensor`): The hidden_states of text prompt are processed by Gemma encoder.
    encoder_mask (`torch.Tensor`): The hidden_states of text prompt attention mask.
    temb (`torch.Tensor`): Timestep embedding with text prompt embedding.
    cross_attention_kwargs (`Dict[str, Any]`): kwargs for cross attention.
)rI   rL   rJ   query_rotary_embkey_rotary_embN   r    )r?   r9   rC   r<   r8   tanhviewflattenr;   	unsqueezerA   r>   r@   rB   )rD   rI   rJ   rK   rL   rM   rN   rO   residualnorm_hidden_statesgate_msa	scale_mlpgate_mlpself_attn_outputnorm_encoder_hidden_statescross_attn_outputmixed_attn_output
mlp_outputs                     rF   forwardLuminaNextDiTBlock.forwardv   st   , ! =AJJ}<[9i:: 
,"4)-+
 %
 &*%7%78M%N" JJ 
,"<'-
 %
 .		0@0E0EaBPQ0RR,@-55b9

))!,->? #5#5a#8#=#=#?$**]B[#[[&&t~~m'DIL_L_`aLbHb'cd
%(:(:1(=(B(B(Dt~~V`Ga(aarH   )
r9   r<   r>   r@   rB   r8   r3   r?   rC   rA   )T)N)__name__
__module____qualname____firstlineno____doc__r=   floatboolr2   r6   Tensorr   r   strr   re   __static_attributes____classcell__rE   s   @rF   r   r   $   s    4 )-?t?t !?t 	?t
 ?t "?t ?t ?t !?t "&?t 
?t ?tR <@9||9 9  ,,	9
  %||9 ll9 ll9 !)c3h 89 9rH   r   c                      ^  \ rS rSrSr/ SQr\              SS\S\\   S\\   S\\   S	\\   S
\\   S\\   S\\   S\\	   S\\	   S\\
   S\\
   S\\   S\\	   SS4U 4S jjj5       r  SS\R                  S\R                  S\R                  S\R                  S\R                  S\\\4   S\R                  4S jjrSrU =r$ )LuminaNextDiT2DModel   a
  
LuminaNextDiT: Diffusion model with a Transformer backbone.

Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

Parameters:
    sample_size (`int`): The width of the latent images. This is fixed during training since
        it is used to learn a number of position embeddings.
    patch_size (`int`, *optional*, (`int`, *optional*, defaults to 2):
        The size of each patch in the image. This parameter defines the resolution of patches fed into the model.
    in_channels (`int`, *optional*, defaults to 4):
        The number of input channels for the model. Typically, this matches the number of channels in the input
        images.
    hidden_size (`int`, *optional*, defaults to 4096):
        The dimensionality of the hidden layers in the model. This parameter determines the width of the model's
        hidden representations.
    num_layers (`int`, *optional*, default to 32):
        The number of layers in the model. This defines the depth of the neural network.
    num_attention_heads (`int`, *optional*, defaults to 32):
        The number of attention heads in each attention layer. This parameter specifies how many separate attention
        mechanisms are used.
    num_kv_heads (`int`, *optional*, defaults to 8):
        The number of key-value heads in the attention mechanism, if different from the number of attention heads.
        If None, it defaults to num_attention_heads.
    multiple_of (`int`, *optional*, defaults to 256):
        A factor that the hidden size should be a multiple of. This can help optimize certain hardware
        configurations.
    ffn_dim_multiplier (`float`, *optional*):
        A multiplier for the dimensionality of the feed-forward network. If None, it uses a default value based on
        the model configuration.
    norm_eps (`float`, *optional*, defaults to 1e-5):
        A small value added to the denominator for numerical stability in normalization layers.
    learn_sigma (`bool`, *optional*, defaults to True):
        Whether the model should learn the sigma parameter, which might be related to uncertainty or variance in
        predictions.
    qk_norm (`bool`, *optional*, defaults to True):
        Indicates if the queries and keys in the attention mechanism should be normalized.
    cross_attention_dim (`int`, *optional*, defaults to 2048):
        The dimensionality of the text embeddings. This parameter defines the size of the text representations used
        in the model.
    scaling_factor (`float`, *optional*, defaults to 1.0):
        A scaling factor applied to certain parameters or layers in the model. This can be used for adjusting the
        overall scale of the model's operations.
)patch_embeddernormffn_normNsample_size
patch_sizein_channelshidden_size
num_layersr   r   r   r   r   learn_sigmar   r   scaling_factorr!   c                 b  > [         TU ]  5         Xl        X l        X0l        U(       a  US-  OUU l        X@l        X`l        XF-  U l        Xl	        [        X#USS9U l        [        R                  " [        R                  " U5      5      U l        [#        [%        US5      US9U l        [        R(                  " [+        U5       Vs/ s H  n[-        UUUUU	U
UU5      PM     sn5      U l        [1        U[%        US5      SSSX"-  U R
                  -  S9U l        XF-  S	-  S
:X  d   S5       eg s  snf )Nr
   T)rz   r{   	embed_dimr*   i   )r|   r   Fgư>)r/   conditioning_embedding_dimr0   r)   r*   out_dim   r   z+2d rope needs head dim to be divisible by 4)r1   r2   ry   rz   r{   out_channelsr|   r   r3   r   r   rv   r4   r5   r6   empty	pad_tokenr   mintime_caption_embed
ModuleListranger   layersr   norm_out)rD   ry   rz   r{   r|   r}   r   r   r   r   r   r~   r   r   r   _rE   s                   rF   r2   LuminaNextDiT2DModel.__init__   sD   $ 	&$&/:K!O&#6 #:,.!kX\
 ekk+&>?"HK.DW#
 mm z* +A #' &'	 +
 2%'*;'=$+d.?.??
 2a71<k>kk<1s   D,rI   timesteprL   rM   rK   rO   c                 J   U R                  X5      u  ppUR                  UR                  5      nU R                  X#U5      n
UR	                  5       nU R
                   H  nU" UUUUUU
US9nM     U R                  X5      nU R                  =pU	S   u  pUR                  S5      nX-  X-  -  nUSS2SU24   R                  UX-  X-  XU R                  5      nUR                  SSSSSS5      R                  SS5      R                  SS5      nU(       d  U4$ [        US	9$ )
a]  
Forward pass of LuminaNextDiT.

Parameters:
    hidden_states (torch.Tensor): Input tensor of shape (N, C, H, W).
    timestep (torch.Tensor): Tensor of diffusion timesteps of shape (N,).
    encoder_hidden_states (torch.Tensor): Tensor of caption features of shape (N, D).
    encoder_mask (torch.Tensor): Tensor of caption masks of shape (N, L).
)rN   rO   r   N   rS   r   r
   r   )sample)rv   todevicer   rm   r   r   rz   sizerX   r   permuterY   r   )rD   rI   r   rL   rM   rK   rO   return_dictmaskimg_sizerN   layerheight_tokenswidth_tokensheightwidth
batch_sizesequence_lengthoutputs                      rF   re   LuminaNextDiT2DModel.forward#  sK   & ;?:M:Mm:n7X+..}/C/CD&&xU#((*[[E! %'=M ! m: (,6 "''*
!2u7LM%a)9/)9&9:??/1Feievev
 &&q!Q1a8@@AFNNqRST9'v66rH   )r3   r|   r{   r   r   r   r   r   rv   rz   ry   r   r   )   r
   r   i 	      r   N   Nr$   TTi   g      ?)NT)rg   rh   ri   rj   rk    _skip_layerwise_casting_patternsr   r=   r   rl   rm   r2   r6   rn   r   ro   r   re   rp   rq   rr   s   @rF   rt   rt      s   +Z (N$ $%%&%)$&-/&*%(.2$(&*"&-1*->l>l SM>l c]	>l
 c]>l SM>l &c]>l sm>l c]>l %UO>l 5/>l d^>l $>l &c]>l !>l  
!>l >lN 2637||37 ,,37  %||	37
 ll37  ,,37 !%S#X37 
37 37rH   rt   )"typingr   r   r   r6   torch.nnr4   configuration_utilsr   r   utilsr	   	attentionr   attention_processorr   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r   r   
get_loggerrg   loggerModuler   rt   rV   rH   rF   <module>r      sg    ' &   B  ) C 8 ' Q Q 
		H	%K K\d7:{ d7rH   