
    +h=N                        S SK r S SKJrJrJrJrJr  S SKrS SKJ	r	  S SK
J	s  Jr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJr  \R>                  " \ 5      r! " S S\	RD                  5      r# " S S\	RD                  5      r$ " S S\	RD                  5      r% " S S5      r& " S S\	RD                  5      r' " S S\\5      r(g)    N)DictListOptionalTupleUnion   )ConfigMixinregister_to_config)logging   )	Attention)TimestepEmbedding	Timestepsget_2d_sincos_pos_embed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormRMSNormc                   n   ^  \ rS rSrS\S\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	OmniGenFeedForward"   hidden_sizeintermediate_sizec                    > [         TU ]  5         [        R                  " USU-  SS9U l        [        R                  " X!SS9U l        [        R                  " 5       U l        g )Nr   Fbias)super__init__nnLineargate_up_proj	down_projSiLUactivation_fn)selfr   r   	__class__s      k/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_omnigen.pyr   OmniGenFeedForward.__init__#   sJ    IIk17H3HuU#4NWWY    hidden_statesreturnc                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )Nr   dim)r!   chunkr$   r"   )r%   r*   	up_statesgates       r'   forwardOmniGenFeedForward.forward*   sH    %%m4	#//!/4 2 24 88	~~i((r)   )r$   r"   r!   )__name__
__module____qualname____firstlineno__intr   torchTensorr3   __static_attributes____classcell__r&   s   @r'   r   r   "   s6    'C 'C ')U\\ )ell ) )r)   r   c                      ^  \ rS rSr       SS\S\S\S\S\S\S\4U 4S	 jjjrS
 rS\	R                  S\S\	R                  4S jr SS\	R                  S\S\	R                  S\	R                  4S jjrSrU =r$ )OmniGenPatchEmbed1   
patch_sizein_channels	embed_dimr   interpolation_scalepos_embed_max_size	base_sizec                 T  > [         T	U ]  5         [        R                  " X#X4XS9U l        [        R                  " X#X4XS9U l        Xl        XPl        X`l        [        UU R                  UU R                  SS9nU R                  SUR                  5       R                  S5      SS9  g )N)kernel_sizestrider   pt)rG   rE   output_type	pos_embedr   T)
persistent)r   r   r   Conv2doutput_image_projinput_image_projrB   rE   rF   r   register_bufferfloat	unsqueeze)
r%   rB   rC   rD   r   rE   rF   rG   rM   r&   s
            r'   r   OmniGenPatchEmbed.__init__2   s     	!#0HQ["
 !#		0HQ[!
 %#6 "4+## $ 8 8
	 	[)//*;*E*Ea*HUYZr)   c                 0   U R                   c  [        S5      eXR                  -  nX R                  -  nXR                   :  a  [        SU SU R                    S35      eX R                   :  a  [        SU SU R                    S35      eU R                   U-
  S-  nU R                   U-
  S-  nU R                  R	                  SU R                   U R                   S	5      nUSS2X3U-   2XDU-   2SS24   nUR	                  SS	UR
                  S	   5      nU$ )
z2Crops positional embeddings for SD3 compatibility.Nz.`pos_embed_max_size` must be set for cropping.zHeight (z/) cannot be greater than `pos_embed_max_size`: .zWidth (r      r-   )rF   
ValueErrorrB   rM   reshapeshape)r%   heightwidthtopleftspatial_pos_embeds         r'   _cropped_pos_embed$OmniGenPatchEmbed._cropped_pos_embedR   s>   ""*MNN??*(+++6("QRVRiRiQjjkl  ***% OPTPgPgOhhij  &&/A5''%/A5 NN221d6M6MtOfOfhjk-aV|1CTSXLEXZ[.[\-55a=N=T=TUW=XY  r)   r*   is_input_imager+   c                     U(       a  U R                  U5      nOU R                  U5      nUR                  S5      R                  SS5      nU$ )Nr   rX   )rQ   rP   flatten	transpose)r%   r*   rc   s      r'   _patch_embeddings#OmniGenPatchEmbed._patch_embeddingsi   sH     11-@M 22=AM%--a0::1a@r)   padding_latentc                    [        U[        5      (       a  Uc  S /[        U5      -  n/ n[        X5       H  u  pVUR                  SS  u  pxU R                  XR5      nU R                  Xx5      n	XY-   nUb/  [        R                  " XVR                  UR                  5      /SS9nUR                  U5        M     U$ UR                  SS  u  pxU R                  Xx5      n	U R                  X5      nX-   nU$ )Nr.   )
isinstancelistlenzipr[   rg   ra   r:   cattodeviceappend)
r%   r*   rc   ri   patched_latents
sub_latentpaddingr\   r]   rM   s
             r'   r3   OmniGenPatchEmbed.forwardq   s     mT**%"&#m*<!< O'*='I#
 * 0 0 5!33JO
 33FB	'3
&!&J

:CTCT8U+V\^!_J&&z2 (J  *//4MF//>I 22=QM+7Or)   )rQ   rE   rP   rB   rF   )r      i   TrX      @   )N)r5   r6   r7   r8   r9   boolrS   r   ra   r:   r;   rg   r3   r<   r=   r>   s   @r'   r@   r@   1   s     %&"%[[ [ 	[
 [ #[  [ [ [@!.u|| T V[VbVb  ae"\\;?QVQ]Q]	 r)   r@   c                   4   ^  \ rS rSr SU 4S jjrS rSrU =r$ )OmniGenSuScaledRotaryEmbedding   c           	      P  > [         TU ]  5         Xl        X l        X@l        SU R                  [
        R                  " SU R                  S[
        R                  S9R                  5       U R                  -  -  -  nU R                  SUSS9  US   U l
        US	   U l        X0l        g )
N      ?r   r   )dtypeinv_freqF)tensorrN   short_factorlong_factor)r   r   r/   max_position_embeddingsbaser:   arangeint64rS   rR   r   r    original_max_position_embeddings)r%   r/   r   r   r   rope_scalingr   r&   s          r'   r   'OmniGenSuScaledRotaryEmbedding.__init__   s     	'>$	$))Q!5;;(W(](](_bfbjbj(jklZUK(8'60P-r)   c                 X   [         R                  " U5      S-   nX0R                  :  a9  [         R                  " U R                  [         R
                  UR                  S9nO8[         R                  " U R                  [         R
                  UR                  S9n[         R                  " SU R                  S[         R                  UR                  S9R                  5       U R                  -  nSX@R                  U-  -  -  U l        U R                  S S S 2S 4   R                  5       R                  UR                  S   SS5      nUS S 2S S S 24   R                  5       nUR                  R                   n[#        U[$        5      (       a  US:w  a  UOSn[         R&                  " US	S
9   UR                  5       UR                  5       -  R)                  SS5      n	[         R*                  " X4SS9S   n
U R,                  U R                  -  nUS::  a  SnON[.        R0                  " S[.        R2                  " U5      [.        R2                  " U R                  5      -  -   5      nU
R5                  5       U-  nU
R7                  5       U-  nS S S 5        X4$ ! , (       d  f       WW4$ = f)NrX   )r   rr   r   r   r   r-   mpscpuF)device_typeenabledr.   )r:   maxr   r   r   float32rr   r   r   r/   r   rS   r   r   expandr[   typerl   strautocastrf   rp   r   mathsqrtlogcossin)r%   r*   position_idsseq_lenext_factorsinv_freq_shapeinv_freq_expandedposition_ids_expandedr   freqsembscalescaling_factorr   r   s                  r'   r3   &OmniGenSuScaledRotaryEmbedding.forward   s)   ))L)A-:::,,t'7'7u}}UbUiUijK,,t'8'8VcVjVjkK LLDHHau{{=CWCWX^^`cgckckk 	 {YY-FFG MM$4-8>>@GGHZHZ[\H]_acde ,QaZ 8 > > @ $**//%/S%A%AkUZFZk`e^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3A6C0043X3XXE|!$!%1txx$JoJoAp/p+p!q'')n,C'')n,C D x DC Cxs   ,C!J
J))r   r/   r   r   r   r   r   )      '  N)r5   r6   r7   r8   r   r3   r<   r=   r>   s   @r'   r}   r}      s    swQ  r)   r}   c                       \ rS rSrSrS r  SS\S\R                  S\R                  S\	\R                     S	\	\R                     S
\R                  4S jjr
Srg)OmniGenAttnProcessor2_0   z
Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
used in the OmniGen model.
c                 D    [        [        S5      (       d  [        S5      eg )Nscaled_dot_product_attentionzPAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.)hasattrFImportError)r%   s    r'   r    OmniGenAttnProcessor2_0.__init__   s!    q899pqq :r)   Nattnr*   encoder_hidden_statesattention_maskimage_rotary_embr+   c                    UR                   u  pgnUR                  U5      n	UR                  U5      n
UR                  U5      nU	R	                  5       u  pnU
R                   S   nXR
                  -  nUU-  nU	R                  USUR
                  U5      R                  SS5      n	U
R                  USUU5      R                  SS5      n
UR                  USUU5      R                  SS5      nUb  SSKJ	n  U" XSS9n	U" XSS9n
[        R                  " XXS9nUR                  SS5      R                  U	5      nUR                  XUR                  5      nUR                  S   " U5      nU$ )	Nr-   rX   r   )apply_rotary_embrk   )use_real_unbind_dim)	attn_maskr   )r[   to_qto_kto_vsizeheadsviewrf   
embeddingsr   r   r   type_asrZ   out_dimto_out)r%   r   r*   r   r   r   
batch_sizesequence_length_querykeyvaluebszq_len	query_dim	inner_dimhead_dimkv_headsr   s                      r'   __call__ OmniGenAttnProcessor2_0.__call__   s_    *7)<)<&
Q 		-(ii-.		/0 %

IIIbM	

* (

:r4::x@JJ1aPhhz2x:DDQJ

:r8X>HHAN '5$URTUE"3bQC66u5c%//15==eD%--c$,,GA}5r)    )NN)r5   r6   r7   r8   __doc__r   r   r:   r;   r   r   r<   r   r)   r'   r   r      sw    
r 2637%% ||%  %||	%
 !.% #5<<0% 
% %r)   r   c                      ^  \ rS rSrS\S\S\S\S\SS4U 4S	 jjrS
\R                  S\R                  S\R                  S\R                  4S jr	Sr
U =r$ )OmniGenBlock   r   num_attention_headsnum_key_value_headsr   rms_norm_epsr+   Nc                    > [         TU ]  5         [        XS9U l        [	        UUX-  UUSUS[        5       S9	U l        [        XS9U l        [        X5      U l	        g )NepsF)	r   cross_attention_dimdim_headr   r   r   r   out_bias	processor)
r   r   r   input_layernormr   r   	self_attnpost_attention_layernormr   mlp)r%   r   r   r   r   r   r&   s         r'   r   OmniGenBlock.__init__   se     	&{E"! + 7%(-/

 )0(N%%kEr)   r*   r   r   c                     U R                  U5      nU R                  UUUUS9nX-   nU R                  U5      nU R                  U5      nX-   nU$ )N)r*   r   r   r   )r   r   r   r   )r%   r*   r   r   norm_hidden_statesattn_output	ff_outputs          r'   r3   OmniGenBlock.forward  sn     "11-@nn,"4)-	 % 
 &3 "::=IHH/0	%1r)   )r   r   r   r   )r5   r6   r7   r8   r9   rS   r   r:   r;   r3   r<   r=   r>   s   @r'   r   r      s    FF !F !	F
 F F 
F2"\\;@<<[`[g[g	 r)   r   c            (         ^  \ rS rSrSrSrS/r/ SQr\                   S&S\	S\	S\	S	\
S
\	S\	S\	S\	S\	S\	S\	S\	S\	S\S\	S\	S\S\	S\4&U 4S jjj5       rS\R                   S\\R                      S\S\\R                      4S jr S'S\R                   S \\	\
\R*                  4   S\R                   S\\R                      S\\	\\	   4   S!\R                   S"\R                   S#\S\\\\R                      4   4S$ jjrS%rU =r$ )(OmniGenTransformer2DModeli  a  
The Transformer model introduced in OmniGen (https://huggingface.co/papers/2409.11340).

Parameters:
    in_channels (`int`, defaults to `4`):
        The number of channels in the input.
    patch_size (`int`, defaults to `2`):
        The size of the spatial patches to use in the patch embedding layer.
    hidden_size (`int`, defaults to `3072`):
        The dimensionality of the hidden layers in the model.
    rms_norm_eps (`float`, defaults to `1e-5`):
        Eps for RMSNorm layer.
    num_attention_heads (`int`, defaults to `32`):
        The number of heads to use for multi-head attention.
    num_key_value_heads (`int`, defaults to `32`):
        The number of heads to use for keys and values in multi-head attention.
    intermediate_size (`int`, defaults to `8192`):
        Dimension of the hidden layer in FeedForward layers.
    num_layers (`int`, default to `32`):
        The number of layers of transformer blocks to use.
    pad_token_id (`int`, default to `32000`):
        The id of the padding token.
    vocab_size (`int`, default to `32064`):
        The size of the vocabulary of the embedding vocabulary.
    rope_base (`int`, default to `10000`):
        The default theta value to use when creating RoPE.
    rope_scaling (`Dict`, optional):
        The scaling factors for the RoPE. Must contain `short_factor` and `long_factor`.
    pos_embed_max_size (`int`, default to `192`):
        The maximum size of the positional embeddings.
    time_step_dim (`int`, default to `256`):
        Output dimension of timestep embeddings.
    flip_sin_to_cos (`bool`, default to `True`):
        Whether to flip the sin and cos in the positional embeddings when preparing timestep embeddings.
    downscale_freq_shift (`int`, default to `0`):
        The frequency shift to use when downscaling the timestep embeddings.
    timestep_activation_fn (`str`, default to `silu`):
        The activation function to use for the timestep embeddings.
Tr   )patch_embeddingembed_tokensnormrC   rB   r   r   r   r   r   
num_layerspad_token_id
vocab_sizer   r   	rope_baser   rF   time_step_dimflip_sin_to_cosdownscale_freq_shifttimestep_activation_fnc                 R  > [         TU ]  5         Xl        Xl        [	        UUUUS9U l        [        UUU5      U l        [        UUU5      U l	        [        UUU5      U l
        [        R                  " XU	5      U l        [        X5-  UUUUS9U l        [        R                   " [#        U5       Vs/ s H  n[%        X5XgU5      PM     sn5      U l        [)        X4S9U l        [-        USSSS9U l        [        R0                  " X2U-  U R                  -  SS	9U l        SU l        g s  snf )
N)rB   rC   rD   rF   )r   r   r   r   r   Fgư>rX   )norm_elementwise_affinenorm_eps	chunk_dimTr   )r   r   rC   out_channelsr@   r   r   	time_projr   
time_token
t_embedderr   	Embeddingr   r}   rope
ModuleListranger   layersr   r   r   norm_outr    proj_outgradient_checkpointing)r%   rC   rB   r   r   r   r   r   r   r   r   r   r   r   r   rF   r   r   r   r   r   r&   s                        r'   r   "OmniGenTransformer2DModel.__init__I  s,   . 	&'0!#!1	 
 #=/CWX+M;H^_+M;H^_LL,O2.$;-M%
	 mm z**A [?Rgst*
 K:	$[%Z^jkl		+J/FIZIZ/Zaef&+#s   0D$	input_idsinput_img_latentsinput_image_sizesr+   c                 H   Uc  g U Vs/ s H  oDR                  U R                  5      PM     nnU R                  U5      nSnU R                  USS9nUR	                  5        H7  nX8    H,  u  pXv   R                  UR                  5      XXX24'   US-  nM.     M9     U$ s  snf )Nr   Trc   rX   )rq   r   r   r   keys)r%   r  r  r  xcondition_tokensinput_img_inxinput_image_tokensb_inx	start_inxend_inxs              r'   _get_multimodal_embeddings4OmniGenTransformer2DModel._get_multimodal_embeddings  s     7HI7H!TT$**-7HI,,Y7!112CTX1Y&++-E&7&>"	=O=^=a=a$**> 	(9!9: " '? .   Js   $Br*   timestepr   r   return_dictc	                    UR                   u  ppU R                  R                  nX-  X-  pU R                  USS9nUR	                  S5      nU R                  U5      R                  U5      nU R                  U5      R                  S5      nU R                  U5      nU R                  X4U5      nUb  [        R                  " UUU/SS9nO[        R                  " UU/SS9nUR	                  S5      nUR                  SU5      R                  5       nUbh  UR                  5       S:X  aT  UR                   n[        R"                  " U5      R$                  nSU-
  U-  nUR                  S5      R                  U5      nU R'                  X5      nU R(                   HJ  n[        R*                  " 5       (       a&  U R,                  (       a  U R/                  UXU5      nMC  U" XUS9nML     U R1                  U5      nUS S 2U* S 24   nU R3                  UUS9nU R5                  U5      nUR7                  XXUS5      nUR9                  S	S
SSSS5      R;                  SS
5      R;                  SS5      nU(       d  U4$ [=        US9$ )NFr  rX   r.   r-   r   )r   r   )tembr      r   rx   )sample)r[   configrB   r   r   r   r   r   rT   r  r  r:   rp   r   longr/   r   finfominr  r  is_grad_enabledr	  _gradient_checkpointing_funcr   r  r  rZ   permutere   r   )r%   r*   r  r  r  r  r   r   r  r   num_channelsr\   r]   ppost_patch_heightpost_patch_widthnum_tokens_for_output_imagetimestep_projr   r  r  
seq_lengthr   	min_dtyper   blockoutputs                              r'   r3   !OmniGenTransformer2DModel.forward  su    3@2E2E/
&KK"".4k5:+ ,,]5,Q&3&8&8&;#x088G__]3==a@
}-::9Yjk'!II'7]&SYZ[M!IIz=&AqIM"''*
#((Z8==? %.*<*<*>!*C!''EE*..I.0I=N+55a8@@ON  99]A [[E$$&&4+F+F $ A A=:J! !&meu v ! 		-0%a*E)E)F&FGm$?m4%--jM]bcegh&&q!Q1a8@@AFNNqRST9'v66r)   )r   r	  rC   r  r   r  r   r   r  r  r  r   r   )rx   r   i   gh㈵>    r2  i    r2  i }  i@}  r   r   r   Nry      Tr   silu)T)r5   r6   r7   r8   r    _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr
   r9   rS   r   r{   r   r   r:   r;   r   r   r  r   FloatTensorr   r   r3   r<   r=   r>   s   @r'   r   r     s   &P (,$'('R$ "#%#%!%!'-04!"%  $$%&,)9,9, 9, 	9,
 9, !9, !9, 9, 9, 9, 9, "%9, +.9, 9, 9,   !9," #9,$ %9,& "'9,( !$)9, 9,v  :>u||:L ae 	%,,	 8 !=7||=7 UE$5$556=7 <<	=7
  -=7  T#Y/=7 =7 ll=7 =7 
'u||)<<	==7 =7r)   r   ))r   typingr   r   r   r   r   r:   torch.nnr   torch.nn.functional
functionalr   configuration_utilsr	   r
   utilsr   attention_processorr   r   r   r   r   modeling_outputsr   modeling_utilsr   normalizationr   r   
get_loggerr5   loggerModuler   r@   r}   r   r   r   r   r)   r'   <module>rF     s     5 5     B  + N N 7 ' 1 
		H	%) )U		 Up0RYY 0f/ /d+299 +\y7
K y7r)   