
from typing import Any, Dict, Optional, Tuple

import torch
import torch.nn as nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import PeftAdapterMixin
from ...loaders.single_file_model import FromOriginalModelMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ...utils.torch_utils import maybe_allow_in_graph
from ..attention import FeedForward
from ..attention_processor import MochiAttention, MochiAttnProcessor2_0
from ..cache_utils import CacheMixin
from ..embeddings import MochiCombinedTimestepCaptionEmbedding, PatchEmbed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..normalization import AdaLayerNormContinuous, RMSNorm


logger = logging.get_logger(__name__)


class MochiModulatedRMSNorm(nn.Module):
    def __init__(self, eps: float):
        super().__init__()
        self.eps = eps
        # `dim=0` is fine here: RMSNorm creates no weight when `elementwise_affine=False`.
        self.norm = RMSNorm(0, eps, False)

    def forward(self, hidden_states, scale=None):
        hidden_states_dtype = hidden_states.dtype

        # Normalize in float32 for numerical stability, apply the optional modulation,
        # then restore the original dtype.
        hidden_states = hidden_states.to(torch.float32)
        hidden_states = self.norm(hidden_states)

        if scale is not None:
            hidden_states = hidden_states * scale

        hidden_states = hidden_states.to(hidden_states_dtype)

        return hidden_states
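
# Illustrative sketch (not part of the original module): `MochiModulatedRMSNorm`
# normalizes in float32 and multiplies by a caller-supplied modulation, so the same
# layer can be driven by different conditioning signals. The shapes below are
# assumptions chosen for the demo.
#
#     >>> norm = MochiModulatedRMSNorm(eps=1e-6)
#     >>> x = torch.randn(2, 8, 64, dtype=torch.bfloat16)
#     >>> scale = 1.0 + torch.zeros(2, 1, 64)
#     >>> out = norm(x, scale)
#     >>> out.shape, out.dtype
#     (torch.Size([2, 8, 64]), torch.bfloat16)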
S\S\4U 4S jjjrS\R                  S\R                  S\R                  4S jrS	r	U =r
$ )MochiLayerNormContinuous;   embedding_dimconditioning_embedding_dimc                    > [         TU ]  5         [        R                  " 5       U l        [        R
                  " X!US9U l        [        US9U l        g )N)biasr   )	r"   r#   nnSiLUsiluLinearlinear_1r   r$   )r%   r@   rA   r   rC   r&   s        r'   r#   !MochiLayerNormContinuous.__init__<   s?     	 GGI			"<RVW)c2	r)   xconditioning_embeddingreturnc                 0   UR                   nU R                  U R                  U5      R                  UR                   5      5      nU R	                  USUR                  S5      R                  [        R                  5      -   5      nUR                  U5      $ )N   )r,   rI   rG   r-   r$   	unsqueezer.   r/   )r%   rK   rL   input_dtyper1   s        r'   r3    MochiLayerNormContinuous.forwardJ   sr    
 gg dii(>?BB177KLIIa!eooa033EMMBBDttK  r)   )rI   r$   rG   )h㈵>T)r5   r6   r7   r8   intr#   r.   Tensorr3   r:   r;   r<   s   @r'   r>   r>   ;   sY    
 33 %(3 3!<<! !&! 
	! !r)   r>   c                      ^  \ rS rSrSr SS\S\S\S\SS4
U 4S	 jjjrS
\	R                  S\	R                  S\\	R                  \	R                  \	R                  \	R                  4   4S jrSrU =r$ )MochiRMSNormZeroX   zm
Adaptive RMS Norm used in Mochi.

Parameters:
    embedding_dim (`int`): The size of each embedding vector.
r@   
hidden_dimr   elementwise_affinerM   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        R
                  " X5      U l        [        SUS5      U l        g r!   )	r"   r#   rE   rF   rG   rH   linearr   r$   )r%   r@   rY   r   rZ   r&   s        r'   r#   MochiRMSNormZero.__init__`   s=     	GGI	ii:AsE*	r)   r0   embc                 Z   UR                   nU R                  U R                  U5      5      nUR                  SSS9u  pEpgU R	                  UR                  [        R                  5      5      SUS S 2S 4   R                  [        R                  5      -   -  nUR                  U5      nXXg4$ )N   rO   dim)r,   r\   rG   chunkr$   r-   r.   r/   )r%   r0   r^   r2   	scale_msagate_msa	scale_mlpgate_mlps           r'   r3   MochiRMSNormZero.forwardi   s     ,11kk$))C.)3699QA93F0	Y		-"2"25=="ABa)TUW[T[J\J_J_`e`m`mJnFno%(()<=	;;r)   )r\   r$   rG   )rS   F)r5   r6   r7   r8   __doc__rT   r9   boolr#   r.   rU   r   r3   r:   r;   r<   s   @r'   rW   rW   X   s     bg+ +.1+8=+Z^+	+ +
<"\\
<05
<	u||U\\5<<E	F
< 
<r)   rW   c                   $  ^  \ rS rSrSr    SS\S\S\S\S\S\S	\S
\SS4U 4S jjjr	 SS\
R                  S\
R                  S\
R                  S\
R                  S\\
R                     S\\
R                  \
R                  4   4S jjrSrU =r$ )MochiTransformerBlockv   a  
Transformer block used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

Args:
    dim (`int`):
        The number of channels in the input and output.
    num_attention_heads (`int`):
        The number of heads to use for multi-head attention.
    attention_head_dim (`int`):
        The number of channels in each head.
    qk_norm (`str`, defaults to `"rms_norm"`):
        The normalization layer to use.
    activation_fn (`str`, defaults to `"swiglu"`):
        Activation function to use in feed-forward.
    context_pre_only (`bool`, defaults to `False`):
        Whether or not to process context-related conditions with additional layers.
    eps (`float`, defaults to `1e-6`):
        Epsilon value for normalization layers.
rb   num_attention_headsattention_head_dimpooled_projection_dimqk_normactivation_fncontext_pre_onlyr   rM   Nc	                   > [         T	U ]  5         Xpl        SU-  S-  S-  U l        SU-  S-  S-  U l        [        USU-  USS9U l        U(       d  [        USU-  USS9U l        O[        UUUS9U l        [        UUUSUSUUU[        5       SS9U l        [        US	9U l        U R                  (       d	  [        US	9OS U l        [        U5      U l        U R                  (       d	  [        US	9OS U l        [#        XR                  USS
9U l        S U l        U(       d  [#        UU R                  USS
9U l        [        US	9U l        [        US	9U l        g )Nr`   r   r   F)r   rZ   )r@   rA   r   rS   )	query_dimheadsdim_headrC   added_kv_proj_dimadded_proj_biasout_dimout_context_dimrs   	processorr   rD   )	inner_dimrr   rC   )r"   r#   rs   ff_inner_dimff_context_inner_dimrW   norm1norm1_contextr>   r   r   attn1r   norm2norm2_contextnorm3norm3_contextr   ff
ff_contextnorm4norm4_context)
r%   rb   rn   ro   rp   rq   rr   rs   r   r&   s
            r'   r#   MochiTransformerBlock.__init__   sf    	 0Wq[Q.%&)>%>%Bq$H!%c1s7PUV
!1#q;P7PVYns!tD!93+."D $%'3!1-+-

 +s3
CGCXCX2s;^b*3/
CGCXCX2s;^bc->->mbgh)%33+	DO +s3
2s;r)   r0   encoder_hidden_statestembencoder_attention_maskimage_rotary_embc                    U R                  X5      u  pgpU R                  (       d  U R                  X#5      u  ppOU R                  X#5      n
U R                  UU
UUS9u  pXR	                  U[
        R                  " U5      R                  S5      5      -   nU R                  USUR                  S5      R                  [
        R                  5      -   5      nU R                  U5      nXR                  U[
        R                  " U	5      R                  S5      5      -   nU R                  (       d  X R                  U[
        R                  " W5      R                  S5      5      -   nU R                  USWR                  S5      R                  [
        R                  5      -   5      n
U R                  U
5      nX R!                  U[
        R                  " W5      R                  S5      5      -   nX4$ )N)r0   r   r   attention_maskrO   )r   rs   r   r   r   r.   tanhrP   r   r-   r/   r   r   r   r   r   r   )r%   r0   r   r   r   r   norm_hidden_statesre   rf   rg   norm_encoder_hidden_statesenc_gate_msaenc_scale_mlpenc_gate_mlpattn_hidden_statescontext_attn_hidden_states	ff_outputcontext_ff_outputs                     r'   r3   MochiTransformerBlock.forward   s    =AJJ}<[9i$$TXTfTf%UQ&m\ *.););<Q)X&9=,"<-1	 :D :
6 &

3EuzzRZG[GeGefgGh(ii!ZZI<O<OPQ<R<U<UV[VcVc<d8dfGG./	%

9ejj>R>\>\]^>_(``$$$9<N<N*EJJ|,D,N,Nq,Q= %! *.););%M,C,CA,F,I,I%--,X(X*& !%0J K$9<N<N!5::l#;#E#Ea#H= %! 33r)   )r   rs   r   r   r   r~   r   r   r   r   r   r   r   r   )rms_normswigluFư>r+   )r5   r6   r7   r8   ri   rT   strrj   r9   r#   r.   rU   r   r   r3   r:   r;   r<   s   @r'   rl   rl   v   s    4 "%!&<<<< !<<  	<<
  #<< << << << << 
<< <<H 48)4||)4  %||)4 ll	)4
 !&)4 #5<<0)4 
u||U\\)	*)4 )4r)   rl   c                     ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  4S jr	  SS	\S
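
# Illustrative sketch (not part of the original module): the block keeps separate
# video and caption token streams that only interact inside attention. The tiny
# dimensions below are assumptions for the demo, not a real Mochi configuration.
#
#     >>> block = MochiTransformerBlock(
#     ...     dim=64, num_attention_heads=2, attention_head_dim=32, pooled_projection_dim=48
#     ... )
#     >>> video = torch.randn(1, 20, 64)              # video tokens
#     >>> caption = torch.randn(1, 5, 48)             # caption tokens
#     >>> temb = torch.randn(1, 64)                   # conditioning vector
#     >>> mask = torch.ones(1, 5, dtype=torch.bool)   # all caption tokens valid
#     >>> video, caption = block(video, caption, temb, encoder_attention_mask=mask)
#     >>> video.shape, caption.shape
#     (torch.Size([1, 20, 64]), torch.Size([1, 5, 48]))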
\S\S\
\R                     S\
\R                     S\R                  4S jjrS\R                  S\R                  S\R                  4S jr  SS\R                  S	\S
\S\S\
\R                     S\
\R                     S\\R                  \R                  4   4S jjrSrU =r$ )	MochiRoPE   ae  
RoPE implementation used in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

Args:
    base_height (`int`, defaults to `192`):
        Base height used to compute interpolation scale for rotary positional embeddings.
    base_width (`int`, defaults to `192`):
        Base width used to compute interpolation scale for rotary positional embeddings.
base_height
base_widthrM   Nc                 4   > [         TU ]  5         X-  U l        g r+   )r"   r#   target_area)r%   r   r   r&   s      r'   r#   MochiRoPE.__init__  s    &3r)   c                 P    [         R                  " XUS-   XES9nUS S USS  -   S-  $ )NrO   devicer,   r   )r.   linspace)r%   startstopnumr   r,   edgess          r'   _centersMochiRoPE._centers  s4    uC!GFPcr
U12Y&!++r)   
num_framesheightwidthr   r,   c                 X   U R                   X#-  -  S-  n[        R                  " XUS9nU R                  U* U-  S-  X&-  S-  X$U5      nU R                  U* U-  S-  X6-  S-  X4U5      n	[        R                  " XxU	SS9u  pn[        R
                  " XU/SS9R                  SS5      nU$ )	Ng      ?r   r   ij)indexingr   ra   r   )r   r.   aranger   meshgridstackview)r%   r   r   r   r   r,   r1   thwgrid_tgrid_hgrid_w	positionss                 r'   _get_positionsMochiRoPE._get_positions
  s     !!V^4<LL%@MM6'E/A-v~/A6SXYMM5&5.1,ema.?PUV!&a$!GKK 8bAFFr1M	r)   freqsposc                    [         R                  " UR                  R                  [         R                  5         [         R
                  " SUR                  [         R                  5      UR                  [         R                  5      5      nS S S 5        [         R                  " U5      n[         R                  " U5      nX44$ ! , (       d  f       N== f)Nznd,dhf->nhf)	r.   autocastr   typer/   einsumr-   cossin)r%   r   r   	freqs_cos	freqs_sins        r'   _create_ropeMochiRoPE._create_rope  s    ^^ELL--u}}=LLu}}0EuxxPUP]P]G^_E > IIe$	IIe$	## >=s   AC
Cpos_frequenciesc                 T    U R                  X#XEU5      nU R                  X5      u  pX4$ r+   )r   r   )
r%   r   r   r   r   r   r,   r   rope_cosrope_sins
             r'   r3   MochiRoPE.forward&  s4     !!*eUK!..D!!r)   )r   )   r   )NN)r5   r6   r7   r8   ri   rT   r#   r.   rU   r   r   r   r,   r   r   r   r3   r:   r;   r<   s   @r'   r   r      s8   4C 43 4 4 4
,5<< , *.'+  	
 & $ 
&$%,, $U\\ $ell $ *.'+"" " 	"
 " &" $" 
u||U\\)	*" "r)   r   c                   D  ^  \ rS rSrSrSrS/rSS/r\            SS\	S	\	S
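
# Illustrative sketch (not part of the original module): the learned frequency tensor
# has shape (3, num_heads, head_dim // 2), one row per (t, h, w) coordinate, and the
# module returns a cos/sin pair per flattened video token. Sizes are assumptions.
#
#     >>> rope = MochiRoPE()
#     >>> freqs = torch.randn(3, 2, 16)
#     >>> cos, sin = rope(freqs, num_frames=2, height=4, width=4)
#     >>> cos.shape     # 2 * 4 * 4 = 32 tokens
#     torch.Size([32, 2, 16])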


@maybe_allow_in_graph
class MochiTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
    r"""
    A Transformer model for video-like data introduced in [Mochi](https://huggingface.co/genmo/mochi-1-preview).

    Args:
        patch_size (`int`, defaults to `2`):
            The size of the patches to use in the patch embedding layer.
        num_attention_heads (`int`, defaults to `24`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`, defaults to `128`):
            The number of channels in each head.
        num_layers (`int`, defaults to `48`):
            The number of layers of Transformer blocks to use.
        pooled_projection_dim (`int`, defaults to `1536`):
            The number of channels in the caption (context) stream.
        in_channels (`int`, defaults to `12`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output.
        qk_norm (`str`, defaults to `"rms_norm"`):
            The normalization layer to use.
        text_embed_dim (`int`, defaults to `4096`):
            Input dimension of text embeddings from the text encoder.
        time_embed_dim (`int`, defaults to `256`):
            Output dimension of timestep embeddings.
        activation_fn (`str`, defaults to `"swiglu"`):
            Activation function to use in feed-forward.
        max_sequence_length (`int`, defaults to `256`):
            The maximum sequence length of text embeddings supported.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["MochiTransformerBlock"]
    _skip_layerwise_casting_patterns = ["patch_embed", "norm"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 2,
        num_attention_heads: int = 24,
        attention_head_dim: int = 128,
        num_layers: int = 48,
        pooled_projection_dim: int = 1536,
        in_channels: int = 12,
        out_channels: Optional[int] = None,
        qk_norm: str = "rms_norm",
        text_embed_dim: int = 4096,
        time_embed_dim: int = 256,
        activation_fn: str = "swiglu",
        max_sequence_length: int = 256,
    ) -> None:
        super().__init__()

        inner_dim = num_attention_heads * attention_head_dim
        out_channels = out_channels or in_channels

        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_channels=in_channels,
            embed_dim=inner_dim,
            pos_embed_type=None,
        )

        self.time_embed = MochiCombinedTimestepCaptionEmbedding(
            embedding_dim=inner_dim,
            pooled_projection_dim=pooled_projection_dim,
            text_embed_dim=text_embed_dim,
            time_embed_dim=time_embed_dim,
            num_attention_heads=8,
        )

        # Learned rotary frequencies: one set per (t, h, w) coordinate, per head.
        self.pos_frequencies = nn.Parameter(torch.full((3, num_attention_heads, attention_head_dim // 2), 0.0))
        self.rope = MochiRoPE()

        self.transformer_blocks = nn.ModuleList(
            [
                MochiTransformerBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    pooled_projection_dim=pooled_projection_dim,
                    qk_norm=qk_norm,
                    activation_fn=activation_fn,
                    # Only the final block drops the caption-stream output layers.
                    context_pre_only=i == num_layers - 1,
                )
                for i in range(num_layers)
            ]
        )

        self.norm_out = AdaLayerNormContinuous(
            inner_dim,
            inner_dim,
            elementwise_affine=False,
            eps=1e-6,
            norm_type="layer_norm",
        )
        self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * out_channels)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        timestep: torch.LongTensor,
        encoder_attention_mask: torch.Tensor,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> torch.Tensor:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # Weight the LoRA layers by setting `lora_scale` for each PEFT layer.
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
                )

        batch_size, num_channels, num_frames, height, width = hidden_states.shape
        p = self.config.patch_size

        post_patch_height = height // p
        post_patch_width = width // p

        temb, encoder_hidden_states = self.time_embed(
            timestep,
            encoder_hidden_states,
            encoder_attention_mask,
            hidden_dtype=hidden_states.dtype,
        )

        # Patchify each frame independently, then flatten frames into one token sequence.
        hidden_states = hidden_states.permute(0, 2, 1, 3, 4).flatten(0, 1)
        hidden_states = self.patch_embed(hidden_states)
        hidden_states = hidden_states.unflatten(0, (batch_size, -1)).flatten(1, 2)

        image_rotary_emb = self.rope(
            self.pos_frequencies,
            num_frames,
            post_patch_height,
            post_patch_width,
            device=hidden_states.device,
            dtype=torch.float32,
        )

        for i, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    encoder_attention_mask,
                    image_rotary_emb,
                )
            else:
                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    encoder_attention_mask=encoder_attention_mask,
                    image_rotary_emb=image_rotary_emb,
                )
        hidden_states = self.norm_out(hidden_states, temb)
        hidden_states = self.proj_out(hidden_states)

        # Un-patchify back to (batch, channels, frames, height, width).
        hidden_states = hidden_states.reshape(batch_size, num_frames, post_patch_height, post_patch_width, p, p, -1)
        hidden_states = hidden_states.permute(0, 6, 1, 2, 4, 3, 5)
        output = hidden_states.reshape(batch_size, -1, num_frames, height, width)

        if USE_PEFT_BACKEND:
            # Remove `lora_scale` from each PEFT layer.
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)
        return Transformer2DModelOutput(sample=output)