
    +h K              	       V   S SK JrJrJrJrJrJr  S SKrS SKJ	r	  SSK
JrJr  SSKJrJrJr  SSKJrJrJrJr  SSKJr  SS	KJrJr  SS
KJrJrJrJr  SSK J!r!J"r"  SSK#J$r$  SSK%J&r&  SSK'J(r(J)r)  \RT                  " \+5      r,\ " S S\	RZ                  5      5       r. " S S\&\\\\5      r/g)    )AnyDictListOptionalTupleUnionN   )ConfigMixinregister_to_config)FromOriginalModelMixinPeftAdapterMixinSD3Transformer2DLoadersMixin)USE_PEFT_BACKENDloggingscale_lora_layersunscale_lora_layers)maybe_allow_in_graph   )FeedForwardJointTransformerBlock)	AttentionAttentionProcessorFusedJointAttnProcessor2_0JointAttnProcessor2_0)"CombinedTimestepTextProjEmbeddings
PatchEmbed)Transformer2DModelOutput)
ModelMixin)AdaLayerNormContinuousAdaLayerNormZeroc                   r   ^  \ rS rSrS\S\S\4U 4S jjrS\R                  S\R                  4S jrS	r	U =r
$ )
SD3SingleTransformerBlock'   dimnum_attention_headsattention_head_dimc           
         > [         TU ]  5         [        U5      U l        [	        UUUUS[        5       SS9U l        [        R                  " USSS9U l	        [        XSS9U l        g )NTư>)	query_dimdim_headheadsout_dimbias	processorepsFelementwise_affiner/   zgelu-approximate)r$   dim_outactivation_fn)super__init__r    norm1r   r   attnnn	LayerNormnorm2r   ff)selfr$   r%   r&   	__class__s       g/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_sd3.pyr5   "SD3SingleTransformerBlock.__init__)   se     	%c*
'%+-
	 \\#%TJ
#BTU    hidden_statestembc                 B   U R                  XS9u  p4pVnU R                  US S9nUR                  S5      U-  nX-   nU R                  U5      nUSUR                  S5      -   -  UR                  S5      -   nU R	                  U5      n	UR                  S5      U	-  n	X-   nU$ )N)emb)rA   encoder_hidden_states   )r6   r7   	unsqueezer:   r;   )
r<   rA   rB   norm_hidden_statesgate_msa	shift_mlp	scale_mlpgate_mlpattn_output	ff_outputs
             r>   forward!SD3SingleTransformerBlock.forward?   s    GKzzR_zGjDiHii.@X\i]((+k9%3 "ZZ6/1y7J7J17M3MNQZQdQdefQggGG./	&&q)I5	%1r@   )r7   r;   r6   r:   )__name__
__module____qualname____firstlineno__intr5   torchTensorrO   __static_attributes____classcell__r=   s   @r>   r"   r"   '   sH    VV !V  	V,U\\   r@   r"   c                     ^  \ rS rSrSrSrS/rSS/r\             S+S\	S	\	S
\	S\	S\	S\	S\	S\	S\	S\	S\	S\
\	S4   S\\   4U 4S jjj5       rS,S\\	   S\	SS4S jjrS r\S\\\4   4S j5       rS\\\\\4   4   4S jrS rS  r       S-S!\R2                  S"\R2                  S#\R2                  S$\R4                  S%\S&\\\\4      S'\S(\\\	      S\\R2                  \4   4S) jjrS*r U =r!$ ).SD3Transformer2DModelP   ax  
The Transformer model introduced in [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

Parameters:
    sample_size (`int`, defaults to `128`):
        The width/height of the latents. This is fixed during training since it is used to learn a number of
        position embeddings.
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `16`):
        The number of latent channels in the input.
    num_layers (`int`, defaults to `18`):
        The number of layers of transformer blocks to use.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `18`):
        The number of heads to use for multi-head attention.
    joint_attention_dim (`int`, defaults to `4096`):
        The embedding dimension to use for joint text-image attention.
    caption_projection_dim (`int`, defaults to `1152`):
        The embedding dimension of caption embeddings.
    pooled_projection_dim (`int`, defaults to `2048`):
        The embedding dimension of pooled text projections.
    out_channels (`int`, defaults to `16`):
        The number of latent channels in the output.
    pos_embed_max_size (`int`, defaults to `96`):
        The maximum latent height/width of positional embeddings.
    dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
        The number of dual-stream transformer blocks to use.
    qk_norm (`str`, *optional*, defaults to `None`):
        The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
Tr   	pos_embednormNsample_size
patch_sizein_channels
num_layersr&   r%   joint_attention_dimcaption_projection_dimpooled_projection_dimout_channelspos_embed_max_sizedual_attention_layers.qk_normc                 f  > [         TU ]  5         U
b  U
OUU l        Xe-  U l        [	        UUUUU R                  US9U l        [        U R                  U	S9U l        [        R                  " Xx5      U l
        [        R                  " [        U5       Vs/ s H'  n[        U R                  UUXS-
  :H  UX;   a  SOSS9PM)     sn5      U l        [        U R                  U R                  SSS9U l        [        R                  " U R                  X"-  U R                  -  SS	9U l        SU l        g s  snf )
N)heightwidthra   rb   	embed_dimrh   )embedding_dimrf   rF   TF)r$   r%   r&   context_pre_onlyrj   use_dual_attentionr(   r0   )r-   )r4   r5   rg   	inner_dimr   r^   r   time_text_embedr8   Linearcontext_embedder
ModuleListranger   transformer_blocksr   norm_outproj_outgradient_checkpointing)r<   r`   ra   rb   rc   r&   r%   rd   re   rf   rg   rh   ri   rj   ir=   s                  r>   r5   SD3Transformer2DModel.__init__x   s+   & 	,8,DL+,A#!#nn1
  B..@U 
 !#		*= V"$-- z*
 +A &(;'9%&q.%8#/0/Itu +
#
 /t~~t~~bgmqr		$..*2IDL]L]2]dhi&+#!
s   .D.
chunk_sizer$   returnc                    ^ US;  a  [        SU 35      eU=(       d    SnS[        R                  R                  S[        S[        4U4S jjmU R                  5        H  nT" X1U5        M     g)	a  
Sets the attention processor to use [feed forward
chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

Parameters:
    chunk_size (`int`, *optional*):
        The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
        over each tensor of dim=`dim`.
    dim (`int`, *optional*, defaults to `0`):
        The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
        or dim=1 (sequence length).
)r   rF   z-Make sure to set `dim` to either 0 or 1, not rF   moduler~   r$   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g Nset_chunk_feed_forward)r~   r$   hasattrr   childrenr   r~   r$   childfn_recursive_feed_forwards       r>   r   PSD3Transformer2DModel.enable_forward_chunking.<locals>.fn_recursive_feed_forward   =    v788---M*)%SA +r@   N)
ValueErrorrV   r8   ModulerU   r   )r<   r~   r$   r   r   s       @r>   enable_forward_chunking-SD3Transformer2DModel.enable_forward_chunking   sn     fLSERSS  _1
	Behhoo 	B3 	BUX 	B mmoF%f#> &r@   c                    ^ S[         R                  R                  S[        S[        4U4S jjmU R	                  5        H  nT" US S5        M     g )Nr   r~   r$   c                    > [        U S5      (       a  U R                  XS9  U R                  5        H  nT" X1U5        M     g r   r   r   s       r>   r   QSD3Transformer2DModel.disable_forward_chunking.<locals>.fn_recursive_feed_forward   r   r@   r   )rV   r8   r   rU   r   )r<   r   r   s     @r>   disable_forward_chunking.SD3Transformer2DModel.disable_forward_chunking   sH    	Behhoo 	B3 	BUX 	B mmoF%fdA6 &r@   c                    ^ 0 nS[         S[        R                  R                  S[        [         [
        4   4U4S jjmU R                  5        H  u  p#T" X#U5        M     U$ )z
Returns:
    `dict` of attention processors: A dictionary containing all attention processors used in the model with
    indexed by its weight name.
namer   
processorsc                    > [        US5      (       a  UR                  5       X  S3'   UR                  5        H  u  p4T" U  SU 3XB5        M     U$ )Nget_processor
.processor.)r   r   named_children)r   r   r   sub_namer   fn_recursive_add_processorss        r>   r   JSD3Transformer2DModel.attn_processors.<locals>.fn_recursive_add_processors   sZ    v//282F2F2H
V:./#)#8#8#:+tfAhZ,@%T $; r@   )strrV   r8   r   r   r   r   )r<   r   r   r   r   s       @r>   attn_processors%SD3Transformer2DModel.attn_processors   sb     
	c 	588?? 	X\]`bt]tXu 	 !//1LD'jA 2 r@   r.   c           	      d  ^ [        U R                  R                  5       5      n[        U[        5      (       a-  [        U5      U:w  a  [        S[        U5       SU SU S35      eS[        S[        R                  R                  4U4S jjmU R                  5        H  u  p4T" X4U5        M     g)	a  
Sets the attention processor to use to compute attention.

Parameters:
    processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
        The instantiated processor class or a dictionary of processor classes that will be set as the processor
        for **all** `Attention` layers.

        If `processor` is a dict, the key needs to define the path to the corresponding cross attention
        processor. This is strongly recommended when setting trainable attention processors.

z>A dict of processors was passed, but the number of processors z0 does not match the number of attention layers: z. Please make sure to pass z processor classes.r   r   c                 
  > [        US5      (       aJ  [        U[        5      (       d  UR                  U5        O#UR                  UR	                  U  S35      5        UR                  5        H  u  p4T" U  SU 3XB5        M     g )Nset_processorr   r   )r   
isinstancedictr   popr   )r   r   r.   r   r   fn_recursive_attn_processors        r>   r   MSD3Transformer2DModel.set_attn_processor.<locals>.fn_recursive_attn_processor  ss    v//!)T22((3(($z7J)KL#)#8#8#:+tfAhZ,@%S $;r@   N)lenr   keysr   r   r   r   rV   r8   r   r   )r<   r.   countr   r   r   s        @r>   set_attn_processor(SD3Transformer2DModel.set_attn_processor   s     D((--/0i&&3y>U+BPQTU^Q_P` a005w6QRWQXXkm 
	Tc 	T588?? 	T !//1LD'i@ 2r@   c                    SU l         U R                  R                  5        H3  u  pS[        UR                  R
                  5      ;   d  M*  [        S5      e   U R                  U l         U R                  5        H)  n[        U[        5      (       d  M  UR                  SS9  M+     U R                  [        5       5        g)u  
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
are fused. For cross-attention modules, key and value projection matrices are fused.

<Tip warning={true}>

This API is 🧪 experimental.

</Tip>
NAddedzQ`fuse_qkv_projections()` is not supported for models having added KV projections.T)fuse)original_attn_processorsr   itemsr   r=   rQ   r   modulesr   r   fuse_projectionsr   r   )r<   _attn_processorr   s       r>   fuse_qkv_projections*SD3Transformer2DModel.fuse_qkv_projections  s     )-%!%!5!5!;!;!=A#n66??@@ !tuu "> )-(<(<%llnF&),,''T'2 % 	 : <=r@   c                 V    U R                   b  U R                  U R                   5        gg)um   Disables the fused QKV projection if enabled.

<Tip warning={true}>

This API is 🧪 experimental.

</Tip>

N)r   r   )r<   s    r>   unfuse_qkv_projections,SD3Transformer2DModel.unfuse_qkv_projections0  s)     ((4##D$A$AB 5r@   rA   rE   pooled_projectionstimestepblock_controlnet_hidden_statesjoint_attention_kwargsreturn_dictskip_layersc	                 4   Ub#  UR                  5       nUR                  SS5      n	OSn	[        (       a  [        X	5        O+Ub(  UR	                  SS5      b  [
        R                  S5        UR                  SS u  pU R                  U5      nU R                  XC5      nU R                  U5      nUb9  SU;   a3  UR                  S5      nU R                  X5      u  pUR                  XS9  [        U R                  5       H  u  nnUb  UU;   a  SOS	n[        R                   " 5       (       a0  U R"                  (       a  U(       d  U R%                  UUUUU5      u  p!OU(       d  U" UUUUS
9u  p!Uc  Mt  UR&                  S	L d  M  [)        U R                  5      [)        U5      -  nX[+        UU-  5         -   nM     U R-                  X5      nU R/                  U5      nU R0                  R2                  nU
U-  n
UU-  nUR5                  UR                  S   XUUU R6                  4S9n[        R8                  " SU5      nUR5                  UR                  S   U R6                  U
U-  UU-  4S9n[        (       a  [;        X	5        U(       d  U4$ [=        US9$ )a  
The [`SD3Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
        Input `hidden_states`.
    encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`):
        Embeddings projected from the embeddings of input conditions.
    timestep (`torch.LongTensor`):
        Used to indicate denoising step.
    block_controlnet_hidden_states (`list` of `torch.Tensor`):
        A list of tensors that if specified are added to the residuals of transformer blocks.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.
    skip_layers (`list` of `int`, *optional*):
        A list of layer indices to skip during the forward pass.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
Nscaleg      ?z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.ip_adapter_image_embeds)ip_hidden_statesrB   TF)rA   rE   rB   r   r   )shapeznhwpqc->nchpwq)sample)copyr   r   r   getloggerwarningr   r^   rs   ru   
image_projupdate	enumeraterx   rV   is_grad_enabledr{   _gradient_checkpointing_funcrp   r   rU   ry   rz   configra   reshaperg   einsumr   r   )r<   rA   rE   r   r   r   r   r   r   
lora_scalerl   rm   rB   r   r   ip_tembindex_blockblockis_skipinterval_controlra   outputs                         r>   rO   SD3Transformer2DModel.forward=  s   N "-%;%@%@%B"/33GSAJJd/%16L6P6PQXZ^6_6kr &++BC0}5##HA $ 5 56K L!-2KOe2e&<&@&@AZ&[#(,8O(Z%"));K)Z"+D,C,C"DK)5+:TdZ_G$$&&4+F+Fw7;7X7X!)*84%} 7<"/*?+A	84% .9e>T>TX]>]#&t'>'>#?#FdBe#e  -sS^aqSqOr0s s/ #E2 m:m4 [[++
:%#%-- &&q)6*jRVRcRcd . 
 %5}E&& &&q)4+<+<fz>QSX[eSef ' 
 19'v66r@   )
ru   r{   rr   ry   r   rg   r^   rz   rs   rx   )   r         @   r   i   i  i   r   `    N)Nr   )NNNNNTN)"rQ   rR   rS   rT   __doc__ _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr   rU   r   r   r   r5   r   r   propertyr   r   r   r   r   r   r   rV   rW   
LongTensorr   r   boolr   rO   rX   rY   rZ   s   @r>   r\   r\   P   s(   B (,$01(3V'<$ "$#%#'&*%)"$ !%!4,4, 4, 	4,
 4,  4, !4, !4, !$4,  #4, 4,  4,  %H 
4,  #!4, 4,n?(3- ?S ?Y] ?<	7 c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF>4C  /3+/%)/3;? +/r7||r7  %||r7 "LL	r7
 ""r7 )-r7 !)c3h 8r7 r7 d3i(r7 
u||55	6r7 r7r@   r\   )0typingr   r   r   r   r   r   rV   torch.nnr8   configuration_utilsr
   r   loadersr   r   r   utilsr   r   r   r   utils.torch_utilsr   	attentionr   r   attention_processorr   r   r   r   
embeddingsr   r   modeling_outputsr   modeling_utilsr   normalizationr   r    
get_loggerrQ   r   r   r"   r\   r   r@   r>   <module>r      s    ; :   B ] ] V V 5 :  H 7 ' D 
		H	% %		 % %P_7-/EGc_7r@   