
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import JointTransformerBlock
from ..attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
from ..embeddings import CombinedTimestepTextProjEmbeddings, PatchEmbed
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
from ..transformers.transformer_sd3 import SD3SingleTransformerBlock
from .controlnet import BaseOutput, zero_module


logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@dataclass
class SD3ControlNetOutput(BaseOutput):
    controlnet_block_samples: Tuple[torch.Tensor]


class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    r"""
ControlNet model for [Stable Diffusion 3](https://huggingface.co/papers/2403.03206).

Parameters:
    sample_size (`int`, defaults to `128`):
        The width/height of the latents. This is fixed during training since it is used to learn a number of
        position embeddings.
    patch_size (`int`, defaults to `2`):
        Patch size to turn the input data into small patches.
    in_channels (`int`, defaults to `16`):
        The number of latent channels in the input.
    num_layers (`int`, defaults to `18`):
        The number of layers of transformer blocks to use.
    attention_head_dim (`int`, defaults to `64`):
        The number of channels in each head.
    num_attention_heads (`int`, defaults to `18`):
        The number of heads to use for multi-head attention.
    joint_attention_dim (`int`, defaults to `4096`):
        The embedding dimension to use for joint text-image attention.
    caption_projection_dim (`int`, defaults to `1152`):
        The embedding dimension of caption embeddings.
    pooled_projection_dim (`int`, defaults to `2048`):
        The embedding dimension of pooled text projections.
    out_channels (`int`, defaults to `16`):
        The number of latent channels in the output.
    pos_embed_max_size (`int`, defaults to `96`):
        The maximum latent height/width of positional embeddings.
    extra_conditioning_channels (`int`, defaults to `0`):
        The number of extra channels to use for conditioning for patch embedding.
    dual_attention_layers (`Tuple[int, ...]`, defaults to `()`):
        The indices of the transformer blocks that use dual-stream attention.
    qk_norm (`str`, *optional*, defaults to `None`):
        The normalization to use for query and key in the attention layer. If `None`, no normalization is used.
    pos_embed_type (`str`, defaults to `"sincos"`):
        The type of positional embedding to use. Choose between `"sincos"` and `None`.
    use_pos_embed (`bool`, defaults to `True`):
        Whether to use positional embeddings.
    force_zeros_for_pooled_projection (`bool`, defaults to `True`):
        Whether to force zeros for pooled projection embeddings. This is handled in the pipelines by reading the
        config value of the ControlNet model.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        sample_size: int = 128,
        patch_size: int = 2,
        in_channels: int = 16,
        num_layers: int = 18,
        attention_head_dim: int = 64,
        num_attention_heads: int = 18,
        joint_attention_dim: Optional[int] = 4096,
        caption_projection_dim: int = 1152,
        pooled_projection_dim: int = 2048,
        out_channels: int = 16,
        pos_embed_max_size: int = 96,
        extra_conditioning_channels: int = 0,
        dual_attention_layers: Tuple[int, ...] = (),
        qk_norm: Optional[str] = None,
        pos_embed_type: Optional[str] = "sincos",
        use_pos_embed: bool = True,
        force_zeros_for_pooled_projection: bool = True,
    ):
        super().__init__()
        default_out_channels = in_channels
        self.out_channels = out_channels if out_channels is not None else default_out_channels
        self.inner_dim = num_attention_heads * attention_head_dim

        if use_pos_embed:
            self.pos_embed = PatchEmbed(
                height=sample_size,
                width=sample_size,
                patch_size=patch_size,
                in_channels=in_channels,
                embed_dim=self.inner_dim,
                pos_embed_max_size=pos_embed_max_size,
                pos_embed_type=pos_embed_type,
            )
        else:
            self.pos_embed = None
        self.time_text_embed = CombinedTimestepTextProjEmbeddings(
            embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
        )
        if joint_attention_dim is not None:
            self.context_embedder = nn.Linear(joint_attention_dim, caption_projection_dim)
            self.transformer_blocks = nn.ModuleList(
                [
                    JointTransformerBlock(
                        dim=self.inner_dim,
                        num_attention_heads=num_attention_heads,
                        attention_head_dim=attention_head_dim,
                        context_pre_only=False,
                        qk_norm=qk_norm,
                        use_dual_attention=True if i in dual_attention_layers else False,
                    )
                    for i in range(num_layers)
                ]
            )
        else:
            self.context_embedder = None
            self.transformer_blocks = nn.ModuleList(
                [
                    SD3SingleTransformerBlock(
                        dim=self.inner_dim,
                        num_attention_heads=num_attention_heads,
                        attention_head_dim=attention_head_dim,
                    )
                    for _ in range(num_layers)
                ]
            )

        # One zero-initialized linear projection per transformer block.
        self.controlnet_blocks = nn.ModuleList([])
        for _ in range(len(self.transformer_blocks)):
            controlnet_block = nn.Linear(self.inner_dim, self.inner_dim)
            controlnet_block = zero_module(controlnet_block)
            self.controlnet_blocks.append(controlnet_block)

        # Patch embedding for the conditioning image; zero-initialized so the untrained
        # ControlNet leaves the base hidden states unchanged.
        pos_embed_input = PatchEmbed(
            height=sample_size,
            width=sample_size,
            patch_size=patch_size,
            in_channels=in_channels + extra_conditioning_channels,
            embed_dim=self.inner_dim,
            pos_embed_type=None,
        )
        self.pos_embed_input = zero_module(pos_embed_input)

        self.gradient_checkpointing = False
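    # Note on initialization: `zero_module` (imported above from the sibling
    # `controlnet` module) zeroes all parameters of the wrapped layer, so a freshly
    # created ControlNet contributes nothing to the base transformer until trained.
    # Conceptually it is just:
    #
    #     def zero_module(module):
    #         for p in module.parameters():
    #             nn.init.zeros_(p)
    #         return module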
    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
        """
        Sets the transformer blocks to use [feed forward
        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).

        Parameters:
            chunk_size (`int`, *optional*):
                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
                over each tensor of dim=`dim`.
            dim (`int`, *optional*, defaults to `0`):
                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
                or dim=1 (sequence length).
        """
        if dim not in [0, 1]:
            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")

        # By default chunk size is 1
        chunk_size = chunk_size or 1

        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
            if hasattr(module, "set_chunk_feed_forward"):
                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)

            for child in module.children():
                fn_recursive_feed_forward(child, chunk_size, dim)

        for module in self.children():
            fn_recursive_feed_forward(module, chunk_size, dim)
    @property
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name.
        """
        # set recursively
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedJointAttnProcessor2_0())

    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        if self.original_attn_processors is not None:
            self.set_attn_processor(self.original_attn_processors)

    def _get_pos_embed_from_transformer(self, transformer):
        pos_embed = PatchEmbed(
            height=transformer.config.sample_size,
            width=transformer.config.sample_size,
            patch_size=transformer.config.patch_size,
            in_channels=transformer.config.in_channels,
            embed_dim=transformer.inner_dim,
            pos_embed_max_size=transformer.config.pos_embed_max_size,
        )
        pos_embed.load_state_dict(transformer.pos_embed.state_dict(), strict=True)
        return pos_embed

    @classmethod
    def from_transformer(
        cls, transformer, num_layers=12, num_extra_conditioning_channels=1, load_weights_from_transformer=True
    ):
        config = transformer.config
        config["num_layers"] = num_layers or config.num_layers
        config["extra_conditioning_channels"] = num_extra_conditioning_channels
        controlnet = cls.from_config(config)

        if load_weights_from_transformer:
            controlnet.pos_embed.load_state_dict(transformer.pos_embed.state_dict())
            controlnet.time_text_embed.load_state_dict(transformer.time_text_embed.state_dict())
            controlnet.context_embedder.load_state_dict(transformer.context_embedder.state_dict())
            controlnet.transformer_blocks.load_state_dict(transformer.transformer_blocks.state_dict(), strict=False)

            controlnet.pos_embed_input = zero_module(controlnet.pos_embed_input)

        return controlnet

    def forward(
        self,
        hidden_states: torch.Tensor,
        controlnet_cond: torch.Tensor,
        conditioning_scale: float = 1.0,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
        """
The [`SD3Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.Tensor` of shape `(batch size, channel, height, width)`):
        Input `hidden_states`.
    controlnet_cond (`torch.Tensor`):
        The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
    conditioning_scale (`float`, defaults to `1.0`):
        The scale factor for ControlNet outputs.
    encoder_hidden_states (`torch.Tensor` of shape `(batch size, sequence_len, embed_dims)`):
        Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
    pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
        from the embeddings of input conditions.
    timestep ( `torch.LongTensor`):
        Used to indicate denoising step.
    joint_attention_kwargs (`dict`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
        tuple.

Returns:
    If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
    `tuple` where the first element is the sample tensor.
Nscale      ?z\Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective.   z/hidden_states must be 4D when pos_embed is usedr
   z3hidden_states must be 3D when pos_embed is not usedzDencoder_hidden_states must be provided when context_embedder is usedzNencoder_hidden_states should not be provided when context_embedder is not usedr$   )r   r   temb)r#   )copyr   r   r   getloggerwarningrN   ndimrm   rR   rO   rY   rU   r)   is_grad_enabledrZ   _gradient_checkpointing_funcziprV   r   r!   )r[   r   r   r   r   r   r   r   r   
lora_scaler   block_res_samplesblockcontrolnet_block_res_samplesblock_res_sampler_   samples                    r.   forwardSD3ControlNetModel.forwardT  s~   N "-%;%@%@%B"/33GSAJJd/%16L6P6PQXZ^6_6kr >>%-*<*<*ANOO ^^#(:(:a(?RSS  ,1F1Ncdd""*/D/Pmnn>>% NN=9M##HA  ,$($9$9:O$P! &(<(<_(MM,,E$$&&4+F+F((4;?;\;\%-	<8)= %)$E$Ee\`$aM ((4;@&3gk<8)=
 %*-$>M 14D D- -0 (*$256GI_I_2`./0@A+GJ]+]( 3a
 So'oRn1C(CRn$'o1022"<XYY (ps   I)
rR   rV   rZ   rM   r   r;   rN   rY   rO   rU   )   r         @   r   i   i  i   r   `   r   r$   NsincosTT)Nr   )   r   T)r   NNNNT)"r%   r&   r'   r(   __doc__ _supports_gradient_checkpointingr   ro   r   r   r}   boolrL   rp   propertyr   r   r~   r	   r   r   r   r   classmethodr   r)   r*   float
LongTensorr   r   r   r,   __classcell__r`   s   @r.   r0   r0   *   ss   (T (,$ "$#%#'&*%)"$+,13!%(0"26%W,W, W, 	W,
 W,  W, !W, !W, !$W,  #W, W,  W, &)W,  %S#XW, #W,  !!W," #W,$ ,0%W, W,t?(3- ?S ?Y] ?: c+=&=!>  0 AE2Dd3PbKbFc2c,d  AF>4C
 jn , %(.2+/%);? xZ||xZ xZ "	xZ
  %||xZ "LLxZ ""xZ !)c3h 8xZ xZ 
u||55	6xZ xZr-   r0   c                      ^  \ rS rSrSrU 4S jr    SS\R                  S\\R                     S\\
   S\R                  S\R                  S	\R                  S
\\\\4      S\S\\\4   4S jjrSrU =r$ )SD3MultiControlNetModeli  a  
class SD3MultiControlNetModel(ModelMixin):
    r"""
    `SD3ControlNetModel` wrapper class for Multi-SD3ControlNet

    This module is a wrapper for multiple instances of the `SD3ControlNetModel`. The `forward()` API is designed to be
    compatible with `SD3ControlNetModel`.

    Args:
        controlnets (`List[SD3ControlNetModel]`):
            Provides additional conditioning to the unet during the denoising process. You must set multiple
            `SD3ControlNetModel` as a list.
    """

    def __init__(self, controlnets):
        super().__init__()
        self.nets = nn.ModuleList(controlnets)

    def forward(
        self,
        hidden_states: torch.Tensor,
        controlnet_cond: List[torch.tensor],
        conditioning_scale: List[float],
        pooled_projections: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[SD3ControlNetOutput, Tuple]:
        for i, (image, scale, controlnet) in enumerate(zip(controlnet_cond, conditioning_scale, self.nets)):
            block_samples = controlnet(
                hidden_states=hidden_states,
                timestep=timestep,
                encoder_hidden_states=encoder_hidden_states,
                pooled_projections=pooled_projections,
                controlnet_cond=image,
                conditioning_scale=scale,
                joint_attention_kwargs=joint_attention_kwargs,
                return_dict=return_dict,
            )

            # merge the per-controlnet samples by summation
            if i == 0:
                control_block_samples = block_samples
            else:
                control_block_samples = [
                    control_block_sample + block_sample
                    for control_block_sample, block_sample in zip(control_block_samples[0], block_samples[0])
                ]
                control_block_samples = (tuple(control_block_samples),)

        return control_block_samples
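

if __name__ == "__main__":
    # Minimal smoke-test sketch. The tiny hyperparameters below are illustrative only
    # (they are not a trained SD3 configuration); they just keep a random forward pass
    # cheap. Because the controlnet blocks are zero-initialized, every returned sample
    # of a freshly constructed model is all zeros.
    model = SD3ControlNetModel(
        sample_size=32,
        num_layers=2,
        attention_head_dim=8,
        num_attention_heads=4,
        joint_attention_dim=64,
        caption_projection_dim=32,  # must equal num_attention_heads * attention_head_dim
        pooled_projection_dim=64,
    )
    out = model(
        hidden_states=torch.randn(1, 16, 32, 32),
        controlnet_cond=torch.randn(1, 16, 32, 32),
        conditioning_scale=1.0,
        encoder_hidden_states=torch.randn(1, 77, 64),
        pooled_projections=torch.randn(1, 64),
        timestep=torch.tensor([1]),
    )
    print([tuple(s.shape) for s in out.controlnet_block_samples])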