
    +hn                         S SK JrJrJr  S SKrS SKJs  Jr  S SKJr  SSK	J
r
Jr  SSKJrJr  SSKJr  SS	KJrJrJr  SS
KJr  SSKJr  SSKJr  \R6                  " \5      r " S S\5      r " S S\\
5      rg)    )AnyDictOptionalN)nn   )LegacyConfigMixinregister_to_config)	deprecatelogging   )BasicTransformerBlock)ImagePositionalEmbeddings
PatchEmbedPixArtAlphaTextProjection)Transformer2DModelOutput)LegacyModelMixin)AdaLayerNormSinglec                   (   ^  \ rS rSrU 4S jrSrU =r$ )r       c                 D   > Sn[        SSU5        [        TU ]  " U0 UD6  g )NzImporting `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead.r   1.0.0)r
   super__init__)selfargskwargsdeprecation_message	__class__s       f/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/transformers/transformer_2d.pyr   !Transformer2DModelOutput.__init__!   s,     F,g7JK$)&)     )__name__
__module____qualname____firstlineno__r   __static_attributes____classcell__r   s   @r   r   r       s    * *r!   r   c            5       L  ^  \ rS rSrSrSrS/rSS/r\                         S4S\	S\	S	\
\	   S
\
\	   S\	S\S\	S\
\	   S\S\
\	   S\
\	   S\
\	   S\S\
\	   S\S\S\S\S\S\S\S\S\	S\S\
\   42U 4S  jjj5       rS! rS" rS# r        S5S$\R&                  S%\
\R&                     S&\
\R(                     S'\\\R&                  4   S(\
\R(                     S)\\\4   S*\
\R&                     S+\
\R&                     S,\4S- jjrS. rS/ rS0 rS1 r S6S2 jrS3rU =r$ )7Transformer2DModel'   a  
A 2D Transformer model for image-like data.

Parameters:
    num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
    attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
    in_channels (`int`, *optional*):
        The number of channels in the input and output (specify if the input is **continuous**).
    num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
    dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
    cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
    sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
        This is fixed during training since it is used to learn a number of position embeddings.
    num_vector_embeds (`int`, *optional*):
        The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
        Includes the class for the masked latent pixel.
    activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
    num_embeds_ada_norm ( `int`, *optional*):
        The number of diffusion steps used during training. Pass if at least one of the norm_layers is
        `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
        added to the hidden states.

        During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`.
    attention_bias (`bool`, *optional*):
        Configure if the `TransformerBlocks` attention should contain a bias parameter.
Tr   latent_image_embeddingnormnum_attention_headsattention_head_dimin_channelsout_channels
num_layersdropoutnorm_num_groupscross_attention_dimattention_biassample_sizenum_vector_embeds
patch_sizeactivation_fnnum_embeds_ada_normuse_linear_projectiononly_cross_attentiondouble_self_attentionupcast_attention	norm_typenorm_elementwise_affinenorm_epsattention_typecaption_channelsinterpolation_scaleuse_additional_conditionsc           	      d  > [         TU ]  5         Ub-  US;  a  [        SU S35      eUS;   a  Uc  [        SU S35      eUS L=(       a    US L U l        US LU l        US L=(       a    US LU l        U R                  (       a#  U R
                  (       a  [        SU SU S	35      eU R
                  (       a#  U R                  (       a  [        S
U SU S35      eU R                  (       d7  U R
                  (       d&  U R                  (       d  [        SU SU SU S35      eUS:X  a!  Ub  SU R                   S3n[        SSUSS9  SnXl	        UU l
        UU l        Xl        X l        U R                  R                  U R                  R                  -  U l        X0l        Uc  UOUU l        SU l        Uc  US:X  a	  U
S:X  a  SnOSnUU l        U R                  (       a  U R)                  US9  g U R
                  (       a  U R+                  US9  g U R                  (       a  U R-                  US9  g g )N)ada_normada_norm_zeroada_norm_singlezRForward pass is not implemented when `patch_size` is not None and `norm_type` is 'z'.)rI   rJ   z0When using a `patch_size` and this `norm_type` (z(), `num_embeds_ada_norm` cannot be None.z"Cannot define both `in_channels`: z and `num_vector_embeds`: zE. Make sure that either `in_channels` or `num_vector_embeds` is None.z(Cannot define both `num_vector_embeds`: z and `patch_size`: zE. Make sure that either `num_vector_embeds` or `num_patches` is None.zHas to define `in_channels`: z, `num_vector_embeds`: z, or patch_size: zQ. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None.
layer_normz&The configuration file of this model: a   is outdated. `norm_type` is either not set or incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config. Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for the `transformer/config.json` fileznorm_type!=num_embeds_ada_normr   F)standard_warnrI   rK      T)rA   )r   r   NotImplementedError
ValueErroris_input_continuousis_input_vectorizedis_input_patchesr   r
   r=   rF   rE   r/   r0   config	inner_dimr1   r2   gradient_checkpointingrG   _init_continuous_input_init_vectorized_inputs_init_patched_inputs)r   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r   r   s                              r   r   Transformer2DModel.__init__G   s   : 	 ! PP)hirhssuv  ;;@S@[ FykQyz  %0t$;#U*PTBT #4D#@  +4 7 RJd<R##(@(@4[MA[\m[n oR R  %%$*?*?:;L:MM`ak`l mR R  ))$2J2JSWShSh/}<STeSf g<pr 
 $)<)H88H In n   6ATdij"I &;"#6  0#6 "4884;;;Y;YY&+7+?K\&+#$,--+2D,0),1))B& ##'')'<%%((9(=""%%	%: #r!   c                    [         R                  R                  U R                  R                  U R
                  SSS9U l        U R                  (       a:  [         R                  R                  U R
                  U R                  5      U l
        O:[         R                  R                  U R
                  U R                  SSSS9U l
        [        R                  " [        U R                  R                  5       Vs/ s GH)  n[        U R                  U R                  R                   U R                  R"                  U R                  R$                  U R                  R&                  U R                  R(                  U R                  R*                  U R                  R,                  U R                  R.                  U R                  R0                  U R                  R2                  UU R                  R4                  U R                  R6                  U R                  R8                  S9PGM,     sn5      U l        U R                  (       a:  [         R                  R                  U R                  U R<                  5      U l        g [         R                  R                  U R                  U R<                  SSSS9U l        g s  snf )Nư>T)
num_groupsnum_channelsepsaffine   r   )kernel_sizestridepaddingr4   r6   r;   r<   r7   r>   r?   r@   rA   rB   rC   rD   ) torchr   	GroupNormrT   r5   r1   r.   r=   LinearrU   proj_inConv2d
ModuleListranger3   r   r/   r0   r4   r6   r;   r<   r7   r>   r?   r@   rB   rC   rD   transformer_blocksr2   proj_outr   rA   _s      r   rW   )Transformer2DModel._init_continuous_input   s   HH&&{{22AQAQW[dh ' 
	 %% 88??4+;+;T^^LDL 88??4+;+;T^^YZcdno?pDL"$--& t{{556%$ 7A# &NNKK33KK22 KK//(,(G(G"&++";";(,(G(G#';;#=#=)-)I)I*.++*K*K%)[[%A%A',0KK,O,O![[11#';;#=#=" 7%#
. %%!HHOODNND<M<MNDM!HHOODNND<M<M[\efpqOrDM3s   :D1J=c                 T   U R                   R                  c   S5       eU R                   R                  c   S5       eU R                   R                  U l        U R                   R                  U l        U R                  U R                  -  U l        [        U R                   R                  U R                  U R                  U R                  S9U l        [        R                  " [        U R                   R                  5       Vs/ s GH)  n[        U R                  U R                   R                  U R                   R                  U R                   R                   U R                   R"                  U R                   R$                  U R                   R&                  U R                   R(                  U R                   R*                  U R                   R,                  U R                   R.                  UU R                   R0                  U R                   R2                  U R                   R4                  S9PGM,     sn5      U l        [        R8                  " U R                  5      U l        [        R<                  " U R                  U R                   R                  S-
  5      U l        g s  snf )Nz?Transformer2DModel over discrete input must provide sample_sizez=Transformer2DModel over discrete input must provide num_embed)	num_embed	embed_dimheightwidthre   ra   ) rT   r8   r9   ru   rv   num_latent_pixelsr   rU   r-   r   rk   rl   r3   r   r/   r0   r4   r6   r;   r<   r7   r>   r?   r@   rB   rC   rD   rm   	LayerNormnorm_outrh   outro   s      r   rX   *Transformer2DModel._init_vectorized_inputs   s   {{&&2u4uu2{{,,8 	
K	
8 kk--[[,,
!%tzz!9&?kk33t~~VZVaVaimisis'
# #%--& t{{556%$ 7A# &NNKK33KK22 KK//(,(G(G"&++";";(,(G(G#';;#=#=)-)I)I*.++*K*K%)[[%A%A',0KK,O,O![[11#';;#=#=" 7%#
. T^^499T^^T[[-J-JQ-NO/s   D1J%c                 	   U R                   R                  c   S5       eU R                   R                  U l        U R                   R                  U l        U R                   R                  U l        U R                   R
                  b  U R                   R
                  O"[        U R                   R                  S-  S5      n[        U R                   R                  U R                   R                  U R                   R                  U R                  U R                  US9U l
        [        R                  " [        U R                   R                  5       Vs/ s GH)  n[        U R                  U R                   R                   U R                   R"                  U R                   R$                  U R                   R&                  U R                   R(                  U R                   R*                  U R                   R,                  U R                   R.                  U R                   R0                  U R                   R2                  UU R                   R4                  U R                   R6                  U R                   R8                  S9PGM,     sn5      U l        U R                   R<                  S:w  a  [        R>                  " U R                  SSS	9U l         [        RB                  " U R                  S
U R                  -  5      U l"        [        RB                  " U R                  U R                   R                  U R                   R                  -  U RF                  -  5      U l$        OU R                   R<                  S:X  a  [        R>                  " U R                  SSS	9U l         [        RJ                  " [L        RN                  " S
U R                  5      U R                  S-  -  5      U l(        [        RB                  " U R                  U R                   R                  U R                   R                  -  U RF                  -  5      U l)        S U l*        U R                   R<                  S:X  a#  [W        U R                  U RX                  S9U l*        S U l-        U R\                  b$  [_        U R\                  U R                  S9U l-        g g s  snf )Nz>Transformer2DModel over patched input must provide sample_size@   ra   )ru   rv   r:   r1   rt   rF   re   rK   Fr\   )elementwise_affiner_   r         ?)rG   )in_featureshidden_size)0rT   r8   ru   rv   r:   rF   maxr   r1   rU   	pos_embedr   rk   rl   r3   r   r/   r0   r4   r6   r;   r<   r7   r>   r?   r@   rB   rC   rD   rm   rA   rx   ry   rh   
proj_out_1r2   
proj_out_2	Parameterrf   randnscale_shift_tablern   adaln_singler   rG   caption_projectionrE   r   )r   rA   rF   rp   s       r   rY   'Transformer2DModel._init_patched_inputs   st   {{&&2t4tt2kk--[[,,
++00 {{..: KK++T[[,,2A6 	
 $;;**++)){{--((nn 3
 #%--& t{{556%$ 7A# &NNKK33KK22 KK//(,(G(G"&++";";(,(G(G#';;#=#=)-)I)I*.++*K*K%)[[%A%A',0KK,O,O![[11#';;#=#=" 7%#
. ;;  $55LLEW[\DM iiDNN8JKDO ii 6 69O9O ORVRcRc cDO [[""&77LLEW[\DM%'\\%++a2PSWSaSacfSf2f%gD"II 6 69O9O ORVRcRc cDM
 !;;  $55 !3$:X:X!D #'  ,&? 11t~~'D# -[s   D1Shidden_statesencoder_hidden_statestimestepadded_cond_kwargsclass_labelscross_attention_kwargsattention_maskencoder_attention_maskreturn_dictc
                    Ub(  UR                  SS5      b  [        R                  S5        UbB  UR                  S:X  a2  SUR	                  UR
                  5      -
  S-  nUR                  S5      nUbB  UR                  S:X  a2  SUR	                  UR
                  5      -
  S-  nUR                  S5      nU R                  (       a%  UR                  u  ppUnU R                  U5      u  pOU R                  (       a  U R                  U5      nO^U R                  (       aM  UR                  S   U R                  -  UR                  S   U R                  -  pU R                  XX45      u  pnnU R                   HT  n[         R"                  " 5       (       a+  U R$                  (       a  U R'                  UUUUUUUU5      nMH  U" UUUUUUUS	9nMV     U R                  (       a  U R)                  UWW
WWWS
9nOHU R                  (       a  U R+                  U5      nO%U R                  (       a  U R-                  UUUWWWS9nU	(       d  W4$ [/        WS9$ )aj	  
The [`Transformer2DModel`] forward method.

Args:
    hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
        Input `hidden_states`.
    encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
        Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
        self-attention.
    timestep ( `torch.LongTensor`, *optional*):
        Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
    class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
        Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
        `AdaLayerZeroNorm`.
    cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
        A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
        `self.processor` in
        [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
    attention_mask ( `torch.Tensor`, *optional*):
        An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
        is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
        negative values to the attention scores corresponding to "discard" tokens.
    encoder_attention_mask ( `torch.Tensor`, *optional*):
        Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:

            * Mask `(batch, sequence_length)` True = keep, False = discard.
            * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.

        If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
        above. This bias will be added to the cross-attention scores.
    return_dict (`bool`, *optional*, defaults to `True`):
        Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
        tuple.

Returns:
    If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
    otherwise a `tuple` where the first element is the sample tensor.
NscalezSPassing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.r   ra   g     )r   r   r   r   r   r   )r   residual
batch_sizeru   rv   rU   )r   r   r   embedded_timestepru   rv   )sample)getloggerwarningndimtodtype	unsqueezerQ   shape_operate_on_continuous_inputsrR   r-   rS   r:   _operate_on_patched_inputsrm   rf   is_grad_enabledrV   _gradient_checkpointing_func!_get_output_for_continuous_inputs!_get_output_for_vectorized_inputs_get_output_for_patched_inputsr   )r   r   r   r   r   r   r   r   r   r   r   rp   ru   rv   r   rU   r   blockoutputs                      r   forwardTransformer2DModel.forwardD  sl   d "-%))'48Dtu %.*=*=*B
  ."3"3M4G4G"HHHTN+55a8N "-2H2M2MQR2R&'*@*C*CMDWDW*X&X\d%d"%;%E%Ea%H" ##+8+>+>(J6$H'+'I'I-'X$M9%% 77FM"")//3tFH[H[\^H_cgcrcrHrEPTPoPohQMM(<M
 ,,E$$&&4+F+F $ A A!")** 	! !&!#1*?+A%+A!-! -0 ##;;+!%# < F %%;;MJF""88+!)"3 9 F 9'v66r!   c                    UR                   u  p#pEU R                  U5      nU R                  (       dJ  U R                  U5      nUR                   S   nUR	                  SSSS5      R                  X$U-  U5      nX4$ UR                   S   nUR	                  SSSS5      R                  X$U-  U5      nU R                  U5      nX4$ )Nra   r   r   r   )r   r.   r=   ri   permutereshape)r   r   batchrp   ru   rv   rU   s          r   r   0Transformer2DModel._operate_on_continuous_inputs  s    "/"5"5&		-0)) LL7M%++A.I)11!Q1=EEeV[^]fgM ''	 &++A.I)11!Q1=EEeV[^]fgM LL7M''r!   c                 Z   UR                   S   nU R                  U5      nS nU R                  b;  U R                  (       a  Uc  [	        S5      eU R                  X4XQR
                  S9u  p6U R                  b1  U R                  U5      nUR                  USUR                   S   5      nXX64$ )Nr   zW`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.)r   hidden_dtyper   )r   r   r   rG   rP   r   r   view)r   r   r   r   r   r   r   s          r   r   -Transformer2DModel._operate_on_patched_inputs  s    "((+
}5 (--2C2K m  +/*;*;
QdQd +< +'H "".$($;$;<Q$R!$9$>$>z2}ObObceOf$g!XPPr!   c                 >   U R                   (       dD  UR                  X4XV5      R                  SSSS5      R                  5       nU R	                  U5      nOCU R	                  U5      nUR                  X4XV5      R                  SSSS5      R                  5       nX-   nU$ )Nr   r   ra   r   )r=   r   r   
contiguousrn   )r   r   r   r   ru   rv   rU   r   s           r   r   4Transformer2DModel._get_output_for_continuous_inputs  s    ))%%j%KSSTUWXZ[]^_jjl  !MM-8M MM-8M%%j%KSSTUWXZ[]^_jjl  )r!   c                     U R                  U5      nU R                  U5      nUR                  SSS5      n[        R                  " UR                  5       SS9R                  5       nU$ )Nr   r   ra   dim)ry   rz   r   Flog_softmaxdoublefloat)r   r   logitsr   s       r   r   4Transformer2DModel._get_output_for_vectorized_inputs  sT    m4-(1a(v}}A6<<>r!   c                    U R                   R                  S:w  a  U R                  S   R                  R	                  X#UR
                  S9nU R                  [        R                  " U5      5      R                  SSS9u  pU R                  U5      SU	S S 2S 4   -   -  US S 2S 4   -   nU R                  U5      nOU R                   R                  S:X  ag  U R                  S    US S 2S 4   -   R                  SSS9u  pU R                  U5      nUSU	-   -  U-   nU R                  U5      nUR                  S5      nU R                  c  [!        UR"                  S   S-  5      =pVUR%                  SXVU R&                  U R&                  U R(                  4S	9n[*        R,                  " S
U5      nUR%                  SU R(                  XPR&                  -  X`R&                  -  4S	9n
U
$ )NrK   r   )r   r   ra   r   r   r   )r   znhwpqc->nchpwq)rT   rA   rm   norm1embr   r   r   siluchunkry   r   r   rn   squeezer   intr   r   r:   r2   rf   einsum)r   r   r   r   r   ru   rv   conditioningshiftr   r   s              r   r   1Transformer2DModel._get_output_for_patched_inputs  s    ;;  $552215;;??]5H5H @ L  ??166,+?@FFqaFPLE MM-8Aag<NORWXY[_X_R``M OOM:M[[""&77 2248;LQPTW;UU\\]^de\fLE MM-8M)QY7%?M MM-8M)11!4M $ !4!4Q!73!>??F%--vdootHYHYZ . 
 %5}E&&t((&??*BEOOD[\ ' 
 r!   )r   r0   rE   r   rV   ru   r1   rU   rF   rQ   rS   rR   r-   r.   ry   r/   rw   rz   r2   r:   r   ri   rn   r   r   r   rm   rG   r=   rv   )   X   NNra   g        r   NFNNNgegluNFFFFrL   Tgh㈵>defaultNNN)NNNNNNNT)NN)r#   r$   r%   r&   __doc__ _supports_gradient_checkpointing_no_split_modules _skip_layerwise_casting_patternsr	   r   r   r   boolstrr   rW   rX   rY   rf   Tensor
LongTensorr   r   r   r   r   r   r   r   r'   r(   r)   s   @r   r+   r+   '   s   6 (,$01(@&'I$ $&"$%)&*!-1$%)+/$($-1&+%*&+!&%(,' $%)485e; e;  e; c]	e;
 sme; e; e; e; &c]e; e; c]e; $C=e; SMe; e; &c]e;   $!e;" ##e;$  $%e;& 'e;( )e;* "&+e;, -e;. /e;0 1e;2 #3e;4 $,D>5e; e;N#sJ&PPFV 9=/3593715159= J7||J7  (5J7 5++,	J7
  U\\ 12J7 u//0J7 !%S#XJ7 !.J7 !) 6J7 J7X(Q( \` r!   r+   )typingr   r   r   rf   torch.nn.functionalr   
functionalr   configuration_utilsr   r	   utilsr
   r   	attentionr   
embeddingsr   r   r   modeling_outputsr   modeling_utilsr   normalizationr   
get_loggerr#   r   r+   r"   r!   r   <module>r      sb    ' &     H ' - Y Y 7 - . 
		H	%*7 *@)+< @r!   