
    +hO                     p   S SK r S SKrS SKJr  S SKrS SKJr  SS\	4S jjr
\R                  R                  R                  SS4S\	S\	4S jjr " S S	\R                  5      r " S
 S\R                  5      r " S S\R                  5      r " S S\R                  5      r " S S\R                  5      rg)    N   key_chunk_sizec                   ^ ^^^^^^^^ TR                   SS u  nmmTR                   S   m[        TU5      mT [        R                  " T5      -  m [        R
                  " [        R                  SS9U4S j5       mUUUUU UUU4S jn[        R                  R                  U[        R                  " SUT5      S	9u  pxn	[        R                  " U	SS
S9n
[        R                  " X-
  5      nU[        R                  " USS9-  nX-  nUR                  SS9n[        R                  " US5      R                  SS9nX-  $ )zBMulti-head dot product attention with a limited number of queries.NF)prevent_csec                 F  > [         R                  " SXTS9n[         R                  " USSS9n[        R                  R                  U5      n[         R                  " X4-
  5      n[         R                  " SX%TS9n[         R                  " SU5      nXeR                  SS9U4$ )	Nz...qhd,...khd->...qhk)	precisionr   Taxiskeepdimsz...vhf,...qhv->...qhfz...qhk->...qhr   )jnpeinsummaxjaxlaxstop_gradientexpsum)querykeyvalueattn_weights	max_scoreexp_weights
exp_valuesr
   s          Y/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/attention_flax.pysummarize_chunk/_query_chunk_attention.<locals>.summarize_chunk   s    zz"95QZ[GGLrDA	GG)))4	ggl67ZZ 7W`a
JJ	:	OOO4i@@    c           	      ^  > [         R                  R                  TS/TR                  S-
  -  U SS/-   [	        TR
                  S S 5      TTT/-   S9n[         R                  R                  T
S/T
R                  S-
  -  U SS/-   [	        T
R
                  S S 5      TTT	/-   S9nT" TX5      $ )Nr      r   operandstart_indicesslice_sizes)r   r   dynamic_slicendimlistshape)	chunk_idx	key_chunkvalue_chunk
k_featuresr   r   	num_headsr   r   
v_featuresr   s      r   chunk_scanner-_query_chunk_attention.<locals>.chunk_scanner+   s    GG))#A.)Q1BBSYYs^,	:/VV * 
	 gg++#a0Iq!3DDU[["-..)Z1XX , 
 ui==r!   r   )fxsTr   r   )r+   minr   sqrt	functoolspartialr   
checkpointr   maparanger   r   expand_dimsr   )r   r   r   r
   r   num_kvr2   chunk_valueschunk_weights	chunk_max
global_max	max_diffs
all_valuesall_weightsr/   r0   r   r1   s   `````         @@@@r   _query_chunk_attentionrF      s   $'IIbcN!FIzRJ0NCHHZ((Es~~59
A :
A> >" .1WW[[=SZZXY[acqMr[-s*LT:J	./ICOOIB77LM!!q!)J//-488a8@K##r!   i   query_chunk_sizec           	         ^ ^^^^^^	^
^ T R                   SS u  m
m	mUUU	U
UUU UU4	S jn[        R                  R                  USS[        R
                  " T
T-  5      S9u  px[        R                  " USS9$ )a  
Flax Memory-efficient multi-head dot product attention. https://huggingface.co/papers/2112.05682v2
https://github.com/AminRezaei0x443/memory-efficient-attention

Args:
    query (`jnp.ndarray`): (batch..., query_length, head, query_key_depth_per_head)
    key (`jnp.ndarray`): (batch..., key_value_length, head, query_key_depth_per_head)
    value (`jnp.ndarray`): (batch..., key_value_length, head, value_depth_per_head)
    precision (`jax.lax.Precision`, *optional*, defaults to `jax.lax.Precision.HIGHEST`):
        numerical precision for computation
    query_chunk_size (`int`, *optional*, defaults to 1024):
        chunk size to divide query array value must divide query_length equally without remainder
    key_chunk_size (`int`, *optional*, defaults to 4096):
        chunk size to divide key and value array value must divide key_value_length equally without remainder

Returns:
    (`jnp.ndarray`) with shape of (batch..., query_length, head, value_depth_per_head)
r   Nc           	         >	 [         R                  R                  T	S/T	R                  S-
  -  U SS/-   [	        T	R
                  S S 5      [        T
T5      TT/-   S9nU T
-   [        UTTTTS94$ )Nr   r#   r   r$   )r   r   r   r
   r   )r   r   r(   r)   r*   r+   r6   rF   )r,   _query_chunkr   r   r0   num_qr
   
q_featuresr   rG   r   s      r   r2   5jax_memory_efficient_attention.<locals>.chunk_scannera   s    gg++3%**q.1iA5FFU[["-.#6F2NPY[e1ff , 
 (("!s%9]k
 	
r!   r   )r4   initr5   lengthr   )r+   r   r   scanmathceilr   concatenate)r   r   r   r
   rG   r   r2   rJ   resr0   rL   rM   s   ``````   @@@r   jax_memory_efficient_attentionrV   J   sp    * $);;rs#3 E9j
 
 WW\\
yy!112	  FA ??3R((r!   c                       \ rS rSr% Sr\\S'   Sr\\S'   Sr\\S'   Sr	\
\S	'   S
r\\S'   S
r\\S'   \R                  r\R                   \S'   S rS rS rSS jrSrg)FlaxAttentionz   a  
A Flax multi-head attention module as described in: https://huggingface.co/papers/1706.03762

Parameters:
    query_dim (:obj:`int`):
        Input hidden states dimension
    heads (:obj:`int`, *optional*, defaults to 8):
        Number of heads
    dim_head (:obj:`int`, *optional*, defaults to 64):
        Hidden states dimension inside each head
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`

	query_dim   heads@   dim_head        dropoutFuse_memory_efficient_attentionsplit_head_dimdtypec                    U R                   U R                  -  nU R                   S-  U l        [        R                  " USU R
                  SS9U l        [        R                  " USU R
                  SS9U l        [        R                  " USU R
                  SS9U l        [        R                  " U R                  U R
                  SS9U l
        [        R                  " U R                  S	9U l        g )
Ng      Fto_q)use_biasrc   nameto_kto_vto_out_0)rc   rg   rate)r^   r\   scalennDenserc   r   r   r   rZ   	proj_attnDropoutr`   dropout_layerself	inner_dims     r   setupFlaxAttention.setup   s    MMDJJ.	]]D(
 XXi%tzzPVW
88ITZZfUXXi%tzzPVW
$..

TZZT\\:r!   c                     UR                   u  p#nU R                  nUR                  X#XTU-  5      n[        R                  " US5      nUR                  X%-  X4U-  5      nU$ N)r         r#   r+   r\   reshaper   	transposert   tensor
batch_sizeseq_lendim	head_sizes         r   reshape_heads_to_batch_dim(FlaxAttention.reshape_heads_to_batch_dim   s[    #)<< 
SJJ	
Yy@PQv|4
 6	AQRr!   c                     UR                   u  p#nU R                  nUR                  X%-  XSU5      n[        R                  " US5      nUR                  X%-  X4U-  5      nU$ ry   r|   r   s         r   reshape_batch_dim_to_heads(FlaxAttention.reshape_batch_dim_to_heads   sZ    #)<< 
SJJ	
 7SQv|4
 7	/Rr!   Nc                 6   Uc  UOUnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  UR                  S   n[
        R                  " XGSU R                  U R                  45      n[
        R                  " XWSU R                  U R                  45      n	[
        R                  " XgSU R                  U R                  45      n
O3U R                  U5      nU R                  U5      n	U R                  U5      n
U R                  (       a  UR                  SSS5      nU	R                  SSS5      n	U
R                  SSS5      n
UR                  S   nUS-  S:X  a  [        US-  5      nO;US-  S:X  a  [        US-  5      nO#US-  S:X  a  [        US-  5      nO[        U5      n[        XXS	S
9nUR                  SSS5      nU R                  U5      nGOU R                  (       a  [
        R                  " SX5      nO[
        R                  " SX5      nXR                   -  n["        R$                  " XR                  (       a  SOSS9nU R                  (       aW  [
        R                  " SX5      nUR                  S   n[
        R                  " XSU R                  U R                  -  45      nO([
        R                  " SX5      nU R                  U5      nU R'                  U5      nU R)                  XS9$ )Nr   r   r{   rz   r   r]         i @  )rG   r   zb t n h, b f n h -> b n f tzb i d, b j d->b i jr   zb n f t, b t n h -> b f n hzb i j, b j d -> b i ddeterministic)r   r   r   rb   r+   r   r}   r\   r^   r   ra   r~   intrV   r   r   rm   rn   softmaxrp   rr   )rt   hidden_statescontextr   
query_projkey_proj
value_projbquery_states
key_statesvalue_statesflatten_latent_dimrG   attention_scoresattention_probss                  r   __call__FlaxAttention.__call__   s   #*?-ZZ.
88G$ZZ(
##A&A;;zr4::t}}3UVLX2tzz4==/QRJ;;zr4::t}}3UVL:::FL88BJ:::FL..'11!Q:L#--aA6J'11!Q:L
 ".!3!3B!7!B&!+#&'9B'>#? #b(A-#&'9B'>#? #a'1,#&'9A'=#> #&'9#: :,jrM *33Aq!<M ;;MJM ""#&::.KZ#f #&::.C\#^ /**< jj)9FYFY_`aO "" #

+H/ h!''* #Mr4::PTP]P]C];^ _ #

+BO b $ ? ? N}5!!-!MMr!   )rr   r   rp   r   rm   r   )NT)__name__
__module____qualname____firstlineno____doc__r   __annotations__r\   r^   r`   floatra   boolrb   r   float32rc   rv   r   r   r   __static_attributes__ r!   r   rX   rX   z   sg    , NE3NHcGU+0"D0 ND {{E399"
;<Nr!   rX   c                       \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\\S'   Sr	\
\S	'   \R                  r\R                  \S
'   Sr\
\S'   Sr\
\S'   S rSS jrSrg)FlaxBasicTransformerBlock   a!  
A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
https://huggingface.co/papers/1706.03762


Parameters:
    dim (:obj:`int`):
        Inner hidden states dimension
    n_heads (:obj:`int`):
        Number of heads
    d_head (:obj:`int`):
        Hidden states dimension inside each head
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    only_cross_attention (`bool`, defaults to `False`):
        Whether to only apply cross attention.
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
r   n_headsd_headr_   r`   Fonly_cross_attentionrc   ra   rb   c           
         [        U R                  U R                  U R                  U R                  U R
                  U R                  U R                  S9U l        [        U R                  U R                  U R                  U R                  U R
                  U R                  U R                  S9U l	        [        U R                  U R                  U R                  S9U l        [        R                  " SU R                  S9U l        [        R                  " SU R                  S9U l        [        R                  " SU R                  S9U l        [        R"                  " U R                  S9U l        g )Nrc   )r   r`   rc   h㈵>)epsilonrc   rk   )rX   r   r   r   r`   ra   rb   rc   attn1attn2FlaxFeedForwardffrn   	LayerNormnorm1norm2norm3rq   rr   rt   s    r   rv   FlaxBasicTransformerBlock.setup  s    "HHLLKKLL//**

 #HHLLKKLL//**

 "dhhDJJW\\$djjA
\\$djjA
\\$djjA
ZZT\\:r!   c                 `   UnU R                   (       a   U R                  U R                  U5      X#S9nOU R                  U R                  U5      US9nX-   nUnU R                  U R	                  U5      X#S9nX-   nUnU R                  U R                  U5      US9nX-   nU R                  XS9$ Nr   )r   r   r   r   r   r   r   rr   )rt   r   r   r   residuals        r   r   "FlaxBasicTransformerBlock.__call__2  s     $$ JJtzz-'@'JgM JJtzz-'@P]J^M%0 !

4::m#<g
c%0 !

= 9W%0!!-!MMr!   )r   r   rr   r   r   r   r   NT)r   r   r   r   r   r   r   r`   r   r   r   r   r   rc   ra   rb   rv   r   r   r   r!   r   r   r      s_    2 
HLKGU!&$&{{E399"+0"D0 ND ;6Nr!   r   c                       \ rS rSr% Sr\\S'   \\S'   \\S'   Sr\\S'   Sr\	\S	'   S
r
\\S'   S
r\\S'   \R                  r\R                  \S'   S
r\\S'   S
r\\S'   S rSS jrSrg)FlaxTransformer2DModeliH  a  
A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
https://huggingface.co/papers/1506.02025


Parameters:
    in_channels (:obj:`int`):
        Input number of channels
    n_heads (:obj:`int`):
        Number of heads
    d_head (:obj:`int`):
        Hidden states dimension inside each head
    depth (:obj:`int`, *optional*, defaults to 1):
        Number of transformers block
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    use_linear_projection (`bool`, defaults to `False`): tbd
    only_cross_attention (`bool`, defaults to `False`): tbd
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
    use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
        enable memory efficient attention https://huggingface.co/papers/2112.05682
    split_head_dim (`bool`, *optional*, defaults to `False`):
        Whether to split the head dimension into a new axis for the self-attention computation. In most cases,
        enabling this flag should speed up the computation for Stable Diffusion 2.x and Stable Diffusion XL.
in_channelsr   r   r{   depthr_   r`   Fuse_linear_projectionr   rc   ra   rb   c                 (   [         R                  " SSS9U l        U R                  U R                  -  nU R
                  (       a$  [         R                  " XR                  S9U l        O'[         R                  " USSSU R                  S9U l        [        U R                  5       Vs/ s HY  n[        UU R                  U R                  U R                  U R                  U R                  U R                  U R                   S9PM[     snU l        U R
                  (       a$  [         R                  " XR                  S9U l        O'[         R                  " USSSU R                  S9U l        [         R&                  " U R                  S	9U l        g s  snf )
N    r   )
num_groupsr   r   )r{   r{   VALID)kernel_sizestridespaddingrc   )r`   r   rc   ra   rb   rk   )rn   	GroupNormnormr   r   r   ro   rc   proj_inConvranger   r   r`   r   ra   rb   transformer_blocksproj_outrq   rr   )rt   ru   rJ   s      r   rv   FlaxTransformer2DModel.setupo  s2   LLB=	LL4;;.	%%88IZZ@DL77"jjDL& 4::&#
 ' &%)%>%>jj/3/R/R#22	 '#
 %%HHYjjADMGG"jjDM  ZZT\\:3#
s   'A Fc                    UR                   u  pEpgUnU R                  U5      nU R                  (       a'  UR                  XEU-  U5      nU R	                  U5      nO&U R	                  U5      nUR                  XEU-  U5      nU R
                   H
  n	U	" XUS9nM     U R                  (       a$  U R                  U5      nUR                  XEXg5      nO#UR                  XEXg5      nU R                  U5      nX-   nU R                  XS9$ r   )r+   r   r   r}   r   r   r   rr   )
rt   r   r   r   batchheightwidthchannelsr   transformer_blocks
             r   r   FlaxTransformer2DModel.__call__  s    )6)<)<&u 		-0%%)11%%RM LL7M LL7M)11%%RM!%!8!8-mTabM "9 %% MM-8M)11%QM)11%QM MM-8M%0!!-!MMr!   )rr   r   r   r   r   Nr   )r   r   r   r   r   r   r   r   r`   r   r   r   r   r   r   rc   ra   rb   rv   r   r   r   r!   r   r   r   H  su    6 LKE3NGU"'4'!&$&{{E399"+0"D0 ND (;TNr!   r   c                   x    \ rS rSr% Sr\\S'   Sr\\S'   \	R                  r\	R                  \S'   S rSS jrS	rg
)r   i  a  
Flax module that encapsulates two Linear layers separated by a non-linearity. It is the counterpart of PyTorch's
[`FeedForward`] class, with the following simplifications:
- The activation function is currently hardcoded to a gated linear unit from:
https://huggingface.co/papers/2002.05202
- `dim_out` is equal to `dim`.
- The number of hidden dimensions is hardcoded to `dim * 4` in [`FlaxGELU`].

Parameters:
    dim (:obj:`int`):
        Inner hidden states dimension
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
r   r_   r`   rc   c                     [        U R                  U R                  U R                  5      U l        [
        R                  " U R                  U R                  S9U l        g )Nr   )	FlaxGEGLUr   r`   rc   net_0rn   ro   net_2r   s    r   rv   FlaxFeedForward.setup  s:     txxtzzB
XXdhhdjj9
r!   c                 F    U R                  XS9nU R                  U5      nU$ r   r   r   )rt   r   r   s      r   r   FlaxFeedForward.__call__  s&    

=
N

=1r!   r   Nr   r   r   r   r   r   r   r   r`   r   r   r   rc   rv   r   r   r   r!   r   r   r     s4    " 
HGU{{E399":r!   r   c                   x    \ rS rSr% Sr\\S'   Sr\\S'   \	R                  r\	R                  \S'   S rSS jrS	rg
)r   i  a  
Flax implementation of a Linear layer followed by the variant of the gated linear unit activation function from
https://huggingface.co/papers/2002.05202.

Parameters:
    dim (:obj:`int`):
        Input hidden states dimension
    dropout (:obj:`float`, *optional*, defaults to 0.0):
        Dropout rate
    dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
        Parameters `dtype`
r   r_   r`   rc   c                     U R                   S-  n[        R                  " US-  U R                  S9U l        [        R
                  " U R                  S9U l        g )Nr   rz   r   rk   )r   rn   ro   rc   projrq   r`   rr   rs   s     r   rv   FlaxGEGLU.setup  s>    HHqL	HHY]$**=	ZZT\\:r!   c                     U R                  U5      n[        R                  " USSS9u  p4U R                  U[        R
                  " U5      -  US9$ )Nrz   r   r   )r   r   splitrr   rn   gelu)rt   r   r   hidden_linearhidden_gelus        r   r   FlaxGEGLU.__call__  sJ    		-0%(YY}aa%H"!!-"''+2F"FVc!ddr!   )rr   r   Nr   r   r   r!   r   r   r     s5     
HGU{{E399";
er!   r   )r   )r8   rR   
flax.linenlinenrn   r   	jax.numpynumpyr   r   rF   r   	PrecisionHIGHESTrV   ModulerX   r   r   r   r   r   r!   r   <module>r      s       
 0$ 0$h "%!2!2!:!:TXpt-)NQ-)jm-)`wNBII wNtQN		 QNhgNRYY gNTbii De		 er!   