
    +h.              $          S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	J
r
JrJrJrJr  S SKrSSKJrJrJrJrJrJrJrJrJrJrJrJr  SSKJrJr  Sr Sr!S	r"S
r#Sr$\" 5       =(       a	    \" S\ 5      r%\" 5       r&\" 5       =(       a	    \" S\!5      r'\" S\"5      r(\" 5       r)\" 5       =(       a	    \" S\#5      r*\" 5       =(       a	    \" S\$5      r+\%(       a	  S SK,J-r-J.r.  OSr-Sr.\&(       a  S SK/J-r0  S SK/J.r1  OSr0Sr1\'(       a  S SK2J3r3J4r4J5r5J6r6J7r7J8r8  OSr3Sr6Sr7Sr4Sr5Sr8\((       a  S SK9J:s  J;s  J<r<  \)(       a  S SK=J>r>  OSr>\*(       a  S SK?J@rA  OSrA\+(       a  S SKBJCrD  OSrD\R                  S:  a-  \R                  R                  rH\R                  R                  rJOSqSSS.S jjrKSqSSS.S jjrL\KrH\LrJ\" \M5      rN\S   rO\S   rP\S   rQ " S S\R\5      rS " S S5      rT\ R                  \SR                  4S \\R\S4   4S! jj5       rW      SrSS".S#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS+\\	\R\4      S \\S   S,\R                  4S- jjjr[S&\\R                     S(\ZS,S4S. jr\S#\R                  S$\R                  S%\R                  S,S4S/ jr]S#\R                  S$\R                  S%\R                  S,S4S0 jr^S1\_S2\_S,\4S3 jr`S#\R                  S$\R                  S%\R                  S,S4S4 jraS#\R                  S$\R                  S%\R                  S,S4S5 jrb SqS#\R                  S$\R                  S%\R                  S&\\R                     S,S4
S6 jjrcS \SS,S4S7 jrd\R                  " S8S99 SqS:\_S;\_S<\_S=\\R                     4S> jj5       rg SqS:\_S;\_S&\R                  S=\\R                     4S? jjrh  SsS:\_S;\_S<\_S&\\R                     S=\\R                     S,S4S@ jjriS&\R                  S:\_SA\_S,\R                  4SB jrjSC rk\H" SDSESFSG9S#\R                  S$\R                  S%\R                  S,\\R                  \R                  4   4SH j5       rl\J" SD5      S#\R                  S$\R                  S%\R                  S,\\R                  \R                  4   4SI j5       rm\TR                  \SR                  \]\b\c/SJ9        StS#\R                  S$\R                  S%\R                  S'\YS)\\Y   S(\ZSK\\_\_4   SL\YSM\\R                     SN\ZSO\ZS,\R                  4SP jj5       rp\TR                  \SR                  \]\b\c/SJ9             SuS#\R                  S$\R                  S%\R                  SQ\\R                     SR\\R                     SS\\_   ST\\_   S'\YS)\\Y   S(\ZSK\\_\_4   SL\YSM\\R                     SN\ZSO\ZS&\\R                     S,\R                  4"SU jj5       rr\TR                  \SR                  \]\b\c/SJ9      SvS#\R                  S$\R                  S%\R                  S)\\Y   S(\ZSK\\_\_4   SL\YSN\ZSO\ZS,\R                  4SV jj5       rt\TR                  \SR                  \]\b\c/SJ9           SwS#\R                  S$\R                  S%\R                  SQ\\R                     SR\\R                     SS\\_   ST\\_   S)\\Y   S(\ZSK\\_\_4   SL\YSN\ZSO\ZS&\\R                     S,\R                  4SW jj5       rv\TR                  \SR                  \\\]\c/SJ9      SxS#\R                  S$\R                  S%\R                  S&\\\R                  SX4      S(\ZS)\\Y   S*\ZSY\ZSZ\\	\R\4      S,\R                  4S[ jj5       rx\TR                  \SR                  \]\c/SJ9     SyS#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS,\R                  4S\ jj5       ry\TR                  \SR                  \]\b\c/SJ9     SyS#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS,\R                  4S] jj5       r{\TR                  \SR                  \]\c/SJ9     SyS#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS,\R                  4S^ jj5       r}\TR                  \SR                  \]\b\c/SJ9    SzS#\R                  S$\R                  S%\R                  S'\YS(\ZS)\\Y   S*\ZS,\R                  4S_ jj5       r\TR                  \SGR                   \]\c/SJ9     SyS#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS,\R                  4S` jj5       r\TR                  \SGR                  \]\b\c/SJ9  S{S#\R                  S$\R                  S%\R                  S'\YS)\\Y   S,\R                  4Sa jj5       r\TR                  \SGR                  \]\c/SJ9 S|S#\R                  S$\R                  S%\R                  S(\ZS,\R                  4
Sb jj5       r\TR                  \SGR                  \^\b\c/SJ9   S}S#\R                  S$\R                  S%\R                  S(\ZS)\\Y   SY\ZS,\R                  4Sc jj5       r\TR                  \SGR                  \^\b\c/SJ9        S~S#\R                  S$\R                  S%\R                  SQ\\R                     SR\\R                     SS\\_   ST\\_   S(\ZS)\\Y   Sd\ZS&\\R                     S,\R                  4Se jj5       r\TR                  \SGR                  \`" SfS 5      \c/SJ9       SS#\R                  S$\R                  S%\R                  S(\ZS)\\Y   Sg\PSh\OSd\ZSi\ZSY\ZS,\R                  4Sj jj5       r\TR                  \SGR                  \`" SfS 5      \c/SJ9      SS#\R                  S$\R                  S%\R                  S(\ZS)\\Y   Sg\PSh\OSd\ZSY\ZS,\R                  4Sk jj5       r\TR                  \SGR                  \`" SlS 5      \c/SJ9       SS#\R                  S$\R                  S%\R                  S(\ZS)\\Y   Sg\PSh\OSd\ZSi\ZSY\ZS,\R                  4Sm jj5       r\TR                  \SGR                   \`" SlS 5      \c/SJ9     SS#\R                  S$\R                  S%\R                  S(\ZS)\\Y   Sn\QSd\ZSY\ZS,\R                  4So jj5       r\TR                  \SGR$                  \\\]\c/SJ9     SyS#\R                  S$\R                  S%\R                  S&\\R                     S'\YS(\ZS)\\Y   S*\ZS,\R                  4Sp jj5       rg)    N)Enum)AnyCallableDictListLiteralOptionalTupleUnion   )
get_loggeris_flash_attn_3_availableis_flash_attn_availableis_flash_attn_versionis_sageattention_availableis_sageattention_versionis_torch_npu_availableis_torch_versionis_torch_xla_availableis_torch_xla_versionis_xformers_availableis_xformers_version)DIFFUSERS_ATTN_BACKENDDIFFUSERS_ATTN_CHECKSz2.6.3z2.1.12.5.0z2.2z0.0.29>=)flash_attn_funcflash_attn_varlen_funcr   )r   )sageattnsageattn_qk_int8_pv_fp8_cuda!sageattn_qk_int8_pv_fp8_cuda_sm90sageattn_qk_int8_pv_fp16_cudasageattn_qk_int8_pv_fp16_tritonsageattn_varlen)npu_fusion_attention)flash_attentionz2.4.0)device_typesschemac                   S nUc  U$ U$ )Nc                     U $ N funcs    ]/home/james-whalen/.local/lib/python3.13/site-packages/diffusers/models/attention_dispatch.pywrapcustom_op_no_op.<locals>.wrapx       K    r-   )namefnmutates_argsr(   r)   r1   s         r0   custom_op_no_opr8   w       	 zt)r)r4      )lib_stacklevelc                   S nUc  U$ U$ )Nc                     U $ r,   r-   r.   s    r0   r1   !register_fake_no_op.<locals>.wrap~   r3   r4   r-   )opr6   r;   r<   r1   s        r0   register_fake_no_oprA   }   r9   r4   )fp32	fp32+fp32)
per_threadper_warp)cudatritonc                   `    \ rS rSrSrSrSrSrSrSr	Sr
S	rS
rSrSrSrSrSrSrSrSrSrSrSrg)AttentionBackendName   flashflash_varlen_flash_3_flash_varlen_3flexnative_native_cudnn_native_efficient_native_flash_native_math_native_npu_native_xlasagesage_varlen_sage_qk_int8_pv_fp8_cuda_sage_qk_int8_pv_fp8_cuda_sm90_sage_qk_int8_pv_fp16_cuda_sage_qk_int8_pv_fp16_tritonxformersr-   N)__name__
__module____qualname____firstlineno__FLASHFLASH_VARLEN_FLASH_3_FLASH_VARLEN_3FLEXNATIVE_NATIVE_CUDNN_NATIVE_EFFICIENT_NATIVE_FLASH_NATIVE_MATH_NATIVE_NPU_NATIVE_XLASAGESAGE_VARLEN_SAGE_QK_INT8_PV_FP8_CUDA_SAGE_QK_INT8_PV_FP8_CUDA_SM90_SAGE_QK_INT8_PV_FP16_CUDA_SAGE_QK_INT8_PV_FP16_TRITONXFORMERS__static_attributes__r-   r4   r0   rI   rI      sn     E!LH'O DF#M+#M!LKK DK ;%E"!=#A  Hr4   rI   c                       \ rS rSr0 r0 r0 r\" \5      r	\
r\S	S\S\\\      4S jj5       r\S 5       r\S 5       rSrg)
_AttentionBackendRegistry   Nbackendconstraintsc                 R   ^ ^^ [         R                  ST ST 35        UU U4S jnU$ )NzRegistering attention backend: z with constraints: c                    > U TR                   T'   T=(       d    / TR                  T'   [        [        R                  " U 5      R
                  R                  5       5      TR                  T'   U $ r,   )	_backends_constraintssetinspect	signature
parameterskeys_supported_arg_names)r/   ry   clsrz   s    r0   	decorator5_AttentionBackendRegistry.register.<locals>.decorator   sZ    %)CMM'"(3(9rCW%03G4E4Ed4K4V4V4[4[4]0^C$$W-Kr4   )loggerdebug)r   ry   rz   r   s   ``` r0   register"_AttentionBackendRegistry.register   s-    6wi?RS^R_`a	 r4   c                 L    U R                   U R                  U R                      4$ r,   )_active_backendr}   r   s    r0   get_active_backend,_AttentionBackendRegistry.get_active_backend   s"    ""CMM#2E2E$FFFr4   c                 H    [        U R                  R                  5       5      $ r,   )listr}   r   r   s    r0   list_backends'_AttentionBackendRegistry.list_backends   s    CMM&&())r4   r-   r,   )r^   r_   r`   ra   r}   r~   r   rI   r   r   r   _checks_enabledclassmethodr	   r   r   r   r   r   ru   r-   r4   r0   rw   rw      sw    IL*+ABO+O	3 	(4PX>BZ 	 	 G G * *r4   rw   ry   c              #      #    U [         R                  ;  a  [        SU  S35      e[        U 5      n [	        U 5        [         R
                  nU [         l         Sv   U[         l        g! U[         l        f = f7f)z6
Context manager to set the active attention backend.
zBackend z is not registered.N)rw   r}   
ValueErrorrI   %_check_attention_backend_requirementsr   )ry   old_backends     r0   attention_backendr      sl     
 /9998G9,?@AA"7+G)'2+;;K07-@4?!1K!1s   AA8A( A8(A55A8ry   querykeyvalue	attn_mask	dropout_p	is_causalscale
enable_gqaattention_kwargsreturnc	                   U=(       d    0 nU	c  [         R                  5       u  pO*[        U	5      n
[         R                  R	                  U
5      nU UUUUUUS.UEn[        SS5      (       a  X|S'   [         R                  (       ay  [        U5      [        [         R                  U
   5      -
  nU(       a  [        R                  SU
 SU S35        [         R                  R	                  U
5       H  nU" S0 UD6  M     UR                  5        VVs0 s H"  u  nnU[         R                  U
   ;   d  M  UU_M$     nnnU" S0 UD6$ s  snnf )	N)r   r   r   r   r   r   r   r   r   r   z5Removing unsupported arguments for attention backend z: .r-   )rw   r   rI   r}   getr   r   r   r   r   warningr~   items)r   r   r   r   r   r   r   r   r   ry   backend_name
backend_fnkwargsremoved_kwargscheckkvs                    r0   dispatch_attention_fnr      sI    (-2 $=#O#O#Q j+G4.88<<\J
 	 	F g&&)| 00Vs+D+Y+YZf+g'hhNNRS_R``bcqbrrstu.;;??MEOFO N  &||~s~tq!6O6d6deq6r1rdad~Fs ts   E4Ec                 0    U b  U(       a  [        S5      eg g )Nz8`is_causal` cannot be True when `attn_mask` is not None.)r   )r   r   r   s      r0   _check_attn_mask_or_causalr     s    STT "+r4   c                     U R                   UR                   :w  d  U R                   UR                   :w  a  [        S5      eU R                  UR                  :w  d  U R                  UR                  :w  a  [        S5      eg )Nz1Query, key, and value must be on the same device.z/Query, key, and value must have the same dtype.)devicer   dtyper   r   r   r   s       r0   _check_devicer     s]    ||szz!U\\U\\%ALMM{{cii5;;%++#=JKK $>r4   c                 f    [        XU5        U R                  R                  S:w  a  [        S5      eg )NrF   z/Query, key, and value must be on a CUDA device.)r   r   typer   r   s       r0   _check_device_cudar   %  s/    %e$||F"JKK #r4   majorminorc                    ^ ^ S[         R                  S[         R                  S[         R                  SS 4U U4S jjnU$ )Nr   r   r   r   c                    > [        XU5        [        R                  R                  U R                  5      TT4:  a  [        ST ST S35      eg )NzJQuery, key, and value must be on a CUDA device with compute capability >= r   )r   torchrF   get_device_capabilityr   r   )r   r   r   r   r   r   s       r0   check_device_cuda:_check_device_cuda_atleast_smXY.<locals>.check_device_cuda,  sW    5u-::++ELL9UENJ\]b\ccdejdkklm  Kr4   )r   Tensor)r   r   r   s   `` r0   _check_device_cuda_atleast_smXYr   +  s>     ELL  dh   r4   c                     U R                   UR                   :w  a  [        S5      eU R                   UR                   :w  a  [        S5      eg )Nz'Query and key must have the same dtype.z)Query and value must have the same dtype.)r   r   r   s       r0   _check_qkv_dtype_matchr   6  s?    {{ciiBCC{{ekk!DEE "r4   c                     [        XU5        U R                  [        R                  [        R                  4;  a  [        S5      eg )Nz9Query, key, and value must be either bfloat16 or float16.)r   r   r   bfloat16float16r   r   s       r0   _check_qkv_dtype_bf16_or_fp16r   =  s6    5u-{{5>>5==99TUU :r4   c                    U R                   S   UR                   S   :w  a  [        S5      eU R                   S   UR                   S   :w  a  [        S5      eUb,  UR                   S   UR                   S   :w  a  [        S5      eg g )Nz0Query and key must have the same last dimension.z<Query and value must have the same second to last dimension.z=Attention mask must match the key's second to last dimension.)shaper   )r   r   r   r   r   s        r0   _check_shaper   C  s     {{2#))B-'KLL{{2%++b/)WXX!4		"!EXYY "Fr4   c                 2   U [         R                  [         R                  4;   a,  [        (       d   [	        SU R
                   S[         S35      eg U [         R                  [         R                  4;   a%  [        (       d  [	        SU R
                   S35      eg U [         R                  [         R                  [         R                  [         R                  [         R                  [         R                  4;   a,  [         (       d   [	        SU R
                   S["         S35      eg U [         R$                  :X  a%  [&        (       d  [	        SU R
                   S	35      eg U [         R(                  :X  a%  [*        (       d  [	        S
U R
                   S35      eg U [         R,                  :X  a,  [.        (       d   [	        SU R
                   S[0         S35      eg U [         R2                  :X  a,  [4        (       d   [	        SU R
                   S[6         S35      eg g )NzFlash Attention backend 'zb' is not usable because of missing package or the version is too old. Please install `flash-attn>=z`.zFlash Attention 3 backend 'zp' is not usable because of missing package or the version is too old. Please build FA3 beta release from source.zSage Attention backend 'ze' is not usable because of missing package or the version is too old. Please install `sageattention>=zFlex Attention backend 'zd' is not usable because of missing package or the version is too old. Please install `torch>=2.5.0`.zNPU Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_npu`.zXLA Attention backend 'za' is not usable because of missing package or the version is too old. Please install `torch_xla>=zXformers Attention backend 'z`' is not usable because of missing package or the version is too old. Please install `xformers>=)rI   rb   rc   _CAN_USE_FLASH_ATTNRuntimeErrorr   _REQUIRED_FLASH_VERSIONrd   re   _CAN_USE_FLASH_ATTN_3rn   ro   rp   rq   rr   rs   _CAN_USE_SAGE_ATTN_REQUIRED_SAGE_VERSIONrf   _CAN_USE_FLEX_ATTNrl   _CAN_USE_NPU_ATTNrm   _CAN_USE_XLA_ATTN_REQUIRED_XLA_VERSIONrt   _CAN_USE_XFORMERS_ATTN_REQUIRED_XFORMERS_VERSIONr   s    r0   r   r   U  sg   '--/C/P/PQQ""+GMM?  ;]  ^u  ]v  vx  y  #
 
)224H4X4XY	Y$$-gmm_  =m  n  %
 
!!((66;;7799 
 "!*7==/  :_  `v  _w  wy  z  "
 
(--	-!!*7==/  :^  _  "
 
(44	4  )'--  9Z  [  !
 
(44	4  )'--  9Z  [p  Zq  qs  t  !
 
(11	1%%.w}}o  >^  _y  ^z  z|  }  & 
2r4      )maxsize
batch_size	seq_len_q
seq_len_kvr   c                    [         R                  " U 4U[         R                  US9n[         R                  " U 4U[         R                  US9n[         R                  " U S-   [         R                  US9n[         R                  " U S-   [         R                  US9n[         R                  " USS9USS & [         R                  " USS9USS & UR                  5       R                  5       nUR                  5       R                  5       n	XE4Xg4X44$ )Nr   r   r:   r   dim)r   fullint32zeroscumsummaxitem)
r   r   r   r   	seqlens_q	seqlens_kcu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_ks
             r0   3_prepare_for_flash_attn_or_sage_varlen_without_maskr     s     

J=)5;;vVI

J=*EKKPVWI;;zA~U[[PL;;zA~U[[PL||I15L||I15L==?'')L==?'')L!L#?,A]]]r4   c                    [         R                  " U 4U[         R                  US9nUR                  S[         R                  S9n[         R                  " U S-   [         R                  US9n[         R                  " U S-   [         R                  US9n[         R
                  " USS9USS & [         R
                  " USS9USS & UR                  5       R                  5       nUR                  5       R                  5       n	XE4Xg4X44$ )Nr   r:   )r   r   r   r   )r   r   r   sumr   r   r   r   )
r   r   r   r   r   r   r   r   r   r   s
             r0   0_prepare_for_flash_attn_or_sage_varlen_with_maskr     s     

J=)5;;vVI!5;;7I;;zA~U[[PL;;zA~U[[PL||I15L||I15L==?'')L==?'')L!L#?,A]]]r4   c                 8    Uc  [        XX$5      $ [        XX45      $ r,   )r   r   )r   r   r   r   r   s        r0   &_prepare_for_flash_attn_or_sage_varlenr     s&     B:Zdmm;JS\eer4   	seq_len_kc           	         U R                   [        R                  :w  a  [        SU R                    S35      eU R                  S:X  a"  U R                  S5      R                  X5      n GOAU R                  S:X  aH  U R                  S5      SU4;  a  [        SU R                  S    SU S35      eU R                  X5      n OU R                  S	:X  aW  U R                  S5      SU4;  a  [        SU R                  S    SU S
35      eU R                  SS9n U R                  X5      n OU R                  S:X  aZ  U R                  S5      SU4;  a  [        SU R                  S    SU S35      eU R                  USSU5      n U R                  SS9n O[        SU R                   35      eU R                  X4:w  a  [        SU R                   SU SU S35      eU $ )z
Normalize an attention mask to shape [batch_size, seq_len_k] (bool) suitable for inferring seqlens_[q|k] in
FlashAttention/Sage varlen.

Supports 1D to 4D shapes and common broadcasting patterns.
z)Attention mask must be of type bool, got r   r:   r   r   zattn_mask.shape[0] (z) must be 1 or z for 2D attention mask.   z for 3D attention mask.r      z for 4D attention mask.r   )r:   r   z"Unsupported attention mask shape: z.Normalized attention mask shape mismatch: got z, expected (z, ))
r   r   boolr   ndim	unsqueezeexpandsizer   any)r   r   r   s      r0   _normalize_attn_maskr    s    %**$DY__DUUVWXX~~''*11*H		1	>>!Q
O3&yq'9&:/*Ulm  $$Z;		1	 >>!Q
O3&yq'9&:/*Ulm  MMaM(	$$Z;		1	>>!Q
O3&yq'9&:/*Ulm  $$ZRC	MMfM-	 =ioo=NOPP:11<Y__<M\ZdYeeghqgrrst
 	
 r4   c                 
    X#:  $ r,   r-   )	batch_idxhead_idxq_idxkv_idxs       r0   _flex_attention_causal_mask_modr    s
    ?r4   z!flash_attn_3::_flash_attn_forwardr-   rF   )r7   r(   c                 J    [        XU5      u  p4UR                  SSS5      nX44$ )Nr   r   r:   )flash_attn_3_funcpermute)r   r   r   outlses        r0   _wrapped_flash_attn_3_originalr    s+     !U3HC
++aA
C8Or4   c                 v    U R                   u  p4pVX4U4n[        R                  " U 5      U R                  U5      4$ r,   )r   r   
empty_like	new_empty)r   r   r   r   seq_len	num_headshead_dim	lse_shapes           r0   _r    s9    /4{{,Ji0IE"EOOI$>>>r4   )rz   window_sizesoftcapalibi_slopesdeterministicreturn_attn_probsc                 ,    [        U UUUUUUUUU	U
S9nU$ )N)qr   r   r   softmax_scalecausalr  r  r  r  r  r   )r   r   r   r   r   r   r  r  r  r  r  r  s               r0   _flash_attentionr$    s6    " 


!#+C Jr4   r   r   r   r   c                 D   U R                   u  nn  nUR                   u  nn  nUb  [        UUU5      n[        S X4XV4 5       5      (       a"  [        UUUXR                  S9u  u  nnu  p4u  pVO[
        R                  " U4U[
        R                  U R                  S9nUR                  [
        R                  U R                  S9nUR                  [
        R                  U R                  S9n/ / nn[        U5       H8  nUU   nUR                  UUS U24   5        UR                  UUS U24   5        M:     U R                  SS5      n[
        R                  " USS9n[
        R                  " USS9n[        UUUUUUUUUU	U
UUUUS9nUR                  SUS45      nU$ )	Nc              3   (   #    U  H  oS L v   M
     g 7fr,   r-   .0xs     r0   	<genexpr>*_flash_varlen_attention.<locals>.<genexpr>E       
WV9V   r   r   r   r   r:   r   )r!  r   r   r   r   r   r   r   r"  r#  r  r  r  r  r  r   )r   r  r  r   r   r   r   r   torangeappendflattencatr   	unflatten)r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r   r   r   r  r   r   	key_validvalue_validb	valid_lenquery_packed
key_packedvalue_packedr  s                                r0   _flash_varlen_attentionr<  )  s   , #(++J	1a))Az1a(J
K	

W|<V
WWW2IzY|| 	SI46R|\ JJ
}l%++V[VbVbc	#U[[N#U[[N{I:aL	Q

]+,5JYJ/0 
 ==A&L9!,J99[a0L
 


!!!!!#+C" --J+
,CJr4   c	                     [        S0 SU _SU_SU_SU_SU_SS _SS _SS _S	S _S
U_SS_SU_SS_SS _SU_SS_6tpnU(       a  X4$ U	$ )Nr!  r   r   r"  r#  qv	q_descale	k_descale	v_descaler  attention_chunkr   r  
num_splitsr:   pack_gqar  	sm_marginr-   )r  )r   r   r   r   r   r  r  r  r  r  r  r  s               r0   _flash_attention_3rF  p  s     % 

  	
            $  !LCq$ +C:33r4   c                    U R                   u  p  nUR                   u  nn  nUb  [        XU5      n[        S X4XV4 5       5      (       a!  [        XUXR                  S9u  u  nnu  p4u  pVO[
        R                  " U4U[
        R                  U R                  S9nUR                  [
        R                  U R                  S9nUR                  [
        R                  U R                  S9n/ / nn[        U5       H8  nUU   nUR                  UUS U24   5        UR                  UUS U24   5        M:     U R                  SS5      n[
        R                  " USS9n[
        R                  " USS9n[        S0 SU_SU_S	U_S
U_SU_SU_SU_SS _SS _SU_SU_SS _SS _SS _SS _SU	_SU
_SS_SS _SU_SS_6tnnnUR                  SUS45      nU(       a  UU4$ U$ )Nc              3   (   #    U  H  oS L v   M
     g 7fr,   r-   r'  s     r0   r*  ,_flash_varlen_attention_3.<locals>.<genexpr>  r,  r-  r.  r   r   r:   r   r!  r   r   r   r   r   r   	seqused_q	seqused_kr"  r#  r>  r?  r@  rA  r  r  rC  rD  r  rE  r   r-   )r   r  r  r   r   r   r   r   r/  r0  r1  r2  r3  flash_attn_3_varlen_funcr4  )r   r   r   r   r   r   r   r   r   r  r  r  r  r   r   r   r  r   r   r5  r6  r7  r8  r9  r:  r;  r  r  s                               r0   _flash_varlen_attention_3rM    s_   ( #(++J1a))Az1a(
K	

W|<V
WWW2zY|| 	SI46R|\ JJ
}l%++V[VbVbc	#U[[N#U[[N{I:aL	Q

]+,5JYJ/0 
 ==A&L9!,J99[a0L+ 

  "	
 " " "           !" #$ %& '( $)* +LCq. --J+
,C*C:33r4   zflex_attention.BlockMask
return_lsekernel_optionsc	                   ^ S n	S n
U R                   u  ppUR                   u  p  nTb  [        T[        R                  5      (       a  Tn
OU(       a(  [        R                  " [
        XXU R                  5      n
O[        R                  " T5      (       a  TR                  S:X  a2  TR                  TR                  S5      STR                  S5      S5      mTR                  XX5      mTR                  [        R                  :X  a+  U4S jn[        R                  " UUS XU R                  5      n
OU4S jn	O[        S5      eS XU4 5       u  pn[        R                  " U UUU	U
UUUUS9	nUR!                  SSSS	5      nU$ )
Nr   r   r:   c                    > TXX#4   $ r,   r-   )r  r	  r
  r  r   s       r0   mask_mod(_native_flex_attention.<locals>.mask_mod  s     e!CDDr4   c                    > U TXX44   -   $ r,   r-   )scorer  r	  r
  r  r   s        r0   	score_mod)_native_flex_attention.<locals>.score_mod
  s    ye)KLLLr4   zCAttention mask must be either None, a BlockMask, or a 2D/4D tensor.c              3   H   #    U  H  oR                  S SSS5      v   M     g7fr   r   r:   r   Nr  r'  s     r0   r*  )_native_flex_attention.<locals>.<genexpr>  "     L8K11aA..8K    ")	r   r   r   rV  
block_maskr   r   rN  rO  r   )r   
isinstanceflex_attention	BlockMaskcreate_block_maskr  r   r   	is_tensorr  viewr  r  r   r   r   r  )r   r   r   r   r   r   r   rN  rO  rV  r^  r   r   r  r  r   rR  r  s      `              r0   _native_flex_attentionre    sh     IJ*/++'J9))A1aJy.2J2JKK
	#55+ZI[`[g[g

 
	#	#>>Q!y~~a'8!Y^^A=NPQRI$$ZIR	??ejj(E (99*dI5<<J
M ^__LU8KLE

'
'%
C ++aAq
!CJr4   c                     S XU4 5       u  pn[         R                  R                  R                  U UUUUUUUS9nUR	                  SSSS5      nU$ )Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  $_native_attention.<locals>.<genexpr>-  r\  r]  r   r   r   r   r   r   r   r   r   r   r:   r   )r   nn
functionalscaled_dot_product_attentionr  	r   r   r   r   r   r   r   r   r  s	            r0   _native_attentionrn    sk     MU8KLE
((


:
: ; 	C ++aAq
!CJr4   c                    S XU4 5       u  pn[         R                  R                  R                  [         R                  R                  R                  R
                  5         [         R                  R                  R                  U UUUUUUUS9nS S S 5        WR                  SSSS5      nU$ ! , (       d  f       N$= f)Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  *_native_cudnn_attention.<locals>.<genexpr>J  r\  r]  ri  r   r   r:   r   )	r   rj  	attentionsdpa_kernel
SDPBackendCUDNN_ATTENTIONrk  rl  r  rm  s	            r0   _native_cudnn_attentionrv  <  s     MU8KLE				'	'(:(:(E(E(U(U	Vhh!!>>! ? 	
 
W ++aAq
!CJ 
W	V   $/B11
B?c                    S XU4 5       u  pn[         R                  R                  R                  [         R                  R                  R                  R
                  5         [         R                  R                  R                  U UUUUUUUS9nS S S 5        WR                  SSSS5      nU$ ! , (       d  f       N$= f)Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  ._native_efficient_attention.<locals>.<genexpr>h  r\  r]  ri  r   r   r:   r   )	r   rj  rr  rs  rt  EFFICIENT_ATTENTIONrk  rl  r  rm  s	            r0   _native_efficient_attentionr|  Z  s     MU8KLE				'	'(:(:(E(E(Y(Y	Zhh!!>>! ? 	
 
[ ++aAq
!CJ 
[	Zrw  c                    S XU4 5       u  pn[         R                  R                  R                  [         R                  R                  R                  R
                  5         [         R                  R                  R                  U UUS UUUUS9nS S S 5        WR                  SSSS5      nU$ ! , (       d  f       N$= f)Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  *_native_flash_attention.<locals>.<genexpr>  r\  r]  ri  r   r   r:   r   )	r   rj  rr  rs  rt  FLASH_ATTENTIONrk  rl  r  )r   r   r   r   r   r   r   r  s           r0   _native_flash_attentionr  x  s     MU8KLE				'	'(:(:(E(E(U(U	Vhh!!>>! ? 	
 
W ++aAq
!CJ 
W	Vrw  c                    S XU4 5       u  pn[         R                  R                  R                  [         R                  R                  R                  R
                  5         [         R                  R                  R                  U UUUUUUUS9nS S S 5        WR                  SSSS5      nU$ ! , (       d  f       N$= f)Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  )_native_math_attention.<locals>.<genexpr>  r\  r]  ri  r   r   r:   r   )	r   rj  rr  rs  rt  MATHrk  rl  r  rm  s	            r0   _native_math_attentionr    s     MU8KLE				'	'(:(:(E(E(J(J	Khh!!>>! ? 	
 
L ++aAq
!CJ 
L	Krw  c                     [        U UUU R                  S5      SS Uc&  S[        R                  " U R                  S   5      -  OUSSSU-
  SSS9S   $ )	Nr   BSNDg      ?r   i   Fr   )input_layoutpser   pre_tockensnext_tockens	keep_probsyncinner_precise)r&   r  mathsqrtr   )r   r   r   r   r   s        r0   _native_npu_attentionr    sg      

127-cDIIekk"o..U	/ 	 	r4   c                     S XU4 5       u  pnU [         R                  " U R                  S   5      -  n [        U UUUS9nUR	                  SSSS5      nU$ )Nc              3   H   #    U  H  oR                  S SSS5      v   M     g7frY  rZ  r'  s     r0   r*  (_native_xla_attention.<locals>.<genexpr>  r\  r]  r   )r!  r   r   r#  r   r   r:   r   )r  r  r   xla_flash_attentionr  )r   r   r   r   r  s        r0   _native_xla_attentionr    se     MU8KLEDIIekk"o..E



	C ++aAq
!CJr4   c           
           [        U UUSUUUS9$ )NNHD)r!  r   r   tensor_layoutr   sm_scalerN  )r    )r   r   r   r   r   rN  s         r0   _sage_attentionr    s&     


 r4   smooth_kc                 0   U R                   u  p  nUR                   u  p  nU
b  [        XU5      n
[        S X4XV4 5       5      (       a   [        XXU R                  S9u  u  pu  p4u  pVO[
        R                  " U4U[
        R                  U R                  S9nUR                  [
        R                  U R                  S9nUR                  [
        R                  U R                  S9n/ / nn[        U5       H8  nUU   nUR                  UUS U24   5        UR                  UUS U24   5        M:     U R                  SS5      n[
        R                  " USS9n[
        R                  " USS9n[        UUUUUUUUUU	S9
nUR                  SUS45      nU$ )	Nc              3   (   #    U  H  oS L v   M
     g 7fr,   r-   r'  s     r0   r*  )_sage_varlen_attention.<locals>.<genexpr>  r,  r-  r.  r   r   r:   r   )
r!  r   r   r   r   r   r   r   r  r  r   )r   r  r  r   r   r   r   r   r/  r0  r1  r2  r3  r%   r4  )r   r   r   r   r   r   r   r   r   r  r   r   r   r  r   r   r5  r6  r7  r8  r9  r:  r;  r  s                           r0   _sage_varlen_attentionr    s   " #(++J1a))A1a(
K	

W|<V
WWW2zu|| 	S46R|\ JJ
}l%++V[VbVbc	#U[[N#U[[N{I:aL	Q

]+,5JYJ/0 
 ==A&L9!,J99[a0L



!!!!C --J+
,CJr4   	   qk_quant_granpv_accum_dtypesmooth_vc
                 (    [        U UUSUUUUUUU	S9$ Nr  )r!  r   r   r  r   r  r  r  r  r  rN  )r!   
r   r   r   r   r   r  r  r  r  rN  s
             r0   #_sage_qk_int8_pv_fp8_cuda_attentionr  9  s2      (


#% r4   c	                 &    [        U UUSUUUUUUS9
$ )Nr  )
r!  r   r   r  r   r  r  r  r  rN  )r"   )	r   r   r   r   r   r  r  r  rN  s	            r0   (_sage_qk_int8_pv_fp8_cuda_sm90_attentionr  X  s/     -


#% r4      c
                 (    [        U UUSUUUUUUU	S9$ r  )r#   r  s
             r0   $_sage_qk_int8_pv_fp16_cuda_attentionr  u  s2      )


#% r4   quantization_backendc                 $    [        U UUSUUUUUS9	$ )Nr  )	r!  r   r   r  r  r   r  r  rN  )r$   )r   r   r   r   r   r  r  rN  s           r0   &_sage_qk_int8_pv_fp16_triton_attentionr    s,     +


1
 
r4   c                    U R                   u  ppUR                   u  ppU(       a  [        R                  " 5       nOUb  UR                  S:X  a3  UR	                  UR                  S5      SUR                  S5      S5      nOUR                  S:w  a  [        S5      eUR                  XX5      R                  U 5      nU(       ay  X-  S:w  a  [        S5      eX-  nU R                  SUS45      n UR                  SUS45      R                  SSSUS5      nUR                  SUS45      R                  SSSUS5      n[        R                  " XX#XF5      nU(       a  UR                  SS5      nU$ )	Nr   r   r:   r   zDOnly 2D and 4D attention masks are supported for xformers attention.zKNumber of heads in query must be divisible by number of heads in key/value.r   r   )r   xopsLowerTriangularMaskr  rd  r  r   r  type_asr4  memory_efficient_attentionr2  )r   r   r   r   r   r   r   r   r   r   num_heads_qr  r   num_heads_kvnum_heads_per_groupr  s                   r0   _xformers_attentionr    sX    -2KK)J;%(YY"A<,,.			>>Q!y~~a'8!Y^^A=NPQRI^^q cdd$$ZiT\\]bc	%*jkk)9L"#56mmAb1299"b"FY[]^L"#56==b"bJ]_ab

)
)%e	
YCkk!QJr4   r,   )N        FNFN)NN)r  NFr   r   r  NFF)NNNNr  NFr  r  NFFN)NFr  r  FF)NNNNNFr  r  FFN)NFNFFN)Nr  FNF)r  FNF)r  N)F)FNF)NNNNFNTN)FNrD   rC   TFF)FNrD   rC   TF)FNrD   rB   TFF)FNrG   TF)
contextlib	functoolsr   r  enumr   typingr   r   r   r   r   r	   r
   r   r   utilsr   r   r   r   r   r   r   r   r   r   r   r   utils.constantsr   r   r   r   _REQUIRED_FLEX_VERSIONr   r   r   r   r   r   r   r   r   
flash_attnr   r   flash_attn_interfacer  rL  sageattentionr    r!   r"   r#   r$   r%   !torch.nn.attention.flex_attentionrj  rr  r`  	torch_npur&   $torch_xla.experimental.custom_kernelr'   r  xformers.opsopsr  __version__library	custom_op
_custom_opregister_fake_register_faker8   rA   r^   r   _SAGE_ATTENTION_PV_ACCUM_DTYPE_SAGE_ATTENTION_QK_QUANT_GRAN$_SAGE_ATTENTION_QUANTIZATION_BACKENDstrrI   rw   contextmanagerrg   r   r   floatr   r   r   r   r   intr   r   r   r   r   	lru_cacher   r   r   r   r  r  r  r  r   rb   r$  rc   r<  rd   rF  re   rM  rf   re  rn  rh   rv  ri   r|  rj   r  rk   r  rl   r  rm   r  rn   r  ro   r  rp   r  rq   r  rr   r  rs   r  rt   r  r-   r4   r0   <module>r     s        M M M     L "      % -/h4I$Pg4h 13 /1l6NtUk6l %d,BC *, *,b1EdLa1b .0j5HOi5j  BBO! IW#    H$(!&*##' (,%O  ?> . [ D 	((J]]00N*UY **4Q * !J(N 
H	 "))<!=  '(@ A './?'@ $3 D* *8 BVB]B] @uS*>%>? @ @. )-!15,  /3, <<, 	,  <<,  %	, 
 ,  ,  E?,  ,  tCH~.,  *+,  \\, fU(5<<*@ UT U`d U
L LELL L L\` LLell L Lell Lae L3 s x F%,, FU\\ F%,, Fei FV VELL VQVQ]Q] Vlp V )-	Z<<Z	Z <<Z %	Z 
Z$03G 0D 0f S!
 &*	^^^ ^ U\\"	^ "^* &*	^^^ ||^ U\\"	^* )-%)	f	f	f 	f %		f
 U\\"	f 
	f1ELL 1c 1c 1V[VbVb 1h /bvV<<#ll38<<
5<<%& W 34?U\\ ? ?U\\ ?eELLZ_ZfZfLfFg ? 5? ## =|L $  !#++/#<<	 << 	
 E?  sCx  5<<(   \\	: ##%% =|L $  ,0+/"&"&!#++/#(,!@<<@	@ <<@ 5<<(	@
 5<<(@ 3-@ 3-@ @ E?@ @ sCx@ @ 5<<(@ @ @  %!@" \\#@	@F ##!! =|L $  "#+#4<<4	4 <<4 E?	4
 4 sCx4 4 4 4 \\4	4@ ##(( =|L $  ,0+/"&"&!#+#(,D4<<D4	D4 <<D4 5<<(	D4
 5<<(D4 3-D4 3-D4 E?D4 D4 sCxD4 D4 D4 D4 %D4 \\D4	D4N ##+]LI $  LP!/39<<9	9 <<9 ell,FFGH	9
 9 E?9 9 9 T#s(^,9 \\9	9x ##- $  )-!<<	 << %	
   E?  \\	2 ##&& =|L $  )-!<<	 << %	
   E?  \\	4 ##**- $  )-!<<	 << %	
   E?  \\	4 ##&& =|L $  !<<	 << 	
  E?  \\	2 ##%%- $  )-!<<	 << %	
   E?  \\	4 ##$$ =|L $  !	<<			 <<	 		
 E?	 \\			0 ##$$- $  	<<	 << 	
 \\	$ ###%BLQ $  !<<	 << 	
 E?  \\	& ##$$#%BLQ $  ,0+/"&"&!(,6<<6	6 <<6 5<<(	6
 5<<(6 3-6 3-6 6 E?6 6 %6 \\6	6r ##220A6E $  !3?5@<<	 << 	
 E? 1 3    \\	6 ##770A6E $  !3?5@<<	 << 	
 E? 1 3   \\	2 ##330A6E $  !3?5;<<	 << 	
 E? 1 3    \\	6 ##550A6E $  !AI<<	 << 	
 E? ?   \\	. ##!!+]LI $  )-!#<<#	# <<# %	#
 # # E?# # \\#	#r4   