
    ΅i/                    |   % S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
  SSKJrJr  SS	KJr  \
(       a  SS
KJr  SSKrSSKJr  S/rSqS\S'   \ " S S5      5       r\S%S j5       r S&   S'S jjr\S(S j5       rS)S jr S*         S+S jjr                  S,S jr                      S-S jr\" S5      r S.S jr! S/                       S0S jjr" S1                       S2S jjr#SSSSSSS.                               S3S jjr$SSSS.                                 S4S jjr%   S5SS .             S6S! jjjr&SS .                             S7S" jjr'\RP                  " S#\S$9  g)8zUBER PROTOTYPE!!!    )annotationsN)	dataclass)cache)AnyTYPE_CHECKING)TypeVarTupleUnpack   )	_registry)
ModuleType)Libraryregister_flash_attention_fa4
str | None_FA4_MODULE_PATHc                  *    \ rS rSr% S\S'   SS jrSrg)
_FA4Handle   zLibrary | Nonelibraryc                    S U l         g Nr   )selfs    Q/home/james-whalen/.local/lib/python3.13/site-packages/torch/nn/attention/_fa4.pyremove_FA4Handle.remove"   s	        r   N)returnNone)__name__
__module____qualname____firstlineno____annotations__r   __static_attributes__ r   r   r   r      s    r   r   c                H    [         R                  R                  U 5      u  pU$ r   )torchcudaget_device_capability)devicemajor_s      r   _get_device_majorr-   &   s    zz//7HELr   c                B    [        U 5      nU q[        [        5       5      $ )z
Register FA4 flash attention kernels with the PyTorch dispatcher.

Args:
    module_path: Python module path to the FA4 implementation.
)_fa4_import_moduler   r   _fa4_register_kernels)module_pathr,   s     r   r   r   ,   s#     	;'A"+-..r   c                    [         R                  " U 5      n[        US5      (       a  [        US5      (       d  [        SU  S35      eU$ )N_flash_attn_fwd_flash_attn_bwdzModule 'z' does not expose FA4 kernels)	importlibimport_modulehasattrRuntimeError)r1   modules     r   r/   r/   ;   sG    $$[1F6,--WVEV5W5WXk]2OPQQMr   c                     [        SSS5      n U R                  S[        S5        U R                  S[        S5        U R                  S[        S5        U R                  S[
        S5        U $ )NatenIMPLCUDA_flash_attention_forward_flash_attention_backward#_scaled_dot_product_flash_attention,_scaled_dot_product_flash_attention_backward)r   impl!_fa4_flash_attention_forward_impl"_fa4_flash_attention_backward_impl4_fa4_scaled_dot_product_flash_attention_forward_impl5_fa4_scaled_dot_product_flash_attention_backward_impl)libs    r   r0   r0   C   sg    
&&&
)CHH')JFSHH(*LfUHH-<
 HH6=
 Jr   c                ,   [        S U 5       5      (       d  g[        U Vs1 s H  oDR                  iM     sn5      S:w  a  gU R                  [        R
                  [        R                  4;  a  gU H*  u  pVUR                  [        R                  :w  d  M%  U S3s  $    Uc  U R                  5       S:w  a  gUb  U R                  5       S	:w  a  g
[        R                  R                  5       (       d  g[        U R                  5      S;  a  gg s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7fr   )is_cuda.0ts     r   	<genexpr>,_fa4_common_support_error.<locals>.<genexpr>Z   s     *'Qyy's   zinputs must be CUDA tensorsr
   inputs must share devicez'query dtype must be float16 or bfloat16z dtype must be float32   zdense query must be 4D   zragged query must be 3DzCUDA not available)	   
   z+FA4 requires compute capability 9.0 or 10.0)alllenr*   dtyper'   float16bfloat16float32dimr(   is_availabler-   )querytensors	cum_seq_qrequire_fp32rM   nametensors          r   _fa4_common_support_errorrc   T   s     *'***,
g&gHHg&'1,){{5==%..998$<<5==(V122 % UYY[A-'!1(::""$$#&g5< 's   Dc                    US:w  a  gU(       a  gUb  gUb1  UR                   [        R                  :w  a  gUR                  (       d  g[	        U XU4U5      nUb	  US:X  a  gU$ g )	N        dropout_p must be 0zreturn_debug_mask must be Falsezalibi_slopes not supportedzseqused_k must be int32zseqused_k must be CUDArP   z(query, key, value must be on same device)rW   r'   int32rJ   rc   )	r]   keyvalue	dropout_preturn_debug_maskalibi_slopes	seqused_kr_   errors	            r   _fa4_forward_support_errorro   n   sw     C$0+??ekk),  +%	UE
 ..=r   c
           	     N    US:w  a  gUc  U	b  g[        UXX#XE4USU44S9n
U
b  U
$ g )Nre   rf   z windowed attention not supported	logsumexp)r`   )rc   )grad_outr]   rh   ri   outrq   rj   r_   window_size_leftwindow_size_rightrn   s              r   _fa4_backward_support_errorrv      sS     C$#'8'D1%	#c5"I.0	E r   Tsc                 &    [        S U  5       5      $ )Nc              3  D   #    U  H  oR                  S S5      v   M     g7f)r
      N)	transposerK   s     r   rN   #_transpose_dense.<locals>.<genexpr>   s     4GqQ""Gs    )tuple)r^   s    r   _transpose_denser~      s    4G444r   c           	         [         c  [        S5      e[        [         5      nUUUUSUUU	b  U	R                  5       OS S.nU
b  XS'   UR                  " XU40 UD6u  pXR                  5       4$ )NFA4 not registeredT)softmax_scalecausalrt   ru   
return_lsecu_seqlens_qcu_seqlens_krm   rs   )r   r8   r/   
contiguousr3   )r]   rh   ri   cu_seq_qcu_seq_kscale	is_causalrt   ru   rm   rs   r9   kwargslses                 r   _fa4_run_forwardr      s     /00 01F ,.  /8/DY))+$	F u%%e%B6BHC   r   c                    [         c  [        S5      e[        [         5      nUR                  UUUUU UR	                  5       UU	UUU
S9u  pnXU4$ )Nr   )r   r   r   r   deterministic)r   r8   r/   r4   r   )rr   r]   rh   ri   rs   rq   r   r   r   r   r   r9   dqdkdvs                  r   _fa4_run_backwardr      sp     /00 01F''# ( JBB 2:r   )r   rt   ru   rm   rl   rs   c
                  [        U UUUU	UUU5      nUb  [        SU 35      e[        U UUUUU
UUUUU5      u  nn[        R                  " S[        R
                  U R                  S9n[        R                  " S[        R
                  U R                  S9n[        R                  " SU R                  U R                  S9nUUUUU4$ )Nz)FA4 flash_attention forward unsupported: )rz   )rW   r*   r%   r   )	ro   r8   r   r'   zerosuint64r*   emptyrW   )r]   rh   ri   r_   	cum_seq_kmax_qmax_krj   r   rk   r   rt   ru   rm   rl   rs   rn   r   	rng_statephilox_offset
debug_masks                        r   rC   rC      s    & '	E FugNOOHC DU\\JIKK%,,u||LMQekk%,,GJYz99r   )r   rt   ru   c                   [        U UUUUUU
UUU5
      nUb  [        SU 35      e[        R                  " 5       n[	        U UUUUUUUUUU5      u  nnnUUU4$ )Nz*FA4 flash_attention backward unsupported: )rv   r8   r'   $are_deterministic_algorithms_enabledr   )rr   r]   rh   ri   rs   rq   r_   r   r   r   rj   r   r   unusedr   rt   ru   rn   r   r   r   r   s                         r   rD   rD   $  s    ( (E GwOPP>>@M"JBB r2:r   r   c                  [        U UUUUS S S 5      nUb  [        SU 35      e[        XU5      u  pn
[        R                  " U 5      nUR                  SS5      nUR                  S5      nU	R                  S5      n[        UU	U
S S UUUUUUUS9u  nnnnnU R                  S5      nUR                  S5      nUUS S UUUUU4	$ )NzFA4 SDPA forward unsupported: r
   rz   )r   rs   )ro   r8   r~   r'   
empty_liker{   sizerC   )r]   rh   ri   rj   r   rk   r   rn   qkvout_bhsdout_bshdmax_q_flashmax_k_flashr,   r   r   r   r   r   r   s                         r   rE   rE   W  s	    '	E ;E7CDDu51GA!
 &H!!!Q'H&&)K&&)K3T			40AsI}j JJqMEHHQKE
 
r   c                  [        U UUUUUU
S S S 5
      nUb  [        SU 35      e[        XX4U 5      u  nnnnnUR                  S5      nUR                  S5      n	[	        UUUUUUS S UU	U
UUUUS9u  nnn[        UUU5      u  nnnUUU4$ )NzFA4 SDPA backward unsupported: rz   r   )rv   r8   r~   r   rD   )rr   r]   rh   ri   rs   rq   r_   r   r   r   rj   r   philox_seedr   r   rn   r   r   r   ogor   r   r   s                           r   rF   rF     s    $ (E <UGDEE%e%hGNAq!QJJqMEHHQKE3
				JBB" ""b"-JBBr2:r   FA4)register_fn)r*   ztorch.devicer   int)zflash_attn.cute.interface)r1   strr   r   )r1   r   r   r   )r   r   )r%   )
r]   torch.Tensorr^   ztuple[torch.Tensor, ...]r_   torch.Tensor | Noner`   z$tuple[tuple[str, torch.Tensor], ...]r   r   )r]   r   rh   r   ri   r   rj   floatrk   boolrl   r   rm   r   r_   r   r   r   )rr   r   r]   r   rh   r   ri   r   rs   r   rq   r   rj   r   r_   r   rt   
int | Noneru   r   r   r   )r^   z
Unpack[Ts]r   ztuple[Unpack[Ts]]r   )r]   r   rh   r   ri   r   r   r   r   r   r   float | Noner   r   rt   r   ru   r   rm   r   rs   r   r   z!tuple[torch.Tensor, torch.Tensor])F)rr   r   r]   r   rh   r   ri   r   rs   r   rq   r   r   r   r   r   r   r   r   r   r   r   r   z/tuple[torch.Tensor, torch.Tensor, torch.Tensor]) r]   r   rh   r   ri   r   r_   r   r   r   r   r   r   r   rj   r   r   r   rk   r   r   r   rt   r   ru   r   rm   r   rl   r   rs   r   )"rr   r   r]   r   rh   r   ri   r   rs   r   rq   r   r_   r   r   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   r   rt   r   ru   r   )re   FF)r]   r   rh   r   ri   r   rj   r   r   r   rk   r   r   r   )rr   r   r]   r   rh   r   ri   r   rs   r   rq   r   r_   r   r   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   r   ))__doc__
__future__r   r5   dataclassesr   	functoolsr   typingr   r   typing_extensionsr   r	    r   typesr   r'   torch.libraryr   __all__r   r#   r   r-   r   r/   r0   rc   ro   rv   rw   r~   r   r   rC   rD   rE   rF   register_flash_attention_implr%   r   r   <module>r      s2    #  !  % 2     ! #
  $ * #      3///  * :<	% # 7	
 4	  	
  & # # B 
 	
 
   # ! " 6 $5  $!!	! ! "	!
 "! ! ! !! "! #! 
! '!X   
 	
 
  " "    5X #'$(%)(,##/:/:	/: /: #	/:
 #/: /: /: /: /: /: /: !/: "/: #/:  &!/:" 
#/:D #'$(%000 
0 	0
 
0 0 #0 #0 0 0 0 0 0 0  !0" !#0$ "%0n #: ::	: : 	:
 : : :Z !555 
5 	5
 
5 5 #5 #5 5 5 5 5 5  5  !5p 
 ' ';W Xr   