
    ΅i|*              !       t   S r SSKrSSKJr  SSKJrJr  SSKr\R                  " \	5      r
SS/r\" SS9S	\S
\4S j5       r " S S\5      r\R                   R#                  S0 S9  S,S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S\S-  S
\\R$                  \R$                  \R$                  4   4S jj5       r\R,                    S,S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S\S-  S
\\R$                  \R$                  \R$                  4   4S jj5       r   S-S\R$                  S\R$                  S\R$                  S\R$                  S\R$                  S\S\S\S\S-  S\S-  S
\R$                  \\R$                  \R$                  4   -  4S jjrS\S\\S4   S\S
S4S  jr\R                   R#                  S!0 S9 S.S"\R$                  S\R$                  S\R$                  S\R$                  S#\R$                  S$\R$                  S\R$                  S\R$                  S\S\S\S%\R$                  S\S-  S
\\R$                  \R$                  \R$                  4   4S& jj5       r\R,                   S.S"\R$                  S\R$                  S\R$                  S\R$                  S#\R$                  S$\R$                  S\R$                  S\R$                  S\S\S\S%\R$                  S\S-  S
\\R$                  \R$                  \R$                  4   4S' jj5       rS\S"\R$                  S(\R$                  S)\R$                  S
\\R$                  S-  S4   4
S* jr\R;                  \\S+9  g)/z
Variable-length attention implementation using Flash Attention.

This module provides a high-level Python interface for variable-length attention
that calls into the optimized Flash Attention kernels.
    N)	lru_cache)Any
NamedTuplevarlen_attn
AuxRequest   )maxsizedevice_indexreturnc                     g)z;Cache device capability check to avoid repeated CUDA calls.F )r
   s    S/home/james-whalen/.local/lib/python3.13/site-packages/torch/nn/attention/varlen.py_should_use_cudnnr      s         c                   (    \ rS rSr% SrSr\\S'   Srg)r      z
Request which auxiliary outputs to compute from varlen_attn.

Each field is a boolean indicating whether that auxiliary output should be computed.
Flser   N)	__name__
__module____qualname____firstlineno____doc__r   bool__annotations____static_attributes__r   r   r   r   r      s     Cr   ztorch_attn::_varlen_attn)mutates_argsquerykeyvaluecu_seq_qcu_seq_kmax_qmax_k	is_causalscalec	                    U R                   =(       a    [        U R                  R                  5      n	U	(       aW  [        R                  S5        [        R                  R                  R                  U UUSUUUUSSUSUS9n
U
S   U
S   U
S	   pnOK[        R                  S
5        [        R                  R                  R                  U UUUUUUSUSUS9u  pn  n[        R                  " S[        R                  U R                  S9nXU4$ )z
Private custom op for variable-length attention.

This is the internal implementation. Users should use the public varlen_attn function instead.
#Using cuDNN backend for varlen_attnNT        Fr%   r         -Using Flash Attention backend for varlen_attn)return_debug_maskr%      dtypedevice)is_cudar   r2   indexloginfotorchopsaten_cudnn_attention_forward_flash_attention_forwardzerosuint64)r   r   r   r    r!   r"   r#   r$   r%   	use_cudnnresultoutputsoftmax_lse	rng_state_
rng_state_s                   r   _varlen_attnrE   $   s   $ G"3ELL4F4F"GI6788 9 
  *0F1IvayYY@A/4yy~~/V/V# 0W 0
,Y1 ELLJ 
**r   c	                    [         R                  " U 5      n	U R                  S5      n
U R                  S5      n[         R                  R                  (       aE  UR                  S5      S-
  n[         R
                  " XU4[         R                  U R                  S9nO/[         R
                  " X4[         R                  U R                  S9n[         R
                  " S[         R                  U R                  S9nXU4$ )z
Fake implementation for meta tensor computation and tracing.

Based on the 3D varlen path from meta__flash_attention_forward:
- query shape: (total, num_heads, head_dim)
- logsumexp shape: (num_heads, total_q)
r   r*   r0   r.   )	r7   
empty_likesizeversionhipemptyfloatr2   r=   )r   r   r   r    r!   r"   r#   r$   r%   r@   total_q	num_heads
batch_size	logsumexprB   s                  r   _varlen_attn_fakerQ   a   s    ( e$F jjmG

1I}}]]1%)
KKE*%++ell
	 KK ELL
	 DU\\JIi''r   
return_auxc
                     [         R                  R                  R                  XX#XEXgU	5	      u  pnUb  UR                  (       a  X4$ U
$ )a
  
Compute variable-length attention using Flash Attention.
This function is similar to scaled_dot_product_attention but optimized for
variable-length sequences using cumulative sequence position tensors.

Args:
    query (Tensor): Query tensor; shape :math:`(T_q, H, D)`
    key (Tensor): Key tensor; shape :math:`(T_k, H, D)`
    value (Tensor): Value tensor; shape :math:`(T_k, H, D)`
    cu_seq_q (Tensor): Cumulative sequence positions for queries; shape :math:`(N+1,)`
    cu_seq_k (Tensor): Cumulative sequence positions for keys/values; shape :math:`(N+1,)`
    max_q (int): Maximum query sequence length in the batch.
    max_k (int): Maximum key/value sequence length in the batch.
    is_causal (bool, optional): If set to True, applies causal masking (default: False).
    return_aux (Optional[AuxRequest]): If not None and ``return_aux.lse`` is True, also returns the logsumexp tensor.
    scale (float, optional): Scaling factor for attention scores

Returns:
    output (Tensor): Output tensor from attention computation; shape :math:`(T_q, H, D)`.

    If ``return_aux`` is not None and ``return_aux.lse`` is True:
        lse (Tensor): Log-sum-exp of attention scores; shape :math:`(T_q, H)`.

Shape legend:
    - :math:`N`: Batch size
    - :math:`T_q`: Total number of query tokens in the batch (sum of all query sequence lengths)
    - :math:`T_k`: Total number of key/value tokens in the batch (sum of all key/value sequence lengths)
    - :math:`H`: Number of attention heads
    - :math:`D`: Head dimension

Example::

    >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
    >>> batch_size, max_seq_len, embed_dim, num_heads = 2, 512, 1024, 16
    >>> head_dim = embed_dim // num_heads
    >>> seq_lengths = []
    >>> for _ in range(batch_size):
    ...     length = torch.randint(1, max_seq_len // 64 + 1, (1,)).item() * 64
    ...     seq_lengths.append(min(length, max_seq_len))
    >>> seq_lengths = torch.tensor(seq_lengths, device="cuda")
    >>> total_tokens = seq_lengths.sum().item()
    >>>
    >>> # Create packed query, key, value tensors
    >>> query = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> key = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>> value = torch.randn(
    ...     total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda"
    ... )
    >>>
    >>> # Build cumulative sequence tensor
    >>> cu_seq = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32)
    >>> cu_seq[1:] = seq_lengths.cumsum(0)
    >>> max_len = seq_lengths.max().item()
    >>>
    >>> # Call varlen_attn
    >>> output = varlen_attn(
    ...     query, key, value, cu_seq, cu_seq, max_len, max_len, is_causal=False
    ... )
)r7   r8   
torch_attnrE   r   )r   r   r   r    r!   r"   r#   r$   rR   r%   outr   rC   s                r   r   r      sH    V ))&&33EXeKCa *..xJr   ctxinputs.r@   c           
      v    Uu	  p4pVpxpnUu  pnU R                  X4XVX|X5        Xl        Xl        Xl        Xl        g N)save_for_backwardr"   r#   r$   r%   )rV   rW   r@   r   r   r   r    r!   r"   r#   r$   r%   rU   r   rB   s                  r   _setup_contextr[      sD    LRIE%E Ci%excUIIMIr   z!torch_attn::_varlen_attn_backwardgrad_outrU   r   rB   c                    [         R                  " SUR                  S9nUR                  =(       a    [	        UR                  R
                  5      nU(       aO  [        R                  S5        [         R                  R                  R                  U UUUUUUUUU	SU
UUUS9u  nnnON[        R                  S5        [         R                  R                  R                  U UUUUUUUUU	SU
UUUS9u  nnnUUU4$ )Nr   )r2   r'   r(   r)   r,   )r7   rK   r2   r3   r   r4   r5   r6   r8   r9   _cudnn_attention_backward_flash_attention_backward)r\   r   r   r   rU   r   r    r!   r"   r#   r$   rB   r%   unusedr>   dqdkdvs                     r   _varlen_attn_backwardrd      s     [[5<<0FG"3ELL4F4F"GI67YY^^== > 

B$ 	@AYY^^== > 

B" r2:r   c                     [         R                  " U5      n[         R                  " U5      n[         R                  " U5      nXU4$ )z>
Fake implementation for meta tensor computation and tracing.
)r7   rG   )r\   r   r   r   rU   r   r    r!   r"   r#   r$   rB   r%   
grad_querygrad_key
grad_values                   r   _varlen_attn_backward_fakeri   %  s?    ( !!%(J$H!!%(J++r   grad_lsegrad_rngc                    U R                   u  pEpgppU R                  nU R                  nU R                  nU R                  n[
        R                  R                  R                  UUUUU	U
UUUUUUU5      u  nnnUUUS S S S S S 4	$ rY   )	saved_tensorsr"   r#   r$   r%   r7   r8   rT   rd   )rV   r\   rj   rk   r   r   r   r    r!   rU   r   rB   r"   r#   r$   r%   ra   rb   rc   s                      r   	_backwardrn   @  s     BEARAR>EIIEIIEIIIE%%;;JBB r2tT4tT99r   )setup_context)FN)FNNrY   )r   logging	functoolsr   typingr   r   r7   	getLoggerr   r5   __all__intr   r   r   library	custom_opTensorrL   tuplerE   register_fakerQ   r   r[   rd   ri   rn   register_autogradr   r   r   <module>r|      s     "  !,
' 1C D  
  3"E 9+<<9+	9+ <<9+ ll	9+
 ll9+ 9+ 9+ 9+ 4<9+ 5<<u||349+ F9+x  %(<<%(	%( <<%( ll	%(
 ll%( %( %( %( 4<%( 5<<u||34%( %(` $(P<<P	P <<P ll	P
 llP P P P T!P 4<P \\E%,,455Pf	 	U38_ 	c 	d 	 <2N 8ll8<<8 
8 <<	8
 
8 
8 ll8 ll8 8 8 8 ||8 4<8 5<<u||348 O8v $$ ,ll,<<, 
, <<	,
 
, 
, ll, ll, , , , ||, 4<, 5<<u||34, %,4:	::05:HM:
5<<$#$:8   y  Gr   