__all__ = [
    "HAS_FLEX_ATTENTION",
    "FLEX_ATTENTION_BLOCK_SIZE",
    "_flex_attention",
    "flex_attention",
    "create_block_mask_cached",
    "create_block_mask",
    "compiled_create_block_mask",
    "FlexAttentionCache",
    "causal_mask",
    "generate_causal_mask_with_padding",
    "generate_decoding_causal_mask_with_padding",
    "generate_sliding_window_mask",
    "generate_sliding_window_mask_with_padding",
    "generate_decoding_sliding_window_mask_with_padding",
]

import torch
import functools
from ..temporary_patches.common import torch_compile, _torch_compile

try:
    from torch.nn.attention.flex_attention import (
        flex_attention as _flex_attention,
        create_block_mask as _create_block_mask,
        _DEFAULT_SPARSE_BLOCK_SIZE,
        _score_mod_signature,
        _mask_mod_signature,
    )
    HAS_FLEX_ATTENTION = True
    FLEX_ATTENTION_BLOCK_SIZE = _DEFAULT_SPARSE_BLOCK_SIZE

    # Block masks used by the decoding cache grow in fixed increments so they
    # do not have to be rebuilt on every decoding step.
    # NOTE: the exact increment could not be recovered from the garbled source;
    # 512 is an assumed value.
    FLEX_ATTENTION_KV_INCREMENT = 512

    # Pick smaller Triton tile sizes on low-VRAM GPUs, since the default flex
    # attention kernel options can run out of memory on small cards. If no CUDA
    # device is present, min() raises and flex attention is disabled below.
    vram_of_gpu = min(
        torch.cuda.memory.mem_get_info(i)[0] / 1024 / 1024 / 1024  # free VRAM in GiB
        for i in range(torch.cuda.device_count())
    )
    # NOTE: the VRAM thresholds and tile sizes below are assumed values; the
    # exact numbers could not be recovered from the garbled source.
    if vram_of_gpu <= 8:
        kernel_options = {
            "BLOCK_M" : 16, "BLOCK_N" : 16,
            "BLOCK_M1": 16, "BLOCK_N1": 16,
            "BLOCK_M2": 16, "BLOCK_N2": 16,
        }
        flex_attention = functools.partial(_flex_attention, kernel_options = kernel_options)
    elif vram_of_gpu <= 16:
        kernel_options = {
            "BLOCK_M" : 32, "BLOCK_N" : 32,
            "BLOCK_M1": 32, "BLOCK_N1": 32,
            "BLOCK_M2": 32, "BLOCK_N2": 32,
        }
        flex_attention = functools.partial(_flex_attention, kernel_options = kernel_options)
    else:
        flex_attention = _flex_attention

    @functools.lru_cache
    def create_block_mask_cached(mask_mod, M, N, device = "cuda"):
        """Create block mask for Flex Attention. Assume bsz=any(None), head=any(None)"""
        return _create_block_mask(mask_mod, None, None, M, N, device = device)

    @functools.lru_cache
    def create_block_mask(mask_mod, bsz, head, M, N, device = "cuda"):
        """Create block mask for Flex Attention. Assume bsz=any(None), head=any(None)"""
        return _create_block_mask(mask_mod, bsz, head, M, N, device = device)

    def compiled_create_block_mask_cached(mask_mod, M, N, device = "cuda"):
        """Create block mask for Flex Attention. Assume bsz=any(None), head=any(None)"""
        return _create_block_mask(mask_mod, None, None, M, N, device = device, _compile = True)

    def compiled_create_block_mask(mask_mod, bsz, head, M, N, device = "cuda"):
        """Create block mask for Flex Attention. Assume bsz=any(None), head=any(None)"""
        return _create_block_mask(mask_mod, bsz, head, M, N, device = device, _compile = True)

    def causal_mask(batch_idx, head_idx, q_idx, kv_idx):
        """Causal mask for Flex Attention"""
        return q_idx >= kv_idx
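    # Illustrative usage of the helpers above (a sketch, not executed on
    # import; assumes flex attention imported fine and a CUDA device exists,
    # and `q`/`k`/`v` are hypothetical [bsz, heads, seq_len, head_dim] tensors):
    #
    #     block_mask = create_block_mask_cached(causal_mask, 4096, 4096)
    #     out = flex_attention(q, k, v, block_mask = block_mask)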
    def generate_causal_mask_with_padding(padding_start_idx = None):
        """
        Causal mask for Flex Attention with left padding support.
        Normal causal mask:
            k0 k1 k2 k3 k4
        q0   X
        q1   X  X
        q2   X  X  X
        q3   X  X  X  X
        q4   X  X  X  X  X
        If we add 2 tokens as padded tokens, we get:
            #0 #1 k2 k3 k4
        #0
        #1
        q2         X
        q3         X  X
        q4         X  X  X
        Assume padding_start_idx == [2]
        """
        assert padding_start_idx is not None
        assert type(padding_start_idx) is torch.Tensor
        assert padding_start_idx.dim() == 1
        assert padding_start_idx.shape[0] >= 1

        def causal_mask(batch_idx, head_idx, q_idx, kv_idx):
            # Queries and keys that fall inside the left padding attend to nothing.
            q_start = q_idx >= padding_start_idx[batch_idx]
            k_start = kv_idx >= padding_start_idx[batch_idx]
            return q_start & k_start & (q_idx >= kv_idx)
        causal_mask.__name__ = causal_mask.__doc__ = "causal_mask_with_left_padding"
        return causal_mask
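    # Illustrative sketch: a left-padded batch of 2 sequences where the first
    # sequence has 2 padded tokens (all values here are made up):
    #
    #     padding_start_idx = torch.tensor([2, 0], dtype = torch.int32, device = "cuda")
    #     mask_mod = generate_causal_mask_with_padding(padding_start_idx)
    #     block_mask = create_block_mask(mask_mod, 2, None, 4096, 4096)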
 8UU{2r6   c                    ^  T b  [        T 5      [        R                  L d   eT R                  5       S:X  d   eT R                  S   S:  d   eU 4S jnS=Ul        Ul        U$ )z
For decoding purposes only. We remove q_padded since decoding attends to 1 q
Assume padded tokens = 5
    #0 #1 #2 #3 #4 k5 k6
#0   #
#1   #  #
#2   #  #  #
#3   #  #  #  #
#4   #  #  #  #  #
q5   #  #  #  #  #  X
q6   #  #  #  #  #  X  X
rH   r   c                 "   > UTU    :  nXBU:  -  $ rA   rB   )rC   rD   rE   rF   rK   rL   s        r   r
    @functools.lru_cache
    def generate_sliding_window_mask(window_size: int):
        """Sliding window mask for Flex Attention"""
        def sliding_window(batch_idx, head_idx, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            windowed_mask = q_idx - kv_idx <= window_size
            return causal_mask & windowed_mask
        sliding_window.__name__ = sliding_window.__doc__ = f"sliding_window_{window_size}"
        return sliding_window

    def generate_sliding_window_mask_with_padding(window_size: int, padding_start_idx = None):
        assert padding_start_idx is not None
        assert type(padding_start_idx) is torch.Tensor
        assert padding_start_idx.dim() == 1
        assert padding_start_idx.shape[0] >= 1

        def sliding_window(batch_idx, head_idx, q_idx, kv_idx):
            causal_mask = q_idx >= kv_idx
            windowed_mask = q_idx - kv_idx <= window_size
            q_padded = q_idx >= padding_start_idx[batch_idx]
            k_padded = kv_idx >= padding_start_idx[batch_idx]
            return causal_mask & windowed_mask & q_padded & k_padded
        sliding_window.__name__ = sliding_window.__doc__ = f"sliding_window_with_left_padding_{window_size}"
        return sliding_window

    def generate_decoding_sliding_window_mask_with_padding(window_size: int, padding_start_idx = None):
        """
        We cannot use padding_start_idx[batch_idx] for SWA decoding: with, say,
        padding_start_idx = [3406, 4000, 0] and SW = 128, every position would
        always be masked out, since the KV size is only 128.

        Since we set padded tokens = 0 always, we simply return the generic SWA.
)r   )r[   rL   s     r   r   r      s     ,K88r6   	score_mod_offsetc                    ^ ^ UU 4S jnU$ )Nc                    > T" XX#T-   U5      $ r^   rB   )scorebhqkvrk   rj   s        r   
_score_mod*get_score_mod_w_offset.<locals>._score_mod   s    Uqg+r::r6   rB   )rj   rk   rs   s   `` r   get_score_mod_w_offsetru      s    	;r6   r3   c                    ^ ^ UU 4S jnU$ )Nc                    > T" XUT-   U5      $ r^   rB   )ro   rp   rq   rr   rk   r3   s       r   	_mask_mod(get_mask_mod_w_offset.<locals>._mask_mod   s    A!g+r22r6   rB   )r3   rk   rx   s   `` r   get_mask_mod_w_offsetrz      s    	3r6   c                   $    \ rS rSrSrS rS rSrg)r	      )	offsetoffset_tensormask_mod_with_offset
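    # Illustrative (values made up): during decoding the kernel sees q_idx == 0,
    # so we add the real absolute position back in via a 0-dim int32 tensor:
    #
    #     offset = torch.tensor(127, dtype = torch.int32, device = "cuda")
    #     decoding_mask_mod = get_mask_mod_w_offset(causal_mask, offset)
    #     # decoding_mask_mod(b, h, 0, kv_idx) == causal_mask(b, h, 127, kv_idx)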
    class FlexAttentionCache:
        __slots__ = (
            "offset", "offset_tensor", "mask_mod_with_offset",
            "block_mask", "mask_mod", "max_length",
            "block_size", "sliding_window", "block_mask_slice",
        )

        def __init__(self, key, mask_mod, sliding_window = 0):
            bsz, heads_KV, qlen_KV, head_dim = key.shape
            if sliding_window == 0:
                # Round the mask size up to the next increment so the block
                # mask survives many decoding steps before being rebuilt.
                n, leftover = divmod(qlen_KV, FLEX_ATTENTION_KV_INCREMENT)
                n = FLEX_ATTENTION_KV_INCREMENT * n + (FLEX_ATTENTION_KV_INCREMENT if leftover != 0 else 0)
                self.sliding_window = 0
            else:
                n = sliding_window
                self.sliding_window = min(sliding_window, qlen_KV) - 1
                if self.sliding_window <= 0: self.sliding_window = 0
            self.offset = qlen_KV - 1
            self.offset_tensor = torch.tensor(self.offset, device = key.device, dtype = torch.int32)
            self.block_mask = create_block_mask(mask_mod, bsz, heads_KV, n, n, device = key.device)
            self.mask_mod = mask_mod
            self.max_length = n
            # BlockMask.BLOCK_SIZE is (Q_BLOCK_SIZE, KV_BLOCK_SIZE).
            self.block_size = self.block_mask.BLOCK_SIZE[0]
            self.mask_mod_with_offset = get_mask_mod_w_offset(self.mask_mod, self.offset_tensor)
            self.block_mask_slice = None

        def __call__(self, key):
            bsz, heads_KV, qlen_KV, head_dim = key.shape
            if self.sliding_window == 0 or self.offset < self.sliding_window:
                # Advance to the position of the newly appended token.
                self.offset += 1
                self.offset_tensor.add_(1)
            elif self.block_mask_slice is not None:
                # Saturated sliding window: the sliced mask no longer changes.
                return self.block_mask_slice
            if self.offset >= self.max_length:
                # Outgrew the current mask: extend it by one increment.
                self.max_length += FLEX_ATTENTION_KV_INCREMENT
                self.block_mask = create_block_mask(self.mask_mod, bsz, heads_KV, self.max_length, self.max_length, device = key.device)
                self.block_size = self.block_mask.BLOCK_SIZE[0]
            # Select the query row block the current position falls into.
            block_offset = self.offset // self.block_size
            block_mask_slice = self.block_mask[:, :, block_offset]
            block_mask_slice.mask_mod = self.mask_mod_with_offset
            block_mask_slice.seq_lengths = (1, qlen_KV)
            self.block_mask_slice = block_mask_slice
            return block_mask_slice
 $++On]		
 $++On]#O4NO O N N`_
<. #  4s 9 9*>  
(; ell ~$ ~$~ 	  $N#K#'  s   D(E E