
    bCi.                         S SK JrJr  S SKJr  S SKJr  S SKJr  SSK	J
r
   " S S\5      r " S	 S
\5      r " S S\5      rg)    )ABCabstractmethod)deque)ceil)Optional   )loggerc                      \ rS rSr% Sr\\S'   \\\	\   4   \S'   \
S\S\S\\   S\\   4S	 j5       rS\S\\   SS
4S jr\
S\S\S\S\	\   4S j5       r\
S\S\S\S\	\   4S j5       r\
S\S\S\S\\\4   4S j5       rSrg
)CacheAllocator   zAbstract base class for cache managers. Cache managers keep track of per-request cache allocations, determine
when a new physical block needs to be allocated and compute physical indices for reading or writing to the cache._index_block_tablen_blocks
request_idfree_blocksreturnc                     g)zpAllocates n_blocks for a given request_id. Returns the num of blocks allocated if successful and None
otherwise.N selfr   r   r   s       s/home/james-whalen/.local/lib/python3.13/site-packages/transformers/generation/continuous_batching/cache_manager.pyallocate_blocksCacheAllocator.allocate_blocks   s     	    Nc                     XR                   ;   a-  U R                   R                  U5      nUR                  U5        g[        R                  " SU R
                   SU 35        g)z.Frees all blocks associated with a request_id.zCacheAllocator z7 attempted to free blocks for non-existent request_id: N)r   popextendr	   warningr   )r   r   r   blocks_to_frees       r   r   CacheAllocator.free_blocks$   sU    ***!..22:>N~.NN!$++.efpeqrr   past_lengthquery_lengthc                     g)zUReturns the physical indices of where to read request_id's cache in the cache tensor.Nr   r   r   r!   r"   s       r   get_read_indicesCacheAllocator.get_read_indices.        	r   c                     g)zVReturns the physical indices of where to write request_id's cache in the cache tensor.Nr   r$   s       r   get_write_indices CacheAllocator.get_write_indices3   r'   r   c                     g)gReturns the attention type of the cache allocator and the key sequence length for the given request_id.Nr   r$   s       r   get_seqlens_kCacheAllocator.get_seqlens_k8   r'   r   r   )__name__
__module____qualname____firstlineno____doc__int__annotations__dictstrlistr   r   r   r   r   r%   r)   tupler-   __static_attributes__r   r   r   r   r      s'   y KsDI~&&  5QT: ZbcfZg  
c c
 t  3 S PS X\]`Xa   C c QT Y]^aYb    # S UZ[^`c[cUd  r   r   c            
           \ rS rSrSrS\S\SS4S jrS\S	\S
\\   S\	\   4S jr
S	\S\S\S\\   4S jrS	\S\S\S\\   4S jrS	\S\S\S\\\4   4S jrSrg)FullAttentionCacheAllocator>   z3Cache manager for a group of full attention layers.index
block_sizer   Nc                 *    Xl         X l        0 U l        g)zInitializes the cache manager for a group of full attention layers.
Args:
    - index: the index of the associated layer group
    - block_size: the size of the blocks in the cache
N)r   r?   r   )r   r>   r?   s      r   __init__$FullAttentionCacheAllocator.__init__A   s     $r   r   r   r   c                    ^ [        T5      U:  a  gX R                  ;  a  / U R                  U'   U R                  U   R                  U4S j[        U5       5       5        U$ )zAllocate blocks for a given request_id. Returns the number of blocks allocated if successful and None
otherwise. For group of full attention layers, we always allocate the number of requested blocks.Nc              3   D   >#    U  H  nTR                  5       v   M     g 7fNpopleft.0_r   s     r   	<genexpr>>FullAttentionCacheAllocator.allocate_blocks.<locals>.<genexpr>R   s     ,\Oq[-@-@-B-BO    )lenr   r   ranger   s      `r   r   +FullAttentionCacheAllocator.allocate_blocksK   sZ     {h&...,.Dj)*%,,,\ERZO,\\r   r!   r"   c                    U R                   R                  U5      nUc  [        SU 35      e/ n[        X#-   5       HD  nX`R                  -  nX`R                  -  nXG   U R                  -  U-   n	UR                  U	5        MF     U$ )zReturns the physical indices of where to read request_id's cache. For a group of full attention layers, we
first write the new cache to the cache tensor and then read the entire cache from the beginning to the end.!No block table found for request r   get
ValueErrorrO   r?   append
r   r   r!   r"   block_tablephysical_indicesi	block_idxblock_offsetphysical_indexs
             r   r%   ,FullAttentionCacheAllocator.get_read_indicesU   s     ''++J7@MNN{12A__,I.L(3dooETN##N3	 3
  r   c                    U R                   R                  U5      nUc  [        SU 35      e/ n[        X"U-   5       HD  nX`R                  -  nX`R                  -  nXG   U R                  -  U-   n	UR                  U	5        MF     U$ )zReturns the physical indices for writing to the cache. For a group of full attention layers, we write the new
cache as a continuation of the existing cache for the same request.rR   rS   rW   s
             r   r)   -FullAttentionCacheAllocator.get_write_indicese   s     ''++J7@MNN{,$>?A__,I.L(3dooETN##N3	 @
  r   c                     X#-   nSU4$ )r,   full_attentionr   r   r   r!   r"   	seqlens_ks        r   r-   )FullAttentionCacheAllocator.get_seqlens_kt   s    .	**r   )r   r   r?   r/   r0   r1   r2   r3   r4   rA   r7   r   r   r   r8   r%   r)   r9   r-   r:   r   r   r   r<   r<   >   s    =c s t   5QT: ZbcfZg  3  S  PS  X\]`Xa    C  c  QT  Y]^aYb  + +# +S +UZ[^`c[cUd +r   r<   c            
           \ rS rSrSrS\S\S\SS4S jrS	\S
\S\\   S\	\   4S jr
S
\S\S\S\\   4S jrS
\S\S\S\\   4S jrS
\S\S\S\\\4   4S jrSrg)SlidingAttentionCacheAllocatorz   z2Cache manager for sliding window attention layers.r>   r?   sliding_windowr   Nc                     Xl         X l        X0l        [        U R                  U R                  -  5      U l        0 U l        g)zInitializes the cache manager for a group of sliding window attention layers.
Args:
    - index: the index of the associated layer group
    - block_size: the size of the blocks in the cache
    - sliding_window: the size of the sliding window
N)r   r?   rj   r   _max_blocks_per_requestr   )r   r>   r?   rj   s       r   rA   'SlidingAttentionCacheAllocator.__init__}   s8     $,'+D,?,?$//,Q'R$r   r   r   r   c                 N  ^ X R                   ;  a  / U R                   U'   [        U R                   U   5      nX@R                  :X  a  g[        XA-   U R                  5      nXT-
  n[        T5      U:  a  gU R                   U   R	                  U4S j[        U5       5       5        U$ )zAllocate blocks for a given request_id. Returns the number of blocks allocated if successful and None
otherwise. For group of sliding window attention layers, we only allocate up to the point where we can fit an
entire sliding window in the cache tensor.r   Nc              3   D   >#    U  H  nTR                  5       v   M     g 7frE   rF   rH   s     r   rK   ASlidingAttentionCacheAllocator.allocate_blocks.<locals>.<genexpr>   s     ,cLbq[-@-@-B-BLbrM   )r   rN   rl   minr   rO   )r   r   r   r   already_allocatedafter_allocationactual_n_blockss      `   r   r   .SlidingAttentionCacheAllocator.allocate_blocks   s     ...,.Dj) 1 1* => < <<0;T=Y=YZ*>{o-*%,,,cERaLb,ccr   r!   r"   c                    U R                   R                  U5      nUc  [        SU 35      eX R                  :  a  SOX R                  -  n[	        X R                  S-
  5      n/ n[        XUU-   5       HR  nXR                  -  nXR                  -  n	XR                  -  n
XI   U R                  -  U
-   nUR                  U5        MT     US/U-  -   $ )a  Returns the physical indices of where to read request_id's cache in the cache tensor.
For a group of sliding window attention layers, we read from the cache tensor before writing on it, because the
new cache can overwrite the old one. To form the cache + new key / values states, we read the at most
sliding_window - 1 cache page and then manually add the new key / values states after. Hence the -1 indices
which indicate where to store the new key or values indices.rR   r   r   r   rT   rU   rj   rq   rO   r?   rV   )r   r   r!   r"   rX   start_indexcache_lengthrY   rZ   r[   r\   r]   s               r   r%   /SlidingAttentionCacheAllocator.get_read_indices   s     ''++J7@MNN&)<)<<a+PcPcBc;(;(;a(?@{,$>?A$$$A__,I.L(3dooETN##N3 @  2$"555r   c                    U R                   R                  U5      nUc  [        SU 35      eX R                  -  n[	        X0R                  5      nX6-
  n/ n[        XUU-   5       HR  n	XR                  -  n	XR                  -  n
XR                  -  nXJ   U R                  -  U-   nUR                  U5        MT     US:  a	  S/U-  U-   nU$ )a2  Returns the physical indices of where to write request_id's cache in the cache tensor. For a group of
sliding window attention layers, we write the new cache in rolling-buffer kind of way: if we reach the end of
the allocated physical cache, we start writing from the beginning of the physical cache again.rR   r   rw   rx   )r   r   r!   r"   rX   ry   rz   padding_lengthrY   rZ   r[   r\   r]   s                r   r)   0SlidingAttentionCacheAllocator.get_write_indices   s    
 ''++J7@MNN!$7$77<)<)<=%4{,$>?A$$$A__,I.L(3dooETN##N3 @ A "tn47GGr   c                 @    U[        X R                  S-
  5      -   nSU4$ )r,   r   sliding_attention)rq   rj   rc   s        r   r-   ,SlidingAttentionCacheAllocator.get_seqlens_k   s'     3{4G4G!4K#LL	"I--r   )r   r   rl   r?   rj   rf   r   r   r   rh   rh   z   s    <c s C D   5QT: ZbcfZg &63 6S 6PS 6X\]`Xa 6. C  c  QT  Y]^aYb  0. .# .S .UZ[^`c[cUd .r   rh   N)abcr   r   collectionsr   mathr   typingr   requestsr	   r   r<   rh   r   r   r   <module>r      s>    $    $S $N9+. 9+xU.^ U.r   