
    oi_7                     :   % S SK r S SKJr  S SKrSSKJr  SSKJrJ	r	J
r
  S\R                  S\R                  S\R                  S	\R                  S
\R                  4
S jr\ R                  " \5      r\ R                  " S S S.5      " \5      rSr\\S'   S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S
\R                  4S jr\ R                  " \5      r\ R                  " SS 05      " \5      r " S S\R&                  R(                  5      r\R,                  R.                   SS j5       r " S S\R&                  R(                  5      r " S S\R&                  R(                  5      rS rg)     N   )DEVICE_COUNT   )calculate_settingstorch_gpu_devicetorch_device_streamhead_dim	n_heads_KBACKWARD_PASSHAS_ROPE_INDICES
BLOCK_SIZEc                 "   [         R                  " S5      n[         R                  " S5      n[         R                  " SU5      nUS-  nUU:  nU(       a6  [         R                  " UU-   SS9R	                  [         R
                  5      nOUU-  nUUU	-  -   nU
UU-  -   n[         R                  " UU-   USS9n[         R                  " UU-   USS9nU(       a  U* nUU-  nUUU-  -
  nU UU-  -   UU-  -   UU-  -   n[         R                  " UU-   USS9n [         R                  " UU-   U-   USS9n![         R                  " UU-   U U-  U!U-  -
  US9  [         R                  " UU-   U-   U!U-  U U-  -   US9  UU:  a  UUU-  -   UU-  -   UU-  -   n"[         R                  " U"U-   USS9n#[         R                  " U"U-   U-   USS9n$[         R                  " U"U-   U#U-  U$U-  -
  US9  [         R                  " U"U-   U-   U$U-  U#U-  -   US9  g g )Nr   r   r   evict_first)eviction_policymaskotherr   )tl
program_idarangeloadtoint32store)%QQ_batch_strideQ_head_strideQ_seq_strideKK_batch_strideK_head_strideK_seq_stridecoscos_row_stridesinsin_row_striderope_embedding_indicesseqlenr	   r
   r   r   r   row_positionhead_positioncol_offsetshalf_head_dimr   rot_positioncos_ptrsin_ptrsin1cos1batch_id	seq_indexq_ptrq0q1k_ptrk0k1s%                                        X/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/rope_embedding.py_rope_embedding_QKr<      si   * ==#LMM!$M))Az*KMM&Dww"\1+
 "RXX, 	
 $f,L>11GL>11G77+D
 77+D
 uv%Hx&00I 	

^
#	$
-
'	( l
"	# 
 
$T1	=B	&4T1	MBHHU[ "t)b4i"7EHHU]"[0"t)b4i2GPTUy '(m+, ,&' 	 WWU[(qAWWU]*[8qQ
$b4i"t)&;DI
&4b4i"t)6KTXY !    c                     [        U S   5      $ Nr   boolargss    r;   <lambda>rD   g       d4+@&Ar=   c                     [        U S   5      $ )Nr   r@   rB   s    r;   rD   rD   h   s    d3E.F)Gr=   )r   r      ROPE_GROUP_SIZEQ_row_strider%   r'   n_headsc                 &   Sn[         R                  " S5      n[         R                  " S5      n[         R                  " SU
5      nUS-  nX:  n[         R                  " UX-  U-  -   US-  -   U-   USS9n[         R                  " UX-  U-  -   US-  -   U-   USS9nU	(       a  U* nX-  n[	        UU-   U5      n[        UU5       H  nX-  UU-  -   U-   nX-  UU-  -   U-   U-   n[         R                  " U U-   USS9R                  UR                  5      n[         R                  " U U-   USS9R                  UR                  5      n[         R                  " U U-   UU-  UU-  -
  US9  [         R                  " U U-   UU-  UU-  -   US9  M     g)zn
Calculates the RoPE Embedding quickly
RoPE is Q * cos + rotate_half(Q) * sin
See our blog post for more info
rG   r   r   r   r   r   N)	r   r   r   r   minranger   dtyper   )r   rI   r$   r%   r&   r'   r)   r	   rJ   r   r   rH   r*   group_head_positionr,   r-   r   r1   r2   
head_starthead_endkoffs_q1offs_q2Q1Q2s                             r;   _rope_embeddingrW   p   s   $ O==#L--*))Az*KMM&D77 N
2	3
!
	 	 D 77 N
2	3
!
	 	 D u %6JJ07;H :x(-H<{J'!h,6D}T 	
 WWQ[q9<<TZZHWWQ[q9<<TZZH
Wb4i"t)3DA
Wb4i"t)3DA )r=   c                     [        U S   5      $ r?   r@   rB   s    r;   rD   rD      rE   r=   c                   4    \ rS rSr\S 5       r\S 5       rSrg)Fast_RoPE_Embedding   c                 l   UR                  5       UR                  5       p2UR                  u  pEpgUR                  XE-  Xg-  5      nUR                  u  pXRR                  S   ::  d   e[        US-  5      u  p[	        U[
        5      u  pXS:g  -   n[        UR                  5         [        UU4   " UUR                  S5      UUR                  S5      UUR                  S5      UUUSU
US9  S S S 5        Xl
        Xl        Xl        X l        X0l        UR                  XEXg5      $ ! , (       d  f       N>= f)Nr   r   Fr   r   	num_warps)squeezeshapereshaper   divmodrH   r   devicerW   strider   r^   n_groupsr$   r&   )ctxr   r$   r&   batchseq_lenrJ   r	   n_rowsn_colsr   r^   divmodre   s                  r;   forwardFast_RoPE_Embedding.forward   s&   ;;=#++-S
 -.GG)IIeow'9: ))A,&&& !38q= A

 '?3ax(ahh' 

1

1 %'%# (( $!yy;;3 ('s   %AD%%
D3c                    UR                   u  p#pEUR                  X#-  XE-  5      nUR                   u  pgU R                  nU R                  n	[	        UR
                  5         [        UU R                  4   " UUR                  S5      UUR                  S5      U	U	R                  S5      UUUSU R                  U R                  S9  S S S 5        UR                  X#XE5      nUS S 4$ ! , (       d  f       N%= f)Nr   Tr]   )r`   ra   r$   r&   r   rc   rW   re   rd   r   r^   )
rf   dYrg   rh   rJ   r	   ri   rj   r$   r&   s
             r;   backwardFast_RoPE_Embedding.backward   s     -/HH)ZZ);< ggggbii(LL 		!

1

1 $ ^^MM# )( ZZ:
 	
+ )(s   !A&C&&
C4 N__name__
__module____qualname____firstlineno__staticmethodrm   rq   __static_attributes__rs   r=   r;   rZ   rZ      s)    0< 0<d '
 '
r=   rZ   c                    Ub  [         R                  XX#U5      u  pVO[        R                  U R                  SS5      R	                  5       X#5      R                  SS5      n[        R                  UR                  SS5      R	                  5       X#5      R                  SS5      n[
        S:  a#  [        U R                  5      R                  5         XV4$ )Nr   r   )	Fast_RoPE_Embedding_QKapplyrZ   	transpose
contiguousr   r   rc   synchronize)r   r    r$   r&   r(   Q_outK_outs          r;   fast_rope_embeddingr     s     )-33#2
u $))KK1((*C

)Aq/ 	 $))KK1((*C

)Aq/ 	 aAHH%113<r=   c                   4    \ rS rSr\S 5       r\S 5       rSrg)r|   i1  c                 2   US LnUR                  5       UR                  5       pCUR                  u  pxpUR                  u  p  nUR                  5       (       d  UR                  5       OUnUR                  5       (       d  UR                  5       OUnU(       a8  UR	                  S5      R                  [        R                  UR                  S9nOUR                  S[        R                  S9n[        U
5      u  nnUR                  S5      UR                  S5      UR                  S5      nnnUR                  S5      UR                  S5      UR                  S5      nnn[        UR                  5         [        Xy-  U4   " UUUUUUUUUUR                  S5      UUR                  S5      UU	U
USUUUS9  S S S 5        UU l        UU l        X`l        X0l        X@l        U(       a  UOS U l        Xl        Xl        Xl        UU4$ ! , (       d  f       NT= f)	N)rN   rc   r   rN   r   r   Fr	   r
   r   r   r   r^   )r_   r`   is_contiguousclonera   r   torchr   rc   	new_emptyr   rd   r   r<   
block_sizer^   has_indicesr$   r&   rope_indicesrh   	n_heads_Qr
   )rf   r   r    r$   r&   r   r   rg   r   rh   r	   _r
   r   r   rope_ptrr   r^   r   r   r   r!   r"   r#   s                           r;   rm   Fast_RoPE_Embedding_QK.forward2  s   "$.;;=#++-S./gg+'WWa "#!2!2	!"!2!2	#++B/225;;QRQYQY2ZH}}Q}<H 28 <
I LLOLLOLLO (4 LLOLLOLLO (4 ahh';<

1

1#% %#.'%) (0 $!%'28!! 
 	
E ('s   7AH
Hc                    UR                   u  n  pEU R                  (       a  U R                  O'U R                  R	                  S[
        R                  S9nUR                  5       (       d  UR                  5       OUnUR                  5       (       d  UR                  5       OUnUR                  S5      UR                  S5      UR                  S5      pn	UR                  S5      UR                  S5      UR                  S5      pn[        UR                  5         [        X0R                  -  U R                  4   " UU	U
UUUUUU R                  U R                  R                  S5      U R                  U R                  R                  S5      UU R                  UU R                   SU R                  U R"                  U R$                  S9  S S S 5        XxS S S 4$ ! , (       d  f       N= f)Nr   r   r   r   Tr   )r`   r   r   r$   r   r   r   r   r   rd   r   rc   r<   rh   r   r&   r
   r   r^   )rf   dQdKrg   r   r	   r   dQ_outdK_outr   r   r   r!   r"   r#   s                  r;   rq   Fast_RoPE_Embedding_QK.backwardx  s    "q!  ""1ekk": 	 $&#3#3#5#52#%#3#3#5#52 MM!MM!MM! (4 MM!MM!MM! (4 bii( 3S]]CDq!q!#MM $#&?? ^^MM) )0 dD111 )(s   B1G
G*rs   Nrt   rs   r=   r;   r|   r|   1  s+    C
 C
J 02 02r=   r|   c                   4    \ rS rSr\S 5       r\S 5       rSrg)Slow_RoPE_Embeddingi  c                    Ubf  UR                  S5      R                  S5      nUR                  S5      R                  S5      nX$   R                  S5      nX4   R                  S5      nUR                  S   S-  n[        R                  " USUS 24   * USS U24   4SS9nX-  nUR                  Xc5        U R                  X#5        U$ )Nr   r   r   r   .dim)r_   	unsqueezer`   r   cataddcmul_save_for_backward)rf   r   r$   r&   position_idshalfRH_Qs          r;   rm   Slow_RoPE_Embedding.forward  s    #++a.((+C++a.((+C#--a0C#--a0C wwr{ayy1S$%Z=.!C$J-8C		

4 	c'r=   c                     U R                   u  p#UR                  S   S-  n[        R                  " USUS 24   USS U24   * 4SS9nX-  nUR	                  XS5        US S S 4$ )Nr   r   .r   )saved_tensorsr`   r   r   r   )rf   rp   r$   r&   r   RH_dYs         r;   rq   Slow_RoPE_Embedding.backward  sr    $$xx|q 		2c45j>BsETEzN?;2F
	
E 4t##r=   rs   Nrt   rs   r=   r;   r   r     s(     $ 	$ 	$r=   r   c                     [         R                  XX45      n [         R                  XX45      n[        U R                  5      R	                  5         X4$ N)r   r}   r   rc   r   )r   r    r$   r&   r   s        r;   inplace_rope_embeddingr     sB    !!!#<A!!!#<A!--/4Kr=   r   )tritontriton.languagelanguager   r   device_typer   utilsr   r   r   	constexprr<   jit
heuristicsrH   int__annotations__rW   autogradFunctionrZ   compilerdisabler   r|   r   r   rs   r=   r;   <module>r      s       & L LJZ llJZ  ||!JZ" <<#JZ$ ll%JZ& 'JZZ ZZ 23 &&AG
    >B,,>B LL	>B LL>B ll>B \\>B <<>B >BB **_-##A 	\
%..11 \
@  " .x2U^^44 x2v$%..11 $Br=   