
    h                        S SK JrJr  S SKrS SKrS SKJr  S SKJ	r	  S SK
JrJrJr  S\R                  S\R                  S\R                  S\R                  S	\R                  S
\R                  S\R                  S\R                  S\R                  4S jr\R                   " \5      r\R"                  " S S S S S S S.5      " \5      r\	" 5       " \5      r\   S"S\R$                  S-  S\S-  S\S   S\R$                  4S jj5       r\   S#S\R$                  S-  S\S-  S\S   S\\R$                  \R$                  4   4S jj5       r\   S"S\R$                  S-  S\S-  S\S\\R$                  \R$                  4   \R$                  -  4S jj5       r   S"S\R$                  S \R$                  S\R$                  S-  S\S-  S\S\\R$                  \R$                  4   \R$                  -  4S! jjrg)$    )LiteraloverloadN)cce_forward_autotune)b_bin_fntl_logaddexptl_softcapping
HAS_VALIDSBLOCK_BBLOCK_VBLOCK_DGROUP_BEVEN_DHAS_SOFTCAPHAS_LADOT_PRECISIONc                 n   [         R                  " SS9n[         R                  " UU5      n[         R                  " UU5      nUU-  nUU-  nUU-  n [        UU -
  U5      n!U UU-  U!-  -   n"UU-  U!-  n#U"U-  [         R                  " SU5      -   U-  n$U(       a  [         R
                  " X_U$-  -   5      n$U#U-  [         R                  " SU5      -   U-  n%[         R                  " SU5      n&U U$S S 2S 4   U
-  U&S S S 24   U-  -   -   n'UU%S S S 24   U-  U&S S 2S 4   U-  -   -   n([         R                  " UU4[         R                  S9n)[        S[         R                  " U	U5      5       H  n*U(       aF  [         R
                  " U'5      n+[         R
                  " U(5      R                  U+R                  5      n,Oe[         R
                  " U'U&S S S 24   U	U*U-  -
  :  SS9n+[         R
                  " U(U&S S 2S 4   U	U*U-  -
  :  SS9R                  U+R                  5      n,[         R                  " U+U,U)US9n)U'UU-  -  n'U(UU-  -  n(M     U#U-  [         R                  " SU5      -   U:  n-[         R                  " U-S S S 24   U)[        S5      * 5      n.U(       a  [        U.U5      n.U"U-  [         R                  " SU5      -   n/U/U:  n0U(       aR  [         R                  " U0S S 2S 4   U.S5      n.[         R                  " U.S5      U-  n1[         R                   " UU%-   U1U-S9  [         R"                  " U.S	S9n2[         R$                  " U.U2S S 2S 4   -
  5      n+U2[         R&                  " [         R                  " U+S	S95      -   n3X.U/-  -   n4UU"[         R                  " UUU-  5      -  -   n5[         R(                  " U5SS	5      S	:X  a   [         R(                  " U5SS	5      S	:X  a  M  [         R
                  " U4U0SS
S9n6[+        U6U35      n6[         R,                  " U4U6U0S
S9  [         R.                  " U5S5        g )Nr   )axisdtype        )maskother)input_precisioninf)r      
evict_last)r   r   eviction_policy)r   r   )tl
program_idcdivminarangeloadzerosfloat32rangetor   dotwherefloatr   sum
atomic_addmaxexplog
atomic_casr   storeatomic_xchg)7ECLSELALocksValidssoftcapBVD	stride_eb	stride_ed	stride_cv	stride_cdstride_lse_b	stride_vb	num_locksB_BINr	   r
   r   r   r   r   r   r   r   pid	num_pid_b	num_pid_vnum_pid_in_groupgroup_idfirst_pid_bgroup_size_bpid_bpid_voffs_boffs_voffs_de_ptrsc_ptrsaccumdecv_masklogitsoff_bo_maskthis_avg_logitthis_mxthis_lselse_ptrs
this_lockslses7                                                          [/home/james-whalen/.local/lib/python3.13/site-packages/cut_cross_entropy/cce_lse_forward.py_cce_lse_forward_kernelrb      s   < --Q
C7#I7#I*&&HW$Ky;.8LC"22lBCE##4Ego		!W 55:Ff"445go		!W 55:FYYq'"F&D/I-tQw)0KKLF&q/I-q$w)0KKLFHHgw'rzz:E1bgga)* A""177+AVD!G_q1w;%FcRAVAtG_q1w;%FcRUUVWV]V]^Aq!UMB'I%%'I%% + go		!W 55:FXXfT1WouuU|m<F0GObii733EQYF&D/637*Q.
b6k>?ffV!$G
v4(()Aqq 122HU*+H%2771g	.A#BBCJ
--
Aq
)Q
. --
Aq
)Q
. ''(sL
QC
sH
%CHHXsFNN:q!    c                     U S   U S   -  S:H  $ )Nr<   r   r    argss    ra   <lambda>rh   k   s    tCy4	?:a?rc   c                     U S   S L$ )Nr8   re   rf   s    ra   rh   rh   l   s    4>#=rc   c                     U S   S L$ )Nr9   re   rf   s    ra   rh   rh   m   s    DO4$?rc   c                     U S   S L$ )Nr6   re   rf   s    ra   rh   rh   n   s    tDz5rc   c                     g)N   re   rf   s    ra   rh   rh   o   s    rc   c                 <    [         R                  " 5       S:X  a  S$ S$ )Nhightf32ieee)torchget_float32_matmul_precisionrf   s    ra   rh   rh   p   s%    --/69 '- ''rc   )r   r	   r   r   r   r   Fvalidsr9   return_logit_avgreturnc                     g Nre   rU   rV   rt   r9   ru   s        ra   cce_lse_forward_kernelrz   x   s     rc   Tc                     g rx   re   ry   s        ra   rz   rz      s     ),rc   c                     g rx   re   ry   s        ra   rz   rz      s     8;rc   rU   rV   c                   ^^ U R                   S   UR                   S   :X  d   S5       eU R                  5       (       d   S5       eUb#  UR                  S:X  d   eUR                  5       mOU R                   u  mnUR                   u  mnU R	                  T4[        S5      * [        R                  S9nU R	                  [        R                  " TS5      4S[        R                  S9nU(       a!  U R	                  T4S[        R                  S9n	OS n	S	[        [           4UU4S
 jjn
[        U
   " U UUU	UUUTTUU R                  S5      U R                  S5      UR                  S5      UR                  S5      UR                  S5      Uc  SOUR                  S5      UR                  S5      [!        T5      S9  U(       a  U	c   eXy4$ U$ )Nr   zIncompatible dimensionszMatrix A must be contiguousr   r      r   r   rv   c                 p   > [         R                  " TU S   5      [         R                  " TU S   5      -  4$ )Nr
   r   )tritonr    )METAr:   r;   s    ra   grid$cce_lse_forward_kernel.<locals>.grid   s/    AtI/&++ai2QQSSrc   )rC   rD   )shapeis_contiguousndimnumelnew_fullr*   rr   r%   r   r    uint32tupleintrb   stridesizer   )rU   rV   rt   r9   ru   _r<   r`   locks	logit_avgr   r:   r;   s              @@ra   rz   rz      s    771:#>%>>#??;;;{{aLLNww177DAq
**aTE%L=*
>CJJ	Q		ll  E JJtSJ>		TeCj T T D!									

1^q!1**Q-qk%* $$$~
rc   )NNF)NNT)typingr   r   rr   r   triton.languagelanguager   cut_cross_entropy.tl_autotuner   cut_cross_entropy.tl_utilsr   r   r   	constexprrb   jit
heuristicsTensorr*   rz   r   boolre   rc   ra   <module>r      s   $    > M MY"( )Y"* \\+Y", \\-Y". \\/Y"0 \\1Y"2 LL3Y"4 5Y"6 LL7Y"8 <<9Y"x !**%<=  ++?=?5!	   /01HI  
 #' ', LL4 T\	
 en \\ 
 
 #' &*, LL4, T\	,
 dm, 5<<%&, 
, 
 #' "; LL4; T\	;
 ; 5<<%&5; 
; #' ";||;||; LL4; T\	;
 ; 5<<%&5;rc   