
    hK&                     @   S SK r S SKrS SKJr  S SKJr  S SKJrJ	r	J
r
JrJr  \R                  S\R                  S\R                  4S j5       r\R                  S\R                   S\R                   S	\R                   4S
 j5       rS\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  S\R                  4S jr\R                  " \5      r\R&                  " S S S S S S S S  S! S" S# S$.5      " \5      r\" 5       " \5      r    S0S%\ R(                  S&\ R(                  S'\ R(                  S(\ R(                  S)\ R(                  S-  S*\S-  S\S-  S+\ R(                  S-  S,\S-\ R(                  S-  S.\S	\\ R(                  \ R(                  4   4S/ jjrg)1    N)cce_backward_autotune)b_bin_fntl_and_reduce_fntl_lock_addtl_softcappingtl_softcapping_gradBLOCK_DEVEN_Dc                 J   [         R                  " SU
5      S S S 24   nXU-  -   nX\U-  -   n[        S[         R                  " X5      5       H  nU(       a  UnOXlXU
-  -
  :  -  n[         R                  " X^SS9R                  U R                  5      n[         R                  " X5      R                  UR                  R                  5      nU(       a  UnOX,XU
-  -
  :  -  nU[         R                  " XU-  5      -  nUU-   n[        UUUU5        XZU-  -  nXU-  -  nM     g )Nr           maskother)
tlarangerangecdivloadtodtypedot
element_tyr   )doda_ptrspartial_mask_ada_lock_ptrn_locksb_ptrspartial_mask_b	stride_ad	stride_bdDr	   r
   d_indsdr   bda_ilock_offsetthis_da_lock_ptrs                      X/home/james-whalen/.local/lib/python3.13/site-packages/cut_cross_entropy/cce_backward.py_mm_backwardr*      s    YYq'"47+F**Gy((F1bgga)*!D!qw;%?@DGGFS144RXX>vvb} 8 89!D!qw;%?@D2771&788&4GT4)9:I%%Y&&+ +    	check_val
filter_epsreturnc                 >    [         R                  " X:  S [        5      $ )N)r   reducer   )r,   r-   s     r)   _block_is_filteredr1   <   s    99Y+T3CDDr+   BLOCK_BBLOCK_VMM_BACK_BLOCK_DGROUP_BMM_BACK_EVEN_DITEM_DO
HAS_VALIDSHAS_VOCAB_ORDERINGFILTER_GRADHAS_TARGETSHAS_SOFTCAPSHIFTREQUIRES_GRADc*                    [         R                  " SS9n*[         R                  " UU5      n+[         R                  " UU5      n,UU,-  n-U*U--  n.U.U-  n/[        U+U/-
  U5      n0U/U*U--  U0-  -   n1U*U--  U0-  n2U1U-  [         R                  " SU5      -   U-  n3U#(       a  [         R
                  " UUU3-  -   5      n3U2U-  [         R                  " SU5      -   U-  n4U$(       a  [         R
                  " UU4-   5      n4[         R                  " SU5      n5U U3S S 2S 4   U-  U5S S S 24   U-  -   -   n6UU4S S S 24   U-  U5S S 2S 4   U-  -   -   n7[         R                  " UU4[         R                  S9n8[        S[         R                  " UU5      5       H  n9U (       aF  [         R
                  " U65      n:[         R
                  " U75      R                  U:R                  5      n;Oe[         R
                  " U6U5S S S 24   UU9U-  -
  :  SS9n:[         R
                  " U7U5S S 2S 4   UU9U-  -
  :  SS9R                  U:R                  5      n;[         R                  " U:U;U8SS9n8U6UU-  -  n6U7UU-  -  n7M     U'(       a  [        U8U5      n8U#(       a8  [         R
                  " UU1U-  [         R                  " SU5      -   U-  -   5      n<O[         R
                  " UU3-   5      n<[         R                  " U8U<S S 2S 4   -
  5      n=U&(       aT  [         R
                  " UU((       a  U3S-   OU3-   5      n>U>S S 2S 4   U4S S S 24   :H  n?U=[         R                  " U?S	S5      -  n=OS n?U1U-  [         R                  " SU5      -   S S 2S 4   U:  U2U-  [         R                  " SU5      -   S S S 24   U:  -  n@[         R                  " U@U=S5      n=U%(       a&  [        [         R                   " U=5      U5      (       a  g U'(       a  [#        U=U8U5      n=U"(       a  [         R
                  " U5      nAO,[         R
                  " UU((       a  U3S-   OU3-   5      S S 2S 4   nAUWA-  nAU=UA-  R                  U6R                  R$                  5      n=U1U-  [         R                  " SU5      S S 2S 4   -   U:  nBU2U-  [         R                  " SU5      S S 2S 4   -   U:  nCU1[         R                  " UUU-  5      -  U-  nDU
UD-  n
['        U=U	U3S S 2S 4   U-  -   UBU
UUU4S S 2S 4   U-  -   UCUUUUU!5        U2[         R                  " UUU-  5      -  U-  nDUUD-  nU)(       aE  ['        [         R(                  " U=5      UU4S S 2S 4   U-  -   WCUUU U3S S 2S 4   U-  -   WBUUUUU!5        g g )
Nr   )axisr   r   r   ieee)input_precision   g      )r   
program_idr   minr   r   zerosfloat32r   r   r   r   r   expwherer1   absr   r   r*   trans)EECLSEdOut
grad_scaleValidsVocabOrderingsoftcapTargetsdEdELocksdCdCLocksBr"   Vn_de_locks_0n_de_locks_1n_dc_locks_0n_dc_locks_1	stride_eb	stride_ed	stride_cv	stride_cd	stride_vbr-   B_BINr2   r3   r	   r4   r5   r
   r6   r7   r8   r9   r:   r;   r<   r=   r>   pidnum_b_chunksnum_v_chunksnum_v_in_groupgroup_idfirst_pid_bgroup_size_bpid_bpid_voffs_boffs_voffs_de_ptrsc_ptrsaccumr$   eclsed_accumtargets	is_targetaccum_valid_maskd_outb_maskv_maskr'   sE                                                                        r)   _cce_backward_kernelr   A   sK   X --Q
C771g&L771g&L|+Nn$HW$K|k17;LC.0L@AE>!l2Ego		!W 55:F)f"445go		!W 55:F/0YYq'"F&D/I-tQw)0KKLF&q/I-q$w)0KKLFHHgw'rzz:E1bgga)*A""177+AVD!G_q1w;%FcRAVAtG_q1w;%FcRUUVWV]V]^Aq!Uf='I%%'I%% + ug.ggcUW_ryyG/DDIIJggcFl#ffUSD\)*G'''UfqjGHAtG$tQw7	288ItS11	299Q+@@!T'JQN	299Q0	0$':Q> hh'#6GbffWoz::%gug>!6BCAtGLE""6<<#:#:;Ggo		!W 5ag >>!CFgo		!W 5ag >>!CFBGGAw'=>>,NK{G
fQWo	)*	F1d7Oi''	 BGGAw'=>>,NK{GHHW&D/I-.49,-	
 r+   c                     U S   U S   -  S:H  $ )Nr"   r	   r    argss    r)   <lambda>r      s    S	DO ;Ar+   c                     U S   S-  $ )Nr	      r   r   s    r)   r   r      s    Y!(;r+   c                 $    U S   U S   S-  -  S:H  $ )Nr"   r	   r   r   r   r   s    r)   r   r      s    S	T)_q5H(Ia'Or+   c                     U S   S L$ )NrR   r   r   s    r)   r   r      s    4>#=r+   c                     U S   S L$ )NrS   r   r   s    r)   r   r      s    4+@+Lr+   c                     U S   S L$ )Nr-   r   r   s    r)   r   r      s    D$6d$Br+   c                     U S   S L$ )NrU   r   r   s    r)   r   r          DO4$?r+   c                     U S   S L$ )NrT   r   r   s    r)   r   r      r   r+   c                 .    U S   R                  5       S:H  $ )NrP   rD   )numelr   s    r)   r   r      s    V 2 2 4 9r+   c                     g)N   r   r   s    r)   r   r      s    r+   c                     U S   $ )Nr>   r   r   s    r)   r   r      s	    tO'<r+   )r
   r4   r6   r8   r9   r:   r;   r<   r7   r5   r>   r   ru   rv   rw   validsrT   ry   shiftvocab_orderingrQ   c                 	  ^^ U R                  5       UR                  S5      S4;   d   eTR                  S5      UR                  S5      :X  d   eUR                  S5      UR                  S5      :X  d)  Ub$  UR                  S5      UR                  S5      :X  d   eUR                  [        R                  [        R
                  4;   d   S5       eTR                  [        R                  [        R
                  [        R                  4;   d   S5       eU R                  5       n UR                  5       n[        R                  " U5      nUR                  5       UR                  5       :X  d   eTR                  (       aF  [        R                  " TUR                  S9nUR                  5       TR                  5       :X  d   eSnOTnSnUb$  UR                  S:X  d   eUR                  S5      mOUR                  S5      mU R                  5       S:  ao  U R                  5       n UR                  5       nU R                  S5      UR                  S5      :X  d+   SU R                  5       < S	UR                  5       < 35       eUU4S
 jnU	bN  U	R                  S:X  d   eU	R                  5       UR                  S5      :X  d   eU	R                  S5      S:X  d   e[        R                  " TR                  S5      S5      nUR                  [        R                  " TU5      U4[        R                  S9nTR                  [        R                  " TR                  S5      U5      U4[        R                  S9n[         U   " UTUU U
UU	UUUUUUTUR                  S5      TR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      UR                  S5      TR                  S5      TR                  S5      Uc  SOUR                  S5      U4[#        T5      UUS.6  X(       a  UR%                  TR                  5      4$ S 4$ )Nr   rD   z0Backwards requires embeddings to be bf16 or fp16z8Backwards requires classifier to be bf16 or fp16 or fp32rA   TFzdo.stride()=z, lse.stride()=c                    > [         R                  " TU S   5      [         R                  " TR                  S5      U S   5      -  4$ )Nr2   r   r3   )tritonr   size)METArZ   rv   s    r)   grid!cce_backward_kernel.<locals>.grid&  s7    AtI/&++affQii2YY[[r+   @   )re   r=   r>   )r   r   r   torchfloat16bfloat16rH   
contiguous
zeros_likestriderequires_gradndimr   r   	new_zerosint32r   r   r   )r   ru   rv   rw   r   rT   r-   ry   r   r   rQ   dedcr>   r   nd_locksde_locksdc_locksrZ   s     `               @r)   cce_backward_kernelr      s    88:!&&)Q'''66!9q	!!!88A;!&&)#(:sxx{fkkZ[n?\]]77  : ::  77  B B	B  
B
..
C			!	B99;!((*$$$a1yy{ahhj((({{aKKNFF1I	xxzA~]]_nnyy|szz!},P>N.PP,\ !""a'''##%333$$Q'1,,,{{166!9b)H{{FKK84h?u{{{SH{{FKKq	8<hGu{{{[H		


		q		q	aaaa				^q!156 qk%;@ ruuQWW~88D88r+   )NFNg      ?)r   r   triton.languagelanguager   cut_cross_entropy.tl_autotuner   cut_cross_entropy.tl_utilsr   r   r   r   r   jit	constexprr*   tensorr1   r   
heuristicsTensorfloatbooltupler   r   r+   r)   <module>r      s      ?  (' \\(' LL(' ('V E")) E Eryy E E\
8 \\9\
: \\;\
< \\=\
> \\?\
@ \\A\
B LLC\
D LLE\
F \\G\
H I\
J K\
L M\
N O\
P Q\
R <<S\
T <<U\
~ zz"67 ((A;O=LB??9!<   -./CD  $(*._9_9||_9 ||_9 
	_9
 LL4_9 T\_9 _9 \\D _9 _9 LL4'_9 _9 5<<%&_9r+   