
    oia8                        S SK r S SKJr  S SKrSSKJrJrJrJ	r	J
r
Jr  S SKJr  S SKJr  S SKJrJr  S\R(                  S\R(                  S	\R(                  S
\R(                  S\R(                  S\R(                  4S jr\ R,                  " \5      r\ R.                  " S S S.5      " \5      rS\R(                  S\R(                  S\R(                  S\R(                  S	\R(                  S
\R(                  S\R(                  S\R(                  4S jr\ R,                  " \5      r\ R.                  " S S S.5      " \5      rS\R(                  S\R(                  S\R(                  S\R(                  S	\R(                  S
\R(                  S\R(                  S\R(                  4S jr\ R,                  " \5      r\ R.                  " S S S.5      " \5      rSr " S S\R4                  R6                  5      r   S!S jr\" \R<                  5      \" S5      :  a   \" \S5      (       d  \R@                  " \5      rS"S  jrg)#    N   )calculate_settingsMAX_FUSED_SIZEtriton_tanhtriton_casttorch_gpu_deviceis_cdna)logger)Version)patch_loss_functionspost_patch_loss_function
VOCAB_SIZE
BLOCK_SIZEDO_SOFTCAPPINGSOFTCAPDO_LOGIT_SCALINGLOGIT_SCALEc           	         [         R                  " S5      nX[        U[         R                  5      -  -  n X+-  nX;-  nXK-  n[         R                  " SU5      nX:  n[         R
                  " U5      R                  [         R                  5      n[         R
                  " X-   U[        S5      * S9R                  [         R                  5      nU	(       a  X-  nU(       a  U[        X-  5      -  n[         R                  " US5      nU[         R                  " [         R                  " [         R                  " UU-
  5      S5      5      -   nUS:w  a_  [         R
                  " X-   5      R                  [         R                  5      nU	(       a  U
U-  nU(       a  U[        UU-  5      -  nUU-
  nOSn[         R                  " UU5        [         R                  " UU5        g)a  
Cross Entropy Loss = 1/n sum [ -yi log(Pi) ]
Pi = exp(xi) / sum(exp(xi))
CE_i = -y log(p) = -y log[ exp(x) / sum(exp(x)) ]
     = -y [ x - log[sum(exp(x))] ]
     = y * (log[sum(exp(x))] - x)
If y == 0: CE_i = 0
If y == 1: CE_i = logsumexp - x

logsumexp is also stable
Take    y =         log[sum(exp(x))]
   exp(y) =             sum(exp(x))
   exp(y) =             sum(exp(x - c)*exp(c)) Since e^(x-c)*e^c = e^x
   exp(y) =      exp(c)*sum(exp(x - c))
       y  = log(exp(c)*sum(exp(x - c)))
       y  = c + log[sum(exp(x - c))]
This means we can set c = max(x) to make sure
exp(x - c) always is exp(x - max(x)).
This ensures exp(x - max(x))'s maximum is 1 as exp(0) = 1.
r   infmaskother        Ntl
program_idr   int64arangeloadtoint32floatfloat32r   maxlogsumexpstore)
logits_ptrlogits_row_strideloss_ptrlogsumexp_ptr
labels_ptrr   r   r   r   r   r   row_idxcol_offsetsr   	label_idxlogitsc	logsumexpxlosss                       \/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/kernels/cross_entropy_loss.py_cross_entropy_forwardr8   #   sw   B mmAGK(9288DDDJHMJ))Az*K#D
#&&rxx0IWWZ-dU5\MRUU


F
 %;v'788
vqABFF266"&&!"4a899IDGGJ*+..rzz:aA+a'k22A1}HH]I&HHXt    c                     [        U S   5      $ Nr   boolargss    r7   <lambda>r@   n       tD1A,B'Cr9   c                     [        U S   5      $ Nr   r<   r>   s    r7   r@   r@   o       d3E.F)Gr9   )r   r   r+   N_CHUNKSc           	         [         R                  " S5      n[         R                  " S5      nX[        U[         R                  5      -  -  n X,-  nX<U-  U-   -  nXL-  nX-  [         R                  " SU5      -   nX:  n[         R
                  " U5      R                  [         R                  5      n[         R
                  " X-   U[        S5      * S9R                  [         R                  5      nU
(       a  UU-  nU(       a  U	[        UU	-  5      -  n[         R                  " US5      nU[         R                  " [         R                  " [         R                  " UU-
  5      S5      5      -   nUS:X  a  US:w  a`  [         R
                  " U U-   5      R                  [         R                  5      nU
(       a  UU-  nU(       a  U	[        UU	-  5      -  nSU-  nOSn[         R                  " UU5        [         R                  " UU5        g)	a  
256K vocab divided in 4 chunks

|-65536-| |-65536-| |-65536-| |-65536-|
|-------| |-------| |-------| |-------|
|-------| |-------| |-------| |-------|

If y == 0: CE_i = 0
If y == 1: CE_i = logsumexp - x

Notice we can do logsumexp for each chunk and then
logsumexp[chunk_sum(logsumexp)] == logsumexp

chunk_sum = log[chunk_sum(logsumexp)]
          = log[exp(logsumexp(a)) + ... + exp(logsumexp(z))]
          = log[exp(log[sum(exp(a))]) + ... + exp(log[sum(exp(z))])]
          = log[sum(exp(a)) + ... + sum(exp(z))]
          = logsumexp(x)

This means we can perform a logsumexp for each chunk, then do a
final logsumexp reduction!

Ie do: logsumexp(chunked_logsumexp) - x
r   r   r   r   r   g      r   Nr   )r*   r+   r,   r-   r.   r   rE   r   r   r   r   r   r/   	chunk_idxr0   r   r1   r2   r3   r4   r5   r6   s                         r7   _chunked_cross_entropy_forwardrH   t   s   L mmAGa IK(9288DDDJHx')33MJ(299Q
+CCK#D
#&&rxx0IWWZ-dU5\MRUU


F
 v%;v'788
vqABFF266"&&!"4a899IA~ 
Y./222::>A!Ok!g+66!8DD
4 HH]I&r9   c                     [        U S   5      $ r;   r<   r>   s    r7   r@   r@      rA   r9   c                     [        U S   5      $ rC   r<   r>   s    r7   r@   r@      rD   r9   dloss_row_stridec                 n   [         R                  " S5      n[         R                  " S5      nX[        U[         R                  5      -  -  n X,U-  -  nX-  [         R                  " SU5      -   nX:  n[         R
                  " X\-   5      R                  [         R                  5      nUS:w  a  [         R
                  " U5      nOSn[         R
                  " X-   U[        S5      * S9R                  [         R                  5      nU
(       a  UU-  nUnU(       a  [        UU	-  5      nU	U-  n[         R
                  " XL-   5      n[         R                  " UU-
  5      n[         R                  " UU:H  US-
  U5      nU
(       a  UU-  nU(       a  USUU-  -
  -  n[         R                  " X-   UU-  US9  g	)
a  
CE_i = -y log(P) = y * (log[sum(exp(x))] - x)
dC/dx = d/dx (y * log[sum(exp(x))] - x * y)

From https://en.wikipedia.org/wiki/LogSumExp
d/dx logsumexp = exp(x) / sum(exp(x)) = softmax(x)

dC/dx = y * exp(x) / sum(exp(x)) - d/dx (x * y)
dC/dx = y * exp[ log[exp(x) / sum(exp(x))] ] using x = exp(log(x)) trick
dC/dx = y * exp[x - logsumexp] - d/dx (x * y)

If y == 0: dC/dx = 0
If y == 1 and x == label: dC/dlabel = exp[x - logsumexp] - 1
If y == 1 and x != label: dC/dx     = exp[x - logsumexp]
r   r   r   r   r   r   g      ?)r   N)r   r   r   r   r   r    r!   r"   r#   r$   r   r(   wherer)   )r*   r+   	dloss_ptrrK   r-   r.   r   r   r   r   r   r   r/   	block_idxr0   r   r1   dlossr5   partialr4   ys                         r7   _cross_entropy_backwardrS      s   : mmAGa IK(9288DDDJ+++I(299Q
+CCK#D
,-00:ID	"

(e}MPPQSQ[Q[\A O Ga'k*g/0I
q9}A
y 	C		A Ow(() HHZ%uqy>r9   c                     [        U S   5      $ r;   r<   r>   s    r7   r@   r@     rA   r9   c                     [        U S   5      $ rC   r<   r>   s    r7   r@   r@     rD   r9   i   c                   F    \ rS rSr\ SS\S\4S jj5       r\S 5       rSrg)	Fast_CrossEntropyLossi'  logit_softcappinglogit_scalingc                    UR                   u  pVUR                  n[        U[        5      u  pXS:g  -   n
[        R
                  " U[        R                  US9n[        US:g  5      n[        US:g  5      nU
S:X  a  [        U5      u  p[        5       (       a  US-  n[        R
                  " U[        R                  US9n[        U5         [        U4   " UUR                  S5      UUUUUUUUUUS9  S S S 5        O[        R
                  " UU
4[        R                  US9n[        U5         [        UU
4   " UUR                  S5      UUUUU
[        UUUU[        5       (       d  SOSS9  S S S 5        [        R                  " USS	9nUU-  nUR                  US
:H  S5        U R!                  UUU5        Xl        X0l        Xl        X@l        U$ ! , (       d  f       N;= f! , (       d  f       N{= f)Nr   )dtypedevicer      r   r   r   r   r   r   	num_warps       )r   rE   r   r   r   r   r   r_   )dimr   )shaper\   divmodr   torchemptyr$   r=   r   r	   r   r8   striderH   r4   masked_fill_save_for_backwardr   rX   r   rY   )ctxr2   labelsrX   rY   n_rows
vocab_sizer\   divmodn_chunkslossesr   r   r   r_   r4   s                    r7   forwardFast_CrossEntropyLoss.forward(  s    $\\*n5ax(VU]]VL#$5$:;!%mq&8!9 q=$6z$B!Jyy%N	FEMMFSI!&)&y1MM!$!+!+%3/'7"/ ) *)"  I "&).  MM!$!+'!/%3/'7"/*1))% *. 	;IiF$2fi8+ 1/)u *)4 *)s   
)G-A G"
G"
G0c                    U R                   u  p#nUR                  u  pVSn[        Xg5      u  pXS:g  -   n
[        UR                  5         [
        UU
4   " UUR                  S5      UUR                  S5      UUUUU R                  U R                  U R                  U R                  SS9  S S S 5        US S S 4$ ! , (       d  f       N= f)Ni   r      r^   )saved_tensorsrc   rd   r   r\   rS   rg   r   rX   r   rY   )rj   dlossesr2   r4   rk   rl   rm   r   rn   ro   n_blockss              r7   backwardFast_CrossEntropyLoss.backward}  s    $'$5$5!6 $\\
 *1ax(gnn-# a q!''!$!3!3//#&#7#7!//% ., 	
 	
+ .-s   	A"B99
C N)r   r   )	__name__
__module____qualname____firstlineno__staticmethodr#   rr   ry   __static_attributes__r{   r9   r7   rW   rW   '  s@    RSR05RJOR Rh &
 &
r9   rW   c                    U R                   u  pVnUR                   XV4:X  d   e[        R                  U R                  XV-  U5      UR                  S5      UU5      nUc  [        R
                  " US:g  5      nUR                  5       U-  $ )zn
Arguments:
    logits: (batch, seq_len, vocab_size)
    labels: (batch, seq_len,)
Returns:
    losses: float
r   )rc   rW   applyviewre   count_nonzeror'   )	r2   rk   rX   rY   n_itemsbatchseq_lendr6   s	            r7   fast_cross_entropy_lossr     s     EA<<E++++ &&EOQ'B	D %%fn588:r9   z2.4.0__wrapped__c                      [        [        U S9  g )Ntorch_compile)_patch_loss_functionsr   r   s    r7   r   r     s    1=Qr9   )r   r   N)T)!tritontriton.languagelanguager   re   utilsr   r   r   r   r   r	   (transformers.models.llama.modeling_llamar
   unsloth_zoo.utilsr   unsloth_zoo.loss_utilsr   r   r   	constexprr8   jit
heuristicsrH   rS   autogradFunctionrW   r   __version__hasattr_disable_dynamor{   r9   r7   <module>r      s       < %E E E LLE \\E llE EP  $:; **CG
  N'||N' N' llN' N' LLN' \\N' llN' N'b "(,J!K !'!2!2CG"
 !"" J?||J? ll	J? J? J? LLJ? \\J? llJ? J?Z !**%<=  ++CG
   }
ENN33 }
F  8 E!117]< < $334KLRr9   