
    oirC                     J   / S Qr SSKrSSKJrJrJrJr  SSKrSSKrSSK	r	SSK
r
SSKJrJrJr  SSKJr  \
R"                  R%                  SS5      r\
R"                  R%                  SS5      r\R*                  S	 5       r S
 r    SS\R0                  S\R0                  S\\R0                     S\R0                  S\\R0                     S\\   S\S\\R0                  \\R0                  4   4   4S jjr \R*                  SS j5       r SS jr  " S S\R<                  R>                  5      r        SS\R0                  S\R0                  S\\R0                     S\R0                  S\\R0                     S\\R0                     S\\   S\\!   S\\   S\\   4S jjr"g) )unsloth_fused_ce_lossapply_autograd_functioncompute_fused_ce_loss    N)OptionalTupleCallableDict   )UNSLOTH_ENABLE_LOGGINGtorch_compile_optionslogger)DEVICE_TYPEUNSLOTH_CE_LOSS_TARGET_GBUNSLOTH_CE_LOSS_N_CHUNKSc                 2   [         R                  " [        U S5      5      R                  n[	        U5      nUR                  SS 5        [        UR                  5       5      [        UR                  5        Vs/ s H  o"R                  PM     sn5      4$ s  snf )Nforwardctx)
inspect	signaturegetattr
parametersdictpoptuplekeysvaluesdefault)autogradr   xs      e/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/fused_losses/cross_entropy_loss.py_get_mappingr!   $   sr    ""78Y#?@KKJj!JNN5$"#Uz?P?P?R+S?R!II?R+S%TTT+Ss   7B
c                 b   ^ [        U 5      u  p#[        U S5      " U4S j[        X#5       5       6 $ )Napplyc              3   J   >#    U  H  u  pTR                  X5      v   M     g 7fN)get).0old_keyr   mappings      r    	<genexpr>*apply_autograd_function.<locals>.<genexpr>.   s'      ( 9G 	G%% 9s    #)r!   r   zip)r   r)   r   defaultss    `  r    r   r   ,   s6    '1J8W% ( #J 9(      hidden_stateslm_head_weightlm_head_biaslabelsn_itemsscalingshift_labelsreturnc                 p   UR                   nU(       a+  [        R                  " X8S9n	USSS24   U	SSS24'   SU	S'   U	n [        R                  R                  R                  U R                  UR                  US9UU5      n
UR                  S	   nUR                  S
S5      nUR                  SS5      nUR                  SS5      nUS	:w  a  Ub  X-  n
US	:w  a  Ub  X-  n
US	:w  a!  Ub  X-  n
[        R                  " U
5      n
X-  n
Ub  SOSn[        R                  R                  R                  U
R                  SU5      R                  5       R                  5       UR                  S5      R                  U5      R                  5       US9nUb  UU-  OUnUb  UU-  OUnUUR                  5       44$ )a  
Computes cross_entropy_loss(X @ W + b, labels)
* shift_labels does hidden_states[..., :-1] and labels[..., 1:]
* If n_items is not given, does mean(ce_loss), otherwise sum(ce_loss)/n_items
* Allows scaling factor from mixed precision fp16, fp8
* Upcasts to float32 and allows kwargs to have:
1) logit_scale_multiply (X = X * logit_scale_multiply)
2) logit_scale_divide   (X = X / logit_scale_divide)
3) logit_softcapping    (X = tanh(X / logit_softcapping) * logit_softcapping)
device.   N.r;   dtyper9   r   logit_scale_multiplylogit_scale_dividelogit_softcappingsummean)inputtarget	reduction)r9   torch
empty_likenn
functionallineartor?   shaper&   tanhcross_entropyviewfloat
contiguousdetach)r/   r0   r1   r2   r3   r4   r5   kwargsr9   _labelslogits
vocab_sizer@   rA   rB   rG   lossscaled_losss                     r    r   r   4   s   ( ""F""6;"37OSbSXX  ''!5!5GF
  %%a(J "::&<dC$8$?

#6=q %9%E.Q#5#A,A"3"?+F#+ !,&I88,,R,224??AR##F+668 - D
 %04'>dD$+$7$.TK(((r.   c                    Uc\  [         S:X  a  [        R                  R                  S5      O[        R                  R                  S5      u  p#US-  S-  S-  nUS-  nUn US::  a  [        S5      eU S-  S-  S-  S-  U-  nUS-  nU$ )<Gets chunk size that fits the target max memory usage (1GB) xpur   i   g      ?g&.>zGUnsloth: No or negligible GPU memory available for fused cross entropy.   )r   rH   r]   mem_get_infocudaRuntimeError)rX   	target_gbfreetotalfree_gb
multipliers         r    _get_chunk_multiplierrg   t   s     3>%3Geii,,Q/UZZMdMdefMg+$t+C-	 Ddeeq.4'$.5)DJaJr.   c                 Z    [        X#5      nX-  U-  n[        [        U5      S-  S5      nU$ )r\   r^   r:   )rg   maxround)bszqlenrX   rb   rf   n_splitss         r    get_chunk_sizern      s2    &z=JJ&H5?Q&*HOr.   c                   0   \ rS rSr\        SS\S\R                  S\R                  S\\R                     S\R                  S\\R                     S	\\R                     S
\\	   S\\
   S\\   S\\
   S\\
   S\\   4S jj5       r \S 5       rSrg)UnslothFusedLoss   Nloss_functionr/   r0   r1   r2   maskr3   r4   r5   rb   torch_compile	overwriteextra_kwargsc                   ^^ ^!^" UR                   nUc  0 nU	(       ae  [        R                  " X^S9nUSSS24   USSS24'   Ub%  UR                  US9nSUSSS24   USSS24   S:H  '    SUS'   UR	                  S5      nUn Ub  UOUS:g  R                  5       nUR                  5       S:w  a  UR                  5       S   nUR                  [        R                  US	9nUSL=(       a    UR                  m"USL=(       a    UR                  m!UR                  S   nU(       d  [        R                  " X.S9OUnT"(       a  [        R                  " X>S9OSnT!(       a  [        R                  " XNS9OSnUR                  u  nnn[        R                  " SUS9S   m S
U;   a  UR                  S
5      nO[        UUUU
S9n[        (       a#  [         R"                  " SU SU SU SU S3	5        [        R$                  " UUSS9n[        R$                  " UR	                  SU5      USS9n[        R$                  " UR	                  SU5      USS9n   SU U!U"U4S jjn U(       a  [        R&                  " USS[(        S9n[+        UUU5       H  u  nnnU" SUUUUUUUUUUU	S.UD6  M      U R-                  UUU5        Xl        T $ )a  
Computes chunked fused loss_function(chunk(X) @ W + b, chunk(labels))
* If n_items is not given, does mean(loss), otherwise sum(loss)/n_items
* shift_labels does hidden_states[..., :-1] and labels[..., 1:]
* Allows scaling factor from mixed precision fp16, fp8
* target_gb specifies the max GB memory the fused loss can use - default detects VRAM left
* overwrite allows hidden_states to be overwritten with gradients
* Place extra args in extra_kwargs which will be passed to (loss_function)
Nr8   .r:   r;   r<   r   r=   r>   n_chunks)rb   zFused CE Loss [bsz=z][qlen=z][vocab_size=z][n_chunks=])dimc           	        > T(       af  T(       a_  [         R                  R                  TSSS9" UUUUUU	U
(       + 40 UD6u  u  pnu  nu  nUR                  U5        UR                  U5        OT(       aM  [         R                  R                  TSSS9" UUUUUU	U
(       + 40 UD6u  u  pu  nu  nUR                  U5        OT(       aM  [         R                  R                  TSSS9" UUUUUU	U
(       + 40 UD6u  u  pu  nu  nUR                  U5        O;[         R                  R                  TSSS9" UUUUUU	U
(       + 40 UD6u  u  nu  nu  n TR                  U5        XS S & g )N)r   r:   r
   T)argnumshas_aux)r   r:   )r   r
   )r   )rH   funcgrad_and_valueadd_)rx   grad_inputs_jgrad_lm_headgrad_lm_head_biashidden_states_jr0   r1   labels_jdivisorr4   r5   rU   chunk_grad_inputchunk_grad_lm_headchunk_grad_lm_head_bias
chunk_lossunscaled_lossaccumulated_losslm_head_bias_requires_gradlm_head_requires_gradrr   s                    r    accumulate_chunk2UnslothFusedLoss.forward.<locals>.accumulate_chunk   s    %)C 

))!(" * 
 $" $$ /P!7N.-m !!"45!&&'>?&161J1J!%" 2K 2
 $" $$2 2/7!.-m !!"45+161J1J!%" 2K 2
 $" $$2 2/<!.-m "&&'>? 271J1J!"" 2K 2
 $" $$2 2/#!.-m !!-0/!r.   T)dynamic	fullgraphoptions)rx   r   r   r   r   r0   r1   r   r   r4   r5   )NNF )r9   rH   rI   rM   rQ   rC   numelravelfloat32requires_gradrN   
zeros_likezerosr   rn   r   r   infochunkcompiler   r,   save_for_backwardr4   )#r   rr   r/   r0   r1   r2   rs   r3   r4   r5   rb   rt   ru   rv   r9   rV   r   rX   grad_inputsr   r   rk   rl   hdrx   _UnslothFusedLoss__shift_labels_UnslothFusedLoss__shift_states_UnslothFusedLoss__grad_inputsr   r   r   r   r   r   r   s#    `                              @@@r    r   UnslothFusedLoss.forward   s   4  && &&v?G &sABwGC"Hwww/8<SbS!$sABw-1"45#GGll2&GF %0'v~6J6J6L==?a7==?1+=**U]]V*D .d : [~?[?[%1%=%\,B\B\"#))!,
 OXe&&}F]jLau''HgkOiE,,\Kos%++T2 ;;q6:1=%#''
3H%c4SH!!KK-cU'$}ZLXcdlcmmnopVQRS]%7%7B%?QRS[%5%5b"%=QRS  U	0 U	0l 	$}}  /	  ~~? 8]OX # -+$5"1!/+#!!+  @ 	k<9JKr.   c                    [         (       a_  U R                  b  U R                  OSn[        R                  " [        R                  " X:H  5      SU SUR                  5       S S  35        U R                  u  p4nS X4US S S S S S S S S 4$ )Ng      ?z*Fused losses expect grad_output to be all z
, but got 
   )r   r4   rH   _assertallr   saved_tensors)r   grad_outputr4   r   r   r   s         r    backwardUnslothFusedLoss.backwardN  s     "!%([[%<ckk#GMM%))K$:;?ijqirr|  ~I  ~O  ~O  ~Q  RU  SU  ~V  }W  >X  Y;>;L;L8$5k1BD$PTVZ\`bfhlnrtxzzr.   r   )NNNTNTFN)__name__
__module____qualname____firstlineno__staticmethodr   rH   Tensorr   rR   boolintr	   r   r   __static_attributes__r   r.   r    rp   rp      s    3726+/*.)-*.*/*.x !x  x  	x 
 "%,,/x  x  "%,,/x  "%,,/x  "%x  "$x  "#x  "$x  "$x  "$x  x r 	{ { 	r.   rp   rs   rb   rt   ru   c                 v   U b  U R                   R                  OSnUb  UR                  5       OUn[        US5      (       a  UR                  5       n[        (       a  [        [        5      nO'[        (       a  [        [        [        5      S5      US'   [        [        [        [        UUUUUUUSUU	U
US95      $ )aa  
Computes chunked fused cross_entropy_loss(chunk(X) @ W + b, chunk(labels))
* If n_items is not given, does mean(ce_loss), otherwise sum(ce_loss)/n_items
* Auto does shift of labels ie hidden_states[..., :-1] and labels[..., 1:]
* Allows scaling factor from mixed precision fp16, fp8
* target_gb specifies the max GB memory the fused loss can use - default detects VRAM left
* Upcasts to float32 and allows kwargs to have:
1) logit_scale_multiply (X = X * logit_scale_multiply)
2) logit_scale_divide   (X = X / logit_scale_divide)
3) logit_softcapping    (X = tanh(X / logit_softcapping) * logit_softcapping)
N	get_scaler:   rx   T)rr   r/   r0   r1   r2   rs   r3   r4   r5   rb   rt   ru   rv   )acceleratorscalerr   hasattr	TARGET_GBrR   N_CHUNKSri   r   r   rp   r   r   )trainerr/   r0   r1   r2   rs   r3   r4   rb   rt   ru   rU   r   s                r    r   r   Y  s    2 ,3+>W  ''DF$*$6f GGw$$0A0A0CgyeI.)	CM1(=6*%"#3T-%'#%6  r.   )NNTr%   )NNNNTF)#__all__rH   typingr   r   r   r	   r   	functoolsmathostemporary_patches.commonr   r   r   device_typer   environr&   r   r   cacher!   r   r   rR   r   r   rg   rn   r   Functionrp   r   r   r   r.   r    <module>r      s  "  2 2    	 \ \ % JJNN6=	::>>4d;
U U
   /3'+ <)\\<)\\<) ell+<) \\	<)
 ell+<) e_<) <) 5<<u||}-./<)z   "  E	u~~.. E	L  /3.2'+%)&*&+-\\- \\- ell+	-
 \\- ell+- ell+- e_- c]- d^- d^-\ r.   