
    oi3a                         S SK r S SKrS SKrS SKrS SKrS SKr/ SQr\ R                  SS j5       r	 / SS4S jr
 \ R                  / S4S j5       r S	r\ R                  S
 5       rg)    N)mean_of_trained_tokensadd_new_tokensfix_untrained_tokenspatch_tokenizer缉ؗҜ<c                    U R                  5       R                  R                  5       nU R                  5       R                  R                  5       n[        R
                  " USS9U:*  n[        R                  " U5      S   nUR                  S   nUR                  S   U-
  n[        R                  " U[        R                  SS9n[        R                  " U[        R                  SS9n	U[        R                  " X%   [        R                  SS9-  nU	[        R                  " X5   [        R                  SS9-  n	X-  n
X-  nX4$ )
Llama-3 for eg has untrained vectors in the base model.
These include <|eot_id|>, <|start_header_id|>, <|end_header_id|>
We reset them to the mean of the rest of the tokens
   axisr   dtyper   )
get_input_embeddingsweightcloneget_output_embeddingstorchamaxwhereshapesumfloat32)modelepsembedding_matrixlm_head_matrixindicator_untrainedwhere_untrainedn_untrained	n_trainedsum_embeddingsum_lm_headmean_embeddingmean_lm_heads               U/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/tokenizer_utils.pyr   r       s    114;;AAC224;;AACN  **%5a@CGkk"56q9O!''*K &&q)K7I II.aPMIInaPK UYY/@%--`abbMUYY~@%--`abbK $/N!/L''    meang      ?c                 
   [        U[        [        45      (       d   e[        U5      S:  d   eUS:X  d  US:X  d   eUS:  a  US::  d   e[	        U5      [	        UR
                  R                  5       5      -  n[        U5      S:w  a4  [        SU S[        U5       S35        U Vs/ s H  ofU;  d  M
  UPM     nn [        U 5      u  pxUR                  [        R                  5      nUR                  [        R                  5      nU R                  5       R                  n	U R                  5       R                  n
U	R                  S   nU
R                  S   nU R                   R"                  nU	R%                  5       U
R%                  5       :H  =(       d    U R                   R&                  n[        U5      nUR)                  U5        U R+                  [        U5      5        U R                  5       R                  nU R                  5       R                  nUR                  S   U[        U5      -   :w  a  [-        S5      eUR                  S   U[        U5      -   :w  a  [-        S	5      eU R                   R"                  U[        U5      -   :w  a  [-        S
5      e US:X  a  [        SSU-
   SU 35        [/        U5       H  u  nnU" USS9R0                  nUU   R3                  S[        R                  S9nUU   R3                  S[        R                  S9nUSU-
  -  UU-  -   nUSU-
  -  UU-  -   n[        R4                  " 5          UUUU-   '   UUUU-   '   SSS5        M     O([        R4                  " 5          UUUS& UUUS& SSS5         U n[7        US5      (       a&  SUl        UR:                  n[7        US5      (       a  M&   SUl        U n[7        US5      (       a  [7        US5      (       aq  [7        UR                   S5      (       a&  UR                   R=                  S[        U5      05        UR:                  n[7        US5      (       a  [7        US5      (       a  Mq  [7        US5      (       aR  [7        US5      (       aA  [7        UR                   S5      (       a&  UR                   R=                  S[        U5      05         U(       a  U R?                  5         [A        S5       H6  n[B        RD                  " 5         [        RF                  RI                  5         M8     gs  snf ! , (       d  f       GM}  = f! , (       d  f       GN= f)z
Smartly resizes the tokenizer and adds new tokens to the model.
We also disregard untrained tokens by removing them from the mean calculation.
r   r'   interpolationr
   z$Unsloth: You're adding new_tokens = z*
There are tokens which are overlapping = z1
We shall safely ignore these overlapping tokens.zVUnsloth: Embedding matrix size did not get resized properly. Please file a bug report!zTUnsloth: LM Head matrix size did not get resized properly. Please file a bug report!zZUnsloth: Model's config vocab_size did not get resized properly. Please file a bug report!zcUnsloth: You are using interpolation to add new tokens.
We shall set new tokens = mean(embeddings)*z + mean(new_tokens)*Fadd_special_tokens)r   r   Nr   Tconfig
vocab_size   )%
isinstancelisttuplelensetvocabkeysprintr   tor   r   r   r   r   r   r,   r-   data_ptrtie_word_embeddings
add_tokensresize_token_embeddingsRuntimeError	enumerate	input_idsr'   no_gradhasattr_need_to_train_embeddingsr   updatetie_weightsrangegccollectcudaempty_cache)r   	tokenizer
new_tokensmethodr)   overlapping_tokensxr#   r$   old_input_embeddingold_output_embeddingold_input_lengthold_output_lengthold_config_sizeis_tied
old_lengthr   r   jtokenr>   mean_embedding_tokenmean_lm_head_tokeninternal_modelcurrent_model_s                             r%   r   r   G   s    j4-0010z?QVv898A-1"454 Z3y/C/C/E+FF
!#2:, ?88<=O8P7Q R?@	

 ",KA8J/Ja
K $:%#@ N#&&u}}5N!&&u}}5L !558?? 668??+2215,2215//O #++-1E1N1N1PP .LL,,  YJ$	!!#i.1 114;;224;;N a %5Z%HId
 	
 A%6Z%HIb
 	
 ||_Z%HIh
 	
 	 ::;M/9JJ^_l^mo	
 "*-HAu!%eDNNI#3I#>#C#C1V[VcVc#C#d #1I#>#C#C1V[VcVc#C#d $21]?#CFZ[hFh#h #/1]?#CFX[hFh#h 1E A.1CA. ! . 	 ]]_,:Z[),8NZ[)  	 N
.'
*
*370'-- .'
*
* 	/3N, M
-
)
)gmX.N.N=''66  ''I(GH%++ -
)
)gmX.N.N }g&&7=(+K+K=''66  ''I(GH !!# 1X




   K L~ ! _s$   (	T,5T,T1U1
U	
Uc           	        ^1^2^3 U R                  5       R                  nU R                  5       R                  n[        USS5      m1[	        US5      (       a  UR
                  OUnU R                  R                  U;   a  g [        UR                  S   UR                  S   5      nUSU nUSU n[        R                  " USS9U:*  n[        R                  " USS9U:*  n	[        R                  " U5      S   n
XjR                  UR                  5         nUR                  5       R!                  5       R#                  5       R%                  S5      nSSKJn  U" 5       nU H0  o[+        UR,                  R/                  5       5      ==   S-  ss'   M2     U" UR1                  5        VVs0 s H  u  nnUS	:  d  M  UU_M     snn5      nU
R                  5       R#                  5       n
/ n[3        U5       HC  u  nn[+        UR,                  R/                  5       5      U;   d  M/  UR5                  U
U   5        ME     U	[        R6                  " U	5      -  n	S
U	U'   UR                  S5      U	R                  S5      -  nSnU HA  n[	        UUS-   5      (       a*  [9        SU S35      nUb  UUR                  S   :  a  SUU'   MC      [        R                  " U5      S   nUR                  S   nUR                  S   U-
  nUR;                  5       n[=        U5      S:X  a  g[?        U5      m3URA                  U5      nU Vs/ s H
  nUc  M  UPM     nnSnSnT1b  [C        U14S jU 5       5      n [E        U[F        RH                  5      (       a  g[=        U5      n[        US5      n[K        U5       H4  nUU   n SU ;   a&  U S   n [C        U34S jU  5       5      n!U!(       a  S
n  O M6      U(       dT  [M        US-
  S5      n"[K        U"U5       H4  nUU   n SU ;   a&  U S   n [C        U34S jU  5       5      n!U!(       a  S
n  O M6       U(       d  U(       d  gSn#URN                  (       d  S
n#URN                  (       d  S
n#U#(       Ga  / n$/ n%[=        U5      n[        US5      n[K        U5       HF  nUU   n SU ;   a8  U S   n U  H-  n&U&T3;   d  M  U$R5                  U&5        U%R5                  U5        M/     MH      [M        US-
  S5      n"[K        U"U5       HF  nUU   n SU ;   a8  U S   n U  H-  n&U&T3;   d  M  U$R5                  U&5        U%R5                  U5        M/     MH      [=        U$5      S:X  a  [=        U5      n[        US5      n[K        U5       HF  nUU   n SU ;   a8  U S   n U  H-  n&U&T3;   d  M  U$R5                  U&5        U%R5                  U5        M/     MH      [M        US-
  S5      n"[K        U"U5       HF  nUU   n SU ;   a8  U S   n U  H-  n&U&T3;   d  M  U$R5                  U&5        U%R5                  U5        M/     MH      [=        U$5      S:X  a  g [Q        [S        U$5      5      n'URU                  U'5      n([W        S[Q        [S        U%5      5       SU' SU( S35      e [X        RZ                  " [M        [=        U5      UR                  S   5      [X        R\                  S9m2U24S jn) UR_                  U)S
SS9  [        R`                  " U[        Rb                  SS9n*[        R`                  " U[        Rb                  SS9n+U*[        R`                  " UU   [        Rb                  SS9-  n*U+[        R`                  " UU   [        Rb                  SS9-  n+U*U-  n,U+U-  n-T2U   [M        T2RM                  5       S5      -  n.[        Rd                  " U.U,R                  S9Rg                  S5      n.U,Ri                  US45      U.-  n,U-Ri                  US45      U.-  n-U.Rk                  5       S:H  n/SU,U/'   SU-U/'   [m        S 5        U,R                  URn                  5      UU'   U-R                  URn                  5      UU'   [K        S5       H6  n0[p        Rr                  " 5         [        Rt                  Rw                  5         M8      gs  snnf s  snf )!r	   chat_templateNrI   r   r
   r   r.   )Counter   Tcpu)	bos_token	eos_token	unk_token	sep_token	pad_token	cls_token
mask_token_idz
tokenizer.Fc              3   ,   >#    U  H	  oT;   v   M     g 7fN ).0rM   r]   s     r%   	<genexpr>'fix_untrained_tokens.<locals>.<genexpr>"  s     I7H!-7H      r>   c              3   ,   >#    U  H	  oT;   v   M     g 7frj   rk   rl   itemwhere_untrained_sets     r%   rm   rn   1  s     K!44ro   c              3   ,   >#    U  H	  oT;   v   M     g 7frj   rk   rr   s     r%   rm   rn   @  s     OYT%88Yro   i  z#Unsloth: Untrained tokens in rows [z] found.
The token ids are [z] and tokens are [a4  ].
The issue is the embed_tokens & lm_head not trainable, which will cause NaNs. Restart then add `embed_tokens` & `lm_head` to `FastLanguageModel.get_peft_model(target_modules = [..., "embed_tokens", "lm_head",]). `Are you using the `base` model? Instead, use the `instruct` version to silence this warning.r   c                    > U S   n[         R                  " [        R                  R	                  U5      [         R
                  S9n[         R                  R                  TUS5        g )Nr>   rv   r
   )npfromiter	itertoolschainfrom_iterableint32addat)examplesr>   counterfinal_countss      r%   mapping%fix_untrained_tokens.<locals>.mapping  sG    [)	++ioo;;IFPRPXPXY
		,+r&   zCounting untrained tokens)batcheddescr   )devicezmUnsloth: Setting embed_tokens & lm_head untrained tokens to mean(trained) to counteract NaNs during training.)<r   r   r   getattrr@   rI   r,   _name_or_pathminr   r   r   r   r7   r   r`   floatnumpyroundcollectionsr^   hashdatatobytesitemsr=   append
zeros_likeevaltolistr2   	frozensetconvert_ids_to_tokensanyr/   datasetsIterableDatasetrD   maxrequires_gradr0   r3   decode
ValueErrorrx   zerosint64mapr   r   tensor	unsqueezerepeatravelr6   r   rE   rF   rG   rH   )4r   rI   train_datasetIGNORED_TOKENIZER_NAMESr   r   r   min_sizeindicator_untrained1indicator_untrained2lm_head_wherelm_head_badr^   r   rowkcfinal_bad_lm_headrU   r   special_tokensspecial_tokentoken_idr   r   r    actual_bad_tokensrM   if_bad_firstif_bad_secondsize_datasetsizer>   if_badleftbad_not_trainablefinal_bad_itemswhich_locationsrs   	token_idstokensr   r!   r"   r#   r$   scaling
where_nullr[   r]   r   rt   s4                                                    @@@r%   r   r      s    114;;224;;NI=M'.y+'F'F	##II ||!!%<< #))!,n.B.B1.EFH'	2%	2N !::&6qASH !::nqASH KK 45a8M !1!1.2G2G!HIK//#))+11399!<K#iGD)9)9);$<=B={B116tq!tBCG!%%'--/MK(3  "#w.$$]1%56 ) 0%2B2BCW2XX.2*+ /11%8;O;R;RSX;YYN (9me344js;<H#3F3L3LQ3O(O05#H- ( 	kk"56q9O!''*K &&q)K7I &,,.O
?q & $O4!77H$5G$5q$5G LM I7HII-!9!9:: 	 }%L|S!D4[!!$	)#!+.IKKKF $  	 <#Q't\*A%a(Ii'%k2	OYOO$(M + 	 v ))t+<))t+< =)<%tA%a(Ii'%k2	%D22'..t4'..q1 &   	 <#Q't\*A%a(Ii'%k2	%D22'..t4'..q1 &  + 	 1$}-L|T*D4[)!,	)+ )+ 6I )#66+2248+2215 !*  !  |D(!,D4.)!,	)+ )+ 6I )#66+2248+2215 !*  / ?#q(&_-.	!!),1$s?7K2L1M N""+,>vh Gkk
 	
 	 88CI0@0F0Fq0IJTVT\T\]L, 	g6QR II.aPMIInaPK UYY/@%--`abbMUYY~@%--`abbK $i/N!i/L ?+c,2B2B2Da.HHGll7^-B-BCMMaPG#**K+<=GN!**K+<=GLA%J!"N:!"L: 
	< )7(9(9:J:P:P(Q_%(4(9(9.:P:P(QN_% 1X




   	
s CX Hs   )c
:c
 c	c	)z<|finetune_right_pad_id|>z<pad>z<|vision_pad|>z<|image_pad|>z<|video_pad|>z
<|reservedz<|placeholderz[controlz|<EXTRA_TOKENS_z	<SPECIAL_z<unusedc                    SnSnUn[        US5      (       a  UR                  nSn[        US5      (       a'  UR                  b  UR                  UR                  :H  nO#[        US5      (       a  UR                  c  SnOSn U(       Ga  UR                  R                  5        Vs/ s H  n[        U5      PM     nnUR                  USSS2   5      nX-  nSn	Sn
[         H  n[        R                  " U5      n[        R                  " U U5      nSnSn[        U5       H  u  pUS	:X  a  UnX:  a  Sn  OM      Uc  M`  UR                  S	5      S	   nUR                  S	5      nUR                  UU5      nUUU nUb  UR!                  S
5      n UnU
(       d  U(       a  Sn
Un	  OSn
Un	M      U	nUc  [        US5      (       a  UR"                  n Ub_  U" USS9R$                  n['        U5      S:w  a  SnU b:  [        U R(                  S5      (       a  US	   U R(                  R*                  :  a  Sn Uc5  SnUUR-                  5       ;   a  SU S3nUUR-                  5       ;   a  M   Un U b  U R(                  R.                  OSn[1        U SU S35        UR3                  SU05        UUl        U bX  U R(                  R5                  SUR6                  05        [9        U SS5      b#  U R:                  R5                  UR6                  S9  OsU bo  U R(                  R6                  cX  U R(                  R5                  SUR6                  05        [9        U SS5      b#  U R:                  R5                  UR6                  S9    U bV  [9        U SS5      bH  [        U R(                  S5      (       a-  U R:                  R5                  U R(                  R<                  S9   X4$ s  snf )z
Phi3's pad_token isn't set. We set it to <|placeholder...
Llama-3 is <|reserved...
Llama-2 is <unk>
Check if pad_token is not the same as eos_token otherwise the loss will ignore it!!
Fixes https://github.com/unslothai/unsloth/issues/5
z =+= r_   rI   Fre   NTr   )>z|>])rc   r*   r
   r-   z<|PAD_TOKEN|><r   Modelz5 does not have a padding token! Will use pad_token = .pad_token_idgeneration_config)r   max_position_embeddings)
max_length)r@   rI   re   rb   added_tokens_decodervaluesstrjoinPOSSIBLE_RESERVED_TOKENSreescapefinditerr=   spangroupfindendswithrc   r>   r2   r,   r-   	get_vocabr   r6   r+   rB   r   r   r   r   )r   rI   joinernumber_repetitionsoriginal_tokenizerbad_pad_tokenrM   added_tokensall_added_tokensfinal_pad_tokenfinal_good_matchpossible_reserved_tokenfoundfirst_match
good_matchrU   startpossible_pad_tokenendcheck_pad_tokennew_pad_tokennames                         r%   r   r     s    F"y+&&I4G4G	My+&&9+>+>+J!++y/B/BB	K	(	(Y-@-@-H(1(F(F(M(M(OP(O1A(OP!;;|DbD'9:" '?#&(ii0G&H#KK#:";>NOEKJ!%(6;*!%J ) "H  $$Q'*E!,!2!21!5"''6C*55K&/889NO
!, $
#' "4#( "4E (@F 	, %')[*I*I!*!4!4 )'(:QVWaaO?#q(%)" l33"ell&=&==%)"%+M9#6#6#88"#M?! 4  9#6#6#88!.-2->u||))GfIJ\I]]^_	

 	$$k4F%GH0	LL)2H2H IJu148D''..i>T>T.U||((0##^i6L6L$MN5"5t<H++22)BXBX2Y5-t4@u||%>??''..ELL<`<`.a$$G Qs   &O$)r   )r   rE   r   rx   rz   r   r   __all__inference_moder   r   r   r   r   rk   r&   r%   <module>r      s   "  	    	 "( "(F  ~~  TV^c ~ ~~   ~% ~%~ r&   