
    oiik                         / S Qr SSKJrJrJrJrJr  SSKrS r S r	 SS jr
       SS jr SS// S	Q/ S
QSS4S jr SSKJrJr   SSKJr  S\\\4   S\S\\\/\4      S\S\\\4   4
S jrg!   Sr N)= f))train_on_responses_onlysft_prepare_datasetstandardize_data_formats    )UnionCallableOptionalListDictNc                 *   [        U 5      nU S   n[        U5      nSn[        U5       Hf  n[        US-   US-   5       HM  nX%U nSn[        SU5       H  nXpU   ;  d  M    O   US-   U:X  d  M1  [        U5      [        U5      :  d  MK  UnMO     Mh     U$ )Nr       )lenrange)	arrnslresijstemks	            S/home/james-whalen/.local/lib/python3.13/site-packages/unsloth_zoo/dataset_utils.py_old_longest_common_substringr      s    CAAAAA
C1Xq1ua!e$Aq6DA1a[1v% ! A
s3x#d)3 %  J    c                    ^  T (       d  / $ [        S T  5       5      nUS:X  a  / $ U 4S jn SUpC/ nX4::  a/  X4U-
  S-  -   nU" U5      u  pxU(       a  UnUS-   nOUS-
  nX4::  a  M/   U$ )a0  
Finds the longest common sublist among multiple lists.

Parameters:
lists (List[List[int]]): A list of lists.

Returns:
List[int]: The longest common sublist. If multiple sublists have the same maximum length,
           one of them is returned. If there's no common sublist, an empty list is returned.
c              3   8   #    U  H  n[        U5      v   M     g 7fN)r   ).0lsts     r   	<genexpr>*_longest_common_sublist.<locals>.<genexpr><   s     ,es#c((e   r   c                   > [        5       nTS   n[        [        U5      U -
  S-   5       H#  n[        X#X0-    5      nUR	                  U5        M%      TSS  Hd  n[        5       n[        [        U5      U -
  S-   5       H*  n[        XSX0-    5      nXA;   d  M  UR	                  U5        M,     UnU(       a  M`  S/ 4s  $     S[        UR                  5       5      4$ )z
Checks if there's a common sublist of the given length across all lists.

Returns:
(bool, List): Tuple of whether such a sublist exists and the sublist itself.
r   r   NFT)setr   r   tupleaddlistpop)lengthcommonfirstr   subr    currentlistss          r   has_common_sublist3_longest_common_sublist.<locals>.has_common_sublist?   s     as5zF*Q./A
+,CJJsO 0 	 9CeG3s8f,q01C!*-.=KK$ 2 F6by   	 T&**,'''r   r      )min)	r/   min_lenr0   leftrightresultmidexistssublists	   `        r   _longest_common_sublistr;   .   s     ) ,e,,G!|BY(: 	W%F
-dlq((,S1F7D!GE - 	Mr   c                 h  ^ SnU R                  S5      (       a  SnOU R                  S5      (       a  SnSnU R                  S5      (       a  SnOU R                  S5      (       a  SnU R                  5       n/ nU(       d  [        S5       Hs  n[        S5       Ha  nXt-  U-   X-  -   n	U" U	SS9R                  n	UR                  U	5        US-  U-   US-  -   n	U" U	SS9R                  n	UR                  U	5        Mc     Mu     O"U" U SS9R                  n	UR                  U	5         [        U V	s/ s H  oS/-   PM
     sn	5      n
U
S/:X  a7  [        US   5      S:X  a%  US   S   m[        U4S	 jU 5       5      (       a  T/n
 [        [        S
 U 5       5      5      S:X  a/  [        US   5      S-   [        U
5      :X  a  US   U
SS :X  a  US   n
 U" U SS9R                  n[        [        U5      5       H  nXU[        U
5      -    U
:X  d  M    O   USW nX[        U
5      -   S nXU4$ s  sn	f )z
    
### User:


    

### User:


    etc
    we need to find the middle most repeatted part.
    Tokenizers can tokenize newlines or spaces as 1 token!
    r    
   F)add_special_tokensr   r   c              3   .   >#    U  H
  nTU;   v   M     g 7fr    )r   xsingle_tokens     r   r!   )_find_common_token_ids.<locals>.<genexpr>   s     8-Q|q -s   c              3   8   #    U  H  n[        U5      v   M     g 7fr   )str)r   rC   s     r   r!   rE      s     .1Ar#   N)
endswith
startswithstripr   	input_idsappendr;   r   allr%   )	component	tokenizerforce_match
right_text	left_textstrippedall_input_idsr5   r6   rC   	substringoriginalr   optional_leftoptional_rightrD   s                  @r   _find_common_token_idsrZ   o   sY    J			S	!	!:			D	!	!:I			s	#	#Y			d	#	#Y H M!HDqNX-0@@ae<FF$$Q'I(5:5ae<FF$$Q' "   	ie<FFQ (-(H-QaS-(HII
 QCCa 01Q6$Q'*8-888%I
 	C.../14	]1		"c)n	4	q	Ys^	+ "!$	 ?IIH3x=!C	N*+y8% "bq\MI./0N^33= )Is   0H/c                 t	  ^^^^^^^^^^^ Uc,  U b)  [        U S5      (       a  U R                  OU R                  n[        US5      (       d  [        US5      (       a  UR                  n[        US5      (       a  [        US5      (       d  Ub  Uc  [        S5      eOKUc  Ub-  [        US5      (       d  [        US5      (       a  [        S5      eUR                  nUR
                  n [        XU5      u  mpx[        X$U5      u  mpTS	   m[        T5      mU	SSS
2   mU
mTS	   m[        T5      mUSSS
2   mUm[        R                  m[        R                  nUUUUUUUUUUU4S jn U(       a  U$ Ub  [        U5      [        Lao  S	SKn[        [        UR!                  5       =(       d    SS-   S5      S5      nUR#                  5       R$                  S-  nUS::  a  SnO[        U[        U5      5      n[        U S5      (       a  U R&                  b  [        U R&                  S5      (       d  [)        S5      e[+        U R&                  [,        5      (       a>  U R&                  R/                  XR&                  R0                  R2                  SS9U l        O U R&                  R/                  USUS9U l         [        U S5      (       Gav  U R4                  Gbh  [        U R4                  5      [6        L a  U R4                  R9                  5        H  u  nn[        US5      (       d  [)        S5      e[+        U[,        5      (       a4  UR/                  UUR0                  R2                  SS9U R4                  U'   Mk  UR/                  USUS9U R4                  U'   M     O[        U R4                  S5      (       d  [)        S5      e[+        U R4                  [,        5      (       a>  U R4                  R/                  XR4                  R0                  R2                  SS9U l        O U R4                  R/                  USUS9U l          S	SKJn  [?        U R@                  SS5      n[        U S5      (       a-  [+        U RB                  U5      (       d  U(       d  U" US9U l!        SSK"J#n  U" SX@R&                  5        U $ )zs
Trains only on responses and not on the instruction by masking out
the labels with -100 for the instruction part.
Nprocessing_classimage_processorrP   _unsloth_input_part_unsloth_output_partz:Unsloth: instruction_part and response_part must be given!zaUnsloth: Your tokenizer already has instruction and response parts set - do not give custom ones!r   rH   c                   > U S   nSn[        U5      TL a  SnUR                  5       nSU ;   a.  U S   R                  5       n[        U5      [        U5      :X  d   eOS /[        U5      -  n/ n[        X5       GHU  u  pV[        U5      nS/U-  nSn	Ub  Sn	U[        U5      :X  d   eUS-
  n
SnX:  Ga  X[   T:X  a  X[UT-   =n T:X  a  T H  nUS:  a    OXUS-
     :X  a  US-  nM    O    T H  nX:  a    OXUS-      :X  a  US-  nM    O    UnUnX:  a  X:X  d  X[   T:X  ay  X[UT-   =n T:X  al  T H  nUS:  a    OXUS-
     :X  a  US-  nM    O    T H  nX:  a    OXUS-      :X  a  US-  nM    O    UnUU
:w  a  UnOUnUn U	(       d  X_U XU& OXoU XU& O US-  nX:  a  M    US-  nX:  a  GM   UR	                  U5        GMX      SU(       a$  [
        R                  " U[
        R                  S90$ U0$ )	NrL   FTlabelsir   r   )dtype)typetolistr   ziprM   torchtensorint64)examples
input_ids_use_tensorslabels_
all_labelsrL   
old_labelsr   ra   use_old_labels	n_minus_1r   r   rX   rY   assistant_kuser_jA_firstA_left_reversedA_mustA_right_forwardQ_firstQ_left_reversedQ_mustQ_right_forward
len_A_must
len_Q_musttorch_Tensors                    r   _train_on_responses_only9train_on_responses_only.<locals>._train_on_responses_only   s   k*

|+K#**,Jxx(//1Gw<3z?232fS_,G
%(%=!IIAVaZF"N%!%C
O+,+AIA%LG+Z$7A9VC *9q5%(acN:AFA# *9 *9>5)qs^;Q!VQ# +: "#K#A% N'lg5'!j.-@QBfL 2A#$q5%#0acN#BAFA&+ 2A !2A#$>5#1qs^#CQ!VQ&+ 3B !%&F%2 %&)*$% #1?HX^?_V < @JX^?_V <!QO %P QA %B f%Y &>Z 	[ELLU[[Ijj^hjjr   r      r2   @      @train_datasetmapz8Unsloth: train_on_responses_only does not work on lists!T)
batch_sizebatched)r   num_proceval_dataset)DataCollatorForSeq2SeqpackingFdata_collator)rP   )fix_zero_training_loss)$hasattrr\   rP   
ValueErrorr^   r_   rZ   r   rf   Tensorrh   rc   intpsutilr3   max	cpu_countvirtual_memory	availabler   	TypeError
isinstanceIterableDatasetr   _ex_iterabler   r   dictitemstransformersr   getattrargsr   training_utilsr   )trainerinstruction_partresponse_partrQ   rP   return_functionr   Q_leftQ_rightA_leftA_righttorch_int64r~   r   memory_gb_leftkeyvaluer   packing_enabledr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   s                       @@@@@@@@@@@r   r   r      sr    W007AS0T0TG,,ZaZkZk	y+,,	;0O0O''	I455I566#}'<YZZ

&-*C	1	2	2giI_6`6`|}}$88$99 55ER]^FF4]R]^FF QiGVJTrTlOOQiGVJTrTlOO<<L;;K[k [kx 	''4>4sF,,.3!Q6:B?..0::gFQH 8S%89Hw((W-B-B-Nw,,e44VWWg++_==$+$9$9$=$=>Vezez  fH  fH  fS  fS  _c$=  %dG!$+$9$9$=$=>Vbfs{$=$|G!w''G,@,@,L$$%-%2288:
Uue,,#$^__e_5505		:Rafasasa~a~  KO	  1PG((-05		:R^bow	0xG((- ; 7//77 Z[['..@@'.';';'?'?@Xg{g{  hI  hI  hT  hT  `d'?  (e$'.';';'?'?@Xdhu}'?'~$ 4gllIu=O))7002HII 69 M 74,A,ABNr   system)userhumaninput)gpt	assistantoutputi  c                   ^^^^ SSK nSSKnSmUb$  [        US5      (       d  [        US5      (       a  Sm[        [	        [        U 5      5      R                  5       5      n	SU	;  a  U $ UR                  U S5      n
UR                  [        5      nU
 HX  nUS    HL  nUR                  5        H5  u  p[        U5      [        La  [        S	5      eX   R                  U5        M7     MN     MZ      [        UR                  5       5      S
:X  d   e[        UR                  5       5      n[        [        UUS      5      5      n[        [        UUS      5      5      nUU:  a  US   mUS   mO
US   mUS   m [        X#-   U-   5      n[        UT   5      nUU-  U-
  n[        U5      S:w  a  [!        S[        U5       S35      e 0 mU H  nSTU'   M
     U H  nSTU'   M
     U H  nSTU'   M
     UUUU4S jn SUS.n[#        U [$        5      (       d,  SSKJn  Ub  [        U5      [*        La  U" 5       nUUS'   SUS'   U R,                  " U40 UD6$ )a]  
Standardizes ShareGPT and other formats to user/assistant Hugging Face format.

Get aliases for the system, user and assistant roles.
These shall map to "system", "user" and "assistant" respectively.

aliases_for_system    = ["system",],
aliases_for_user      = ["user", "human", "input",],
aliases_for_assistant = ["gpt", "assistant", "output",],
r   NFr]   rP   Tconversations
   z.Unsloth: Cannot standardize non text datasets!r2   r   	Unsloth: z+ are not in aliases. Please update aliases.r   r   r   c                    > U S   n/ nU HO  n/ nU H2  nT	UT      nUT
   nT(       a  SUS./nXgS.nUR                  U5        M4      UR                  U5        MQ      SU0$ )Nr   text)rc   r   )rolecontent)rM   )ri   convos
all_convosconvo	new_convomessager   r   rC   aliases_mappingcontent_keyis_vlmrole_keys            r   _standardize_dataset6standardize_data_formats.<locals>._standardize_dataset  s    /*
EI &wx'89{+ft$D"F4"5  # ! i(  	 :00r   )r   r   )r   r   zUnsloth: Standardizing formatsdesc)collections	itertoolsr   r%   nextiterkeysislicedefaultdictr(   r   rc   rG   RuntimeErrorrM   r   r   r   r   multiprocessingr   r   r   )datasetrP   aliases_for_systemaliases_for_useraliases_for_assistantr   r   r   r   column_namesri   uniquesexampler   r   r   r   length_firstlength_secondall_aliasesrolesleftover_aliasesrC   r   dataset_map_kwargsr   r   r   r   r   s                             @@@@r   r   r     s   & 
 F9/00GI{4S4SFtDM*//12Ll*,H%%d+G/G%mmo
;c)&'WXX##E* . 0  	 w||~!#$#DGDG,-.LGDG,-.Mm#1g1g1g1g (;>SSTK!"E#e+{:
!-.//Z[
 	
 	 OOA$6OA$6"OA$6"1 1 	  
 g//-tH~S8[()1:&%E6";;
 r   )Datasetr   )ConstantLengthDatasetr   r   formatting_funcdataset_namereturnc           	        ^^^^^^^  [        U[        5      (       a  U$  0 n[        U[        5      n[        US5      n	UmU	(       a  UR                  m[        USS5      mTS:X  a  [        USS5      mTS:X  a  [        U SS5      mTS:X  a  [        U SS5      mTS:X  a  [        S5      e[        USS5      mTS:g  mS	mS
n
[        [        [        U5      5      R                  5       5      nS/nSU;   a  UR                  S5        SSKJnJn  SU;   aR  U	(       a*  [        TS5      (       d  [        SUR                   S35      eU" T5      U l        UR                  S5        S	n
O\SU;   a@  U	(       a*  [        TS5      (       d  [        SUR                   S35      eU" TS	S9U l        S	n
OTU;  a  S
mTc  [        S5      e U
(       Ga  T(       a@  T" [        [        U5      5      5      n[        U["        5      (       d  [%        S5      eUS   nO[        [        U5      5      T   S   n[        USS5      nUS:X  a  U	(       a  [        TSS5      nUc  SnS
m[        USS 5      n[        TSS 5      nU=(       d    UnUb)  UR'                  U5      (       d  UU;   a  S	m[)        S5         UUUUUUU4S jn [        U[*        5      (       d  [        USS 5      nUc  SS Kn[/        UR1                  5       =(       d    SS-   S5      nUR3                  5       R4                  S-  nUS::  a  SnO8US::  a  [7        SU5      nO%US ::  a  [7        SU5      nOUS!::  a  [7        SU5      nUUS"'   OUR8                  R:                  US#'   U(       a	  S$T S%3US&'   UR<                  " U4S'S
0UD6nU	(       a  [        US5      (       d  U" TS	S9nUU l          U(       aP   [>          TS:X  a  [%        S)5      eU(       a	  S*U S+3US&'   [?        URA                  U5      T[        US,S-5      U5      n U$ !    GN= f!   [)        S(5        Us $ = f).NrP   
max_lengthr   max_seq_lengthmax_seqz1Unsloth: max_seq_length is 0! Please specify one!dataset_text_fieldr   FTrL   attention_mask)r   DataCollatorForLanguageModelingra   padr   z does not have .pad!)mlmz-Unsloth: You must specify a `formatting_func`zIUnsloth: The `formatting_func` should return a list of processed strings.chat_templater   	bos_tokenzHUnsloth: We found double BOS tokens - we shall remove one automatically.c                 <   > T" T(       d  U T   OT" U 5      TTSTS9$ )NF)
truncationr   return_token_type_idsr@   rB   )r   r@   r   do_formatting_funcdo_truncationr   r   rP   s    r   	_tokenize&sft_prepare_dataset.<locals>._tokenize[  s/    3E*+?[bKc*+(-%7 r   dataset_num_procr   r   r2   r            r   r   zUnsloth: Tokenizing ["z"]r   r   zPUnsloth: Hugging Face's packing is currently buggy - we're disabling it for now!z:When packing is enabled, `max_seq_length` can't be `None`.zUnsloth: Packing z datasetpacking_strategybfd)!r   r   r   r   rP   r   r   r%   r   r   r   rM   r   r   r   	__class__r   r(   r   rJ   printr   r   r   r   r   r   r3   r   r   r   pack_datasetselect_columns)selfr   r\   r   r   r   r   
map_kwargsuse_descr   do_tokenizer   used_column_namesr   r   	test_textr   bos_token_1bos_token_2r   r   r   r   r   r   r@   r   r   r   r   rP   s        `                   @@@@@@r   r   r     s   g455g~5 J'7+H%{3F I+55y T<3NWT;KQ-ONWT;KQ-ONWT9a-HN,/b"cc ';VD"a'MK tDM*//12L$<'  !12 U<')U33+;+E+E*FFZ[\\3I>  *		$')U33+;+E+E*FFZ[\\<YeT	<	/!"NOO'T'](;<Ii.. _  "!IT']+,>?BI   0/2FB6#ICM M ".TBid;.;	 ##I..)}2L%*"`a	 	 	'?33&t-?F'#&(8(8(:(?a'BA#F !'!6!6!8!B!Bg!N!Q&'($#q('*1.>'?$#q('*1.>'?$#r)'*1.>'?$%5Jz"'.';';'F'FJ|$,BCUBVVX*YZ'++iF4F:F '"2E::;IUSM!.D	
 QYZZ,=l^8*TZ'""#45D,e4	
 	NUt	deNs   P, P4 ,P14Q)F)NNTNFN)__all__typingr   r   r   r	   r
   rf   r   r;   rZ   r   r   datasetsr   r   trl.trainer.utilsr   boolr   rG   r   rB   r   r   <module>r     s   " 9 8  =| E4L 
 HR 
 !%K7;  n^  1!7V7O+,V
 V hvs{34V V 7O#$Vn y! s   A4 4A: