
    oiV4                         S SK r S SKrS SKrS SKrS SKJrJrJrJrJ	r	  S SK
Jr  S SKJr  SS/rSSS	S	S
S.r " S S5      r " S S5      rg)    N)ListDictAnyUnionOptional)Dataset)PathRawTextDataLoaderTextPreprocessor
plain_textmarkdown
json_linescsv_text_column)z.txtz.mdz.jsonz.jsonlz.csvc                   d    \ rS rSrSS jrS rSS jrSS jrSS jrS r	SS	 jr
S
 rS rS rSrg)r
   %   c                     US::  a  [        SU 35      eX2:  a  [        SU SU S35      eXl        X l        X0l        X@l        g )Nr   z!chunk_size must be positive, got zstride (z#) must be smaller than chunk_size ())
ValueError	tokenizer
chunk_sizestridereturn_tokenized)selfr   r   r   r   s        S/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/dataprep/raw_text.py__init__RawTextDataLoader.__init__&   sZ    ?@MNN6("Ej\QRS  #$ 0    c                 t    [        U5      R                  R                  5       n[        R	                  US5      $ )z-Auto-detect file format and parse accordinglyr   )r	   suffixlowerSUPPORTED_FORMATSget)r   	file_path	extensions      r   detect_formatRawTextDataLoader.detect_format2   s-    O**002	 $$Y==r   Nc                 *   Uc  U R                   nU R                  U5      nU R                  X5      nU(       a  UR                  5       (       d  [	        SU S35      eU R                  X@R                  U R                  U5      nU R                  U5      $ )z$Load raw text and convert to datasetzFile 'z&' is empty or contains only whitespace)	r   r%   _read_file_by_formatstripr   smart_chunk_textr   r   create_causal_dataset)r   r#   r   file_formattext_contentchunkss         r   load_from_file RawTextDataLoader.load_from_file7   s    ##44((300H<#5#5#7#7vi[0VWXX&&//4;;8H
 ))&11r   c                    Uc  U R                   n/ nU H]  nU R                  U5      nU R                  XE5      nU R                  X`R                  U R
                  U5      nUR                  U5        M_     U R                  U5      $ )zLoad multiple text files)r   r%   r(   r*   r   r   extendr+   )r   
file_pathsr   
all_chunksr#   r,   r-   r.   s           r   load_from_files!RawTextDataLoader.load_from_filesD   s    ##44
#I,,Y7K44YLL**oot{{<LF f% $ ))*55r   c                 n    Uc  U R                   nU R                  XR                  U R                  U5      $ )z"Split text into overlapping chunks)r   r*   r   r   )r   textr   s      r   
chunk_textRawTextDataLoader.chunk_textR   s7    ##44$$//4;;0@
 	
r   c                 H   U(       au  [        US   [        5      (       a]  U Vs/ s H  o"S   PM	     nnU Vs/ s H  o"S   PM	     nnU Vs/ s H  n[        U5      PM     nn[        R                  " UUUS.5      $ [        R                  " SU05      $ s  snf s  snf s  snf )z+Create dataset for causal language modelingr   	input_idsattention_mask)r<   r=   labelsr8   )
isinstancedictlistr   	from_dict)r   r.   chunkr<   r=   idsr>   s          r   r+   'RawTextDataLoader.create_causal_datasetZ   s    jD11 :@@{+I@CIJ6%$456NJ+459Cd3i9F5$$!*&4$  $$ff%566 AJ5s   BBBc                 \   U R                  USSS9nUS   n[        US5      (       a)  [        U5      S:  a  [        US   S5      (       a  US   nO)[        U[        5      (       a  [        [        U5      5      n[        U5      U::  a  U(       ak  [        U R                   SS5      nUb=  [        US	5      (       a  UR                  5       O
[        U5      nUR                  U5        S
/[        U5      -  nXhS./$ U R                   R                  (       a  U R                   R                  OSn	X-   /$ / n
SnU[        U5      :  Gae  [        X-   [        U5      5      nXkU nU(       a  [        US	5      (       a  UR                  5       O
[        U5      nU[        U5      :X  d  [        U5      U:X  a+  [        U R                   SS5      nUb  UR                  U5        S
/[        U5      -  nU
R                  XS.5        OU R                   R                  USS9nU[        U5      :X  d  [        U5      U:X  a7  U R                   R                  (       a  U R                   R                  OSn	X-  nU
R                  U5        U[        U5      :X  a   U
$ XU-
  -  nU[        U5      :  a  GMe  U
$ )z
Intelligent chunking that:
1. Respects sentence/paragraph boundaries
2. Handles various text formats (.txt, .md, .json, etc.)
3. Maintains context with stride overlap
4. Returns tokenized chunks directly (more efficient) or text chunks
ptF)return_tensorsadd_special_tokensr<   __len__r   eos_token_idNtolist   )r<   r=    T)skip_special_tokens)r   hasattrlenr?   intrA   rangegetattrrL   append	eos_tokenmindecode)r   r8   r   r   r   	tokenizedtokensrK   r=   rV   r.   	start_idxend_idxchunk_tokenschunk_tokens_listr9   s                   r   r*   "RawTextDataLoader.smart_chunk_textn   s    NN4$UZN[	;' 69%%#f+/vay),,$$%-(Fv;*$&t~~~tL++268+D+D$v,  MM,/ #$s6{!2&,OPP8<8P8PDNN44VX	())	#f+%)0#f+>G "G4L |X66 !'')l+ " c&k)S1B-Cz-Q#*4>>>4#PL#/)00> #$s+<'=!="3V
 "^^22  3 

 c&k)S->*-L48NN4L4L00RT  +Jj) #f+%  f,,I[ #f+%^ r   c                    [        USSS9 nUS:X  d  US:X  a  UR                  5       sSSS5        $ US:X  au  / nU HS  n [        R                  " UR	                  5       5      nU R                  U5      nU(       a  UR                  U5        MS  MU     SR                  U5      sSSS5        $ US	:X  af  [        R                  " U5      n/ n	U H.  n
U R                  U
5      nU(       d  M  U	R                  U5        M0     SR                  U	5      sSSS5        $  SSS5        g
! [        R                   a     M  f = f! , (       d  f       g
= f)z+Read file content based on detected format.rutf-8)encodingr   r   Nr   

r   rN   )openreadjsonloadsr)   _extract_text_from_jsonrU   JSONDecodeErrorjoincsv
DictReader_extract_text_from_csv_row)r   r#   r,   flineslinedatar8   readertextsrows              r   r(   &RawTextDataLoader._read_file_by_format   s2   )SW5l*kZ.Gvvx 65 ,D!#zz$**,7#;;DA!LL.  	  {{5) 65  11*!C::3?DtT* " {{5)+ 65 2 6,   // ! ! 65, sA   D>D> AD#D>-9D>*%D>#D;7D>:D;;D>>
Ec                 f    / SQnU H'  nX1;   d  M
  [        X   [        5      (       d  M#  X   s  $    g)z7Extract text from JSON object using common field names.r8   contentmessagebodydescriptionpromptrN   )r?   str)r   rr   text_fieldsfields       r   ri   )RawTextDataLoader._extract_text_from_json   s1    U E}DK!=!={" ! r   c                 J    / SQnU H  nX1;   d  M
  X   (       d  M  X   s  $    g)z4Extract text from CSV row using common column names.rx   rN    )r   ru   text_columnscolumns       r   rn   ,RawTextDataLoader._extract_text_from_csv_row   s)    V"F}{" # r   )r   r   r   r   )i   i   T)N)T)__name__
__module____qualname____firstlineno__r   r%   r/   r5   r9   r+   r*   r(   ri   rn   __static_attributes__r   r   r   r
   r
   %   s6    
1>
26
7(Yv4r   c                   ,    \ rS rSrS rS rS rS rSrg)r      c                     [         R                  " SSU5      n[         R                  " SSU5      nUR                  SS5      R                  SS5      n[         R                  " SS	U5      nUR                  5       $ )
z0Remove unwanted characters, normalize whitespacez\s+ z[^\x20-\x7E\n\t]rN   z

z\n{3,}rd   )resubreplacer)   r   r8   s     r   
clean_textTextPreprocessor.clean_text   sb    vvfc4(vv)2t4||FD)11$=vvi.zz|r   c                     / nU HJ  n[         R                  " XA[         R                  [         R                  -  5      nUR	                  U5        ML     U$ )z5Extract specific sections (e.g., code blocks, quotes))r   findall	MULTILINEDOTALLr2   )r   r8   patternssectionspatternmatchess         r   extract_sections!TextPreprocessor.extract_sections   sA    Gjjryy0HIGOOG$   r   c                 .   [         R                  " SSU[         R                  S9n[         R                  " SSU[         R                  S9n[         R                  " SSU[         R                  S9n[         R                  " SS	U[         R                  S9nU$ )
z5Add special tokens for structure (chapters, sections)z^# (.+)$z<|chapter|>\1<|/chapter|>)flagsz	^## (.+)$z<|section|>\1<|/section|>z
^### (.+)$z<|subsection|>\1<|/subsection|>z```(\w*)\n(.*?)\n```z<|code|\1|>\2<|/code|>)r   r   r   r   r   s     r   add_structure_tokens%TextPreprocessor.add_structure_tokens  s    vv5tR\\
 vv6bll
 vv=tR\\
 vv#%>bii
 r   c           	         [        U5      S[        S5      SSSS/ S.nUS   n/ n[        5       n[        U5       H  u  pgU(       a  [        UR	                  5       5      S:X  a  US==   S-  ss'   M8   UR                  S5        [        U5      nUR                  U5        [        US	   U5      US	'   [        US
   U5      US
'   [        UR	                  5       5      n	X;   a  US==   S-  ss'   M  UR                  U	5        M     U(       a6  [        U5      [        U5      -  US'   US	   [        S5      :w  a  US	   OSUS	'   US   S:  a  US   R                  SUS    S35        US   S:  a  US   R                  SUS    S35        US   S:  a  US   R                  SUS    S35        US	   S:  a  US   R                  S5        U$ ! [         a    US==   S-  ss'    GNdf = f)zm
Check for:
- Minimum/maximum sequence lengths
- Character encoding issues
- Repeated content
- Empty chunks
r   inf)total_samplesempty_samples
min_length
max_length
avg_lengthrepeated_contentencoding_issueswarningsr8   r   rM   rb   r   r   r   r   r   r   zFound z empty samplesz repeated samplesz encoding issues
   z-Some samples are very short (< 10 characters))rQ   floatset	enumerater)   encodeUnicodeEncodeErrorrU   rW   maxhashaddsum)
r   datasetstatsrt   text_lengths
seen_textsir8   length	text_hashs
             r   validate_dataset!TextPreprocessor.validate_dataset  s1    !\, ! 	
 U
 'GA3tzz|,1o&!+&.G$
 YF'"%eL&96"BE,"%eL&96"BE, TZZ\*I&()Q.)y)- (2 "%l"3c,6G"GE,',\':eEl'Jl#PQ ,
 !A%*$$veO.D-E^%TU#$q(*$$1233DE "#a'*$$0122BC #*$$%TUQ & .'(A-(.s   3GG)(G)r   N)	r   r   r   r   r   r   r   r   r   r   r   r   r   r      s     Gr   )osr   rg   rl   typingr   r   r   r   r   datasetsr   pathlibr	   __all__r!   r
   r   r   r   r   <module>r      s`    
 	  
 3 3     L L^h hr   