
    h                         S SK r S SKJr  S SKJr  S SKJr  S SKrS SK	r	S SK
Jr  S SKJr  \	R                  R                  R!                  \5      r\ " S S\	R&                  5      5       r " S	 S
\	R*                  5      rg)    N)	dataclass)StringIO)Optionalrequire_storage_cast)
table_castc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\	S'   Sr\\   \	S'   Sr\\	S	'   S
r\\	S'   Sr\\	S'   Srg)
TextConfig   zBuilderConfig for text files.Nfeatureszutf-8encodingencoding_errorsi   	chunksizeFkeep_linebreaksline	sample_by )__name__
__module____qualname____firstlineno____doc__r   r   datasetsFeatures__annotations__r   strr   r   intr   boolr   __static_attributes__r       ]/home/james-whalen/.local/lib/python3.13/site-packages/datasets/packaged_modules/text/text.pyr
   r
      sP    ',0Hhx(()0Hc%)OXc])Is!OT!Isr    r
   c                   d    \ rS rSr\rS rS rS\R                  S\R                  4S jr
S rSrg	)
Text   c                 R    [         R                  " U R                  R                  S9$ )N)r   )r   DatasetInfoconfigr   )selfs    r!   _info
Text._info   s    ##T[[-A-ABBr    c                    U R                   R                  (       d"  [        SU R                   R                   35      eSUR                  l        UR                  U R                   R                  5      n/ nUR                  5        Hc  u  pE[        U[        5      (       a  U/nU Vs/ s H  oaR                  U5      PM     nnUR                  [        R                  " USU0S95        Me     U$ s  snf )zThe `data_files` kwarg in load_dataset() can be a str, List[str], Dict[str,str], or Dict[str,List[str]].

If str or List[str], then the dataset returns only the 'train' split.
If dict, then keys should be from the `datasets.Split` enum.
z=At least one data file must be specified, but got data_files=Tfiles)name
gen_kwargs)r'   
data_files
ValueErrordownload_configextract_on_the_flydownload_and_extractitems
isinstancer   
iter_filesappendr   SplitGenerator)r(   
dl_managerr/   splits
split_namer,   files          r!   _split_generatorsText._split_generators"   s     {{%%\]a]h]h]s]s\tuvv8<
""544T[[5K5KL
!+!1!1!3J%%%=BCUT**40UECMM(11zwX]N^_`	 "4
  Ds   'C/pa_tablereturnc                    U R                   R                  by  U R                   R                  R                  n[        S U R                   R                  R	                  5        5       5      (       a  UR                  U5      nU$ [        X5      nU$ UR                  [        R                  " S[        R                  " 5       05      5      $ )Nc              3   B   #    U  H  n[        U5      (       + v   M     g 7f)Nr   ).0features     r!   	<genexpr>#Text._cast_table.<locals>.<genexpr>7   s     bDa+G444Das   text)
r'   r   arrow_schemaallvaluescastr   paschemastring)r(   r?   rM   s      r!   _cast_tableText._cast_table4   s    ;;+[[))66FbDKKDXDXD_D_Dabbb#==0 O &h7O==FBIIK+@!ABBr    c           
   #     #    U R                   R                  b  [        U R                   R                  5      OS/n[        [        R
                  R                  U5      5       GH  u  p4[        X@R                   R                  U R                   R                  S9 nU R                   R                  S:X  a  Sn UR                  U R                   R                  5      nU(       d  GO_XuR                  5       -  n[        U5      R                  5       nU R                   R                   (       d   U Vs/ s H  oR#                  S5      PM     nn[$        R&                  R)                  [$        R*                  " U5      /US9n	X64U R-                  U	5      4v   US-  nM  U R                   R                  S:X  Ga  SnS	n UR                  U R                   R                  5      n
U
(       d  OXz-  nXuR                  5       -  nUR/                  S
5      n[$        R&                  R)                  [$        R*                  " US S  Vs/ s H  o(       d  M  UPM     sn5      /US9n	X64U R-                  U	5      4v   US-  nUS   nM  U(       aJ  [$        R&                  R)                  [$        R*                  " U/5      /US9n	X64U R-                  U	5      4v   OrU R                   R                  S:X  aX  UR                  5       n[$        R&                  R)                  [$        R*                  " U/5      /US9n	X0R-                  U	5      4v   S S S 5        GM     g s  snf s  snf ! , (       d  f       GM  = f7f)NrG   )r   errorsr   r   
)names   	paragraph z

document)r'   r   list	enumerate	itertoolschainfrom_iterableopenr   r   r   readr   readliner   	readlinesr   rstriprL   Tablefrom_arraysarrayrO   split)r(   r,   pa_table_namesfile_idxr<   f	batch_idxbatchr   r?   	new_batchexamplerG   s                r!   _generate_tablesText._generate_tablesA   s    7;{{7K7K7Wdkk223^d]e'	(E(Ee(LMNHd[[%9%9$++B]B]^bc;;((F2 !I !t{{'<'< =$!- ( 9 9 ;#{{::CH$I54[[%65E$I#%88#7#7%8IQ_#7#`  (3T5E5Eh5OOO!Q	  [[**k9 !IE$%FF4;;+@+@$A	(!*- %F 3#%88#7#7XXeCRj&Tj7Gwj&TUV^l $8 $  (3T5E5Eh5OOO!Q	 %b	   #%88#7#75'9J8KSa#7#b'3T5E5Eh5OOO[[**j8668D!xx33RXXtf5E4Fn3]H"$4$4X$>>>W _^ N %J$ 'U; _^sE   BM2BM2MC6M
MMC1MM2
M
M/	)	M2r   N)r   r   r   r   r
   BUILDER_CONFIG_CLASSr)   r=   rL   rd   rO   ro   r   r   r    r!   r#   r#      s6    %C$CBHH C C/?r    r#   )r\   dataclassesr   ior   typingr   pyarrowrL   r   datasets.features.featuresr   datasets.tabler   utilslogging
get_loggerr   loggerBuilderConfigr
   ArrowBasedBuilderr#   r   r    r!   <module>r~      sn     !     ; % 
			*	*8	4 ''  T?8%% T?r    