
    hc}                     ^   S SK r S SKrS SKJr  S SKJr  S SKJrJr  S SK	J
r
JrJr  S SKrS SKJr  S SKJr  S SKJr  S S	KJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr   S
SK!J"r"J#r#J$r$J%r%J&r&  S
SK'J(r(  \\)\*\*4   \)\*   \)S   4   r+\*" \RX                  5      r-\R\                  " \/5      r0 " S S\*5      r1 " S S\25      r3Sr4\RX                  SS/\Rj                  / SQ\Rl                  / SQ0r7Sr8\Rr                  \Rt                  " S5      :  a	  SS /r;/ S!Qr<O1\Rr                  \Rt                  " S"5      :  a	  S#S /r;/ S$Qr<OS%S&/r;/ S'Qr<\RX                  \Rj                  \Rl                  /r=\= V VVs0 s H4  n U \7U     VVs/ s H  n\;  H  nUR}                  U\8S(9PM     M     snn_M6     snnn r?\= V VVs0 s H4  n U \7U     VVs/ s H  n\<  H  nUR}                  U\8S(9PM     M     snn_M6     snnn r@\RX                  S)/0rA\4/rB\@\?\A/rCS*rD/ S+QrES,\*S-\F4S. jrGS/\\H\I\*4   S-\H\*\\I\*   S04   4   4S1 jrJS2\*S,\*S-\F4S3 jrKS2\*S,\*S-\F4S4 jrLS5\
\*/\I\*   4   S-\H\*\I\*   4   4S6 jrM  SHS,\*S7\*S8\\I\*      S9\\   S-\I\*   4
S: jjrNSIS7\*S9\\   S-\H\*\I\*   4   4S; jjrO SIS<\*S9\\   S-\+4S= jjrP  SHS>\I\*   S9\\   S?\\Q   S-\I\+   4S@ jjrR " SA S0\I\*   5      rS " SB SC\H\*\S4   5      rT " SD SE\I\*   5      rU " SF SG\H\*\U4   5      rVgs  snnf s  snnn f s  snnf s  snnn f )J    N)partial)	has_magic)PathPurePath)CallableOptionalUnion)	url_to_fs)HfFileSystem)version)
thread_map   )config)DownloadConfig)	_split_re)Split)logging)tqdm)!_prepare_path_and_storage_optionsis_local_pathis_relative_path	xbasenamexjoin)string_to_dict c                       \ rS rSrSrg)Url!   r   N__name__
__module____qualname____firstlineno____static_attributes__r       M/home/james-whalen/.local/lib/python3.13/site-packages/datasets/data_files.pyr   r   !       r%   r   c                       \ rS rSrSrg)EmptyDatasetError%   r   Nr   r   r%   r&   r)   r)   %   r'   r%   r)   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*traintraining)
validationvaliddevval)testtestingeval
evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z{keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**)keywordsepz**z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonpatternreturnc                 6   ^  [        U 4S j[         5       5      $ )Nc              3   ,   >#    U  H	  oT;   v   M     g 7fNr   ).0wildcard_characterr7   s     r&   	<genexpr>%contains_wildcards.<locals>.<genexpr>s   s     [GZ1CW,GZs   )anyWILDCARD_CHARACTERS)r7   s   `r&   contains_wildcardsrB   r   s    [GZ[[[r%   patternsDataFilesListc           	         [        U [        5      (       aG  U R                  5        VVs0 s H)  u  p[        U5      [        U[        5      (       a  UOU/_M+     snn$ [        U [        5      (       a	  [
        U /0$ [        U [        5      (       Ga  [        S U  5       5      (       a  U  He  n[        U[        5      (       aA  [        U5      S:X  a2  SU;   a,  [        UR                  S5      [        [        45      (       a  MY  [        SU 35      e   U  Vs/ s H  o3S   PM	     nn[        [        U5      5      [        U5      :w  a  [        SU 35      eU  Vs0 s H3  n[        US   5      [        US   [        5      (       a  US   OUS   /_M5     sn$ [
        U 0$ [        [	        U 5      5      $ s  snnf s  snf s  snf )a  
Take the data_files patterns from the user, and format them into a dictionary.
Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
The default split is "train".

Returns:
    patterns: dictionary of split_name -> list of patterns
c              3   B   #    U  H  n[        U[        5      v   M     g 7fr;   )
isinstancedict)r<   r7   s     r&   r>   $sanitize_patterns.<locals>.<genexpr>   s     AWz'4((s      splitpathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got z*Some splits are duplicated in data_files: )rG   rH   itemsstrlistSANITIZED_DEFAULT_SPLITr@   lenget
ValueErrorsetsanitize_patterns)rC   keyvaluer7   splitss        r&   rU   rU   v   s    (D!!ZbZhZhZjkZjJCC:eT#:#:%GZjkk	Hc	"	"'(44	Hd	#	#AAAA#w--G)7*"7;;v#6dDD$wx  xA  B  $ 7??h7g&hF?3v;3v;. #MfX!VWW  ('G GG$%*WV_VZ:[:[wvbijpbqarr' 
 ,X66 h003 l @s   0F3F9:F>matched_rel_pathc                 \   [        U 5      R                  R                   Vs/ s H  o"R                  S5      (       d  M  UPM     nn[        U5      R                  R                   Vs/ s H  o"R                  S5      (       d  M  UPM     nn[	        U5      [	        U5      :g  $ s  snf s  snf )u  
When a path matches a pattern, we additionally check if it's inside a special directory
we ignore by default (if it starts with a double underscore).

Users can still explicitly request a filepath inside such a directory if "__pycache__" is
mentioned explicitly in the requested pattern.

Some examples:

base directory:

    ./
    └── __pycache__
        └── b.txt

>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
True
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
True
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
False
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
False
__)r   parentparts
startswithrQ   )rY   r7   partdata_dirs_to_ignore_in_pathdata_dirs_to_ignore_in_patterns        r&   "_is_inside_unrequested_special_dirrb      s    8 5==M4N4U4U4[4["u4[D_n_nos_t44["u7?7H7O7O7U7U%o7UtYhYhimYnd7U"%o*+s3Q/RRR #v%os   B$B$(B)B)c                 |   [        U 5      R                   Vs/ s H.  o"R                  S5      (       d  M  [        U5      S1:X  a  M,  UPM0     nn[        U5      R                   Vs/ s H.  o"R                  S5      (       d  M  [        U5      S1:X  a  M,  UPM0     nn[	        U5      [	        U5      :g  $ s  snf s  snf )u  
When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

Users can still explicitly request a filepath that is hidden or is inside a hidden directory
if the hidden part is mentioned explicitly in the requested pattern.

Some examples:

base directory:

    ./
    └── .hidden_file.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
False

base directory:

    ./
    └── .hidden_dir
        └── a.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
False
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
False

base directory:

    ./
    └── .hidden_dir
        └── .hidden_file.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
False
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
False
.)r   r]   r^   rT   rQ   )rY   r7   r_   hidden_directories_in_pathhidden_directories_in_patterns        r&   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirrg      s    l ""2399"9__S=QZ]^bZchkglZl9  " "'*00%0OOC4HQTUYQZ_b^cQc0 " % )*c2O.PPP"%s"   B4B4B4&B9B9B9pattern_resolverc                    [          GH  nUR                  SS5      n U " U5      n[        U5      S:  d  M0  [	        5       nU H:  n[        [        U5      [        U5      5      nUc   eUR                  US   5        M<     [        S U 5       5      (       a  [        S[         SU S35      e[         Vs/ s H  owU;   d  M
  [        U5      PM     sn[        U[         Vs1 s H  n[        U5      iM     sn-
  5      -   nU Vs0 s H  owUR                  US	9/_M     sns  $    [         Hq  n	/ n
U	R!                  5        H:  u  p{U H/  n U " U5      n[        U5      S:  d  M  U
R#                  U5          M8     M<     U
(       d  M\  U
 Vs0 s H  owX   _M	     sns  $    [        S
W SU  35      e! [         a     GM  f = fs  snf s  snf s  snf ! [         a     M  f = fs  snf )a  
Get the default pattern from a directory or repository by testing all the supported patterns.
The first patterns to return a non-empty list of data files is returned.

In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
z{split}*r   rK   c              3   b   #    U  H%  n[         R                  " [        U5      (       + v   M'     g 7fr;   )rematchr   )r<   rK   s     r&   r>   +_get_data_files_patterns.<locals>.<genexpr>  s!     Fverxx	5111vs   -/zSplit name should match 'z'' but got 'z'.)rK   zCouldn't resolve pattern z with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorrQ   rT   r   r   addr@   rS   r   DEFAULT_SPLITSrN   sortedformatALL_DEFAULT_PATTERNSrM   append)rh   split_patternr7   
data_filesrX   pp_partsrK   sorted_splitspatterns_dictnon_empty_splitsrC   s               r&   _get_data_files_patternsr      s    ,+''	37	)'2J z?Q"uF(1y7OP***

77+,  
 FvFFF #<YK|TZS[[]!^__5CW^EPVZSZ^WZ`.A.#e*.AA[ M MZZM5M00u0=>>MZZ% ,( .,224OE#!1'!:J z?Q&$++E2 $  5 =MN=ME=//=MNN . 7yP`Oab
cc; ! 		 XAZ )  OsA   F36	GG"G
GGG%3
GG
G"	!G"		base_pathallowed_extensionsdownload_configc           	      d  ^ [        U 5      (       a  [        X5      n OF[        U 5      (       a4  [        R                  R                  U 5      S   [        R                  -   nOSn[        XS9u  p[        U 40 UD6u  pV[        [        5      [        U 5      1-
  n[        UR                  [        5      (       a  UR                  OUR                  S   nUS:w  a  US-   OSn	0 n
US:X  a  SU
S'   UR                  " U 4S	S
0U
D6R!                  5        VVs/ s H  u  pUS   S:X  d[  UR#                  S5      (       d  M&  [        R                  R%                  [        R                  R'                  U5      5      (       d  Mi  [        U5      U;  d  Mz  [)        X5      (       a  M  [+        X5      (       a  M  UR-                  U	5      (       a  UOX-   PM     nnnTb  U Vs/ s H<  n[/        U4S j[        U5      R1                  S5      SS  5       5      (       d  M:  UPM>     nn[3        U5      [3        U5      :  a;  [5        [        U5      [        U5      -
  5      n[6        R9                  SU  SU 35        OUnU(       d%  SU  S3nTb  US[5        T5       3-  n[;        U5      eU$ s  snnf s  snf )aY  
Resolve the paths and URLs of the data files from the pattern passed by the user.

You can use patterns to resolve multiple local files. Here are a few examples:
- *.csv to match all the CSV files at the first level
- **.csv to match all the CSV files at any level
- data/* to match all the files inside "data"
- data/** to match all the files inside "data" and its subdirectories

The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
other than a forward slash /.

More generally:
- '*' matches any character except a forward-slash (to match just the file or directory name)
- '**' matches any character including a forward-slash /

Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
The same applies to special directories that start with a double underscore like "__pycache__".
You can still include one if the pattern explicitly mentions it:
- to include a hidden file: "*/.hidden.txt" or "*/.*"
- to include a hidden directory: ".hidden/*" or ".*/*"
- to include a special directory: "__special__/*" or "__*/*"

Example::

    >>> from datasets.data_files import resolve_pattern
    >>> base_path = "."
    >>> resolve_pattern("docs/**/*.py", base_path)
    [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

Args:
    pattern (str): Unix pattern or paths or URLs of the data files to resolve.
        The paths can be absolute or relative to base_path.
        Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
    base_path (str): Base path to use when resolving relative paths.
    allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
        For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
    download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters.
Returns:
    List[str]: List of paths or URLs to the local or remote files that match the patterns.
r    r   filez://hfFexpand_infodetailTtypeislinkNc              3   4   >#    U  H  nS U-   T;   v   M     g7f)rd   Nr   )r<   suffixr   s     r&   r>   "resolve_pattern.<locals>.<genexpr>r  s     gDf&3<#55Dfs   rd   r   z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   osrL   
splitdriver6   r   r
   rT   FILES_TO_IGNOREr   rG   protocolrN   globrM   rR   isfilerealpathrb   rg   r^   r@   rK   rQ   rO   loggerinforq   )r7   r   r   r   storage_optionsfs
fs_patternfiles_to_ignorer   protocol_prefixglob_kwargsfilepathr   matched_pathsoutinvalid_matched_files	error_msgs     `              r&   resolve_patternr   '  s   `   	+	w		GG&&w/2RVV;		@jGw:/:NB/*i.@-AAO(c::r{{AH*2f*<h&"OK4%*M" !gggJdJkJPPRRNHLF"txx'9 	Y>@ggnnRWWM]M]^fMg>h 	Y x 7 	Y 38H	 	Y
 PPXe 	YH''88o>XXR   % *
)gIhDWD]D]^aDbcdceDfgg ) 	 

 s8c-(($(]);c#h)F$G!KK27);hi~h  A &wiq1	)9$?Q:R9STTI	**J5
s0   %J'<?J'?J'J'"J'4 J'9J-J-c                 r    [        [        XS9n [        U5      $ ! [         a    [	        SU  S35      Sef = f)uI	  
Get the default pattern from a directory testing all the supported patterns.
The first patterns to return a non-empty list of data files is returned.

Some examples of supported patterns:

Input:

    my_dataset_repository/
    ├── README.md
    └── dataset.csv

Output:

    {'train': ['**']}

Input:

    my_dataset_repository/
    ├── README.md
    ├── train.csv
    └── test.csv

    my_dataset_repository/
    ├── README.md
    └── data/
        ├── train.csv
        └── test.csv

    my_dataset_repository/
    ├── README.md
    ├── train_0.csv
    ├── train_1.csv
    ├── train_2.csv
    ├── train_3.csv
    ├── test_0.csv
    └── test_1.csv

Output:

    {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
     'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

Input:

    my_dataset_repository/
    ├── README.md
    └── data/
        ├── train/
        │   ├── shard_0.csv
        │   ├── shard_1.csv
        │   ├── shard_2.csv
        │   └── shard_3.csv
        └── test/
            ├── shard_0.csv
            └── shard_1.csv

Output:

    {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
     'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

Input:

    my_dataset_repository/
    ├── README.md
    └── data/
        ├── train-00000-of-00003.csv
        ├── train-00001-of-00003.csv
        ├── train-00002-of-00003.csv
        ├── test-00000-of-00001.csv
        ├── random-00000-of-00003.csv
        ├── random-00001-of-00003.csv
        └── random-00002-of-00003.csv

Output:

    {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
     'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
     'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
)r   r   zThe directory at z doesn't contain any data filesN)r   r   r   rq   r)   )r   r   resolvers      r&   get_data_patternsr     sL    h )]Hj'11 j"3I;>] ^_eiijs   
 6	data_filec                 R   [        XS9u  p[        U 40 UD6tp4[        U[        5      (       a)  UR	                  U 5      nUR
                  UR                  4$ U R                  [        R                  5      (       a~  [        [        R                  UR                  S9nSU [        [        R                  5      S-   S  R                  SSS5      -   n UR	                  U 5      nUR
                  UR                  4$ UR                  U 5      nS H  nX;   d  M
  [        Xx   5      4s  $    g)	Nr   )endpointtokenhf://r   z	/resolve/@)ETagetagmtimer   )r   r
   rG   r   resolve_pathrepo_idrevisionr^   r   HF_ENDPOINTr   rQ   rp   r   rN   )	r   r   r   r   _resolved_pathhffsr   rV   s	            r&   _get_single_origin_metadatar     s    "C9!nIy4O4FB"l##	2$$m&<&<<<			f00	1	1V%7%7?T?TUiF,>,>(?!(C(EFNN{\_abcc	)))4$$m&<&<<<779D(;	N$$ ) r%   ry   max_workersc           
      :   Ub  UO[         R                  n[        S U  5       5      (       a8  [        U S[	        U 5      S:*  =(       d    S S9 Vs/ s H  n[        X1S9PM     sn$ [        [        [
        US9U U[        S[	        U 5      S:*  =(       d    S S9$ s  snf )Nc              3   ,   #    U  H
  nS U;   v   M     g7f)r   Nr   )r<   r   s     r&   r>   '_get_origin_metadata.<locals>.<genexpr>  s     
<I7is   zResolving data files   )descdisabler   )r   
tqdm_classr   r   )r   &HF_DATASETS_MULTITHREADING_MAX_WORKERSallhf_tqdmrQ   r   r   r   )ry   r   r   r   s       r&   _get_origin_metadatar     s    
 "-!8+f>k>kK

<
<<<
 %+J2-5	
	 (	S
 	
 +_M#J2%- 
s   Bc                     ^  \ rS rSrSrS\\   S\\   SS4U 4S jjrSS jr	\
   SS	\\   S
\R                  R                  S\\   S\\\      S\\   SS 4S jj5       r\
   SS	\\   S\\   S\\\      S\\   SS 4
S jj5       r\
   SS	\\   S\\   S\\\      S\\   SS 4
S jj5       rSSS.S\\\      S\\\      SS 4S jjrSrU =r$ )rD   i  a  
List of data files (absolute local paths or URLs).
It has two construction methods given the user's data files patterns:
- ``from_hf_repo``: resolve patterns inside a dataset repository
- ``from_local_or_remote``: resolve patterns from a local path

Moreover, DataFilesList has an additional attribute ``origin_metadata``.
It can store:
- the last modified time of local files
- ETag of remote files
- commit sha of a dataset repository

Thanks to this additional attribute, it is possible to hash the list
and get a different hash if and only if at least one file changed.
This is useful for caching Dataset objects that are obtained from a list of data files.
ry   origin_metadatar8   Nc                 0   > [         TU ]  U5        X l        g r;   )super__init__r   )selfry   r   	__class__s      r&   r   DataFilesList.__init__#  s    $.r%   c                 P    [        / U QUQU R                  UR                  -   5      $ r;   )rD   r   r   others     r&   __add__DataFilesList.__add__'  s(    _t_e_d.B.BUEZEZ.Z[[r%   rC   dataset_infor   r   r   c                     SUR                    SUR                   SU=(       d    S 3R                  S5      nU R                  XXES9$ )Nzhf://datasets/r   /r   r   r   r   )idsharstripfrom_patterns)clsrC   r   r   r   r   s         r&   from_hf_repoDataFilesList.from_hf_repo*  sX     %\__$5Q|7G7G6H)/WYIZ[bbcfg	  >P ! 
 	
r%   c                 x    Ub  UO%[        5       R                  5       R                  5       nU R                  XX4S9$ Nr   )r   resolveas_posixr   )r   rC   r   r   r   s        r&   from_local_or_remote"DataFilesList.from_local_or_remote8  s@     "+!6IDFNN<L<U<U<W	  >P ! 
 	
r%   c           
         Ub  UO%[        5       R                  5       R                  5       n/ nU H  n UR                  [	        UUUUS95        M!     [        XTS9nU " XW5      $ ! [
         a    [        U5      (       d  e  MR  f = fNr   r   )r   r   r   extendr   rq   r   r   )r   rC   r   r   r   ry   r7   r   s           r&   r   DataFilesList.from_patternsE  s     "+!6IDFNN<L<U<U<W	
G!!#"++=(7	   /z[://	 %  )) *s   A$$BB
extensions
file_namesr   r   c          	        ^ / nU(       aA  SR                  S U 5       5      nUR                  [        R                  " SU S35      5        U(       aA  SR                  S U 5       5      nUR                  [        R                  " SU S35      5        U(       aB  [	        U  V^s/ s H!  m[        U4S jU 5       5      (       d  M  TPM#     snU R                  S	9$ [	        [        U 5      U R                  S	9$ s  snf )
N|c              3   N   #    U  H  n[         R                  " U5      v   M     g 7fr;   rl   escape)r<   exts     r&   r>   'DataFilesList.filter.<locals>.<genexpr>d  s     "HZc299S>>Z   #%z.*(z	)(\..+)?$c              3   N   #    U  H  n[         R                  " U5      v   M     g 7fr;   r   )r<   fns     r&   r>   r   g  s     !E*B"))B--*r   z.*[\/]?(z)$c              3   D   >#    U  H  oR                  T5      v   M     g 7fr;   )rm   )r<   r7   r   s     r&   r>   r   k  s     7i`hU\i8P8P`hs    r   )joinrw   rl   compilerD   r@   r   rO   )r   r   r   rC   ext_pattern
fn_patternr   s         `r&   filterDataFilesList.filter_  s     (("HZ"HHKOOBJJ[M'DEF!E*!EEJOOBJJ(:,b'ABC ,0jDyC7i`h7i4iDj $ 4 4 
 !dT=Q=QRR	 ks   %C:C:r   )r   rD   r8   rD   NNN)r    r!   r"   r#   __doc__rO   rN   SingleOriginMetadatar   r   classmethodhuggingface_hubhf_apiDatasetInfor   r   r   r   r   r   r$   __classcell__r   s   @r&   rD   rD     s   "/49 /tDX?Y /^b /\ 
 $(2648
s)
 &,,88
 C=	

 %T#Y/
 ".1
 

 
  $(2648

s)

 C=

 %T#Y/	


 ".1

 


 

  $(26480s)0 C=0 %T#Y/	0
 ".10 
0 04 48[_S%d3i0SEMdSViEXS	S Sr%   c                      \ rS rSrSr\   SS\\\\	\   \
4   4   S\\   S\\	\      S\\   SS 4
S	 jj5       r\   SS\\\\	\   \
4   4   S
\R                  R                   S\\   S\\	\      S\\   SS 4S jj5       r\   SS\\\\	\   \
4   4   S\\   S\\	\      S\\   SS 4
S jj5       rSSS.S\\	\      S\\	\      SS 4S jjrSrg)DataFilesDictir  as  
Dict of split_name -> list of data files (absolute local paths or URLs).
It has two construction methods given the user's data files patterns :
- ``from_hf_repo``: resolve patterns inside a dataset repository
- ``from_local_or_remote``: resolve patterns from a local path

Moreover, each list is a DataFilesList. It is possible to hash the dictionary
and get a different hash if and only if at least one file changed.
For more info, see [`DataFilesList`].

This is useful for caching Dataset objects that are obtained from a list of data files.

Changing the order of the keys of this dictionary also doesn't change its hash.
NrC   r   r   r   r8   c                     U " 5       nUR                  5        H4  u  pg[        U[        5      (       a  UO[        R                  UUUUS9XV'   M6     U$ r   )rM   rG   rD   r   r   rC   r   r   r   r   rV   patterns_for_keys           r&   r   "DataFilesDict.from_local_or_remote  sa     e%-^^%5!C .>> !"77$''9$3	 8  H &6 
r%   r   c           	          U " 5       nUR                  5        H5  u  px[        U[        5      (       a  UO[        R                  UUUUUS9Xg'   M7     U$ )N)r   r   r   r   )rM   rG   rD   r   )	r   rC   r   r   r   r   r   rV   r	  s	            r&   r   DataFilesDict.from_hf_repo  sd     e%-^^%5!C .>> !"//$!-''9$3 0  H &6 
r%   c                     U " 5       nUR                  5        H4  u  pg[        U[        5      (       a  UO[        R                  UUUUS9XV'   M6     U$ r   )rM   rG   rD   r   r  s           r&   r   DataFilesDict.from_patterns  sa     e%-^^%5!C .>> !"00$''9$3	 1  H &6 
r%   r   r   r   c                z    [        U 5      " 5       nU R                  5        H  u  pEUR                  XS9X4'   M     U$ )Nr   )r   rM   r   )r   r   r   r   rV   data_files_lists         r&   r   DataFilesDict.filter  s<     4jl$(JJL C&---[CH %1
r%   r   r   )r    r!   r"   r#   r   r   rH   rN   r	   rO   rD   r   r   r   r   r  r  r   r   r   r$   r   r%   r&   r  r  r  s     $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 * 
 $(2648sE$s)]":;;< &,,88 C=	
 %T#Y/ ".1 
 .  $(2648sE$s)]":;;< C= %T#Y/	
 ".1 
 , 48[_%d3i0EMdSViEX	 r%   r  c                      ^  \ rS rSrSrS\\   S\\\\         4U 4S jjrS r	\
 SS\\   S\\\      SS 4S jj5       r SS	\S
\\   SS4S jjrS\\   SS 4S jrSrU =r$ )DataFilesPatternsListi  z
List of data files patterns (absolute local paths or URLs).
For each pattern there should also be a list of allowed extensions
to keep, or a None ot keep all the files for the pattern.
rC   r   c                 0   > [         TU ]  U5        X l        g r;   )r   r   r   )r   rC   r   r   s      r&   r   DataFilesPatternsList.__init__  s    
 	""4r%   c                 P    [        / U QUQU R                  UR                  -   5      $ r;   )rD   r   r   s     r&   r   DataFilesPatternsList.__add__  s(    _t_e_d.E.EH`H`.`aar%   r8   c                 ,    U " X/[        U5      -  5      $ r;   )rQ   )r   rC   r   s      r&   r   #DataFilesPatternsList.from_patterns  s     81CMABBr%   r   r   rD   c           
      <   Ub  UO%[        5       R                  5       R                  5       n/ n[        X R                  5       H!  u  pE UR                  [        UUUUS95        M#     [        X2S9n[        X65      $ ! [         a    [        U5      (       d  e  MW  f = fr   )r   r   r   zipr   r   r   rq   r   r   rD   )r   r   r   ry   r7   r   r   s          r&   r   DataFilesPatternsList.resolve  s    
 "+!6IDFNN<L<U<U<W	
+.t5L5L+M'G!!#"++=(7	 ,N /z[Z99	 %  )) *s   	A<<BBr   c                 \    [        X R                   Vs/ s H  o"U-   PM	     sn5      $ s  snf r;   )r  r   )r   r   r   s      r&   filter_extensions'DataFilesPatternsList.filter_extensions  s0    $MdMdeMd7I
2Mde
 	
es   )
r   r;   )r    r!   r"   r#   r   rO   rN   r   r   r   r   r   r   r   r  r$   r  r  s   @r&   r  r    s    5s)5 !$s)!455b LPCCyC6>tCy6IC	 C C 59:: ".1: 
	:.
DI 
:Q 
 
r%   r  c                       \ rS rSrSr\ SS\\\\   4   S\	\\      SS 4S jj5       r
 SS\S	\	\   SS
4S jjrS\\   SS 4S jrSrg)DataFilesPatternsDicti  zS
Dict of split_name -> list of data files patterns (absolute local paths or URLs).
NrC   r   r8   c                     U " 5       nUR                  5        H2  u  pE[        U[        5      (       a  UO[        R                  UUS9X4'   M4     U$ )Nr   )rM   rG   r  r   )r   rC   r   r   rV   r	  s         r&   r   #DataFilesPatternsDict.from_patterns	  s\     e%-^^%5!C .0EFF !*88$'9 9  H &6 
r%   r   r   r  c                 r    [        5       nU R                  5        H  u  pEUR                  X5      X4'   M     U$ r;   )r  rM   r   )r   r   r   r   rV   data_files_patterns_lists         r&   r   DataFilesPatternsDict.resolve  s5    
 o-1ZZ\)C/77	SCH .:
r%   r   c                 ~    [        U 5      " 5       nU R                  5        H  u  p4UR                  U5      X#'   M     U$ r;   )r   rM   r  )r   r   r   rV   r&  s        r&   r  'DataFilesPatternsDict.filter_extensions#  s7    4jl-1ZZ\)C/AA*MCH .:
r%   r   r;   )r    r!   r"   r#   r   r   rH   rN   rO   r   r   r   r   r  r$   r   r%   r&   r"  r"    s     W[CcN+AI$s)AT	  $ 59 ".1 
	DI :Q r%   r"  )NNr;   )Wr   rl   	functoolsr   r   r   pathlibr   r   typingr   r   r	   r   fsspec.corer
   r   	packagingr   tqdm.contrib.concurrentr   r   r   downloadr   namingr   rX   r   utilsr   r   r   utils.file_utilsr   r   r   r   r   utils.py_utilsr   tuplerN   r   TRAINrP   
get_loggerr    r   r   rq   r)   SPLIT_PATTERN_SHARDED
VALIDATIONTESTSPLIT_KEYWORDSNON_WORDS_CHARSFSSPEC_VERSIONparse"KEYWORDS_IN_FILENAME_BASE_PATTERNS"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSrs   ru   "DEFAULT_PATTERNS_SPLIT_IN_FILENAME"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAMEDEFAULT_PATTERNS_ALLro   rv   rA   r   boolrB   rH   rO   rU   rb   rg   r   r   r   r   intr   rD   r  r  r"  )rK   r5   r7   s   000r&   <module>rF     s   	 	   " , ,  ! (  .  $    " r r * U38_eCj%)CD  ekk*  
		H	%	# 		) 	 a  
KK':&	;	JJ9
 	7==44*GI\)]&*& W]];77*IK^)_&*& +AB`)a&*& ++u//<  &   
%e,,G9G 	wO<9 	=, 
  & "  &   
%e,,G9G 	wO<9 	=, 
  & " 
KK$  ,, && 
  \ \ \#1dD#o 6 #14U4PS9VeKeEf@f;g #1LS Ss St SB;QVY ;Qdg ;Qlp ;Q|)dxtCy8H/I )ddSVX\]`XaSaNb )d^ /304	YYY !c+Y n-	Y
 
#YYxXj Xjx7O Xj[_`ceijmen`n[o Xjz 15n- 0 15!%S	n- # 

	:^SDI ^SBZDm+, Zz2
DI 2
j#D&;!;< #q&&s0   ;L
$L.LL($L"4L(L"L(