
    h7#                        S SK r S SKJr  S SKJrJrJr  S SKrS SK	J
r  S SKJr  S SKrS SKJr  \R$                  R&                  R)                  \5      r\ " S S\R.                  5      5       r " S S\R2                  5      rg)	    N)	dataclass)LiteralOptionalUnion)
table_castc                     ^  \ rS rSr% SrSr\\   \S'   Sr	\\
\      \S'   Sr\\R                     \S'   Sr\\\R$                  \
\   \
\
\      4      \S'   Sr\\R*                     \S'   S	r\S
   \S'   U 4S jrSrU =r$ )ParquetConfig   a	  
BuilderConfig for Parquet.

Args:
    batch_size (`int`, *optional*):
        Size of the RecordBatches to iterate on.
        The default is the row group size (defined by the first row group).
    columns (`list[str]`, *optional*)
        List of columns to load, the other ones are ignored.
        All columns are loaded by default.
    features: (`Features`, *optional*):
        Cast the data to `features`.
    filters (`Union[pyarrow.dataset.Expression, list[tuple], list[list[tuple]]]`, *optional*):
        Return only the rows matching the filter.
        If possible the predicate will be pushed down to exploit the partition information
        or internal metadata found in the data source, e.g. Parquet statistics.
        Otherwise filters the loaded RecordBatches before yielding them.
    fragment_scan_options (`pyarrow.dataset.ParquetFragmentScanOptions`, *optional*)
        Scan-specific options for Parquet fragments.
        This is especially useful to configure buffering and caching.

        <Added version="4.2.0"/>
    on_bad_files (`Literal["error", "warn", "skip"]`, *optional*, defaults to "error")
        Specify what to do upon encountering a bad file (a file that can't be read). Allowed values are :
        * 'error', raise an Exception when a bad file is encountered.
        * 'warn', raise a warning when a bad file is encountered and skip that file.
        * 'skip', skip bad files without raising or warning when they are encountered.

        <Added version="4.2.0"/>

Example:

Load a subset of columns:

```python
>>> ds = load_dataset(parquet_dataset_id, columns=["col_0", "col_1"])
```

Stream data and efficiently filter data, possibly skipping entire files or row groups:

```python
>>> filters = [("col_0", "==", 0)]
>>> ds = load_dataset(parquet_dataset_id, streaming=True, filters=filters)
```

Increase the minimum request size when streaming from 32MiB (default) to 128MiB and enable prefetching:

```python
>>> import pyarrow
>>> import pyarrow.dataset
>>> fragment_scan_options = pyarrow.dataset.ParquetFragmentScanOptions(
...     cache_options=pyarrow.CacheOptions(
...         prefetch_limit=1,
...         range_size_limit=128 << 20
...     ),
... )
>>> ds = load_dataset(parquet_dataset_id, streaming=True, fragment_scan_options=fragment_scan_options)
```

N
batch_sizecolumnsfeaturesfiltersfragment_scan_optionserror)r   warnskipon_bad_filesc                 "   > [         TU ]  5         g N)super__post_init__)self	__class__s    c/home/james-whalen/.local/lib/python3.13/site-packages/datasets/packaged_modules/parquet/parquet.pyr   ParquetConfig.__post_init__V   s         )__name__
__module____qualname____firstlineno____doc__r   r   int__annotations__r   liststrr   datasetsFeaturesr   r   ds
Expressiontupler   ParquetFragmentScanOptionsr   r   r   __static_attributes____classcell__)r   s   @r   r	   r	      s    ;z !%J$#'GXd3i ',0Hhx(()0NRGXeBMM4;T%[8IIJKREI8B$A$ABI5<L'12<   r   r	   c                   d    \ rS rSr\rS rS rS\R                  S\R                  4S jr
S rSrg	)
ParquetZ   c                    U R                   R                  b  U R                   R                  by  [        U R                   R                  5      [        U R                   R                  5      :w  a9  [	        SU R                   R                   SU R                   R                   35      e[
        R                  " U R                   R                  S9$ )NzIThe columns and features argument must contain the same columns, but got z and )r   )configr   r   set
ValueErrorr'   DatasetInfo)r   s    r   _infoParquet._info]   s    KK+$$0DKK''(C0D0D,EE[;;&&'uT[[-A-A,BC  ##T[[-A-ABBr   c                 @   U R                   R                  (       d"  [        SU R                   R                   35      eSUR                  l        UR                  U R                   R                  5      n/ nUR                  5        GH3  u  pE[        U[        5      (       a  U/nU Vs/ s H  oaR                  U5      PM     nnU R                  R                  c  [        R                  R                  U5       H\  n [        US5       n[         R"                  R%                  [&        R(                  " U5      5      U R                  l         SSS5          O   U R                  R                  c"  [        SU R                   R                   35      eUR=                  [         R>                  " USU0S95        GM6     U R                   R@                  b  [C        U R                   R@                  5      [C        U R                  R                  5      :w  aw  [         R"                  " U R                  R                  R                  5        V	V
s0 s H"  u  pXR                   R@                  ;   d  M   X_M$     sn
n	5      U R                  l        U$ s  snf ! , (       d  f       GM  = f! [*        R,                   a  nU R                   R.                  S:X  a2  [0        R3                  SU S[5        U5      R6                   SU 35        e U R                   R.                  S	:X  a9  [0        R9                  S
U S[5        U5      R6                   SU S35         SnAGM_  [0        R;                  S
U S[5        U5      R6                   SU S35         SnAGM  SnAff = fs  sn
n	f )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=TNrbr   zFailed to read schema from '' with error : r   zSkipping bad schema from ''. `zPAt least one valid data file must be specified, all the data_files are invalid: files)name
gen_kwargs)"r3   
data_filesr5   download_configextract_on_the_flydownload_and_extractitems
isinstancer&   
iter_filesinfor   	itertoolschainfrom_iterableopenr'   r(   from_arrow_schemapqread_schemapaArrowInvalidr   loggerr   typer   warningdebugappendSplitGeneratorr   r4   )r   
dl_managerrB   splits
split_namer?   filefecolfeats              r   _split_generatorsParquet._split_generatorsi   s   {{%%\]a]h]h]s]s\tuvv8<
""544T[[5K5KL
!+!1!1!3J%%%=BCUT**40UECyy!!)%OO99%@Di!$-191B1B1T1TUWUcUcdeUf1gDII.! .- A yy!!) fgkgrgrg}g}f~  MM(11zwX]N^_`1 "42 ;;*s4;;3F3F/G3tyyOaOaKb/b!)!2!2,0II,>,>,D,D,Fe,Fys#Q\Q\QdQdJd,Fe"DII 3 D
 .- ?? i;;33w>"LL+Gv][_`a[b[k[kZllnopnq)rs!![[55?"NN-GvSQUVWQXQaQaPbbdefdggh+ijj"LL+EdV3tTUwO_O_N``bcdbeef)ghhi fsP   (J>J&
AJJ&N
/N

J#J&#J&&N:BN2NNpa_tablereturnc                     U R                   R                  b)  [        XR                   R                  R                  5      nU$ r   )rI   r   r   arrow_schema)r   rc   s     r   _cast_tableParquet._cast_table   s3    99) "(II,>,>,K,KLHr   c              #     #    U R                   R                  b  U R                   R                  b  [        S U R                  R                  R
                   5       5      [        U R                   R                  5      :w  a:  [        SU R                   R                   SU R                  R                   S35      e[        U R                   R                  [        5      (       a*  [        R                  " U R                   R                  5      OU R                   R                  n[        R                  " U R                   R                  S9n[        [         R"                  R%                  U5      5       H  u  pE ['        US5       nUR)                  U5      nUR*                  (       a  U R                   R,                  =(       d    UR*                  S   R.                  n[        UR1                  UU R                   R                  USSS95       H?  u  p[2        R4                  R7                  U
/5      nU S	U	 3U R9                  U5      4v   MA     S S S 5        M     g ! , (       d  f       M  = f! [2        R:                  [        4 a  nU R                   R<                  S
:X  a2  [>        RA                  SU S[C        U5      RD                   SU 35        e U R                   R<                  S:X  a9  [>        RG                  SU S[C        U5      RD                   SU S35         S nAGM  [>        RI                  SU S[C        U5      RD                   SU S35         S nAGM  S nAff = f7f)Nc              3   8   #    U  H  oR                   v   M     g 7fr   )r@   ).0fields     r   	<genexpr>+Parquet._generate_tables.<locals>.<genexpr>   s     N.MUjj.Ms   z)Tried to load parquet data with columns 'z' with mismatching features '')default_fragment_scan_optionsr:   r   )r   r   filterbatch_readaheadfragment_readahead_r   zFailed to read file 'r;   r<   r   zSkipping bad file 'r=   r>   )%r3   r   r   sortedrI   rf   r5   rG   r   r%   rO   filters_to_expressionr)   ParquetFileFormatr   	enumeraterJ   rK   rL   rM   make_fragment
row_groupsr   num_rows
to_batchesrQ   Tablefrom_batchesrg   rR   r   rS   r   rT   r   rU   rV   )r   r?   filter_exprparquet_file_formatfile_idxr\   r]   parquet_fragmentr   	batch_idxrecord_batchrc   r^   s                r   _generate_tablesParquet._generate_tables   s    ;;+0C0C0ONdii.@.@.M.MNNRXY]YdYdYlYlRmm ?@S@S?TTqrvr{r{  sE  sE  rF  FG  H 
 $++--t44 $$T[[%8%89$$ 	
 !22QUQ\Q\QrQrs'	(E(Ee(LMNHZ$%':'H'H'K$'22%)[[%;%;%f?O?Z?Z[\?]?f?f
7@,77+5(,(;(;'20134 8 83I (*xx'<'<l^'LH &.Ja	{";T=M=Mh=W"WW8	 &% N%%$ OOZ0 Z;;++w6LL#8mDQRGL\L\K]]_`a_b!cd[[--7NN%8c$q'BRBRASSUVWUXXY#Z[[LL#6tfCQ@P@P?QQSTUSVVW!XYYZsb   E8M,;I2CI I2M, 
I/	*I2-M,/I22M)BM$$M,+2M$M,$M))M,r   N)r   r   r    r!   r	   BUILDER_CONFIG_CLASSr7   ra   rQ   r}   rg   r   r-   r   r   r   r0   r0   Z   s5    (
C$LBHH  'Zr   r0   )rJ   dataclassesr   typingr   r   r   pyarrowrQ   pyarrow.datasetdatasetr)   pyarrow.parquetparquetrO   r'   datasets.tabler   utilslogging
get_loggerr   rS   BuilderConfigr	   ArrowBasedBuilderr0   r   r   r   <module>r      sw     ! + +     % 
			*	*8	4 F H** F  F RcZh(( cZr   