
    h                         S SK r S SKrS SKJrJrJr  S SKrS SKJr	  SSK
JrJrJrJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr   " S S\5      r  " S S5      r!g)    N)BinaryIOOptionalUnion   )DatasetFeatures
NamedSplitconfig)$get_writer_batch_size_from_data_size#get_writer_batch_size_from_features)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderc                   v   ^  \ rS rSr      SS\\   S\\   S\\   S\	S\
S\
S\\   4U 4S	 jjjrS
 rSrU =r$ )ParquetDatasetReader   path_or_pathssplitfeatures	cache_dirkeep_in_memory	streamingnum_procc           
         > [         T
U ]  " U4UUUUUUS.UD6  [        U[        5      (       a  UOU R                  U0n[
        S   S   n	[        SUUUU	S.UD6U l        g )N)r   r   r   r   r   r   parquetr   )r   
data_filesr   hash )super__init__
isinstancedictr   r   r   builder)selfr   r   r   r   r   r   r   kwargsr"   	__class__s             M/home/james-whalen/.local/lib/python3.13/site-packages/datasets/io/parquet.pyr%   ParquetDatasetReader.__init__   s     			
)		
 		
 *4M4)H)Htzz[hNi))4Q7 
$	

 
    c                 ,   U R                   (       a%  U R                  R                  U R                  S9nU$ S nS nS nS nU R                  R	                  UUUUU R
                  S9  U R                  R                  U R                  X@R                  S9nU$ )N)r   )download_configdownload_modeverification_mode	base_pathr   )r   r2   	in_memory)r   r(   as_streaming_datasetr   download_and_preparer   
as_datasetr   )r)   datasetr0   r1   r2   r3   s         r,   readParquetDatasetReader.read2   s    >>ll77djj7IG$  #O M $ILL-- /+"3# .  ll--jj4EQdQd . G r.   )r(   )NNNFFN)__name__
__module____qualname____firstlineno__r   r   r   r	   r   strboolintr%   r9   __static_attributes____classcell__)r+   s   @r,   r   r      s     '+'+$"&
.x8
 
#
 8$	

 
 
 
 3-
 
> r.   r   c                       \ rS rSr    SS\S\\\4   S\\	   S\\
   S\\\
4   S\4S	 jjrS
\	4S jrS\S\	S
\	4S jrSrg)ParquetDatasetWriterJ   Nr8   path_or_buf
batch_sizestorage_optionsuse_content_defined_chunkingwrite_page_indexc                 $   Xl         X l        U=(       d?    [        UR                  5      =(       d#    [	        [        U5      UR                  5       5      U l        U=(       d    0 U l        Xpl	        USL a  [        R                  nXPl        X`l        g )NT)r8   rG   r   r   r   len_estimate_nbytesrH   rI   parquet_writer_kwargsr
   DEFAULT_CDC_OPTIONSrJ   rK   )r)   r8   rG   rH   rI   rJ   rK   rO   s           r,   r%   ParquetDatasetWriter.__init__K   s     & ^273C3CD^3CL'BZBZB\] 	
  /4"%:"'4/+1+E+E(,H) 0r.   returnc                    [        U R                  [        [        [        R
                  45      (       ai  [        R                  " U R                  S40 U R                  =(       d    0 D6 nU R                  " SUU R                  S.U R                  D6nS S S 5        U$ U R                  " SU R                  U R                  S.U R                  D6nU$ ! , (       d  f       W$ = f)Nwb)file_objrH   r#   )r&   rG   r?   bytesosr   fsspecopenrI   _writerH   rO   )r)   bufferwrittens      r,   writeParquetDatasetWriter.writec   s    d&&eR[[(ABBT--tT8L8L8RPRTX^++ ## 00 U  kk ))?? ,,G
  UT s   **C
C"rU   c           	         SnUR                  SS5      nU R                  R                  R                  n[        R
                  " U4UU R                  U R                  S.UD6n[        [        S[        U R                  5      U5      SSS9 Ha  n[        U R                  R                  [        XU-   5      U R                  R                  S9n	UR                  U	5        XIR                   -  nMc     U R                  S	La1  UR#                  S
[$        R&                  " U R                  5      05        UR)                  5         U$ )zxWrites the pyarrow table as Parquet to a binary file handle.

Caller is responsible for opening and closing the handle.
r   rG   N)schemarJ   rK   baz"Creating parquet from Arrow format)unitdesc)tablekeyindicesFcontent_defined_chunking)popr8   r   arrow_schemapqParquetWriterrJ   rK   hf_tqdmrangerM   r   _dataslice_indiceswrite_tablenbytesadd_key_value_metadatajsondumpsclose)
r)   rU   rH   rO   r\   _r`   writeroffsetbatchs
             r,   rZ   ParquetDatasetWriter._writes   s$   
 !%%mT:&&33!!
)-)J)J!22	

 $
 !S&
35
F
  ll((&:"56--E
 u%||#G
 ,,E9))+EtzzRVRsRsGt*uvr.   )rH   r8   rO   rG   rI   rJ   rK   )NNTT)r;   r<   r=   r>   r   r   r   r   r   rA   r'   r@   r%   r]   rZ   rB   r#   r.   r,   rE   rE   J   s    
 %)*.:>!%11 8X-.1 SM	1
 "$1 ',D$J&71 10s  #x #S #VY #r.   rE   )"rt   rW   typingr   r   r   rX   pyarrow.parquetr    rj    r   r   r	   r
   arrow_writerr   r   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   rl   utils.typingr   r   abcr   r   rE   r#   r.   r,   <module>r      sL     	 , ,   4 4 d $ 9 6 # < &50 5pL Lr.   