from __future__ import annotations

import contextlib
import io
from collections.abc import Sequence
from pathlib import Path
from typing import IO, TYPE_CHECKING, Any

import polars.functions as F
from polars import concat
from polars._dependencies import import_optional
from polars._utils.deprecation import (
    deprecate_renamed_parameter,
    issue_deprecation_warning,
)
from polars._utils.unstable import issue_unstable_warning
from polars._utils.various import (
    is_int_sequence,
    is_path_or_str_sequence,
    normalize_filepath,
)
from polars._utils.wrap import wrap_ldf
from polars.convert import from_arrow
from polars.io._utils import prepare_file_arg
from polars.io.cloud.credential_provider._builder import (
    _init_credential_provider_builder,
)
from polars.io.scan_options._options import ScanOptions

# The native extension module may be unavailable (e.g. when building docs).
with contextlib.suppress(ImportError):
    from polars._plr import PyLazyFrame
    from polars._plr import read_parquet_metadata as _read_parquet_metadata

if TYPE_CHECKING:
    from typing import Literal

    from polars import DataFrame, DataType, LazyFrame
    from polars._typing import (
        ColumnMapping,
        DefaultFieldValues,
        DeletionFiles,
        FileSource,
        ParallelStrategy,
        SchemaDict,
    )
    from polars.io.cloud import CredentialProviderFunction
    from polars.io.scan_options import ScanCastOptions


@deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
def read_parquet(
    source: FileSource,
    *,
    columns: list[int] | list[str] | None = None,
    n_rows: int | None = None,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    parallel: ParallelStrategy = "auto",
    use_statistics: bool = True,
    hive_partitioning: bool | None = None,
    glob: bool = True,
    schema: SchemaDict | None = None,
    hive_schema: SchemaDict | None = None,
    try_parse_hive_dates: bool = True,
    rechunk: bool = False,
    low_memory: bool = False,
    storage_options: dict[str, Any] | None = None,
    credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
    retries: int = 2,
    use_pyarrow: bool = False,
    pyarrow_options: dict[str, Any] | None = None,
    memory_map: bool = True,
    include_file_paths: str | None = None,
    missing_columns: Literal["insert", "raise"] = "raise",
    allow_missing_columns: bool | None = None,
) -> DataFrame:
    """
Read into a DataFrame from a parquet file.

.. versionchanged:: 0.20.4
    * The `row_count_name` parameter was renamed `row_index_name`.
    * The `row_count_offset` parameter was renamed `row_index_offset`.

Parameters
----------
source
    Path(s) to a file or directory.
    When needing to authenticate for scanning cloud locations, see the
    `storage_options` parameter.

    File-like objects are supported (by "file-like object" we refer to objects
    that have a `read()` method, such as a file handler like the builtin `open`
    function, or a `BytesIO` instance). For file-like objects, the stream position
    may not be updated accordingly after reading.
columns
    Columns to select. Accepts a list of column indices (starting at zero) or a list
    of column names.
n_rows
    Stop reading from parquet file after reading `n_rows`.
    Only valid when `use_pyarrow=False`.
row_index_name
    Insert a row index column with the given name into the DataFrame as the first
    column. If set to `None` (default), no row index column is created.
row_index_offset
    Start the row index at this offset. Cannot be negative.
    Only used if `row_index_name` is set.
parallel : {'auto', 'columns', 'row_groups', 'none'}
    This determines the direction of parallelism. 'auto' will try to determine the
    optimal direction.
use_statistics
    Use statistics in the parquet to determine if pages
    can be skipped from reading.
hive_partitioning
    Infer statistics and schema from Hive partitioned URL and use them
    to prune reads. This is unset by default (i.e. `None`), meaning it is
    automatically enabled when a single directory is passed, and otherwise
    disabled.
glob
    Expand path given via globbing rules.
schema
    Specify the datatypes of the columns. The datatypes must match the
    datatypes in the file(s). If there are extra columns that are not in the
    file(s), consider also passing `missing_columns='insert'`.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
hive_schema
    The column names and data types of the columns by which the data is partitioned.
    If set to `None` (default), the schema of the Hive partitions is inferred.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
try_parse_hive_dates
    Whether to try parsing hive values as date/datetime types.
rechunk
    Make sure that all columns are contiguous in memory by
    aggregating the chunks into a single array.
low_memory
    Reduce memory pressure at the expense of performance.
storage_options
    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure.
    See supported keys here:

    * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
    * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
    * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
    * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
      `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

    If `storage_options` is not provided, Polars will try to infer the information
    from environment variables.
credential_provider
    Provide a function that can be called to provide cloud storage
    credentials. The function is expected to return a dictionary of
    credential keys along with an optional credential expiry time.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
retries
    Number of retries if accessing a cloud instance fails.
use_pyarrow
    Use PyArrow instead of the Rust-native Parquet reader. The PyArrow reader is
    more stable.
pyarrow_options
    Keyword arguments for `pyarrow.parquet.read_table
    <https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html>`_.
memory_map
    Memory map underlying file. This will likely increase performance.
    Only used when `use_pyarrow=True`.
include_file_paths
    Include the path of the source file(s) as a column with this name.
    Only valid when `use_pyarrow=False`.
missing_columns
    Configuration for behavior when columns defined in the schema
    are missing from the data:

    * `insert`: Inserts the missing columns using NULLs as the row values.
    * `raise`: Raises an error.

allow_missing_columns
    When reading a list of parquet files, if a column existing in the first
    file cannot be found in subsequent files, the default behavior is to
    raise an error. However, if `allow_missing_columns` is set to
    `True`, a full-NULL column is returned instead of erroring for the files
    that do not contain the column.

    .. deprecated:: 1.30.0
        Use the parameter `missing_columns` instead and pass one of
        `('insert', 'raise')`.

Returns
-------
DataFrame

See Also
--------
scan_parquet: Lazily read from a parquet file or multiple files via glob patterns.
scan_pyarrow_dataset

Warnings
--------
Calling `read_parquet().lazy()` is an antipattern, as it forces Polars to
materialize the full parquet file first, so no optimizations can be pushed
down into the reader. Always prefer `scan_parquet` if you want to work with
`LazyFrame`s.
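
Examples
--------
The paths below are illustrative placeholders.

Read a local file eagerly:

>>> df = pl.read_parquet("path/to/file.parquet")  # doctest: +SKIP

Read a subset of columns and add a row index as the first column:

>>> pl.read_parquet(
...     "path/to/file.parquet",
...     columns=["a", "b"],
...     row_index_name="idx",
... )  # doctest: +SKIP

For lazy queries, prefer `scan_parquet` so that predicates and projections
can be pushed into the reader:

>>> lf = pl.scan_parquet("path/to/file.parquet")  # doctest: +SKIP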
    """
    if schema is not None:
        msg = "the `schema` parameter of `read_parquet` is considered unstable."
        issue_unstable_warning(msg)
    if hive_schema is not None:
        msg = "the `hive_schema` parameter of `read_parquet` is considered unstable."
        issue_unstable_warning(msg)

    if use_pyarrow:
        # The PyArrow code path does not support these options.
        if n_rows is not None:
            msg = "`n_rows` cannot be used with `use_pyarrow=True`"
            raise ValueError(msg)
        if include_file_paths is not None:
            msg = "`include_file_paths` cannot be used with `use_pyarrow=True`"
            raise ValueError(msg)
        if schema is not None:
            msg = "`schema` cannot be used with `use_pyarrow=True`"
            raise ValueError(msg)
        if hive_schema is not None:
            msg = (
                "cannot use `hive_partitions` with `use_pyarrow=True`\n"
                "Hint: Pass `pyarrow_options` instead with a 'partitioning' entry."
            )
            raise TypeError(msg)

        return _read_parquet_with_pyarrow(
            source,
            columns=columns,
            storage_options=storage_options,
            pyarrow_options=pyarrow_options,
            memory_map=memory_map,
            rechunk=rechunk,
        )

    if allow_missing_columns is not None:
        issue_deprecation_warning(
            "the parameter `allow_missing_columns` for `read_parquet` is deprecated. "
            "Use the parameter `missing_columns` instead and pass one of "
            "`('insert', 'raise')`.",
            version="1.30.0",
        )
        missing_columns = "insert" if allow_missing_columns else "raise"

    # Delegate to scan_parquet() so that projection and slice pushdown apply,
    # then materialize the result.
    lf = scan_parquet(
        source,
        n_rows=n_rows,
        row_index_name=row_index_name,
        row_index_offset=row_index_offset,
        parallel=parallel,
        use_statistics=use_statistics,
        hive_partitioning=hive_partitioning,
        schema=schema,
        hive_schema=hive_schema,
        try_parse_hive_dates=try_parse_hive_dates,
        rechunk=rechunk,
        low_memory=low_memory,
        cache=False,
        storage_options=storage_options,
        credential_provider=credential_provider,
        retries=retries,
        glob=glob,
        include_file_paths=include_file_paths,
        missing_columns=missing_columns,
    )

    if columns is not None:
        if is_int_sequence(columns):
            lf = lf.select(F.nth(columns))
        else:
            lf = lf.select(columns)

    return lf.collect()


def _read_parquet_with_pyarrow(
    source: str
    | Path
    | IO[bytes]
    | bytes
    | list[str]
    | list[Path]
    | list[IO[bytes]]
    | list[bytes],
    *,
    columns: list[int] | list[str] | None = None,
    storage_options: dict[str, Any] | None = None,
    pyarrow_options: dict[str, Any] | None = None,
    memory_map: bool = True,
    rechunk: bool = True,
) -> DataFrame:
    pyarrow_parquet = import_optional(
        "pyarrow.parquet",
        err_prefix="",
        err_suffix="is required when using `read_parquet(..., use_pyarrow=True)`",
    )
    pyarrow_options = pyarrow_options or {}

    # A list of raw buffers/streams is treated as multiple files; any other
    # input is wrapped in a list so it can be processed uniformly below.
    sources: list[Any]
    if isinstance(source, list):
        if len(source) > 0 and isinstance(source[0], (bytes, io.IOBase)):
            sources = source
        else:
            sources = [source]
    else:
        sources = [source]

    results: list[DataFrame] = []
    for source in sources:
        with prepare_file_arg(
            source, use_pyarrow=True, storage_options=storage_options
        ) as source_prep:
            pa_table = pyarrow_parquet.read_table(
                source_prep,
                memory_map=memory_map,
                columns=columns,
                **pyarrow_options,
            )
        result = from_arrow(pa_table, rechunk=rechunk)
        results.append(result)  # type: ignore[arg-type]

    return results[0] if len(results) == 1 else concat(results)


def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> dict[str, DataType]:
    """
Get the schema of a Parquet file without reading data.

If you would like to read the schema of a cloud file with authentication
configuration, it is recommended to use `scan_parquet` - e.g.
`scan_parquet(..., storage_options=...).collect_schema()`.

Parameters
----------
source
    Path to a file or a file-like object (by "file-like object" we refer to objects
    that have a `read()` method, such as a file handler like the builtin `open`
    function, or a `BytesIO` instance). For file-like objects, the stream position
    may not be updated accordingly after reading.

Returns
-------
dict
    Dictionary mapping column names to datatypes

See Also
--------
scan_parquet
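
Examples
--------
Inspect the column names and dtypes of a local file (the path is an
illustrative placeholder; the output depends on the file):

>>> pl.read_parquet_schema("path/to/file.parquet")  # doctest: +SKIP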
)rI   collect_schema)rN   s    rQ   read_parquet_schemarj   Z  s    2 ..00rS   c                    [        U [        [        45      (       a
  [        U SS9n [	        X US5      nA[        U Ub  [        UR                  5       5      OSUUS9$ )a7  
Get file-level custom metadata of a Parquet file without reading data.

.. warning::
    This functionality is considered **experimental**. It may be removed or
    changed at any point without it being considered a breaking change.

Parameters
----------
source
    Path to a file or a file-like object (by "file-like object" we refer to objects
    that have a `read()` method, such as a file handler like the builtin `open`
    function, or a `BytesIO` instance). For file-like objects, the stream position
    may not be updated accordingly after reading.
storage_options
    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure.
    See supported keys here:

    * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
    * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
    * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
    * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
      `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

    If `storage_options` is not provided, Polars will try to infer the information
    from environment variables.
credential_provider
    Provide a function that can be called to provide cloud storage
    credentials. The function is expected to return a dictionary of
    credential keys along with an optional credential expiry time.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
retries
    Number of retries if accessing a cloud instance fails.

Returns
-------
dict
    Dictionary with the metadata. Empty if no custom metadata is available.
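
Examples
--------
Read the custom key-value metadata of a local file (the path is an
illustrative placeholder; which keys are present depends on the writer):

>>> pl.read_parquet_metadata("path/to/file.parquet")  # doctest: +SKIP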
    """
    if isinstance(source, (str, Path)):
        source = normalize_filepath(source, check_not_directory=False)

    credential_provider_builder = _init_credential_provider_builder(
        credential_provider, source, storage_options, "read_parquet_metadata"
    )
    del credential_provider

    return _read_parquet_metadata(
        source,
        storage_options=(
            list(storage_options.items()) if storage_options is not None else None
        ),
        credential_provider=credential_provider_builder,
        retries=retries,
    )


@deprecate_renamed_parameter("row_count_name", "row_index_name", version="0.20.4")
@deprecate_renamed_parameter("row_count_offset", "row_index_offset", version="0.20.4")
def scan_parquet(
    source: FileSource,
    *,
    n_rows: int | None = None,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    parallel: ParallelStrategy = "auto",
    use_statistics: bool = True,
    hive_partitioning: bool | None = None,
    glob: bool = True,
    hidden_file_prefix: str | Sequence[str] | None = None,
    schema: SchemaDict | None = None,
    hive_schema: SchemaDict | None = None,
    try_parse_hive_dates: bool = True,
    rechunk: bool = False,
    low_memory: bool = False,
    cache: bool = True,
    storage_options: dict[str, Any] | None = None,
    credential_provider: CredentialProviderFunction | Literal["auto"] | None = "auto",
    retries: int = 2,
    include_file_paths: str | None = None,
    missing_columns: Literal["insert", "raise"] = "raise",
    allow_missing_columns: bool | None = None,
    extra_columns: Literal["ignore", "raise"] = "raise",
    cast_options: ScanCastOptions | None = None,
    _column_mapping: ColumnMapping | None = None,
    _default_values: DefaultFieldValues | None = None,
    _deletion_files: DeletionFiles | None = None,
    _table_statistics: DataFrame | None = None,
) -> LazyFrame:
    """
Lazily read from a local or cloud-hosted parquet file (or files).

This function allows the query optimizer to push down predicates and projections to
the scan level, typically increasing performance and reducing memory overhead.

.. versionchanged:: 0.20.4
    * The `row_count_name` parameter was renamed `row_index_name`.
    * The `row_count_offset` parameter was renamed `row_index_offset`.

.. versionchanged:: 1.30.0
    * The `allow_missing_columns` is deprecated in favor of `missing_columns`.

Parameters
----------
source
    Path(s) to a file or directory.
    When needing to authenticate for scanning cloud locations, see the
    `storage_options` parameter.
n_rows
    Stop reading from parquet file after reading `n_rows`.
row_index_name
    If not None, this will insert a row index column with the given name into the
    DataFrame.
row_index_offset
    Offset to start the row index column (only used if the name is set)
parallel : {'auto', 'columns', 'row_groups', 'prefiltered', 'none'}
    This determines the direction and strategy of parallelism. 'auto' will
    try to determine the optimal direction.

    The `prefiltered` strategy first evaluates the pushed-down predicates in
    parallel and determines a mask of which rows to read. Then, it
    parallelizes over both the columns and the row groups while filtering
    out rows that do not need to be read. This can provide significant
    speedups for large files (i.e. many row-groups) with a predicate that
    filters clustered rows or filters heavily. In other cases,
    `prefiltered` may slow down the scan compared to other strategies.

    The `prefiltered` setting falls back to `auto` if no predicate is
    given.

    .. warning::
        The `prefiltered` strategy is considered **unstable**. It may be
        changed at any point without it being considered a breaking change.

use_statistics
    Use statistics in the parquet to determine if pages
    can be skipped from reading.
hive_partitioning
    Infer statistics and schema from hive partitioned URL and use them
    to prune reads.
glob
    Expand path given via globbing rules.
hidden_file_prefix
    Skip reading files whose names begin with the specified prefixes.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
schema
    Specify the datatypes of the columns. The datatypes must match the
    datatypes in the file(s). If there are extra columns that are not in the
    file(s), consider also passing `missing_columns='insert'`.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
hive_schema
    The column names and data types of the columns by which the data is partitioned.
    If set to `None` (default), the schema of the Hive partitions is inferred.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
try_parse_hive_dates
    Whether to try parsing hive values as date/datetime types.
rechunk
    In case of reading multiple files via a glob pattern rechunk the final DataFrame
    into contiguous memory chunks.
low_memory
    Reduce memory pressure at the expense of performance.
cache
    Cache the result after reading.
storage_options
    Options that indicate how to connect to a cloud provider.

    The cloud providers currently supported are AWS, GCP, and Azure.
    See supported keys here:

    * `aws <https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html>`_
    * `gcp <https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html>`_
    * `azure <https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html>`_
    * Hugging Face (`hf://`): Accepts an API key under the `token` parameter:
      `{'token': '...'}`, or by setting the `HF_TOKEN` environment variable.

    If `storage_options` is not provided, Polars will try to infer the information
    from environment variables.
credential_provider
    Provide a function that can be called to provide cloud storage
    credentials. The function is expected to return a dictionary of
    credential keys along with an optional credential expiry time.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.
retries
    Number of retries if accessing a cloud instance fails.
include_file_paths
    Include the path of the source file(s) as a column with this name.
missing_columns
    Configuration for behavior when columns defined in the schema
    are missing from the data:

    * `insert`: Inserts the missing columns using NULLs as the row values.
    * `raise`: Raises an error.

allow_missing_columns
    When reading a list of parquet files, if a column existing in the first
    file cannot be found in subsequent files, the default behavior is to
    raise an error. However, if `allow_missing_columns` is set to
    `True`, a full-NULL column is returned instead of erroring for the files
    that do not contain the column.

    .. deprecated:: 1.30.0
        Use the parameter `missing_columns` instead and pass one of
        `('insert', 'raise')`.
extra_columns
    Configuration for behavior when extra columns outside of the
    defined schema are encountered in the data:

    * `ignore`: Silently ignores.
    * `raise`: Raises an error.

cast_options
    Configuration for column type-casting during scans. Useful for datasets
    containing files that have differing schemas.

    .. warning::
        This functionality is considered **unstable**. It may be changed
        at any point without it being considered a breaking change.

See Also
--------
read_parquet
scan_pyarrow_dataset

Examples
--------
Scan a local Parquet file.

>>> pl.scan_parquet("path/to/file.parquet")  # doctest: +SKIP

Scan a file on AWS S3.

>>> source = "s3://bucket/*.parquet"
>>> pl.scan_parquet(source)  # doctest: +SKIP
>>> storage_options = {
...     "aws_access_key_id": "<secret>",
...     "aws_secret_access_key": "<secret>",
...     "aws_region": "us-east-1",
... }
>>> pl.scan_parquet(source, storage_options=storage_options)  # doctest: +SKIP
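
Add a row index during the scan and filter on it; because the query is
lazy, the predicate is pushed down into the reader:

>>> (
...     pl.scan_parquet("path/to/file.parquet", row_index_name="idx")
...     .filter(pl.col("idx") > 100)
...     .collect()
... )  # doctest: +SKIP

The `prefiltered` strategy can speed up scans with highly selective
predicates (the column name `a` is an illustrative placeholder):

>>> (
...     pl.scan_parquet("path/to/file.parquet", parallel="prefiltered")
...     .filter(pl.col("a") == 1)
...     .collect()
... )  # doctest: +SKIP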
    """
    if schema is not None:
        msg = "the `schema` parameter of `scan_parquet` is considered unstable."
        issue_unstable_warning(msg)
    if hive_schema is not None:
        msg = "the `hive_schema` parameter of `scan_parquet` is considered unstable."
        issue_unstable_warning(msg)
    if cast_options is not None:
        msg = "The `cast_options` parameter of `scan_parquet` is considered unstable."
        issue_unstable_warning(msg)
    if hidden_file_prefix is not None:
        msg = "The `hidden_file_prefix` parameter of `scan_parquet` is considered unstable."
        issue_unstable_warning(msg)

    if allow_missing_columns is not None:
        issue_deprecation_warning(
            "the parameter `allow_missing_columns` for `scan_parquet` is deprecated. "
            "Use the parameter `missing_columns` instead and pass one of "
            "`('insert', 'raise')`.",
            version="1.30.0",
        )
        missing_columns = "insert" if allow_missing_columns else "raise"

    # Normalize all path-like inputs to strings.
    if isinstance(source, (str, Path)):
        source = normalize_filepath(source, check_not_directory=False)
    elif is_path_or_str_sequence(source):
        source = [
            normalize_filepath(source, check_not_directory=False) for source in source
        ]

    credential_provider_builder = _init_credential_provider_builder(
        credential_provider, source, storage_options, "scan_parquet"
    )
    del credential_provider

    sources = source if isinstance(source, list) else [source]

    pylf = PyLazyFrame.new_from_parquet(
        sources=sources,
        schema=schema,
        parallel=parallel,
        low_memory=low_memory,
        use_statistics=use_statistics,
        scan_options=ScanOptions(
            row_index=(
                (row_index_name, row_index_offset)
                if row_index_name is not None
                else None
            ),
            pre_slice=(0, n_rows) if n_rows is not None else None,
            cast_options=cast_options,
            extra_columns=extra_columns,
            missing_columns=missing_columns,
            include_file_paths=include_file_paths,
            glob=glob,
            hidden_file_prefix=(
                [hidden_file_prefix]
                if isinstance(hidden_file_prefix, str)
                else hidden_file_prefix
            ),
            hive_partitioning=hive_partitioning,
            hive_schema=hive_schema,
            try_parse_hive_dates=try_parse_hive_dates,
            rechunk=rechunk,
            cache=cache,
            storage_options=(
                list(storage_options.items()) if storage_options is not None else None
            ),
            credential_provider=credential_provider_builder,
            retries=retries,
            column_mapping=_column_mapping,
            default_values=_default_values,
            deletion_files=_deletion_files,
            table_statistics=_table_statistics,
        ),
    )
    return wrap_ldf(pylf)