
    h,                        S SK r S SKJrJr  S SKJr  S SKJr  S SKJ	r	J
r
JrJrJrJr  S SKrSSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  \	(       a
  S SKrSSKJr  SSS\ 4S jr!\ " S S5      5       r"SSS\#4S jr$g)    N)	dataclassfield)BytesIO)Path)TYPE_CHECKINGAnyClassVarDictOptionalUnion   )config)DownloadConfig)
array_cast)is_local_pathxopen)no_op_if_value_is_nullstring_to_dict   )FeatureTypepdfpdfplumber.pdf.PDFreturnc                     [        5        nU R                   H(  nUR                  UR                  R                  5        M*     UR                  5       sSSS5        $ ! , (       d  f       g= f)z-Convert a pdfplumber.pdf.PDF object to bytes.N)r   pageswriter   streamgetvalue)r   bufferpages      O/home/james-whalen/.local/lib/python3.13/site-packages/datasets/features/pdf.pypdf_to_bytesr"      s?    	fIIDLL)   
s   AA
A+c                      \ rS rSr% SrSr\\S'   \" SSS9r	\
\   \S'   S	r\\   \S
'   \R                  " \R                   " 5       \R"                  " 5       S.5      r\\   \S'   \" S SSS9r\\S'   S rS\\\\\S	4   S\4S jrSS\SS	4S jjrS\S\\S4   4   4S jrS\\R<                  \R>                  \R@                  4   S\R>                  4S jr!SS\R>                  S\R>                  4S jjr"Sr#g)Pdf   a  
**Experimental.**
Pdf [`Feature`] to read pdf documents from a pdf file.

Input: The Pdf feature accepts as input:
- A `str`: Absolute path to the pdf file (i.e. random access is allowed).
- A `pathlib.Path`: path to the pdf file (i.e. random access is allowed).
- A `dict` with the keys:
    - `path`: String with relative path of the pdf file in a dataset repository.
    - `bytes`: Bytes of the pdf file.
  This is useful for archived files with sequential access.

- A `pdfplumber.pdf.PDF`: pdfplumber pdf object.

Args:
    mode (`str`, *optional*):
        The mode to convert the pdf to. If `None`, the native mode of the pdf is used.
    decode (`bool`, defaults to `True`):
        Whether to decode the pdf data. If `False`,
        returns the underlying dictionary in the format `{"path": pdf_path, "bytes": pdf_bytes}`.

Examples:

```py
>>> from datasets import Dataset, Pdf
>>> ds = Dataset.from_dict({"pdf": ["path/to/pdf/file.pdf"]}).cast_column("pdf", Pdf())
>>> ds.features["pdf"]
Pdf(decode=True, id=None)
>>> ds[0]["pdf"]
<pdfplumber.pdf.PDF object at 0x7f8a1c2d8f40>
>>> ds = ds.cast_column("pdf", Pdf(decode=False))
>>> ds[0]["pdf"]
{'bytes': None,
'path': 'path/to/pdf/file.pdf'}
```
TdecodeNF)defaultrepridr   dtypebytespathpa_type)r'   initr(   _typec                     U R                   $ N)r.   )selfs    r!   __call__Pdf.__call__M   s    ||    valuer   c                    [         R                  (       a  SSKnOSn[        U[        5      (       a  USS.$ [        U[
        5      (       a  [	        UR                  5       5      SS.$ [        U[        [        45      (       a  SUS.$ Ub/  [        XR                  R                  5      (       a  [        U5      $ UR                  S5      b;  [        R                  R                  US   5      (       a  SUR                  S5      S.$ UR                  S5      c  UR                  S5      b#  UR                  S5      UR                  S5      S.$ [!        SU S35      e)	zEncode example into a format for Arrow.

Args:
    value (`str`, `bytes`, `pdfplumber.pdf.PDF` or `dict`):
        Data passed as input to Pdf feature.

Returns:
    `dict` with "path" and "bytes" fields
r   Nr-   r,   r-   r+   r,   zRA pdf sample should have one of 'path' or 'bytes' but they are missing or None in .)r   PDFPLUMBER_AVAILABLE
pdfplumber
isinstancestrr   absoluter,   	bytearrayr   PDFencode_pdfplumber_pdfgetosr-   isfile
ValueError)r3   r7   r<   s      r!   encode_examplePdf.encode_exampleP   s"    &&JeS!!!D11t$$ 01DAAy122 511#
5..:L:L(M(M(//YYv*rww~~eFm/L/L!599V+<==YYw+uyy/@/L"YYw/69JKKdejdkklm r6   c                    U R                   (       d  [        S5      e[        R                  (       a  SSKnO[        S5      eUc  0 nUS   US   pTUc  Uc  [        SU S35      e[        U5      (       a  UR                  " U5      nU$ UR                  S	5      S
   nUR                  [        R                  5      (       a  [        R                  O[        R                  n [        Xx5      S   n	UR                  U	5      n
[!        U
S9n[#        USUS9nUR                  " U5      $ UR                  " [%        U5      5       nUnSSS5        U$ ! [         a    Sn
 N\f = f! , (       d  f       W$ = f)a  Decode example pdf file into pdf data.

Args:
    value (`str` or `dict`):
        A string with the absolute pdf file path, a dictionary with
        keys:

        - `path`: String with absolute or relative pdf file path.
        - `bytes`: The bytes of the pdf file.

    token_per_repo_id (`dict`, *optional*):
        To access and decode pdf files from private repositories on
        the Hub, you can pass a dictionary
        repo_id (`str`) -> token (`bool` or `str`).

Returns:
    `pdfplumber.pdf.PDF`
zKDecoding is disabled for this feature. Please use Pdf(decode=True) instead.r   Nz6To support decoding pdfs, please install 'pdfplumber'.r-   r,   z@A pdf should have one of 'path' or 'bytes' but both are None in r:   ::repo_idtokenrbdownload_config)r&   RuntimeErrorr   r;   r<   ImportErrorrF   r   opensplit
startswithHF_ENDPOINTHUB_DATASETS_URLHUB_DATASETS_HFFS_URLr   rC   r   r   r   )r3   r7   token_per_repo_idr<   r-   bytes_r   
source_urlpatternrL   rN   rQ   fps                 r!   decode_examplePdf.decode_examples   si   & {{lmm&&VWW$ "V}eGnf>| #cdicjjk!lmm &&$//$/C( 
% "&D!1"!5J &001C1CDD //#99 
%"0"Ei"P 1 5 5g > '55&AOdD/JA%??1--1Q 2 
 & % $% 21 
s   "E E!EE!
E0r   c                 V    SSK Jn  U R                  (       a  U $ U" S5      U" S5      S.$ )zfIf in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary.r   )Valuebinarystringr+   )featuresrc   r&   )r3   rc   s     r!   flattenPdf.flatten   s2    # {{ 	
 xh	
r6   storagec                    [         R                  R                  UR                  5      (       ag  [         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X!/SS/UR                  5       S9nGO[         R                  R                  UR                  5      (       ag  [         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X/SS/UR                  5       S9nGO*[         R                  R                  UR                  5      (       a  UR                  R                  S5      S:  a  UR                  S5      nO5[         R                  " S/[        U5      -  [         R                  " 5       S9nUR                  R                  S5      S:  a  UR                  S5      nO5[         R                  " S/[        U5      -  [         R                  " 5       S9n[         R                  R                  X#/SS/UR                  5       S9n[        XR                   5      $ )a  Cast an Arrow array to the Pdf arrow storage type.
The Arrow types that can be converted to the Pdf pyarrow storage type are:

- `pa.string()` - it must contain the "path" data
- `pa.binary()` - it must contain the image bytes
- `pa.struct({"bytes": pa.binary()})`
- `pa.struct({"path": pa.string()})`
- `pa.struct({"bytes": pa.binary(), "path": pa.string()})`  - order doesn't matter
- `pa.list(*)` - it must contain the pdf array data

Args:
    storage (`Union[pa.StringArray, pa.StructArray, pa.ListArray]`):
        PyArrow array to cast.

Returns:
    `pa.StructArray`: Array in the Pdf arrow storage type, that is
        `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
Ntyper,   r-   maskr   )patypes	is_stringrl   arraylenrd   StructArrayfrom_arraysis_null	is_binaryre   	is_structget_field_indexr   r   r.   )r3   ri   bytes_array
path_arrays       r!   cast_storagePdf.cast_storage   s   & 88gll++((D6CL#8ryy{KKnn00+1G'SYIZahapapar0sGXX--4&3w<"7biikJJnn00'1FRXHY`g`o`o`q0rGXX--||++G49%mmG4 hhvG'<299;O||++F3q8$]]62
XXtfs7|&;"))+N
nn00+1JWV\L]dkdsdsdu0vG'<<00r6   c           	      x  ^ Tc  0 m[         U4S j5       n[        R                  " UR                  5        Vs/ s H  nUb  US   c  U" US   5      OUS   OSPM      sn[        R                  " 5       S9n[        R                  " UR                  S5      R                  5        Vs/ s H&  ofb  [        R                  R                  U5      OSPM(     sn[        R                  " 5       S9n[        R                  R                  XW/SS/UR                  5       S9n[        XR                  5      $ s  snf s  snf )zEmbed PDF files into the Arrow array.

Args:
    storage (`pa.StructArray`):
        PyArrow array to embed.

Returns:
    `pa.StructArray`: Array in the PDF arrow storage type, that is
        `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
Nc                 z  > U R                  S5      S   nUR                  [        R                  5      (       a  [        R                  O[        R
                  n[        X5      nUb  TR                  US   5      OS n[        US9n[        U SUS9 nUR                  5       sS S S 5        $ ! , (       d  f       g = f)NrJ   rK   rL   rM   rO   rP   )rU   rV   r   rW   rX   rY   r   rC   r   r   read)r-   r\   r]   source_url_fieldsrN   rQ   r^   rZ   s          r!   path_to_bytes(Pdf.embed_storage.<locals>.path_to_bytes   s    D)"-J+5+@+@ASAS+T+T''Z`ZvZv  !/z CK\Kh%))*;I*FGnrE,59OtT?Cqvvx DCCs   B,,
B:r,   r-   rk   rm   )r   ro   rr   	to_pylistrd   r   rD   r-   basenamere   rt   ru   rv   r   r.   )r3   ri   rZ   r   xrz   r-   r{   s     `     r!   embed_storagePdf.embed_storage   s*    $ "			  
 		  hh !**,,A UVTaQwZ-?qy)QwZgkk, 
 XXNUmm\bNcNmNmNopNod'7RWWd#TANop

 ..,,k-FRXHY`k`s`s`u,v'<<00 qs   %D2'-D7 r2   )$__name__
__module____qualname____firstlineno____doc__r&   bool__annotations__r   r)   r   r>   r*   r	   ro   structrd   re   r.   r   r0   r4   r   r,   r@   dictrG   r`   r
   rg   StringArrayrt   	ListArrayr|   r   __static_attributes__r   r6   r!   r$   r$      s*   #J FDd7B7 0E8C=/YYbiik'RSGXc]Su5u=E3=!E#uiG[*[$\ !ae !F8D 8EY 8t
}d33E.FFG 
#1E".."..",,*V$W #1\^\j\j #1J&1R^^ &1PRP^P^ &1 &1r6   r$   c                     [        U S5      (       aO  [        U R                  S5      (       a4  U R                  R                  (       a  U R                  R                  SS.$ S[        U 5      S.$ )a!  
Encode a pdfplumber.pdf.PDF object into a dictionary.

If the PDF has an associated file path, returns the path. Otherwise, serializes
the PDF content into bytes.

Args:
    pdf (pdfplumber.pdf.PDF): A pdfplumber PDF object.

Returns:
    dict: A dictionary with "path" or "bytes" field.
r   nameNr9   )hasattrr   r   r"   )r   s    r!   rB   rB     sP     sH'#**f"="=#**//

$77 |C'899r6   )%rD   dataclassesr   r   ior   pathlibr   typingr   r   r	   r
   r   r   pyarrowro    r   download.download_configr   tabler   utils.file_utilsr   r   utils.py_utilsr   r   r<   rf   r   r,   r"   r$   r   rB   r   r6   r!   <module>r      sy    	 (   F F   5  3 C %!* !u ! f1 f1 f1R:3 : :r6   