
    i                        S r SSKrSSKrSSKrSSKrSSKJrJr  SSKJ	r	J
r
JrJrJr  SSKJrJr  SSKJr  SSKJr  SSKJrJr  SS	KJr  \R4                  " \5      rS
r " S S\5      r " S S\5      r " S S\5      r g)z6
Docs parser.

Contains parsers for docx, pdf files.

    N)PathPurePosixPath)AnyDictListOptionalUnion)retrystop_after_attempt)AbstractFileSystem)
BaseReader)get_default_fsis_default_fsDocument   c                       \ rS rSrSrSS\\   SS4S jjr\" \	" \
5      S9  SS\\\4   S	\\   S
\\   S\\   4S jj5       rSrg)	PDFReader   zPDF parser.return_full_documentreturnNc                     Xl         g)z
Initialize PDFReader.
Nr   )selfr   s     \/home/james-whalen/.local/lib/python3.13/site-packages/llama_index/readers/file/docs/base.py__init__PDFReader.__init__   s
     %9!    )stopfile
extra_infofsc           	        ^ U=(       d
    [        5       n[        U5      (       a  [        O[        n[	        U[        [        45      (       d  U" U5      n SSKnUR                  [        U5      S5       n[        U5      (       a  UO#[        R                  " UR                  5       5      nUR                  U5      m[        TR                  5      n/ n	U R                  (       a_  SUR                   0n
Ub  U
R#                  U5        SR%                  U4S j['        U5       5       5      nU	R)                  [+        XS95        Ox['        U5       Hi  nTR                  U   R-                  5       nTR.                  U   nXR                   S	.n
Ub  U
R#                  U5        U	R)                  [+        XS95        Mk     U	sSSS5        $ ! [         a    [        S5      ef = f! , (       d  f       g= f)
Parse file.r   Nz8pypdf is required to read PDF files: `pip install pypdf`rb	file_name
c              3   ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7fN)pagesextract_text).0pagepdfs     r   	<genexpr>&PDFReader.load_data.<locals>.<genexpr>O   s)      !?OtCIIdO0022?Os   *-textmetadata)
page_labelr&   )r   r   r   r   
isinstancepypdfImportErroropenstrioBytesIOread	PdfReaderlenr*   r   nameupdatejoinrangeappendr   r+   page_labels)r   r    r!   r"   _Pathr6   fpstream	num_pagesdocsr3   r2   r-   	page_textr4   r.   s                  @r   	load_dataPDFReader.load_data%   s    #>#%b))}$} 566;D	 WWSY% ),,R"**RWWY2GF //&)C CIIID (('3)OOJ/ yy !?DY?O!  H$BC "),D #		$ < < >I!$!6J.8yyQH!- 
3KKi KL - S &%  	J 	
 &%s   G 4EGG
G-r   )FNN)__name__
__module____qualname____firstlineno____doc__r   boolr   r
   r   RETRY_TIMESr	   r   r   r   r   r   r   rK   __static_attributes__ r   r   r   r      s    9Xd^ 9 9 , &*+/	<D-'(< TN< '(	<
 
h<<r   r   c            
       L    \ rS rSrSr  S
S\S\\   S\\   S\	\
   4S jjrS	rg)
DocxReaderg   zDocx parser.Nr    r!   r"   r   c                    [        U[        5      (       d  [        U5      n SSKnU(       a5  UR	                  [        U5      5       nUR                  U5      nSSS5        OUR                  U5      nSUR                  0nUb  UR                  U5        [        WU=(       d    0 S9/$ ! [         a    [        S5      ef = f! , (       d  f       N]= f)r$   r   NzIdocx2txt is required to read Microsoft Word files: `pip install docx2txt`r&   r1   )
r5   r   docx2txtr7   r8   r9   processr?   r@   r   )r   r    r!   r"   r[   fr2   r3   s           r   rK   DocxReader.load_dataj   s     $%%:D	 T#q''* $# ##D)D+!OOJ'dX^<==  	) 	 $#s   B) C)B?
CrV   rM   )rN   rO   rP   rQ   rR   r   r   r   r   r   r   rK   rU   rV   r   r   rX   rX   g   sK    
 &*+/	>> TN> '(	>
 
h> >r   rX   c            
         ^  \ rS rSrSrS\S\SS4U 4S jjr  SS\S	\\	   S
\\
   S\\   4S jjrS\\   S\4S jrS\\   S\\   4S jr SS\S	\\	   S\4S jjrS\4S jrS\S\\   S\4S jrS\S\4S jrS\S\S\4S jrSrU =r$ )	HWPReader   zHwp Parser.argskwargsr   Nc                    > [         TU ]  " U0 UD6  SU l        SU l        [	        S5      U l        SU l        S/U l        SU l        g )N
FileHeaderzHwpSummaryInformationSectionBodyTextC    )	superr   FILE_HEADER_SECTIONHWP_SUMMARY_SECTIONr>   SECTION_NAME_LENGTHBODYTEXT_SECTIONHWP_TEXT_TAGSr2   )r   rb   rc   	__class__s      r   r   HWPReader.__init__   sJ    $)&)#/ #> #&y>  * T	r   r    r!   r"   c                 H   SSK nU(       a  [        R                  S5        [        U[        5      (       d  [	        U5      nUR                  U5      nUR                  5       nU R                  U5      SL a  [        S5      eU R                  XV5      nU R                  XrS9nU/$ )zy
Load data and extract table from Hwp file.

Args:
    file (Path): Path for the Hwp file.

Returns:
    List[Document]

r   Nzxfs was specified but HWPReader doesn't support loading from fsspec filesystems. Will load from local filesystem instead.FzNot Valid HwpFiler2   r!   )olefileloggerwarningr5   r   	OleFileIOlistdiris_valid	Exception	_get_text_text_to_document)	r   r    r!   r"   rt   	load_filefile_dirresult_textresults	            r   rK   HWPReader.load_data   s      	NNT
 $%%:D%%d+	$$&=="e+/00nnY9''['Pxr   dirsc                 F    U R                   /U;  a  gU R                  /U;   $ )NF)rk   rl   )r   r   s     r   ry   HWPReader.is_valid   s*    $$%T1(()T11r   c                     / nU HB  nUS   U R                   :X  d  M  UR                  [        US   U R                  S  5      5        MD     [	        U5       Vs/ s H  nS[        U5      -   PM     sn$ s  snf )Nr      zBodyText/Section)rn   rC   intrm   sortedr9   )r   r   mdxs        r   get_body_sectionsHWPReader.get_body_sections   sn    Att,,,QqT$":":"<=>?  6<AY?Y"SV+Y???s   A3r2   c                 &    [        X=(       d    0 S9$ )Nrs   r   )r   r2   r!   s      r   r|   HWPReader._text_to_document   s     T.>B??r   c                     U R                   $ r)   )r2   )r   s    r   get_textHWPReader.get_text   s    yyr   r}   	file_dirsc                     U R                  U5      nSnU H  nX@R                  X5      -  nUS-  nM     X@l        U R                  $ )Nri   r'   )r   get_text_from_sectionr2   )r   r}   r   sectionsr2   sections         r   r{   HWPReader._get_text   sN    )))4G..yBBDDLD   	yyr   c                 Z    UR                  S5      nUR                  5       nUS   S-  S:H  $ )Nre   $   r   )
openstreamr<   )r   r}   headerheader_datas       r   is_compressedHWPReader.is_compressed   s1    %%l3kkmB!#))r   r   c                    UR                  U5      nUR                  5       nU R                  U5      (       a  [        R                  " US5      OUn[        U5      nSnSnXv:  ar  [        R                  " SXW5      S   n	U	S-  n
U	S-	  S-    U	S-	  S-  nXR                  ;   a%  XWS	-   US	-   U-    nXR                  S
5      -  nUS-  nUS	U-   -  nXv:  a  Mr  U$ )Nir   ri   z<Ii  
      i     zutf-16r'   )
r   r<   r   zlib
decompressr>   structunpack_fromro   decode)r   r}   r   bodytextdataunpacked_datasizeir2   r   rec_typerec_lenrec_datas                r   r   HWPReader.get_text_from_section   s    ''0}} +/*<*<Y*G*GDOOD#&T 	 =!h''m?BF~Hr\U"|u,G---(QQA11WA h r   )rn   rk   rl   ro   rm   r2   rM   r)   )rN   rO   rP   rQ   rR   r   r   r   r   r   r   r   r   rK   r9   rS   ry   r   r|   r   r{   r   r   rU   __classcell__)rp   s   @r   r`   r`      s   c S T  &*+/	!! TN! '(	!
 
h!F2T#Y 24 2@d3i @DI @ 7;@@%-d^@	@
# 
3 49  *s *t *
s S S  r   r`   )!rR   r:   loggingr   r   pathlibr   r   typingr   r   r   r   r	   tenacityr
   r   fsspecr   llama_index.core.readers.baser   "llama_index.core.readers.file.baser   r   llama_index.core.schemar   	getLoggerrN   ru   rT   r   rX   r`   rV   r   r   <module>r      so    
    ' 3 3 . % 4 L ,			8	$H
 HV> >Bo
 or   