
    i7                     $   S r SSKrSSKrSSKrSSKrSSKJrJr  SSKJr  SSK	J
r
  SSKJrJrJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  \R:                  " \R<                  SS9  \R>                  " \ 5      r! " S S\5      r"g)zc
Enhanced Slides parser.

Contains parsers for .pptx files with comprehensive content extraction.

    N)ThreadPoolExecutoras_completed)datetime)Path)AnyDictListOptionalUnion)AbstractFileSystem)
BaseReader)Document)BaseLLM)Settings   )SlideContentExtractorz)%(asctime)s - %(levelname)s - %(message)s)levelformatc                      \ rS rSrSr      SS\S\S\\   S\S\S	\S
S4S jjr	  SS\
\\4   S\\   S\\   S
\\   4S jjr   SS\S\S\S\\   S
\\\4   4
S jjrS\S\S\S\S\S\S
\4S jr SS\S\\   S
\\\4   4S jjrSrg)
PptxReader    z
Enhanced PowerPoint parser.

Extract text, tables, charts, speaker notes, and optionally caption images.
Supports multithreaded processing and LLM-based content consolidation.
Always returns one Document per slide.
Nextract_imagescontext_consolidation_with_llmllm
batch_sizenum_workersraise_on_errorreturnc                     U(       a  Uc  [         R                  nXl        X l        X0l        X@l        XPl        X`l        [        U R                  U R                  U R                  S9U l        X`l        g)a  
Initialize enhanced PptxReader.

Args:
    extract_images: Whether to extract and caption images
    context_consolidation_with_llm: Whether to use LLM for contextual content consolidation
    llm: LLM instance for content consolidation (optional)
    batch_size: Number of slides to process in parallel batches
    num_workers: Number of worker threads (0 for sequential processing)

N)r   r   r   )	r   r   r   r   r   r   r   r   content_extractor)selfr   r   r   r   r   r   s          ^/home/james-whalen/.local/lib/python3.13/site-packages/llama_index/readers/file/slides/base.py__init__PptxReader.__init__)   sf    * *ck,,C -.L+$&,!6..+/+N+N"

 -    file
extra_infofsc                    [         R                  SU 35        [        U5      nU R                  UU R                  U R
                  US9nUS   (       d1  U R                  (       d   [         R                  SU SUS    35        / $ US   (       d%  U R                  (       a  [        SU SUS    35      e/ n[        US   S   S	S
9 GH   u  px[        U5      UUR                  SS5      UR                  S/ 5      UR                  S/ 5      UR                  S/ 5      UR                  S/ 5      UR                  SS5      UR                  S/ 5      UR                  S/ 5      S.
n	U(       a  U	R                  U5        UR                  [        US   U	[        U	R                  5       5      [        U	R                  5       5      S95        GM     [         R                  S[!        U5       SU 35        U$ )z
Parse PowerPoint file with enhanced content extraction.

Args:
    file: Path to the PowerPoint file
    extra_info: Additional metadata to include
    fs: File system to use for reading

Returns:
    List of Documents (one per slide)

zLoading PPTX file: )	file_pathr   r   r(   successzFailed to extract data from : errorsdataslidesr   )starttitle extraction_errorsextraction_warningstableschartsnotesimagestext_sections)
r*   
page_labelr1   r3   r4   r5   r6   r7   r8   r9   content)textmetadataexcluded_embed_metadata_keysexcluded_llm_metadata_keyszSuccessfully loaded  slides from )loggerdebugstrextract_with_validationr   r   r   error
ValueError	enumerategetupdateappendr   listkeyslen)
r!   r&   r'   r(   file_path_strresultdocsislider=   s
             r"   	load_dataPptxReader.load_dataO   s   $ 	*4&12D	 --#..+/+N+N	 . 
 i )<)<LL.}oRx@P?QR I	"t':':.}oRx@P?QR 
 !&.":!DHA !Y7B/%*YY/BB%G',yy1F'K))Hb1))Hb17B/))Hb1!&?B!?H 
+KKy)%15 2 04 0	# E< 	+CI;mD6JKr%   r*   c                 0   SS/ / 0 S.nU R                  X5      nUR                  SS5      (       d  UR                  S/ 5      US'   U$ UR                  S5      nUc  US   R                  S5        U$ [        U5      R                  n[
        R                  SU 35         [        R                  " 5       n	[        UR                  5      n
[
        R                  S	U
 S
U 35        / n[        SXR                  5       Vs/ s H  nU[        XR                  -   U
5      4PM     nnU R                  (       a  U R                  S:  a  [        U R                  S9 nU VVs0 s H*  u  nnUR!                  U R"                  UUUUUU5      UU4_M,     nnn[%        U5       HB  nUU   u  nn UR'                  5       nUR)                  U5        [0        R2                  " 5         MD     SSS5        OFU H@  u  nnUR)                  U R#                  UUUUUU5      5        [0        R2                  " 5         MB     [        R                  " 5       U	-
  R5                  5       nU
[        U5      [7        S U 5       5      UU(       a  SO![8        R:                  R=                  U5      S-  S.nUR?                  SU[A        US S9U
U[        R                  " 5       RC                  5       UUS.S./ US.5        [
        R                  SU SUS    SUS   S  S!35        A[0        R2                  " 5         U$ s  snf s  snnf ! [*         a`  n[
        R-                  SUS-    SU SU 35        [        UU5       H$  nUR                  US-   [/        U5      SS.5        M&      SnAGNSnAff = f! [0        R2                  " 5         f = f! , (       d  f       GN= f! [*         a<  nS"U SU 3n[
        R-                  U5        US   R                  U5         SnAU$ SnAff = f)#zRExtract content from PowerPoint file with validation and multithreaded processing.FN)r+   r.   r-   warningsstatsvalidr-   presentationz1Failed to get presentation object from validationzProcessing file: zProcessing r@   r   )max_workerszBatch r   -z	 failed: Tslide_numberrE   partial_extractionc              3      #    U  H5  nUR                  S 5      (       d  UR                  S5      (       d  M1  Sv   M7     g7f)rE   r3   r   NrH   ).0ss     r"   	<genexpr>5PptxReader.extract_with_validation.<locals>.<genexpr>   s2      $(uuW~~/B)C A(s   0?	?i   )total_slidesprocessed_slidestotal_errorsprocessing_time_secondsfile_size_mbc                 &    U R                  SS5      $ )Nr]   r   r`   )rb   s    r"   <lambda>4PptxReader.extract_with_validation.<locals>.<lambda>	  s    quu^Q7Or%   )key)re   r*   processing_timestampr   r   )filenamer/   r=   )r+   r.   r-   rW   zSuccessfully processed r,   rf   z slides in rh   z.2frb   zCritical error processing )"_validate_filerH   rJ   r   namerA   rB   r   nowrM   r/   ranger   minr   r   submit_process_batchr   rO   extend	ExceptionrE   rC   gccollecttotal_secondssumospathgetsizerI   sorted	isoformat)r!   r*   r   r   r(   rO   
validationrY   ro   
start_timere   slides_datarQ   batchesexecutorr0   endfuturesfutbatch_resultseidxprocessing_timerW   	error_msgs                            r"   rD   "PptxReader.extract_with_validation   s)    "
 ((7
~~gu--)~~h;F8M "~~n58##$WXM	?''(
34l	/!J|223LLL;|nM(LM !#K
 q,@@A COO+\:;@   D$4$4q$8'D4D4DE +2 +2JE3 ! //(!$*: "3<( +2    ,G4%,S\
s),/JJLM'..}= JJL!  5 FE@ #*JE3&&++(!$*:	 JJL #*  (||~
:IIKO ,$'$4 # $($ !
 ,;%'RWW__Y-G;-W
E MM#$,"('-O# -9)24<LLN4L4L4N.<>\% !"!* LL)(2e<N6O5P Q56s;1> JJL K"  ) 	""LL6%!)AcU)A3)OP',UC'8 + 2 28;a14Q>B%&!" (9	" JJL; FEr  	/4XJbDILL#8##I..	/s    AO ?$L+#6O N=1L0N=*!L6N=$EO +O 0N=6
N  ANN#N  N##N::N==
OO 
P1PPrY   r0   r   ro   c                    [         R                  " 5       R                  n[        R	                  SU SUS-    SU 35        / n[        X#5       H  n	 UR                  U	   n
U R                  R                  U
U	S-   [        U5      S9nUR                  U5        UR                  S5      (       a2  [        R                  SU SU	S-    SUR                  S5       35        M  M     [        R	                  SU SUS-    SU 35        U$ ! [         aN  n[        R                  SU S	U	S-    S
U 35        UR                  U	S-   [        U5      SS.5         SnAGM  SnAff = f)zs
Process slides in the range [start, end) and return their extracted data.
Runs in the context of a worker thread.
[z] Starting batch r   r[   )rR   r]   ro   r3   z] Slide z had extraction errors: z] Error on slide r,   Tr\   Nz] Finished batch )	threadingcurrent_threadrq   rA   rB   rs   r/   r    extract_slide_safer   rJ   rH   warningrx   rC   )r!   rY   r0   r   ro   r   r   thread_name
batch_datar   rR   
slide_datar   s                r"   rv   PptxReader._process_batch(  sw     ..055q%6uqyk3%HI
$C$++C0!33FF!$q!(^ G 

 !!*->>"566NNK=q	9QR\R`R`atRuQvw 7 %, 	q%6uqyk3%HI  ;-/@q	A3OP!!(+a!$Q.2 s   BD
E AEE c                    SSK Jn  SSKnS/ / SS.nUR                  5       R	                  S5      (       d  US   R                  S5        U(       dB  [        R                  R                  U5      (       d  S	US
'   US   R                  SU 35        U$  U(       a@  UR                  U5       nU" UR                  UR                  5       5      5      nSSS5        OU" U5      n[        WR                  5      nUS:X  a  US   R                  S5        OUS:  a  US   R                  SU S35        XuS'   U$ ! , (       d  f       Nb= f! [         a'  n	S	US
'   US   R                  SU	 35         Sn	A	U$ Sn	A	ff = f)zXValidate that the file exists, and can be opened. Returns presentation object for reuse.r   )PresentationNT)rX   r-   rV   rY   )z.pptxz.pptrV   z/File extension not typical for PowerPoint filesFrX   r-   zFile not found: zPresentation contains no slidesi  zLarge presentation: z slidesrY   z Cannot open as PowerPoint file: )pptxr   iolowerendswithrJ   r}   r~   existsopenBytesIOreadrM   r/   rx   )
r!   r*   r(   r   r   r   frY   countr   s
             r"   rp   PptxReader._validate_fileR  s    	&  	&

  ))*;<<z"))A
 77>>),,&+
7#8$++.>yk,JK!!	PWWY'1#/

16680D#EL ('  ,I6++,Ez:&--.OP:&--0DUG7.ST *6~& % ('  	P"'Jwx ''*J1#(NOO		Ps1   E ,&D7A#E 7
EE 
E9E44E9)r   r    r   r   r   r   r   )FFN
      F)NN)TFN)N)__name__
__module____qualname____firstlineno____doc__boolr
   r   intr#   r   rC   r   r   r   r	   r   rS   r   rD   rK   rv   rp   __static_attributes__ r%   r"   r   r       s     %/4!%$$-$- )-$- g	$-
 $- $- $- 
$-R &*+/	HCIH TNH '(	H
 
hHZ  $/4+/MM M )-	M
 '(M 
c3hM^(( ( 	(
 ( ( )-( 
(V BF00"*+=">0	c3h0 0r%   r   )#r   ry   loggingr}   r   concurrent.futuresr   r   r   pathlibr   typingr   r   r	   r
   r   fsspecr   llama_index.core.readers.baser   llama_index.core.schemar   llama_index.core.base.llms.baser   llama_index.corer   r    r   basicConfigINFO	getLoggerr   rA   r   r   r%   r"   <module>r      sq    
  	  ?   3 3 % 4 , 3 % 4   
,,6 
		8	$b br%   