
    ip                     p    S SK Jr  S SKJrJrJrJr  S SKJr  S SK	J
r
  \(       a  S SKJr   " S S\5      rg)	    )Path)TYPE_CHECKINGDictListOptional)
BaseReader)Document)Tagc                   ~   ^  \ rS rSrSr  SS\S\SS4U 4S jjjr SS\S	\	\
   S\\   4S
 jjrSSS\4S jrSrU =r$ )HTMLTagReader   z
Read HTML files and extract text from a specific tag with BeautifulSoup.

By default, reads the text from the ``<section>`` tag.
tagignore_no_idreturnNc                 :   > Xl         X l        [        TU ]  5         g N)_tag_ignore_no_idsuper__init__)selfr   r   	__class__s      \/home/james-whalen/.local/lib/python3.13/site-packages/llama_index/readers/file/html/base.pyr   HTMLTagReader.__init__   s    
 	)    file
extra_infoc                      SSK Jn  [        USS9 nU" US5      nS S S 5        WR	                  U R
                  5      n/ nU H  nUR                  S5      n	U R                  U5      n
U R                  (       a	  U	(       d  M?  U R
                  U	[        U5      S.nUR                  U=(       d    0 5        [        U
US	9nUR                  U5        M     U$ ! [         a    [        S5      ef = f! , (       d  f       N= f)
Nr   )BeautifulSoup#bs4 is required to read HTML files.zutf-8)encodingzhtml.parserid)r   tag_id	file_path)textmetadata)bs4r   ImportErroropenfind_allr   get_extract_text_from_tagr   strupdater	   append)r   r   r   r   	html_filesouptagsdocsr   r#   tag_textr&   docs                r   	load_dataHTMLTagReader.load_data   s    	E) $)Y M:D * }}TYY'CWWT]F2237H!!& yy  YH
 OOJ,"-!C KK% & 7  	ECDD	E *)s   C 
C/C,/
C=r
   c                     SSK Jn  / nUR                   H  n[	        XB5      (       a8  UR                  5       (       a!  UR                  UR                  5       5        MI  MK  UR                  U R                  :X  a  Mg  UR                  UR                  5       R                  5       5        M     SR                  U5      $ ! [         a    [        S5      ef = f)Nr   )NavigableStringr    
)r'   r9   r(   children
isinstancestripr/   namer   get_textjoin)r   r   r9   textselems        r   r,   $HTMLTagReader._extract_text_from_tag>   s    	E+ LLD$00::<<LL.  dii'T]]_2245 ! yy  	ECDD	Es   B? ?C)r   r   )sectionFr   )__name__
__module____qualname____firstlineno____doc__r-   boolr   r   r   r   r   r	   r6   r,   __static_attributes____classcell__)r   s   @r   r   r      sy     "  
	  8<  &.tn 	h D %  C    r   r   N)pathlibr   typingr   r   r   r   llama_index.core.readers.baser   llama_index.core.schemar	   r'   r
   r    r   r   <module>rR      s(     6 6 4 ,B J B r   