
    cCi+                         S r SSKrSSKJrJr  SSKJrJrJr  \" 5       (       a
  SSK	r	SSK	J
r
  \R                  " \5      r " S S\5      rS/rg)	z'
Feature extractor class for MarkupLM.
    N   )BatchFeatureFeatureExtractionMixin)is_bs4_availableloggingrequires_backends)BeautifulSoupc                   L   ^  \ rS rSrSrU 4S jrS rS rS rS\	4S jr
S	rU =r$ )
MarkupLMFeatureExtractor!   a[  
Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
strings.

This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
of the main methods. Users should refer to this superclass for more information regarding those methods.

c                 @   > [        U S/5        [        TU ]  " S0 UD6  g )Nbs4 )r   super__init__)selfkwargs	__class__s     r/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/markuplm/feature_extraction_markuplm.pyr   !MarkupLMFeatureExtractor.__init__+   s    $("6"    c           
        ^ / n/ nUR                   (       a  UOUR                  mTR                   Hy  nUR                  TR                   SS9nUR	                  TR                   5        UR	                  S[        U5      :X  a  SO[        U4S j[        US5       5       5      5        UmM{     UR                  5         UR                  5         X#4$ )NF)	recursive   r   c              3   :   >#    U  H  u  pUTL d  M  Uv   M     g 7f)Nr   ).0ischilds      r   	<genexpr>6MarkupLMFeatureExtractor.xpath_soup.<locals>.<genexpr>7   s      1e@VZ[_dZd!!@Vs   	)	nameparentparentsfind_allappendlennext	enumeratereverse)r   element
xpath_tagsxpath_subscriptsr#   siblingsr   s         @r   
xpath_soup#MarkupLMFeatureExtractor.xpath_soup/   s    
"<<W^^mmFuzzUCHejj)###h-'T1e	(TU@V1e-e E $ 	  "++r   c                    [        US5      n/ n/ n/ nUR                   H  n[        U[        R                  R
                  5      (       d  M.  [        UR                  5      [        R                  R                  La  M`  [        R                  " U5      R                  5       nU(       d  M  UR                  U5        U R                  U5      u  pUR                  U5        UR                  U	5        M     [        U5      [        U5      :w  a  [        S5      e[        U5      [        U5      :w  a  [        S5      eX4U4$ )Nzhtml.parserz3Number of doc strings and xtags does not correspondz3Number of doc strings and xsubs does not correspond)r	   descendants
isinstancer   r+   NavigableStringtyper#   Taghtmlunescapestripr&   r/   r'   
ValueError)
r   html_string	html_codeall_doc_stringsstring2xtag_seqstring2xsubs_seqr+   text_in_this_tagr,   r-   s
             r   get_three_from_single.MarkupLMFeatureExtractor.get_three_from_single>   s   !+}=	 ,,G'3;;#>#>??'s{{>#'==#9#?#?#A '&&'78/3w/G,
&&z2 ''(89 - 3#77RSS3'7#88RSS1AAAr   c                 d    Sn[        X5       H  u  pEUSU 3-  nUS:w  d  M  USU S3-  nM      U$ )N /r   [])zip)r   r,   r-   xpathtagnamesubss         r   construct_xpath(MarkupLMFeatureExtractor.construct_xpath[   sH     >MGq	]"Eqy1TF!$ ? r   returnc                    Sn[        U[        5      (       a  SnOD[        U[        [        45      (       a)  [	        U5      S:X  d  [        US   [        5      (       a  SnU(       d  [        S[        U5       S35      e[        U[        [        45      =(       a    [        US   [        5      nU(       d  U/n/ n/ nU Hs  nU R                  U5      u  pxn	UR                  U5        / n
[        XxU	5       H(  u  pnU R                  X5      nU
R                  U5        M*     UR                  U
5        Mu     XES.n[        USS9nU$ )	ad  
Main method to prepare for the model one or several HTML strings.

Args:
    html_strings (`str`, `list[str]`):
        The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **nodes** -- Nodes.
    - **xpaths** -- Corresponding xpaths.

Examples:

```python
>>> from transformers import MarkupLMFeatureExtractor

>>> page_name_1 = "page1.html"
>>> page_name_2 = "page2.html"
>>> page_name_3 = "page3.html"

>>> with open(page_name_1) as f:
...     single_html_string = f.read()

>>> feature_extractor = MarkupLMFeatureExtractor()

>>> # single example
>>> encoding = feature_extractor(single_html_string)
>>> print(encoding.keys())
>>> # dict_keys(['nodes', 'xpaths'])

>>> # batched example

>>> multi_html_strings = []

>>> with open(page_name_2) as f:
...     multi_html_strings.append(f.read())
>>> with open(page_name_3) as f:
...     multi_html_strings.append(f.read())

>>> encoding = feature_extractor(multi_html_strings)
>>> print(encoding.keys())
>>> # dict_keys(['nodes', 'xpaths'])
```FTr   zQHTML strings must of type `str`, `list[str]` (batch of examples), but is of type .)nodesxpathsN)datatensor_type)r3   strlisttupler'   r:   r5   rA   r&   rH   rL   r   )r   html_stringsvalid_strings
is_batchedrQ   rR   r;   r=   r>   r?   xpath_stringsnodetag_listsub_listxpath_stringrS   encoded_inputss                    r   __call__!MarkupLMFeatureExtractor.__call__c   sD   `  lC(( MtUm44< A%LOS)I)I $""&|"4!5Q8 
  tUm<c*\Z[_^aBb
(>L 'KAEA[A[\gAh>O.>LL)M,/Rb,c(#33HG$$\2 -d MM-( ( 1%4TBr   r   )__name__
__module____qualname____firstlineno____doc__r   r/   rA   rL   r   ra   __static_attributes____classcell__)r   s   @r   r   r   !   s1    #,B:T T Tr   r   )rg   r7   feature_extraction_utilsr   r   utilsr   r   r   r   r	   
get_loggerrc   loggerr   __all__r   r   r   <module>ro      sU     L A A ! 
		H	%V5 Vr &
&r   