
    D_i                        S r SSKrSSKrSSKJr  SSKJrJr  \R                  " \	5      r
SrSrSSR                  \ V s/ s H  n \R                  U 5      S	-   PM     sn 5      -   S
-   rSSR                  \ V s/ s H  n \R                  U 5      PM     sn 5      -   S
-   rS\ S\ S3rSS.S\S\\R&                  -  S-  S\\   4S jjrSSSSSS.S\S\S\S-  S\\R&                  -  S-  S\S\\   S\S\\   4S jjrgs  sn f s  sn f )z Utilities for working with HTML.    N)Sequence)urljoinurlparse)zjavascript:zmailto:#)z.cssz.jsz.icoz.pngz.jpgz.jpegz.gifz.svgz.csvz.bz2z.zipz.epubz(?!|z[\#'\"])z
href=[\"']z((?:z.)*?)[\#'\"]patternraw_htmlr
   returnc                p    U=(       d    [         n[        [        [        R                  " X5      5      5      $ )zExtract all links from a raw HTML string.

Args:
    raw_html: original HTML.
    pattern: Regex to use for extracting links from raw HTML.

Returns:
    all links
)DEFAULT_LINK_REGEXlistsetrefindall)r   r
   s     S/home/james-whalen/.local/lib/python3.13/site-packages/langchain_core/utils/html.pyfind_all_linksr   $   s(     ++GBJJw1233    T F)base_urlr
   prevent_outsideexclude_prefixescontinue_on_failureurlr   r   r   r   c                  ^ Ub  UOUn[        U5      n[        U5      n	[        XS9n
[        5       nU
 H  n [        U5      nUR                  S;   a  UnO`UR	                  S5      (       a  U	R                   SU 3nO8[        XR                  5      nUR                  (       a  USUR                   3-  nUR                  U5        M     / nU Hv  m[        U4S jU 5       5      (       a  M  U(       a?  [        T5      nUR                  UR                  :w  a  MM  TR	                  U5      (       d  Me  UR                  T5        Mx     U$ ! [         a*  nU(       a  [        R                  SX5         SnAGME  e SnAff = f)	a[  Extract all links from a raw HTML string and convert into absolute paths.

Args:
    raw_html: original HTML.
    url: the url of the HTML.
    base_url: the base URL to check for outside links against.
    pattern: Regex to use for extracting links from raw HTML.
    prevent_outside: If `True`, ignore external links which are not children
        of the base URL.
    exclude_prefixes: Exclude any URLs that start with one of these prefixes.
    continue_on_failure: If `True`, continue if parsing a specific link raises an
        exception. Otherwise, raise the exception.

Returns:
    sub links.
Nr	   >   httphttpsz//:?z-Unable to load link %s. Raised exception:

%sc              3   F   >#    U  H  nTR                  U5      v   M     g 7f)N)
startswith).0exclude_prefixpaths     r   	<genexpr>$extract_sub_links.<locals>.<genexpr>k   s     VEU>t~..EUs   !)r   r   r   schemer"   r   r%   queryadd	Exceptionloggerwarninganynetlocappend)r   r   r   r
   r   r   r   base_url_to_useparsed_base_url
parsed_url	all_linksabsolute_pathslinkparsed_linkabsolute_patheresultsparsed_pathr%   s                     @r   extract_sub_linksr<   4   sd   4 #+"6hCO/O#Jx9IUN	"4.K!!%66 $&&#-#4#4"5Qtf = '-=-= >$$!q):):(;%<<M}- , GVEUVVV"4.K%%););; ???33t   N3  	"Et 	s   BE
E?E:9E::E?)__doc__loggingr   collections.abcr   urllib.parser   r   	getLogger__name__r,   PREFIXES_TO_IGNORESUFFIXES_TO_IGNOREjoinescapeSUFFIXES_TO_IGNORE_REGEXPREFIXES_TO_IGNORE_REGEXr   strPatternr   r   boolr<   )ss   0r   <module>rM      s   &  	 $ *			8	$4   
CHH9KL9KAbiilZ/9KLMMPSS  
CHH,>?,>qbiil,>?@@3F  *+40H/IV  :>44"RZZ/$64	#Y4(  '+ &( %FF	F Dj	F
 2::$F F smF F 
#YF5 M @s   D
9D
