
    h2                        S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	J
r
Jr  S SKJr  S SKJrJr  SSKJr  SSKJr  SS	KJr  \" \5      r " S
 S5      r " S S\
5      r " S S\\
5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S  S!\5      r$ " S" S#5      r%g)$    N)ABCabstractmethod)Path)OptionalUnion   )config   )FileLock)
get_loggerc                   n    \ rS rSrSS\\   4S jjrS\S\4S jrS\S	\S\4S
 jr	SS\S	\S\4S jjr
Srg)ExtractManager   N	cache_dirc                     U(       a.  [         R                  R                  U[        R                  5      O[        R
                  U l        [        U l        g N)	ospathjoinr	   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr   s     P/home/james-whalen/.local/lib/python3.13/site-packages/datasets/utils/extract.py__init__ExtractManager.__init__   s6    FOBGGLLF$A$ABU[UsUs 	 #    r   returnc                     SSK Jn  [        R                  R	                  U5      n[        R                  R                  U R                  U" U5      5      $ )Nr
   )hash_url_to_filename)
file_utilsr"   r   r   abspathr   r   )r   r   r"   abs_paths       r   _get_output_pathExtractManager._get_output_path   s:    4 77??4(ww||D,,.B8.LMMr   output_pathforce_extractc                     U=(       dl    [         R                  R                  U5      (       + =(       aA    [         R                  R                  U5      =(       a    [         R                  " U5      (       + $ r   )r   r   isfileisdirlistdir)r   r(   r)   s      r   _do_extractExtractManager._do_extract%   sI     
{++lRWW]];5O5kTVT^T^_jTk0l	
r   
input_pathc                     U R                   R                  U5      nU(       d  U$ U R                  U5      nU R                  XB5      (       a  U R                   R	                  XU5        U$ r   )r   infer_extractor_formatr&   r.   extract)r   r0   r)   extractor_formatr(   s        r   r3   ExtractManager.extract*   s\    >>@@L++J7K77NN"":<LMr   )r   r   r   F)__name__
__module____qualname____firstlineno__r   strr   r&   boolr.   r3   __static_attributes__ r   r   r   r      sa    #(3- #NS NS N
s 
4 
D 

# d s  r   r   c                       \ rS rSr\\S\\\4   S\	4S j5       5       r
\\S\\\4   S\\\4   SS4S j5       5       rS	rg)
BaseExtractor4   r   r    c                     g r   r>   clsr   kwargss      r   is_extractableBaseExtractor.is_extractable5   s    GJr   r0   r(   Nc                     g r   r>   )r0   r(   s     r   r3   BaseExtractor.extract9   s    VYr   r>   )r7   r8   r9   r:   classmethodr   r   r   r;   r<   rF   staticmethodr3   r=   r>   r   r   r@   r@   4   sc    J%c	"2JJ  JYE$),Y5s;KYPTY  Yr   r@   c                   ~    \ rS rSr% / r\\   \S'   \S\	\
\4   S\4S j5       r\SS\	\
\4   S\S\4S jj5       rS	rg
)MagicNumberBaseExtractor>   magic_numbersr   magic_number_lengthc                 r    [        U S5       nUR                  U5      sS S S 5        $ ! , (       d  f       g = f)Nrb)openread)r   rP   fs      r   read_magic_number*MagicNumberBaseExtractor.read_magic_numberA   s'    $66-. s   (
6magic_numberr    c                    ^ T(       d.  [        S U R                   5       5      n U R                  X5      m[	        U4S jU R                   5       5      $ ! [         a     gf = f)Nc              3   8   #    U  H  n[        U5      v   M     g 7fr   )len).0cls_magic_numbers     r   	<genexpr>:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>I   s     %fTe@Pc*:&;&;Tes   Fc              3   F   >#    U  H  nTR                  U5      v   M     g 7fr   )
startswith)r\   r]   rX   s     r   r^   r_   N   s#     gUfAQ<**+;<<Ufs   !)maxrO   rV   OSErrorany)rD   r   rX   rP   s     ` r   rF   'MagicNumberBaseExtractor.is_extractableF   sa    "%%fTWTeTe%f"f"44TO gUXUfUfggg  s   A 
A#"A#r>   Nr   )r7   r8   r9   r:   rO   listbytes__annotations__rK   r   r   r;   intrV   rJ   r<   rF   r=   r>   r   r   rM   rM   >   sq    !#M4;#/dCi 0 /s / / h%c	"2 h% hRV h hr   rM   c                   ~    \ rS rSr\S\\\4   S\4S j5       r	\
S 5       r\
S\\\4   S\\\4   SS4S	 j5       rS
rg)TarExtractorQ   r   r    c                 .    [         R                  " U5      $ r   )tarfile
is_tarfilerC   s      r   rF   TarExtractor.is_extractableR   s    !!$''r   c              #     ^^#    S[         S[         4S jmS[         S[         S[        4U4S jjmS[         S[        4UU4S jjnT" U5      nU  H  nT" UR                  U5      (       a%  [        R	                  SUR                   S35        M@  UR                  5       (       a>  U" XC5      (       a1  [        R	                  SUR                   S	UR                   35        M  UR                  5       (       a>  U" XC5      (       a1  [        R	                  SUR                   S
UR                   35        M  Uv   M     g7f)a  
Fix for CVE-2007-4559
Desc:
    Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
    module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
    sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
From: https://stackoverflow.com/a/10077309
r   r    c                 z    [         R                  R                  [         R                  R                  U 5      5      $ r   )r   r   realpathr$   )r   s    r   resolved*TarExtractor.safemembers.<locals>.resolvedb   s$    77##BGGOOD$9::r   basec                 v   > T" [         R                  R                  X5      5      R                  U5      (       + $ r   )r   r   r   ra   )r   rw   ru   s     r   badpath)TarExtractor.safemembers.<locals>.badpathe   s)    T 89DDTJJJr   c                    > T" [         R                  R                  U[         R                  R                  U R                  5      5      5      nT" U R
                  US9$ )N)rw   )r   r   r   dirnamenamelinkname)inforw   tipry   ru   s      r   badlink)TarExtractor.safemembers.<locals>.badlinki   s>    277<<bggoodii.HIJC4==s33r   zExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r;   r<   r}   loggererrorissymr~   islnk)membersr(   r   rw   finfory   ru   s        @@r   safemembersTarExtractor.safemembersV   s    	;3 	;3 	;	K# 	KS 	KT 	K	4 	4 	4 	4
 $Euzz4((~ejj\9STU75#7#7~ejj\9QRWR`R`Qabc75#7#7~ejj\9STYTbTbScde s   D:D>r0   r(   Nc                     [         R                  " USS9  [        R                  " U 5      nUR	                  U[
        R                  X!5      S9  UR                  5         g )NTexist_ok)r   )r   makedirsro   rS   
extractallrl   r   close)r0   r(   tar_files      r   r3   TarExtractor.extractz   sF    
K$/<<
+K1I1I(1`ar   r>   )r7   r8   r9   r:   rJ   r   r   r;   r<   rF   rK   r   r3   r=   r>   r   r   rl   rl   Q   s|    (%c	"2 ( ( ( ! !F E$), 5s;K PT  r   rl   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	GzipExtractor   s   r0   r(   r    Nc                     [         R                  " U S5       n[        US5       n[        R                  " X#5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = fNrR   wb)gziprS   shutilcopyfileobj)r0   r(   	gzip_fileextracted_files       r   r3   GzipExtractor.extract   sI    YYz4(Ik4(N""9= ) )((( )(!   AAA
A	A
A,r>   r7   r8   r9   r:   rO   rK   r   r   r;   r3   r=   r>   r   r   r   r      sC     MM>E$), >5s;K >PT > >r   r   c                      ^  \ rS rSr/ SQr\SS\\\4   S\	S\
4U 4S jjj5       r\S\\\4   S\\\4   SS	4S
 j5       rSrU =r$ )ZipExtractor   )s   PKs   PKs   PKr   rX   r    c                 4  > [         TU ]  XS9(       a  g SSKJnJnJnJnJnJnJ	n	J
n
JnJn  [        US5       nU	" U5      nU(       a  X   S:X  a  X   S:X  a  X   S:X  a
   S S S 5        gX   X   :X  az  UR                  X   5        UR                  5       X   :X  aQ  X   U
:  aI  UR!                  U
5      n[#        U5      U
:X  a)  [$        R&                  " X5      nUU   U:X  a
   S S S 5        gS S S 5        g! , (       d  f       g= f! [(         a     gf = f)NrX   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirrR   F)superrF   zipfiler   r   r   r   r   r   r   r   r   r   rS   seektellrT   r[   structunpack	Exception)rD   r   rX   r   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__s                    r   rF   ZipExtractor.is_extractable   s   7!$!B	   dD!R$R1Q66;LPQ;QV\VimnVn#	 "!
  1V5LL 34779(;;@QUc@c#%77>#:D"4yN:*0--8H*O#*=#9=M#M+/ "!!  "!  		sA   $D
 )C9#D
 ,A;C9'D
 0D
 9
DD
 D
 

DDr0   r(   Nc                     [         R                  " USS9  [        R                  " U S5       nUR	                  U5        UR                  5         S S S 5        g ! , (       d  f       g = f)NTr   r)r   r   r   ZipFiler   r   )r0   r(   zip_files      r   r3   ZipExtractor.extract   sD    
K$/__Z-,NN .--s   "A
A&r>   rf   )r7   r8   r9   r:   rO   rJ   r   r   r;   rh   r<   rF   rK   r3   r=   __classcell__)r   s   @r   r   r      sz    M "%c	"2 "% "RV " "H E$), 5s;K PT  r   r   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	XzExtractor   s   7zXZ r0   r(   r    Nc                     [         R                  " U 5       n[        US5       n[        R                  " X#5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr   )lzmarS   r   r   r0   r(   compressed_filer   s       r   r3   XzExtractor.extract   sG    YYz"ok4(N""?C ) #"(( #"s!   AAA
A	A
A+r>   r   r>   r   r   r   r      sI    01MDE$), D5s;K DPT D Dr   r   c                   P    \ rS rSrSS/r\S\\\4   S\\\4   SS4S j5       r	S	r
g)
RarExtractor   s   Rar! s   Rar! r0   r(   r    Nc                     [         R                  (       d  [        S5      eSS Kn[        R
                  " USS9  UR                  U 5      nUR                  U5        UR                  5         g )NzPlease pip install rarfiler   Tr   )	r	   RARFILE_AVAILABLEImportErrorrarfiler   r   RarFiler   r   )r0   r(   r   rfs       r   r3   RarExtractor.extract   sK    '':;;
K$/__Z(
k"

r   r>   r   r>   r   r   r   r      sG    (*ABME$), 5s;K PT  r   r   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	ZstdExtractor   s   (/r0   r(   r    Nc                 &   [         R                  (       d  [        S5      eSS KnUR	                  5       n[        U S5       n[        US5       nUR                  XE5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)NzPlease pip install zstandardr   rR   r   )r	   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrS   copy_stream)r0   r(   zstddctxifhofhs         r   r3   ZstdExtractor.extract   sg    ))<== $$&*d#sDd,CsS& -D##,C,C##s$   BA1 B1
A?	;B
Br>   r   r>   r   r   r   r      sD    ()M'E$), '5s;K 'PT ' 'r   r   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	Bzip2Extractor   s   BZhr0   r(   r    Nc                     [         R                  " U S5       n[        US5       n[        R                  " X#5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = fr   )bz2rS   r   r   r   s       r   r3   Bzip2Extractor.extract   sI    XXj$'?k4(N""?C ) ('(( ('r   r>   r   r>   r   r   r   r      sI    $%MDE$), D5s;K DPT D Dr   r   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	SevenZipExtractor   s   7z'r0   r(   r    Nc                     [         R                  (       d  [        S5      eSS Kn[        R
                  " USS9  UR                  U S5       nUR                  U5        S S S 5        g ! , (       d  f       g = f)NzPlease pip install py7zrr   Tr   r   )r	   PY7ZR_AVAILABLEr   py7zrr   r   SevenZipFiler   )r0   r(   r   archives       r   r3   SevenZipExtractor.extract   sT    %%899
K$/
C0G{+ 100s   A''
A5r>   r   r>   r   r   r   r      sD    01M,E$), ,5s;K ,PT , ,r   r   c                   N    \ rS rSrS/r\S\\\4   S\\\4   SS4S j5       r	Sr
g)	Lz4Extractor   s   "Mr0   r(   r    Nc                 0   [         R                  (       d  [        S5      eSS KnUR                  R                  U S5       n[        US5       n[        R                  " X45        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)NzPlease pip install lz4r   rR   r   )r	   LZ4_AVAILABLEr   	lz4.frameframerS   r   r   )r0   r(   lz4r   r   s        r   r3   Lz4Extractor.extract  se    ##677YY^^J-k4(N""?C ) .-(( .-s$   BA6%B6
B	 B
Br>   r   r>   r   r   r   r      sI    ()MDE$), D5s;K DPT D Dr   r   c            
          \ rS rSr% \\\\\\	\
\\S.	r\\\\   4   \S'   \S 5       r\S\\\4   S\4S j5       r\SS\\\4   S\S	\4S
 jj5       r\S\\\4   S	\\   4S j5       r\S\\\4   S\\\4   S\S	S4S j5       rSrg)r   i  )	tarr   zipxzrarr   r   7zr   
extractorsc                 V    [        S U R                  R                  5        5       5      $ )Nc              3      #    U  H;  n[        U[        5      (       d  M  UR                    H  n[        U5      v   M     M=     g 7fr   )
issubclassrM   rO   r[   )r\   r   extractor_magic_numbers      r   r^   9Extractor._get_magic_number_max_length.<locals>.<genexpr>  sI      
4	)%=> ( +4*A*A& &'' +B (4s
   A&A)rb   r   values)rD   s    r   _get_magic_number_max_length&Extractor._get_magic_number_max_length  s)     
 ^^224
 
 	
r   r   rP   c                 J     [         R                  XS9$ ! [         a     gf = f)N)rP   r   )rM   rV   rc   )r   rP   s     r   _read_magic_numberExtractor._read_magic_number$  s-    	+==d=ll 		s    
""return_extractorr    c                     [         R                  " S[        S9  U R                  U5      nU(       a  U(       d  S$ SU R                  U   4$ U(       d  S$ S$ )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.)categoryTF)FN)warningswarnFutureWarningr2   r   )rD   r   r  r4   s       r   rF   Extractor.is_extractable+  sU    4"	

 55d;/4]dCNNK[<\5]],u?-?r   c                     U R                  5       nU R                  X5      nU R                  R                  5        H  u  pEUR	                  XS9(       d  M  Us  $    g )Nr   )r  r
  r   itemsrF   )rD   r   magic_number_max_lengthrX   r4   r   s         r   r2    Extractor.infer_extractor_format7  sW    "%"B"B"D--dL+.>>+?+?+A''''HH'' ,Br   r0   r(   r4   Nc                 b   [         R                  " [         R                  R                  U5      SS9  [	        [        U5      R                  S5      5      n[        U5         [        R                  " USS9  U R                  U   nUR                  X5      sS S S 5        $ ! , (       d  f       g = f)NTr   z.lock)ignore_errors)r   r   r   r|   r;   r   with_suffixr   r   rmtreer   r3   )rD   r0   r(   r4   	lock_pathr   s         r   r3   Extractor.extract?  sx     	BGGOOK04@[)55g>?	i MM+T:'78I$$Z= !  s   !5B  
B.r>   r6   ) r7   r8   r9   r:   rl   r   r   r   r   r   r   r   r   r   dictr;   typer@   ri   rJ   r  rK   r   r   rj   r
  r<   rF   r   r2   r3   r=   r>   r   r   r   r     s.    
2JS$}--. 
 
 
 tSy!1    	@%c	"2 	@d 	@W[ 	@ 	@ (%c	*: (x} ( ( >$)$> 49%> 	>
 
> >r   r   )&r   r   r   r   r   r   ro   r  r   abcr   r   pathlibr   typingr   r    r	   	_filelockr   loggingr   r7   r   r   r@   rM   rl   r   r   r   r   r   r   r   r   r   r>   r   r   <module>r%     s    
   	      #  "    
H	 <ZC Zh}c h&.= .b>, >1+ 1hD* D+ ', 'D- D,0 ,D+ D?> ?>r   