
    h                         S SK r S SKrS SKJr  S SKJrJrJr  S SKrSSK	J
r
  SSKJrJr  SSKJr  SSK
JrJrJr  SS	KJr  S
SKJr  S
SKJr  SrSS\4S jjr " S S\5      r " S S\5      r " S S\5      rS/rg)    N)Path)AnyDictUnion   )util)BaseDefaultsLanguage)Doc)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
use_pyvic                    ^  U 4S jnU$ )Nc                 ,   > [        U R                  TS9$ )Nr   )VietnameseTokenizervocab)nlpr   s    P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/lang/vi/__init__.pyvietnamese_tokenizer_factoryAcreate_vietnamese_tokenizer.<locals>.vietnamese_tokenizer_factory   s    "399x@@     )r   r   s   ` r   create_vietnamese_tokenizerr      s    A ('r   c                       \ rS rSrSS\S\4S jjrS rS\S\	4S jr
S	 rS
 rS\\\4   4S jr0 4S\\\4   SS4S jjrS\4S jrS\SS 4S jrS\\\4   SS4S jrS\\\4   SS 4S jrSrg)r       r   r   c                     Xl         X l        U R                  (       a   SSKJn  X0l        g g ! [         a    Sn[	        U5      S ef = f)Nr   )ViTokenizerz`Pyvi not installed. Either set use_pyvi = False, or install it https://pypi.python.org/pypi/pyvi)r   r   pyvir#   ImportError)selfr   r   r#   msgs        r   __init__VietnameseTokenizer.__init__!   sQ    
 ==	1,#. 	 
  1F  "#&D01s	   - Ac                 >    [         U R                  U R                  44$ N)r   r   r   r&   s    r   
__reduce__VietnameseTokenizer.__reduce__0   s    "TZZ$???r   textreturnc                    U R                   (       a=  U R                  U5      n[        R                  " X!5      u  p#[	        U R
                  X#S9$ [        R                  " UR                  5       U5      u  p#[	        U R
                  X#S9$ )N)wordsspaces)r   pyvi_tokenizer   get_words_and_spacesr   r   split)r&   r/   r2   r3   s       r   __call__VietnameseTokenizer.__call__3   sg    ==&&t,E 55eBMEtzz>> 55djjlDIMEtzz>>r   c                 d   / SQnSnSnSnSnSn/ SQn/ n	U	R                  U5        U	R                  U5        U	R                  XT/5        U	R                  X7U/5        SS	R                  U	5      -   S
-   n	[        R                  " X[        R                  5      n
U
 Vs/ s H  oS   PM	     sn$ s  snf )zIModified from pyvi to preserve whitespace and skip unicode
normalization.)z==>z->z\.\.\.z>>z\d+([\.,_]\d+)+z2([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)z\w+://[^\s]+z\w+z[^\w\s])u
   [A-ZĐ]+\.zTp\.zMr\.zMrs\.zMs\.zDr\.zThS\.z(\s+||)r   )extendjoinrefindallUNICODE)r&   r/   specialsdigitemailwebwordnon_wordabbreviationspatternstokenstokens               r   pyvi_sylabelize_with_ws+VietnameseTokenizer.pyvi_sylabelize_with_ws?   s     5"E
 &!%$/0chhx0036HBJJ7&,-fUaf---s   B-c                    [        U5      S:X  a  / $ UR                  5       (       a  U/$ U R                  U5      n/ n/ n[        U5       He  u  pVUR                  5       (       a  M  UR	                  U5        UR	                  US:X  d  X%S-
     R                  5       (       d  SOX%S-
     5        Mg     U R
                  R
                  R                  R                  U R
                  R
                  R                  US5      /5      nUS   n/ n[        S[        US   5      5       H  nUS   U   S:X  a  X5   [        R                  ;  a  X5S-
     [        R                  ;  a|  X5   S   R                  5       (       db  X5S-
     S   R                  5       (       dE  X5   S   R                  5       (       a  X5S-
     S   R                  5       (       a  XdU   -   X5   -   nM  UR	                  U5        X5   nM     UR	                  U5        U$ )z3Modified from pyvi to preserve text and whitespace.r   r    FI_W)lenisspacerK   	enumerateappendr#   modelpredictsent2featuresrangestringpunctuationisdigitistitle)	r&   r/   segsr2   preceding_wsirJ   labelsrI   s	            r   r4   !VietnameseTokenizer.pyvi_tokenize]   s   t9>I\\^^6M++D1!$HA==??U###6!e)<)<)>)>BTa%[ ( !!--33;;))77uEF
 aq#fQi.)Aq	!%HF$6$66a%L(:(::++--!eQ//11!,,..uU|A7N7N7P7PQ/%(:e$ * 	er   c                     SU R                   0$ )Nr   r   r,   s    r   _get_configVietnameseTokenizer._get_config   s    DMM**r   configNc                 2    UR                  SS5      U l        g )Nr   F)getr   )r&   rd   s     r   _set_configVietnameseTokenizer._set_config   s    

:u5r   c                 B   ^  SU 4S j0n[         R                  " U/ 5      $ )Ncfgc                  L   > [         R                  " T R                  5       5      $ r+   )srsly
json_dumpsrb   r,   s   r   <lambda>.VietnameseTokenizer.to_bytes.<locals>.<lambda>   s    e&6&6t7G7G7I&Jr   )r   to_bytes)r&   kwargsserializerss   `  r   rp   VietnameseTokenizer.to_bytes   s    JK}}["--r   datac                 F   ^  SU 4S j0n[         R                  " X/ 5        T $ )Nrj   c                 N   > TR                  [        R                  " U 5      5      $ r+   )rg   rl   
json_loads)br&   s    r   rn   0VietnameseTokenizer.from_bytes.<locals>.<lambda>   s    $*:*:5;K;KA;N*Or   )r   
from_bytes)r&   rt   rq   deserializerss   `   r   rz   VietnameseTokenizer.from_bytes   s!     OPR0r   pathc                 p   ^  [         R                  " U5      nSU 4S j0n[         R                  " X/ 5        g )Nrj   c                 N   > [         R                  " U TR                  5       5      $ r+   )rl   
write_jsonrb   pr&   s    r   rn   -VietnameseTokenizer.to_disk.<locals>.<lambda>   s    (8(8D<L<L<N(Or   )r   ensure_pathto_diskr&   r}   rq   rr   s   `   r   r   VietnameseTokenizer.to_disk   s,    %OPT+r   c                 r   ^  [         R                  " U5      nSU 4S j0n[         R                  " X/ 5        T $ )Nrj   c                 N   > TR                  [        R                  " U 5      5      $ r+   )rg   rl   	read_jsonr   s    r   rn   /VietnameseTokenizer.from_disk.<locals>.<lambda>   s    (8(89K(Lr   )r   r   	from_diskr   s   `   r   r   VietnameseTokenizer.from_disk   s1    %LMt"-r   )r#   r   r   )F)__name__
__module____qualname____firstlineno__r   boolr(   r-   strr   r7   rK   r4   r   r   rb   rg   bytesrp   rz   r   r   r   r   __static_attributes__r   r   r   r   r       s    1e 1t 1@?S ?S ?.<"H+T#s(^ + 46 6$sCx. 6$ 6.E .u 3H 
,E#t), ,4 ,
eCI. =R r   r   c                   ,    \ rS rSr\" \5      r\r\	r
Srg)VietnameseDefaults   r   N)r   r   r   r   r   DEFAULT_CONFIGrd   r   lex_attr_gettersr   
stop_wordsr   r   r   r   r   r      s    !.1F Jr   r   c                       \ rS rSrSr\rSrg)
Vietnamese   vir   N)r   r   r   r   langr   Defaultsr   r   r   r   r   r      s    D!Hr   r   )T) r>   rX   pathlibr   typingr   r   r   rl   rN   r   languager	   r
   rI   r   r   r   r   r   r   	lex_attrsr   r   r   r   r   r   r   r   r   __all__r   r   r   <module>r      sr    	   # #   .  B B    "($ (y. yx " "
 .r   