
    h1                        S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	J
r
JrJr  S SKrSSKJr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJr  SSKJrJrJr  SSK J!r!  SSK"J#r#  SSK$J%r%  Sr&Sr' " S S\(\5      r)\)RT                  4S\)4S jjr+ " S S\5      r, " S S\5      r- " S S\5      r.S r/S\\(   S\\(   4S  jr0S"S! jr1S/r2g)#    N)Enum)Path)AnyCallableDictIterableListOptional   )util)ErrorsWarnings)BaseDefaultsLanguage)Scorer)Doc)Examplevalidate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzinstall spacy-pkuseg with `pip install "spacy-pkuseg>=0.0.27,<0.1.0"` or `conda install -c conda-forge "spacy-pkuseg>=0.0.27,<0.1.0"`z
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"

[initialize]

[initialize.tokenizer]
pkuseg_model = null
pkuseg_user_dict = "default"
c                   0    \ rS rSrSrSrSr\S 5       rSr	g)	Segmenter'   charjiebapkusegc                 H    [        U R                  R                  5       5      $ N)list__members__keys)clss    P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/lang/zh/__init__.pyvaluesSegmenter.values,   s    COO((*++     N)
__name__
__module____qualname____firstlineno__r   r    r!   classmethodr)   __static_attributes__r,   r+   r(   r   r   '   s#    DEF, ,r+   r   	segmenterc                    ^  U 4S jnU$ )Nc                 ,   > [        U R                  TS9$ )Nr3   )ChineseTokenizervocab)nlpr3   s    r(   chinese_tokenizer_factory;create_chinese_tokenizer.<locals>.chinese_tokenizer_factory2   s    		Y??r+   r,   )r3   r:   s   ` r(   create_chinese_tokenizerr<   1   s    @ %$r+   c                      \ rS rSr\R
                  4S\S\4S jjr SSSSS.S\\	/ \
\   4      S	\\   S
\\   S\\   4S jjjrS\S\4S jrSS\\   S\4S jjrS rS\\\4   4S jr0 4S\\\4   SS4S jjrS rS rS rS rSrg)r7   8   r8   r3   c                    Xl         [        U[        5      (       a  UR                  OUU l        S U l        S U l        U R                  [        R                  5       ;  av  [        R                  R                  SU R                  SR                  [        R                  5       5      SS9n[        R                  " U5        [        R                  U l        U R                  [        R                  :X  a  [!        5       U l        g g )NChinese, 'char' (character segmentation)langr3   	supporteddefault)r8   
isinstancer   valuer3   
pkuseg_seg	jieba_segr)   r   W103formatjoinwarningswarnr   r    try_jieba_import)selfr8   r3   warn_msgs       r(   __init__ChineseTokenizer.__init__9   s    
))Y??IOOY 	 >>!1!1!33}}++..))I$4$4$679	 , H MM(#&^^DN>>Y__,-/DN -r+   NrF   )r9   pkuseg_modelpkuseg_user_dictget_examplesr9   rU   rV   c                h    U R                   [        R                  :X  a  Uc  Un[        X4S9U l        g g )N)rU   rV   )r3   r   r!   try_pkuseg_importrI   )rQ   rW   r9   rU   rV   s        r(   
initializeChineseTokenizer.initializeL   s6     >>Y---'#/ /)DO .r+   textreturnc                    U R                   [        R                  :X  ah  [        U R                  R                  USS9 Vs/ s H  o"(       d  M  UPM     sn5      n[        R                  " X15      u  p4[        U R                  X4S9$ U R                   [        R                  :X  am  U R                  c  [        [        R                  5      eU R                  R                  U5      n[        R                  " X15      u  p4[        U R                  X4S9$ U R                   [        R                  :w  aa  [         R"                  R%                  SU R                   SR'                  [        R)                  5       5      SS9n[*        R,                  " U5        [        U5      n[        R                  " X15      u  p4[        U R                  X4S9$ s  snf )NFcut_all)wordsspacesr@   rA   rB   rC   )r3   r   r    r$   rJ   cutr   get_words_and_spacesr   r8   r!   rI   
ValueErrorr   E1000r   r   rK   rL   rM   r)   rN   rO   )rQ   r\   xra   rb   rR   s         r(   __call__ChineseTokenizer.__call__[   sX   >>Y__,T^^%7%7e%7%LR%LPQ!%LRSE"77DOUtzz>>^^y///& ..OO''-E"77DOUtzz>> >>Y^^+}}++..))I$4$4$679	 , H MM(# T
33E@4::U::/ Ss    
GGra   resetc                    U R                   [        R                  :X  aj  U(       a%   SS KnUR	                  S 5      U R
                  l        U H7  nU R
                  R                  R                  UR                  5       S5        M9     g [        R                  R                  SU R                   S9n[        R                  " U5        g ! [         a    S[        -   n[        U5      S ef = f)Nr   zEspacy_pkuseg not installed: unable to reset pkuseg user dict. Please  r!   )targetcurrent)r3   r   r!   spacy_pkusegPreprocesserrI   preprocesserImportError_PKUSEG_INSTALL_MSGinsertstripr   W104rL   rN   rO   )rQ   ra   rj   ro   msgwordrR   s          r(   pkuseg_update_user_dict(ChineseTokenizer.pkuseg_update_user_dictv   s    >>Y---	5'3?3L3LT3RDOO0 ,,33DJJL"E   }}++8T^^+THMM(# # 5-/BC  &c*45s   $C  C(c                 F    [        US5        [        R                  " U5      $ )NzChineseTokenizer.score)r   r   score_tokenization)rQ   exampless     r(   scoreChineseTokenizer.score   s    ($<=((22r+   c                     SU R                   0$ Nr3   r6   rQ   s    r(   _get_configChineseTokenizer._get_config   s    
 	
r+   configc                 N    UR                  S[        R                  5      U l        g r   )getr   r   r3   )rQ   r   s     r(   _set_configChineseTokenizer._set_config   s    K@r+   c                   ^ ^^^ SmSmS mT R                   (       Gan  [        R                  " 5        nT R                   R                  R	                  U5        T R                   R
                  R	                  U5        [        U5      n[        US-  S5       nUR                  5       mS S S 5        [        US-  S5       nUR                  5       mS S S 5        S S S 5        [        T R                   R                  R                  5      T R                   R                  R                  [        [        T R                   R                  R                   5      5      [        [        T R                   R                  R"                  5      5      4mU 4S jU4S jU4S jU4S jS	.n[$        R&                  " U/ 5      $ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f)
Nr+   features.msgpackrbweights.npzc                  L   > [         R                  " T R                  5       5      $ r#   )srsly
json_dumpsr   r   s   r(   <lambda>+ChineseTokenizer.to_bytes.<locals>.<lambda>   s    5++D,<,<,>?r+   c                     > T $ r#   r,   )pkuseg_features_bs   r(   r   r      s    '8r+   c                     > T $ r#   r,   )pkuseg_weights_bs   r(   r   r      s    &6r+   c                  0   > [         R                  " T 5      $ r#   )r   msgpack_dumps)pkuseg_processors_datas   r(   r   r      s    )<)<=S)Tr+   cfgpkuseg_featurespkuseg_weightspkuseg_processors)rI   tempfileTemporaryDirectoryfeature_extractorsavemodelr   openread_get_pkuseg_trie_datarq   triepostprocesser
do_processsortedr$   common_wordsother_wordsr   to_bytes)rQ   kwargstempdirfilehserializersr   r   r   s   `    @@@r(   r   ChineseTokenizer.to_bytes   s`   !%???,,.'1166w?%%**73w-'$66=(-

% >'M148E',zz|$ 9 / &doo&B&B&G&GH--88tDOO99FFGHtDOO99EEFG	&" @86!T	
 }}["--! >=88 /.s<   A%GF9(G?GG9
G	G
G	G
G,c                   ^ ^ SSS S.mU4S jnU4S jnU4S jnU 4S jUUUS.n[         R                  " X/ 5        TS   (       GaG  TS	   (       Ga<  [        R                  " 5        n[	        U5      n[        US
-  S5       nUR                  TS   5        S S S 5        [        US-  S5       nUR                  TS	   5        S S S 5         SS Kn	U	R                  [        U5      5      T l        S S S 5        TS   (       a  TS   n
U
u  ppW	R                  U5      T R                  l        UT R                  R                  l        [#        U5      T R                  R                  l        [#        U5      T R                  R                  l        T $ ! , (       d  f       N= f! , (       d  f       N= f! [         a    [        S[        -   5      S ef = f! , (       d  f       N= f)Nr+   )
features_b	weights_bprocessors_datac                    > U TS'   g )Nr   r,   bpkuseg_datas    r(   deserialize_pkuseg_features@ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_features   s    ()K%r+   c                    > U TS'   g )Nr   r,   r   s    r(   deserialize_pkuseg_weights?ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_weights   s    '(K$r+   c                 8   > [         R                  " U 5      TS'   g )Nr   )r   msgpack_loadsr   s    r(   deserialize_pkuseg_processorsBChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_processors   s    -2-@-@-CK)*r+   c                 N   > TR                  [        R                  " U 5      5      $ r#   )r   r   
json_loads)r   rQ   s    r(   r   -ChineseTokenizer.from_bytes.<locals>.<lambda>   s    T--e.>.>q.ABr+   r   r   r   r   wbr   r   /spacy-pkuseg not installed. To use this model, r   )r   
from_bytesr   r   r   r   writero   rr   rs   r!   strrI   rp   rq   r   r   setr   r   )rQ   datar   r   r   r   deserializersr   r   ro   r   	user_dictr   r   r   r   s   `              @r(   r   ChineseTokenizer.from_bytes   s   %(stT	*	)	D C:8!>	
 	R0|$$[)A)A,,.'w-'$66=KKL 9: >'M148EKKK 89 9 ' #/"5"5c'l"C / ,-"-.?"@ETB/;/H/H/S,;E--8=@=N--:<?<L--9) >=88 #  %I-.     /.sT   (GFG/F"GF3G
F	G"
F0	,G3GG
G"c                    ^ ^^ [         R                  " U5      nU 4S jmU 4S jmU 4S jU4S jU4S jS.n[         R                  " X/ 5      $ )Nc                   > TR                   (       ao  U R                  5       (       d  U R                  SS9  TR                   R                  R	                  U 5        TR                   R
                  R	                  U 5        g g )NT)parents)rI   existsmkdirr   r   r   )pathrQ   s    r(   save_pkuseg_model3ChineseTokenizer.to_disk.<locals>.save_pkuseg_model   sX    {{}}JJtJ,%%**401166t<	 r+   c                   > TR                   (       a  [        TR                   R                  R                  5      TR                   R                  R
                  [        [        TR                   R                  R                  5      5      [        [        TR                   R                  R                  5      5      4n[        R                  " X5        g g r#   )rI   r   rq   r   r   r   r   r$   r   r   r   write_msgpack)r   r   rQ   s     r(   save_pkuseg_processors8ChineseTokenizer.to_disk.<locals>.save_pkuseg_processors   s    )$//*F*F*K*KLOO11<<4 = = J JKL4 = = I IJK	 ##D/ r+   c                 N   > [         R                  " U TR                  5       5      $ r#   )r   
write_jsonr   prQ   s    r(   r   *ChineseTokenizer.to_disk.<locals>.<lambda>   s    U--a1A1A1CDr+   c                    > T" U 5      $ r#   r,   )r   r   s    r(   r   r          &7&:r+   c                    > T" U 5      $ r#   r,   )r   r   s    r(   r   r          +A!+Dr+   r   rU   r   )r   ensure_pathto_disk)rQ   r   r   r   r   r   s   `   @@r(   r   ChineseTokenizer.to_disk   sC    %	=	0 E:!D

 ||Dr22r+   c                    ^ ^^ [         R                  " U5      nU 4S jmU 4S jmU 4S jU4S jU4S jS.n[         R                  " X/ 5        g )Nc                    >  SS K nU R                  5       (       a  WR	                  U 5      Tl        g g ! [         a4    TR                  [        R                  :X  a  [        S[
        -   5      S e Njf = f)Nr   r   )ro   rr   r3   r   r!   rs   r   rI   )r   ro   rQ   s     r(   load_pkuseg_model5ChineseTokenizer.from_disk.<locals>.load_pkuseg_model   st     # {{}}"."5"5d";    >>Y%5%55%I-.    6 s   4 ;A21A2c                   >  SS K nTR                  [        R                  :X  a  [        R                  " U 5      nUu  p4pVWR                  U5      TR                  l
        UTR                  R                  l        [        U5      TR                  R                  l        [        U5      TR                  R                  l        g g ! [         a7    TR                  [        R                  :X  a  [        TR
                  5      S e Nf = f)Nr   )ro   rr   r3   r   r!   _pkuseg_install_msgr   read_msgpackrp   rI   rq   r   r   r   r   r   )r   ro   r   r   r   r   r   rQ   s          r(   load_pkuseg_processors:ChineseTokenizer.from_disk.<locals>.load_pkuseg_processors  s    J# ~~!1!11))$/EIB/;/H/H/S,;E--8=@=N--:<?<L--9 2  J>>Y%5%55%d&>&>?TI 6Js   C >DDc                 N   > TR                  [        R                  " U 5      5      $ r#   )r   r   	read_jsonr   s    r(   r   ,ChineseTokenizer.from_disk.<locals>.<lambda>  s    T--eooa.@Ar+   c                    > T" U 5      $ r#   r,   )r   r   s    r(   r   r     r   r+   c                    > T" U 5      $ r#   r,   )r   r   s    r(   r   r     r   r+   r   )r   r   	from_disk)rQ   r   r   r   r   r   s   `   @@r(   r   ChineseTokenizer.from_disk   sA    %
	<	M B:!D

 	t"-r+   )rJ   rI   r3   r8   r#   )F)r-   r.   r/   r0   r   r   r   rS   r
   r   r   r   r   r   rZ   r   rh   r	   boolry   r~   r   r   r   r   r   r   r   r   r2   r,   r+   r(   r7   r7   8   s    <ENN 0e 0	 0* CG #'&**3xHW,=(=>? h	
 sm #3-;S ;S ;6$T#Y $t $&3
T#s(^ 

 46 A$sCx. A$ A.6+Z36".r+   r7   c                   8    \ rS rSr\" \5      r\r\	r
SSSS.rSrg)ChineseDefaultsi  ltrF)	directionhas_casehas_lettersr,   N)r-   r.   r/   r0   r   DEFAULT_CONFIGr   r   lex_attr_gettersr   
stop_wordswriting_systemr2   r,   r+   r(   r   r     s#    !.1F J#(eERNr+   r   c                       \ rS rSrSr\rSrg)r@   i$  zhr,   N)r-   r.   r/   r0   rD   r   Defaultsr2   r,   r+   r(   r@   r@   $  s    DHr+   r@   c                  z     SS K n [        U R                  SSS95        U $ ! [         a    Sn[        U5      S ef = f)Nr   u   作为Fr_   znJieba not installed. To use jieba, install it with `pip  install jieba` or from https://github.com/fxsjy/jieba)r    r$   rc   rr   )r    rw   s     r(   rP   rP   )  sO    ) 	UYYxY/0 )E 	 #D()s   ! :rU   rV   c                      SS K n UR                  XS9$ ! [         a    S[        -   n[        U5      S ef = f! [         a$    S[        U =(       d    S5      -   n[	        U5      S ef = f)Nr   z+spacy-pkuseg not installed. To use pkuseg, )r   z"Unable to load pkuseg model from: rl   )ro   rr   rs   r!   FileNotFoundErrorr   )rU   rV   ro   rw   s       r(   rY   rY   9  s}    )
/""<"LL	  );>QQ#D()
  /2S9K5LL$$./s    9  6.A'c                     / n[        U R                  R                  5       5       H"  u  p4UR                  [	        XAU-   5      5        M$     U R
                  (       a  UR                  XR                  45        U$ r#   )r   childrenitemsextendr   iswordappendusertag)noder   r   c
child_nodes        r(   r   r   G  sZ    D 3 3 56)*Qh?@ 7{{T<<()Kr+   )rl   )3r   rN   enumr   pathlibr   typingr   r   r   r   r	   r
   r   rl   r   errorsr   r   languager   r   scorerr   tokensr   trainingr   r   r   r   r   r8   r   	lex_attrsr   r  r   rs   r  r   r   r   r<   r7   r   r@   rP   rY   r   __all__r,   r+   r(   <module>r     s        @ @   & .   2 B B    " b ,T , 5>NN %	 %b.~ b.JSl Sh 
) /HSM /Xc] / +r+   