
    h                     8   S SK r S SKJr  S SKJrJr  Sr\S    / SQ4/r\S    / SQ4/r\ R                  R                  S\5      S 5       r\ R                  R                  S	\5      S
 5       r\ R                  R                  S	\5      S 5       rS rS rS rS rg)    N)ConfigValidationError)Chinese_get_pkuseg_trie_data)ul   作为语言而言，为世界使用人数最多的语言，目前世界有五分之一人口做为母语。)   作为   语言   而言   ，   为   世界   使用u   人u	   数最多   的r   r	      目前r      有   五分之一   人口u   做r
      母语   。)r   r   r   r	   r
   r   r   u   人数u   最多r   r   r	   r   r   r   r   r   u   做为r   r   textc                 r    U " U5       Vs/ s H  o"R                   PM     nnU[        U5      :X  d   eg s  snf N)r   list)zh_tokenizer_charr   tokentokenss       \/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/lang/zh/test_tokenizer.pytest_zh_tokenizer_charr      s6    &7&=>&=Ujj&=F>T$Z ?s   4ztext,expected_tokensc                 ^    U " U5       Vs/ s H  o3R                   PM     nnXB:X  d   eg s  snf r   r   )zh_tokenizer_jiebar   expected_tokensr   r   s        r   test_zh_tokenizer_jiebar!      s1    &8&>?&>Ujj&>F?$$$ @   *c                 ^    U " U5       Vs/ s H  o3R                   PM     nnXB:X  d   eg s  snf r   r   )zh_tokenizer_pkusegr   r    r   r   s        r   test_zh_tokenizer_pkusegr%   #   s1    &9$&?@&?Ujj&?F@$$$ Ar"   c                 &   [        U R                  R                  R                  5      nU R	                  S/5        [        U R                  R                  R                  5      n[        U5      [        U5      S-
  :X  d   eU R	                  / SS9  [        U R                  R                  R                  5      n[        U5      S:X  d   e[        R                  " [        5         UR	                  S/5        S S S 5        g ! , (       d  f       g = f)Nnonsense_asdf   T)resetr   )	r   
pkuseg_segpreprocessertriepkuseg_update_user_dictlenpytestwarnsUserWarning)r$   r   	user_dictupdated_user_dictreset_user_dicts        r   "test_zh_tokenizer_pkuseg_user_dictr5   )   s    %&9&D&D&Q&Q&V&VWI//0AB-&&3388 y>S!23a7777 //$/?+&&3388O 1$$$ 
k	"11?2CD 
#	"	"s   &D
Dc                 >    U " S5      nUS   R                   S:X  d   eg )NzI   like cheese.r(   z  )orth_)r   r   s     r   test_zh_extra_spacesr8   =   s#    12F!9??d"""    c                      SSSS000n [         R                  " [        5         [        R                  " U 5        S S S 5        g ! , (       d  f       g = f)Nnlp	tokenizer	segmenterunk)r/   raisesr   r   from_config)configs    r   test_zh_unsupported_segmenterrB   C   s=    kK#789F	,	-F# 
.	-	-s   A
Ac                      SSSS000n [         R                  " U 5      nSUR                  l        [        R
                  " [        5         U" S5        S S S 5        g ! , (       d  f       g = f)Nr;   r<   r=   charpkusegtest)r   r@   r<   r=   r/   r?   
ValueError)rA   r;   s     r   test_zh_uninitialized_pkusegrH   I   sQ    kK#89:F


f
%C&CMM	z	"F 
#	"	"s   
	A
A*)r/   	thinc.apir   spacy.lang.zhr   r   TEXTSJIEBA_TOKENIZER_TESTSPKUSEG_TOKENIZER_TESTSmarkparametrizer   r!   r%   r5   r8   rB   rH    r9   r   <module>rQ      s     + 8 	z
1X	!"  1X	  '  ( 
 /1FG% H%
 /1GH% I%
E(#$r9   