
    hE                        S SK r S SKrS SKrS SKJrJr  S SKJr  S SKJ	r	  S SK
Jr  S SKJrJrJrJrJr  SSKJrJr  S	 r\R,                  R/                  S
5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       rS r\R,                  R9                  S/ SQ5      S 5       rS rg)    N)ENT_IOBENT_TYPE)English)	Tokenizer)Doc)compile_infix_regexcompile_prefix_regexcompile_suffix_regexget_lang_class
load_model   )assert_packed_msg_equalmake_tempdirc                 \    [        S5      " 5       R                  nUR                  U 5        U$ )Nen)r   	tokenizer
from_bytes)btoks     h/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/serialize/test_serialize_tokenizer.pyload_tokenizerr      s&    


 
*
*CNN1J    i  c                 P   [        U SS/S9n[        R                  " [        5         [        R
                  " US   5        SSS5        [        R                  " [        5         [        R
                  " USS 5        SSS5        g! , (       d  f       NK= f! , (       d  f       g= f)zATest that a custom error is raised if a token or span is pickled.Helloworld)wordsr   Nr   )r   pytestraisesNotImplementedErrorpickledumps)en_vocabdocs     r   test_issue2833r$      sn     hw0
1C	*	+SV 
,	*	+S1X 
,	+ 
,	+	+	+s   B#B
B
B%i  c                    / SQn/ SQn/ SQn/ SQn[        XX#US9nUR                  S5      (       d   eSnUS   R                  US   R                  US   R                  US   R
                  4U:X  d   e[        [        /nUR                  U5      nUR                  Xx5        US   R                  US   R                  US   R                  US   R
                  4U:X  d   eUR                  5       n	[        U 5      R                  U	5      n
U
S   R                  U
S   R                  U
S   R                  U
S   R
                  4U:X  d   eg	)
zeTest that the is_tagged attribute doesn't get overwritten when we from_array
without tag information.)Thisis10%.)DTVBZCDNNr*   )DETVERBNUMNOUNPUNCT)Or4   z	B-PERCENTz	I-PERCENTr4   )r   tagsposentsTAG)r(   r1   r-   PERCENTr   N)r   has_annotationtextpos_tag_	ent_type_r   r   to_array
from_arrayto_bytesr   )r"   r   r5   r6   r7   r#   expectedheader	ent_array	doc_bytesdoc2s              r   test_issue3012rG   %   s0    +E)D
1C4D
h$d
CCe$$$$-HFKKQc!fkk3q63C3CDPPPx FV$INN6%FKKQc!fkk3q63C3CDPPPIx=##I.DGLL$q',,Qd1g6G6GHHTTTr   i^  c                     S n Sn[        5       nU" U5      nU Vs/ s H  oDR                  PM     nnU " U5        U" U5      nU Vs/ s H  oDR                  PM     nn[        5        nUR                  U5        [	        U5      n	S S S 5        W	" U5      n
U
 Vs/ s H  oDR                  PM     nnX{:X  d   eU	R
                  R                  SL d   eg s  snf s  snf ! , (       d  f       N^= fs  snf )Nc           
      "   [        U R                  R                  5      n[        U R                  R                  5      n[        U R                  R                  5      n[        U R                  R                  5      R                  5        VVs0 s H!  u  pE[        U5      S:X  a  US   S:X  a  M  XE_M#     nnn[        U R                  UUR                  UR                  UR                  U R                  R                   SS9nXpl        g s  snnf )Nr      r*   F)prefix_searchsuffix_searchinfix_finditertoken_matchfaster_heuristics)r	   Defaultsprefixesr
   suffixesr   infixesdicttokenizer_exceptionsitemslenr   vocabsearchfinditerr   rN   )nlp	prefix_re	suffix_reinfix_rekv
exceptionsnew_tokenizers           r   customize_tokenizer+test_issue4190.<locals>.customize_tokenizer=   s    ()>)>?	()>)>?	&s||';';< S\\>>?EEG
GFaKAaDCK ADG 	 

 "II#**#**#,,11#
 &
s   D0DzTest c.F)r   r;   r   to_diskr   r   rO   )rc   test_stringnlp_1doc_1atoken	result_1adoc_1b	result_1b	model_dirnlp_2doc_2result_2s               r   test_issue4190rq   ;   s    &* KIE;F)/0I0;F)/0I0	9i 9% 
 +E(-.u

H.   ??,,555 1 1	
 /s   CC(CC.
C+c                     [        XR                  S9nUR                  5       n[        U 5      R                  U5        [	        S5      " 5       R
                  n[        R                  " S5      R                  Ul	        UR                  0 :w  d   eUR                  c   eUR                  c   eUR                  c   eUR                  c   eUR                  U5        UR                  0 :X  d   eUR                  b   eUR                  b   eUR                  b   eUR                  b   e[        U SSS0SS0/0S	9n0 Ul
        UR                  5       n[        U 5      R                  U5      nUR                  0 :X  d   eg)
zTest that custom tokenizer with not all functions defined or empty
properties can be serialized and deserialized correctly (see #2494,
#4991).)rL   r   testNzABC.ORTHABCr*   )rules)r   rL   rA   r   r   r   recompilematchrN   rv   	url_matchrK   rM   )r"   en_tokenizerr   tokenizer_bytestokenizer_reloadeds        r   test_serialize_custom_tokenizerr~   f   s    (2L2LMI((*Oh""?3 t$&00IJJv.44I??b     ,,,***""...##///)??b     (((&&&""***##+++(6VUOfc]3S*TUIIO((*O"8,77H##r)))r   r;   )u   I💜youu	   they’reu   “hello”c                 ~   U n[        UR                  5       5      n[        UR                  5       UR                  5       5        UR                  5       UR                  5       :X  d   eU" U5      nU" U5      nU Vs/ s H  ofR                  PM     snU Vs/ s H  ofR                  PM     sn:X  d   eg s  snf s  snf )N)r   rA   r   r;   )r{   r;   r   rb   doc1rF   ri   s          r   (test_serialize_tokenizer_roundtrip_bytesr      s    I"9#5#5#78MM224i6H6H6JK!!#y'9'9';;;;T?DD$()D5JJD)d-KdUjjd-KKKK)-Ks   <B5B:c                     U n[        5        nUS-  nUR                  U5        U R                  U5      nUR                  5       UR                  5       :X  d   e S S S 5        g ! , (       d  f       g = f)Nr   )r   re   	from_diskrA   )r{   r   d	file_pathtokenizer_ds        r   'test_serialize_tokenizer_roundtrip_diskr      s_    I	1O	)$",,Y7!!#{';';'====	 
s   AA##
A1)r    rw   r   spacy.attrsr   r   spacy.lang.enr   spacy.tokenizerr   spacy.tokensr   
spacy.utilr   r	   r
   r   r   utilr   r   r   markissuer$   rG   rq   r~   parametrizer   r    r   r   <module>r      s     	  ) ! %   9 4  4U U* 4'6 '6T*< !IJL KL>r   