
    hH                     
   S SK r S SKrS SKrS SKJr  S SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJrJrJrJr  S S	KJr  \R,                  R/                  S
5      S 5       r\R,                  R/                  S5      \R,                  R3                  SS9\R,                  R5                  SS/ SQ4S/ SQ4S/ SQ4S/ SQ4S/ SQ4S/ SQ4S/ SQ4/5      S 5       5       5       r\R,                  R/                  S5      S  5       r\R,                  R/                  S!5      S" 5       r\R,                  R3                  S#S9\R,                  R/                  S$5      S% 5       5       r\R,                  R/                  S&5      S' 5       r\R,                  R/                  S(5      S) 5       r \R,                  R/                  S*5      S+ 5       r!\R,                  R/                  S,5      S- 5       r"\R,                  R/                  S.5      S/ 5       r#\R,                  R3                  S0S9\R,                  R/                  S15      S2 5       5       r$\R,                  R/                  S35      S4 5       r%\R,                  R5                  S5S6S7/5      \R,                  R/                  S85      S9 5       5       r&\R,                  R/                  S:5      S; 5       r'\R,                  R/                  S<5      S= 5       r(\R,                  R/                  S>5      S? 5       r)\R,                  R3                  S@S9\R,                  R/                  SA5      SB 5       5       r*\R,                  R5                  SCSD/ SEQ4SFSF/4/5      SG 5       r+SH r,\R,                  R5                  S5SI/5      SJ 5       r-SK r.SL r/SM r0\R,                  R5                  S5/ SNQ5      SO 5       r1\R,                  R5                  S5SP/5      SQ 5       r2\R,                  R5                  S5/ SRQ5      SS 5       r3ST r4\R,                  R5                  SUSV/5      SW 5       r5SX r6\R,                  R5                  SSISYSZ0SYS[0/4/5      S\ 5       r7\R,                  R5                  SSISYSZ0SYS]0/4SISZS^S_.SYS[0/4/5      S` 5       r8\R,                  R5                  SSISZSaSb.SYS[0/4/5      Sc 5       r9Sd r:Se r;Sf r<Sg r=Sh r>Si r?Sj r@Sk rASl rB\R,                  R/                  Sm5      Sn 5       rCSo rDg)p    N)German)English)ORTH)	Tokenizer)Doc)Example)compile_infix_regexcompile_prefix_regexcompile_suffix_regexensure_path)Vocabi  c                  |    [        [        5       SS/5      n U S   n[        U/5      n[        U5      nUS   UL d   eg )Nhelloworldr   )r   r   setlist)doctokensitemss       ^/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/tokenizer/test_tokenizer.pytest_issue743r      sD    
eg)
*CFEUGAGE8u    i!  zECan not be fixed unless with variable-width lookbehinds, cf. PR #3218)reasonztext,tokensz"deserve,"--and)"deservez,"--andzexception;--exclusive)	exceptionz;--	exclusivezday.--Is)dayz.--Iszrefinement:--just)
refinementz:--justzmemories?--To)memoriesz?--TozUseful.=--Therefore)Useful.=--	Thereforez=Hope.=--Pandora)=Hoper'   Pandorac                     U " U5      n[        U5      [        U5      :X  d   eU Vs/ s H  oDR                  PM     snU:X  d   egs  snf )z;Test that special characters + hyphens are split correctly.Nlentext)en_tokenizerr/   tokensr   ts        r   test_issue801r3      sF    $ t
Cs8s6{""" CqFFC F*** s   Ai%  c                     Sn [        5       R                  nU" U 5      nSU Vs/ s H  o3R                  PM     sn;   d   eSU Vs/ s H  o3R                  PM     sn;  d   eUR                  S[        S0/5        U" U 5      nSU Vs/ s H  o3R                  PM     sn;   d   eSU Vs/ s H  o3R                  PM     sn;  d   e[        5       R                  nUR                  S[        S0/5        U" U 5      nSU Vs/ s H  o3R                  PM     sn;   d   eSU Vs/ s H  o3R                  PM     sn;  d   egs  snf s  snf s  snf s  snf s  snf s  snf )z>Test special-case works after tokenizing. Was caching problem.zTI like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.MATH_MATH_N)r   	tokenizerr/   add_special_caser   )r/   r7   r   ws       r   test_issue1061r:   5   sC    bD	##I
D/Cc*cffc****C0CqFFC0000x4*:);<
D/C,1,,,,#.#Q&&#.... 	##Ix4*:);<
D/C,1,,,,#.#Q&&#.... +0 -. -.s#   EE	E+EE$Ei  c                 4   U " S5      n[         R                  " [        U5      S4SS9Ul        UR	                  5        nUR                  USS 5        SSS5        [        U5      S:X  d   eUR                  R                  S	:X  d   eg! , (       d  f       N<= f)
z(Test that doc.merge() resizes doc.tensorza b c d   f)dtyper      N   )r@   r<   )numpyonesr.   tensor
retokenizemergeshape)r0   r   retokenizers      r   test_issue1963rH   K   s~     y
!CSXsO37CJ		[#a(# 
s8q==::x''' 
	s   B		
BzICan not be fixed without variable-width look-behind (which we don't want)i  c                      [        5       n SnU " U5      n[        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS	   R                  S
:X  d   eUS   R                  S:X  d   eg)z@Test that g is not split of if preceded by a number and a letterz
e2g 2g 52g   r   e2g   2r?   gr@   52   N)r   r.   r/   )nlp	testwordsr   s      r   test_issue1235rS   V   s     )CI
i.Cs8q==q6;;%q6;;#q6;;#q6;;$q6;;#r   i  c                      [        5       n U " S5      n[        U5      S:X  d   e[        U R                  SS/5      5      n[        US   5      S:X  d   e[        US   5      S:X  d   eg )N r   r   rL   )r   r.   r   pipe)rQ   r   docss      r   test_issue1242rX   g   se    
)C
b'Cs8q=="g'(DtAw<1tAw<1r   i  c                      [        [        5       / SQS9n [        [        5       / SQS9nU S   US   :w  d   eU S   US   :X  a   eg)z#Test that tokens compare correctly.abcwords)r[   r]   er   N)r   r   )doc1doc2s     r   test_issue1257rc   q   sO     uwo.Duwo.D7d1gAw$q'!!!!r   i_  c                     [        [        5       / SQS9n [        R                  " [        5         U S   R                  S5      (       d   e SSS5        U S   R                  S5      R                  S:X  d   e[        R                  " [        5         U S   R                  S5      (       d   e SSS5        U S   R                  S5      R                  S	:X  d   eg! , (       d  f       N= f! , (       d  f       ND= f)
zBTest that token.nbor() raises IndexError for out-of-bounds access.)01rM   r^   r   NrL   re   r?   rM   )r   r   pytestraises
IndexErrornborr/   )r   s    r   test_issue1375rl   z   s     eg_
-C	z	"1v{{2 
#q6;;r?3&&&	z	"1v{{1~~~ 
#q6;;q>#%%% 
#	" 
#	"s   CC-
C*-
C;i  c                  D  ^^^^ [         R                  " S5      m[         R                  " S5      m[         R                  " S5      m[         R                  " S5      mUUUU4S jn [        5       nU " U5      Ul        U" S5      nU H  nUR                  (       a  M   e   g)zBTest that tokenizer can parse DOT inside non-whitespace separatorsz[\[\("']z[\]\)"']z[-~\.]z
^https?://c           	         > [        U R                  0 TR                  TR                  TR                  TR                  S9$ )N)prefix_searchsuffix_searchinfix_finditertoken_match)r   vocabsearchfinditermatch)rQ   infix_re	prefix_resimple_url_re	suffix_res    r   my_tokenizer$test_issue1488.<locals>.my_tokenizer   s>    II#**#**#,,%++
 	
r   zThis is a test.Nrecompiler   r7   r/   )r{   rQ   r   r   rw   rx   ry   rz   s       @@@@r   test_issue1488r      s     

?+I

?+Izz-(HJJ01M
 
 )C %CM

 Czzzz r   i  c                     ^ [         R                  " S5      mS/ SQ4SSS/4S/ SQ4/n U4S	 jn[        5       nU" U5      Ul        U  H-  u  p4U" U5       Vs/ s H  oUR                  PM     snU:X  a  M-   e   g
s  snf )z&Test if infix_finditer works correctlyz[^a-z]ztoken 123test)r   rf   rM   3testztoken 1testr   1testzhello...test)r   .r   r   r   c                 B   > [        U R                  0 TR                  S9$ )N)rq   )r   rs   ru   )rQ   rw   s    r   new_tokenizer%test_issue1494.<locals>.new_tokenizer   s    Bx7H7HIIr   Nr}   )
test_casesr   rQ   r/   expectedr   rw   s         @r   test_issue1494r      s     zz-(H	:;	'*+	9:JJ )C!#&CM$(+D	2	u

	2h>>> %2s   A:zJCan not be fixed without iterative looping between prefix/suffix and infixi  c                  J    [        5       n U " S5      n[        U5      S:X  d   eg)zJTest that checks that a dot followed by a quote is handled
appropriately.
z.First sentence."A quoted sentence" he said ...   N)r   r.   rQ   r   s     r   test_issue2070r      s&     )C
>
?Cs8r>>r   in  c                    U " S5      n[        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS	   R                  S
:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eg)z\Test that the tokenizer correctly splits tokens separated by a slash (/)
ending in a digit.
z"Learn html5/css3/javascript/jquery   r   LearnrL   html5r?   /r@   css3rP   rJ   
javascript      jqueryNr-   )fr_tokenizerr   s     r   test_issue2926r      s    
 ;
<Cs8q==q6;;'!!!q6;;'!!!q6;;#q6;;&   q6;;#q6;;,&&&q6;;#q6;;("""r   r/   u  ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of VolumeOver Registration of VolumeUnder Registration of VolumeOver Registration of VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of VolumeOver Registration of VolumeUnder Registration of VolumeOver Registration of VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of VolumeOver Registration of VolumeUnder Registration of VolumeOver Registration of VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volumezoow.jspsearch.eventoracleopenworldsearch.technologyoraclesolarissearch.technologystoragesearch.technologylinuxsearch.technologyserverssearch.technologyvirtualizationsearch.technologyengineeredsystemspcodewwmkmppscem:iB
  c                 &    U " U5      nU(       d   eg)zDCheck that sentence doesn't cause an infinite loop in the tokenizer.N )r0   r/   r   s      r   test_issue2626_2835r      s     t
CJ3r   i`
  c                    U " S5      n[        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS	   R                  S
:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eg)zXTest that tokenizer correctly splits off punctuation after numbers with
decimal points.
z&I went for 40.3, and got home by 10.0.r   r   IrL   wentr?   forr@   z40.3rP   ,rJ   r   r   gotr   homer   by	   z10.0
   r   Nr-   )r0   r   s     r   test_issue2656r      s   
 ?
@Cs8r>>q6;;#q6;;&   q6;;%q6;;&   q6;;#q6;;%q6;;%q6;;&   q6;;$q6;;&   r7<<3r   i
  c                 x    U " S5      nUS   R                   S:X  d   eU " S5      nUS   R                   S:X  d   eg)zFTest that words like 'a' and 'a.m.' don't get exceptional norm values.r[   r   amN)norm_)r0   r[   r   s      r   test_issue2754r      sD     	SAQ4::	d	Ba5;;$r   i  c                  J    [        5       n U " S5      n[        U5      S:X  d   eg)z;Test that the tokenizer doesn't hang on a long list of dotszW880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange ZahlrJ   N)r   r.   r   s     r   test_issue3002r      s*     (C
aC s8q==r   z;default suffix rules avoid one upper-case letter before dotiy  c                      [        5       n U R                  S5        SnSnSnU " U5      nU " U5      nU " U5      nUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eg )Nsentencizerz>He gave the ball to I. Do you want to go to the movies with I?z?He gave the ball to I.  Do you want to go to the movies with I?z>He gave the ball to I.
Do you want to go to the movies with I?rJ   r   )r   add_piper/   )rQ   text1text2text3t1t2t3s          r   test_issue3449r     s     )CLLLEMEME	UB	UB	UBa5::a5::a5::r   z
text,wordszA'B C)A'BCzA-Bc                 F    U " U5      n[         R                  " USU05        g )Nr_   )r   	from_dict)r0   r/   r_   r   s       r   test_gold_misalignedr     s#     t
CcGU+,r   c                 6    U " S5      n[        U5      S:X  d   eg )NrU   r   r.   )r7   r1   s     r   test_tokenizer_handles_no_wordr      s    r]Fv;!r   loremc                 >    U " U5      nUS   R                   U:X  d   eg )Nr   r/   r7   r/   r1   s      r   "test_tokenizer_handles_single_wordr   %  s!    t_F!9>>T!!!r   c                     SnU " U5      n[        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:w  d   eg )	NzLorem, ipsum.rP   r   LoremrL   r   r?   ipsumr-   r   s      r   test_tokenizer_handles_punctr   +  sz    Dt_Fv;!!9>>W$$$!9>>S   !9>>W$$$!9>>W$$$r   c                 :    SnU " U5      n[        U5      S:X  d   eg )NzLorem, (ipsum).r   r   r   s      r   #test_tokenizer_handles_punct_bracesr   5  s"    Dt_Fv;!r   c                     SS/nSnU " U5      nUS   R                   U;  a<  [        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eg g )	NhubnzLorem ipsum: 1984.r   rJ   r   r@   1984)lang_r.   r/   )r7   
exceptionsr/   r1   s       r   test_tokenizer_handles_digitsr   ;  sl    JDt_Fayj(6{aay~~(((ay~~''' )r   )z
google.comz
python.orgzspacy.iozexplosion.aizhttp://www.google.comc                 6    U " U5      n[        U5      S:X  d   eg NrL   r   r   s      r   test_tokenizer_keep_urlsr   F  s    
 t_Fv;!r   zNASDAQ:GOOGc                 6    U " U5      n[        U5      S:X  d   eg )Nr@   r   r   s      r   test_tokenizer_colonsr   O  s    t_Fv;!r   )zhello123@example.comzhi+there@gmail.itzmatt@explosion.aic                 6    U " U5      n[        U5      S:X  d   eg r   r   r   s      r   test_tokenizer_keeps_emailr   U  s     t_Fv;!r   c                 :    SnU " U5      n[        U5      S:  d   eg )Na  Lorem ipsum dolor sit amet, consectetur adipiscing elit

Cras egestas orci non porttitor maximus.
Maecenas quis odio id dolor rhoncus dignissim. Curabitur sed velit at orci ultrices sagittis. Nulla commodo euismod arcu eget vulputate.

Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, non lacinia enim nibh eget ipsum. Vestibulum in bibendum mauris.

"Nullam porta fringilla enim, a dictum orci consequat in." Mauris nec malesuada justo.rJ   r   r   s      r    test_tokenizer_handles_long_textr   ]  s%    ZD t_Fv;??r   	file_namezsun.txtc                    [        [        5      R                  U-  nUR                  SSS9 nUR	                  5       nS S S 5        [        W5      S:w  d   eU " U5      n[        U5      S:  d   eg ! , (       d  f       N9= f)Nrutf8)encodingr   d   )r   __file__parentopenreadr.   )r7   r   locinfiler/   r1   s         r   $test_tokenizer_handle_text_from_filer   k  sm    
h

&
&
2C	#	'6{{} 
(t9>>t_Fv;	 
(	's   A11
A?c                     SnSnU " U5      nU " U5      nUS   R                   S:X  d   eUS   R                   S:X  d   eg )Nz2Lorem dolor sit amet, consectetur adipiscing elit.z8Lorem ipsum dolor sit amet, consectetur adipiscing elit.r   r   r   )r7   r   r   tokens1tokens2s        r   (test_tokenizer_suspected_freeing_stringsr   u  sL    @EFEGG1:??g%%%1:??g%%%r   orthloremc                     U R                  X5        U " U5      nUS   R                  US   S   :X  d   eUS   R                  US   S   :X  d   eg )Nr   r   rL   r8   r/   )r7   r/   r1   r   s       r   test_tokenizer_add_special_caser   ~  sW    t,
D/Cq6;;&)F++++q6;;&)F++++r   r~   r   )r   tagc                     [         R                  " [        5         U R                  X5        S S S 5        g ! , (       d  f       g = f)N)rh   ri   
ValueErrorr8   r   s      r   $test_tokenizer_validate_special_caser     s*     
z	"""40 
#	"	"s	   6
ALO)r   normc                 
   [        5       n[        U0 S S S 5      nUR                  X5        U" U 5      nUS   R                  US   S   :X  d   eUS   R                  US   S   :X  d   eUS   R                  US   S   :X  d   eg )Nr   r   r   rL   )r   r   r8   r/   r   )r/   r1   rs   r7   r   s        r   #test_tokenizer_add_special_case_tagr     s     GE%T46It,
D/Cq6;;&)F++++q6<<6!9V,,,,q6;;&)F++++r   c                     SnU R                  SSS0/5        U R                  SSS0/5        U " U5      nU Vs/ s H  o3R                  PM     sn/ SQ:X  d   eg s  snf )Nz(((_SPECIAL_ A/B, A/B-A/B")	_SPECIAL_r   A/B)(r  r  r   r   r   r   -r   r   )r   r7   r/   r   r   s       r   )test_tokenizer_special_cases_with_affixesr    sj    (D{fk-B,CDu&78
D/C$'(C5JJC( -   (s   Ac                      [        5       R                  n 0 U l        SnU R                  SSS0/5        U " U5      R                  U:X  d   eU R                  SSS0SS0/5        SnU " U5      R                  U:X  d   eg )Nz''a'' z''r   abr[   r\   zab ab ab ''ab ab'' ab'' ''ab)r   r7   rulesr8   r/   )r7   r/   s     r   8test_tokenizer_special_cases_with_affixes_preserve_spacyr	    s    	##IIO Dtvtn%56T?4''' tvsmfc]%CD)DT?4'''r   c                     SnU R                  SSS0/5        U " U5      nU Vs/ s H  o3R                  PM     snSS/:X  d   eg s  snf )Nz
_SPECIAL_.r   r   r   r   r  s       r   (test_tokenizer_special_cases_with_periodr    sO    D{fk-B,CD
D/C$'(C5JJC([#,>>>>(s   Ac                     SnU R                  SSS0SS0/5        U " U5      nUS   R                  S:X  d   eUS   R                  S	:X  d   eg )
Nz
the _ID'X_z_ID'X_r   _IDz'X_rL   rP   r?   r   )r8   idx)r7   r/   r   s      r    test_tokenizer_special_cases_idxr    sU    Dx65/FE?)KL
D/Cq6::??q6::??r   c                     U " S5       Vs/ s H  oR                   PM     sn/ SQ:X  d   eU R                  SSS0/5        U " S5       Vs/ s H  oR                   PM     snS/:X  d   eg s  snf s  snf )Nza b crZ   r   )r/   r8   )r7   r2   s     r   #test_tokenizer_special_cases_spacesr    sp    %g./.qFF./?BBBw&'):(;<%g./.qFF./G9<<< 0/s   A*
A/c                     [         R                  " S5      n[        U UR                  S9nU" S5       Vs/ s H  o3R                  PM     snSS/:X  d   eS Ul        U" S5       Vs/ s H  o3R                  PM     snS/:X  d   eg s  snf s  snf )N[\.]$)rp   za.r[   r   )r~   r   r   rt   r/   rp   )en_vocabrz   r7   r2   s       r   test_tokenizer_flush_cacher    s    

8$I&&I &dO,OqFFO,c
:::"I%dO,OqFFO,666 -,s   B&Bc                 2   [         R                  " S5      nSSS0/0n[        U UR                  US9nU" S5       Vs/ s H  oDR                  PM     snSS/:X  d   e0 Ul        U" S5       Vs/ s H  oDR                  PM     sn/ SQ:X  d   eg s  snf s  snf )Nr  za ar   )rp   r  za a.r   )r[   r[   r   )r~   r   r   rt   r/   r  )r  rz   r  
tokenizer1r2   s        r   test_tokenizer_flush_specialsr    s    

8$Ife_%&E&&J
 'v./.qFF./E3<???J&v./.qFF./?BBB 0/s   B.Bc                 :   S/nSS/n[        U5      n[        U5      n[        U UR                  UR                  S9nU" S5       Vs/ s H  ofR                  PM     nnU/ SQ:X  d   eUR                  S5       Vs/ s H  ofS   PM	     nnXx:X  d   eg s  snf s  snf )Nza(?=.)z	(?<=\w)\.z(?<=a)\d+\.)ro   rp   za10.)r[   10r   rL   )r
   r   r   rt   r/   explain)	r  prefixessuffixesrx   rz   r7   r2   r1   explain_tokenss	            r   /test_tokenizer_prefix_suffix_overlap_lookbehindr    s    zHn-H$X.I$X.I&&&&I
 (/0/ff/F0%%%%$-$5$5f$=>$=qd$=N>### 1>s   B;Bc                 8   S/nS/n[        U5      n[        U5      n[        U UR                  UR                  S9nU" S5       Vs/ s H  ofR
                  PM     nnUSS/:X  d   eUR                  S5       Vs/ s H  ofS   PM	     nnXx:X  d   eg s  snf s  snf )N   ±%)rq   rp   u   ±10%u   ±10rL   )r	   r   r   ru   rt   r/   r  )	r  infixesr  rw   rz   r7   r2   r1   r  s	            r   test_tokenizer_infix_prefixr$    s    fGuH"7+H$X.I((&&I
 (010ff0F1fc]"""$-$5$5g$>?$>qd$>N?### 2?s   B:Bif'  c                 .   SnSU l         U " U5      nSU Vs/ s H  o3R                  PM     sn;   d   eSU Vs/ s H  o3R                  PM     sn;   d   eSU l         U " U5      nSU Vs/ s H  o3R                  PM     sn;   d   egs  snf s  snf s  snf )z5Test special case works when part of infix substring.zNo--don't seeFzn'tdoTzdon'tN)faster_heuristicsr/   )r0   r/   r   r9   s       r   test_issue10086r(    s     D &+L"
t
CS)SVVS))))C(CqFFC(((( &*L"
t
Cs+s!vvs++++ *(
 ,s   BB*Bc                    [        U [        R                  " S5      R                  SSS0SS0/0S9nU" S5       Vs/ s H  o"R                  PM     nnUR                  S5       Vs/ s H  o"S   PM	     nnX4:X  d   eg s  snf s  snf )Nz^id$idr   id)rr   r  rL   )r   r~   r   rv   r/   r  )r  r7   r2   r1   r  s        r   +test_tokenizer_initial_special_case_explainr-  #  s    JJv&,,FC=63-0
I (o.offoF.$-$5$5d$;<$;qd$;N<### /<s   A>&B)Er~   rA   rh   spacy.lang.der   spacy.lang.enr   spacy.symbolsr   spacy.tokenizerr   spacy.tokensr   spacy.trainingr   
spacy.utilr	   r
   r   r   spacy.vocabr   markissuer   skipparametrizer3   r:   rH   rS   rX   rc   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r  r  r  r  r$  r(  r-  r   r   r   <module>r:     s   	     !  %  "   3  3R   	;<	 "CD	)*	;<	34	 ?@	=>+  + 4/ /* 4( ( V   4
 
 4  4" " 4& & 4 0 4? ?$ W   4	 	 4# #  
 	C 	c 4  4 & 4  4  VW4  X G12UUG4DE--

 '+" ,"
%( 
U	
 -1 2
 
N
 yk2 3& 'VTNVUO3T)U(VW, X, 	FD>FD>23	D-?@11
 Wd;fe_MNO,,(( ?=7
C$"$" 5, , 
$r   