
    h2                     z   S SK r \ R                  R                  S5      S 5       r\ R                  R                  S5      S 5       r\ R                  R                  S5      \ R                  R                  SSS	/5      S
 5       5       r\ R                  R                  S5      \ R                  R                  SSS/5      S 5       5       r\ R                  R                  S5      \ R                  R                  SSS/5      S 5       5       r\ R                  R                  S5      \ R                  R                  S/ SQ5      S 5       5       r	\ R                  R                  S5      \ R                  R                  S/ SQ5      S 5       5       r
\ R                  R                  S5      \ R                  R                  SSS/5      S 5       5       r\ R                  R                  S5      \ R                  R                  SSS/5      S  5       5       r\ R                  R                  S!5      \ R                  R                  SS"S#/5      S$ 5       5       r\ R                  R                  S%5      \ R                  R                  SS&/5      S' 5       5       r\ R                  R                  S(5      \ R                  R                  SS)/5      S* 5       5       r\ R                  R                  S+5      \ R                  R                   S, 5       5       r\ R                  R                  SS-S./5      \ R                  R                  S/5      S0 5       5       r\ R                  R                  S15      S2 5       r\ R                  R                  S35      S4 5       r\ R                  R                  S55      S6 5       r\ R                  R                  S7/ S8Q5      \ R                  R                  S95      S: 5       5       r\ R                  R                  S;5      \ R                  R                  SS<S=/5      S> 5       5       rg)?    Ni_  c                     U " S5      nUS   R                   S:X  d   e[        US   5      S:X  d   eUS   R                   S:X  d   eg )Nz   This is a cat.r         )idxlenen_tokenizerdocs     \/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/lang/en/test_tokenizer.pytest_issue351r      sK    
*
+Cq6::??s1v;!q6::??    ih  c                 6    U " S5      n[        U5      S:  d   eg)z!Test tokenization of big ellipsisz$45...............Asking   Nr   r	   tokenss     r   test_issue360r      s     45Fv;??r   i  ztext,number)7am7)z11p.m.11c                 `    U " U5      n[        U5      S:X  d   eUS   R                  U:X  d   eg)z\Test that times like "7am" are tokenized correctly and that numbers are
converted to string.r   r   Nr   text)r	   r   numberr   s       r   test_issue736r      s6    
 $Fv;!!9>>V###r   i  r   z3/4/2012z
01/12/1900c                 6    U " U5      n[        U5      S:X  d   eg)zTest that dates are not split and kept as one token. This behaviour is
currently inconsistent, since dates separated by hyphens are still split.
This will be hard to prevent without causing clashes with numeric ranges.r   Nr   r	   r   r   s      r   test_issue740r      s      $Fv;!r   i  zWe were scaredzWe Were Scaredc                 |    U " U5      n[        U5      S:X  d   eUS   R                  R                  5       S:X  d   eg)zmTest that 'were' and 'Were' are excluded from the contractions
generated by the English tokenizer exceptions.r   r   wereN)r   r   lowerr   s      r   test_issue744r"   '   s?    
 $Fv;!!9>>!V+++r   i  ztext,is_num))oneT)tenT)	tenelevenFc                 >    U " U5      nUS   R                   U:X  d   eg )Nr   )like_num)r	   r   is_numr   s       r   test_issue759r)   1   s&    
 $F!9'''r   i  )ShellshellShedshedc                 `    U " U5      n[        U5      S:X  d   eUS   R                  U:X  d   eg)zoTest that 'Shell' and 'shell' are excluded from the contractions
generated by the English tokenizer exceptions.r   r   Nr   r   s      r   test_issue775r/   :   s6    
 $Fv;!!9>>T!!!r   i  zThis is a string c                     U " U5      nSR                  U Vs/ s H  o3R                  PM     sn5      U:X  d   egs  snf )zGTest for Issue #792: Trailing whitespace is removed after tokenization. Njointext_with_wsr	   r   r
   tokens       r   test_issue792r7   D   ;     t
C77C8C5&&C89TAAA8   ;zThis is a stringzThis is a string
c                     U " U5      nSR                  U Vs/ s H  o3R                  PM     sn5      U:X  d   egs  snf )z6Test base case for Issue #792: Non-trailing whitespacer1   Nr2   r5   s       r   test_control_issue792r;   L   r8   r9   i[  zaaabbb@ccc.com
Thank you!zaaabbb@ccc.com 
Thank you!c                 8    U " U5      nUR                   U:X  d   eg)z5Test that no extra space is added in doc.text method.N)r   r	   r   r
   s      r   test_issue859r>   T   s      t
C88tr   iv  zDatum:2014-06-02
Dokument:76467c                     U " U5      nU HR  n[        UR                  5      [        UR                  5      :X  d   eXR                     UR                  S   :X  a  MR   e   g)zLTest that token.idx matches the original text index for texts with newlines.r   N)r   r   r4   r   r5   s       r   test_issue886r@   ^   sU     t
C5::#e&8&8"9999II%**Q-/// r   i{  z	want/needc                 `    U " U5      n[        U5      S:X  d   eUS   R                  S:X  d   eg)z(Test that / infixes are split correctly.r   r   /Nr   r   s      r   test_issue891rC   h   s6     $Fv;!!9>>S   r   i  c                     [         R                  " S5        S H9  nSn[        SS5       H  nX![        U5      -   -  nM     U " U5      nU(       a  M9   e   g)zTest that spaCy doesn't hang on many punctuation characters.
If this test hangs, check (new) regular expressions for conflicting greedy operators
pytest_timeout)	.,'":?!;-0r   d   N)pytestimportorskiprangestr)r	   punctstringir
   s        r   test_issue957rX   q   sU     ()>q#Ac!fn$F 6"s
s ?r   ztest@example.comzjohn.doe@example.co.uki  c                 b    U " U5      n[        U5      S:X  d   eUS   R                  (       a   eg)z6Test that doc doesn't identify email-addresses as URLsr   r   N)r   like_urlr=   s      r   test_issue1698r[      s1     t
Cs8q==1vr   i  c                 6    U " S5      n[        U5      S:X  d   eg)zDTest that "would've" is handled by the English tokenizer exceptions.zwould'ver   Nr   r   s     r   test_issue1758r]      s      *%Fv;!r   i  c                 f    U " S5      nUS   R                   S:X  a  US   R                  S:w  d   egg)zuTest that spaces don't receive a POS but no TAG. This is the root cause
of the serialization issue reported in #1773.
r   SPACEr1   N)pos_tag_r   s     r   test_issue1773rc      s9     t
C
1v{{g1v{{b    r   i  c                     U " S5      n[        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eg)	z2Test that hyphens are split correctly as prefixes.uA   —Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.   r   u   —   u   –	   Nr   )es_tokenizerr
   s     r   test_issue3277ri      sa     Z
[Cs8r>>q6;;("""q6;;("""q6;;("""r   word)zdon'tu   don’tzI'du   I’di  c                 @    U " U5      S   nUR                   (       d   eg )Nr   )is_stop)r	   rj   toks      r   test_issue3521rn      s     t
Q
C;;;r   i)  thesesthisrec                 6    U " U5      n[        U5      S:X  d   eg)zqTest that 'theses' and 'thisre' are excluded from the contractions
generated by the English tokenizer exceptions.r   Nr   r   s      r   test_issue10699rr      s     
 $Fv;!r   )rQ   markissuer   r   parametrizer   r   r"   r)   r/   r7   r;   r>   r@   rC   slowrX   r[   r]   rc   ri   rn   rr    r   r   <module>rx      sF    3  3  37G(HI$ J $ 3*l!;< =  3"24D!EF, G , 3G( (
 3!CD" E " 3"57O!PQB R B 3"46J!KLB M B 3
)+HI  3"D!EF0 G 0 3+/! 0 ! 3   "46N!OP4  Q 4  4! ! 4# # !EF4  G 5(H!56 7 r   