
    h                     |    S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	  \R                  S 5       rS rS rS rS	 rg)
    N)English)	Tokenizer)compile_infix_regexcompile_prefix_regexcompile_suffix_regexc           	      t   [        [        R                  R                  5      n[	        [        R                  R
                  5      n/ SQn[        U5      n[        R                  " S5      n[        U [        R                  R                  UR                  UR                  UR                  UR                  S9$ )N)z\.\.\.+z(?<=[0-9])-(?=[0-9])z[0-9]+(,[0-9]+)+u   [\[\]!&:,()\*—–\/-]a-b)token_match)r   r   Defaultsprefixesr   suffixesr   recompiler   tokenizer_exceptionssearchfinditermatch)en_vocab	prefix_re	suffix_recustom_infixesinfix_retoken_match_res         g/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/lang/en/test_customized_tokenizer.pycustom_en_tokenizerr   
   s    $W%5%5%>%>?I$W%5%5%>%>?IN #>2HZZ&N--"((     c                     SnU " U5       Vs/ s H  o"R                   PM     nnU/ SQ:X  d   eSnU " U5       Vs/ s H  o"R                   PM     nnU/ SQ:X  d   eg s  snf s  snf )Nz\The 8 and 10-county definitions are not used for the greater Southern California Megaregion.)The8and10-countydefinitionsarenotusedforthegreaterSouthern
California
Megaregion.z]The 8- and 10-county definitions are not used for the greater Southern California Megaregion.)r   r   r"   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   textr   sentencewordcontexts       r   ,test_en_customized_tokenizer_handles_infixesr5       s    mH%8%BC%BTyy%BGC    ( oH%8%BC%BTyy%BGC    / D, Ds
   AA!c                 h    SnU " U5       Vs/ s H  o"R                   PM     nnU/ SQ:X  d   eg s  snf )Nz\The 8 and 10-county definitions a-b not used for the greater Southern California Megaregion.)r   r   r    r!   r"   r#   r$   r	   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r1   s       r   0test_en_customized_tokenizer_handles_token_matchr7   O   sA    mH%8%BC%BTyy%BGC     D   /c                 h    SnU " U5       Vs/ s H  o"R                   PM     nnU/ SQ:X  d   eg s  snf )N_The 8 and 10-county definitions are not used for the greater Southern California Megaregion. :))r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   :)r/   r1   s       r   *test_en_customized_tokenizer_handles_rulesr<   g   sA    pH%8%BC%BTyy%BGC     Dr8   c                     SnU R                   nUS	 X l         U " U5       Vs/ s H  o3R                  PM     nnU/ SQ:X  d   eg s  snf )Nr:   r;   )r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   :))rulesr0   )r   r2   r@   r3   r4   s        r   3test_en_customized_tokenizer_handles_rules_propertyrA      sY    pH%%Ed %%8%BC%BTyy%BGC     Ds   A)r   pytestspacy.lang.enr   spacy.tokenizerr   
spacy.utilr   r   r   fixturer   r5   r7   r<   rA    r   r   <module>rH      sD    	  ! % V V  *,^02r   