
    h                        S SK JrJrJr  SSKJrJr  SSKJr  SSK	J
r
Jr  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  S
SKJr  S
SKJr  S
SKJr  S
SKJr  SrS r " S S\5      r  " S S\5      r! " S S\5      r"SS jr#S r$S/r%g)    )AnyDictIterator   )BaseDefaultsLanguage)Scorer)POSX)Doc)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)TOKENIZER_INFIXES)
STOP_WORDS)TAG_MAPzA
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
c                      S n U $ )Nc                 ,    [        U R                  5      $ NKoreanTokenizervocab)nlps    P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/lang/ko/__init__.pykorean_tokenizer_factory2create_tokenizer.<locals>.korean_tokenizer_factory   s    syy))     )r   s    r   create_tokenizerr#      s    * $#r!   c                   r    \ rS rSrS\4S jr\S 5       rS rS\	S\
4S jrS\	S\\\	\4      4S	 jrS
 rSrg)r      r   c                 <    Xl         [        5       U l        S U l        g r   )r   try_mecab_import_mecab_mecab_tokenizer)selfr   s     r   __init__KoreanTokenizer.__init__   s    
&( $r!   c                 `    U R                   c  U R                  S5      U l         U R                   $ )Nz-F%f[0],%f[7])r)   r(   r*   s    r   mecab_tokenizerKoreanTokenizer.mecab_tokenizer$   s-       ($(KK$@D!$$$r!   c                 (    [         U R                  44$ r   r   r.   s    r   
__reduce__KoreanTokenizer.__reduce__0   s    --r!   textreturnc           
         [        U R                  U5      5      nU Vs/ s H  o3S   PM	     nn[        U R                  U[        [	        X5      5      S9n[        XR5       Hk  u  pgUS   R                  S5      u  pn
Xl        UR                  [        ;   a   [        UR                     [           Ul
        O[        Ul
        US   Ul        Mm     U Vs/ s H  o3S   PM	     snUR                  S'   U$ s  snf s  snf )Nsurface)wordsspacestag+lemma	full_tags)listdetailed_tokensr   r   check_spaceszip	partitiontag_r   r
   posr   lemma_	user_data)r*   r4   dtokensdtsurfacesdoctokendtoken	first_tagsep	eomi_tagss              r   __call__KoreanTokenizer.__call__3   s    t++D12,34GbyMG4$**HT,t:V5WX .ME(.u(?(?(D%II"JzzW$#EJJ/4		!'?EL / ;B%B'Bi'%Bk"
 5 &Cs   C4C9c              #     #    U R                   R                  USS9 Hi  nUR                  5       (       a    g UR                  nUR                  nUR                  S5      u  pVnUR                  S5      u  pn	US:X  a  UnX8US.v   Mk     g 7f)NT)as_nodes,/*)r7   r<   r:   )r/   parseis_eosr7   featurerB   )
r*   r4   noder7   rY   r:   _exprr<   	remainders
             r   r?   KoreanTokenizer.detailed_tokensB   s      ((..td.CD{{}}llGllG",,S1LCD"&.."5Ei|%cBB Ds   B	Bc                 F    [        US5        [        R                  " U5      $ )NzKoreanTokenizer.score)r   r	   score_tokenization)r*   exampless     r   scoreKoreanTokenizer.scoreP   s    ($;<((22r!   )r(   r)   r   N)__name__
__module____qualname____firstlineno__r   r+   propertyr/   r2   strr   rP   r   r   r   r?   rb   __static_attributes__r"   r!   r   r   r      sb    %e %
 	% 	%.S S CC CHT#s(^,D C3r!   r   c                   <    \ rS rSr\" \5      r\r\	r
SSSS.r\rSrg)KoreanDefaultsU   ltrF)	directionhas_casehas_lettersr"   N)rd   re   rf   rg   r   DEFAULT_CONFIGconfigr   lex_attr_gettersr   
stop_wordswriting_systemr   infixesrj   r"   r!   r   rl   rl   U   s(    !.1F J#(eERNGr!   rl   c                       \ rS rSrSr\rSrg)Korean]   kor"   N)rd   re   rf   rg   langrl   Defaultsrj   r"   r!   r   ry   ry   ]   s    DHr!   ry   Nc                  H     SSK Jn   U $ ! [         a    [        S5      S ef = f)Nr   MeCabzThe Korean tokenizer ("spacy.ko.KoreanTokenizer") requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), and [natto-py](https://github.com/buruzaemon/natto-py))nattor   ImportErrorr   s    r   r'   r'   b   s6    
 E

 	s   
 !c              #      #    SnSnU H0  nU R                  XC5      nUS:  a  X%:g  v   U[        U5      -   nUnM2     US:  a  Sv   g g 7f)Nr   F)findlen)r4   tokensprev_endstartrK   idxs         r   r@   r@   p   s\     HEii%a</!U#  qy s   AA	)r5   N)&typingr   r   r   languager   r   scorerr	   symbolsr
   r   r   r   trainingr   utilr   r   r   r   r   	lex_attrsr   punctuationr   ru   r   tag_mapr   rr   r#   r   rl   ry   r'   r@   __all__r"   r!   r   <module>r      ss    & & .    ) B B    * " $43n 43n \  X 

 *r!   