
    h1                     $   S SK r S SKJr  S SKJr  S SKJrJrJrJ	r	J
r
  S SKrS SKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJrJr  SSK J!r!  SSKJ"r"J#r#J$r$  SSK%J&r&  SSK'J(r(  SSK)J*r*  SSK+J,r,  SSK-J.r.  SSK/J0r0  Sr1S7S\	\2   4S jjr3 " S S\"5      r4 " S S\5      r5 " S S \5      r6\6Ro                  S!S"S#/\S$S$S%S&0S'.S(S(SS).S*9S+\S,\S-\2S.\8S/\8S0\	\   4S1 j5       r9\" S2/ S3Q5      r:S8S4 jr;S5 r<S9S6 jr=S /r>g):    N)
namedtuple)Path)AnyCallableDictOptionalUnion)Model   )util)Errors)BaseDefaultsLanguage)Morphologizer)DEFAULT_MORPH_MODEL)Scorer)POS)DocMorphAnalysis)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )
STOP_WORDS)SYNTAX_ITERATORS)TAG_BIGRAM_MAP)TAG_MAP)TAG_ORTH_MAPzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null

split_modec                    ^  U 4S jnU$ )Nc                 ,   > [        U R                  TS9$ )Nr!   )JapaneseTokenizervocab)nlpr!   s    P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/lang/ja/__init__.pyjapanese_tokenizer_factory4create_tokenizer.<locals>.japanese_tokenizer_factory$   s     zBB     )r!   r)   s   ` r(   create_tokenizerr-   #   s    C &%r+   c                       \ rS rSrSS\S\\   SS4S jjrS rS\S\	4S	 jr
SS
\4S jjrS rS rS\\\4   4S jr0 4S\\\4   SS4S jjrS\4S jrS\SS 4S jrS\\\4   SS4S jrS\\\4   SS 4S jrSrg)r%   *   Nr&   r!   returnc                     Xl         X l        [        U R                  5      U l        US L =(       d    US:H  (       + U l        g )NA)r&   r!   try_sudachi_import	tokenizerneed_subtokens)selfr&   r!   s      r(   __init__JapaneseTokenizer.__init__+   s6    
$+DOO<#-#5#Js9JKr+   c                 >    [         U R                  U R                  44$ N)r%   r&   r!   r6   s    r(   
__reduce__JapaneseTokenizer.__reduce__2   s     4::t"???r+   textc           	         U R                   R                  U5      nU R                  U5      n[        X15      u  p4U(       a  [	        U6 O/ /S-  u  pVpxpn[        U5      n[        U R                  XTS9nS n[        [	        X5      5       GH  u  nu  nnUR                  Ul
        U(       a	  Xl        S nOB[        UR                  UR                  US-   [        U5      :  a  XnS-      OS 5      u  Ul        nUR                  (       a  UR                  OUR                   Ul        0 nUR$                  (       a  UR$                  US'   UR&                  Ul        UR*                  (       a%  [,        R.                  " SSUR*                  5      US'   [1        U R                  U5      Ul        GM"     U R4                  (       a  XR6                  S'   U$ )	N   )wordsspacesr   
Inflectionz[=|]_Reading
sub_tokens)r4   tokenize_get_dtokensget_dtokens_and_spacesziplistr   r&   	enumeratetagtag_posresolve_posorth_lenlemmasurfacelemma_infnormnorm_readingresubr   morphr5   	user_data)r6   r>   sudachipy_tokensdtokensrB   rA   tagsinflectionslemmasnormsreadingssub_tokens_listdocnext_posidxtokendtokenr\   s                     r(   __call__JapaneseTokenizer.__call__5   su   >>2248##$450? %CM2$( 	K[%? /$**E9$-c#.?$@ C%EJ$	&1KKJJ%(1Ws4y%8DqMd'#	8 ,2<<6<<V^^ELEzz&,jjl# ++EK~~ $&66&#v~~#Fi '

E:EK- %A. *9MM,'
r+   need_sub_tokensc                    U(       a  U R                  U5      OS n[        U5       VVVs/ s H  u  pE[        UR                  5       5      S:  d  M$  [	        UR                  5       SR                  UR                  5       S S  Vs/ s H  ofS:w  d  M
  UPM     sn5      SR                  UR                  5       SS   Vs/ s H  ofS:w  d  M
  UPM     sn5      UR                  5       UR                  5       UR                  5       U(       a  X4   OS 5      PM     nnnn[        U5       VVs/ s Hw  u  pHUS:X  dj  UR                  R                  5       (       aK  UR                  S:w  d;  XtS-
     R                  R                  5       (       a  XtS-
     R                  S:w  d  Mu  UPMy     snn$ s  snf s  snf s  snnnf s  snnf )Nr   -   *;   空白r   )_get_sub_tokensrL   rR   rT   DetailedTokenjoinpart_of_speechdictionary_formnormalized_formreading_formisspacerM   )	r6   r^   rm   re   rh   ri   xxr_   ts	            r(   rH   JapaneseTokenizer._get_dtokens]   s   6ED  !124 	 ((89
 :
5==?#a'
Mu';';'=bq'AO'A3Y"'AOPu';';'=ab'AO'A3Y"'AOP%%'%%'""$"  $
 : 	 
( $G,
,ax99$$&&uu 7#++3355Qw##x/ ,
 	
 PO	
$
sC   #F74F7	F-F-&F7>	F2F2AF7+A4F>#F>-
F7c                    U R                   (       d  g / nU GH)  nUR                  U R                  R                  R                  5      n[        U5      S:X  a  UR                  S 5        MU  U R                  S:X  a$  UR                  U R                  US5      /5        M  UR                  U R                  R                  R                  5      n[        U5      [        U5      :X  a&  U R                  US5      nUR                  Xf/5        M  UR                  U R                  US5      U R                  US5      /5        GM,     U$ )Nr   BF)
r5   splitr4   	SplitModer2   rR   appendr!   rH   r   )r6   r^   re   ri   sub_asub_br_   s          r(   rt   !JapaneseTokenizer._get_sub_tokens}   s
   ""%EKK 8 8 : :;E5zQ&&t,C'&&(9(9%(G'HIDNN$<$<$>$>?u:U+"//u=G#**G+=>#** --eU; --eU; &$ r+   c                 F    [        US5        [        R                  " U5      $ )NzJapaneseTokenizer.score)r   r   score_tokenization)r6   exampless     r(   scoreJapaneseTokenizer.score   s    ($=>((22r+   c                     SU R                   0$ Nr!   r$   r;   s    r(   _get_configJapaneseTokenizer._get_config   s    doo..r+   configc                 2    UR                  SS 5      U l        g r   )getr!   )r6   r   s     r(   _set_configJapaneseTokenizer._set_config   s     **\48r+   c                 B   ^  SU 4S j0n[         R                  " U/ 5      $ )Ncfgc                  L   > [         R                  " T R                  5       5      $ r:   )srsly
json_dumpsr   r;   s   r(   <lambda>,JapaneseTokenizer.to_bytes.<locals>.<lambda>   s    e&6&6t7G7G7I&Jr+   )r   to_bytes)r6   kwargsserializerss   `  r(   r   JapaneseTokenizer.to_bytes   s    JK}}["--r+   datac                 z   ^  SU 4S j0n[         R                  " X/ 5        [        T R                  5      T l        T $ )Nr   c                 N   > TR                  [        R                  " U 5      5      $ r:   )r   r   
json_loads)br6   s    r(   r   .JapaneseTokenizer.from_bytes.<locals>.<lambda>   s    $*:*:5;K;KA;N*Or+   )r   
from_bytesr3   r!   r4   )r6   r   r   deserializerss   `   r(   r   JapaneseTokenizer.from_bytes   s1     OPR0+DOO<r+   pathc                 p   ^  [         R                  " U5      nSU 4S j0n[         R                  " X/ 5        g )Nr   c                 N   > [         R                  " U TR                  5       5      $ r:   )r   
write_jsonr   pr6   s    r(   r   +JapaneseTokenizer.to_disk.<locals>.<lambda>   s    (8(8D<L<L<N(Or+   )r   ensure_pathto_diskr6   r   r   r   s   `   r(   r   JapaneseTokenizer.to_disk   s,    %OPT+r+   c                    ^  [         R                  " U5      nSU 4S j0n[         R                  " X/ 5        [        T R                  5      T l        T $ )Nr   c                 N   > TR                  [        R                  " U 5      5      $ r:   )r   r   	read_jsonr   s    r(   r   -JapaneseTokenizer.from_disk.<locals>.<lambda>   s    (8(89K(Lr+   )r   r   	from_diskr3   r!   r4   r   s   `   r(   r   JapaneseTokenizer.from_disk   sA    %LMt"-+DOO<r+   )r5   r!   r4   r&   r:   )T)__name__
__module____qualname____firstlineno__r   r   strr7   r<   r   rk   boolrH   rt   r   r   r   r   r   bytesr   r   r	   r   r   r   __static_attributes__r,   r+   r(   r%   r%   *   s    Le L# L$ L@&S &S &P
d 
@43/T#s(^ / 46 9$sCx. 9$ 9.E .u 3F ,E#t), ,4 ,
eCI. =P r+   r%   c                   8    \ rS rSr\" \5      r\r\	r
SSSS.rSrg)JapaneseDefaults   ltrF)	directionhas_casehas_lettersr,   N)r   r   r   r   r   DEFAULT_CONFIGr   r   
stop_wordsr   syntax_iteratorswriting_systemr   r,   r+   r(   r   r      s#    !.1FJ'#(eERNr+   r   c                       \ rS rSrSr\rSrg)Japanese   jar,   N)r   r   r   r   langr   Defaultsr   r,   r+   r(   r   r      s    DHr+   r   morphologizerztoken.morphz	token.posTz@scorerszspacy.morphologizer_scorer.v1)model	overwriteextendscorerg      ?)pos_accmorph_micro_fmorph_per_feat)assignsdefault_configdefault_score_weightsr'   r   namer   r   r   c           	      .    [        U R                  XX4US9$ )N)r   r   r   )r   r&   )r'   r   r   r   r   r   s         r(   make_morphologizerr      s    . 		5)6 r+   ru   )rT   rM   rV   rS   rW   rY   rF   c                     SSK JnJn  UR                  R                  R
                  UR                  R                  R
                  UR                  R                  R                  UR                  R                  R                  S.U    n UR                  5       R                  U S9nU$ ! [         a    [        S5      Sef = f)zSudachiPy is required for Japanese support, so check for it.
It it's not available blow up and explain how to fix it.
split_mode should be one of these values: "A", "B", "C", None->"A".r   )
dictionaryr4   )Nr2   r   C)modezJapanese support requires SudachiPy and SudachiDict-core (https://github.com/WorksApplications/SudachiPy). Install with `pip install sudachipy sudachidict_core` or install spaCy with `pip install spacy[ja]`.N)	sudachipyr   r4   	Tokenizerr   r2   r   r   
DictionarycreateImportError)r!   r   r4   toks       r(   r3   r3      s    3 %%//11$$..00$$..00$$..00	

 
 ##%,,*,=
 :

 	s   B(B+ +Cc                     U[         ;   a  [         U   nX;   a  X0   S4$ U(       a0  X4nU[        ;   a#  [        U   u  pVUc  [        U   [           U4$ XV4$ [        U   [           S4$ )a  If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function returns resolved POSs for both token
and next_token by tuple.
N)r    r   r   r   )orthrM   next_tagorth_map
tag_bigramcurrent_posrg   s          r(   rP   rP      s     l$>4'' ]
'$2:$>!K"CL% 
 #,, 3<d""r+   c                    U  Vs/ s H  o3R                   PM     nnSR                  SR                  U5      R                  5       5      SR                  UR                  5       5      :w  a&  [        [        R
                  R                  XS95      e/ n/ nSn[        U5      S:X  a  XV4$ [        U Vs/ s H  oR                  5       (       a  M  UPM     sn5      S:X  a-  UR                  5       (       d   e[        XSXS S 5      /nS/nXV4$ [        [        X@5      5       H  u  n	u  pUR                  5       (       a  M   XS  R                  U5      nUS:  a9  XX{-    nUR                  [        XSXS S 5      5        UR                  S5        X{-  nUR                  U
5        UR                  S5        U[        U5      -  nU	S-   [        U 5      :  d  M  X	S-      R                   S:X  d  M  SUS'   US-  nM     U[        U5      :  a3  XS  nUR                  [        XSXS S 5      5        UR                  S5        XV4$ s  snf s  snf ! [         a(    [        [        R
                  R                  XS95      S ef = f)	N )r>   rA   r   Fr    T)rT   rv   r   
ValueErrorr   E194formatrR   r{   ru   rL   rJ   indexr   )r_   r>   gap_tagxrA   text_dtokenstext_spacestext_poswordirj   
word_startws                r(   rI   rI      sS    '(1YYE(	wwrwwu~##%&"''$**,*??+++CDDLKH
5zQ((	u;utLLNdu;	<	A||~~~%dRT4PQg(( 's5':;>D<<>>	Si..t4J
 > 56Aa"aD$ OPu%"H 	F#5!CIq53w<GEN$:$:c$A"KOMH/ <4 #d)OM!b!dKL5!$$a ) <  	SV[[//T/GHdR	Ss   I 2II I

2I<r:   )r2   )rs   )?rZ   collectionsr   pathlibr   typingr   r   r   r   r	   r   	thinc.apir
   r   r   errorsr   languager   r   pipeliner   pipeline.morphologizerr   r   r   symbolsr   tokensr   r   trainingr   r   r   r   r&   r   r   r   r   r   tag_bigram_mapr   tag_mapr   tag_orth_mapr    r   r   r-   r%   r   r   factoryr   r   ru   r3   rP   rI   __all__r,   r+   r(   <module>r     sL   	 "  7 7     . % 9   ( ) B B  " . *  &&# &K K\S| S x  
 
K($>?	   
	

 
 	

 
 X

 X
0#D2%j ,r+   