
    h$(                         S SK r S SKJr  SSKJrJrJr  SSKJrJ	r	  SSK
Jr  SSKJr       SS	 jrS
 r    SS jrSS jr    SS jrS rg)    N)Printer   )DocSpanToken)biluo_tags_to_spansiob_to_biluo)Vocab   )n_sents_infoc              +   &  #    Sn[        US9n[        X5        [        U UUUUS9n	/ n
U	 HB  nU
R                  U5        [	        U
5      U-  S:X  d  M(  [
        R                  " U
5      v   / n
MD     U
(       a  [
        R                  " U
5      v   gg7f)a2  
Convert conllu files into JSON format for use with train cli.
append_morphology parameter enables appending morphology to tags, which is
useful for languages such as Spanish, where UD tags are not so rich.

Extract NER tags if available and convert them so that they follow
BILUO and the Wikipedia scheme
z%^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$)no_print)append_morphologyner_tag_patternner_mapmerge_subtokensr   N)r   r   read_conllxappendlenr   	from_docs)
input_datan_sentsr   r   r   r   _MISC_NER_PATTERNmsg	sent_docssent_docs_to_mergesent_docs               b/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/converters/conllu_to_docs.pyconllu_to_docsr       s     " ?
8
$C+('I !!(+!"W,1-- 233!#	 
 mm.// s   AB?Bc                    U R                  5       R                  S5       H  nUR                  5       R                  S5      nU(       d  M+  US   R                  S5      (       a,  UR                  S5        US   R                  S5      (       a  M,  U HT  nUR                  S5      nUu
  pgppppUR                  S5       H#  n[        R
                  " UU5      (       d  M!        g   MV     M     g)	z%
Check the MISC column for NER tags.



r   #	|TF)stripsplit
startswithpoprematch)r   r   sentlineslinepartsid_wordlemmapostagmorphheaddep_1misc	misc_parts                    r   has_nerr<   0   s       "((0

""4(5(%%c**		! (%%c**

4(INF5s4b!%CIxx;;# "1  1     c              #     #    [        5       n[        X5      nU R                  5       R                  S5       H  nUR                  5       R                  S5      nU(       d  M+  US   R	                  S5      (       a,  UR                  S5        US   R	                  S5      (       a  M,  [        UUUUUUUS9n	U	v   M     g7f)z!Yield docs, one for each sentencer"   r#   r   r$   )r   r   r   set_entsN)r
   r<   r'   r(   r)   r*   conllu_sentence_to_doc)
r   r   r   r   r   vocabr?   r-   r.   docs
             r   r   r   B   s      GEz3H  "((0

""4(5(%%c**		! (%%c**( /"3!C I 1s   AB?"AB?'B?c                    / nU  H;  nUR                  S5      nUu
  pgppppSU;   d  SU;   a  M*  UR                  U5        M=     / nU H  nSnUR                  S5       H  n[        R                  " UU5      nU(       d  M#  UR	                  S5      nUR	                  S5      nU(       a9  U(       a2  US-   U-   nU(       a#  UR                  UU5      nUS:X  a  SnOUS-   U-   n  O   UR                  U5        M     [        U5      $ )	at  Find entities in the MISC column according to the pattern and map to
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.

lines (str): CONLL-U lines for one sentences
tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
r%   -.Or&      r    )r(   r   r+   r,   groupgetr	   )r.   tag_patternr   miscsr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   iobiob_tagr;   	tag_matchprefixsuffixs                         r   get_entitiesrR   ]   s    E

4 AF>5s4b#:T  CCIi8Iy"+"+f$slV3G!(VV!<!R<&)G&,slV&;G ) 	

7! " r=   c                 
   [         R                  " S5      (       d  [         R                  " SSS9  [         R                  " S5      (       d  [         R                  " SSS9  [         R                  " S5      (       d  [         R                  " SSS9  [         R                  " S5      (       d  [         R                  " SSS9  / / / / / / 4u  pxpp/ / pSnSn[        [	        U5      5       GH  nUU   nUR                  S5      nUu
  nnnnnnnnnnS	U;   a  M/  S
U;   a  SnS
U;   a  SnUnUR                  S
5      u  nnSU;  n M\  U(       a  U(       a  UR                  U5        OUR                  U5        U(       a*  UW:X  a  UR                  W 5        O;UR                  S5        O)SU;   a  UR                  S5        OUR                  S5        U(       a
  UW:X  a  SnSn[        U5      S-
  nUS;  a  [        U5      S-
  OUnUS:X  a  UOUnUS:w  a  UOSnUS:w  a  UOSnUS:X  a  SOUnUR                  U5        U
R                  U5        U	R                  U5        UR                  U5        UR                  U5        UR                  U5        GM     [        U UUU	U
UUUUS9	n![        [	        U!5      5       H_  nUU   U!U   R                  l
        UU   U!U   R                  l        UU   U!U   R                  l        UU   U!U   R                  l        Ma     Sn"U(       a  [        XU5      n"[        U!U"5      U!l        U(       a  [#        UU!5      n!/ / / / / / 4u  pxpp/ / p[%        U!5       GHi  u  nn#UR                  U#R                  R                  5        UR                  U#R                  R                  5        UR                  U#R                  R                  5        UR                  U#R                  R                  5        U(       aQ  U#R                  R                  (       a6  U	R                  U#R&                  S-   U#R                  R                  -   5        OU	R                  U#R&                  5        U
R                  U#R(                  5        UR                  U#R*                  R,                  5        UR                  U#R.                  5        GMl     [        U UUU	UUU
UUS9	n$U(       aJ  U!R                    V%s/ s H-  n%[1        U$U%R2                  U%R4                  U%R6                  S9PM/     sn%U$l        U$$ s  sn%f )a4  Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.

lines (str): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
merged_orthrH   )defaultmerged_lemmamerged_morphmerged_spaceafterFr%   rE   rD   TzSpaceAfter=Nor   )0r   r   rootROOT)wordsspacestagsr4   depslemmasmorphsheadsN__)r\   r]   r^   ra   r`   r4   r_   rb   )label)r   has_extensionset_extensionranger   r(   r   intr   r   rT   rW   rV   rX   rR   r   entsmerge_conllu_subtokens	enumeratetag_pos_r7   idep_r   startendrd   )&rA   r.   r   r   r   r   r?   r\   r]   r^   posesra   r`   rb   r_   subtok_word	in_subtokrn   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   subtok_start
subtok_endsubtok_spaceafterrB   ri   tdoc_xents&                                         r   r@   r@      s   $ }--M26~..NB7~..NB7233/<13RRR1G.E4b4KI3u:Qx

4 AF>T5#sE4b$#:#:I#:IK'*yy~$L* /t ;yLL%LLj /0e$$MM% MM$
*KI#hl"&j"8D	AcCZcSCZcR#2vf3eSCeTCS V 
C 3s8_$QxA &q	A &q	A%+AYA"	 
 DEG<&sD1$UC0 24RRR1G.E4b4#1QSS__%acc&&'acc++,acc&&'!1!1KK(8(889KKQVVQVVXXAFF  
E HK
HPD		377#))<

 L	
s   4Uc                    / nU  GH;  nUR                  S5      nUu
  pVpxpppSU;   d  M%  UR                  S5      u  nnU[        U5      S-
  [        U5       nUR                  U5        / n0 n/ nU H  nUR                  UR                  5        UR                  UR                  5        UR
                  R                  (       d  MV  UR
                  R                  R                  S5       HW  nUR                  SS5      u  nnUU;  a  [        5       UU'   UR                  S5       H  nUU   R                  U5        M     MY     M     UR                  5        H)  u  nnUS-   SR                  [        U5      5      -   UU'   M+     U H  nUR                  UR
                  l        SR                  U5      UR
                  l        SR                  U5      Ul        SR                  [        UR                  5       5      5      UR
                  l        US	   R                   (       a  S
OSUR
                  l        M     GM>     UR%                  5        nU H  nUR'                  U5        M     S S S 5        U$ ! , (       d  f       U$ = f)Nr%   rD   r   r&   =, r   TF)r(   rh   r   rl   lemma_r   rW   setadditemsjoinsortedorth_rT   rV   valueswhitespace_rX   
retokenizemerge)r.   rB   subtok_spansr/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   ru   rv   subtok_spanr^   ra   r`   tokenfeaturefieldr   valueretokenizerspans                               r   rj   rj     s   L

4 AF>5s4b#:'*yy~$L*c,/!3c*oFK,DFF$EJJ'ell+77'''#(77#7#7#=#=c#B(/c1(=v .,/EF5M%+\\#%6E"5M--e4 &7	 $C	 % "(v %chhvf~.F Fu "0 %&+kk#'*xx'7$ XXd^
'*xxv}}0G'H$'O77DU ) %5 F 
	[ Dd# ! 
 J	 
	 Js   I::
J	)
   FNFF)FFrH   N)N)FFNF)r+   wasabir   tokensr   r   r   trainingr   r	   rA   r
   conll_ner_to_docsr   r    r<   r   rR   r@   rj    r=   r   <module>r      si    	  & & 9  +
 "0J( 6$V AH*r=   