
    h!                     v    S SK Jr  SSKJr  SSKJrJr  SSKJr  SSK	J
r
Jr  SSKJr   SS
 jrSS jrS rS rg	)    )Printer   )Errors)DocSpan)iob_to_biluo)get_lang_class
load_model   )tags_to_entitiesNc              +     #    [        US9nSnSU ;   a  U(       a  UR                  S5        SnXp;   a  U(       a  UR                  S5        SnSU ;   a#  Xp;  a  U(       a  [        Xa5        [        XU5      n SU ;  a  Xp;   a  U(       a  [	        U SSX6S	9n SU ;  a>  Xp;  a9  US:  a  U(       d  UR                  S
U S35        O[        Xa5        [	        XXsUS	9n SU ;  a  UR                  S5        Xp;  a  UR                  S5        U(       a  [        U5      nO[        S5      " 5       nU R                  5       R                  U5       GH  n	U	R                  5       n	U	(       d  M  / n
/ n/ n/ nU	R                  S5       GH=  nUR                  5       nU(       d  M  UR                  S5       Vs/ s H)  oR                  5       (       d  M  UR                  5       PM+     nn[        [        U Vs/ s H  oR                  5       PM     sn6 5      n[        U5      S:  a  [        [        R                  5      e[        US   5      nU
R                  US   5        UR                  S/S/US-
  -  -   5        UR                  [!        US   5      5        UR                  [        U5      S:  a  US   OS/U-  5        GM@     [#        UR$                  U
S9n['        U5       H  u  nnUU   Ul        UU   Ul        M     [-        U5      nU VVVs/ s H  u  nnn[/        UUUS-   US9PM     snnnUl        Uv   GM     gs  snf s  snf s  snnnf 7f)a  
Convert files in the CoNLL-2003 NER format and similar
whitespace-separated columns into Doc objects.

The first column is the tokens, the final column is the IOB tags. If an
additional second column is present, the second column is the tags.

Sentences are separated with whitespace and documents can be separated
using the line "-DOCSTART- -X- O O".

Sample format:

-DOCSTART- -X- O O

I O
like O
London B-GPE
and O
New B-GPE
York I-GPE
City I-GPE
. O

)no_printz-DOCSTART- -X- O O

zNSentence boundaries found, automatic sentence segmentation with `-s` disabled.FzNDocument delimiters found, automatic document segmentation with `-n` disabled.r    )modelmsgz4No sentence boundaries found to use with option `-n zD`. Use `-s` to automatically segment sentences or `-n 0` to disable.zJNo sentence boundaries found. Use `-s` to automatically segment sentences.zWNo document delimiters found. Use `-n` to automatically group sentences into documents.xx
r   T   -words)startendlabelN)r   warnn_sents_infosegment_docssegment_sents_and_docsr
   r	   stripsplitlistziplen
ValueErrorr   E903extendr   r   vocab	enumeratetag_is_sent_startr   r   ents)
input_datan_sents	seg_sentsr   r   kwargsr   doc_delimiternlp	conll_docr   sent_startspos_tags
biluo_tags
conll_sentlinelinescolslengthdocitokenentitiesLses                             e/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/converters/conll_ner_to_docs.pyconll_ner_to_docsrE   
   s    6 8
$C(M		
 	"w	
  ?GS"!*}E
ZM$?I+J2UT
 ZM$CQ;yHHFwi P  &/]SJ Z	
 &(	

 T"$%%'--m<	OO%	
#//&1J#))+J.8.>.>t.DU.Dd

\TZZ\.DEUu=utjjlu=>?D4y1} --a\FLLa!v6A:(>>?l4845OOs4y1}DG3%&.I 2 #))5)!#HAu!!EJ"-a.E ' $J/JRS(wq!QDA1q5:(S	9 = V= Ts1   F+M-L>	L>M-MDMM
*%Mc                 h   S nU(       a}  [        U5      nSUR                  ;   ab  UR                  SU S35        UR                   H,  u  pxS[	        US/ 5      ;   d  M  UR                  USS/5        M.     UR                  S5      nU(       d2  UR                  S5        [        S5      " 5       nUR                  S5      nU R                  5       R                  S	5      n	U	 V
s/ s H#  oR                  5       R                  5       S
   PM%     nn
[        WR                  US9nU" U5        / nS
n[        U5       H`  u  nnUR                  (       a6  U(       a  X-  S
:X  a  UR                  U5        UR                  S5        US-  nUR                  X   5        Mb     S	R!                  U5      $ s  sn
f )Nparserz-Segmenting sentences with parser from model 'z'.listening_componentszmodel.tok2veczhSegmenting sentences with sentencizer. (Use `-b model` for improved parser-based sentence segmentation.)r   sentencizerr   r   r   r   r   )r
   
pipe_namesinfopipelinegetattrreplace_listenersget_piper	   create_piper!   r"   r   r)   r*   r,   appendjoin)r=   r/   r2   r   r   rI   r3   nameprocr:   r9   r   nlpdoclines_with_segs
sent_countr>   r?   s                    rD   r    r    y   s   Ks~~%HHDUG2NO!ll
wt-CRHH))$?:KL + ,,x0K<	
 T"$oom4IIKd#E167ZZ\!!$E7%(FOJf%5:/14&&}5""2&!OJux( & 99_%% 8s   #*F/c                     SnU R                  U5      n[        S[        U5      U5       Vs/ s H	  oTXUU-    PM     nnSn U H  nXU-   -  n XR                  U5      -  n M     U $ s  snf )Nr   r   r   )r"   ranger%   rR   )r.   r/   r2   sent_delimitersentsr>   docsr=   s           rD   r   r      s~    N^,E,1!SZ,IJ,Iq!'k",IDJJ}44
))#..
   Ks   A%c                 ^    U R                  SU S35        US:X  a  U R                  S5        g g )NzGrouping every z sentences into a document.r   z^To generate better training data, you may want to group sentences into documents with `-n 10`.)rK   r   )r   r/   s     rD   r   r      s5    HHwi'BCD!|5	
     )
   FNF)NN)wasabir   errorsr   tokensr   r   trainingr   utilr	   r
   r   r   rE   r    r   r    r^   rD   <module>rf      s5       $ .  CHl^&B
r^   