
    h.                        S SK r S SKrS SKJr  S SKJrJrJrJrJ	r	J
r
Jr  S SKrSSKJr  SSKJrJr  SSKJrJr  SSKJr  S	S
KJr  S	SKJr  \(       a  SSKJr  Sr\R<                  R?                  S5         S(S\
\   S\ S\!S\!S\
\   S\S/\\   4   4S jj5       r"\R<                  R?                  S5         S)S\
\\#\4      S\!S\!S\!S\S/\\   4   4
S jj5       r$\R<                  R?                  S5      SS.S\S\ 4S jj5       r%\R<                  R?                  S5        S*S\
\   S\!S\!S\S/\\   4   4S  jj5       r&S\\#\4   S\	\   4S! jr' " S" S#5      r( " S$ S%5      r) " S& S'5      r*g)+    N)Path)TYPE_CHECKINGCallableIterableIteratorListOptionalUnion   )util)ErrorsWarnings)DocDocBin)Vocab   )dont_augment)Example)Languagez.spacyzspacy.Corpus.v1pathgold_preproc
max_lengthlimit	augmenterreturnr   c                     U c  [        [        R                  5      e[        R                  R                  SU 5        [        U UUUUS9$ )NzLoading corpus from path: %s)r   r   r   r   )
ValueErrorr   E913r   loggerdebugCorpus)r   r   r   r   r   s        O/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/corpus.pycreate_docbin_readerr#      sI     |%%KK4d;!     zspacy.JsonlCorpus.v1
min_lengthc                     [        XX#S9$ )N)r%   r   r   )JsonlCorpus)r   r%   r   r   s       r"   create_jsonl_readerr(   *   s     tzWWr$   zspacy.read_labels.v1F)requirer)   c                h    U(       d  U R                  5       (       d  g [        R                  " U 5      $ N)existssrsly	read_json)r   r)   s     r"   read_labelsr/   4   s#     4;;==??4  r$   zspacy.PlainTextCorpus.v1c                 N    U c  [        [        R                  5      e[        XUS9$ )  Iterate Example objects from a file or directory of plain text
UTF-8 files with one line per doc.

path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
    will be skipped. Defaults to 0, which indicates no limit.
max_length (int): Maximum document length (in tokens). Longer documents will
    be skipped. Defaults to 0, which indicates no limit.

DOCS: https://spacy.io/api/corpus#plaintextcorpus
r%   r   )r   r   r   PlainTextCorpus)r   r%   r   s      r"   create_plain_text_readerr4   =   s%    " |%%4:NNr$   c                    [         R                  " U 5      n U R                  5       (       d&  U R                  S   R	                  U5      (       a  U /$ U nU /n/ n[        5       nU H  n [        U 5      U;   a  M  UR                  [        U 5      5        U R                  (       a%  U R                  S   R                  S5      (       a  Md  U R                  5       (       a!  UR                  U R                  5       5        M  U R                  S   R	                  U5      (       d  M  UR                  U 5        M     [        U5      S:X  a1  [        R                  " [        R                   R#                  X!S95        UR%                  5         U$ )N.r   )r   format)r   ensure_pathis_dirpartsendswithsetstradd
startswithextenditerdirappendlenwarningswarnr   W090r8   sort)r   	file_type	orig_pathpathslocsseens         r"   walk_corpusrN   S   s   D!D;;==TZZ^44Y??vIFED5Dt9T::$**R.33C88[[]]LL(ZZ^$$Y//KK  4yA~hmm**	*LMIIKKr$   c                       \ rS rSrSrSSSSSS.S\\\4   S\S	\	S
\S\
\   S\	SS4S jjrSSS\\   4S jrSSS\S	\	S\4S jrSSS\\   S\\   4S jrSSS\\   S\\   4S jrS\S\\\\4      S\\   4S jrSrg)r!   l   a  Iterate Example objects from a file or directory of DocBin (.spacy)
formatted data files.

path (Path): The directory or filename to read from.
gold_preproc (bool): Whether to set up the Example object with gold-standard
    sentences and tokens for the predictions. Gold preprocessing helps
    the annotations align to the tokenization, and may result in sequences
    of more consistent length. However, it may reduce run-time accuracy due
    to train/test skew. Defaults to False.
max_length (int): Maximum document length. Longer documents will be
    split into sentences, if sentence boundaries are available. Defaults to
    0, which indicates no limit.
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
    Defaults to 0, which indicates no limit.
augment (Callable[Example, Iterable[Example]]): Optional data augmentation
    function, to extrapolate additional examples from your annotations.
shuffle (bool): Whether to shuffle the examples.

DOCS: https://spacy.io/api/corpus
r   FN)r   r   r   r   shuffler   r   r   r   r   rQ   r   c                    [         R                  " U5      U l        X0l        X@l        X l        Ub  UO[        U l        X`l        g r+   )	r   r9   r   r   r   r   r   r   rQ   )selfr   r   r   r   r   rQ   s          r"   __init__Corpus.__init__   s9     $$T*	($
&/&;r$   nlpr   c              #     #    U R                  UR                  [        U R                  [        5      5      nU R
                  (       a!  [        U5      n[        R
                  " U5        U R                  (       a  U R                  X5      nOU R                  X5      nU H  nU R                  X5       H  nUv   M	     M!     g7f)zYield examples from the data.

nlp (Language): The current nlp object.
YIELDS (Example): The examples.

DOCS: https://spacy.io/api/corpus#call
N)read_docbinvocabrN   r   	FILE_TYPErQ   listrandomr   make_examples_gold_preprocmake_examplesr   )rS   rV   ref_docsexamplesreal_egaugmented_egs         r"   __call__Corpus.__call__   s      ##CII{499i/PQ<<H~HNN8$66sEH))#8HG $s <"" !=  s   CC	referencec                 J   U(       d  UR                   (       a]  [        [        UR                  U Vs/ s H  oDR                  PM     snU Vs/ s H  n[        UR                  5      PM     snS9U5      $ [        UR                  UR                  5      U5      $ s  snf s  snf )Nwordsspaces)has_unknown_spacesr   r   rY   textboolwhitespace_make_doc)rS   rV   re   r   words        r"   _make_exampleCorpus._make_example   s     977II1:;99;?HIytD!1!12yI
   3<<	7CC <Is   BB reference_docsc              #     #    U H  n[        U5      S:X  a  M  U R                  S:X  d  [        U5      U R                  :  a  U R                  XS5      v   MS  UR                  S5      (       d  Mk  UR                   Ha  n[        U5      S:X  a  M  U R                  S:X  d  [        U5      U R                  :  d  M?  U R                  XR                  5       S5      v   Mc     M     g 7f)Nr   F
SENT_START)rD   r   rp   has_annotationsentsas_doc)rS   rV   rr   re   ref_sents        r"   r^   Corpus.make_examples   s      (I9~"A%Y$//)I((??)),77 )H8}) A-X1P"00oo6GOO	 !0 (s   A*C&0AC&;+C&c              #   $  #    U H  nUR                  S5      (       a*  UR                   Vs/ s H  oDR                  5       PM     nnOU/nU H5  nU R                  XS5      n[	        UR
                  5      (       d  M1  Uv   M7     M     g s  snf 7f)Nrt   T)ru   rv   rw   rp   rD   x)rS   rV   rr   re   sent	ref_sentsrx   egs           r"   r]   !Corpus.make_examples_gold_preproc   sx      (I''557@Gt[[]	G	&K	%''t<rtt99H & (Gs   *BB5B>BrY   rL   c              #     #    SnU H  n[         R                  " U5      nUR                  S   R                  [        5      (       d  MB  [        5       R                  U5      nUR                  U5      nU HB  n[        U5      (       d  M  Uv   US-  nU R                  S:  d  M0  X0R                  :  d  MA    M     M     g7f)z(Yield training examples as example dictsr   r6   r   N)
r   r9   r;   r<   rZ   r   	from_diskget_docsrD   r   )rS   rY   rL   ilocdoc_bindocsdocs           r"   rX   Corpus.read_docbin   s      C""3'Cyy}%%i00 (,,S1''.C3xx!	Q::?qJJ!   s   AC 	=C 
C %C 6
C )r   r   r   r   r   rQ   )__name__
__module____qualname____firstlineno____doc__r
   r>   r   intrl   r	   r   rT   r   r   rc   r   rp   r   r^   r]   r   rX   __static_attributes__ r$   r"   r!   r!   l   s0   2 "(,CI 	
   H%  
"#J #8G+< #*DD*-D=AD	DPP/7}P	'	P/7}	'	"""*5d+;"<"	#"r$   r!   c                   n    \ rS rSrSrSrSSSS.S\\\\	4      S\
S\
S	\
S
S4
S jjrSSS
\\   4S jrSrg)r'      a;  Iterate Example objects from a file or directory of jsonl
formatted raw text files.

path (Path): The directory or filename to read from.
min_length (int): Minimum document length (in tokens). Shorter documents
    will be skipped. Defaults to 0, which indicates no limit.

max_length (int): Maximum document length (in tokens). Longer documents will
    be skipped. Defaults to 0, which indicates no limit.
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
    Defaults to 0, which indicates no limit.

DOCS: https://spacy.io/api/corpus#jsonlcorpus
jsonlr   )r   r%   r   r   r   r%   r   r   Nc                ^    [         R                  " U5      U l        X0l        X@l        X l        g r+   )r   r9   r   r%   r   r   )rS   r   r   r%   r   s        r"   rT   JsonlCorpus.__init__   s%     $$T*	$$
r$   rV   r   c              #   ,  #    [        U R                  S5       H  n[        R                  " U5      nU H  nUR	                  US   5      nU R
                  S:  a  [        U5      U R
                  :  a  MB  U R                  S:  a  [        U5      U R                  :  a  Mm  U Vs/ s H  ofR                  PM     nnU Vs/ s H  n[        UR                  5      PM     nn[        U[        UR                  XxS95      v   M     M     gs  snf s  snf 7f)zYield examples from the data.

nlp (Language): The current nlp object.
YIELDS (Example): The example objects.

DOCS: https://spacy.io/api/corpus#jsonlcorpus-call
z.jsonlrk   r   rg   N)rN   r   r-   
read_jsonlrn   r%   rD   r   rk   rl   rm   r   r   rY   )	rS   rV   r   recordsrecordr   wrh   ri   s	            r"   rc   JsonlCorpus.__call__  s      tyy(3C&&s+G!ll6&>2??a'CHt,F__)c#h$//.I-01SVVSE1;>?3ad1==13F? "#s399E'QRR " 4 2?s   B#D%D
:D D5D)r   r   r%   r   r   r   r   r   r   rI   r	   r
   r>   r   r   rT   r   r   rc   r   r   r$   r"   r'   r'      sw     I uS$Y'( 	
   
SJ S8G+< Sr$   r'   c            	       h    \ rS rSrSrSrSSS.S\\\\	4      S\
S\
S	S
4S jjrSSS	\\   4S jrSrg
)r3   i  r1   txtr   r2   r   r%   r   r   Nc                R    [         R                  " U5      U l        X l        X0l        g r+   )r   r9   r   r%   r   )rS   r   r%   r   s       r"   rT   PlainTextCorpus.__init__+  s      $$T*	$$r$   rV   r   c              #     #    [        U R                  S5       H  n[        USS9 nU H  nUR                  S5      n[	        U5      (       d  M&  UR                  U5      nU R                  S:  a  [	        U5      U R                  :  a  Mb  U R                  S:  a  [	        U5      U R                  :  a  M  [        XUR                  5       5      v   M     SSS5        M     g! , (       d  f       M  = f7f)zYield examples from the data.

nlp (Language): The current nlp object.
YIELDS (Example): The example objects.

DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
z.txtzutf-8)encodingz
r   N)
rN   r   openrstriprD   rn   r%   r   r   copy)rS   rV   r   frk   r   s         r"   rc   PlainTextCorpus.__call__6  s      tyy&1CcG,D;;v.D4yy!ll40??a/CHt4N$!__1c#h6P$ &c88:66  -, 2,,s#   $C4%C"BC"C4"
C1	,C4)r   r%   r   r   r   r$   r"   r3   r3     sg    
 I 	%uS$Y'(	% 		%
 	% 
	%7J 78G+< 7r$   r3   )r   r   N)r   r   r   )r   r   )+r\   rE   pathlibr   typingr   r   r   r   r   r	   r
   r-    r   errorsr   r   tokensr   r   rY   r   augmentr   exampler   languager   rZ   registryreadersrl   r   r#   r>   r(   r/   r4   rN   r!   r'   r3   r   r$   r"   <module>r      s      U U U   %    ! #	 () $(
4.  	
 ! zlHW--. *& -. 	X
5d#
$XX X 	X
 zlHW--.X /X -./4 !d ! ! /! 12 O
4.OO O zlHW--.	O 3O*eCI& d4j 2v" v"r4S 4Sn.7 .7r$   