
    h>3                        S SK r S SKrS SKJr  S SKJrJrJrJrJ	r	J
r
Jr  SSKJr  SSKJr  SSKJrJr  \(       a  SS	KJr  S
\S\S\
\\\	\   4      S\S\S\
\	\      S\S\/\\   4   4S jrSSSSSSS.SSS\S
\S\S\
\\\	\   4      S\S\S\
\	\      S\\   4S jjrS\S\S\\\	\   4   S\S\/\\   4   4S jrS\S\S\/\\   4   4S jrSSS\S\\   4S jrSSS\S\S\\   4S jrSSS\4S jrSSS.SSS\S\\\	\   4   S\S\S\\   4S  jjrS!S".SSS#\S$\\\	\   4   S\\\	\\\	\   4      4   S\S\\\\\	\   4   4   4S% jjr SSS\S&\S'\!S\4
S( jr"S) r#g)*    N)partial)TYPE_CHECKINGCallableDictIteratorListOptionalTuple   )registry   )Example)_doc_to_biluo_tags_with_partialsplit_bilu_label)Languagelower_level
orth_levelorth_variantswhitespace_levelwhitespace_per_tokenwhitespace_variantsreturnr   c           
      (    [        [        U UUUUUS9$ )a,  Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training.

lower_level (float): The percentage of texts that will be lowercased.
orth_level (float): The percentage of texts that will be augmented.
orth_variants (Optional[Dict[str, List[Dict]]]): A dictionary containing the
    single and paired orth variants. Typically loaded from a JSON file.
whitespace_level (float): The percentage of texts that will have whitespace
    tokens inserted.
whitespace_per_token (float): The number of whitespace tokens to insert in
    the modified doc as a percentage of the doc length.
whitespace_variants (Optional[List[str]]): The whitespace token texts.
RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
r   r   r   r   r   r   )r   combined_augmenterr   s         P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/augment.pycreate_combined_augmenterr      s&    , #)1/     g        r   nlpexamplec             #     #    [         R                   " 5       U:  a  [        X5      nU(       a  [         R                   " 5       U:  an  UR                  nUR                  5       n	[	        UR
                  5      U	S   S'   [        U UU	S   USS9u  pXS'   UR                  U R                  U
5      U	5      nU(       a  [         R                   " 5       U:  a{  [        [        [        UR
                  5      U-  5      5       HM  n[        U U[         R                  " U5      [         R                  " S[        UR
                  5      5      5      nMO     Uv   g 7f)Ndoc_annotationentitiestoken_annotationFlowerr   )randommake_lowercase_varianttextto_dictr   	referencemake_orth_variants	from_dictmake_docrangeintlenmake_whitespace_variantchoice	randrange)r   r    r   r   r   r   r   r   raw_text	orig_dictvariant_textvariant_token_annot_s                r   r   r   /   s"     }}$(6:5<<OO%	2Q3
	"#J/ -?()-
) )<$%##CLL$>	Jv}}1AAs3w0014HHIJA-12  C(9(9$:;	G K Ms   EElevelr&   c                      [        [        X US9$ )a  Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training.

level (float): The percentage of texts that will be augmented.
lower (float): The percentage of texts that will be lowercased.
orth_variants (Dict[str, List[Dict]]): A dictionary containing
    the single and paired orth variants. Typically loaded from a JSON file.
RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
)r   r:   r&   )r   orth_variants_augmenter)r:   r&   r   s      r   create_orth_variants_augmenterr=   V   s     }QV r   c                     [        [        U S9$ )a#  Create a data augmentation callback that converts documents to lowercase.
The callback can be added to a corpus or other data iterator during training.

level (float): The percentage of texts that will be augmented.
RETURNS (Callable[[Language, Example], Iterator[Example]]): The augmenter.
r:   )r   lower_casing_augmenterr?   s    r   create_lower_casing_augmenterrA   g   s     )77r   c              #      #    Uv   g 7fN )r   r    s     r   dont_augmentrE   s   s
     
Ms   c             #   b   #    [         R                   " 5       U:  a  Uv   g [        X5      v   g 7frC   )r'   r(   )r   r    r:   s      r   r@   r@   w   s%      }}%$S22s   -/c                 .   UR                  5       n[        UR                  5      US   S'   U R                  UR                  R                  5       5      nUR                   Vs/ s H  oDR                  PM     snUS   S'   UR                  X25      $ s  snf )Nr"   r#   r$   ORTH)r*   r   r+   r.   r)   r&   lower_r-   )r   r    example_dictdocts        r   r(   r(      s    ??$L1P2L!":. ,,w||))+
,CBIBSBS/TBSQBS/TL#$V,S// 0Us   #B)r:   r&   c             #   j  #    [         R                   " 5       U:  a  Uv   g UR                  nUR                  5       n[        UR                  5      US   S'   [        U UUS   UUS L=(       a    [         R                   " 5       U:  S9u  pxXS'   UR                  U R                  U5      U5      v   g 7f)Nr"   r#   r$   r%   )r'   r)   r*   r   r+   r,   r-   r.   )	r   r    r   r:   r&   r5   r6   r7   r8   s	            r   r<   r<      s      }}%<<OO%	2Q3
	"#J/ -?()$&B6==?U+B-
) )<$%\ :IFFs   B1B3Fr%   raw
token_dictc                   UR                  S/ 5      nUR                  S/ 5      nU(       d  X4$ U(       a/  U Vs/ s H  owR                  5       PM     nnUR                  5       nU(       d  XRS'   X4$ UR                  S/ 5      nU V	s/ s H  n	[        R                  " U	S   5      PM     n
n	[	        [        U5      5       HB  n[	        [        U5      5       H'  nXk   X   S   ;   d  M  X[   X   S   ;   d  M!  X   X['   M)     MD     UR                  S/ 5      nU V	s/ s H  n	[        R                  " U	S   5      PM     n
n	[	        [        U5      5       H  n[	        [        U5      5       H  nXk   X   S   ;   d  M  UU   [        R                  R                  X   S   5      ;   d  M?  [        R                  " SS/5      n[        X   S   5      S	:X  a  X   S   R                  Xk   5      nO+X   S    H   nX[   U;   d  M  UR                  X[   5      nM"     X   U   X['   M     M     XRS'   [        U5      nX4$ s  snf s  sn	f s  sn	f )
NrH   TAGsinglevariantstagspairedr   r   r   )getr&   r'   r3   r/   r1   	itertoolschainfrom_iterableindexconstruct_modified_raw_text)r   rN   rO   r   r&   wordsrT   wndsvxpunct_choicesword_idx	punct_idxndpvpair_idxpairs                   r   r,   r,      s/    NN62&E>>%$D$)*EqE*iik"6Xr*D;?@4aV]]1Z=14M@#e*%s4y)I$/&"99Otz'BB"/": * & Xr*D;?@4aV]]1Z=14M@#e*%s4y)I~!88U>..tz/JK>L "==!Q0tv./14#v6<<T^LH !%
 ; ?d2'+zz%/'BH !< #0":8"D * &" v
%j
1C?M + A As   I 	#I#I

whitespacepositionc                    UR                  5       n[        UR                  5      US   S'   UR                  S0 5      nUR                  S0 5      n[	        UR                  5      S:X  d  SU;  d  [	        UR                  S/ 5      5      S:  db  [	        UR                  R
                  5      S:  d?  UR                  R                  S5      (       a!  UR                  R                  SSS	9(       d  U$ UR                  S/ 5      n[	        U5      nSUs=::  a  U::  d   e   eUR                  R                  S
5      (       aq  Sn	US:  aT  X8:  aO  US   US-
     n
US   U   nSU
;   a6  SU;   a0  [        U
5      u  p[        U5      u  pUS;   a  US;   a
  X:X  a  SU 3n	US   R                  X95        OUS	 US   R                  X25        US   R                  US5        UR                  R                  S5      (       a  US   R                  US5        OUS	 UR                  R                  S5      (       a  US   R                  X25        OUS	 UR                  R                  S5      (       a  US   R                  US5        OUS	 UR                  R                  S5      (       a  US   R                  US5        OUS	 UR                  R                  SSS	9(       a  US:X  a  US   R                  US5        OUS   R                  X3S-
  5        [        [	        US   5      5       H!  nUS   U   U:  d  M  US   U==   S-  ss'   M#     US   R                  US5        OUS	 US	 UR                  R                  S5      (       a  US   R                  US5        OUS	 [        U5      n[        R                  " U R                  U5      U5      $ )a  Insert the whitespace token at the specified token offset in the doc.
This is primarily intended for v2-compatible training data that doesn't
include links or spans. If the document includes links, spans, or partial
dependency annotation, it is returned without modifications.

The augmentation follows the basics of the v2 space attachment policy, but
without a distinction between "real" and other tokens, so space tokens
may be attached to space tokens:
- at the beginning of a sentence attach the space token to the following
  token
- otherwise attach the space token to the preceding token

The augmenter does not attempt to consolidate adjacent whitespace in the
same way that the tokenizer would.

The following annotation is used for the space token:
TAG: "_SP"
MORPH: ""
POS: "SPACE"
LEMMA: ORTH
DEP: "dep"
SENT_START: False

The annotation for each attribute is only set for the space token if there
is already at least partial annotation for that attribute in the original
example.

RETURNS (Example): Example with one additional space token.
r"   r#   r$   r   rH   linksDEPT)require_completeENT_TYPEOr   -)BI)rp   LzI-SPACYFrQ   _SPLEMMAPOSSPACEMORPH HEADdep
SENT_START)r*   r   r+   rV   r1   spanshas_annotationr   insertr/   r[   r   r-   r.   )r   r    rf   rg   rJ   doc_dictrO   r\   lengthentityent_prevent_nextent_iob_prevent_type_prevent_iob_nextent_type_nextirN   s                     r   r2   r2      s   F ??$L1P2L!":.  0"5H!!"4b9J 	G!##x||GR()A-w  &&'!+,,U33%%44UT4R NN62&EZF"F"""""''
33a<H-
+HqL9H
+H5Hh3(?.>x.H+.>x.H+ J.$
2%6!-1F##H5Z vh3wx/''..5  51u''007""88w''..5  73u''007""8R0w'''Eq=v%%h2v%%h1=s:f-./A&!!$06"1%*% 0 	5  51vu''55< ''%8|$
%j
1CS\\#.==r   c                 d    Sn[        U S   U S   5       H  u  p#X-  nU(       d  M  US-  nM     U$ )z2Construct modified raw text from words and spaces.rx   rH   rr    )zip)rO   rN   orthspacys       r   r[   r[   R  sA    
C:f-z'/BC53JC D Jr   )$rW   r'   	functoolsr   typingr   r   r   r   r   r	   r
   utilr   r    r   	iob_utilsr   r   languager   floatstrr   r   r=   rA   rE   r@   r(   r<   boolr,   r0   r2   r[   rD   r   r   <module>r      s      Q Q Q   H# Dd4j12 	
   "$s), z7#Xg%667J 59!"%/3$	$$ 	$
 $ Dd4j12$ $  $ "$s),$ g$N/3CdO/Dz7#Xg%667"	8	8z7#Xg%667	8j 7 x7H 3	3%3163g30
 0W 0 G	GG T$Z(G
 G G gGB 4	4	4 S$s)^$4 T$sDI~"6778	4 4 3S$s)^$$%4ns>	s>s> s> 	s>
 s>lr   