
    h,O                     D   S SK r S SKrS SKrS SKJr  S SKJr  S SKJrJ	r	J
r
JrJrJrJrJrJr  S SKrSSKJrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*  Sr+\
\,\\,\\
\,\4      4   4   r-S r.S r/ " S S\*5      r0S r1g)    N)defaultdict)Path)	AnyCallableDictIterableListOptionalSequenceTupleUnion   )ErrorsWarnings)Language)MatcherPhraseMatcher)levenshtein_compareget_ner_prf)DocSpan)Example)SimpleFrozenListensure_path	from_diskregistryto_disk   )Pipez||c                     [        U 5      $ Nr   )exampleskwargss     T/home/james-whalen/.local/lib/python3.13/site-packages/spacy/pipeline/entityruler.pyentity_ruler_scorer&      s    x      c                      [         $ r"   )r&    r'   r%   make_entity_ruler_scorerr*      s    r'   c                      \ rS rSrSr S/S\SS\S\S.S\S\	S\
\\\	4      S	\S
\S\S\	S\
\\      S\
\   SS4S jjjrS\4S jrS\	S\4S jrS\S\4S jrS\4S jrS r\S\\	S4   4S j5       rSSS.S\/ \\   4   S\
\   S\
\\      4S jjr\S\\
\	   S4   4S j5       r\S\\   4S j5       r S\\   SS4S jr!S0S  jr"S!\	SS4S" jr#S0S# jr$S\	S\\	\
\	   4   4S$ jr%S\&S!\&S\	4S% jr'\(" 5       S&.S'\)S(\\	   SS 4S) jjr*\(" 5       S&.S(\\	   S\)4S* jjr+\(" 5       S&.S+\\	\,4   S(\\	   SS 4S, jjr-\(" 5       S&.S+\\	\,4   S(\\	   SS4S- jjr.S.r/g)1EntityRuler    a  The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
`EntityRecognizer` to boost accuracy, or used on its own to implement a
purely rule-based entity recognition system. After initialization, the
component is typically added to the pipeline using `nlp.add_pipe`.

DOCS: https://spacy.io/api/entityruler
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
NF)phrase_matcher_attrmatcher_fuzzy_comparevalidateoverwrite_ents
ent_id_seppatternsscorernlpnamer.   r/   r0   r1   r2   r3   r4   returnc                   Xl         X l        X`l        [        [        5      U l        [        [        5      U l        XPl        X@l        [        UR                  XPR                  S9U l        X0l        [        UR                  U R                  US9U l        Xpl        [        [         5      U l        Ub  U R%                  U5        Xl        g)ac  Initialize the entity ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
key. A pattern can either be a token pattern (list) or a phrase pattern
(string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

nlp (Language): The shared nlp object to pass the vocab to the matchers
    and process phrase patterns.
name (str): Instance name of the current pipeline component. Typically
    passed in automatically from the factory when the component is
    added. Used to disable the current entity ruler while creating
    phrase patterns with the nlp object.
phrase_matcher_attr (int / str): Token attribute to match on, passed
    to the internal PhraseMatcher as `attr`.
matcher_fuzzy_compare (Callable): The fuzzy comparison method for the
    internal Matcher. Defaults to
    spacy.matcher.levenshtein.levenshtein_compare.
validate (bool): Whether patterns should be validated, passed to
    Matcher and PhraseMatcher as `validate`
patterns (iterable): Optional patterns to load in.
overwrite_ents (bool): If existing entities are present, e.g. entities
    added by the model, overwrite them by matches if necessary.
ent_id_sep (str): Separator used internally for entity IDs.
scorer (Optional[Callable]): The scoring method. Defaults to
    spacy.scorer.get_ner_prf.

DOCS: https://spacy.io/api/entityruler#init
r0   fuzzy_compareattrr0   N)r5   r6   	overwriter   listtoken_patternsphrase_patterns	_validater/   r   vocabmatcherr.   r   phrase_matcherr2   tuple_ent_idsadd_patternsr4   )
selfr5   r6   r.   r/   r0   r1   r2   r3   r4   s
             r%   __init__EntityRuler.__init__+   s    P 	')$/*40!%:"II8R8R
 $7 +IID44x
 %#E*h'r'   c                     [        S U R                  R                  5        5       5      n[        S U R                  R                  5        5       5      nX-   $ )z5The number of all patterns added to the entity ruler.c              3   8   #    U  H  n[        U5      v   M     g 7fr"   len.0ps     r%   	<genexpr>&EntityRuler.__len__.<locals>.<genexpr>i   s     L/K!s1vv/K   c              3   8   #    U  H  n[        U5      v   M     g 7fr"   rM   rO   s     r%   rR   rS   j   s     N0M1A0MrT   )sumr?   valuesr@   )rH   n_token_patternsn_phrase_patternss      r%   __len__EntityRuler.__len__g   sK    Lt/B/B/I/I/KLLN0D0D0K0K0MNN33r'   labelc                 H    XR                   ;   =(       d    XR                  ;   $ )z+Whether a label is present in the patterns.)r?   r@   )rH   r\   s     r%   __contains__EntityRuler.__contains__m   s    +++Lu8L8L/LLr'   docc                     U R                  5       n U R                  U5      nU R                  X5        U$ ! [         a  nU" U R                  X/U5      s SnA$ SnAff = f)zFind matches in document and add them as entities.

doc (Doc): The Doc object in the pipeline.
RETURNS (Doc): The Doc with added entities, if available.

DOCS: https://spacy.io/api/entityruler#call
N)get_error_handlermatchset_annotations	Exceptionr6   )rH   r`   error_handlermatcheses        r%   __call__EntityRuler.__call__q   s\     ..0	<jjoG  .J 	< D%;;	<s   #6 
A AAAc           
         U R                  5         [        R                  " 5          [        R                  " SSS9  [	        U R                  U5      5      [	        U R                  U5      5      -   nS S S 5        [        W VVVs/ s H  u  p4oTU:w  d  M  X4U4PM     snnn5      nS n[        XgSS9nU$ ! , (       d  f       NI= fs  snnnf )Nignorez\[W036)messagec                 $    U S   U S   -
  U S   * 4$ )Nr   r   r)   )ms    r%   <lambda>#EntityRuler.match.<locals>.<lambda>   s    !A$1+!u!5r'   T)keyreverse)	_require_patternswarningscatch_warningsfilterwarningsr>   rC   rD   setsorted)rH   r`   rg   m_idstartendfinal_matchesget_sort_keys           r%   rc   EntityRuler.match   s     $$&##Hi@4<<,-T5H5H5M0NNG ' 8?P$4DC<d3P
 6}M '&
 Qs   AB4CC4
Cc           	         [        UR                  5      n/ n[        5       nU H  u  pgn[        S XU  5       5      (       a  U R                  (       d  M2  Xu;  d  M9  US-
  U;  d  MD  X`R
                  ;   a  U R
                  U   u  p[        XXU
S9nO
[        XXS9nUR                  U5        U Vs/ s H&  oR                  U:  a  UR                  U:  a  M$  UPM(     nnUR                  [        Xx5      5        M     X4-   Ul        gs  snf )zModify the document in placec              3   8   #    U  H  oR                   v   M     g 7fr"   )ent_type)rP   ts     r%   rR   .EntityRuler.set_annotations.<locals>.<genexpr>   s     6~!::~rT   r   )r\   span_id)r\   N)r>   entsrx   anyr=   rF   r   appendr{   r|   updaterange)rH   r`   rg   entitiesnew_entitiesseen_tokensmatch_idr{   r|   r\   ent_idspanrh   s                r%   rd   EntityRuler.set_annotations   s    >e$+ HS6s~666t~~'C!G;,F}},$(MM($;MECfMDC@D##D)''!#!%%%-Ax   ""5#45 %, *	s   3#D
D
.c                 v   [        U R                  R                  5       5      nUR                  U R                  R                  5       5        [        5       nU HJ  nU R
                  U;   a&  U R                  U5      u  pEUR                  U5        M9  UR                  U5        ML     [        [        U5      5      $ )z|All labels present in the match patterns.

RETURNS (set): The string labels.

DOCS: https://spacy.io/api/entityruler#labels
)
rx   r?   keysr   r@   r2   _split_labeladdrE   ry   )rH   r   
all_labelslr\   _s         r%   labelsEntityRuler.labels   s     4&&++-.D((--/0U
A!#,,Q/u%q!  VJ'((r'   )r5   r3   get_examplesc                V    U R                  5         U(       a  U R                  U5        gg)a[  Initialize the pipe for training.

get_examples (Callable[[], Iterable[Example]]): Function that
    returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
patterns Optional[Iterable[PatternType]]: The list of patterns.

DOCS: https://spacy.io/api/entityruler#initialize
N)clearrG   )rH   r   r5   r3   s       r%   
initializeEntityRuler.initialize   s#      	

h' r'   c                 B   [        U R                  R                  5       5      nUR                  U R                  R                  5       5        [        5       nU H9  nU R
                  U;   d  M  U R                  U5      u  pEUR                  U5        M;     [        U5      $ )zAll entity ids present in the match patterns `id` properties

RETURNS (set): The string entity ids.

DOCS: https://spacy.io/api/entityruler#ent_ids
)	rx   r?   r   r   r@   r2   r   r   rE   )rH   r   all_ent_idsr   r   r   s         r%   ent_idsEntityRuler.ent_ids   s     4&&++-.D((--/0eA!# --a0	'  [!!r'   c                    / nU R                   R                  5        HA  u  p#U H6  nU R                  U5      u  pVXTS.nU(       a  XgS'   UR                  U5        M8     MC     U R                  R                  5        HK  u  p#U H@  nU R                  U5      u  pVXTR
                  S.nU(       a  XgS'   UR                  U5        MB     MM     U$ )zGet all patterns that were added to the entity ruler.

RETURNS (list): The original patterns, one dictionary per pattern.

DOCS: https://spacy.io/api/entityruler#patterns
r\   patternid)r?   itemsr   r   r@   text)rH   all_patternsr\   r3   r   	ent_labelr   rQ   s           r%   r3   EntityRuler.patterns   s     #2288:OE#$($5$5e$<!	'<$dG##A& $  ;  $3399;OE#$($5$5e$<!	'LLA$dG##A& $  < r'   c                     Sn[        U R                  R                  5       H  u  nu  pEX:X  d  M  Un  O   U R                  R                  US  Vs/ s H  oUPM     nnU R                  R                  US9   / n/ n/ n	/ n
U H  n[        US   [        5      (       aJ  UR                  US   5        U	R                  US   5        U
R                  UR                  S5      5        Me  [        US   [        5      (       d  M  UR                  U5        M     / n[        UU R                  R                  U	5      U
5       H'  u  pnXS.nU(       a  UUS'   UR                  U5        M)     X|-    GH  nUS   nSU;   aE  UnU R                  XS   5      nU R                  R                  U5      nUUS   4U R                   U'   US   n[        U["        5      (       a<  U R$                  U   R                  U5        U R&                  R)                  X/5        M  [        U[        5      (       a<  U R*                  U   R                  U5        U R                  R)                  X/5        M  [	        [,        R.                  R1                  US95      e   SSS5        gs  snf ! [         a    / n GNOf = f! , (       d  f       g= f)	aN  Add patterns to the entity ruler. A pattern can either be a token
pattern (list of dicts) or a phrase pattern (string). For example:
{'label': 'ORG', 'pattern': 'Apple'}
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

patterns (list): The patterns to add.

DOCS: https://spacy.io/api/entityruler#add_patterns
N)disabler   r\   r   r   )r   )	enumerater5   pipeline
pipe_names
ValueErrorselect_pipes
isinstancestrr   getr>   zippipe_create_labelrC   _normalize_keyrF   r   r@   rD   r   r?   r   E097format)rH   r3   current_indexir6   r   subsequent_pipesr?   phrase_pattern_labelsphrase_pattern_textsphrase_pattern_idsentryr@   r\   r   r   phrase_patternr   rr   s                      r%   rG   EntityRuler.add_patterns   s   	"M#,TXX->->#?<D<$%M $@ 261D1D]^1TU1T1TU XX""+;"<N$&!#% !#!eI.44)00w@(//i0@A&--eiio>i 0$77"))%0 " !O*-%23"+&
 ,1!E+1N4(&&~6+ (9g5= %I ..uDkBE,,55e<C*3U4[)ADMM#&	*gs++((/66w?''++E9=..''.55g>LL$$UI6$V[[%7%7%7%HII :- =<  V 	"!	"<<s<   ,J  !J  JJ  9BJ3FJ3J   J0/J03
Kc                 \   [        [        5      U l        [        [        5      U l        [        [        5      U l        [        U R                  R                  U R                  U R                  S9U l        [        U R                  R                  U R                  U R                  S9U l        g)zReset all patterns.r9   r;   N)r   r>   r?   r@   rE   rF   r   r5   rB   rA   r/   rC   r   r.   rD   rH   s    r%   r   EntityRuler.clear3  sw    )$/*40#E*HHNN^^44

 ,HHNN!9!9DNN
r'   r   c           	         U R                   R                  5        VVs/ s H  u  p#X1:X  d  M  X#4PM     nnnU(       d1  [        [        R                  R                  SXR                  S95      eU VVs/ s H  u  p#U R                  X#5      PM     nnn[        [        U R                  R                  5        VVs0 s H  u  p&X%;  d  M  X&_M     snn5      U l
        [        [        U R                  R                  5        VVs0 s H  u  p&X%;  d  M  X&_M     snn5      U l        U HJ  nX R                  ;   a  U R                  R                  U5        M/  U R                  R                  U5        ML     gs  snnf s  snnf s  snnf s  snnf )zRemove a pattern by its ent_id if a pattern with this ent_id was added before

ent_id (str): id of the pattern to be removed
RETURNS: None
DOCS: https://spacy.io/api/entityruler#remove
ID)	attr_typer\   	componentN)rF   rW   r   r   E1024r   r6   r   r   r>   r@   r   r?   rD   removerC   )rH   r   r\   eidlabel_id_pairscreated_labelsvals          r%   r   EntityRuler.removeA  sx    .2]]-A-A-C
-C\es}LUL-C 	 
 ##d&II#V  @N
?M|Du*~ 	 
  + %)$8$8$>$>$@$@LU. 
$@ 
 * %)$7$7$=$=$?$?LU. 
$?
 $E+++##**51##E*	 $;

s(   E7E74E=;F
FF	F	c                     [        U 5      S:X  a<  [        R                  " [        R                  R                  U R                  S95        gg)z:Raise a warning if this component has no patterns defined.r   )r6   N)rN   ru   warnr   W036r   r6   r   s    r%   rt   EntityRuler._require_patternsk  s4    t9>MM(--..DII.>? r'   c                 r    U R                   U;   a!  UR                  U R                   S5      u  p#X#4$ UnSnX#4$ )zSplit Entity label into ent_label and ent_id if it contains self.ent_id_sep

label (str): The value of label in a pattern entry
RETURNS (tuple): ent_label, ent_id
r   N)r2   rsplit)rH   r\   r   r   s       r%   r   EntityRuler._split_labelp  sG     ??e# %T__a @I    IF  r'   c                 T    [        U[        5      (       a  U U R                   U 3nU$ )a  Join Entity label with ent_id if the pattern has an `id` attribute
If ent_id is not a string, the label is returned as is.

label (str): The label to set for ent.label_
ent_id (str): The label
RETURNS (str): The ent_label joined with configured `ent_id_sep`
)r   r   r2   )rH   r\   r   s      r%   r   EntityRuler._create_label}  s-     fc""gdoo.vh7Er'   )excludepatterns_bytesr   c                   [         R                  " U5      nU R                  5         [        U[        5      (       a  U R                  UR                  SU5      5        UR                  SS5      U l        UR                  SS5      U l        [        U R                  R                  U R                  S9U l        UR                  S[        5      U l        U $ U R                  U5        U $ )zLoad the entity ruler from a bytestring.

patterns_bytes (bytes): The bytestring to load.
RETURNS (EntityRuler): The loaded entity ruler.

DOCS: https://spacy.io/api/entityruler#from_bytes
r3   r=   Fr.   Nr<   r2   )srslymsgpack_loadsr   r   dictrG   r   r=   r.   r   r5   rB   rD   DEFAULT_ENT_ID_SEPr2   )rH   r   r   cfgs       r%   
from_bytesEntityRuler.from_bytes  s     !!.1

c4  cggj#67 WW[%8DN'*ww/Dd'KD$"/--#D "ggl4FGDO  c"r'   c                    U R                   U R                  U R                  U R                  S.n[        R
                  " U5      $ )zSerialize the entity ruler patterns to a bytestring.

RETURNS (bytes): The serialized patterns.

DOCS: https://spacy.io/api/entityruler#to_bytes
)r=   r2   r.   r3   )r=   r2   r.   r3   r   msgpack_dumps)rH   r   serials      r%   to_bytesEntityRuler.to_bytes  s<     //#'#;#;	
 ""6**r'   pathc                r  ^ ^ [        U5      nT R                  5         UR                  S5      nUR                  S:X  a`  UR                  (       a)  [
        R                  " U5      nT R                  U5        T $ [        [        R                  R                  US95      eUR	                  5       (       a)  [
        R                  " U5      nT R                  U5        T $ UR                  5       (       a  0 mSU 4S j0nSU4S j0n[        X0 5        TR                  SS5      T l        TR                  S	5      T l        TR                  S
["        5      T l        ['        T R(                  R*                  T R                   S9T l        [        X0 5        T $ [        [        R.                  R                  US95      e)a  Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.

path (str / Path): The JSONL file to load.
RETURNS (EntityRuler): The loaded entity ruler.

DOCS: https://spacy.io/api/entityruler#from_disk
.jsonl)r   r3   c                 l   > TR                  [        R                  " U R                  S5      5      5      $ Nr   )rG   r   
read_jsonlwith_suffixrQ   rH   s    r%   rp   'EntityRuler.from_disk.<locals>.<lambda>  s'    d&7&7$$Q]]8%<='r'   r   c                 N   > TR                  [        R                  " U 5      5      $ r"   )r   r   	read_jsonrQ   r   s    r%   rp   r     s    #**U__Q=O2Pr'   r=   Fr.   r2   r   )r   r   r   suffixis_filer   r   rG   r   r   E1023r   is_dirr   r   r=   r.   r   r2   r   r5   rB   rD   E146)rH   r   r   depr_patterns_pathr3   deserializers_patternsdeserializers_cfgr   s   `      @r%   r   EntityRuler.from_disk  sx    4 

!--h7;;("|| ++D1!!(+2 / !!4!4$!4!?@@''))''(:;Hh'( ' [[]]C &"
 "'(P Qdr2 WW[%8DN'*ww/D'ED$!ggl4FGDO"/T%=%=#D dB7  V[[//T/:;;r'   c                   ^ ^ [        U5      nT R                  T R                  T R                  S.mU 4S jU4S jS.nUR                  S:X  a"  [
        R                  " UT R                  5        g[        X0 5        g)zSave the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).

path (str / Path): The JSONL file to save.

DOCS: https://spacy.io/api/entityruler#to_disk
)r=   r.   r2   c                 d   > [         R                  " U R                  S5      TR                  5      $ r   )r   write_jsonlr   r3   r   s    r%   rp   %EntityRuler.to_disk.<locals>.<lambda>  s"    %"3"3h'#r'   c                 2   > [         R                  " U T5      $ r"   )r   
write_jsonr   s    r%   rp   r    s    U--a5r'   )r3   r   r   N)	r   r=   r.   r2   r   r   r  r3   r   )rH   r   r   serializersr   s   `   @r%   r   EntityRuler.to_disk  sj     4 #'#;#;//
 6	
 ;;("dDMM2Dr*r'   )rF   rA   r2   rC   r/   r6   r5   r=   rD   r.   r@   r4   r?   )entity_ruler)r7   N)0__name__
__module____qualname____firstlineno____doc__r   r   r&   r   r   r
   r   intr   boolr	   PatternTyperI   rZ   r^   r   ri   rc   rd   propertyr   r   r   r   r   r   r   r3   rG   r   r   rt   r   r   r   r   bytesr   r   r   r   r   __static_attributes__r)   r'   r%   r,   r,       s    #:
 :>*=$,04%7:: :
 &eCHo6:  (: : : : 4,-: ": 
:x4 4M# M$ M<C <C <  +, )c3h ) ). #'48(r8G#445( h	(
 8K01(( "x}c12 " "" ${+  0:JT+%6 :J4 :Jx
(+S (+T (+T@
!# !%Xc]0B*C !
3 
 
 
 BRAS#19#	2 4D3E +8C= +% +  CSBT*#t)$*2:3-*	*Z CSBT+#t)$+2:3-+	+ +r'   r,   c                 |    U S:X  a"  [         R                  " S5      nUR                  $ [        S[         SU  35      e)Nmake_entity_rulerzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler  AttributeErrorr  )r6   modules     r%   __getattr__r    sA    ""(()CD'''
78*,>tfE
FFr'   )2r  sysru   collectionsr   pathlibr   typingr   r   r   r   r	   r
   r   r   r   r   errorsr   r   languager   rC   r   r   matcher.levenshteinr   r4   r   tokensr   r   trainingr   utilr   r   r   r   r   r   r    r   r   r  r&   r*   r,   r  r)   r'   r%   <module>r(     s     
  #  X X X  %  , 5     N N  3c4S#X#77889!V+$ V+tGr'   