
    hdI                     @   S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	J
r
JrJrJr  S SKrS SKJrJrJrJr  S SKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSK J!r!  SSK"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)J*r*  SSKJ+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1  Sr2S r3 " S S\15      r4g)    N)islice)Path)AnyCallableDictIterableListOptionalUnion)CosineDistanceModel	Optimizerset_dropout_rate)Floats2d   )util)ErrorsWarnings)	CandidateKnowledgeBase)Language)empty_kb)Scorer)DocSpan)Examplevalidate_examplesvalidate_get_examples)SimpleFrozenList)Vocab   )deserialize_config)TrainablePipeTc                 R    [         R                  " U 4S[        R                  /0UD6$ )Nnegative_labels)r   score_linksEntityLinker_v1NIL)exampleskwargss     ]/home/james-whalen/.local/lib/python3.13/site-packages/spacy/pipeline/legacy/entity_linker.pyentity_linker_scorer,      s'    hX9L9L8MXQWXX    c                   P   \ rS rSrSrSr S2\\S.S\S\	S\
S\\
   S	\S
\S\S\S\\\/\\   4   S\S\\   SS4S jjjrS\\/\4   4S jrS3S jrSSS.S\/ \\   4   S\\   S\\\/\4      4S jjrSSSS.S\\   S\S\\   S\\\
\4      S\\
\4   4
S  jjrS\\   S!\4S" jrS#\\   S\ \
   4S$ jr!S#\\   S%\ \
   SS4S& jr"\#" 5       S'.S( jr$\#" 5       S'.S) jr%\&" 5       S'.S*\'\
\(4   S+\\
   SS4S, jjr)\&" 5       S'.S*\'\
\(4   S+\\
   SS 4S- jjr*SSS..S/ jr+S0 r,S1r-g)4r'   #   zVPipeline component for named entity linking.

DOCS: https://spacy.io/api/entitylinker
r(   )	overwritescorervocabmodelnamelabels_discardn_sents
incl_priorincl_contextentity_vector_lengthget_candidatesr0   r1   returnNc                    Xl         X l        X0l        [        U5      U l        XPl        X`l        Xpl        Xl        SU
0U l	        [        SS9U l        [        U5      " U R                   5      U l        Xl        g)a  Initialize an entity linker.

vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
    losses during training.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
    produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
r0   F)	normalizeN)r2   r3   r4   listr5   r6   r7   r8   r:   cfgr   distancer   kbr1   )selfr2   r3   r4   r5   r6   r7   r8   r9   r:   r0   r1   s               r+   __init__EntityLinker_v1.__init__+   sk    > 

	">2$(,$/#;&7 /0<r-   	kb_loaderc                     [        U5      (       d/  [        [        R                  R	                  [        U5      S95      eU" U R                  5      U l        g)zaDefine the KB of this pipe by providing a function that will
create it using this object's vocab.)arg_typeN)callable
ValueErrorr   E885formattyper2   rA   )rB   rE   s     r+   set_kbEntityLinker_v1.set_kbY   s@     	""V[[//i/IJJDJJ'r-   c                    U R                   c0  [        [        R                  R	                  U R
                  S95      e[        U R                   5      S:X  a0  [        [        R                  R	                  U R
                  S95      eg )Nr4   r   )rA   rI   r   E1018rK   r4   lenE139rB   s    r+   validate_kbEntityLinker_v1.validate_kba   sb    77?V\\00dii0@AAtww<1V[[//TYY/?@@ r-   )nlprE   get_examplesrW   c                   [        US5        Ub  U R                  U5        U R                  5         U R                  R                  n/ n/ n[        U" 5       S5       HR  nUR                  UR                  5        UR                  U R                  R                  R                  U5      5        MT     [        U5      S:  d,   [        R                  R                  U R                  S95       e[        U5      S:  d,   [        R                  R                  U R                  S95       eU R                  R!                  XPR                  R                  R#                  USS9S9  g)	aO  Initialize the pipe for training, using a representative set
of data examples.

get_examples (Callable[[], Iterable[Example]]): Function that
    returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
    Note that providing this argument, will overwrite all data accumulated in the current KB.
    Use this only when loading a KB as-such from file.

DOCS: https://spacy.io/api/entitylinker#initialize
zEntityLinker_v1.initializeN
   r   rP   float32)dtype)XY)r   rM   rU   rA   r9   r   appendxr3   opsalloc1frR   r   E923rK   r4   
initializeasarray)rB   rX   rW   rE   nO
doc_samplevector_sampleexamples           r+   rd   EntityLinker_v1.initializeh   s   & 	l,HI KK	"WW))
lnb1Ggii(  !7!7!;< 2 :"FFKK$6$6DII$6$FF"=!A%Iv{{'9'9tyy'9'II%

JJNN22=	2R 	 	
r-           )dropsgdlossesr)   rl   rm   rn   c                   U R                  5         Uc  0 nUR                  U R                  S5        U(       d  U$ [        US5        / nU GH  nUR                  R
                   Vs/ s H  owPM     nnUR                  SSS9n	UR                  R                   H  n
XR                     nU(       d  M   UR                  U
R                  5      n[        SXR                   -
  5      n[#        [%        U5      S-
  XR                   -   5      nX   R                  nX   R&                  nUR(                  UU R+                  5       nUR-                  U5        M     GM     [/        U R0                  U5        U(       d3  [2        R4                  " [6        R8                  R;                  S	S
95        U$ U R0                  R=                  U5      u  nnU R?                  UUS9u  nnU" U5        Ub  U RA                  U5        X@R                  ==   U-  ss'   U$ s  snf ! [         a    [        [        R                  5      Sef = f)a  Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.

examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
    Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/entitylinker#update
Nrk   zEntityLinker_v1.update	ENT_KB_IDT	as_stringr      zEntity LinkerrP   )sentence_encodingsr)   )!rU   
setdefaultr4   r   	referencesentsget_alignedentsstartindexsentAttributeErrorRuntimeErrorr   E030maxr6   minrR   end	predictedas_docr_   r   r3   warningswarnr   W093rK   begin_updateget_lossfinish_update)rB   r)   rl   rm   rn   sentence_docsegs	sentenceskb_idsentkb_id
sent_indexstart_sentenceend_sentencestart_token	end_tokensent_docrt   
bp_contextlossd_scoress                         r+   updateEntityLinker_v1.update   s   ( 	>F$))S)M($<=B$&LL$6$67$6q$6I7^^K4^@F||((yy)5B%.__SXX%>

 &)J,E%FN#&s9~'9:;T#UL"+";"A"AK ) 7 ; ;I!||K	BIIKH!((2' ) . 	T*MM(--..O.DEM)-)@)@)O&J1H ' 
h 	8?s#yyT!E 8 * B*6;;7TABs   'H5H%H>rt   c                    [        US5        / nU Hs  nUR                  SSS9nUR                  R                   HF  nXVR                     nU(       d  M  U R
                  R                  U5      nUR                  U5        MH     Mu     U R                  R                  R                  U5      nUR                  UR                  :w  a)  [        R                  R                  SSS9n	[        U	5      eU R                   R#                  X#5      n
U R                   R%                  X#5      nU['        U5      -  n[)        U5      U
4$ )NzEntityLinker_v1.get_lossrp   Trq   r   zgold entities do not match upmethodmsg)r   rx   rv   ry   rz   rA   
get_vectorr_   r3   ra   	asarray2fshaper   E147rK   r~   r@   get_gradr   rR   float)rB   r)   rt   entity_encodingsr   r   r   r   entity_encodingerr	gradientsr   s               r+   r   EntityLinker_v1.get_loss   s   ($>?B^^K4^@F||((yy)5&*gg&8&8&?O$++O<	 )   ::>>334DE##'7'='==++$$!'F % C s##MM**+=P	}}%%&8Kc*++T{I%%r-   docsc           	      r   U R                  5         Sn/ nU(       d  U$ [        U[        5      (       a  U/n[        U5       GH  u  pEUR                   Vs/ s H  ofPM     nn[        U5      S:  d  M2  UR                   GHb  nUR                  n	UR                  U	5      n
U
S:  d   e[        SXR                  -
  5      n[        [        U5      S-
  XR                  -   5      nX{   R                  nX|   R                  nX]U R                  5       nU R                  R                   R"                  nU R$                  (       aF  U R                  R'                  U/5      S   nUR(                  nUR*                  R-                  U5      nUS-  nUR.                  U R0                  ;   a  UR3                  U R4                  5        GMG  [7        U R9                  U R:                  U5      5      nU(       d  UR3                  U R4                  5        GM  [        U5      S:X  a!  UR3                  US   R<                  5        GM  [>        R@                  " U5        URC                  U Vs/ s H  nURD                  PM     sn5      nU RF                  (       d"  URC                  U Vs/ s H  nSPM     sn5      nUnU R$                  (       a  URC                  U Vs/ s H  nURH                  PM     sn5      nUR*                  R-                  USS9n[        U5      [        U5      :w  a'  [K        [L        RN                  RQ                  SSS95      eURS                  UW5      WU-  -  nURT                  URT                  :w  a  [W        [L        RX                  5      eUU-   UU-  -
  nUR[                  5       R]                  5       nUU   nUR3                  UR<                  5        GMe     GM     [        U5      U:X  d)  [L        RN                  RQ                  SSS9n[K        U5      eU$ s  snf s  snf s  snf s  snf )	a@  Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction.

docs (Iterable[Doc]): The documents to predict.
RETURNS (List[str]): The models prediction for each document.

DOCS: https://spacy.io/api/entitylinker#predict
r   rs   rk   )axispredictzvectors not of equal lengthr   z$result variables not of equal length)/rU   
isinstancer   	enumeraterw   rR   ry   r|   r{   r   r6   r   rz   r   r   r3   ra   xpr8   r   Tlinalgnormlabel_r5   r_   r(   r>   r:   rA   entity_randomshufflere   
prior_probr7   entity_vectorr~   r   r   rK   dotr   rI   E161argmaxitem)rB   r   entity_countfinal_kb_idsidocr   r   r   r|   r   r   r   r   r   r   r   sentence_encodingsentence_encoding_tsentence_norm
candidatescprior_probs_scoresr   entity_normsims
best_indexbest_candidater   s                                  r+   r   EntityLinker_v1.predict   s    	"$dC  6DoFA$'II.IqII.3x!|88C88D!*!6J%?*?%(J,E%FN#&s9~'9:;T#UL"+";"A"AK ) 7 ; ;I"y9@@BH**B((,0JJ,>,>z,J1,M).?.A.A+(*		7J(K A%LzzT%8%88$++DHH5%)$*=*=dggs*K%L
)(//9 _1(//
10E0EF"NN:6*,**J5WJqallJ5W*XK#'??.0jjz9Rz!#z9R.S%0F#0035::>H$IjQ__j$I4" 0 /1iinn=MTUn.V#&'7#8C<L#L*6(.(:(:3<0M ); )*+& %& (*vv.>@S'T$1K$?(" $(::1B1B#B*4V[[*A$A)4t);{T?Q)R)/)=)=)?J-7
-CN(//0F0FGs $	 &| L!\1++$$ &L % C s##E /D 6X9R
 %Js   P%%P*"P/P4r   c                    [        U VVs/ s H  o3R                    H  oDPM     M     snn5      nU[        U5      :w  a0  [        [        R                  R                  U[        U5      S95      eSnU R                  S   nU HG  nUR                   H4  nX&   nUS-  nU H"  n	U	R                  S:X  d	  U(       d  M  Xl        M$     M6     MI     gs  snnf )zModify a batch of documents, using pre-computed scores.

docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.

DOCS: https://spacy.io/api/entitylinker#set_annotations
)ry   idsr   r0   rs   N)	rR   ry   rI   r   E148rK   r?   	ent_kb_id
ent_kb_id_)
rB   r   r   r   r   
count_entsr   r0   r   tokens
             r+   set_annotationsEntityLinker_v1.set_annotations7  s     B####BC
V$V[[//ZS[/QRRHH[)	Cxx	Q E!+yy+0( !    Cs   C
excludec                  ^ ^ T R                  5         0 n[        T S5      (       a  T R                  b	  U 4S jUS'   UU 4S jUS'   T R                  R                  US'   T R
                  R                  US'   [        R                  " UT5      $ )zSerialize the pipe to a bytestring.

exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.

DOCS: https://spacy.io/api/entitylinker#to_bytes
r?   c                  D   > [         R                  " T R                  5      $ N)srsly
json_dumpsr?   rT   s   r+   <lambda>*EntityLinker_v1.to_bytes.<locals>.<lambda>W  s    u'7'7'Ar-   c                  6   > TR                   R                  T S9$ Nr   )r2   to_bytes)r   rB   s   r+   r   r   X  s    TZZ%8%8%8%Ir-   r2   rA   r3   )_validate_serialization_attrshasattrr?   rA   r   r3   r   )rB   r   	serializes   `` r+   r   EntityLinker_v1.to_bytesL  sx     	**,	4DHH$8AIeI	'''**	$!ZZ00	'}}Y00r-   c                   ^ ^ T R                  5         U 4S jn0 n[        T S5      (       a  T R                  b	  U 4S jUS'   UU 4S jUS'   U 4S jUS'   X4S'   [        R                  " XT5        T $ )	zLoad the pipe from a bytestring.

exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (TrainablePipe): The loaded object.

DOCS: https://spacy.io/api/entitylinker#from_bytes
c                    >  TR                   R                  U 5        g ! [         a    [        [        R
                  5      S ef = fr   )r3   
from_bytesr}   rI   r   E149brB   s    r+   
load_model.EntityLinker_v1.from_bytes.<locals>.load_modelg  s:    8

%%a(! 8 -478s	    %Ar?   c                 b   > TR                   R                  [        R                  " U 5      5      $ r   )r?   r   r   
json_loadsr   s    r+   r   ,EntityLinker_v1.from_bytes.<locals>.<lambda>o  s    488??5;K;KA;N+Or-   c                 8   > TR                   R                  U TS9$ r   )r2   r   )r   r   rB   s    r+   r   r   p  s    )>)>q')>)Rr-   r2   c                 :   > TR                   R                  U 5      $ r   )rA   r   r   s    r+   r   r   q  s    dgg&8&8&;r-   rA   r3   )r   r   r?   r   r   )rB   
bytes_datar   r   deserializes   ` `  r+   r   EntityLinker_v1.from_bytes]  sl     	**,	8 4DHH$8!OKRG;D)G
9r-   pathr   c                   ^ ^ 0 nUU 4S jUS'   U 4S jUS'   U 4S jUS'   U 4S jUS'   [         R                  " XT5        g	)
zSerialize the pipe to disk.

path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.

DOCS: https://spacy.io/api/entitylinker#to_disk
c                 8   > TR                   R                  U TS9$ r   )r2   to_diskpr   rB   s    r+   r   )EntityLinker_v1.to_disk.<locals>.<lambda>  s    tzz'9'9!W'9'Mr-   r2   c                 F   > [         R                  " U TR                  5      $ r   )r   
write_jsonr?   r   rB   s    r+   r   r     s    U%5%5a%Br-   r?   c                 :   > TR                   R                  U 5      $ r   )rA   r   r  s    r+   r   r     s    DGGOOA$6r-   rA   c                 :   > TR                   R                  U 5      $ r   )r3   r   r  s    r+   r   r     s    tzz'9'9!'<r-   r3   N)r   r   )rB   r   r   r   s   ` ` r+   r   EntityLinker_v1.to_diskv  s?     	M	'B	%6	$<	'Tg.r-   c                   ^ ^ U 4S jn0 nU 4S jUS'   UU 4S jUS'   U 4S jUS'   X4S'   [         R                  " XT5        T $ )	a&  Load the pipe from disk. Modifies the object in place and returns it.

path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (EntityLinker): The modified EntityLinker object.

DOCS: https://spacy.io/api/entitylinker#from_disk
c                    >  U R                  S5       nTR                  R                  UR                  5       5        S S S 5        g ! , (       d  f       g = f! [         a    [        [        R                  5      S ef = f)Nrb)openr3   r   readr}   rI   r   r   )r   infilerB   s     r+   r   -EntityLinker_v1.from_disk.<locals>.load_model  sV    8VVD\VJJ))&++-8 "\\! 8 -478s'   A *AA 
AA A %A=c                 L   > TR                   R                  [        U 5      5      $ r   )r?   r   r"   r  s    r+   r   +EntityLinker_v1.from_disk.<locals>.<lambda>  s    txx7I!7L'Mr-   r?   c                 8   > TR                   R                  U TS9$ r   )r2   	from_diskr   s    r+   r   r    s    )=)=a)=)Qr-   r2   c                 :   > TR                   R                  U 5      $ r   )rA   r  r  s    r+   r   r    s    dgg&7&7&:r-   rA   r3   )r   r  )rB   r   r   r   r   s   ` `  r+   r  EntityLinker_v1.from_disk  sH    	8 8:MEQG:D)Gt'2r-   )rm   rn   c                    [         er   NotImplementedError)rB   r)   rm   rn   configs        r+   rehearseEntityLinker_v1.rehearse      !!r-   c                     [         er   r  )rB   labels     r+   	add_labelEntityLinker_v1.add_label  r  r-   )r?   r@   r:   r8   r7   rA   r5   r3   r6   r4   r1   r2   )entity_linker)r;   N).__name__
__module____qualname____firstlineno____doc__r(   BACKWARD_OVERWRITEr,   r    r   strr   intboolr   r   r   r   r
   rC   rM   rU   r   r   rd   r   r   r   r   r   r   r   r	   r   r   tupler   r   r   r   r   r   r  r  r  __static_attributes__ r-   r+   r'   r'   #   sr   
 C $	, -%8,, , 	, !, , , , ", !-!68K!KL, , ", 
,\(%-)? @ (A #'@D!
r8G#445!
 h	!

 HeWm%;<=!
N #'-1?7#? 	?
 i ? c5j)*? 
c5j	?B&'!2 & &*THSM Td3i Tl1HSM 149 1 1* #(' 1" 16 4 CSBT/#t)$/2:3-/	/$ CSBT#t)$2:3-	6 )-T ""r-   r'   )5r   r   	itertoolsr   pathlibr   typingr   r   r   r   r	   r
   r   r   	thinc.apir   r   r   r   thinc.typesr    r   errorsr   r   rA   r   r   languager   mlr   r1   r   tokensr   r   trainingr   r   r   r   r2   r    piper"   trainable_piper#   r%  r,   r'   r+  r-   r+   <module>r9     sk        G G G  H H    & *      I I $  % *  YC"m C"r-   