
    h6                     ^   S SK r S SKrS SKJr  S SKJrJrJrJrJ	r	J
r
Jr  S SKrS SKJrJrJrJrJr  S SKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&  Sr'\" 5       RQ                  \'5      S   r)Sr*Sr+S\\   S\\,\4   4S jr-S r. " S S\&5      r/S r0g)    N)islice)AnyCallableDictIterableListOptionalTuple)ConfigModel	Optimizerget_array_moduleset_dropout_rate)Floats2d   )Errors)Language)Scorer)Doc)Examplevalidate_examplesvalidate_get_examples)registry)Vocab   )TrainablePipeaW  
[model]
@architectures = "spacy.TextCatEnsemble.v2"

[model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = 64
rows = [2000, 2000, 500, 1000, 500]
attrs = ["NORM", "LOWER", "PREFIX", "SUFFIX", "SHAPE"]
include_static_vectors = false

[model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = ${model.tok2vec.embed.width}
window_size = 1
maxout_pieces = 3
depth = 2

[model.linear_model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
modelz
[model]
@architectures = "spacy.TextCatBOW.v3"
exclusive_classes = true
length = 262144
ngram_size = 1
no_output_layer = false
a`  
[model]
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = true
use_reduce_first = false
use_reduce_last = false
use_reduce_max = false
use_reduce_mean = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
examplesreturnc                 6    [         R                  " U S4SS0UD6$ )Ncatsmulti_labelF)r   
score_cats)r   kwargss     P/home/james-whalen/.local/lib/python3.13/site-packages/spacy/pipeline/textcat.pytextcat_scorer&   O   s/      	     c                      [         $ N)r&    r'   r%   make_textcat_scorerr+   X   s    r'   c                   J   \ rS rSrSr S&\S.S\S\S\S\	S\
\   S	S
4S jjjr\S 5       r\S	\\   4S j5       r\S	\\   4S j5       rS\\   4S jrS\\   S	S
4S jrSS
S
S.S\\   S\	S\
\   S\
\\\	4      S	\\\	4   4
S jjrSS
S
S.S\\   S\	S\
\   S\
\\\	4      S	\\\	4   4
S jjrS\\   S	\\R:                  \R:                  4   4S jrS\\   S	\\	\	4   4S jrS\S	\ 4S jr!S
S
S
S.S\/ \\   4   S \
\"   S!\
\\      S"\
\   S	S
4
S# jjr#S\\   4S$ jr$S%r%g
)'TextCategorizer\   zePipeline component for single-label text classification.

DOCS: https://spacy.io/api/textcategorizer
)scorervocabr   name	thresholdr/   r   Nc                n    Xl         X l        X0l        SU l        / USS.n[	        U5      U l        XPl        g)a  Initialize a text categorizer for single-label classification.

vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
    losses during training.
threshold (float): Unused, not needed for single-label (exclusive
    classes) classification.
scorer (Optional[Callable]): The scoring method. Defaults to
        Scorer.score_cats for the attribute "cats".

DOCS: https://spacy.io/api/textcategorizer#init
N)labelsr2   positive_label)r0   r   r1   _rehearsal_modeldictcfgr/   )selfr0   r   r1   r2   r/   r8   s          r%   __init__TextCategorizer.__init__b   s=    , 

	 $""

 9r'   c                     g)NFr*   r9   s    r%   support_missing_values&TextCategorizer.support_missing_values   s    
 r'   c                 2    [        U R                  S   5      $ )zvRETURNS (Tuple[str]): The labels currently added to the component.

DOCS: https://spacy.io/api/textcategorizer#labels
r4   )tupler8   r=   s    r%   r4   TextCategorizer.labels   s     TXXh'((r'   c                     U R                   $ )zvRETURNS (List[str]): Information about the component's labels.

DOCS: https://spacy.io/api/textcategorizer#label_data
)r4   r=   s    r%   
label_dataTextCategorizer.label_data   s     {{r'   docsc                    [        S U 5       5      (       du  U Vs/ s H  o"R                  PM     nnU R                  R                  R                  nUR                  [        [        U5      5      [        U R                  5      45      nU$ U R                  R                  U5      nU R                  R                  R                  U5      nU$ s  snf )zApply the pipeline's model to a batch of docs, without modifying them.

docs (Iterable[Doc]): The documents to predict.
RETURNS: The models prediction for each document.

DOCS: https://spacy.io/api/textcategorizer#predict
c              3   8   #    U  H  n[        U5      v   M     g 7fr)   len.0docs     r%   	<genexpr>*TextCategorizer.predict.<locals>.<genexpr>        ,t3s88t   )anytensorr   opsxpzerosrJ   listr4   predictasarray)r9   rF   rM   tensorsrU   scoress         r%   rX   TextCategorizer.predict   s     ,t,,,-12TczzTG2""BXXs4:DKK0@ABFM##D)''/ 3s   Cc                     [        U5       H?  u  p4[        U R                  5       H!  u  pV[        X#U4   5      UR                  U'   M#     MA     g)zModify a batch of Doc objects, using pre-computed scores.

docs (Iterable[Doc]): The documents to modify.
scores: The scores to set, produced by TextCategorizer.predict.

DOCS: https://spacy.io/api/textcategorizer#set_annotations
N)	enumerater4   floatr!   )r9   rF   r[   irM   jlabels          r%   set_annotationsTextCategorizer.set_annotations   sA      oFA%dkk2"'!t"5 3 &r'           )dropsgdlossesr   rf   rg   rh   c                   Uc  0 nUR                  U R                  S5        [        US5        U R                  U5        [	        S U 5       5      (       d  U$ [        U R                  U5        U R                  R                  U Vs/ s H  oUR                  PM     sn5      u  pgU R                  X5      u  pU" U	5        Ub  U R                  U5        X@R                  ==   U-  ss'   U$ s  snf )a  Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.

examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
    Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/textcategorizer#update
re   zTextCategorizer.updatec              3   p   #    U  H,  oR                   (       a  [        UR                   5      OS v   M.     g7f)r   N)	predictedrJ   )rL   egs     r%   rN   )TextCategorizer.update.<locals>.<genexpr>   s"     Oh3r||$!;hs   46)
setdefaultr1   r   _validate_categoriesrR   r   r   begin_updaterk   get_lossfinish_update)
r9   r   rf   rg   rh   rl   r[   	bp_scoreslossd_scoress
             r%   updateTextCategorizer.update   s    ( >F$))S)($<=!!(+OhOOOMT* JJ33H4UHb\\H4UVx8(?s#yyT! 5Vs   C,c                @   Uc  0 nUR                  U R                  S5        U R                  c  U$ [        US5        U R	                  U5        U Vs/ s H  oUR
                  PM     nn[        S U 5       5      (       d  U$ [        U R                  U5        U R                  R                  U5      u  pxU R                  R                  U5      u  pXy-
  nU" U5        Ub  U R                  U5        X@R                  ==   US-  R                  5       -  ss'   U$ s  snf )aS  Perform a "rehearsal" update from a batch of data. Rehearsal updates
teach the current model to make predictions similar to an initial model,
to try to address the "catastrophic forgetting" problem. This feature is
experimental.

examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
    Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.

DOCS: https://spacy.io/api/textcategorizer#rehearse
re   zTextCategorizer.rehearsec              3   8   #    U  H  n[        U5      v   M     g 7fr)   rI   rK   s     r%   rN   +TextCategorizer.rehearse.<locals>.<genexpr>   rP   rQ   r   )rn   r1   r6   r   ro   rk   rR   r   r   rp   rr   sum)r9   r   rf   rg   rh   rl   rF   r[   rs   target_gradients               r%   rehearseTextCategorizer.rehearse   s    , >F$))S)  (M($>?!!(+'/0xx0,t,,,MT* JJ33D9))66t<	?(?s#yyhk..00 1s   Dc                 (   [        [        U5      5      n[        R                  " U[        U R                  5      4SS9n[        R
                  " U[        U R                  5      4SS9n[        U5       Ht  u  pV[        U R                  5       HV  u  pxXR                  R                  ;   a  UR                  R                  U   X5U4'   M=  U R                  (       d  MP  SXEU4'   MX     Mv     U R                  R                  R                  U5      nX44$ )Nf)dtypere   )rJ   rW   numpyrV   r4   onesr^   	referencer!   r>   r   rT   rY   )	r9   r   nr_examplestruthsnot_missingr`   rl   ra   rb   s	            r%   _examples_to_truth"TextCategorizer._examples_to_truth	  s     $x.)k3t{{+;<CHjj+s4;;/?!@Lx(EA%dkk2LL---#%<<#4#4U#;Fa4L000(+K1%	 3 ) ''/""r'   c                     [        US5        U R                  U5        U R                  U5      u  p4U R                  R                  R                  U5      nX#-
  nXT-  nUS-  R                  5       n[        U5      U4$ )a5  Find the loss and gradient of loss for the batch of documents and
their predicted scores.

examples (Iterable[Examples]): The batch of examples.
scores: Scores representing the model's predictions.
RETURNS (Tuple[float, float]): The loss and the gradient.

DOCS: https://spacy.io/api/textcategorizer#get_loss
zTextCategorizer.get_lossr   )r   ro   r   r   rT   rY   meanr_   )r9   r   r[   r   r   ru   mean_square_errors          r%   rq   TextCategorizer.get_loss  s{     	($>?!!(+"55h?jjnn,,[9?%q[..0&'11r'   rb   c                     [        U[        5      (       d  [        [        R                  5      eXR
                  ;   a  gU R                  5         U R                  S   R                  U5        U R                  (       a\  SU R                  R                  ;   aB  U R                  R                  S   " U R                  [        U R
                  5      5      U l	        U R                  R                  R                  U5        g)zAdd a new label to the pipe.

label (str): The label to add.
RETURNS (int): 0 if label is already present, otherwise 1.

DOCS: https://spacy.io/api/textcategorizer#add_label
r   r4   resize_outputr   )
isinstancestr
ValueErrorr   E187r4   _allow_extra_labelr8   appendr   attrsrJ   r0   stringsadd)r9   rb   s     r%   	add_labelTextCategorizer.add_label+  s     %%%V[[))KK!!!%(::/TZZ-=-==))/:4::s4;;GWXDJ

u%r'   )nlpr4   r5   get_examplesr   r4   r5   c                r   [        US5        U R                  U" 5       5        Uc=  U" 5        H1  nUR                  R                   H  nU R	                  U5        M     M3     OU H  nU R	                  U5        M     [        U R                  5      S:  a  [        [        R                  5      eUb  X@R                  ;  a2  [        R                  R                  X@R                  S9n[        U5      e[        U R                  5      S:w  a2  [        R                  R                  X@R                  S9n[        U5      eX@R                  S'   [        [        U" 5       S5      5      n	U	 V
s/ s H  oR                   PM     nn
U R#                  U	5      u  pU R%                  5         [        U5      S:  d,   [        R&                  R                  U R(                  S95       e[        U5      S:  d,   [        R&                  R                  U R(                  S95       eU R*                  R-                  XS	9  gs  sn
f )
a  Initialize the pipe for training, using a representative set
of data examples.

get_examples (Callable[[], Iterable[Example]]): Function that
    returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
labels (Optional[Iterable[str]]): The labels to add to the component, typically generated by the
    `init labels` command. If no labels are provided, the get_examples
    callback is used to extract the labels from the data.
positive_label (Optional[str]): The positive label for a binary task with exclusive classes,
    `None` otherwise and by default.

DOCS: https://spacy.io/api/textcategorizer#initialize
zTextCategorizer.initializeNr   )	pos_labelr4   r5   
   r   )r1   )XY)r   ro   yr!   r   rJ   r4   r   r   E867E920formatE919r8   rW   r   r   r   _require_labelsE923r1   r   
initialize)r9   r   r   r4   r5   examplecatrb   errsubbatchrl   
doc_samplelabel_sampler}   s                 r%   r   TextCategorizer.initialize>  s   , 	l,HI!!,.1>'>"99>>CNN3' * *  u%  t{{aV[[))%[[0kk((>++(V o%4;;1$kk((>++(V o%%3!"|~r23-56XrllX
611(;:"FFKK$6$6DII$6$FF"< 1$Hfkk&8&8dii&8&HH$


; 7s   *H4c                    U H  n[        UR                  R                  R                  5       5      nUR	                  S5      S:  a:  [        [        R                  R                  UR                  R                  S95      eU H7  nUS:X  a  M  US:X  a  M  [        [        R                  R                  US95      e   M     g)zKCheck whether the provided examples all have single-label cats annotations.g      ?r   )valuere   )valN)
rW   r   r!   valuescountr   r   E895r   E851)r9   r   exvalsr   s        r%   ro   $TextCategorizer._validate_categorieso  s    B))0023Dzz#" !3!3",,:K:K!3!LMMs
cSj$V[[%7%7C%7%@AA 	 r'   )r6   r8   r   r1   r/   r0   )textcat)&__name__
__module____qualname____firstlineno____doc__r&   r   r   r   r_   r	   r   r:   propertyr>   r
   r4   r   rD   r   r   rX   rc   r   r   r   rv   r   r   ndarrayr   rq   intr   r   r   ro   __static_attributes__r*   r'   r%   r-   r-   \   sm    	  &3     	    "  
 D   )c
 ) ) DI  HSM $
6HSM 
6d 
6  #'-1#7## 	#
 i # c5j)*# 
c5j	#R #'-1)7#) 	)
 i ) c5j)*) 
c5j	)V# )#	u}}emm+	,#2'!2 2uUE\?R 2&s s . #'*.(,/<r8G#445/< h	/<
 #'/< !/< 
/<bBXg-> Br'   r-   c                 |    U S:X  a"  [         R                  " S5      nUR                  $ [        S[         SU  35      e)Nmake_textcatzspacy.pipeline.factorieszmodule z has no attribute )	importlibimport_moduler   AttributeErrorr   )r1   modules     r%   __getattr__r   {  s@    ~(()CD"""
78*,>tfE
FFr'   )1r   sys	itertoolsr   typingr   r   r   r   r   r	   r
   r   	thinc.apir   r   r   r   r   thinc.typesr   errorsr   languager   r/   r   tokensr   trainingr   r   r   utilr   r0   r   trainable_piper   single_label_default_configfrom_strDEFAULT_SINGLE_TEXTCAT_MODELsingle_label_bow_configsingle_label_cnn_configr   r&   r+   r-   r   r*   r'   r%   <module>r      s     
  G G G  R R       H H   ) 8  &x001LMgV   *HW- DcN [Bm [B~Gr'   