
    h@7                     |   S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	J
r
JrJrJrJr  S SKrS SKrS SKrS SKJrJrJrJr  SSKJrJr  SSKJr  SS	KJr  SS
KJrJ r J!r!J"r"J#r#J$r$J%r%J&r&J'r'  SSK(J)r*  SSK(J+r+  SSK,J-r-  \
(       a  SSK.J/r/  SS.S\S\0SS4S jjr1SSSS.SSS\\   S\\   S\\2   SS4
S jjr3SS.SSS\\2\4   S \4SS4S! jjr5SSS"\\2\4   S#\\2\4   S\44S$ jr6S\*Rn                  S%S&.SSS'\\   S(\0S)\0S\\2   S*\2S+\2SS4S, jjr8\*Rn                  S-.S'\S.\0S*\24S/ jjr9S0\\2\4   S\	4S1 jr:S2 r;g)3    Nislice)Path)IOTYPE_CHECKINGAnyDictOptionalUnion)ConfigConfigValidationErrorfix_random_seedset_gpu_allocator   )ErrorsWarnings)Lookups)ConfigSchemaTraining)	DEFAULT_OOV_PROBOOV_RANKensure_pathget_sourced_components
load_modelload_model_from_configloggerregistryresolve_dot_names)Mode)Vectors   )get_tok2vec_ref)Language)use_gpuconfigr$   returnr"   c          	        ^^^ U nUR                  5       n SU S   ;  a&  [        [        R                  R	                  SS95      eSU S   ;  a&  [        [        R                  R	                  SS95      eU S   S   b  [        U S   S   5        U S   S   nUS:  a  U(       a  [        U5        [        U 5      n[        USS	9m[        R                  " S
5        TR                  R                  5       n [        R                  " U S   [        S9nUS   US   /n[        US   [         5      (       d1  [#        [        R$                  R	                  S['        US   5      S9S9e[        US   [         5      (       d1  [#        [        R$                  R	                  S['        US   5      S9S9e[)        X5      u  mnUS   nUS   n	U V
s/ s H  oU	;  d  M
  U
PM     nn
[        R                  " STR*                  5        U(       a>  TR-                  US9   [        R                  " SU5        TR/                  US9  S S S 5        TR1                  5         TR-                  / U	QUQS9   US   S:X  a0  Sm[        R2                  " ST5        TR5                  UUU4S jUS9  OTR5                  UU4S jUS9  [        R                  " STR*                  5        S S S 5        TR6                   H  u  p[9        US / 5       H  nUTR*                  ;  a  M  X;   a6  X;  a1  [        R:                  " [<        R>                  R	                  XS!95        X;  d  MW  X;   d  M^  XS"   ;  d  Mh  [        R:                  " [<        R@                  R	                  XS!95        M     M     T$ s  sn
f ! , (       d  f       GNp= f! , (       d  f       N= f)#Nseedtrainingz[training] seed)valuegpu_allocatorz[training] gpu_allocatorr   T)	auto_fillzSet up nlp object from config)schematrain_corpus
dev_corpusztraining.train_corpus)fieldtype)descztraining.dev_corpus	optimizerfrozen_componentszPipeline: %s)enablezResuming training for: %s)sgddisable
max_epochsr#   d   zDue to streamed train corpus, using only first %s examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labelsc                  (   > [        T" T 5      T5      $ Nr   )nlpsample_sizer.   s   S/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/initialize.py<lambda>init_nlp.<locals>.<lambda>\   s    |C0+>    c                     > T" T 5      $ r<    )r=   r.   s   r?   r@   rA   _   s
    <#4rB   z#Initialized pipeline components: %slistening_components)namelistenerannotating_components)!interpolate
ValueErrorr   E1015formatr   r   r   r   r   infor%   r   resolver   
isinstancestrr   E897r1   r   
pipe_namesselect_pipesresume_training_link_componentsdebug
initializepipelinegetattrwarningr   W087W086)r%   r$   
raw_config	allocatorsourcedT	dot_namesr/   r3   r4   presume_componentsrF   procrG   r=   r>   r.   s                  @@@r?   init_nlpre   $   sw   J##%FVJ'',,3D,EFFfZ00,,3M,NOOj&!-z*623z"?3I!|	)$$V,G
 t
<C
KK/0ZZ##%F
+4HIA>"AlO4Ia'--###-D>9J4K $ 
 	

 aos++###+$q2G $ 
 	

  1CL*+I-.$+JGq8I/IGJ
KK/%67KK35FGI. 8
 			"J$5"J8I"J		K\?b KLLB 	 NN>I   NN4)ND93>>J 
L  ll
("
H s~~-,1Nx}}333QR0T5N!899NN8==#7#7T#7#UV
 # JM K 87 
L	Ks%   	NN'N#%A0N5#
N25
O)datalookupsvectorsr=   rf   rg   rh   c                   U(       a@  X R                   l        [        R                  " SSR	                  UR
                  5      5        [        U5      nUb  [        R                  " U5      nU R                    H  n[        Ul
        M     U H/  nSU;   a  M  U R                   US      nUR                  " S0 UD6  M1     [        U R                   5      (       a   [        S U R                    5       5      S-
  nO[        nU R                   R                  R!                  SU05        [        R                  " S[        U R                   5      5        [        R                  " S	5        Ub"  [#        X5        [        R                  " S
U5        U R$                  R'                  S0 5      n	[        U	5      S:  a~  [)        U R                   R*                  R-                  S/S95      n
U	R/                  5        H=  u  pX:w  d  M  [0        R2                  " [4        R6                  R9                  US95        M?     [        R                  " S5        g )NzAdded vocab lookups: %sz, settingsorthc              3   8   #    U  H  oR                   v   M     g 7fr<   )prob).0lexs     r?   	<genexpr>init_vocab.<locals>.<genexpr>   s     9y88y   r    oov_probz%Added %d lexical entries to the vocabzCreated vocabularyzAdded vectors: %s_sourced_vectors_hashesr   strings)excluderF   z Finished initializing nlp objectrD   )vocabrg   r   rM   jointablesr   srsly
read_jsonlr   rank	set_attrslenminr   cfgupdateload_vectors_into_modelmetapophashrh   to_bytesitemswarningswarnr   W113rL   )r=   rf   rg   rh   	data_path	lex_attrslexemeattrsrs   sourced_vectors_hashesvectors_hashsourced_componentsourced_vectors_hashs                r?   
init_vocabr   r   s    #		-tyy/HID!I$$Y/	iiF"FK  EU"YYuV}-F%u%	 
 syy>>9syy99A=H'H		j(34;S^L
KK$%-'1 XX\\*CRH
!"Q&CII--66	{6KL7M7S7S7U33hmm228I2JK 8V KK23rB   T)add_stringsrF   r   c                L    S/nU(       d  UR                  S5        [        XR                  US9n[        UR                  R                  R                  5       5      S:X  a2  UR                  R                  R                  [        R                  :w  dY  UR                  R                  R                  S   S:X  ac  UR                  R                  R                  [        R                  :X  a1  [        R                  " [        R                  R!                  US	95        U R                   HF  n	U R                  R                  R"                  R%                  U	R&                  [(        5      U	l        MH     g! [         a$  nSU 3nSn[        R                  " XVUS9nUSeSnAff = f)
zHLoad word vectors from an installed model or path into a model instance.rg   ru   )rx   rv   z$Config validation error for vectors zThis typically means that there's a problem in the config.cfg included with the packaged vectors. Make sure that the vectors package you're loading is compatible with the current version of spaCy.)titler2   Nr   rw   )appendr   rx   r   
from_errorr   rh   keysmodeVectorsModefloretshaper   rZ   r   W112rL   key2rowgetrk   r   r}   )
r=   rF   r   rv   vectors_nlper   r2   errro   s
             r?   r   r      sK    +NN9% YYH 	K%%**,-2%%**k.@.@@!!''*a/%%**k.@.@@x}}+++67yy99$$,,008D ' ! 6tf=G 	
 $..qDIts   /E5 5
F#?FF#pretrain_configinit_configc                    UnUnS n[        US   5      nUb`  UR                  5       (       d!  SU 3nSS/US./n[        U R                  US9eUR	                  S5       n	U	R                  5       nS S S 5        Ub4  [        X5      n
U
R                  U5        [        R                  " SU5        gg	! , (       d  f       NF= f)
Ninit_tok2veczcan't find pretrained tok2vec: rW   )locmsg)r%   errorsrbz!Loaded pretrained weights from %sTF)
r   existsr   r%   openreadr!   
from_bytesr   rM   )r=   r   r   PIweights_datar   r   r   file_layers              r?   r   r      s     	AALq01L""$$3L>BC+^<SIJF'szz&IIt$ ::<L %'&7F %$s   B00
B>ORTH)rF   r   attrvectors_loctruncatepruner   r   c                   [        U5      nU(       Ga  UR                  S   R                  S5      (       a  US:w  a  [        S5      e[	        U R
                  R                  [        R                  " UR                  S5      5      S9U R
                  l
        U R
                   Hd  nUR                  (       d  M  UR                  [        :w  d  M,  U R
                  R                  R                  UR                  UR                  S9  Mf     U R
                  R                  5         GOU(       a=  [         R"                  " SU5        [%        UUUS	9u  pn
[         R"                  " S
U5        OSu  pU	b=  U[&        R(                  :w  a)  U	 H#  nXR
                  ;  d  M  U R
                  U     M%     Ub  U[&        R(                  :X  a3  [	        SU R
                  R                  UUS.W
D6U R
                  l
        OI[	        U R
                  R                  UU	US9U R
                  l
        U R
                  R                  5         Uc<  U R*                  S    SU R*                  S    S3U R
                  R                  l        OX@R
                  R                  l        U R
                  R                  R,                  U R*                  S   S'   US:  a1  U[&        R(                  :w  a  U R
                  R/                  U5        g g g )Nr#   z.npzr   z@ORTH is the only attribute supported for vectors in .npz format.r   )ru   rf   )rowzReading vectors from %sr   zLoaded vectors from %s)NN)ru   rf   r   )ru   rf   r   r   lang_rF   z.vectorsrh   r    rD   )r   partsendswithrJ   r   rx   ru   numpyloadr   rh   r}   r   addrk   deduplicate_vectorsr   rM   read_vectorsr   r   r   rF   prune_vectors)r=   r   r   r   rF   r   r   ro   vectors_datavector_keysfloret_settingswords               r?   convert_vectorsr      se    k*K{((,55f==6>R  $II%%EJJ{7G7G7M,N
		 99CxxxCHH0		!!%%chhCHH%=  			%%'KK1;?9E:6L
 KK0+>(4%L"t{/A/A'A#yy(IIdO $ #{)))$+ %II--%% &	%		! %,II--%$	%		! 		--/|$'HHV$4#5Qsxx7G6H!Q		!%		"%))"3"3"8"8CHHYzdk000		& 1zrB   r   truncate_vectorsc                   [        U 5      n[        U5      R                  5       n[        S US S  5       5      n0 nU[        R
                  :X  az  [        U5      S:w  a  [        S5      eS[        US   5      [        US   5      [        US   5      [        US   5      US	   US
   S.nUS:  a  [        [        R                  5      eO[        U5      S:X  d   eUS:  a  XS   4n[        R                  " USS9n/ n[        [        R                  " US S95       H  u  pU
R                  5       n
U
R!                  SUR"                  S   5      nUR%                  S5      n[        U5      UR"                  S   :w  a&  [        [        R&                  R)                  XS95      e[        R*                  " USS9Xy'   UR-                  U5        XS-
  :X  d  M    O   XxU4$ )Nc              3   8   #    U  H  n[        U5      v   M     g 7fr<   intrn   sizes     r?   rp   read_vectors.<locals>.<genexpr>  s     9(8#d))(8rr   r      z^Invalid header for floret vectors. Expected: bucket dim minn maxn hash_count hash_seed BOW EOWr                  )r   minnmaxn
hash_count	hash_seedboweowr    f)r   dtyper7    r   )line_numr   )r   )ensure_shapenextsplittupler   r   r   rJ   r   r   E860r   zeros	enumeratetqdmrstriprsplitr   r   E094rL   asarrayr   )r   r   r   r   header_partsr   r   r   vectors_keysilinepiecesr   s                r?   r   r     s    	[!A7==?L9Ra(899EO{!!!|!N 
 Q(Q(l1o.\!_-??
 q V[[)) ! < A%%%q %Qx0E;;U#6LLTYYq$78{{}S,"4"4Q"78zz!}v;,,,Q//V[[///LMM--c:D!1$$ 9 66rB   r   c                 2   [        U 5      n [        R                  " [        U 5      5      (       a   [        R                  " [        U 5      S5      $ U R
                  S   R                  S5      (       a'  S [        R                  " [        U 5      S5       5       $ U R
                  S   R                  S5      (       aL  [        R                  " [        U 5      5      nUR                  5       nUR	                  US   5      nS U 5       $ U R	                  SS	S
9$ )z%Handle .gz, .tar.gz or unzipped fileszr:gzr#   gzc              3   B   #    U  H  oR                  S 5      v   M     g7futf8Ndecodern   r   s     r?   rp   open_file.<locals>.<genexpr>D  s     I0HF##0H   rzipr   c              3   B   #    U  H  oR                  S 5      v   M     g7fr   r   r   s     r?   rp   r   I  s     6F##r   r   )encoding)r   tarfile
is_tarfilerP   r   r   r   gzipzipfileZipFilenamelist)r   zip_filenamesr   s       r?   	open_filer  >  s    
c
C#c(##||CHf--	2			%	%I		#c(C0HII	2			&	&??3s8,!!#eAh'666xxfx--rB   c              #     #    [        U 5      n[        U5      n [        S UR                  5       SS  5       5      nUb  Uv   U Sh  vN   OZ[        UR                  5       5      S-
  nSnU H  nUS-  nM
     U SU 3v   [        U 5      nU Sh  vN   UR                  5         UR                  5         g! [         a    Sn Nf = f N N97f)zEnsure that the first line of the data is the vectors shape.
If it's not, we read in the data and output the shape as the first result,
so that the reader doesn't have to deal with the problem.
c              3   8   #    U  H  n[        U5      v   M     g 7fr<   r   r   s     r?   rp   ensure_shape.<locals>.<genexpr>V  s     C,BDc$ii,Brr   Nr   r    r   )r  r   r   r   rJ   r   close)r   lines
first_liner   widthlengthr   lines2s           r?   r   r   N  s     
 k"EeJCJ,<,<,>r,BCC  J$$&'!+AaKF %!! ;'	KKM'  
 	 	sF   C#B: C	C
A
CC%C:C	CC		CC)<r  r  r   r  	itertoolsr   pathlibr   typingr   r   r   r	   r
   r   r   r{   r   	thinc.apir   r   r   r   r   r   r   rg   r   schemasr   utilr   r   r   r   r   r   r   r   r   rh   r   r   r   pretrainr!   languager"   r   re   rP   r   boolr   r   defaultr   r   r  r   rD   rB   r?   <module>r      s         @ @    W W %  *
 
 
 *  %# 02 KV K Kj Kb  !%!%4	%4 4.%4 g	%4
 c]%4 
%4R EIE	E d+E=AE	ED	&*38nCGS>	: ##='	='$=' 	='
 =' 3-=' =' =' 
='B >I=P=P(7(7),(77:(7V.5d# . . rB   