
    h%                        S SK r S SKrS SKJr  S SKJr  S SKJrJrJ	r	J
r
Jr  S SKrS SKJrJrJrJrJrJr  S SKJr  S SKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#       S(S\S\S\
\   S\
\$   S\$S\%S\%4S jjr&S\\\\#4      S\	\   4S jr' S)S\S\S\
\$   S\%S\$4
S jjr(S\S\\   S\S\S\)4
S jr*S  r+S! r, " S" S#5      r- S*S$\\)\$4   S%\$S&\$S\.4S' jjr/g)+    N)Counter)Path)CallableIterableListOptionalUnion)ConfigModel	Optimizerfix_random_seedset_dropout_rateset_gpu_allocator)ConfigValidationError)Printer   )Errors)ConfigSchemaPretrain)Doc)dot_to_objectload_model_from_configregistry   )Exampleconfig
output_dirresume_pathepoch_resumeuse_gpusilent	skip_lastc                   ^^^^ [        US9nU S   S   b  [        U S   S   5        U S   S   nUS:  a  U(       a  [        U5        S U S   S'   [        U 5      n	U	R                  R                  5       n
[        R                  " U
S   [        S	9n[        XS
   5      n[        R                  " S
U05      S
   nUS   n[        X5      mUS   mUb  [        TX#US9nOSnTR                  S   n[        SS9mUS   (       a  UR                  SU SUS    S35        OUR                  SU 35        SSS.nUR                  " S 0 UD6  S!UUUU4S jjn [!        X;S   5       H  n[#        U" U" U	5      5      5       Hm  u  nn[%        U5      n['        TUTU5      nTR)                  UUU5      nU(       a  UR                  " U40 UD6  US   (       d  MX  UUS   -  S:X  d  Mf  U" USS9  Mo     US   (       a!  UUS   -  S:X  d  UUS   S-
  :X  a  U" U5        OU" U5        STl        M     U(       d  U" US   SS9  g g ! U(       d  U" US   SS9  f f = f)"Nno_printtrainingseedgpu_allocatorr   
initializeinit_tok2vecpretraining)schemacorpusbatcher	optimizer)r    lossi'  )	frequencyn_save_epochz/Pre-training tok2vec layer - starting at epoch z - saving every z epoch)   
   r3         )rr6   r6   r6   r6   )widthsalignsc                 ^  > U(       a  SOSnTR                  TR                  5         U(       a  T	S-  nOT	SU  U S3-  nUR                  S5       nUR                  TR	                  S5      R                  5       5        S S S 5        T
R                  T
R                  T
R                  U S.nT	S	-  R                  S
5       nUR                  [        R                  " U5      S-   5        S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       N(= f! , (       d  f       g = f)Nz.temp zmodel-last.binmodelz.binwbtok2vec)nr_wordr/   
epoch_lossepochz	log.jsonla
)
use_paramsaveragesopenwriteget_refto_bytesr>   r/   r?   srsly
json_dumps)r@   is_tempis_lastis_temp_str	save_pathfile_logr;   r.   r   trackers          Q/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/pretrain.py_save_modelpretrain.<locals>._save_modelF   s    !(gbi001&>:	&5}D)II	!!$'5EMM)4==?@ ( #??%00	C {*005E,,S1D89 6 21
 (' 65 21s<   *D/C<AD)D+D<
D
	D
D	D
D,
max_epochsn_save_everyT)rK   r           )rL   ))#z# Wordsz
Total LossLosszw/s)FF)r   r   r   r   r   interpolater   resolver   r   create_pretraining_model_resume_modelattrsProgressTrackerdividerrowrange	enumerateensure_docsmake_updateupdater?   )r   r   r   r   r   r    r!   msg	allocatornlp_configPr,   r-   	objectiverow_settingsrS   r@   batch_idbatchdocsr/   progressr;   r.   rQ   s    `                     @@@rR   pretrainrr      s    6
"Cj&!-z*623z"?3I!|	)$+/F<(
 
(Cjj$$&G/8LMA7hK0Fx01(;F	lG$S,E+I$UKfU F#I.G=l^K[\]^l\m[nntu	
 	El^TU/;TULGGJ\J: :(7<<9E#,WVC[-A#B%"5)"5$	9E">>%t<GGH55^$$(Q~5F*F!*Kt4 $C  1^,,1UaoPQ>Q5Q&E"!$G :" ,6 y,6 s   A<H? 
H? #A	H? ?Iexamples_or_docsreturnc                     / nU  HF  n[        U[        5      (       a  UR                  U5        M+  UR                  UR                  5        MH     U$ N)
isinstancer   append	reference)rs   rp   	eg_or_docs      rR   rd   rd   p   sC    D%	i%%KK	"KK	++,	 &
 K    r;   c                    [        US9nUR                  SU 35        UR                  S5       nUR                  5       nU R	                  S5      R                  U5        S S S 5        Ucd  [        R                  " S[        U5      5      nU(       a$  [        UR                  S5      SS  S S 5      S	-   nO[        [        R                  5      eUR                  S
U 35        U$ ! , (       d  f       N= f)Nr#   zResume training tok2vec from: rbr=   zmodel\d+\.binr      r   zResuming from epoch: )r   inforE   readrG   
from_bytesresearchstrintgroup
ValueErrorr   E1020)r;   r   r   r    rg   rO   weights_data
model_names           rR   r]   r]   z   s     6
"CHH-k];<			$	5zz|i ++L9 
  YY/[1AB
z//2126s;<q@L V\\**HH$\N34 
 	s   1C%%
C3rp   r.   objective_funcc                     U R                  U5      u  pEU" U R                  X5      u  pgU" U5        U R                  U5        [        U5      $ )zPerform an update over a single batch of documents.

docs (iterable): A batch of `Doc` objects.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
)begin_updateopsfinish_updatefloat)r;   rp   r.   r   predictionsbackpropr/   	gradientss           rR   re   re      sK     "..t4K$UYYBODY		" ;r{   c                 x   U R                  / S9   U R                  5         SSS5        [        X5      n[        U5      R                  S:X  a9  UR
                  S:w  a  UR
                  OSnU R                  U5      R                  n UR                  U R                  S5      /S9  US   nU" U R                  U5      nUR                  U R                  S5      /S9  [        XqS   5        U$ ! , (       d  f       N= f! [         a1    US   nUS	   n[        [        R                  R                  XES
95      ef = f)a  Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
)enableNTok2VecListener*r=   zGive it a doc to infer shapes)X	componentlayer)r   r   rl   dropout)select_pipesr(   get_tok2vec_reftype__name__upstream_nameget_piper;   make_docr   r   E874formatvocabr   )ri   pretrain_configr=   original_tok2vecr   r   create_functionr;   s           rR   r\   r\      s1    
				$ 
%c3GG}!22%,%:%:c%AG!!y 	 ,,/066Ocll+JKLM &k2OCIIw/E	%DEFGUI67L) 
%	$  O#K0	(++i+MNNOs   C-C> -
C;>;D9c                     US   nUc#  SnSnSS/US./n[        U R                  S   XSS9eU R                  U5      R                  nUS   (       a  UR	                  US   5      nU$ )Nr   zpTo use pretrained tok2vec weights, [pretraining.component] needs to specify the component that should load them.zcomponent can't be nullr*   )locrg   )r   errorsdescr   )r   r   r   r;   rG   )ri   r   tok2vec_componentr   errr   r   s          rR   r   r      s    '4 D 	 ((+6sCD#::m,V
 	
 LL*+11Ewog67Lr{   c                   $    \ rS rSrSS jrS rSrg)r_      c                     SU l         SU l        SU l        [        5       U l        Xl        [        R                  " 5       U l        SU l        SU l	        g )NrW   r   )
r/   	prev_lossr>   r   words_per_epochr0   time	last_timelast_updater?   )selfr0   s     rR   __init__ProgressTracker.__init__   sB    	&y"r{   c                    U =R                   U-  sl         U =R                  U-  sl        [        S U 5       5      nU R                  U==   U-  ss'   U =R                  U-  sl        U R                  U R
                  -
  nXPR                  :  a  U[        R                  " 5       U R                  -
  -  nU R                  U l        [        R                  " 5       U l        U R                   U R                  -
  nUU R                  [        U R                   SS9[        USS9[        U5      4n[        U R                   5      U l	        U$ g )Nc              3   8   #    U  H  n[        U5      v   M     g 7frv   )len).0docs     rR   	<genexpr>)ProgressTracker.update.<locals>.<genexpr>   s     6#SXXs   r3   )widthr4   )r/   r?   sumr   r>   r   r0   r   r   r   _smart_roundr   r   )	r   r@   r/   rp   words_in_batchwords_since_updatewpsloss_per_wordstatuss	            rR   rf   ProgressTracker.update   s    		T	4666U#~5#&!\\D,<,<</$		dnn(DEC#||D!YY[DN II6MTYYb1]!4CF #499-DNMr{   )r?   r0   r   r   r/   r>   r   r   N)i@B )r   
__module____qualname____firstlineno__r   rf   __static_attributes__ r{   rR   r_   r_      s    r{   r_   figurer   max_decimalc                     [        [        [        U 5      5      5      nXS-   -
  nUS::  a  [        [        U 5      5      $ [        XB5      nS[        U5      -   S-   nXP-  $ )z=Round large numbers as integers, smaller numbers as decimals.r   z%.f)r   r   r   min)r   r   r   n_digits	n_decimal
format_strs         rR   r   r      s_     3s6{#$HA&IA~3v;	/	C	N*S0
""r{   )NNTF)T)r3   r5   )0r   r   collectionsr   pathlibr   typingr   r   r   r   r	   rI   	thinc.apir
   r   r   r   r   r   thinc.configr   wasabir   r   r   schemasr   tokensr   utilr   r   r   exampler   r   boolrr   rd   r]   r   re   r\   r   r_   r   r   r   r{   rR   <module>r      s   	    < <   /   *  B B  #'"&S7S7S7 $S7 3-	S7
 S7 S7 S7l(5g+>"? DI  RV#3;C=JN. 2;MU
&>$! !J DE#%*#&)#=@##r{   