
    h /                        S SK Jr  S SKrS SKrS SKrS SKJrJr  S SK	J
r
  S SKJr  S SKJrJr  S SKJr  S SKJrJr  S S	KJr  S S
KJr  S SKJr  S SKJr  S SKJr  SSK
Jr  Sr Sr!Sr"0 SS0SSSSS./r#SSSSS.SSSSS./r$S r%\RL                  RO                  S \#5      \RL                  RO                  S!S"5      S# 5       5       r(\RL                  RO                  S \$5      S$ 5       r)\RL                  RO                  S \$5      S% 5       r*\RL                  RO                  S&\!\ /5      S' 5       r+S( r,S) r-S* r.S+ r/S, r0S- r1g).    )PathN)Configget_current_ops)util)English)DEFAULT_CONFIG_PATHDEFAULT_CONFIG_PRETRAIN_PATH)create_pretrain_vectors)DocDocBin)init_nlp)train)pretrain)Vectors)Vocab   )make_tempdiraE  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}

[pretraining]
max_epochs = 5

[training]
max_epochs = 5
a  
[nlp]
lang = "en"
pipeline = ["tagger"]

[components]

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[pretraining]
max_epochs = 5

[training]
max_epochs = 5
a  
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]

[components]

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v2"

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}

[pretraining]
max_epochs = 5

[pretraining.objective]
@architectures = spacy.PretrainVectors.v1
maxout_pieces = 3
hidden_size = 300
loss = cosine

[training]
max_epochs = 5
@architectureszspacy.PretrainCharacters.v1   *   )r   maxout_pieceshidden_sizen_characterszspacy.PretrainVectors.v1   ,  cosine)r   r   r   loss   L2c                      [        5       R                  [        5      n [        R                  " U SSS9nUR
                  n[        R                  " [        5      nUR                  U5      nSUS   S   S   ;   d   eg)	z7Test that pretraining defaults to a character objectiveTF	auto_fillvalidatePretrainCharacterspretraining	objectiver   N)	r   from_strpretrain_string_internalr   load_model_from_configconfigload_configr	   merge)r*   nlpfilledpretrain_configs       _/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/training/test_pretraining.pytest_pretraining_defaultr1      sq    X78F

%
%fu
MCZZF&&'CDO""6*F6-#8#EFV#WWWW    r&   	skip_last)TFc                    [        5       R                  [        5      nXS   S'   [        R                  " USSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        5        n[        U5      nXtS   S'   UR                  5       nUS   S   S	:X  d   e[        XFUS
9  [        US-  5      R                  5       (       d   e[        US-  5      R                  5       (       d   e[        US-  5      R                  5       (       a   eU(       a$  [        US-  5      R                  5       (       a   eO#[        US-  5      R                  5       (       d   eSSS5        g! , (       d  f       g= f)z8Test that pretraining works with the character objectiver%   r&   TFr!   pathsraw_text	componenttok2vec)r3   
model0.bin
model4.bin
model5.binmodel-last.binNr   r'   pretrain_string_listenerr   r)   r*   r+   r	   r,   r   write_sample_jsonlinterpolater   r   exists)r&   r3   r*   r-   r.   r/   tmp_dir	file_paths           r0   #test_pretraining_tok2vec_charactersrD      s\    X78F)2=+&

%
%fu
MCZZF&&'CDO""6*F	7&w/	&/w
###%m$[1Y>>>I6Gl*+224444Gl*+224444,./668888G&667>>@@@@@"223::<<<< 
s   <C2E77
Fc                    [        5       R                  [        5      nXS   S'   [        R                  " USSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        5        n[        U5      nXcS   S'   UR                  5       nUS   S	   b   e[        R                  " [        5         [        X55        S
S
S
5        S
S
S
5        g
! , (       d  f       N= f! , (       d  f       g
= f)z]Test that pretraining doesn't works with the vectors objective if there are no static vectorsr%   r&   TFr!   r5   r6   
initializevectorsN)r   r'   r>   r   r)   r*   r+   r	   r,   r   r?   r@   pytestraises
ValueErrorr   )r&   r*   r-   r.   r/   rB   rC   s          r0   %test_pretraining_tok2vec_vectors_failrK      s     X78F)2=+&

%
%fu
MCZZF&&'CDO""6*F	7&w/	&/w
###%l#I.666]]:&V% ' 

 '& 
s%   <AC2C!C2!
C/	+C22
D c                    [        5       R                  [        5      nXS   S'   [        R                  " USSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        5        n[        U5      nXcS   S'   [        U5      nXsS   S	'   UR                  5       n[        X55        S
S
S
5        g
! , (       d  f       g
= f)zQTest that pretraining works with the vectors objective and static vectors definedr%   r&   TFr!   r5   r6   rF   rG   N)r   r'   r>   r   r)   r*   r+   r	   r,   r   r?   write_vectors_modelr@   r   )r&   r*   r-   r.   r/   rB   rC   nlp_paths           r0    test_pretraining_tok2vec_vectorsrO      s     X78F)2=+&

%
%fu
MCZZF&&'CDO""6*F	7&w/	&/w
#&w/*2|Y'##%! 
s   <A C
Cr*   c                    [        5       R                  [        5      n [        R                  " U SSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        5        n[        U5      nXRS   S'   SUS   S'   S	US   S
'   UR                  5       n[        X$5        [        US-  5      R                  5       (       d   e[        US-  5      R                  5       (       d   e[        US-  5      R                  5       (       d   e[        US-  5      R                  5       (       a   e SSS5        g! , (       d  f       g= f)z?Test pretraining of the tagger's tok2vec layer (via a listener)TFr!   r5   r6   taggerr%   r7   r8   layerr9   r:   r<   r;   Nr=   r*   r-   r.   r/   rB   rC   s         r0   test_pretraining_tagger_tok2vecrT      s0    X78F

%
%fu
MCZZF&&'CDO""6*F	7&w/	&/w
#-5}k*)2}g&##%!Gl*+224444Gl*+224444G../668888,./6688888 
s   5C
E		
Ec                     [        5       R                  [        5      n [        R                  " U SSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        5        n[        U5      nXRS   S'   SUS   S'   UR                  5       n[        R                  " [        5         [        X$5        S	S	S	5        S	S	S	5        g	! , (       d  f       N= f! , (       d  f       g	= f)
z\Test pretraining of the tagger itself will throw an error (not an appropriate tok2vec layer)TFr!   r5   r6   rQ   r%   r7   N)r   r'   r(   r   r)   r*   r+   r	   r,   r   r?   r@   rH   rI   rJ   r   rS   s         r0   test_pretraining_taggerrV      s    X78F

%
%fu
MCZZF&&'CDO""6*F	7&w/	&/w
#-5}k*##%]]:&V% ' 

 '& 
s%   5AC(:CC(
C%	!C((
C6c            	         [        5       R                  [        5      n [        R                  " U SSS9nUR
                  n[        R                  " [        5      nUR                  U5      n[        R                  " [        5      nUR                  U5      n[        5        nUS-  nUR                  5         [        U5      nXrS   S'   SUS   S	'   S
US   S'   US-  nUR                  5         [        U5      u  pXS   S'   XS   S'   UR                  5       nUS   n[        U5      nUR!                  US	   5      R"                  R%                  US   5      R%                  S5      nSnUR'                  5        H  nUR(                  S:X  d  M  UnM     [+        X&5        [-        US-  5      nUR/                  5       (       d   e[1        U5      US   S'   [        U5      nUR!                  US	   5      R"                  R%                  US   5      R%                  S5      nSnUR'                  5        H  nUR(                  S:X  d  M  UnM     [2        R4                  " [2        R6                  " UR9                  S5      UR9                  S5      5      5      (       d   e[;        X5        SSS5        g! , (       d  f       g= f)z5Test that training can use a pretrained Tok2Vec modelTFr!   r   r5   r6   rQ   r%   r7   r8   rR   r   devembedN	hashembedz
model3.binrF   init_tok2vecE)r   r'   r(   r   r)   r*   r+   r	   r,   r   r   mkdirr?   write_sample_trainingr@   r   get_pipemodelget_refwalknamer   r   rA   strnpany	not_equal	get_paramr   )r*   r-   r.   r/   train_configrB   pretrain_dirrC   	train_dir
train_pathdev_pathPnlp_base
model_base
embed_basenodepretrained_modelr`   rY   s                      r0   test_pretraining_trainingrt     sp   X78F

%
%fu
MCZZF&&'CDO""6*F##$78L'F	7+&|4	&/w
#-5}k*)2}g&g%	4Y?
#-w !)w##%=!F#an-33;;AgJGOOPWX 	 
OO%DyyK'!
 & 	&| ;<&&((((/23C/D|^,vQ{^,22::1W:FNNwWJJLDyyK' ! vvbll5??3#79M9Mc9RSTTTTcE 
s    C*J0B4J0A!J00
J>c                 h    SS0SSSS.S.SS0S	SSS.S./nU  S
3n[         R                  " X!5        U$ )Nid1z$This is the best TV you'll ever buy!   r   )posneg)metatextcats2zI wouldn't buy this again.z/text.jsonl)srslywrite_jsonl)rB   datarC   s      r0   r?   r?   :  s]     3K:a(	
 3K0a(	
D );'I	i&r2   c                     / SQn/ SQn[        [        5       R                  XS9n[        5       nUR	                  U5        U  S3nU  S3nUR                  U5        UR                  U5        XV4$ )N)Theplayersstart.)DTNNVBZr   )wordstagsz/train.spacyz
/dev.spacy)r   r   vocabr   addto_disk)rB   r   r   docdoc_binrl   rm   s          r0   r^   r^   L  sj    ,E#D
giooU
6ChGKK9L)J*%HOOJOOHr2   c                 x   SS K n[        5       nUR                  R                  SSS5      UR                  R                  SSS5      UR                  R                  SSS5      S.nUR	                  5        H  u  pEUR                  XE5        M     U S-  n[        U5      nUR                  U5        [        U5      $ )Nr   rx   )r   )dogcatorangevectors_model)	numpyr   randomuniformitems
set_vectorr   r   rd   )rB   r   r   vector_datawordvectorrN   r-   s           r0   rM   rM   Y  s    GE||##B62||##B62,,&&r1f5K
 $))+& ,(H
%.CKKx=r2   c                     [        5       n U R                  S5        U R                  5         [        SS9U R                  l        [        SSS5      " U R                  U R                  S5      R                  5        [        [        5       R                  R                  S5      SSS9U R                  l        [        SSS5      " U R                  U R                  S5      R                  5        [        R                  " [        SS	9   [        5       U R                  l        [        SSS5      " U R                  U R                  S5      R                  5        S S S 5        g ! , (       d  f       g = f)
Nr8   )
   r   )shaperx   r   floret)r   mode
hash_countE875)match)r   add_piperF   r   r   rG   r
   r_   r`   r   xpzerosrH   rI   rJ   )r-   s    r0   test_pretrain_default_vectorsr   j  s   
)CLLNN  h/CIIAq(+CIIs||I7N7T7TU  !!''1QCII Aq(+CIIs||I7N7T7TU 
z	0#I		1h/IIs||I.44	
 
1	0	0s   AE  
E.)2pathlibr   r   re   rH   r   	thinc.apir   r   spacyr   spacy.lang.enr   spacy.languager   r	   spacy.ml.models.multi_taskr
   spacy.tokensr   r   spacy.training.initializer   spacy.training.loopr   spacy.training.pretrainr   spacy.vectorsr   spacy.vocabr   r   r>   r(   pretrain_string_vectorsCHAR_OBJECTIVESVECTOR_OBJECTIVESr1   markparametrizerD   rK   rO   rT   rV   rt   r?   r^   rM   r    r2   r0   <module>r      s       -  ! L > $ . % , !  # J >) X 457		 5	 5	  X o6m4= 5 7=. &78& 9&" &78" 9"" $<>V#WX9 Y9(&"+\$
 "
r2   