
    h                        S SK r S SKrS SKrS SKrS SKJrJr  S SKJr  S SK	J
r
  S SKJrJr  S SKJr  S SKJr  S SKJr  S	S
KJr  \R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r\R,                  R/                  S5      S 5       r S r!S r"S r#S  r$S! r%S" r&g)#    N)DEPHEAD)English)Language)MatcherPhraseMatcher)Doc)Vectors)Vocab   )make_tempdiri  c                     [        [        5       5      n [        R                  " SSS9n[	        U/ SQS9nU R                  S5      nUR                  S5        UR                  R                  SS	5      S	:X  d   eX#R                  l
        [        5        nUR                  U5        U R                  S5      R                  U5      nUR                  R                  SS	5      S	:X  d   e S
S
S
5        g
! , (       d  f       g
= f)zbTest that models with no pretrained vectors can be deserialized
correctly after vectors are added.)   i,  fdtype)IamMatt)datakeystaggerPRPpretrained_dimsr   N)r   r   numpyonesr
   create_pipe	add_labelcfggetvocabvectorsr   to_disk	from_disk)nlpr   r"   r   paths        b/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/serialize/test_serialize_doc.pytest_issue1727r(      s     57
C::hc*D4&9:G__X&F
U::>>+Q/1444"LL	4t*44T:zz~~/3q888 
s   AC11
C?i  c            
      b   [         R                  " SS/SS/SS/SS/SS	/S
S/SS//SS9n [        [        5       SR	                  5       S9nUR
                  R                  R                  S5        UR                  [        [        /U 5      n[        [        UR                  5      5      S:X  d   eg)zWTest sentence boundaries are deserialized correctly, even for
non-projective sentences.   i     i  r   i  i  r   l   LP^& l    i  l    i  uint64r   zJust what I was looking for .wordsROOTN)r   asarrayr	   r   splitr!   stringsadd
from_arrayr   r   lenlistsents)
heads_depsdocs     r'   test_issue1799r:   #   s     HHHH#$!3'!3'	
 J eg<BBD
ECII&!
..$j
1CtCII1$$$    i*  c            
         / SQn [        [        5       U S9nSUS   l        [        UR                  5      R	                  UR                  5       5      nUS   R                  (       d   eUR                  S5      (       a   eUR                  S5      (       a   e[        [        5       U S/[        U 5      -  / SQS/[        U 5      -  S	9n[        UR                  5      R	                  UR                  5       5      nUS   R                  (       d   eUR                  S5      (       d   eUR                  S5      (       d   eg
)zRTest that sentence boundaries & parse/tag flags are not lost
during serialization.)	Thisisafirstsentence.Andanotheroner-   T   r   TAG)	r   r   r   r   r   r   rF   rF   rF   dep)r.   tagsheadsdepsN)	r	   r   is_sent_startr!   
from_bytesto_bytes
sent_starthas_annotationr5   )r.   r9   new_docs      r'   test_issue1834rR   9   s    SE
egU
#CCF#))n''7G1:    %%e,,,,%%e,,,,
Ws5z!)Ws5z!C #))n''7G1:    !!%((((!!%((((r;   i[  c                  2   [        [        5       5      n U R                  SSS0//5        [        U R                  S/S9n[        U " U5      5      S:X  d   e[        R                  " U 5      n[        UR                  S/S9n[        U" U5      5      S:X  d   eg )Npat1orthhellor-   r*   )r   r   r3   r	   r!   r5   copydeepcopy)matcherr9   new_matcherrQ   s       r'   test_issue1883r[   Q   s    egGKK67+,-.
gmmG9
-Cws|!!!--(K+##G95G{7#$)))r;   i
  c                  (   [        5       n U R                  S5      nUR                  S5        U R                  5         U " S5      nUR	                  S5      (       d   eU R                  SS/5      n[        U5      nUR	                  S5      (       d   eg)zQTest the tagger sets has_annotation("TAG") correctly when used via Language.pipe.r   Azhello worldrG   rV   worldN)r   add_piper   
initializerP   pipenext)r%   r   r9   docs	piped_docs        r'   test_issue2564re   \   s     *C\\(#F
SNN
m
Ce$$$$88Wg&'DT
I##E****r;   i  c                  N   [        5       n [        U R                  5      nUR                  SU " S5      U " S5      U " S5      /5        UR                  SU " S5      /5        [        R
                  " U5      n[        R                  " U5      n[        U5      [        U5      :X  d   eg)z5Test that the PhraseMatcher can be pickled correctly.TEST1r?   bcTEST2dN)r   r   r!   r3   pickledumpsloadsr5   )r%   rY   r   rZ   s       r'   test_issue3248_2ro   j   s     )CCII&GKK#c(CHc#h78KK#c($<< D,,t$K{s7|+++r;   i  c                      [        5       n U R                  S5        U R                  5       n[        5       nUR                  S5        UR                  U5        g)zaTest that Language.to_bytes handles serializing a pipeline component
with an uninitialized model.textcatN)r   r_   rN   rM   )r%   
bytes_datanew_nlps      r'   test_issue3289rt   v   sE     )CLLJiGYz"r;   i  c                     [        5       n U R                  S5        U " S5      nUS   R                  (       d   eUR                  S5      (       d   e[	        [        UR                  5      5      S:X  d   eUR                  5       n[        U R                  5      R                  U5      nUS   R                  (       d   eUR                  S5      (       d   e[	        [        UR                  5      5      S:X  d   eg)zxTest that sentence boundaries are set correctly so Doc.has_annotation("SENT_START") can
be restored after serialization.sentencizerzHello worldr   
SENT_STARTr*   N)r   r_   rL   rP   r5   r6   r7   rN   r	   r!   rM   )r%   r9   	doc_bytesrQ   s       r'   test_issue3468ry      s     )CLL
m
Cq6l++++tCII1$$$I#))n''	2G1:####!!,////tGMM"#q(((r;   iw  c                  b   [        5       n U " S5      nUS   R                  S:X  d   eSUS   l        US   R                  S:X  d   e[        5        nUS-  nUR                  U5        U " S5      nUR	                  U5        US   R                  S:X  d   e SSS5        g! , (       d  f       g= f)z=Ensure that a modified pos attribute is serialized correctly.zSdisplaCy uses JavaScript, SVG and CSS to show you how computers understand languager    NOUNmy_docN)r   pos_r   r#   r$   )r%   r9   tmp_dir	file_pathdoc2s        r'   test_issue3959r      s     )C
]C q6;;"CFKq6;;&   	7h&	I2wy!Aw||v%%% 
s   AB  
B.c                    [        U 5      nUR                  5       n[        U 5      nUR                  U5        [        U5      [        U5      :X  d   e[	        X5       H!  u  pEUR
                  UR
                  :X  a  M!   e   g )N)r	   rN   rM   r5   ziptext)en_vocabr9   r   r   token1token2s         r'   test_serialize_empty_docr      sg    
h-C<<>Dx=DOODs8s4y   c.{{fkk))) )r;   c                     [        U SS/S9nSS0Ul        UR                  5       n[        U 5      R                  U5      nUR                  5       U:X  d   eg )NrV   r^   r-   r]   g      ?)r	   catsrN   rM   )r   r9   doc_brQ   s       r'   "test_serialize_doc_roundtrip_bytesr      sT    
hw0
1CSzCHLLNE(m&&u-G&&&r;   c                    [        U SS/S9n[        5        nUS-  nUR                  U5        [        U 5      R                  U5      nUR	                  5       UR	                  5       :X  d   e S S S 5        g ! , (       d  f       g = fNrV   r^   r-   r9   )r	   r   r#   r$   rN   r   r9   rk   r   doc_ds        r'   !test_serialize_doc_roundtrip_diskr      sh    
hw0
1C	1I	IH''	2||~!1111	 
s   AA66
Bc                 $   [        U SS/S9n[        5        nUS-  n[        U5      nUR                  U5        [        U 5      R	                  U5      nUR                  5       UR                  5       :X  d   e S S S 5        g ! , (       d  f       g = fr   )r	   r   strr#   r$   rN   r   s        r'   *test_serialize_doc_roundtrip_disk_str_pathr      sq    
hw0
1C	1I		N	IH''	2||~!1111 
s   A B
Bc                    [        U SS/S9nSUR                  S'   [        U 5      R                  UR                  5       5      nUR                  S   S:X  d   e[        U 5      R                  UR                  5       S/S9nUR                  (       a   e[        U 5      R                  UR                  S/S95      nUR                  (       a   eg )NrV   r^   r-   barfoo	user_data)exclude)r	   r   rM   rN   )r   r9   rQ   s      r'   test_serialize_doc_excluder      s    
hw0
1C CMM%(m&&s||~6GU#u,,,(m&&s||~}&MG    (m&&s||[M|'JKG     r;   c                    [        U / SQS9nUSS nSUl        SUl        SUl        U/UR                  S'   [        U 5      R                  UR                  5       5      n[        UR                  S   5      S	:X  d   eUR                  S   S   R                  S:X  d   eUR                  S   S   R                  S:X  d   eUR                  S   S   R                  S:X  d   eg )
N)rV   r^   !r-   r   r   $test_serialize_doc_span_groups_label!test_serialize_doc_span_groups_id$test_serialize_doc_span_groups_kb_idcontentr*   )r	   label_id_kb_id_spansrM   rN   r5   )r   r9   spanrQ   s       r'   test_serialize_doc_span_groupsr      s    
h5
6Cq8D8DK2DH8DK 6CIIi(m&&s||~6Gw}}Y'(A---==#A&--1WWWW==#A&**.QQQQ==#A&--1WWWWr;   )'rW   rl   r   pytestspacy.attrsr   r   spacy.lang.enr   spacy.languager   spacy.matcherr   r   spacy.tokensr	   spacy.vectorsr
   spacy.vocabr   utilr   markissuer(   r:   rR   r[   re   ro   rt   ry   r   r   r   r   r   r   r    r;   r'   <module>r      s       ! ! # 0  !   49 9  4% %* 4) ). 4* * 4
+ 
+ 4, , 4# # 4) )  4& &$*'22!Xr;   