
    h*              
       :   S SK r S SKrS SKJrJr  S SKJr  \R                  R                  S5      S 5       r	S r
S rS rS	 rS
 rS rS rS r\R                  R%                  SSS00 /SS00 /SS00 /SS0SS0/SSS.SS0/SS0/5      S 5       rS rS rS rg)    N)DocToken)Vocabi  c           	      H   / SQn[         R                  " SS/SS/SS/SS	/S
S/SS//SS9n[        XS9nX#l        / SQnU Vs/ s H  oUR                  PM     snU:X  d   e/ SQn[        U5       H  u  pxXU   l        M     U Vs/ s H  oUR                  PM     snU:X  d   eU Vs/ s H  oUR                  PM     n	n[        U	5      [        U5      :X  d   eUR                  5        n
US   S4US   /nSS/SS/SS/S.nU
R                  US   SS/XS9  S S S 5        / SQnU Vs/ s H  oUR                  PM     snU:X  d   e/ SQnU Vs/ s H  oUR                  PM     snU:X  d   eU Vs/ s H  oUR                  PM     nn[        U5      [        U5      :X  d   eU	S   R                  5       US   R                  5       :X  d   eU	S   R                  5       US   R                  5       :X  d   eU	S   R                  5       US   R                  5       :X  d   eU	S   R                  5       US   R                  5       :X  d   eU	S   R                  5       US   R                  5       :X  d   eg s  snf s  snf s  snf ! , (       d  f       GNr= fs  snf s  snf s  snf ) N)IliveinNewYorkrightnowg      ?g?g       @g @g      @g@g      @gffffff@g      @gffffff@g      @gffffff@f)dtypewords         PROPNNewYorkpobjcompound)POSLEMMADEP)headsattrs)r   r   r	   r   r   r   r   r            )numpyasarrayr   tensortext	enumeratelemma_vectorlen
retokenizesplittolist)en_vocabr   r#   doc	gold_texttoken
gold_lemmailemma	vectors_1retokenizerr   r   	vectors_2s                 _/home/james-whalen/.local/lib/python3.13/site-packages/spacy/tests/doc/test_retokenize_split.pytest_issue3540r7      s   :E]]
sc3Z#sc3Z#sc3ZPF h
$CJ>I$'(C5JJC(I555?Jj)A *&)*cULLc*j888+./3%3I/y>SX%%%		[a&!c!f%W%V_J'

 	#a&5&/L 
 CI$'(C5JJC(I555CJ&)*cULLc*j888+./3%3I/y>SX%%%Q< IaL$7$7$9999Q< IaL$7$7$9999Q< IaL$7$7$9999Q< IaL$7$7$9999Q< IaL$7$7$99997 ) +/ 
	 )*/s0   I4I9(I>)/J(JJ0J
Jc                    / SQn/ SQnS/[        U5      -  n[        XX#S9n[        U5      S:X  d   e[        [        U5      5      S:X  d   eUS   R                  R                  S:X  d   eUS	   R                  R                  S
:X  d   eUR                  5        nUR                  US   SS/US   S	4US	   /S/S-  SS/S/S-  S/S-  S.S9  S S S 5        [        U5      S:X  d   eUS   R                  S:X  d   eUS   R                  R                  S:X  d   eUS   R                  S:X  d   e[        US   R                  5      S:X  d   eUS	   R                  S:X  d   eUS	   R                  S:X  d   eUS	   R                  R                  S:X  d   e[        US	   R                  5      S:X  d   eUS   R                  S:X  d   eUS   R                  R                  S
:X  d   eUS   R                  S
:X  d   eUS   R                  R                  S
:X  d   e[        [        U5      5      S:X  d   eg ! , (       d  f       GNq= f)N
LosAngelesstart.r   r   r   depr   r   depsr      r   r;   r   r<   LosAngelesNNPr   GPEzNumber=Sing)tagr2   ent_typemorphr   r   )	r(   r   strheadr$   r)   r*   idxrH   )r,   r   r   r@   r-   r4   s         r6   test_doc_retokenize_splitrM   0   sM   (EE7SZD
h5
<Cs8q==s3x=Bq6;;w&&&q6;;s"""		[FI!fa[#a&!w{+"GaK'1,		 	 
	
 
 s8q==q6;;%q6;;y(((q6::??s1v||---q6::??q6;;)###q6;;w&&&s1v||---q6;;'!!!q6;;s"""q6;;#q6;;s"""s3x=B3 
	s   5H::
I	c                    / SQn/ SQnS/[        U5      -  n[        XX#S9nUR                  5        nUR                  US   SS/US   S4US   /5        S S S 5        US   R                  S	:X  d   eUS   R                  S	:X  d   e/ SQn/ SQnS/[        U5      -  n[        XX#S9nU H
  nS
Ul        M     UR                  5        nUR                  US   SS/US   S4US   /5        S S S 5        US   R                  S:X  d   eUS   R                  S:X  d   eg ! , (       d  f       N= f! , (       d  f       NJ= f)Nr9   r=   r>   r?   r   rB   rC   r    a)r(   r   r)   r*   r&   )r,   r   r   r@   r-   r4   ts          r6    test_doc_retokenize_split_lemmasrR   U   sS   (EE7SZD
h5
<C		[FI!fa[#a&!	
 
 q6==Bq6==B )EE7SZD
h5
<C 		[FI!fa[#a&!	
 
 q6==E!!!q6==I%%%/ 
	  
	s   #D#D/
D,/
D=c           	         [        U / SQS9nUR                  R                  R                  S5      nUR                  R                  R                  S5      nUR	                  5        nUR                  US   SS/US   S4US   /S	X#/0S
9  S S S 5        US   R                  U:X  d   eUS   R                  U:X  d   eg ! , (       d  f       N9= f)Nr9   r   amodsubjectr   rB   rC   r   r>   rI   )r   vocabstringsaddr)   r*   r>   )r,   r-   dep1dep2r4   s        r6   &test_doc_retokenize_split_dependenciesr[   u   s    
h:
;C99  (D99  +D		[FI!fa[#a&!4,'	 	 	
 
 q6::q6:: 
	s   '%B??
Cc           
         [        U / SQS9n[        R                  " [        5         UR	                  5        nUR                  US   SS/US   /5        S S S 5        S S S 5        [        R                  " [        5         UR	                  5        nUR                  US   SS/US   US   US   /5        S S S 5        S S S 5        g ! , (       d  f       Nw= f! , (       d  f       N= f! , (       d  f       N9= f! , (       d  f       g = f)Nr9   r   r   rB   rC   r   r   pytestraises
ValueErrorr)   r*   r,   r-   r4   s      r6   %test_doc_retokenize_split_heads_errorrb      s    
h:
;C	z	"^^c!fui&83q6(C  
#
 
z	"^^c!fui&83q63q63q6:RS  
#	"	  
#	"  
#	"sF   CCC?C9%C(5C9
C	C
C%(
C6	2C99
Dc            	      ,   / SQn [        [        5       U S9nUR                  R                  R	                  S5      SS4/Ul        US   R                  S:X  d   eUS   R                  S:X  d   eUR                  5        nUR                  US   / S	QUS   S4US   S4US   /5        S S S 5        US   R                  S:X  d   eUS   R                  S:X  d   eUS   R                  S:X  d   eUS
   R                  S:X  d   eg ! , (       d  f       Nc= f)N)abcder   zent-abcdr   r   Br   r   )rP   bcr   )	r   r   rV   rW   rX   entsent_iob_r)   r*   )r   r-   r4   s      r6   *test_doc_retokenize_spans_entity_split_iobrl      s   E
egU
#C""&&z2Aq9:CHq6??c!!!q6??c!!!		[#a&/SVQK#a&!cRSf3UV 
q6??c!!!q6??c!!!q6??c!!!q6??c!!! 
	s   ?)D
Dc           	         / SQn/ SQn/ SQn[        XX#S9n[        UR                  5      u  pV[        U5      n[        U5      nUR	                  5        n	U	R                  US   SS/US   S4US   /S	S
S/0S9  U	R                  US   SS/US   S4US   /S	S
S/0S9  S S S 5        [        UR                  5      u  pV[        U5      US-   :X  d   e[        U5      US-   :X  d   eg ! , (       d  f       NN= f)N)
StewartLeeisrP   standupcomedianr<   Helivesr	   EnglandandlovesJoePasqualer<   )r   r   r   r   r   r   r      ry   ry   	   ry   ry         )nsubjROOTdetrT   prtattrpunctr}   r~   prepr   ccconjr   r   r?   r   StewartLeer   r>   r   r}   rI      JoePasqualer|   dobj)r   listsentsr(   r)   r*   )
r,   r   r   r@   r-   sent1sent2init_len	init_len2r4   s
             r6   5test_doc_retokenize_spans_sentence_update_after_splitr      s%   KE;EGD h5
<C		?LE5zHE
I		[F!fa[#a&!:w/0	 	 	
 	GJ"gq\3r7#:v./	 	 	
 
 		?LEu:A%%%u:Q&&& 
	s   AC''
C5c           	          [        U / SQS9n[        R                  " [        5         UR	                  5        nUR                  US   SS/US   S4US   S4/5        SSS5        SSS5        g! , (       d  f       N= f! , (       d  f       g= f)aX  Test that the regular retokenizer.split raises an error if the orths
don't match the original token text. There might still be a method that
allows this, but for the default use cases, merging and splitting should
always conform with spaCy's non-destructive tokenization policy. Otherwise,
it can lead to very confusing and unexpected results.
r9   r   r   LANr]   ra   s      r6   (test_doc_retokenize_split_orths_mismatchr      su     h:
;C	z	"^^c!fsCjCFA;Q2LM  
#	" 
#	"s"   A?%A.A?.
A<	8A??
Bc                 t   [         R                  " SSSS9  [         R                  " SSSS9  [        U SS/S	9nUR                  5        nUS
   S4US   /nSSS.SS0/nSS/US.nUR	                  US
   SS/X5S9  S S S 5        US
   R
                  S:X  d   eUS
   R                  R                  SL d   eUS
   R                  R                  S:X  d   eUS   R
                  S:X  d   eUS   R                  R                  SL d   eUS   R                  R                  S:X  d   eg ! , (       d  f       N= f)NrP   FTdefaultforcerh   nothingr:   r;   r   r   r   1)rP   rh   2losangeles)r2   _rB   rC   rI   )	r   set_extensionr   r)   r*   r&   r   rP   rh   )r,   r-   r4   r   
underscorer   s         r6   )test_doc_retokenize_split_extension_attrsr      s=   	U$7	Yd;
h|W5
6C		[a&!c!f% s+c3Z8
 ),:>#a&5)"4eI	 

 q6==E!!!q688::q688::q6==I%%%q688::q688:: 
	s   	3D))
D7underscore_attrsrP   xrh   ri   )rP   r   c           	         [         R                  " SSSS9  [         R                  " SS SS9  [         R                  " SS	 SS
9  [        U SS/S9nSU0n[        R                  " [
        5         UR                  5        nUS   S4US   /nUR                  US   SS/XSS9  S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f)Nr   FTr   rP   c                     U $ N r   s    r6   <lambda>Ctest_doc_retokenize_split_extension_attrs_invalid.<locals>.<lambda>       a    )getterr   rh   c                     U $ r   r   r   s    r6   r   r      r   r   )methodr   r:   r;   r   r   r   r   rB   rC   rI   )r   r   r   r^   r_   r`   r)   r*   )r,   r   r-   r   r4   r   s         r6   1test_doc_retokenize_split_extension_attrs_invalidr      s     
U$7	Kt<	Kt<
h|W5
6C"#E	z	"^^!fa[#a&)Ec!fui&8%M  
#	" 
#	"s$   /C #B4#C4
C	>C
Cc                    [        U S/S9S   R                  (       a   e[        U S/S9S   R                  (       a   e[        U SS/S9nUS   R                  (       a   eUR                  5        nSSS	/0nUS   S
4US
   /nUR                  US   SS/XCS9  SSS5        US   R                  (       d   eUS
   R                  (       a   eg! , (       d  f       N;= f)a  Test that retokenization also sets attributes on the lexeme if they're
lexical attributes. For example, if a user sets IS_STOP, it should mean that
"all tokens with that lexeme" are marked as a stop word, so the ambiguity
here is acceptable. Also see #2390.
rB   r   r   rC   r:   r;   is_stopTFr   rI   N)r   r   r)   r*   )r,   r-   r4   r   r   s        r6   $test_doc_retokenizer_split_lex_attrsr      s     8E7+A.66668I;/2::::
h|W5
6C1v~~		[T5M*a&!c!f%#a&5)"4eI 
 q6>>>1v~~~ 
	s   1)C
Cc                 *   Sn[        XR                  5       SS S9nUR                  5        nUS   nUS4/[        U5      -  nUR                  X$R                     [        UR                  5      US9  SSS5        [        XR                  5       S9nUR                  5        nUS   nUS4/[        U5      -  nUR                  X$R                     [        UR                  5      US9  SSS5        g! , (       d  f       N= f! , (       d  f       g= f)zB#4604: realloc correctly when new tokens outnumber original tokenszOHyperglycemic adverse events following antipsychotic drug administration in theNr   r   r   )r   r*   r)   r(   r1   r   r$   )r,   r$   r-   r4   r/   r   s         r6   test_doc_retokenizer_reallocr     s    \D
hjjl3B/
0C		[As5z)#gg,UZZ(8F 
 hjjl
+C		[As5z)#gg,UZZ(8F 
	 
	
 
	s   AC3#AD3
D
Dc           
         Sn[        XR                  5       S9nSUS   l        US   nUR                  5        nUR                  U/ SQ[	        S5       Vs/ s H  oSU4PM     snS9  SSS5        US	   R
                  S
:X  d   eUS	   R                  S:X  d   eUS   R
                  S:X  d   eUS   R                  S:X  d   egs  snf ! , (       d  f       Nh= f)z#6060: reset norm in splitz6The quick brownfoxjumpsoverthe lazy dog w/ white spotsr   withr   r   )brownfoxjumpsoverther   Nrz   zw/r   )r   r*   norm_r)   ranger$   )r,   r$   r-   r/   r4   rL   s         r6   test_doc_retokenizer_split_normr     s    CD
hjjl
+C CFL FE		[4+0848C3<84 	 	
 
 q6;;$q6<<6!!!q6;;&   q6<<6!!! 5	 
	s   C
C#C
C


C)r!   r^   spacy.tokensr   r   spacy.vocabr   markissuer7   rM   rR   r[   rb   rl   r   r   r   parametrizer   r   r   r   r   r   r6   <module>r      s      #  4$: $:N"J&@
T"'<
N" 
sR
sR
sR
sc3Z 	Sz*	c

	N
	N$G"r   