
    hs#                        S SK r S SKJrJrJrJrJrJrJr  SSK	J
r
Jr  SSKJrJr  S\\   S\\   4S jrS\\   S\\   4S	 jrS\\   S\\   4S
 jrS\\   S\\   4S jrSS\S\4S jjrS\S\\   4S jr SS\S\\\\\\\4   4      S\S\\   4S jjrS\S\\   S\\   4S jrS\S\\   S\\\\\\\4   4      4S jrS\\   S\\\\\4      4S jrS\S\\\4   4S jrS\S\4S jr\r\r\rg)    N)DictIterableIteratorListTupleUnioncast   )ErrorsWarnings)DocSpantagsreturnc                     / n[        U 5      n U (       a=  UR                  [        U 5      5        UR                  [        U 5      5        U (       a  M=  U$ )N)listextend_consume_os_consume_ent)r   outs     R/home/james-whalen/.local/lib/python3.13/site-packages/spacy/training/iob_utils.pyiob_to_biluor      sC    C:D


;t$%

<%& $ J    c                     / nU  HN  nUc  UR                  U5        M  UR                  SSS5      R                  SSS5      nUR                  U5        MP     U$ )NU-B-   L-I-)appendreplace)r   r   tags      r   biluo_to_iobr#      sT    
C;JJsO++dD!,44T4CCJJsO  Jr   c              #   |   #    U (       a1  U S   S:X  a'  U R                  S5      v   U (       a  U S   S:X  a  M%  g g g g 7f)Nr   O)pop)r   s    r   r   r      s5     
47c>hhqk 47c>$>$s   4<<c                    U (       d  / $ U R                  S5      nSUSS  -   nSUSS  -   nSnU (       a3  U S   X#1;   a)  US-  nU R                  S5        U (       a  U S   X#1;   a  M)  USS  nUS:X  a;  [        U5      S:X  a&  [        [        R                  R                  US95      eSU-   /$ SU-   nS	U-   n[        SUS-
  5       Vs/ s H  nS
U 3PM
     n	nU/U	-   U/-   $ s  snf )Nr   Ir   Lr
   r"   r   r   r   r   )r&   len
ValueErrorr   E177formatrange)
r   r"   	target_intarget_lastlengthlabelstartend_middles
             r   r   r   !   s   	
((1+Cc!"gIAB-KF
47y66! 47y66 GE{u:?V[[//C/899u~uUl(-a!(<=(<1Bug,(<=w3%'' >s   C#docmissingc           	          [        U U R                   Vs/ s H%  o"R                  UR                  UR                  4PM'     snUS9$ s  snf )Nr9   )offsets_to_biluo_tagsents
start_charend_charlabel_)r8   r9   ents      r   doc_to_biluo_tagsrB   7   s?     ?BxxHx..#,,

	3xH Hs   ,A
c                 n    [        U SS9n[        U 5       H  u  p#UR                  S:X  d  M  SX'   M     U$ )N-r;   r
   r%   )rB   	enumerateent_iob)r8   r=   itokens       r   _doc_to_biluo_tags_with_partialrI   ?   s8    S#.DcN==ADG # Kr   entitiesc           
      <   0 nU  Vs0 s H  oDR                   UR                  _M     nnU  Vs0 s H%  oDR                   [        U5      -   UR                  _M'     nnU  Vs/ s H  nSPM     nnU H  u  pnU(       d   U H  nX:  d  M
  X:  d  M  SXU   '   M     M-  [        X5       HW  nXR	                  5       ;   a;  [        [        R                  R                  X=   S   X=   S   X=   S   4XU4S95      eXU4X='   MY     UR                  U	5      nUR                  U
5      nUc  M  Uc  M  X:X  a	  SU 3X'   M  S	U 3X'   [        US-   U5       H  nS
U 3UU'   M     SU 3X'   M     [        5       nU H)  u  pn[        X5       H  nUR                  U5        M     M+     U  HM  n[        UR                   UR                   [        U5      -   5       H  nUU;   d  M    M=     X(UR                  '   MO     SU;   a  US:w  a  [        U5      n[        R                  " [        R                   R                  [        U R"                  5      S:  a  U R"                  SS S-   OU R"                  [        U5      S:  a  USS S-   OUS95        U$ s  snf s  snf s  snf )u\  Encode labelled spans into per-token tags, using the
Begin/In/Last/Unit/Out scheme (BILUO).

doc (Doc): The document that the entity offsets refer to. The output tags
    will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start`
    and `end` should be character-offset integers denoting the slice into
    the original string.
missing (str): The label used for missing values, e.g. if tokenization
    doesn’t align with the entity offsets. Defaults to "O".
RETURNS (list): A list of unicode strings, describing the tags. Each tag
    string will be of the form either "", "O" or "{action}-{label}", where
    action is one of "B", "I", "L", "U". The missing label is used where the
    entity offsets don't align with the tokenization in the `Doc` object.
    The training algorithm will view these as missing values. "O" denotes a
    non-entity token. "B" denotes the beginning of a multi-token entity,
    "I" the inside of an entity of three or more tokens, and "L" the end
    of an entity of two or more tokens. "U" denotes a single-token entity.

EXAMPLE:
    >>> text = 'I like London.'
    >>> entities = [(len('I like '), len('I like London'), 'LOC')]
    >>> doc = nlp.tokenizer(text)
    >>> tags = offsets_to_biluo_tags(doc, entities)
    >>> assert tags == ["O", "O", 'U-LOC', "O"]
rD   r%   r   r   r
   )span1span2Nr   r   r   r   2   z...)textrJ   )idxrG   r+   r/   keysr,   r   E103r.   getsetaddstrwarningswarnr   W030rO   )r8   rJ   r9   tokens_in_entsrH   startsendsr6   biluor>   r?   r3   stoken_indexstart_token	end_tokenrG   entity_charsent_strs                      r   r<   r<   G   s   < CEN.12cUii cF29<=IIE
"EGG+D=#QS#E'/#
e?q|'*E)$   %Z:"5"5"77$** . ;A > . ;A > . ;A >#
 $."? + 	 	 0:U.K+  ; !**Z0K*I&9+@++-eWE&+-eWE&";?I>%'w<a ?)+E7|E$; (0> 5L'/#
ez,AQ - (0 uyy%))c%j"89AL  : %%''N  e|3h-MM  .1#((mb.@SXXcr]U*chh14W1B"- ! 	
 Lk 3=s    J,JJc                 v    [        U5      n/ nU H%  u  pEn[        XUS-   US9nUR                  U5        M'     U$ )a  Encode per-token tags following the BILUO scheme into Span object, e.g.
to overwrite the doc.ents.

doc (Doc): The document that the BILUO tags refer to.
tags (iterable): A sequence of BILUO tags with each tag describing one
    token. Each tag string will be of the form of either "", "O" or
    "{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of Span objects. Each token with a missing IOB
    tag is returned as a Span with an empty label.
r   r3   )tags_to_entitiesr   r    )r8   r   token_offsetsspansr3   	start_idxend_idxspans           r   biluo_tags_to_spansrl      sG     %T*ME%2!'CGaKu=T &3 Lr   c                     [        X5      nU Vs/ s H%  o3R                  UR                  UR                  4PM'     sn$ s  snf )a  Encode per-token tags following the BILUO scheme into entity offsets.

doc (Doc): The document that the BILUO tags refer to.
tags (iterable): A sequence of BILUO tags with each tag describing one
    token. Each tags string will be of the form of either "", "O" or
    "{action}-{label}", where action is one of "B", "I", "L", "U".
RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
    `end` will be character-offset integers denoting the slice into the
    original string.
)rl   r>   r?   r@   )r8   r   rh   rk   s       r   biluo_tags_to_offsetsrn      s7      *EFKLed__dmmT[[9eLLLs   ,?c           
          / nSn[        U 5       GHy  u  p4Ub  UR                  S5      (       a  Ub  SnM&  UR                  SX345        M;  UR                  S5      (       a  MS  UR                  S5      (       a;  Uc6  [        [        R
                  R                  S[        U 5      SUS-    S95      eM  UR                  S5      (       a  UR                  US	S X345        M  UR                  S
5      (       a  UnM  UR                  S5      (       aT  Uc6  [        [        R
                  R                  S[        U 5      SUS-    S95      eUR                  US	S X#45        SnGMV  [        [        R                  R                  US95      e   U$ )ztNote that the end index returned by this function is inclusive.
To use it for Span creation, increment the end by 1.NrD    r%   r(   r   )r4   r   Ur
   Br)   r*   )	rE   
startswithr    r,   r   E067r.   r   E068)r   rJ   r4   rG   r"   s        r   rf   rf      sn    HED/;#..-- Q
+^^C  ^^C  } KK&&StDz'AE7J&K   ^^C  OOSWaO,^^C  E^^C  } KK&&StDz'AE7J&K  OOSWe/0EV[[//C/8995 "6 Or   r3   c                 \    [        [        [        [        4   U R                  SS5      5      $ NrD   r   )r	   r   rV   splitre   s    r   split_bilu_labelry      s"    c3hS!!455r   c                 ,    U R                  SS5      S   $ rw   )rx   re   s    r   remove_bilu_prefixr{      s    ;;sAq!!r   )r%   ) rW   typingr   r   r   r   r   r   r	   errorsr   r   tokensr   r   rV   r   r#   r   r   rB   rI   intr<   rl   rn   rf   ry   r{   offsets_from_biluo_tagsspans_from_biluo_tagsbiluo_tags_from_offsets r   r   <module>r      s    E E E % x} c x} c d3i HSM 
(tCy (T#Y (,3   c  TWT	T sCsCx'@!ABTMPT	#YTnS  $t* &M	MSMM	%S%S/)
*+M" 8C=  T%S#2F-G  F6C 6E#s(O 6"c "c "
 0 + / r   