
    cCix              
          S r SSKrSSKrSSKrSSKJrJr  SSKJr  SSK	J
r
  SSKJrJrJrJrJr  SSKJr  SS	KJrJr  \\\\\4      \\\\\\4      \\\\\4         \\\\\\4         4   r " S
 S\SS9r " S S\SS9r " S S\SS9r " S S\5      rS\\\\\4   S\S\\\4   4S jrS\S\S\4S jr S r!S r"S r#SS jr$S/r%g) zProcessor class for KOSMOS-2.    N)OptionalUnion   )BatchFeature)
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInputc                   L    \ rS rSr% \\\      \S'   \\   \S'   \\   \S'   Sr	g)Kosmos2ImagesKwargs%   bboxesnum_image_tokensfirst_image_token_id N)
__name__
__module____qualname____firstlineno__r   listfloat__annotations__int__static_attributes__r       h/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   r   %   s%    T%[!!sm#"3-'r    r   F)totalc                   &    \ rS rSr% \\   \S'   Srg)Kosmos2TextKwargs+   add_eos_tokenr   N)r   r   r   r   r   boolr   r   r   r    r!   r$   r$   +   s    D>!r    r$   c            
       L    \ rS rSr% \\S'   \\S'   SSSSSSSSSS.	SS	0S
.rSrg)Kosmos2ProcessorKwargs/   text_kwargsimages_kwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsverboser&   r   @   )r+   r,   r   N)	r   r   r   r   r$   r   r   	_defaultsr   r   r    r!   r)   r)   /   sC    ""&& #').*/&+%*"

 
Ir    r)   c                   |  ^  \ rS rSrSrSS/rSrSrSU 4S jjr    SS\	\
   S	\\\\   4   S
\\   S\4S jjrS rS r   SS\\\\   4   S\	\
   S\S\	\   S\\\\   4   4
S jjrSS jrSS jr\S 5       rS	\S\\\\      \\\      4   S\4S jrS\\\\4   \\\\\4   4   S\\\4   4S jrSr U =r!$ )Kosmos2ProcessorD   a  
Constructs an KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
processor.

[`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
[`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
for more information.

Args:
    image_processor (`CLIPImageProcessor`):
        An instance of [`CLIPImageProcessor`]. The image processor is a required input.
    tokenizer (`XLMRobertaTokenizerFast`):
        An instance of ['XLMRobertaTokenizerFast`]. The tokenizer is a required input.
    num_patch_index_tokens (`int`, *optional*, defaults to 1024):
        The number of tokens that represent patch indices.
image_processor	tokenizer)CLIPImageProcessorCLIPImageProcessorFastAutoTokenizerc                   > SUl         SU l        SU l        SU l        SU l        SU l        SU l        SU l        S	U l        S
U l	        SU l
        SU l        U R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  /U l        X0l        [        U R                  5       Vs/ s H!  nS[        U5      R!                  S5       S3PM#     nn/ nU R                  U-    H  nUR#                  [%        USSSS95        M      UR'                  U5        [(        T	U ]U  X5        g s  snf )NFz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding><patch_index_   >T)lstriprstrip
normalized)r3   	eod_token	boi_token	eoi_token	eoc_token	eol_token	bop_token	eop_token	boo_token	eoo_token	dom_token	grd_token
tag_tokensnum_patch_index_tokensrangestrzfillappendr   
add_tokenssuper__init__)
selfr:   r;   rS   kwargsxpatch_index_tokenstokens_to_addtoken	__class__s
            r!   rZ   Kosmos2Processor.__init__Z   sE   */	'!"##"#$#$9& NNNNNNNNNNNNNNNNNNNNNN
 '=#JOPTPkPkJlmJlQc!fll1o->a@Jlm__'99E  E$uY^!_` :]+4 ns   3(E1imagestextr\   returnc           
      
   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUS   R                  SS5      nUS   R                  SS5      nUS   R                  SS5      n	US	   R                  S
S5      n
US	   S   nUS	   S   nUS	   R                  SS5      n[        5       nUb'  U R                  " U40 US   D6nUR                  U5        Ub  U R                  X!XxS9nU(       av  U
(       do  [        U[        5      (       a  U R                  R                   U 3nO?[        U[        5      (       a*  U Vs/ s H  nU R                  R                   U 3PM     nnUS	   S   =(       a    U
US	   S'   Uc  UOSUS	   S'   Uc  UOSUS	   S'   U R                  " SSU0US	   D6nUR                  U5        XS	   S'   XS	   S'   XS	   S'   UGbP  UGbL  U	c  U R                  R                  S-   n	Un[!        U5      S-   n[        [#        XU-   5      5      nS/S/U-  -   S/-   n/ n/ nUS   n[        U[        5      (       a  U/nUS   /US'   U Hw  nUSU U-   UUU-   S -   nUR%                  U5        [&        R&                  " U5      nU(       a  S/U-   nUS/[)        U5      [)        U5      -
  -  -  nUR%                  U5        My     [        U[        5      (       Ga  [+        [-        WR.                  5       VVs/ s H  u  nnU[)        U5      4PM     snnS S9nUS   u  nnUS   u  nnUS	   S   =(       a    U
US	   S'   SUS	   S'   U R                  " SSUU   /0US	   D6n[)        UR.                  S   5      n UU :w  GaI  U R                  R0                  S:X  a  U Vs/ s H,  nUU R                  R2                  /U [)        U5      -
  -  -   PM.     nnU Vs/ s H  nUS/U [)        U5      -
  -  -   PM     nnUS    Vs/ s H  nUS/U [)        U5      -
  -  -   PM     snUS'   OU R                  R0                  S:X  a  U Vs/ s H,  nU R                  R2                  /U [)        U5      -
  -  U-   PM.     nnU Vs/ s H  nS/U [)        U5      -
  -  U-   PM     nnUS    Vs/ s H  nS/U [)        U5      -
  -  U-   PM     snUS'   [        U[        5      (       a  Uc  US   nUS   S   US'   US   nUR                  [5        UUS   US.US95        U$ s  snf s  snnf s  snf s  snf s  snf s  snf s  snf s  snf )a  
This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
[`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.

Please refer to the docstring of the above two methods for more information.

The rest of this documentation shows the arguments specific to `Kosmos2Processor`.

Args:
    bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
        The bounding bboxes associated to `texts`.
    num_image_tokens (`int`, *optional* defaults to 64):
        The number of (consecutive) places that are used to mark the placeholders to store image information.
        This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
    first_image_token_id (`int`, *optional*):
        The token id that will be used for the first place of the subsequence that is reserved to store image
        information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
    add_eos_token (`bool`, defaults to `False`):
        Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
Nz*You have to specify either images or text.tokenizer_init_kwargsr,   r   r   r5   r   r+   r&   Fr-   r.   return_tensors)r   rd      r   	input_idsattention_maskc                     U S   $ Nr   )r]   s    r!   <lambda>+Kosmos2Processor.__call__.<locals>.<lambda>   s    defhdir    )keyrn   rightleft)rj   rk   image_embeds_position_mask)datatensor_typer   )
ValueError_merge_kwargsr)   r;   init_kwargspop
setdefaultr   r:   updatepreprocess_examples
isinstancerU   	bos_tokenr   unk_token_idr   rT   rW   copylensorted	enumeraterj   padding_sidepad_token_idr   )!r[   rc   rd   audiovideosr\   output_kwargsr   r   r   r&   r-   r.   rh   encodingimage_encodingstext_encodingwith_bosstart_indeximage_token_idsbase_image_embeds_position_maskrj   rt   all_input_idstext_idsmaskidxr]   sorted_length_min_len_not_paddedmax_len_paddeds!                                    r!   __call__Kosmos2Processor.__call__   sh   8 >dlIJJ**"
"&.."<"<
 
 /33HdC(9==>PRTU,_=AABXZ^_%m488%P*=9:NO.y9&}5@@AQSWX>!11&[M/<Z[NOON+++D&+dD!-dC(("nn667v>Dd++FJKdt~~778<dDKm,-ABT} -()=> BHgUZM-(3OU~^cgM-()9: NNUUm8TUMOOM*=Om$%9:29m$Y/9Gm$%56 2#+'+~~'B'BQ'F$ *H h-!+K #5)=Vf?f#ghO/0cQC:J4J.JaS.P+ I)+&$[1M$$$!..67G.H-I)*)#L[1OCh{]mOmOoFpp  *yy!@A3:Ds8}s4y899*11$7 * $%% &1:=;R;R1ST1SvsAc3q6]1STZi! )6a(8%%&r*Q!-01EFX= m,-AB BFm,-=> $ `T#YK `=Q^C_ `!$]%<%<Q%?!@%7~~22g=lu$vlughQ$..*E*E)F.[^_`[aJa)b%blu	$vIc6IcAA~A'> ??Ic 3 6 JRRbIc6IcAA~A'> ??Ic6!12 44>lu$vlughdnn&A&A%BnWZ[\W]F]%^ab%blu	$vIc6IcAQC>CF#:;a?Ic 3 6 JRRbIc6IcAQC>CF#:;a?Ic6!12
 $$$)?%aL	-56F-G-J)*-G-J* OO%.*23C*D6P
 !/	 K Lj U %w66 %w66s0   $U U
3U?U'U+3U$UU$c                    Uc  g[        U[        5      (       d  [        S5      eU H  nUc  M  [        U[        5      (       d  U/nU Hq  n[        U[        5      (       aP  [	        U5      S:X  a  [        S U 5       5      (       a  M@  [	        U5      S:X  a  [        S U 5       5      (       a  Mh  [        S5      e   M     g)a  
Check `bboxes` for a single text example. It could be
    - `None`: no bounding box associated to a text.
    - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
      in a text. This could be:
          - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
          - A tuple of 2 integers: A single bounding box specified by patch indices.
          - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
          - A list containing the above 2 tuple types: Multiple bounding boxes for a
           `<phrase> ... </phrase>` pair.
Nz@`bboxes` (for a single text example) should be `None` or a list.   c              3   B   #    U  H  n[        U[        5      v   M     g 7fN)r~   r   .0r]   s     r!   	<genexpr>AKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>B  s     .S7az!S/A/A7   rB   c              3   B   #    U  H  n[        U[        5      v   M     g 7fr   )r~   r   r   s     r!   r   r   C  s     1XPW1*Q2F2FPWr   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)r~   r   rw   tupler   all)r[   r   bboxelements       r!   _check_bboxes_for_single_text.Kosmos2Processor._check_bboxes_for_single_text)  s     >FD))_`` D|d++v!'511\Q&3.S7.S+S+SG)c1XPW1X.X.X$@    r    c                 \    UR                  5       nUb  U SU 3nU R                  X5      nU$ )N )strip_insert_patch_index_tokens)r[   rd   imager   img_info_tokenss        r!   _preprocess_single_example+Kosmos2Processor._preprocess_single_exampleL  s;    zz|%&av.D ..t<r    textsr   r   c                 \   U R                   /U-  nSR                  U R                   /U-   U R                  /-   5      nSn[        U[        5      (       a  SnU/nUc  S/[        U5      -  nO[        U[        5      (       d  U/n[        U5      [        U5      :w  a$  [        S[        U5       S[        U5       S35      eU(       d  U R                  U5        U/nOMUb;  [        U[        5      (       d  [        S5      eU H  nU R                  U5        M     OS/[        U5      -  n[        U5      [        U5      :w  a$  [        S	[        U5       S[        U5       S35      e[        XU5       V	V
Vs/ s H  u  pnU R                  XX5      PM     nn
n	nU(       d  US
   nU$ s  snn
n	f )a  Add image and bounding box information to `texts` as image and patch index tokens.

Args:
    texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
    images (`ImageInput`, *optional*): The images associated to `texts`.
    bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
        The bounding bboxes associated to `texts`.
    num_image_tokens (`int`, *optional*, defaults to 64):
        The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
        attribute in `Kosmos2Config`.

Returns:
    `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got r   )rH   joinrI   r~   rU   r   r   rw   r   zipr   )r[   r   rc   r   r   
img_tokensr   batchedr]   rd   r   r   results                r!   r}   $Kosmos2Processor.preprocess_examplesV  s   , nn%(88
((DNN#3j#@DNNCS#ST eS!!GGE>Vc%j(FFD))XFu:V$YZ]^cZdYeeklopvlwkx  yB  C  ..v6XFfd++ !vww2215  Vc%j(Fv;#e*$YZ]^cZdYeeklopvlwkx  yB  C  &)%?
%?!T ++DO%? 	 

 AYF
s   6F'c                 f    UR                  U R                  5      S   nU(       a  [        U5      $ U$ rm   )splitrI   +clean_text_and_extract_entities_with_bboxes)r[   rd   cleanup_and_extractcaptions       r!   post_process_generation(Kosmos2Processor.post_process_generation  s,    **T^^,R0>wGGr    c                 v    U R                   " U4SU0UD6nU Vs/ s H  oPR                  USS9PM     sn$ s  snf )ah  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
skip_special_tokensF)r   )batch_decoder   )r[   generated_outputsr   r\   generated_textsrd   s         r!   post_process_image_text_to_text0Kosmos2Processor.post_process_image_text_to_text  sJ      ++,=qSfqjpqZijZiRV,,Tu,MZijjjs   6c                 j    U R                   R                  nU R                  R                  nX-   S/-   $ )Nrt   )r;   model_input_namesr:   )r[   tokenizer_input_namesimage_processor_input_namess      r!   r   "Kosmos2Processor.model_input_names  s6     $ @ @&*&:&:&L&L#$BFbEcccr    c                    Ub  [        U5      S:X  a  U$ [        [        R                  " SUS95      n[        U5      [        U5      :w  a$  [	        S[        U5       S[        U5       S35      eSn/ n[        X25       H  u  pgUR                  5       u  pUR                  XU	 5        U	nUc  M1  [        U[        5      (       a  U/n/ n
[        S U 5       5      (       d  [	        S5      eU H,  nU R                  U5      u  pU
R                  U S	U 35        M.     [        U
5      S:X  a  M  S
R                  U
5      nUR                  SU S35        M     U[        U5      :  a  UR                  XS  5        SR                  U5      nU$ )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c              3   (   #    U  H  oS Lv   M
     g 7fr   r   )r   boxs     r!   r   >Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>  s     7$3$$s   zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )r   r   refinditerrw   r   spanrW   r~   r   r   #_convert_bbox_to_patch_index_tokensr   )r[   rd   r   matched_phrasescurr_posbuffermatchedr   r   endpatch_index_stringsr   patch_index_1patch_index_2position_strs                  r!   r   +Kosmos2Processor._insert_patch_index_tokens  s   >S[A-Kr{{+B4PQ3v;. H  IL  M\  I]  H^  ^d  eh  io  ep  dq  qz  {   9MG\\^FAMM$,-H|$&&v"$7$777 j  /3/W/WX[/\,#**m_Am_+MN  &'1,=BBCVWLMMIl^:>?/ :2 c$iMM$y/*wwvr    r   c                    [        U5      S:X  a  Uu  p#O6[        [        R                  " U R                  5      5      n[        X5      u  p#S[        U5      R                  S5       S3nS[        U5      R                  S5       S3nXV4$ )Nr   rA   rB   rC   )r   r   mathsqrtrS   coordinate_to_patch_indexrU   rV   )r[   r   idx_1idx_2num_patches_per_sidetoken_1token_2s          r!   r   4Kosmos2Processor._convert_bbox_to_patch_index_tokens  s     t9>LE5 $'tyy1L1L'M#N 4TPLE!#e*"2"21"5!6a8!#e*"2"21"5!6a8r    )rH   rN   rL   rP   rJ   rG   rI   rK   rO   rM   rQ   rS   rR   )i   )NNNN)NNr5   )T)"r   r   r   r   __doc__
attributesimage_processor_classtokenizer_classrZ   r   r   r   r   r   r   r)   r   r   r   r   	BboxInputr   rU   r}   r   r   propertyr   r   r   r   r   r   __classcell__)ra   s   @r!   r8   r8   D   s   " $[1JL%O+5^ (,26`$` ItI./` /0` 
`D!F (, *,@YY/0@ $@ 	@
 #3-@ 
sDI~	@Dk& d d
+s +E$uSzBRTXY^_dYeTfBf<g +lo +Z %S/5ue1K+LLM 	sCx   r    r8   r   r   re   c                 $   U u  p#pEXB:  a  XS:  d  [        S5      e[        R                  " X!-  5      n[        R                  " X1-  5      n[        R                  " XA-  S-
  5      n[        R                  " XQ-  S-
  5      n	Xq-  U-   n
X-  U-   nX4$ )a  Convert a bounding box to a pair of patch indices.

Args:
    bbox (`tuple[float, float, float, float]`):
        The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
        lower-right corners of the box. It should have x2 > x1 and y2 > y1.
    num_patches_per_side (`int`): the number of patches along each side.

Returns:
    `tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.ri   )rw   r   floorceil)r   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxs               r!   r   r     s     RRGopp::b/0D::b/0D99R.23D99R.23D(4/F(4/F>r    r  r  c                     SU-  nX-  nX-  nX-  nX-  nX:X  a  XC-  nXS-  n	Xc-  U-   n
Xs-  U-   nOIXF:X  d  XW:X  a  XC-  nXS-  n	Xc-  U-   n
Xs-  U-   nO(XC-  US-  -   nXS-  US-  -   n	Xc-  US-  -   n
Xs-  US-  -   nXX4$ )ak  
Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

Args:
    ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
    lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
    num_patches_per_side (`int`): the number of patches along each side.

Returns:
    `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
g      ?r   r   )r  r  r   	cell_sizer   r   r   r   r   r   r   r   s               r!   patch_index_to_coordinater    s     **I (D)D(D)D 	)	)		)	)	A-	A-	A-	A-2>r    c           
      h   Sn[         R                  " X5      n/ nU GH  nUR                  S5      nUR                  5       u  pgnU(       d*  SnUR                  S5      S   UR                  S5      S   4nUR	                  S5      n	/ n
U	 H  n[         R
                  " SU5      n[         R
                  " SUSS 5      nU(       d  M=  U(       d  MF  U(       aE  U
R                  [        UR                  S5      5      [        UR                  S5      5      45        M  U
R                  [        UR                  S5      5      [        UR                  S5      5      45        M     U(       a  UR                  XuU
45        GMd  U
 H&  nSUS    S	US    S
3nUR                  XU/45        M(     GM     U$ )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
processing happens, including converting to normalized coordinates and whitespace character cleaning up.

Examples:

```python
>>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
>>> entities = extract_entities_with_patch_indices(text)
>>> entities
[(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>r   Nr   r@   z<patch_index_(\d+)>ri   rA   z><patch_index_rC   )	r   r   r   groupsr   searchrW   r   group)rd   patternmatchesentities_with_patch_indicesmatchr   
phrase_tagphrasematch_contentpatch_index_pairsentity_bboxespairr]   yr   entitys                   r!   #extract_entities_with_patch_indicesr  B  s~    kG kk'(G #%zz!},1LLN)
MFJJqM!$ejjmA&67D *//0PQ%D		0$7A		0$qr(;AqQQ!((#aggaj/3qwwqz?)KL!((#aggaj/3qwwqz?)KL & '..m/LM%(a	QyJ+22F4&3IJ &7 @ '&r    c           	          U u  nu  p4[        [        R                  " SSUSU 5      5      n[        [        R                  " SSUSU 5      5      nX%U44nU$ )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)r   r   sub)r  rd   entity_namestartr   adjusted_startadjusted_endadjusted_entitys           r!   adjust_entity_positionsr  |  s[     &K%T&5\:;Nrvvgr4:67L"\$BCOr    c                    U R                  5       n[        U 5      [        U R                  5       5      -
  n/ nU H  u  nu  pgn[        U5      [        UR                  5       5      -
  n	[        U5      [        UR                  5       5      -
  n
Xc-
  U	-   nXs-
  U
-
  nUR                  5       nUR	                  XVU4U45        M     X$4$ )z9Remove the spaces around the text and the entities in it.)r   r   rD   rE   rW   )rd   entitiesnew_textleading_spacesnew_entitiesr  r  r   r   entity_name_leading_spacesentity_name_trailing_spacess              r!   _cleanup_spacesr'    s    zz|HYT[[]!33NL-5)\e6%(%5K<N<N<P8Q%Q"&)+&6[=O=O=Q9R&R#&)CC"%@@!'')[#,?@ .6 !!r    c           	         [         R                  " SSU 5      n[        U 5      n/ nU HN  nUSS US   pv[        X`5      nU V	s/ s H  n	[	        U	S   U	S   U5      PM     n
n	UR                  X4-   5        MP     [        X$5      $ s  sn	f )ay  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

Examples:

```python
>>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
>>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
>>> clean_text
'An image of a snowman warming himself by a fire.'

>>> entities
[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
```r  r   r   r   ri   )r   r  r  r  r  rW   r'  )rd   r   processed_textr  r!  itemr  r   r  r   bboxes_in_coordss              r!   r   r     s     VVGR.N"Ed"KH+aDG1&?jpqjpbf5d1gtAwH\]jpq*==> , >44	 rs   B)    )&r   r   r   r   typingr   r   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   r   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r$   r)   r8   r   r  r  r  r'  r   __all__r   r    r!   <module>r4     s7   $   	 " 2 % b b , ? sCxueUE)	*+eCHo	eE5%'(	)*,	(,e ("
% "-U *o ~ o dE%u*D$E ]` ejknpskset >(c (3 (c (Z7't"*5: 
r    