
    cCit>                         S SK JrJr  S SKrSSKJr  SSKJrJ	r	J
r
  SSKJrJrJrJrJr  SSKJrJr  SSKJr   " S	 S
\SS9r " S S\SS9r " S S\5      rS/rg)    )OptionalUnionN   )BatchFeature)
ImageInputconcatenate_listmake_flat_list_of_images)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
VideoInputc                   F    \ rS rSr% \\   \S'   \\   \S'   \\   \S'   Srg)InternVLImagesKwargs   crop_to_patchesmin_patchesmax_patches N)	__name__
__module____qualname____firstlineno__r   bool__annotations__int__static_attributes__r       j/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/internvl/processing_internvl.pyr   r      s     d^###r!   r   F)totalc                   :    \ rS rSr% \\S'   SSS.SS0SS	0S
.rSrg)InternVLProcessorKwargs!   images_kwargsleftF)padding_sidereturn_mm_token_type_idsr   Treturn_tensorspt)text_kwargsr'   videos_kwargsr   N)r   r   r   r   r   r   	_defaultsr    r   r!   r"   r%   r%   !   s4    '' #(-

 t
 d
Ir!   r%   c                   <  ^  \ rS rSrSr/ SQrSrSrSr     SS\	4U 4S jjjr
S	\\   S
\\	   S\\	   S\R                  S\R                  S\R                  4S jr    SS\\   S	\\\\\\   \\   4      S\\   S\\   S\4
S jjrSS jr\S 5       rSrU =r$ )InternVLProcessor1   a  
Constructs a InternVL processor which wraps a [`AutoImageProcessor`] and
[`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for more information.
Args:
    image_processor ([`AutoImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
        The tokenizer is a required input.
    video_processor ([`AutoVideoProcessor`], *optional*):
        The video processor is a required input.
    image_seq_length (`int`, *optional*, defaults to 256):
        The number of image token to use per image patch. it should be set so that:
        image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
)image_processor	tokenizervideo_processorAutoImageProcessorAutoVideoProcessorAutoTokenizerimage_seq_lengthc                 x  > X@l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        U R                  U R                  U R                  /U l
        [        TU ]0  " XU4SU0UD6  g )Nchat_template)r9   start_image_tokenend_image_tokenstart_image_token_idend_image_token_idcontext_image_tokenimage_tokenvideo_tokencontext_image_token_idimage_token_id	image_idssuper__init__)selfr3   r4   r5   r9   r;   kwargs	__class__s          r"   rG   InternVLProcessor.__init__I   s     !1!*!<!<(88$-$B$B!"+">">$88$00'>>--t/H/H$JaJab_lTaleklr!   textimage_num_patchesvideo_num_patchesimage_num_patches_indicesvideo_num_patches_indicesvideo_patch_indicesc	           	      R  ^ ^ Sn	Sn
/ n/ n/ nU GH  nUnT R                   U;   d  T R                  U;   Ga  T R                   U;   a  T R                  U;  d8  UR                  T R                   5      UR                  T R                  5      :  a  U	S:  a  XiS-
     OSnXi   nUR                  UUU 5        UR	                  T R                   SS5      nUR                  T R
                   T R                   T R                  -  XI   -   T R                   35        U	S-  n	OX   nXS-      nUU   nUU   nUR                  UUU 5        [        UUU 5      mSR                  UU 4S j[        [        T5      5       5       5      nUR                  U5        UR	                  T R                  SS5      nU
S-  n
T R                   U;   a  GM  T R                  U;   a  GM  SU;   a,  UR                  S5      nUR	                  SUS5      nSU;   a  M,  UR                  U5        GM     XX4$ )z
Processes interleaved text with <image> and <video> placeholders, replacing them with appropriate
image and video tokens while keeping track of the patches used.
r      z<placeholder>
c              3      >#    U  HE  nS US-    STR                    TR                  TR                  -  TU   -   TR                   3v   MG     g7f)FramerS   z: N)r<   rA   r9   r=   ).0inum_patchesrH   s     r"   	<genexpr>?InternVLProcessor._insert_media_placeholders.<locals>.<genexpr>   sr      -!8A  Awb)?)?(@AQAQTXTiTiAilwxylzAz@{  }A  }Q  }Q  |R  S!8s   AA)rA   rB   indexappendreplacer<   r9   r=   listjoinrangelenpop)rH   rL   image_pixel_valuesvideo_pixel_valuesrM   rN   rO   rP   rQ   image_indexvideo_indexprocessed_textimage_video_patchesreplace_stringsprompt
new_promptstart_index	end_indexcurrent_patch_indexend_patch_indexvideo_promptreplace_strrY   s   `                     @r"   _insert_media_placeholders,InternVLProcessor._insert_media_placeholders^   sr      FJ""j0D4D4D
4R##z1$$J6!''(8(89J<L<LTM]M]<^^ Q\^_P_";!O"LefK 9 FI'../A+i/XY!+!3!3D4D4DoWX!YJ#**11243C3CdF[F[3[^o^|3|2}  C  S  S  ~T  U  1$K
 +>*J'&9/&JO";<O"PK 9/ JI'../A+i/XY"&'89L_']"^K#'99 -!&s;'7!8- $L $**<8!+!3!3D4D4DoWX!YJ1$KA ""j0D4D4D
4RB "Z/-11!4'//aP
 "Z/ !!*-M P KLLr!   imagesvideosrI   returnc           
         Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        [        45      (       d  U/n/ nSn[        R                  " S/5      n	Ubu  U R                  R                  U5      n[        U5      nU R                  " SSU0US   D6n
U
R                  S5      nU
R                  S5      n[        R                  " U5      n	/ nSn[        R                  " S/5      n[        R                  " S/5      nUb  US	   nU R                  " SS
U0UD6nUR                  S5      nUR                   tnnn[        R"                  " UU5      n[%        U5      n[        R&                  " US-   [(        5      nSUS'   [        R                  " U5      USS& S/U-  n[        R&                  " US-   [(        5      nSUS'   [        R                  " U5      USS& UR+                  SS5      n0 nUc  Ubd  U R-                  UUUUUU	UU5      u  nnnnUb  U[/        U5      :w  a  [        S5      eUb  U[/        W5      :w  a  [        S5      eS[1        U5      0nUS   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R3                  UUS/S9  U(       ai  [        R                  " US   5      n[        R4                  " US   5      nSU[        R6                  " UU R8                  5      '   UR;                  5       US'   [=        0 UEUEUS9$ )a  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to
GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
NzYou have to specify text.tokenizer_init_kwargsr   ru   r'   rY   pixel_valuesr.   rv   pixel_values_videosrS   zONumber of image placeholders in the prompt does not match the number of images.zONumber of video placeholders in the prompt does not match the number of videos.r-   r+   r*   image)
modalities	input_idsmm_token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr%   r4   init_kwargs
isinstancer_   tuplenparrayr3   fetch_imagesr	   rc   cumsumr5   shapefullsumemptyr   flattenrs   rb   r   _check_special_mm_tokens
zeros_likeisinrE   tolistr   )rH   ru   rL   audiorv   rI   output_kwargsrM   rd   rO   image_inputsrN   re   rQ   rP   video_kwargsvideo_inputs
batch_size
num_frames_num_frames_per_videoimage_videos_inputsri   rf   rg   r+   r*   text_inputs	array_idsr   s                                 r"   __call__InternVLProcessor.__call__   s}   R <899**#
"&.."<"<
 
 $u..6D !$&HHaSM!))66v>F-f5F//`v`A_`L , 0 0 ?!-!1!1.!A(*		2C(D%! hhsm$&HHaSM!(9L//NvNNL!-!1!12G!H);)A)A&J
Q#%77:z#B 12J"$((:>3"?%&"&(ii0D&E#!"j 0(*a(E%+,%a(,.II6G,H%ab)!3!;!;Aq!A !3BFBaBa""!!))#	C?D%{K !kS[&@ !rss!kS9M5N&N !rss $23CDW3X"Y&}599:JDQ#0#?#C#CD^`d#e nnTJ]=-IJ%%dKWI%N#[!9:I "k+.F GDEbggi@A/@/G/G/IK+,!GK!G3F!GUcddr!   c                 ^   0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R
                  " / UQUP76 PM#     nnU Vs/ s H  nSU R                  U-  -   PM     nnUR                  XS.5        [        S0 UD6$ s  snf s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r'      )num_image_tokensnum_image_patchesr   )r%   r/   getupdater3   get_number_of_image_patchesr9   r   )	rH   image_sizesrI   vision_datar'   
image_sizer   rY   r   s	            r"   _get_num_multimodal_tokens,InternVLProcessor._get_num_multimodal_tokens  s     "3==AA/SUVM  ( #.!"-J $$@@\*\m\"-  !
 ^oo]nkT%:%:[%H I]no4Dmn,,,!
  ps   (B%)B*c                 b    U R                   R                  nU R                  R                  nX-   $ N)r4   model_input_namesr3   )rH   tokenizer_input_namesimage_processor_input_namess      r"   r   #InternVLProcessor.model_input_names1  s/     !% @ @&*&:&:&L&L#$BBr!   )	r=   r?   rE   r9   rA   rD   r<   r>   rB   )NNN   N)NNNNr   )r   r   r   r   __doc__
attributesimage_processor_classvideo_processor_classtokenizer_classr   rG   r_   strr   ndarrayrs   r   r   r   r   r   r   r   r%   r   r   r   propertyr   r    __classcell__)rJ   s   @r"   r1   r1   1   sA   $ EJ00%O  #m
 m m*>M3i>M
  9>M  9>M $&::>M $&::>M  ZZ>MD (,hl'+ue$ue uY(94	?DQbLccdeue
 $ue 01ue 
uen-8 C Cr!   r1   )typingr   r   numpyr   image_processing_utilsr   image_utilsr   r   r	   processing_utilsr
   r   r   r   r   tokenization_utils_baser   r   video_utilsr   r   r%   r1   __all__r   r!   r"   <module>r      sZ     #  2 Q Q f f C %<u .e  EC ECP 
r!   