
    cCi;                         S SK JrJr  S SKrSSKJr  SSKJr  SSK	J
r
JrJrJrJrJr  SSKJrJr  SSKJr  SS	KJr  \R.                  " \5      r " S
 S\SS9r " S S\
5      r " S S\SS9r " S S\5      rS/rg)    )OptionalUnionN   )BatchFeature)
ImageInput)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpackVideosKwargs)PreTokenizedInput	TextInput)logging)
VideoInputc                   0    \ rS rSr% \\\   \4   \S'   Srg)Glm4vVideosProcessorKwargs$   fps N)	__name__
__module____qualname____firstlineno__r   listfloat__annotations____static_attributes__r       d/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/glm4v/processing_glm4v.pyr   r   $   s    	tE{E!	""r   r   F)totalc                   F    \ rS rSr% \\   \S'   \\   \S'   \\   \S'   Srg)Glm4vImagesKwargs(   
patch_sizetemporal_patch_size
merge_sizer   N)r   r   r   r   r   intr   r   r   r   r    r#   r#   (   s     !#&r   r#   c                   @    \ rS rSr% \\S'   SSSS.SS0S.r\\S'   S	rg
)Glm4vProcessorKwargs.   images_kwargsF)paddingreturn_token_type_idsreturn_mm_token_type_idsreturn_metadataT)text_kwargsvideos_kwargsr2   r   N)	r   r   r   r   r#   r   	_defaultsr   r   r   r   r    r*   r*   .   s1    $$ %*(-

 ,T2I .-r   r*   c                      ^  \ rS rSrSr/ SQrSrSrSrSU 4S jjr	   SS\
\   S	\\\\\   \\   4   S
\
\   S\\   S\4
S jjrSS jr SS jrSrU =r$ )Glm4vProcessor;   a  
Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor.
[`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.
Args:
    image_processor ([`Glm4vProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizerFast`], *optional*):
        The tokenizer is a required input.
    video_processor ([`Glm4vVideoProcessor`], *optional*):
        The video processor is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
)image_processor	tokenizervideo_processorAutoImageProcessorAutoVideoProcessor)PreTrainedTokenizerPreTrainedTokenizerFastc                   > [         TU ]  XX4S9  [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        [        USS 5      (       a  UR                  OUR                  U R                  5      U l        [        USS 5      (       a  UR                  U l        g UR                  U R                  5      U l        g )N)chat_templateimage_tokenz	<|image|>video_tokenz	<|video|>image_token_idvideo_token_id)	super__init__hasattrr@   rA   getattrrB   convert_tokens_to_idsrC   )selfr7   r8   r9   r?   kwargs	__class__s         r    rE   Glm4vProcessor.__init__P   s    _b.5i.O.O;U^UjUj.5i.O.O;U^UjUj y"2D99 $$001A1AB 	 y"2D99 $$ 	 001A1AB 	r   imagestextvideosrJ   returnc                    U R                   " [        4SU R                  R                  0UD6nUb  U R                  " SSU0US   D6nUS   nO0 nSnUb:  U R
                  " SSU0US   D6nSU;  a  UR                  S	5      n	OUS	   n	US
   n
O0 nSn
[        U[        5      (       d  U/nUR                  5       nUb  U R                  R                  S-  nSn[        [        U5      5       H  nU R                  X-   ;   aR  X|   R                  5       U-  nX-   R                  U R                  SU-  S5      X-'   US-  nU R                  X-   ;   a  MR  X-   R                  SU R                  5      X-'   M     U
Gb  U R
                  R                  S-  nSn[        [        U5      5       GH  nU R                   X-   ;   Ga  X   S   nSnW	U   nUR"                  c  [$        R'                  S5        UR"                  c  SOUR"                  Ul        UR(                  SSS2   n/ n[        S[        U5      5       H  nUR+                  UU   5        M     USU n[        U5      U:  a.  UR+                  U(       a  US   OS5        [        U5      U:  a  M.  [        U5       H(  nUU   nSU R                   S[-        U5       3nUU-  nM*     X-   R                  U R                   US5      X-'   X   R                  5       U-  X   S   -  n[        U5       H;  nU R                  X-   ;   d  M  X-   R                  U R                  SU-  S5      X-'   M=     US-  nU R                   X-   ;   a  GM  X-   R                  SU R                  5      X-'   GM     US   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R/                  UUSS/S9  U(       aW  [0        R2                  " US   5      n[0        R4                  " US   5      nSUUU R6                  :H  '   UR9                  5       US'   [;        0 UEUEUEUS9$ ) af	  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
the text.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
        tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
    - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
    - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
tokenizer_init_kwargsNrM   r,   image_grid_thwrO   r2   r0   video_metadatavideo_grid_thw   r   z<|placeholder|>    a  SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.   z<|begin_of_image|>z<|end_of_image|>r1   return_tensorsr/   Fimagevideo)
modalities	input_idsmm_token_type_ids)datatensor_typer   )_merge_kwargsr*   r8   init_kwargsr7   r9   pop
isinstancer   copyr'   rangelenr@   prodreplacerA   r   loggerwarning_once
timestampsappendr(   _check_special_mm_tokensnparray
zeros_likerB   tolistr   )rI   rM   rN   rO   rJ   output_kwargsimage_inputsrS   videos_inputsrT   rU   merge_lengthindexinum_image_tokensvideo_index
num_framesvideo_structuremetadatarn   unique_timestampsidxselected_timestamps	frame_idxtimestamp_secframe_structurer[   r/   text_inputs	array_idsr`   s                                  r    __call__Glm4vProcessor.__call___   s   T ** 
"&.."<"<
 

 //`v`A_`L)*:;NL!N 00aa-P_B`aM .!.!2!23C!D!./?!@*+;<NM!N$%%6Dyy{%//::A=LE3t9%&&$'1'5'<'A'A'C|'S$"good.>.>@QTd@dfghDGQJE &&$'1 '//*;T=M=MN & %//::A=LK3t9%&&$'1!/!<Q!?J&(O-k:H||+++q
 *2)=28<<HL!)!4!4SqS!9J(*%$QJ8)00CA  9 +<KZ*H'12Z?+22Na3Fr3Jghi 12Z? &+:%6	(;I(F,>t?O?O>PP`aderas`t*u'?: &7
 #good.>.>QRSDG&388:lJnNijkNll % &+:%6	++tw6&*good6F6FHY\lHlno&pDG &7  1$KG &&$'1J '//*;T=M=MNM &N '}599:JDQ#0#?#C#CD^`e#f nnTJ]=-IJ%%dKWgDV%W#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,!QK!Q<!Q=!Q_mnnr   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oUS-  -  PM     n
n	UR                  XS.5        Ubz  [         R                  R                  S0 5      nUR                  U5        U Vs/ s H!  nU R                  R                  " / UQUP76 PM#     nnU V	s/ s H
  oWS-  -  PM     nn	XS'   [        S0 UD6$ s  snf s  sn	f s  snf s  sn	f )	a  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
    video_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (num_frames, height, width) per each video.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr,   r'   rV   )r{   num_image_patchesr2   num_video_tokensr   )
r*   r3   getupdater7   r'   get_number_of_image_patchesr9   get_number_of_video_patchesr	   )rI   image_sizesvideo_sizesrJ   vision_datar,   r'   
image_sizer   num_patchesr{   r2   
video_sizenum_video_patchesr   s                  r    _get_num_multimodal_tokens)Glm4vProcessor._get_num_multimodal_tokens   s    "0::>>PRSM  (&**<>a$BVBVBaBaJ #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd4Dmn"0::>>PRSM  ( #.!"-J $$@@\*\m\"-  ! SddRc;
A!=Rcd.>*+,,,#!  e!  es   *(EE6(E$Ec                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)r8   batch_decode)rI   generated_outputsr   r   rJ   s        r    post_process_image_text_to_text.Glm4vProcessor.post_process_image_text_to_text  s3    ( ~~**
 3)E
 	
 	
r   )r@   rB   rA   rC   )NNNN)NNN)NN)TF)r   r   r   r   __doc__
attributesimage_processor_classvideo_processor_classtokenizer_classrE   r   r   r   r   r   r   r   r   r*   r   r   r   r   r   __classcell__)rK   s   @r    r5   r5   ;   s     EJ00HO
" (,^b'+	Do$Do I0$y/4HYCZZ[Do $	Do
 -.Do 
DoL$-N Y^
 
r   r5   )typingr   r   numpyrq   feature_extraction_utilsr   image_utilsr   processing_utilsr   r	   r
   r   r   r   tokenization_utils_baser   r   utilsr   video_utilsr   
get_loggerr   rl   r   r#   r*   r5   __all__r   r   r    <module>r      sv   * #  4 % t t C  % 
		H	%#U # 
.+5 
.i
^ i
X 
r   