
    cCiD              
       :   S r SSKJrJr  SSKrSSKJr  SSKJ	r	J
r
  SSKJrJrJrJr  SSKJrJr   " S	 S
\SS9r " S S\SS9rS\\   S\S\\\      4S jrS\\\\         S\\\      S\S\S\R0                  4
S jrS\S\S\S\4S jr " S S\5      rS/rg)zProcessor class for Mllama.    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   &    \ rS rSr% \\   \S'   Srg)MllamaImagesKwargs   max_image_tiles N)__name__
__module____qualname____firstlineno__r   int__annotations____static_attributes__r       f/home/james-whalen/.local/lib/python3.13/site-packages/transformers/models/mllama/processing_mllama.pyr   r      s    c]"r   r   F)totalc                   ,    \ rS rSr% \\S'   SSS00rSrg)MllamaProcessorKwargs    images_kwargsimage_kwargsr      r   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r   r       s    %% 	q
Ir   r   	input_idsimage_token_idreturnc                    [        U 5       VVs/ s H  u  p#X1:X  d  M  UPM     nnn[        U5      S:X  a  / $ [        U5      S:X  a  US   S//$ [        USS USS 5       VVs/ s H  u  pVXV/PM
     nnnUR                  US   [        U 5      /5        US   S   nUSSS2    H  n	U	S   U	S   S-
  :X  a  XS'   U	S   nM     U$ s  snnf s  snnf )a  
Generate a cross-attention token mask for image tokens in the input sequence.

This function identifies the positions of image tokens in the input sequence and creates
a mask that defines which subsequent tokens each image token should attend to.

Args:
    input_ids (list[int]): A list of token ids representing the input sequence.
    image_token_id (int): The id of the token used to represent images in the sequence.

Returns:
    list[list[int]]: A list of [start, end] pairs, where each pair represents the range
    of tokens an image token should attend to.

Notes:
    - If no image tokens are present, an empty list is returned.
    - For a single image token, it attends to all subsequent tokens until the end of the sequence.
    - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
    - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
r      N)	enumeratelenzipappend)
r%   r&   itokenimage_token_locationsloc1loc2vision_maskslast_mask_endvision_masks
             r   get_cross_attention_token_maskr7   *   s   , 09/C_/C81uG^Q/C_
 !Q&	  !Q&&q)2.//367LSb7QShijikSl3mn3mZTTL3mLn .r2C	NCD
 !$Q'M#DbD)q>[^a//*N#A *
 / ` os   CC$Ccross_attention_token_mask	num_tilesmax_num_tileslengthc           	      p   [        U 5      n[        S U  5       5      n[        R                  " XCXR4[        R                  S9n[        [        X5      5       H[  u  nu  p[        [        X5      5       H;  u  n
u  p[        U5      S:X  d  M  Uu  p[        X5      nUS:X  a  UnSXgX2U
SU24'   M=     M]     U$ )a  
Convert the cross attention mask indices to a cross attention mask 4D array.

This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

Args:
    cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
        - The outer list represents the batch dimension.
        - The middle list represents different images within each batch item.
        - The inner list contains pairs of integers [start, end] representing token ranges for each image.
    num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
    max_num_tiles (int): The maximum possible number of tiles.
    length (int): The total sequence length of the input.

Returns:
    np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
        The array contains `1` where attention is allowed and `0` where it is not.

Note:
    - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
c              3   8   #    U  H  n[        U5      v   M     g 7fNr,   ).0maskss     r   	<genexpr>?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>x   s     L1KU1K   )shapedtype   r*   r)   N)r,   maxnpzerosint64r+   r-   min)r8   r9   r:   r;   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartends                  r   ,convert_sparse_cross_attention_mask_to_denserX   Z   s    : /0JL1KLLN88>Ahh
 9B#F`Bl8m4
4\5>s<?b5c1H1y9~"&
#&"9 CYZ$Ho~o%UV 6d 9n  r   prompt	bos_tokenimage_tokenc                     X;   a  U $ SnU R                  U5      (       a+  U [        U5      S n US-  nU R                  U5      (       a  M+  X#-   U U  3$ )a  
Builds a string from the input prompt by adding `bos_token` if not already present.

Args:
    prompt (`str`):
        The input prompt string.
    bos_token (`str`):
        The beginning of sentence token to be added.
    image_token (`str`):
        The image token used to identify the start of an image sequence.

Returns:
    str: The modified prompt string with the `bos_token` added if necessary.

Examples:
    >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'

    >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
    '<|image|><begin_of_text>Hello world'

    >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
    '<begin_of_text>Hello world'
r   Nr)   )
startswithr,   )rY   rZ   r[   num_image_tokens_on_starts       r   build_string_from_inputr_      so    4  !


K
(
(K(*+!Q&! 

K
(
( 56yk&JJr   c                      ^  \ rS rSrSrSS/rSrSrSU 4S jjr    SS\	\
   S	\	\\\\\   \\   4      S
\\   S\4S jjr SS jr\S 5       rSrU =r$ )MllamaProcessor   a  
Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
[`PretrainedTokenizerFast`] into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
information.
The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import MllamaProcessor
    from PIL import Image

    processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

    processor(
        images=your_pil_image,
        text=["<|image|>If I had to write a haiku for this one"],
        images_kwargs = {"size": {"height": 448, "width": 448}},
        text_kwargs = {"padding": "right"},
        common_kwargs = {"return_tensors": "pt"},
    )
    ```

Args:
    image_processor ([`MllamaImageProcessor`]):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.

image_processor	tokenizerMllamaImageProcessorPreTrainedTokenizerFastc                 H  > [        US5      (       d(  SU l        UR                  U R                  5      U l        O"UR                  U l        UR                  U l        SU l        UR                  U R                  5      U l        UR                  U l        [        TU ]!  XUS9  g )Nr[   z	<|image|>z<|python_tag|>)chat_template)	hasattrr[   convert_tokens_to_idsr&   python_tokenpython_token_idrZ   super__init__)selfrc   rd   rh   	__class__s       r   rn   MllamaProcessor.__init__   s    y-00*D"+"A"A$BRBR"SD(44D"+":":D,(>>t?P?PQ",,=Qr   imagestextkwargsr'   c           
      ~   Uc  Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6nUS   nSUS'   US   nUS   n	0 n
UGb%  [        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S	5      eU Vs/ s H  oR                  U R                  5      PM     nnU Vs/ s H#  n[        XR                  U R                  5      PM%     nnUR                  S
S5      nU R                  " U40 UD6nU R                  X/S/S9  US    Vs/ s H  nUR                  U R                   5      PM      nnU
R#                  U5        S/nUbA  U R$                  R'                  U5      n[)        U5      nU Vs/ s H  n[+        U5      PM     nnUb  [-        S W 5       5      (       a"  [        S U 5       5      (       d  [        S5      e[/        U5      S:  aY  UU:w  d  WU:w  aM  Uc  [        S5      eSn[/        U5      [/        U5      :X  a	  UU:w  a  SnOWU:w  a  Sn[        SU SU SU 35      eUb5  U R$                  " U40 UD6nUR                  S5      nU
R#                  U5        Ubd  Uba  WS    Vs/ s H  n[1        UU R                   5      PM     nn[3        UWU R$                  R4                  [7        S US    5       5      S9nUU
S'   U	R                  SS5      n[9        U
US9nU$ s  snf s  snf s  snf s  snf s  snf )aF  
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return NumPy `np.ndarray` objects.
            - `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsr!   common_kwargsc              3   B   #    U  H  n[        U[        5      v   M     g 7fr>   )
isinstancestr)r@   ts     r   rB   +MllamaProcessor.__call__.<locals>.<genexpr>  s     =_Z^UVjC>P>PZ^s   zAInvalid input text. Please provide a string, or a list of stringspadding_sideimage)
modalitiesr%   r   c              3   *   #    U  H	  oS :H  v   M     g7fr   Nr   r@   	batch_imgs     r   rB   r~   *  s     D3Ci>3C   c              3   *   #    U  H	  oS :H  v   M     g7fr   r   r   s     r   rB   r~   *  s      Q0@9Q0@r   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). r9   c              3   8   #    U  H  n[        U5      v   M     g 7fr>   r?   )r@   r%   s     r   rB   r~   O  s     Q;Pi3y>>;PrD   )r9   r:   r;   rO   )datatensor_type)
ValueError_merge_kwargsr   rd   init_kwargsr{   r|   listtupleallcountr[   r_   rZ   pop_check_special_mm_tokensr&   updaterc   fetch_imagesr   r,   anysumr7   rX   r   rH   r   )ro   rr   rs   audiovideosrt   output_kwargsrw   r!   ry   r   r}   n_images_in_text	text_item_encoding	token_idsn_images_in_idsn_images_in_imagessampleadd_messageimage_featuresr9   r8   rO   rx   batch_features                              r   __call__MllamaProcessor.__call__   s   N <FNFGG**!
"&.."<"<
 
 $M2(,$%%o6%o6$$$v e}55#=_Z^=_:_:_ !deeCGH4a(8(8 94Hjnojn]f+I~~tGWGWXjnDo5A~~d:k:H))$gY)OU]^iUjkUj	yt/B/BCUjOkKK!S))66v>F/7F<B!CF&#f+F!CD3CDDDS Q0@Q N N !w  #$q("&66/M_:_>$%ghh"$K-.#6F2GGL^brLr 'C(,>> 'Q$CDTCU V@@R?SSVWbVce 
 !11&JMJN&**;7IKK' $"2`hit`u*`uS\.y$:M:MN`u ' * $P*#"22BBQ8K;PQQ	$  ,@D'(&**+;TB$$NKw  Io l "DB*s   3$L&*L+%L0.L5. L:c                 B    U R                   R                  " U4UUS.UD6$ )a*  
Post-process the output of the model to decode the text.

Args:
    generated_outputs (`torch.Tensor` or `np.ndarray`):
        The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
        or `(sequence_length,)`.
    skip_special_tokens (`bool`, *optional*, defaults to `True`):
        Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
    **kwargs:
        Additional arguments to be passed to the tokenizer's `batch_decode method`.

Returns:
    `list[str]`: The decoded text.
)skip_special_tokensclean_up_tokenization_spaces)rd   batch_decode)ro   generated_outputsr   r   rt   s        r   post_process_image_text_to_text/MllamaProcessor.post_process_image_text_to_textX  s3    ( ~~**
 3)E
 	
 	
r   c                     U R                   R                  nU R                  R                  nU Vs/ s H  o3S:w  d  M
  UPM     nn[        X-   S/-   5      $ s  snf )Nr9   rO   )rd   model_input_namesrc   r   )ro   tokenizer_input_namesimage_processor_input_namesnames       r   r   !MllamaProcessor.model_input_namess  sb     $ @ @&*&:&:&L&L# 9T&k8S_jWjt8S#&k)GKaJbbcc 'ls
   	AA)rZ   r[   r&   rk   rl   r>   )NNNN)TF)r   r   r   r   __doc__
attributesimage_processor_classtokenizer_classrn   r   r   r   r   r   r   r   r   r   r   r   propertyr   r   __classcell__)rp   s   @r   ra   ra      s    > $[1J2/OR (,hlv$v uY(94	?DQbLccdev ./v 
vr Y^
6 d dr   ra   )r   typingr   r   numpyrI   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   r   r   r   r   r7   ndarrayrX   r|   r_   ra   __all__r   r   r   <module>r      s     " "  4 A V V C#U #,E -d3i - -QUVZ[^V_Q` -`-  $T$s)_ 5- DI-  -  	- 
 ZZ- `"KC "KC "Kc "Kc "KJLdn Ld^ 
r   