
    h[                     :   S SK r S SKJrJr  S SKJr  S SKJr  S SKJ	r	  S SK
Jr  \ " S S5      5       rS	 rS
 rS r\	" S5      r\S:X  a  \" \5      r\R'                  5       S    r\" SS\R*                  S9r\R/                  \SS/\R*                  S9r\R1                  \\R*                  S9r\R/                  \S/\R*                  S9r\R2                  (       a?  \R3                  \R4                  \R*                  S9  \R3                  \R4                  SS9  ggg)    N)	dataclassfield)Optional)load_dataset)	ModelCard)HfArgumentParserc                   t    \ rS rSr% Sr\" SSS0S9r\\S'   \" SSS	0S9r	\
\S
'   \" SSS0S9r\\   \S'   Srg)ScriptArguments   a  
Arguments for the script.

Args:
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether to push the dataset to the Hugging Face Hub.
    repo_id (`str`, *optional*, defaults to `"trl-lib/llava-instruct-mix"`):
        Hugging Face repository ID to push the dataset to.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of workers to use for dataset processing.
Fhelpz4Whether to push the dataset to the Hugging Face Hub.)defaultmetadatapush_to_hubztrl-lib/llava-instruct-mixz2Hugging Face repository ID to push the dataset to.repo_idNz0Number of workers to use for dataset processing.dataset_num_proc )__name__
__module____qualname____firstlineno____doc__r   r   bool__annotations__r   strr   r   int__static_attributes__r       ^/home/james-whalen/.local/lib/python3.13/site-packages/examples/datasets/llava_instruct_mix.pyr
   r
      si    
 PQK  ,NOGS  ',LM'hsm r   r
   c                     / n[         R                  " U S   5       HH  nUS   nUR                  SS5      R                  5       nUS   S:X  a  SOSnUR	                  XCS	.5        MJ     XS
   /S.$ )Nconversationsvaluez<image> fromhumanuser	assistant)rolecontentimage)messagesimages)astliteral_evalreplacestripappend)exampler*   messager(   r'   s        r   process_exampler3   4   sx    H##GO$<='"//)R0668 G3v:;	 >
 !W-=,>??r   c                 6    [        S U S    5       5      nUS:*  $ )Nc              3   >   #    U  H  n[        US    5      v   M     g7f)r(   Nlen).0msgs     r   	<genexpr>'filter_long_examples.<locals>.<genexpr>?   s     J6Iss3y>**6Is   r*   i  )sum)r1   total_lengths     r   filter_long_examplesr>   >   s"    Jgj6IJJL4r   c                 Z    [        U S   5      S:  d   eU S   SS U S'   U S   SS U S'   U $ )zd
Splits the messages into a prompt and a completion. The last message is considered the completion.
r*      Nprompt
completionr6   )r1   s    r   split_prompt_completionrD   C   sM     wz"#a'''
+CR0GH#J/4GLNr   a  
---
tags: [trl]
---

# LLaVA Instruct Mix

## Summary

The LLaVA Instruct Mix dataset is a processed version of [LLaVA Instruct Mix](https://huggingface.co/datasets/theblackcat102/llava-instruct-mix).

## Data Structure

- **Format**: [Conversational](https://huggingface.co/docs/trl/main/dataset_formats#conversational)
- **Type**: [Language-modeling](https://huggingface.co/docs/trl/main/dataset_formats#language-modeling)

Columns:
- `"images"`: The image associated with the text.
- `"prompt"`: A list of messages that form the context for the conversation.
- `"completion"`: The last message in the conversation, which is the model's response.

This structure allows models to learn from the context of the conversation, enhancing their understanding of how to generate descriptive text based on visual inputs.

## Generation script

The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/llava_instruct_mix.py).
__main__z!theblackcat102/llava-instruct-mixtrain)splitnum_procr    r)   )remove_columnsrH   )rH   r*   dataset)	repo_type)r,   dataclassesr   r   typingr   datasetsr   huggingface_hubr   transformersr   r
   r3   r>   rD   
model_cardr   parserparse_args_into_dataclassesscript_argsr   rJ   mapfilterr   r   r   r   r   <module>rW      s9    (  ! % )   6@ 
   
8 zo.F446q9K>gXcXtXtuGkk'(B[MiMi  G nn1K<X<XnYGkk1:,YdYuYukvGK//+:V:VW{22iH  r   