
    hO              	          S SK JrJr  S SKJr  S SKJr  S SKJr  S SK	J
r
Jr  \ " S S5      5       rS rS	 r\" S
5      r\S:X  a  \" \5      r\R%                  5       S    r\" SSSS9r\R+                  \\R,                  S9r\R/                  \\R,                  / SQS\
R0                  " S5      0S9r\R3                  SS9r\R4                  (       a6  \R5                  \R6                  5        \R5                  \R6                  SS9  ggg)    )	dataclassfield)Optional)load_dataset)	ModelCard)AutoTokenizerHfArgumentParserc                   t    \ rS rSr% Sr\" SSS0S9r\\S'   \" SSS	0S9r	\
\S
'   \" SSS0S9r\\   \S'   Srg)ScriptArguments   a  
Arguments for the script.

Args:
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether to push the dataset to the Hugging Face Hub.
    repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-descriptiveness"`):
        Hugging Face repository ID to push the dataset to.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of workers to use for dataset processing.
Fhelpz4Whether to push the dataset to the Hugging Face Hub.)defaultmetadatapush_to_hubz,trl-lib/lm-human-preferences-descriptivenessz2Hugging Face repository ID to push the dataset to.repo_idNz0Number of workers to use for dataset processing.dataset_num_proc )__name__
__module____qualname____firstlineno____doc__r   r   bool__annotations__r   strr   r   int__static_attributes__r       p/home/james-whalen/.local/lib/python3.13/site-packages/examples/datasets/lm-human-preferences-descriptiveness.pyr   r      si    
 PQK  >NOGS  ',LM'hsm r   r   c                 L   ^  [        U 4S j[        SS5       5       5      (       + $ )Nc              3   @   >#    U  H  nTS    TSU 3   :H  v   M     g7f)sample0sampleNr   ).0jexamples     r   	<genexpr>'samples_not_all_same.<locals>.<genexpr>5   s&     T179%6!)>>s         )allrange)r&   s   `r   samples_not_all_samer-   4   s    TaQRTTTTr   c                     UR                  U S   5      R                  5       nU S   nUR                  U SU 3   5      n[        S5       H!  nUR                  U SU 3   5      nXF:w  d  M!    O   UW:w  d   eX$US.$ )Nquerybestr#   r*   )promptchosenrejected)decodestripr,   )r&   	tokenizerr1   best_idxr2   rejected_idxr3   s          r   to_prompt_completionr9   8   s    gg./557FvHgxj&9:;Fa##Gf\N,C$DE ! XHEEr   aH  
---
tags: [trl]
---

# LM-Human-Preferences-Descriptiveness Dataset

## Summary

The LM-Human-Preferences-Descriptiveness dataset is a processed subset of [OpenAI's LM-Human-Preferences](https://github.com/openai/lm-human-preferences), focusing specifically on enhancing the descriptiveness of generated text. It contains pairs of text samples, each labeled as either "chosen" or "rejected," based on human preferences regarding the level of detail and vividness in the descriptions. This dataset enables models to learn human preferences in descriptive language, improving their ability to generate rich and engaging narratives.

## Data Structure

- **Format**: [Standard](https://huggingface.co/docs/trl/main/dataset_formats#standard)
- **Type**: [Preference](https://huggingface.co/docs/trl/main/dataset_formats#preference)

Columns:
- `"prompt"`: The text sample.
- `"chosen"`: A version of the text with enhanced descriptiveness.
- `"rejected"`: A version of the text with less descriptiveness.

This structure allows models to learn to prefer the _chosen_ response over the _rejected_ one, thereby aligning with human preferences in descriptive language.

## Generation script

The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/lm-human-preferences-descriptiveness.py).
__main__jsonzfhttps://openaipublic.blob.core.windows.net/lm-human-preferences/labels/descriptiveness/offline_5k.jsontrain)
data_filessplit)num_proc)r/   r"   sample1sample2sample3r0   r6   gpt2)r?   remove_columns	fn_kwargsi  )
train_sizedataset)	repo_typeN)dataclassesr   r   typingr   datasetsr   huggingface_hubr   transformersr   r	   r   r-   r9   
model_cardr   parserparse_args_into_dataclassesscript_argsrG   filterr   mapfrom_pretrainedtrain_test_splitr   r   r   r   r   <module>rV      s+   )  ! % 8   8U	F   
8 zo.F446q9K{G nn1K<X<XnYGkk--T = =f EF	  G &&$&7GK//0{22iH - r   