
    h                        S SK JrJr  S SKJr  S SKJr  S SKJr  S SK	J
r
  \ " S S5      5       rS r\" S	5      r\S
:X  a  \
" \5      r\R!                  5       S    rSSSS.r\" S\S9r\R)                  \\R*                  / SQS9r\R,                  (       a6  \R-                  \R.                  5        \R-                  \R.                  SS9  ggg)    )	dataclassfield)Optional)load_dataset)	ModelCard)HfArgumentParserc                   t    \ rS rSr% Sr\" SSS0S9r\\S'   \" SSS	0S9r	\
\S
'   \" SSS0S9r\\   \S'   Srg)ScriptArguments   a  
Arguments for the script.

Args:
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether to push the dataset to the Hugging Face Hub.
    repo_id (`str`, *optional*, defaults to `"trl-lib/tldr"`):
        Hugging Face repository ID to push the dataset to.
    dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
        Number of workers to use for dataset processing.
Fhelpz4Whether to push the dataset to the Hugging Face Hub.)defaultmetadatapush_to_hubztrl-lib/tldrz2Hugging Face repository ID to push the dataset to.repo_idNz0Number of workers to use for dataset processing.dataset_num_proc )__name__
__module____qualname____firstlineno____doc__r   r   bool__annotations__r   strr   r   int__static_attributes__r       P/home/james-whalen/.local/lib/python3.13/site-packages/examples/datasets/tldr.pyr
   r
      si    
 PQK  NOGS  ',LM'hsm r   r
   c                 R    SnUR                  U S   U S   U S   S9nSU S   -   nX#S.$ )	Nz>SUBREDDIT: r/{subreddit}

TITLE: {title}

POST: {post}

TL;DR:	subreddittitlepost)r    r!   r"    summary)prompt
completion)format)exampletldr_format_strr%   r&   s       r   to_prompt_completionr*   3   sH    \O##gk.B'RYJZahioap#qFwy))J77r   a3  
---
tags: [trl]
---

# TL;DR Dataset

## Summary

The TL;DR dataset is a processed version of Reddit posts, specifically curated to train models using the [TRL library](https://github.com/huggingface/trl) for summarization tasks. It leverages the common practice on Reddit where users append "TL;DR" (Too Long; Didn't Read) summaries to lengthy posts, providing a rich source of paired text data for training summarization models.

## Data Structure

- **Format**: [Standard](https://huggingface.co/docs/trl/main/dataset_formats#standard)
- **Type**: [Prompt-completion](https://huggingface.co/docs/trl/main/dataset_formats#prompt-completion)

Columns:
- `"prompt"`: The unabridged Reddit post.
- `"completion"`: The concise "TL;DR" summary appended by the author.

This structure enables models to learn the relationship between detailed content and its abbreviated form, enhancing their summarization capabilities.

## Generation script

The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/tldr.py).
__main__zghttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/train.jsonlzghttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/valid.jsonlzfhttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/test.jsonl)train
validationtestjson)
data_files)idr    r!   r"   r$   )num_procremove_columnsdataset)	repo_typeN)dataclassesr   r   typingr   datasetsr   huggingface_hubr   transformersr   r
   r*   
model_cardr   parserparse_args_into_dataclassesscript_argsr0   r4   mapr   r   r   r   r   r   <module>r@      s    )  ! % )   68   
6 zo.F446q9K {xJ
 6j9Gkk--F  G K//0{22iH % r   