ó
    ±oi\  ã                   ó   • S r g)aÈ  # Master configuration file for Synthetic Data Kit

# Global paths configuration
paths:
  # Input data locations
  input:
    pdf: "{data_output_location}/pdf"
    html: "{data_output_location}/html"
    youtube: "{data_output_location}/youtube"
    docx: "{data_output_location}/docx"
    ppt: "{data_output_location}/ppt"
    txt: "{data_output_location}/txt"

  # Output locations
  output:
    parsed: "{data_output_location}/output"      # Where parsed text files are saved
    generated: "{data_output_location}/generated" # Where generated content is saved
    cleaned: "{data_output_location}/cleaned"     # Where cleaned content is saved
    final: "{data_output_location}/final"         # Where final formatted content is saved

# VLLM server configuration
vllm:
  api_base: "http://localhost:8000/v1" # Base URL for VLLM API
  port: 8000                           # Port for VLLM server
  model: "{model_name}"                # Default model to use
  max_retries: 3                       # Number of retries for API calls
  retry_delay: 1.0                     # Initial delay between retries (seconds)

# Ingest configuration
ingest:
  default_format: "txt"  # Default output format for parsed files
  youtube_captions: "auto"  # Options: "auto", "manual" - caption preference

# LLM generation parameters
generation:
  temperature: {temperature}     # Higher = more creative, lower = more deterministic
  top_p: {top_p}                 # Nucleus sampling parameter
  chunk_size: {chunk_size}       # Size of text chunks for processing
  overlap: {overlap}             # Overlap between chunks to maintain context
  max_tokens: {max_tokens}       # Maximum tokens in LLM responses
  num_pairs: {default_num_pairs} # Default number of QA pairs to generate

# Content cleanup parameters
cleanup:
  threshold: {cleanup_threshold}       # Default quality threshold (1-10)
  batch_size: {cleanup_batch_size}     # Number of items per batch for rating
  temperature: {cleanup_temperature}   # Temperature for rating (lower = more consistent)

# Format conversion parameters
format:
  default: "jsonl"   # Default output format
  include_metadata: true  # Include metadata in output files
  pretty_json: true  # Use indentation in JSON output

# Prompts for different tasks
prompts:
  # Summary generation prompt
  summary: |
    Summarize this document in 3-5 sentences, focusing on the main topic and key concepts.

  # QA pair generation prompt
  qa_generation: |
    Create {num_pairs} question-answer pairs from this text for LLM training.

    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Return JSON format only:

    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]

    Text:
    {text}

  # QA pair rating prompt
  qa_rating: |
    Rate each of these question-answer pairs for quality and return exactly this JSON format:

    [
      {{"question": "same question text", "answer": "same answer text", "rating": n}}
    ]

    Where n is a number from 1-10.

    DO NOT include any text outside of the JSON array, just return valid JSON:

    {pairs}N)Úsynthetic_qa_config© ó    Ú\/home/james-whalen/.local/lib/python3.13/site-packages/unsloth/dataprep/synthetic_configs.pyÚ<module>r      s   ðð`Ñ r   