#!/usr/bin/env python3
"""
EDEN MEMORY INDEXER - FIXED
============================
Improved batch indexing with retry and backoff for better resource use.

Features:
- Groups episodes into large batches (100-200) to reduce server load
- Exponential backoff on Ollama errors
- Retry limit per batch
- Runs continuously with φ-timing

Designed for scale: 850K+ episodes.
"""
import sqlite3
import requests
import numpy as np
import json
import time
import signal
import sys
from datetime import datetime

PHI = 1.618033988749895  # golden ratio; used to scale the idle-cycle sleep
EMBED_MODEL = "nomic-embed-text"  # Ollama embedding model name
OLLAMA_URL = "http://localhost:11434/api/embeddings"  # local Ollama embeddings endpoint
EPISODES_DB = "/Eden/DATA/longterm_memory.db"  # source DB: episodes to index
VECTOR_DB = "/Eden/DATA/vector_memory.db"  # destination DB: vectors + resume state
BATCH_SIZE = 150  # episodes fetched/embedded per cycle (larger batches, better resource use)
RETRY_LIMIT = 3   # max retries per embedding call (not per batch)
BACKOFF_BASE = 2   # exponential backoff base: sleep BACKOFF_BASE ** attempt seconds
CYCLE_SLEEP = int(97 / PHI)  # φ-scaled sleep between cycles (~59s)

running = True  # cleared by signal_handler to stop the main loop gracefully

def signal_handler(sig, frame):
    """Request a graceful stop of the main loop.

    Clears the module-level ``running`` flag; the main loop checks it
    once per cycle and exits cleanly after finishing the current batch.
    """
    global running
    running = False
    print("\n[Indexer] Shutting down gracefully...")

# Stop cleanly on service shutdown (SIGTERM) and on Ctrl-C (SIGINT).
for _sig in (signal.SIGTERM, signal.SIGINT):
    signal.signal(_sig, signal_handler)

def init_vector_db():
    """Ensure the vector-store schema exists.

    Creates the `vectors` table (text + embedding blob + metadata) and
    the `indexer_state` key/value table used for the resume pointer.
    """
    schema = '''
        CREATE TABLE IF NOT EXISTS vectors (
            id INTEGER PRIMARY KEY,
            text TEXT,
            category TEXT DEFAULT 'general',
            embedding BLOB,
            metadata TEXT,
            created TEXT
        );
        CREATE TABLE IF NOT EXISTS indexer_state (
            key TEXT PRIMARY KEY,
            value TEXT
        );
    '''
    db = sqlite3.connect(VECTOR_DB)
    db.executescript(schema)
    db.commit()
    db.close()

def get_last_indexed_id():
    """Return the rowid of the last successfully indexed episode (0 if none).

    Reads the resume pointer from the indexer_state table. Any DB error
    or malformed stored value falls back to 0, so indexing restarts from
    the beginning instead of crashing.
    """
    try:
        conn = sqlite3.connect(VECTOR_DB)
        try:
            row = conn.execute("SELECT value FROM indexer_state WHERE key='last_episode_id'").fetchone()
        finally:
            # Close even when the query raises (the original leaked here).
            conn.close()
        return int(row[0]) if row else 0
    except (sqlite3.Error, ValueError, TypeError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch only DB failures and a non-integer stored value.
        return 0

def set_last_indexed_id(episode_id):
    """Persist *episode_id* as the resume pointer in indexer_state."""
    params = (str(episode_id),)
    db = sqlite3.connect(VECTOR_DB)
    db.execute(
        "INSERT OR REPLACE INTO indexer_state (key, value) VALUES ('last_episode_id', ?)",
        params,
    )
    db.commit()
    db.close()

def embed_text(text, retry=0):
    """Return the embedding for *text* as a float32 numpy array, or None.

    Truncates input to 2000 chars before sending it to Ollama. Retries
    with exponential backoff (BACKOFF_BASE ** attempt seconds) until
    RETRY_LIMIT attempts have failed, then returns None. *retry* sets the
    starting attempt number (kept for backward compatibility).
    """
    attempt = retry
    while True:
        try:
            resp = requests.post(
                OLLAMA_URL,
                json={"model": EMBED_MODEL, "prompt": text[:2000]},
                timeout=30,
            )
            # Surface HTTP errors explicitly instead of the confusing
            # KeyError a non-200 JSON error body would otherwise cause.
            resp.raise_for_status()
            return np.array(resp.json()["embedding"], dtype=np.float32)
        except Exception as e:
            if attempt >= RETRY_LIMIT:
                print(f"[Indexer] Giving up on episode after {RETRY_LIMIT} retries")
                return None
            backoff = BACKOFF_BASE ** attempt
            print(f"[Indexer] Embedding error (retry {attempt+1}/{RETRY_LIMIT}): {e}. Backoff {backoff}s...")
            time.sleep(backoff)
            attempt += 1  # iterative retry instead of recursion

def get_episode_batch(last_id, batch_size):
    """Fetch up to *batch_size* unindexed episodes with rowid > *last_id*.

    Skips rows whose observation is NULL or <= 20 characters. Returns a
    list of (rowid, observation, emotion, timestamp) tuples in rowid
    order; returns an empty list on any DB error (best-effort).
    """
    query = (
        "SELECT rowid, observation, emotion, timestamp FROM episodes "
        "WHERE rowid > ? AND observation IS NOT NULL AND length(observation) > 20 "
        "ORDER BY rowid ASC LIMIT ?"
    )
    try:
        db = sqlite3.connect(EPISODES_DB)
        batch = db.execute(query, (last_id, batch_size)).fetchall()
        db.close()
        return batch
    except Exception as e:
        print(f"[Indexer] Fetch error: {e}")
        return []

def index_batch(episodes):
    """Embed a batch of episodes and store the vectors; return the batch size.

    Each episode is (rowid, observation, emotion, timestamp). Episodes
    whose embedding fails are skipped. The resume pointer is persisted
    only when every episode embedded AND the insert succeeded, so failed
    work is retried on a later cycle.

    Bug fixed vs. the original: the old code zipped full-length
    batch_texts/metadata lists against a *shorter* embeddings list after
    a failure, silently pairing texts and metadata with the wrong
    vectors. Rows are now built as aligned (text, blob, meta) triples at
    embedding time.
    """
    if not episodes:
        return 0
    last_id = episodes[-1][0]

    print(f"[Indexer] Indexing batch of {len(episodes)} episodes...")
    rows = []  # aligned (text, embedding-bytes, metadata-json) per successful episode
    failed = False
    for i, (rowid, obs, emo, ts) in enumerate(episodes):
        text = f"{obs} | 💚{emo}"
        vec = embed_text(text)
        if vec is None:
            failed = True
            print(f"[Indexer] Skipped 1 episode due to embedding failure")
        else:
            meta = json.dumps({"episode_id": rowid, "emotion": emo or "", "timestamp": ts or ""})
            rows.append((text, vec.tobytes(), meta))
        if (i + 1) % 25 == 0:
            print(f"[Indexer] Processed {i+1}/{len(episodes)}...")

    if not rows:
        return 0

    created = datetime.now().isoformat()  # one timestamp for the whole batch
    conn = sqlite3.connect(VECTOR_DB)
    try:
        conn.executemany(
            "INSERT INTO vectors (text, category, embedding, metadata, created) VALUES (?, 'episode', ?, ?, ?)",
            [(text, blob, meta, created) for (text, blob, meta) in rows]
        )
        conn.commit()
    except Exception as e:
        print(f"[Indexer] DB error: {e}")
        failed = True
    finally:
        conn.close()  # close even on insert failure

    if not failed:
        set_last_indexed_id(last_id)
    return len(episodes)

# --- entry point: continuous φ-timed indexing loop -----------------------
print("[Indexer] Starting in improved mode...")
init_vector_db()

last_id = get_last_indexed_id()
cycle_count = 0

while running:
    cycle_count += 1
    print(f"\n[Indexer] Cycle {cycle_count} - Processing from episode {last_id}...")

    batch = get_episode_batch(last_id, BATCH_SIZE)
    if not batch:
        print(f"[Indexer] No new episodes. Pausing for φ={CYCLE_SLEEP}s...")
        time.sleep(CYCLE_SLEEP)
    else:
        index_batch(batch)
        last_id = batch[-1][0]
        # A full batch suggests a backlog (850K+ episodes): keep draining
        # immediately; only pause once we are close to caught up.
        if len(batch) < BATCH_SIZE:
            time.sleep(CYCLE_SLEEP)

print("[Indexer] Stopped.")
# Deliberately NOT persisting last_id here: index_batch() saves the resume
# pointer only after a fully successful batch, and overwriting it with the
# in-memory cursor would permanently skip episodes from any failed batch.