#!/usr/bin/env python3
"""
Eden Memory Reindexer (v2.1 schema-safe)

- Enforces v2.1 schema: docs(id INTEGER PRIMARY KEY, path TEXT NOT NULL, title TEXT, profile TEXT, body TEXT, emb BLOB)
- If a legacy "docs" table exists (id, kind, ref, text, uniq), it is auto-migrated in-place.
- Embeddings are computed only for rows where emb IS NULL.
- Meta is updated: model, dim, model_path (if provided), doc_count.

Environment:
  EDEN_MEMORY_ROOT   (default: /Eden/MEMORY)
  EDEN_EMBED_PATH    optional: explicit local path to a sentence-transformers model
  EDEN_EMBED_MODEL   optional: model name (default: sentence-transformers/all-MiniLM-L6-v2)
"""

import os, sys, time, json, sqlite3, hashlib
from contextlib import contextmanager

# Filesystem layout: the SQLite semantic index lives directly under the
# memory root (both overridable via EDEN_MEMORY_ROOT).
ROOT = os.environ.get("EDEN_MEMORY_ROOT", "/Eden/MEMORY")
IDX  = os.path.join(ROOT, "semantic.index")

# ---- Embedding model selection ----
# EDEN_EMBED_PATH (explicit local path) takes priority over EDEN_EMBED_MODEL
# (hub model name); an empty/whitespace EDEN_EMBED_PATH counts as unset.
DEFAULT_MODEL_NAME = os.environ.get("EDEN_EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
MODEL_PATH = os.environ.get("EDEN_EMBED_PATH", "").strip() or None

# Load the model once at import time and cache its output dimension
# (recorded into meta later). A failed load is fatal: exit code 2 lets
# callers distinguish "no model" from other failures.
try:
    from sentence_transformers import SentenceTransformer
    _MODEL = SentenceTransformer(MODEL_PATH or DEFAULT_MODEL_NAME)
    _EMB_DIM = _MODEL.get_sentence_embedding_dimension()
except Exception as e:
    print(f"[FATAL] Could not load embedding model: {e}", file=sys.stderr)
    sys.exit(2)

# ---- SQL helpers ----
# Expected v2.1 layout of the docs table as (name, declared_type) pairs, in
# PRAGMA table_info order; compared verbatim against schema_cols() output.
EXPECTED_COLS = [
    ("id","INTEGER"), ("path","TEXT"), ("title","TEXT"),
    ("profile","TEXT"), ("body","TEXT"), ("emb","BLOB"),
]
# Canonical v2.1 docs DDL (must keep EXPECTED_COLS in sync with this).
CREATE_DOCS_SQL = """
CREATE TABLE docs (
  id INTEGER PRIMARY KEY,
  path    TEXT NOT NULL,
  title   TEXT,
  profile TEXT,
  body    TEXT,
  emb     BLOB
)
"""
# Uniqueness contract: at most one row per (path, profile) pair.
UNIQ_IDX_NAME = "ux_docs_path_profile"
UNIQ_IDX_SQL  = "CREATE UNIQUE INDEX ux_docs_path_profile ON docs(path, profile)"

@contextmanager
def connect(path):
    """Yield a sqlite3 connection to *path* with WAL and pragmas applied.

    Fix: ``with sqlite3.connect(...)`` only manages the *transaction*
    (commit on success, rollback on exception) — it never closes the
    connection, so the handle and WAL sidecar files leaked for the
    process lifetime. We keep the transaction semantics via ``with c:``
    and guarantee closure in ``finally``.
    """
    c = sqlite3.connect(path)
    try:
        with c:
            c.execute("PRAGMA journal_mode=WAL")
            c.execute("PRAGMA synchronous=NORMAL")
            c.execute("PRAGMA foreign_keys=OFF")
            yield c
    finally:
        c.close()

def table_exists(c, name):
    """Return True iff a table called *name* is present in sqlite_master."""
    hit = c.execute(
        "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (name,)
    ).fetchone()
    return hit is not None

def idx_exists(c, name):
    """Return True iff an index called *name* is present in sqlite_master."""
    hit = c.execute(
        "SELECT 1 FROM sqlite_master WHERE type='index' AND name=?", (name,)
    ).fetchone()
    return hit is not None

def schema_cols(c, table):
    """Return [(column_name, declared_type), ...] for *table*.

    Uses PRAGMA table_info; row layout there is (cid, name, type, ...).
    """
    # NOTE(review): *table* is interpolated into the PRAGMA (it cannot be
    # parameterized); callers only pass internal table names here.
    return [(row[1], row[2]) for row in c.execute(f"PRAGMA table_info('{table}')")]

def ensure_meta(c):
    """Create the meta key/value table on first use; no-op afterwards."""
    c.execute("CREATE TABLE IF NOT EXISTS meta (k TEXT PRIMARY KEY, v TEXT)")

def set_meta(c, k, v):
    """Upsert meta[k] = str(v), creating the meta table if needed."""
    # ensure_meta inlined: same CREATE TABLE IF NOT EXISTS statement.
    c.execute("CREATE TABLE IF NOT EXISTS meta (k TEXT PRIMARY KEY, v TEXT)")
    c.execute("INSERT OR REPLACE INTO meta(k,v) VALUES(?,?)", (k, str(v)))

def get_doc_count(c):
    """Return the number of rows in docs, or None if it cannot be counted.

    Deliberately best-effort: a missing/unreadable docs table yields None
    rather than an exception.
    """
    try:
        (count,) = c.execute("SELECT COUNT(*) FROM docs").fetchone()
    except Exception:
        return None
    return count

def ensure_v21_docs(c):
    """Create v2.1 docs if missing, or migrate legacy docs table in-place.

    Returns a status string:
      - "created_empty_v21"       : no docs table existed; fresh v2.1 created
      - "already_v21"             : schema matched; unique index ensured
      - "migrated_legacy_to_v21"  : legacy (id,kind,ref,text,uniq) rows copied
      - "unknown_to_v21_empty"    : unrecognized schema snapshotted aside

    Fix: the legacy migration creates the unique (path, profile) index
    BEFORE copying rows, so duplicate (ref, kind) pairs in the legacy table
    raised IntegrityError and aborted the whole "best_effort" migration.
    INSERT OR IGNORE now skips such duplicates instead.
    """
    if not table_exists(c, "docs"):
        c.execute(CREATE_DOCS_SQL)
        c.execute(UNIQ_IDX_SQL)
        return "created_empty_v21"

    cols = schema_cols(c, "docs")
    if cols == EXPECTED_COLS:
        # Already v2.1; just ensure the uniqueness index exists
        if not idx_exists(c, UNIQ_IDX_NAME):
            c.execute(UNIQ_IDX_SQL)
        return "already_v21"

    # Detect classic legacy layout: id, kind, ref, text, uniq
    legacy_names = [n for n, _t in cols]
    if legacy_names == ["id", "kind", "ref", "text", "uniq"]:
        ts = time.strftime("%Y%m%d_%H%M%S")
        legacy = f"docs_legacy_{ts}"
        print(f"[INFO] Legacy docs detected; migrating -> {legacy}")
        c.execute(f"ALTER TABLE docs RENAME TO {legacy}")
        c.execute(CREATE_DOCS_SQL)
        c.execute(UNIQ_IDX_SQL)
        # Map legacy -> v2.1: ref->path, kind->profile, text->body; embeddings
        # are recomputed later. OR IGNORE keeps this best-effort when the
        # legacy table holds duplicate (ref, kind) pairs; the snapshot table
        # still preserves every original row.
        c.execute(f"""
            INSERT OR IGNORE INTO docs(id, path, title, profile, body, emb)
            SELECT id, ref, NULL, kind, text, NULL FROM {legacy}
        """)
        set_meta(c, "migrated_from", legacy)
        set_meta(c, "migration", "legacy_to_v2_1_best_effort")
        return "migrated_legacy_to_v21"

    # Unknown layout: rebuild safely—keep a snapshot
    ts = time.strftime("%Y%m%d_%H%M%S")
    unknown = f"docs_unknown_{ts}"
    print(f"[WARN] Unknown docs schema {cols}; snapshot -> {unknown} and creating fresh v2.1")
    c.execute(f"ALTER TABLE docs RENAME TO {unknown}")
    c.execute(CREATE_DOCS_SQL)
    c.execute(UNIQ_IDX_SQL)
    set_meta(c, "migrated_from", unknown)
    set_meta(c, "migration", "unknown_to_v2_1_empty")
    return "unknown_to_v21_empty"

def rows_needing_embeddings(c, limit=None):
    """Return a cursor over (id, body) rows whose embedding is still NULL.

    A falsy *limit* (None or 0) means "no limit".
    """
    base = "SELECT id, body FROM docs WHERE emb IS NULL AND body IS NOT NULL"
    if not limit:
        return c.execute(base)
    return c.execute(base + " LIMIT ?", (limit,))

def embed_bodies(texts):
    """Embed *texts* with the module model; return one byte-string per text.

    Empty input short-circuits to an empty list without touching the model.
    """
    if not texts:
        return []
    vectors = _MODEL.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
    # sentence-transformers returns a float32 ndarray; persist raw bytes.
    return [vec.tobytes() for vec in vectors]

def backfill_embeddings(c, batch=512):
    """Embed all rows where emb IS NULL, *batch* rows at a time.

    Commits after each batch. On an embedding failure the error is logged
    and the function returns early with the count completed so far.
    Returns the total number of rows updated.
    """
    done = 0
    while True:
        pending = list(rows_needing_embeddings(c, batch))
        if not pending:
            return done
        try:
            blobs = embed_bodies([body for _doc_id, body in pending])
        except Exception as e:
            print(f"[ERROR] embedding batch failed: {e}", file=sys.stderr)
            return done
        for (doc_id, _body), blob in zip(pending, blobs):
            c.execute("UPDATE docs SET emb=? WHERE id=?", (blob, doc_id))
        done += len(pending)
        c.commit()

def ensure_indexes_and_meta(c):
    """Guarantee the uniqueness index exists and refresh the meta table."""
    if not idx_exists(c, UNIQ_IDX_NAME):
        c.execute(UNIQ_IDX_SQL)
    ensure_meta(c)
    # NOTE(review): "model" always records the default/env model name even
    # when a local EDEN_EMBED_PATH override was loaded; the override itself
    # is captured separately under "model_path".
    for key, val in (
        ("model", DEFAULT_MODEL_NAME),
        ("dim", _EMB_DIM),
        ("model_path", MODEL_PATH or ""),
    ):
        set_meta(c, key, val)
    count = get_doc_count(c)
    if count is not None:
        set_meta(c, "doc_count", count)

def main():
    """Reindex entry point: enforce v2.1 schema, backfill embeddings, update meta."""
    os.makedirs(ROOT, exist_ok=True)

    with connect(IDX) as c:
        # Bring the docs table up to the v2.1 layout (create/migrate as needed).
        ensure_v21_docs(c)
        c.commit()

        # If you have an ingestion pipeline, do it here: add/refresh rows in docs(path,title,profile,body)
        # This template assumes rows already exist or were migrated; we just backfill embeddings.

        embedded = backfill_embeddings(c, batch=256)
        ensure_indexes_and_meta(c)
        c.commit()

        dc = get_doc_count(c) or 0
        print(f"[OK] wrote {IDX} with {dc} docs embedded={embedded}")


if __name__ == "__main__":
    main()
