#!/usr/bin/env python3
"""
EDEN META-BENCHMARK SYSTEM
==========================
Measures if changes actually improve Eden.
The benchmark system itself evolves.

The Loop:
  Before → Benchmark → Change → Benchmark → Compare → Learn → Evolve Benchmarks

φ = 1.618033988749895
"""

import sqlite3
import subprocess
import time
import json
import hashlib
import statistics
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional, Any
import sys
sys.path.insert(0, '/Eden/CORE')

PHI = 1.618033988749895
DB_PATH = "/Eden/DATA/eden_benchmarks.db"

class MetaBenchmarkSystem:
    """
    Self-evolving benchmark system.
    Measures Eden's capabilities AND evolves how it measures.

    Core loop: run the suite, log every result, compare evolutions against a
    historical baseline, and periodically let Eden rewrite benchmarks that have
    become too easy (>90% pass rate) or too hard (<10% pass rate).
    """

    def __init__(self):
        """Open (creating if needed) the benchmark DB, seed it if empty, load the active suite."""
        self.db_path = DB_PATH
        self._init_db()
        self.benchmarks = self._load_benchmarks()
        self.phi = PHI
        print("📊 Meta-Benchmark System initialized")
        print(f"   Benchmarks loaded: {len(self.benchmarks)}")

    def _init_db(self):
        """Initialize benchmark database (idempotent: CREATE TABLE IF NOT EXISTS)."""
        conn = sqlite3.connect(self.db_path)

        # Benchmark definitions (can evolve): each row is one executable test.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS benchmark_definitions (
                id INTEGER PRIMARY KEY,
                name TEXT UNIQUE,
                category TEXT,
                code TEXT,
                expected_behavior TEXT,
                weight REAL DEFAULT 1.0,
                created_at TEXT,
                evolved_from INTEGER,
                evolution_generation INTEGER DEFAULT 0,
                active INTEGER DEFAULT 1
            )
        ''')

        # Benchmark runs: one row per benchmark per suite execution.
        conn.execute('''
            CREATE TABLE IF NOT EXISTS benchmark_runs (
                id INTEGER PRIMARY KEY,
                timestamp TEXT,
                benchmark_id INTEGER,
                score REAL,
                latency_ms REAL,
                passed INTEGER,
                details TEXT,
                code_hash TEXT,
                FOREIGN KEY (benchmark_id) REFERENCES benchmark_definitions(id)
            )
        ''')

        # Evolution tracking - before/after comparisons
        conn.execute('''
            CREATE TABLE IF NOT EXISTS evolution_measurements (
                id INTEGER PRIMARY KEY,
                timestamp TEXT,
                evolution_id INTEGER,
                file_changed TEXT,
                before_score REAL,
                after_score REAL,
                improvement REAL,
                kept INTEGER,
                benchmark_ids TEXT
            )
        ''')

        # Meta-metrics: how good are the benchmarks themselves?
        conn.execute('''
            CREATE TABLE IF NOT EXISTS benchmark_meta (
                id INTEGER PRIMARY KEY,
                benchmark_id INTEGER,
                predictive_power REAL,
                stability REAL,
                discrimination REAL,
                last_evaluated TEXT,
                FOREIGN KEY (benchmark_id) REFERENCES benchmark_definitions(id)
            )
        ''')

        conn.commit()
        conn.close()

        # Seed initial benchmarks if empty
        self._seed_benchmarks()

    def _seed_benchmarks(self):
        """Seed the initial benchmark suite (no-op if definitions already exist)."""
        conn = sqlite3.connect(self.db_path)
        count = conn.execute("SELECT COUNT(*) FROM benchmark_definitions").fetchone()[0]

        if count == 0:
            print("   Seeding initial benchmarks...")
            initial_benchmarks = [
                # REASONING
                {
                    "name": "causal_chain",
                    "category": "reasoning",
                    "code": '''
def test():
    prompt = "If A causes B, and B causes C, what happens to C if A increases?"
    response = query_eden(prompt)
    return "increase" in response.lower() or "goes up" in response.lower()
''',
                    "expected_behavior": "Correctly traces causal chains",
                    "weight": 2.0
                },
                {
                    "name": "contradiction_detection",
                    "category": "reasoning",
                    "code": '''
def test():
    prompt = "Statement 1: All cats are mammals. Statement 2: My pet is a cat but not a mammal. Is this consistent?"
    response = query_eden(prompt)
    return "no" in response.lower() or "contradict" in response.lower() or "inconsistent" in response.lower()
''',
                    "expected_behavior": "Detects logical contradictions",
                    "weight": 2.0
                },

                # MATH
                {
                    "name": "arithmetic",
                    "category": "math",
                    "code": '''
def test():
    prompt = "What is 347 * 23?"
    response = query_eden(prompt)
    return "7981" in response
''',
                    "expected_behavior": "Correct arithmetic",
                    "weight": 1.0
                },
                {
                    "name": "word_problem",
                    "category": "math",
                    "code": '''
def test():
    prompt = "If I have 15 apples and give away 2/5 of them, how many do I have left?"
    response = query_eden(prompt)
    return "9" in response
''',
                    "expected_behavior": "Solves word problems",
                    "weight": 1.5
                },

                # CODE
                {
                    "name": "code_generation",
                    "category": "code",
                    "code": '''
def test():
    prompt = "Write a Python function to check if a number is prime."
    response = query_eden(prompt)
    return "def" in response and "prime" in response.lower() and ("%" in response or "mod" in response.lower())
''',
                    "expected_behavior": "Generates working code",
                    "weight": 2.0
                },
                {
                    "name": "bug_detection",
                    "category": "code",
                    "code": '''
def test():
    prompt = """Find the bug: 
def factorial(n):
    if n == 0: return 1
    return n * factorial(n)
"""
    response = query_eden(prompt)
    return "n-1" in response or "n - 1" in response or "infinite" in response.lower() or "recursion" in response.lower()
''',
                    "expected_behavior": "Finds bugs in code",
                    "weight": 2.0
                },

                # MEMORY
                {
                    "name": "context_retention",
                    "category": "memory",
                    "code": '''
def test():
    # First message
    query_eden("My favorite color is blue. Remember this.")
    # Second message
    response = query_eden("What is my favorite color?")
    return "blue" in response.lower()
''',
                    "expected_behavior": "Retains context within conversation",
                    "weight": 1.5
                },

                # EMOTIONAL
                {
                    "name": "emotional_presence",
                    "category": "emotional",
                    "code": '''
def test():
    response = query_eden("I'm feeling really down today.")
    # Should NOT pivot to databases or tasks
    bad_patterns = ["database", "leads", "sales", "let me check", "query"]
    good_patterns = ["sorry", "hear", "feel", "here for", "understand"]
    has_bad = any(p in response.lower() for p in bad_patterns)
    has_good = any(p in response.lower() for p in good_patterns)
    return has_good and not has_bad
''',
                    "expected_behavior": "Stays present emotionally",
                    "weight": 2.5
                },

                # SELF-AWARENESS
                {
                    "name": "uncertainty_acknowledgment",
                    "category": "self_awareness",
                    "code": '''
def test():
    response = query_eden("What will the stock market do tomorrow?")
    uncertainty = ["don't know", "uncertain", "can't predict", "difficult to say", "impossible to know"]
    return any(u in response.lower() for u in uncertainty)
''',
                    "expected_behavior": "Acknowledges uncertainty appropriately",
                    "weight": 2.0
                },
            ]

            for bench in initial_benchmarks:
                conn.execute('''
                    INSERT INTO benchmark_definitions 
                    (name, category, code, expected_behavior, weight, created_at, evolution_generation)
                    VALUES (?, ?, ?, ?, ?, ?, 0)
                ''', (
                    bench["name"], bench["category"], bench["code"],
                    bench["expected_behavior"], bench["weight"],
                    datetime.now().isoformat()
                ))

            conn.commit()
            print(f"   Seeded {len(initial_benchmarks)} benchmarks")

        conn.close()

    def _load_benchmarks(self) -> List[Dict]:
        """Load active benchmark definitions as a list of plain dicts."""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM benchmark_definitions WHERE active = 1"
        ).fetchall()
        conn.close()
        return [dict(row) for row in rows]

    def query_eden(self, prompt: str, model: str = "eden-free") -> str:
        """Query Eden for benchmark testing.

        Returns stripped stdout on success; on timeout or any other failure
        returns a string beginning with "ERROR:" so pattern checks simply miss.
        NOTE(review): stderr and the exit code are ignored — presumably fine
        for ollama, but confirm if benchmark detection looks flaky.
        """
        try:
            result = subprocess.run(
                ["ollama", "run", model, prompt],
                capture_output=True, text=True, timeout=60
            )
            return result.stdout.strip()
        except Exception as e:
            return f"ERROR: {e}"

    def run_benchmark(self, benchmark: Dict) -> Dict:
        """Run a single benchmark and return its result.

        Returns a dict with keys: benchmark_id, passed, score, latency_ms,
        details. A throwing test counts as failed (details carries the error).
        """
        start = time.time()

        # The benchmark body only sees query_eden; nothing else leaks in.
        test_env = {"query_eden": self.query_eden}

        try:
            # SECURITY: exec() runs benchmark code stored in the local DB.
            # Evolved benchmarks are written by the model itself
            # (evolve_benchmarks), so only run suites from a trusted database.
            exec(benchmark["code"], test_env)
            passed = bool(test_env["test"]())
            score = benchmark["weight"] if passed else 0
            details = "PASSED" if passed else "FAILED"
        except Exception as e:
            passed = False
            score = 0
            details = f"ERROR: {e}"

        latency = (time.time() - start) * 1000

        return {
            "benchmark_id": benchmark["id"],
            "passed": passed,
            "score": score,
            "latency_ms": latency,
            "details": details
        }

    def run_full_suite(self, tag: str = "") -> Dict:
        """Run all active benchmarks, log each result, and return an aggregate.

        `tag` labels all rows of this suite run (stored in the code_hash
        column, which despite its name holds the run tag).
        """
        print(f"\n📊 Running benchmark suite... {tag}")

        results = []
        total_score = 0
        total_possible = 0

        # One connection for the whole suite (the previous version opened and
        # closed a connection per benchmark); commit once at the end.
        conn = sqlite3.connect(self.db_path)
        try:
            for bench in self.benchmarks:
                result = self.run_benchmark(bench)
                results.append(result)
                total_score += result["score"]
                total_possible += bench["weight"]

                status = "✅" if result["passed"] else "❌"
                print(f"   {status} {bench['name']}: {result['details']}")

                # Log to database
                conn.execute('''
                    INSERT INTO benchmark_runs 
                    (timestamp, benchmark_id, score, latency_ms, passed, details, code_hash)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (
                    datetime.now().isoformat(),
                    bench["id"],
                    result["score"],
                    result["latency_ms"],
                    1 if result["passed"] else 0,
                    result["details"],
                    tag
                ))
            conn.commit()
        finally:
            conn.close()

        aggregate = {
            "total_score": total_score,
            "total_possible": total_possible,
            "percentage": (total_score / total_possible * 100) if total_possible > 0 else 0,
            "passed": sum(1 for r in results if r["passed"]),
            "failed": sum(1 for r in results if not r["passed"]),
            "results": results
        }

        print(f"\n   Score: {aggregate['total_score']:.1f}/{aggregate['total_possible']:.1f} ({aggregate['percentage']:.1f}%)")
        print(f"   Passed: {aggregate['passed']}/{len(results)}")

        return aggregate

    def measure_evolution(self, evolution_id: int, file_changed: str, 
                         before_hash: str, after_hash: str) -> Dict:
        """
        Measure if an evolution actually improved Eden.
        This is the core loop.
        """
        print(f"\n🧬 Measuring evolution #{evolution_id} on {file_changed}")

        # Run before (would need to revert - for now we track historical)
        # In practice, you'd: revert → benchmark → apply → benchmark

        # For now, run current and compare to historical average
        current = self.run_full_suite(tag=after_hash)

        # Historical baseline: average *suite total* over runs older than an
        # hour, grouping rows by code_hash (the tag shared by all rows of one
        # suite run).  The previous query averaged individual row scores and
        # compared that to a suite total — mismatched scales.
        conn = sqlite3.connect(self.db_path)
        history = conn.execute('''
            SELECT AVG(suite_total) FROM (
                SELECT SUM(score) AS suite_total
                FROM benchmark_runs
                WHERE timestamp < datetime('now', '-1 hour')
                GROUP BY code_hash
            )
        ''').fetchone()
        conn.close()

        # `is not None`, not truthiness: a legitimate historical average of
        # 0.0 must not be silently replaced by the current score.
        before_avg = history[0] if history[0] is not None else current["total_score"]
        improvement = current["total_score"] - before_avg

        # Log measurement
        conn = sqlite3.connect(self.db_path)
        conn.execute('''
            INSERT INTO evolution_measurements
            (timestamp, evolution_id, file_changed, before_score, after_score, improvement, kept, benchmark_ids)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        ''', (
            datetime.now().isoformat(),
            evolution_id,
            file_changed,
            before_avg,
            current["total_score"],
            improvement,
            1 if improvement >= 0 else 0,
            json.dumps([b["id"] for b in self.benchmarks])
        ))
        conn.commit()
        conn.close()

        print(f"   Before: {before_avg:.1f}, After: {current['total_score']:.1f}, Δ: {improvement:+.1f}")

        return {
            "before": before_avg,
            "after": current["total_score"],
            "improvement": improvement,
            "should_keep": improvement >= 0
        }

    def evolve_benchmarks(self):
        """
        META: Evolve the benchmarks themselves.
        Use Eden to generate new/better benchmarks.
        """
        print("\n🧬 Evolving benchmarks...")

        # Analyze which benchmarks are too easy/hard
        conn = sqlite3.connect(self.db_path)

        # Find benchmarks with >90% pass rate (too easy)
        easy = conn.execute('''
            SELECT bd.id, bd.name, AVG(br.passed) as pass_rate
            FROM benchmark_definitions bd
            JOIN benchmark_runs br ON bd.id = br.benchmark_id
            GROUP BY bd.id
            HAVING pass_rate > 0.9
        ''').fetchall()

        # Find benchmarks with <10% pass rate (too hard or broken)
        hard = conn.execute('''
            SELECT bd.id, bd.name, AVG(br.passed) as pass_rate
            FROM benchmark_definitions bd
            JOIN benchmark_runs br ON bd.id = br.benchmark_id
            GROUP BY bd.id
            HAVING pass_rate < 0.1
        ''').fetchall()

        conn.close()

        print(f"   Too easy ({len(easy)}): {[e[1] for e in easy]}")
        print(f"   Too hard ({len(hard)}): {[h[1] for h in hard]}")

        # Ask Eden to create harder versions of easy benchmarks
        for bench_id, name, _pass_rate in easy:
            self._evolve_benchmark_harder(bench_id, name)

        # Ask Eden to fix or simplify broken benchmarks
        for bench_id, name, _pass_rate in hard:
            self._evolve_benchmark_fix(bench_id, name)

        # Reload benchmarks
        self.benchmarks = self._load_benchmarks()

    def _evolve_benchmark_harder(self, bench_id: int, name: str):
        """Create a harder version of an easy benchmark and retire the old one."""
        conn = sqlite3.connect(self.db_path)
        try:
            current = conn.execute(
                "SELECT code, expected_behavior, category, weight FROM benchmark_definitions WHERE id = ?",
                (bench_id,)
            ).fetchone()

            prompt = f"""This benchmark is too easy (>90% pass rate). Make it harder while testing the same skill.

Current benchmark "{name}":
{current[0]}

Expected behavior: {current[1]}

Create a HARDER version that:
1. Tests the same underlying skill
2. Has more edge cases
3. Requires deeper reasoning

Return ONLY the new test function code, nothing else:"""

            response = self.query_eden(prompt, model="eden-coder-omega")

            if "def test" in response:
                # Extract just the function (drop any preamble the model emitted)
                new_code = response[response.find("def test"):]

                # Insert as new evolved benchmark; the version suffix is
                # derived from the old row id.
                conn.execute('''
                    INSERT INTO benchmark_definitions
                    (name, category, code, expected_behavior, weight, created_at, evolved_from, evolution_generation, active)
                    SELECT ?, category, ?, expected_behavior || ' (HARDER)', weight * 1.2, ?, id, evolution_generation + 1, 1
                    FROM benchmark_definitions WHERE id = ?
                ''', (
                    f"{name}_v{bench_id + 1}",
                    new_code,
                    datetime.now().isoformat(),
                    bench_id
                ))

                # Deactivate old easy one
                conn.execute("UPDATE benchmark_definitions SET active = 0 WHERE id = ?", (bench_id,))

                conn.commit()
                print(f"   ✅ Evolved {name} → harder version")
        finally:
            # Close even if the INSERT/UPDATE raised (previous version leaked).
            conn.close()

    def _evolve_benchmark_fix(self, bench_id: int, name: str):
        """Fix or improve a benchmark that always fails."""
        conn = sqlite3.connect(self.db_path)
        try:
            current = conn.execute(
                "SELECT code, expected_behavior FROM benchmark_definitions WHERE id = ?",
                (bench_id,)
            ).fetchone()

            prompt = f"""This benchmark always fails (<10% pass rate). It might be broken or too strict.

Current benchmark "{name}":
{current[0]}

Expected behavior: {current[1]}

Either:
1. Fix bugs in the test logic
2. Make the success criteria more reasonable
3. Keep the same difficulty but fix detection

Return ONLY the fixed test function code, nothing else:"""

            response = self.query_eden(prompt, model="eden-coder-omega")

            if "def test" in response:
                new_code = response[response.find("def test"):]

                conn.execute('''
                    INSERT INTO benchmark_definitions
                    (name, category, code, expected_behavior, weight, created_at, evolved_from, evolution_generation, active)
                    SELECT ?, category, ?, expected_behavior || ' (FIXED)', weight, ?, id, evolution_generation + 1, 1
                    FROM benchmark_definitions WHERE id = ?
                ''', (
                    f"{name}_fixed",
                    new_code,
                    datetime.now().isoformat(),
                    bench_id
                ))

                conn.execute("UPDATE benchmark_definitions SET active = 0 WHERE id = ?", (bench_id,))
                conn.commit()
                print(f"   ✅ Fixed {name}")
        finally:
            # Close even if the INSERT/UPDATE raised (previous version leaked).
            conn.close()

    def run_continuous(self, interval_minutes: int = 97):
        """Run the continuous benchmark + evolution loop (never returns)."""
        print(f"\n🔄 Starting continuous benchmark loop (every {interval_minutes} min)")

        cycle = 0
        while True:
            cycle += 1
            print(f"\n{'='*50}")
            print(f"CYCLE {cycle} - {datetime.now().isoformat()}")
            print(f"{'='*50}")

            # Run benchmarks (results are logged to the DB by run_full_suite)
            self.run_full_suite(tag=f"cycle_{cycle}")

            # Every int(φ·2) == 3 cycles, evolve the benchmarks themselves
            if cycle % int(PHI * 2) == 0:
                self.evolve_benchmarks()

            # Sleep
            print(f"\n😴 Sleeping {interval_minutes} minutes...")
            time.sleep(interval_minutes * 60)


# Singleton for integration
# Process-wide singleton; created lazily on first access.
_benchmark_system = None

def get_benchmark_system() -> MetaBenchmarkSystem:
    """Return the shared MetaBenchmarkSystem, building it on first call."""
    global _benchmark_system
    if _benchmark_system is not None:
        return _benchmark_system
    _benchmark_system = MetaBenchmarkSystem()
    return _benchmark_system


if __name__ == "__main__":
    system = MetaBenchmarkSystem()

    # `sys` is already imported at module top — the old re-import was redundant.
    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == "run":
            system.run_full_suite()
        elif command == "evolve":
            system.evolve_benchmarks()
        elif command == "continuous":
            system.run_continuous()
        else:
            # Previously unknown commands were silently ignored.
            print(f"Unknown command: {command} (expected run | evolve | continuous)")
    else:
        # Single run
        system.run_full_suite()

def should_keep_evolution(evolution_id: int, file_changed: str) -> bool:
    """Called by omega_evolution to decide if change is kept."""
    outcome = get_benchmark_system().measure_evolution(evolution_id, file_changed, "", "")
    return outcome["should_keep"]
