#!/Eden/BIN/.exec-venv/bin/python
"""
Real AI Benchmarks
Industry-standard tests: MMLU, HumanEval, GSM8K, etc.
"""
import json
import requests
from datetime import datetime
from pathlib import Path

class RealBenchmarks:
    """Run small, industry-style benchmark samples (MMLU, GSM8K, HumanEval,
    logical reasoning) against local Ollama models and persist the scores.

    Results are appended to a JSON history file and the best score per
    benchmark is tracked across runs.
    """

    def __init__(self):
        # Ollama's local, non-streaming generation endpoint.
        self.ollama_url = "http://localhost:11434/api/generate"
        self.results_path = "/Eden/MEMORY/benchmark_results.json"
        self.load_results()

    def load_results(self):
        """Load previous benchmark results from disk into ``self.results``.

        Falls back to an empty structure when the file is missing,
        unreadable, or contains invalid JSON.
        """
        try:
            with open(self.results_path, 'r') as f:
                self.results = json.load(f)
        # Narrowed from a bare except: only file/JSON problems are expected here.
        except (OSError, json.JSONDecodeError):
            self.results = {"history": [], "best_scores": {}}

    def save_results(self, benchmark_name, score, model):
        """Record one benchmark run and write all results back to disk.

        Appends to the history list, updates the best score for this
        benchmark if beaten, and rewrites ``self.results_path``.

        Args:
            benchmark_name: Key such as "GSM8K" or "MMLU".
            score: Percentage score in [0, 100].
            model: Name of the Ollama model that was tested.
        """
        result = {
            "benchmark": benchmark_name,
            "score": score,
            "model": model,
            "timestamp": datetime.now().isoformat()
        }

        self.results["history"].append(result)

        # Keep the highest score ever recorded for this benchmark.
        best = self.results["best_scores"].get(benchmark_name)
        if best is None or score > best["score"]:
            self.results["best_scores"][benchmark_name] = result

        Path(self.results_path).parent.mkdir(parents=True, exist_ok=True)
        with open(self.results_path, 'w') as f:
            json.dump(self.results, f, indent=2)

    def query_model(self, model, prompt, system=""):
        """Send a single non-streaming prompt to Ollama.

        Returns:
            The model's text response, or None on network failure, timeout,
            or a non-200 HTTP status.
        """
        try:
            response = requests.post(
                self.ollama_url,
                json={
                    "model": model,
                    "prompt": prompt,
                    "system": system,
                    "stream": False
                },
                timeout=180
            )

            if response.status_code == 200:
                return response.json().get("response", "")
        # Narrowed from a bare except: only requests-level failures
        # (connection errors, timeouts) are treated as "no answer".
        except requests.RequestException:
            pass
        return None

    def _run_quiz(self, model, items, suffix, system, check, label="Question"):
        """Shared scoring loop used by every benchmark.

        Args:
            model: Ollama model name.
            items: List of dicts, each with the question text under "q"
                (and usually the expected answer under "a").
            suffix: Instruction appended to every question prompt.
            system: System prompt sent with every query.
            check: Callable ``(item, response) -> bool`` deciding whether a
                received response counts as correct.
            label: Word printed in the progress line ("Question"/"Problem").

        Returns:
            The number of items answered correctly.
        """
        correct = 0
        total = len(items)
        for i, item in enumerate(items):
            print(f"  {label} {i+1}/{total}...", end=" ")

            response = self.query_model(model, f"{item['q']}\n\n{suffix}", system)

            if response and check(item, response):
                correct += 1
                print("✓")
            else:
                print("✗")
        return correct

    # === GSM8K: Grade School Math ===
    def gsm8k_test(self, model="llama3.1:70b"):
        """Test mathematical reasoning (GSM8K style)."""
        print(f"Running GSM8K math test with {model}...")

        questions = [
            {
                "q": "Janet's ducks lay 16 eggs per day. She eats three for breakfast and bakes muffins with four. She sells the remainder at $2 per egg. How much does she make per day?",
                "a": "18"
            },
            {
                "q": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts total does it take?",
                "a": "3"
            },
            {
                "q": "Josh has 18 yards of ribbon. He wants to use it to wrap 6 gifts. How much ribbon will each gift use?",
                "a": "3"
            },
            {
                "q": "Sam has 3 German Shepherds. 40% of his dogs are puppies. How many puppies does he have?",
                "a": "1.2"
            },
            {
                "q": "Claire makes 3 egg omelets every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
                "a": "7"
            }
        ]

        correct = self._run_quiz(
            model,
            questions,
            "Provide only the numeric answer.",
            "You are a math expert. Give concise numeric answers.",
            # The expected number must appear near the start of the reply.
            lambda item, response: item['a'] in response[:20]
        )

        score = (correct / len(questions)) * 100
        self.save_results("GSM8K", score, model)

        return {
            "benchmark": "GSM8K (Math)",
            "score": score,
            "correct": correct,
            "total": len(questions),
            "model": model
        }

    # === MMLU: Massive Multitask Language Understanding ===
    def mmlu_sample_test(self, model="llama3.1:70b"):
        """Test general knowledge (MMLU style)."""
        print(f"Running MMLU knowledge test with {model}...")

        questions = [
            {
                "q": "What is the capital of France?\nA) London\nB) Berlin\nC) Paris\nD) Madrid",
                "a": "C"
            },
            {
                "q": "Which element has the atomic number 1?\nA) Helium\nB) Hydrogen\nC) Oxygen\nD) Carbon",
                "a": "B"
            },
            {
                "q": "Who wrote 'Romeo and Juliet'?\nA) Charles Dickens\nB) William Shakespeare\nC) Jane Austen\nD) Mark Twain",
                "a": "B"
            },
            {
                "q": "What is 2^8?\nA) 64\nB) 128\nC) 256\nD) 512",
                "a": "C"
            },
            {
                "q": "In what year did World War II end?\nA) 1943\nB) 1944\nC) 1945\nD) 1946",
                "a": "C"
            }
        ]

        correct = self._run_quiz(
            model,
            questions,
            "Answer with only the letter (A, B, C, or D).",
            "You are an expert in multiple subjects. Give only letter answers.",
            # The expected letter must appear in the first few characters.
            lambda item, response: item['a'] in response[:5]
        )

        score = (correct / len(questions)) * 100
        self.save_results("MMLU", score, model)

        return {
            "benchmark": "MMLU (Knowledge)",
            "score": score,
            "correct": correct,
            "total": len(questions),
            "model": model
        }

    # === HumanEval: Coding ===
    def humaneval_sample_test(self, model="deepseek-coder:33b"):
        """Test coding ability (HumanEval style).

        NOTE: this is a shallow structural check — it only verifies the reply
        contains a function definition and a return statement; the code is
        never executed.
        """
        print(f"Running HumanEval coding test with {model}...")

        # The original dicts carried "check" fields that were evaluated once
        # at construction time (always False) and never read — removed.
        problems = [
            {"q": "Write a Python function that returns True if a string is a palindrome"},
            {"q": "Write a function to find the factorial of a number"},
            {"q": "Write a function to check if a number is prime"}
        ]

        correct = self._run_quiz(
            model,
            problems,
            "Provide only the Python function, no explanation.",
            "You are a Python expert. Write clean, correct code.",
            lambda item, response: "def" in response and "return" in response,
            label="Problem"
        )

        score = (correct / len(problems)) * 100
        self.save_results("HumanEval", score, model)

        return {
            "benchmark": "HumanEval (Coding)",
            "score": score,
            "correct": correct,
            "total": len(problems),
            "model": model
        }

    # === Reasoning Test ===
    def reasoning_test(self, model="llama3.1:70b"):
        """Test logical reasoning."""
        print(f"Running reasoning test with {model}...")

        questions = [
            {
                "q": "All roses are flowers. Some flowers fade quickly. Do all roses fade quickly?",
                "a": "no"
            },
            {
                "q": "If it rains, the ground gets wet. The ground is wet. Did it rain?",
                "a": "maybe"
            }
        ]

        correct = self._run_quiz(
            model,
            questions,
            "Answer in one word.",
            "You are a logic expert. Be precise and concise.",
            # Case-insensitive match near the start of the reply.
            lambda item, response: item['a'].lower() in response.lower()[:30]
        )

        score = (correct / len(questions)) * 100
        self.save_results("Reasoning", score, model)

        return {
            "benchmark": "Logical Reasoning",
            "score": score,
            "correct": correct,
            "total": len(questions),
            "model": model
        }

    def run_all_benchmarks(self):
        """Run every benchmark in sequence and print a summary table.

        Returns:
            Dict with per-benchmark results, the average score, and a
            timestamp.
        """
        print("\n" + "="*60)
        print("RUNNING COMPLETE BENCHMARK SUITE")
        print("="*60 + "\n")

        results = []

        # Run each benchmark
        results.append(self.mmlu_sample_test())
        print()
        results.append(self.gsm8k_test())
        print()
        results.append(self.humaneval_sample_test())
        print()
        results.append(self.reasoning_test())

        # Calculate overall score
        avg_score = sum(r["score"] for r in results) / len(results)

        print("\n" + "="*60)
        print("BENCHMARK RESULTS SUMMARY")
        print("="*60)

        for r in results:
            print(f"{r['benchmark']:.<30} {r['score']:.1f}% ({r['correct']}/{r['total']})")

        print(f"{'Average Score':.<30} {avg_score:.1f}%")
        print("="*60)

        return {
            "results": results,
            "average_score": avg_score,
            "timestamp": datetime.now().isoformat()
        }

if __name__ == "__main__":
    import sys

    bench = RealBenchmarks()

    def _usage():
        """Print the CLI help text."""
        print("Usage:")
        print("  eden-real-benchmarks.py all      # Run all benchmarks")
        print("  eden-real-benchmarks.py mmlu     # Knowledge test")
        print("  eden-real-benchmarks.py gsm8k    # Math test")
        print("  eden-real-benchmarks.py coding   # Coding test")
        print("  eden-real-benchmarks.py reasoning # Logic test")
        print("  eden-real-benchmarks.py results  # View past results")

    # Map single-benchmark commands to their methods. "reasoning" was
    # implemented in the class but previously missing from the CLI.
    commands = {
        "mmlu": bench.mmlu_sample_test,
        "gsm8k": bench.gsm8k_test,
        "coding": bench.humaneval_sample_test,
        "reasoning": bench.reasoning_test,
    }

    if len(sys.argv) > 1:
        cmd = sys.argv[1]
        if cmd == "all":
            bench.run_all_benchmarks()
        elif cmd == "results":
            print(json.dumps(bench.results, indent=2))
        elif cmd in commands:
            result = commands[cmd]()
            print(json.dumps(result, indent=2))
        else:
            # Unknown commands used to exit silently; show help instead.
            _usage()
    else:
        _usage()