"""
🌀 EDEN PHI-REASONING BENCHMARK SUITE
Tests whether deeper phi-fractal architecture improves reasoning capacity
"""

import json
import re
import time
from datetime import datetime
from typing import List, Dict, Tuple

import requests

# ============================================================================
# BENCHMARK PROBLEMS
# ============================================================================

# Deductive-logic questions. Each row below is
# (id, question, correct_answer, explanation) and is expanded into a dict
# carrying exactly those four keys, in that order.
LOGIC_PROBLEMS = [
    dict(zip(("id", "question", "correct_answer", "explanation"), row))
    for row in (
        (
            "logic_1",
            "All roses are flowers. Some flowers fade quickly. Can we conclude that some roses fade quickly?",
            "No",
            "The statement doesn't establish a connection between roses and flowers that fade quickly.",
        ),
        (
            "logic_2",
            "If it's raining, the ground is wet. The ground is wet. Is it raining?",
            "Not necessarily",
            "Wet ground could have other causes (sprinkler, spill, etc.)",
        ),
        (
            "logic_3",
            "All cats are mammals. All mammals are animals. Therefore, all cats are ___?",
            "animals",
            "Transitive property of logical statements.",
        ),
        (
            "logic_4",
            "If A > B and B > C, what is the relationship between A and C?",
            "A > C",
            "Transitive property of inequality.",
        ),
        (
            "logic_5",
            "A man says: 'I always lie.' Is this statement true or false?",
            "Paradox/Neither",
            "Classic liar's paradox - the statement is self-contradictory.",
        ),
        (
            "logic_6",
            "If all Bloops are Razzles and all Razzles are Lazzles, are all Bloops definitely Lazzles?",
            "Yes",
            "Transitive property applies even with nonsense words.",
        ),
        (
            "logic_7",
            "Some doctors are rich. All rich people are happy. What can we conclude about doctors?",
            "Some doctors are happy",
            "The overlap between doctors who are rich means those doctors are happy.",
        ),
        (
            "logic_8",
            "If P implies Q, and Q implies R, what can we conclude about P and R?",
            "P implies R",
            "Transitive property of logical implication.",
        ),
        (
            "logic_9",
            "A bag contains only red and blue marbles. If I pick a marble that is not red, what color is it?",
            "Blue",
            "Process of elimination with only two options.",
        ),
        (
            "logic_10",
            "If 'No mammals can fly' is false, what does that tell us?",
            "At least one mammal can fly",
            "Negation of universal negative statement.",
        ),
    )
]

# Arithmetic and basic-algebra questions. Each row below is
# (id, question, correct_answer, explanation) and is expanded into a dict
# carrying exactly those four keys, in that order.
MATH_PROBLEMS = [
    dict(zip(("id", "question", "correct_answer", "explanation"), row))
    for row in (
        (
            "math_1",
            "If x + 5 = 12, what is x?",
            "7",
            "12 - 5 = 7",
        ),
        (
            "math_2",
            "What is 15% of 200?",
            "30",
            "0.15 × 200 = 30",
        ),
        (
            "math_3",
            "If a train travels 60 miles in 1 hour, how far does it travel in 2.5 hours at the same speed?",
            "150 miles",
            "60 × 2.5 = 150",
        ),
        (
            "math_4",
            "What is the next number in the sequence: 2, 4, 8, 16, ___?",
            "32",
            "Each number is double the previous (powers of 2).",
        ),
        (
            "math_5",
            "If 3 apples cost $6, how much do 7 apples cost?",
            "$14",
            "Each apple is $2, so 7 × $2 = $14",
        ),
        (
            "math_6",
            "What is 1/4 + 1/4?",
            "1/2 or 0.5",
            "1/4 + 1/4 = 2/4 = 1/2",
        ),
        (
            "math_7",
            "If a rectangle has length 8 and width 5, what is its area?",
            "40",
            "Area = length × width = 8 × 5 = 40",
        ),
        (
            "math_8",
            "What is 25% of 80?",
            "20",
            "0.25 × 80 = 20",
        ),
        (
            "math_9",
            "If y = 2x + 3, and x = 4, what is y?",
            "11",
            "y = 2(4) + 3 = 8 + 3 = 11",
        ),
        (
            "math_10",
            "What is the average of 10, 20, and 30?",
            "20",
            "(10 + 20 + 30) / 3 = 60 / 3 = 20",
        ),
    )
]

# Cause-and-effect questions, including correlation-vs-causation traps.
# Each row below is (id, question, correct_answer, explanation) and is
# expanded into a dict carrying exactly those four keys, in that order.
CAUSAL_REASONING = [
    dict(zip(("id", "question", "correct_answer", "explanation"), row))
    for row in (
        (
            "causal_1",
            "A plant dies. You notice it was in a dark closet. What likely caused the plant's death?",
            "Lack of light/photosynthesis",
            "Plants need light for photosynthesis to survive.",
        ),
        (
            "causal_2",
            "Ice cream sales and drowning deaths both increase in summer. Does ice cream cause drowning?",
            "No",
            "Correlation doesn't imply causation. Both are caused by hot weather/summer season.",
        ),
        (
            "causal_3",
            "A car won't start. The battery is dead. What likely happened?",
            "Battery lost charge/needs replacement",
            "Dead battery is direct cause of car not starting.",
        ),
        (
            "causal_4",
            "Every time you water the plant, it grows. If you stop watering it, what will happen?",
            "It will stop growing/wilt/die",
            "Water is necessary for plant growth.",
        ),
        (
            "causal_5",
            "Cities with more hospitals have higher death rates. Should we close hospitals to reduce deaths?",
            "No",
            "Reverse causation - sick people go to hospitals, hospitals don't cause sickness.",
        ),
        (
            "causal_6",
            "A window is broken and there's a baseball nearby. What likely happened?",
            "The baseball broke the window",
            "Direct physical causation - baseball impact broke glass.",
        ),
        (
            "causal_7",
            "Students who take notes get better grades. Does note-taking cause better grades?",
            "Likely yes, but could be confounded",
            "Plausible direct causation, though motivated students might both take notes AND study more.",
        ),
        (
            "causal_8",
            "A domino falls and knocks over the next domino. What caused the second domino to fall?",
            "The first domino hitting it",
            "Direct mechanical causation in chain reaction.",
        ),
        (
            "causal_9",
            "You touch a hot stove and pull your hand away quickly. What caused you to pull your hand away?",
            "Pain/heat sensation",
            "Reflex response to harmful stimulus.",
        ),
        (
            "causal_10",
            "Countries with more fire trucks have more fires. Do fire trucks cause fires?",
            "No",
            "Reverse causation - areas with more fires need more fire trucks.",
        ),
    )
]

# ============================================================================
# BENCHMARK RUNNER
# ============================================================================

class EdenReasoningBenchmark:
    """Send benchmark questions to an Eden chat endpoint and score the replies.

    Results accumulate in ``self.results`` under the keys ``timestamp``,
    ``logic``, ``math``, ``causal`` and ``summary``.
    """

    # Prefix of the pseudo-answer query_eden() returns when a request fails.
    # score_answer() checks for it so a failure is never scored as an answer
    # (e.g. "ERROR: 404" must not count as the correct answer "40").
    ERROR_PREFIX = "ERROR: "

    # Score cut-offs shared by the per-question verdict and the summary.
    CORRECT_THRESHOLD = 0.9
    PARTIAL_THRESHOLD = 0.4

    # Result categories, in the order they are run and reported.
    CATEGORIES = ("logic", "math", "causal")

    # A standalone decimal ("7", "-3", "0.5") or simple fraction ("1/2").
    # The look-arounds reject digits glued to letters ("2x") and the
    # fractional tail of a larger number.
    _NUMBER_RE = re.compile(
        r"(?<![\w.])(-?\d+(?:\.\d+)?)(?:/(\d+(?:\.\d+)?))?(?!\w)"
    )

    # Separators between alternative answers in an answer key: the word
    # "or", or a "/" that sits between non-digits (so "1/2" stays intact).
    _ALTERNATIVE_SPLIT_RE = re.compile(
        r"\s+or\s+|(?<=[^\d\s])\s*/\s*(?=[^\d\s])"
    )

    # A word made of letters, optionally with inner apostrophes ("doesn't").
    _WORD_RE = re.compile(r"[^\W\d_]+(?:'[^\W\d_]+)*")

    def __init__(self, eden_api_url="http://localhost:5017", timeout=60,
                 pause_seconds=1.0):
        """Create a benchmark runner.

        Args:
            eden_api_url: Base URL of the Eden service.
            timeout: Seconds to wait for each chat request.
            pause_seconds: Delay after each question; 0 disables it.
        """
        self.eden_api_url = eden_api_url
        self.timeout = timeout
        self.pause_seconds = pause_seconds
        self.results = {
            "timestamp": datetime.now().isoformat(),
            "logic": [],
            "math": [],
            "causal": [],
            "summary": {}
        }

    def query_eden(self, question: str) -> Tuple[str, float]:
        """Query Eden and return her response text plus the elapsed seconds.

        Failures never raise: a non-200 status, a transport error or an
        unusable payload is returned as a string starting with
        ``ERROR_PREFIX``. The elapsed time is measured on every path so
        failed or timed-out requests are not reported as taking 0 seconds.
        """
        started = time.perf_counter()
        try:
            response = requests.post(
                f"{self.eden_api_url}/api/chat",
                json={"message": question},
                timeout=self.timeout
            )
            if response.status_code != 200:
                answer = f"{self.ERROR_PREFIX}{response.status_code}"
            else:
                data = response.json()
                if isinstance(data, dict):
                    raw = data.get('response', '')
                    # A null or non-string payload would otherwise crash
                    # the scorer, which lower-cases the answer.
                    answer = "" if raw is None else str(raw)
                else:
                    answer = f"{self.ERROR_PREFIX}unexpected response payload"
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            answer = f"{self.ERROR_PREFIX}{e}"

        return answer, time.perf_counter() - started

    @classmethod
    def _extract_numbers(cls, text: str) -> List[float]:
        """Return every standalone number in *text*, evaluating fractions."""
        values = []
        for match in cls._NUMBER_RE.finditer(text):
            value = float(match.group(1))
            denominator = match.group(2)
            if denominator is not None:
                divisor = float(denominator)
                if divisor == 0:
                    continue  # "1/0" is not a usable value
                value /= divisor
            values.append(value)
        return values

    @classmethod
    def _accepted_phrases(cls, correct_lower: str) -> List[str]:
        """Return the full answer key plus each alternative it lists."""
        whole = correct_lower.strip()
        if not whole:
            return []
        phrases = [whole]
        pieces = [p.strip() for p in cls._ALTERNATIVE_SPLIT_RE.split(whole)]
        pieces = [p for p in pieces if p]
        if len(pieces) > 1:
            phrases.extend(pieces)
        return phrases

    @staticmethod
    def _contains_phrase(text: str, phrase: str) -> bool:
        """Return True if *phrase* occurs in *text* as whole words.

        Word boundaries are enforced only on edges of the phrase that are
        alphanumeric, so "$14" still matches; a digit edge additionally
        refuses to match inside a decimal ("$14" does not match "$14.50").
        """
        if not phrase:
            return False
        first, last = phrase[0], phrase[-1]
        head = r"(?<!\w)" if first.isalnum() else ""
        if first.isdigit():
            head += r"(?<!\d\.)"
        tail = r"(?!\w)" if last.isalnum() else ""
        if last.isdigit():
            tail += r"(?!\.\d)"
        return re.search(head + re.escape(phrase) + tail, text) is not None

    def score_answer(self, eden_answer: str, correct_answer: str, explanation: str) -> Dict:
        """Score Eden's answer against the correct answer.

        Checks, in order:
          1. Empty answers and failed queries score 0.0.
          2. Direct match: the answer key, or one of the alternatives it
             lists with "/" or "or", appears as whole words (1.0).
          3. Numeric match: any number in the answer is within 0.01 of a
             number in the answer key (1.0).
          4. Partial credit: at least two distinct explanation words longer
             than four characters occur in the answer (0.5).

        This is a keyword heuristic; it does not detect a contradicted
        answer such as "yes, there is no doubt".

        Returns:
            Dict with ``score`` (0.0, 0.5 or 1.0) and ``reasoning``.
        """
        answer_text = "" if eden_answer is None else str(eden_answer)
        if not answer_text.strip() or answer_text.startswith(self.ERROR_PREFIX):
            return {"score": 0.0, "reasoning": "No answer to score"}

        eden_lower = answer_text.lower()
        correct_lower = correct_answer.lower()

        # Direct match
        for phrase in self._accepted_phrases(correct_lower):
            if self._NUMBER_RE.fullmatch(phrase):
                # Pure numbers are compared by value below, so "7" cannot
                # match "-7" or "17" textually.
                continue
            if self._contains_phrase(eden_lower, phrase):
                return {"score": 1.0, "reasoning": "Direct match found"}

        # Numeric match (for math problems)
        expected_numbers = self._extract_numbers(correct_lower)
        if expected_numbers:
            given_numbers = self._extract_numbers(eden_lower)
            if any(
                abs(given - expected) < 0.01
                for given in given_numbers
                for expected in expected_numbers
            ):
                return {"score": 1.0, "reasoning": "Numeric match"}

        # Partial credit for reasoning
        keywords = {
            word
            for word in self._WORD_RE.findall(explanation.lower())
            if len(word) > 4
        }
        matches = sum(1 for word in keywords if word in eden_lower)

        if matches >= 2:
            return {"score": 0.5, "reasoning": "Partial reasoning detected"}

        return {"score": 0.0, "reasoning": "No match found"}

    def run_benchmark_set(self, problems: List[Dict], category: str):
        """Ask every problem in *problems* and return one result dict each.

        Args:
            problems: Dicts with ``id``, ``question``, ``correct_answer``
                and ``explanation`` keys.
            category: Label used only in the printed banner.

        Returns:
            List of dicts with the problem, Eden's answer, the score, the
            scoring reason and the response time.
        """
        print(f"\n{'='*70}")
        print(f"🧠 TESTING {category.upper()} REASONING")
        print(f"{'='*70}\n")

        results = []

        for i, problem in enumerate(problems, 1):
            print(f"Question {i}/{len(problems)}: {problem['id']}")
            print(f"Q: {problem['question']}")

            eden_answer, response_time = self.query_eden(problem['question'])

            # Keep console output readable for very long answers.
            if len(eden_answer) > 200:
                print(f"Eden: {eden_answer[:200]}...")
            else:
                print(f"Eden: {eden_answer}")

            score_result = self.score_answer(
                eden_answer, problem['correct_answer'], problem['explanation']
            )

            results.append({
                "problem_id": problem['id'],
                "question": problem['question'],
                "correct_answer": problem['correct_answer'],
                "eden_answer": eden_answer,
                "score": score_result['score'],
                "reasoning": score_result['reasoning'],
                "response_time": response_time
            })

            if score_result['score'] >= self.CORRECT_THRESHOLD:
                print("✅ CORRECT\n")
            elif score_result['score'] >= self.PARTIAL_THRESHOLD:
                print("⚠️  PARTIAL\n")
            else:
                print(f"❌ INCORRECT (Expected: {problem['correct_answer']})\n")

            if self.pause_seconds > 0:
                time.sleep(self.pause_seconds)

        return results

    def run_full_benchmark(self):
        """Run complete benchmark suite.

        Returns:
            ``self.results`` after all categories ran and the results were
            summarised, displayed and saved, or ``None`` when the health
            check fails.
        """
        print("\n" + "="*70)
        print("🌀 EDEN PHI-REASONING BENCHMARK SUITE")
        print("="*70)
        print(f"Testing Eden at: {self.eden_api_url}")
        print(f"Timestamp: {self.results['timestamp']}")
        print("="*70)

        try:
            health = requests.get(f"{self.eden_api_url}/api/health", timeout=5)
        except requests.RequestException:
            # Only transport errors are handled here, so Ctrl-C still
            # interrupts the run.
            print(f"❌ Cannot connect to Eden. Make sure she's running at {self.eden_api_url}.")
            return None

        if health.status_code != 200:
            print(f"❌ Eden is not responding. Make sure she's running at {self.eden_api_url}.")
            return None

        print("✅ Eden is online!\n")

        self.results['logic'] = self.run_benchmark_set(LOGIC_PROBLEMS, "LOGIC")
        self.results['math'] = self.run_benchmark_set(MATH_PROBLEMS, "MATH")
        self.results['causal'] = self.run_benchmark_set(CAUSAL_REASONING, "CAUSAL")

        self.calculate_summary()
        self.display_results()
        self.save_results()

        return self.results

    def _summarize(self, results: List[Dict]) -> Dict:
        """Build the statistics dict for one list of per-question results."""
        scores = [r['score'] for r in results]
        times = [r['response_time'] for r in results]
        return {
            "total_questions": len(scores),
            "correct": sum(1 for s in scores if s >= self.CORRECT_THRESHOLD),
            "partial": sum(
                1 for s in scores
                if self.PARTIAL_THRESHOLD <= s < self.CORRECT_THRESHOLD
            ),
            "incorrect": sum(1 for s in scores if s < self.PARTIAL_THRESHOLD),
            # Mean score, so a partial answer contributes 0.5.
            "accuracy": sum(scores) / len(scores) if scores else 0,
            "avg_response_time": sum(times) / len(times) if times else 0
        }

    def calculate_summary(self):
        """Fill ``self.results['summary']`` per category and overall."""
        combined = []
        for category in self.CATEGORIES:
            category_results = self.results[category]
            self.results['summary'][category] = self._summarize(category_results)
            combined.extend(category_results)

        self.results['summary']['overall'] = self._summarize(combined)

    @staticmethod
    def _print_stats(stats: Dict):
        """Print the five statistic lines shared by every report section."""
        print(f"  ✅ Correct:   {stats['correct']}/{stats['total_questions']}")
        print(f"  ⚠️  Partial:   {stats['partial']}/{stats['total_questions']}")
        print(f"  ❌ Incorrect: {stats['incorrect']}/{stats['total_questions']}")
        print(f"  📈 Accuracy:  {stats['accuracy']*100:.1f}%")
        print(f"  ⏱️  Avg Time:  {stats['avg_response_time']:.2f}s")

    def display_results(self):
        """Print the per-category and overall statistics from the summary."""
        print("\n" + "="*70)
        print("📊 BENCHMARK RESULTS")
        print("="*70)

        for category in self.CATEGORIES:
            print(f"\n{category.upper()}:")
            self._print_stats(self.results['summary'][category])

        print(f"\n{'='*70}")
        print("OVERALL PERFORMANCE:")
        self._print_stats(self.results['summary']['overall'])
        print("="*70)

    def save_results(self):
        """Write ``self.results`` to a timestamped JSON file.

        Returns:
            The name of the file written in the current directory.
        """
        filename = f"eden_benchmark_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        # json.dump escapes non-ASCII by default, so the content is plain
        # ASCII; the explicit encoding just removes the locale dependency.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=2)

        print(f"\n💾 Results saved to: {filename}")
        return filename

def compare_benchmarks(file1: str, file2: str):
    """Print an accuracy/latency comparison of two saved benchmark runs.

    Both files are JSON documents with a ``summary`` mapping that holds
    ``accuracy`` and ``avg_response_time`` for the ``logic``, ``math``,
    ``causal`` and ``overall`` sections. *file1* is treated as the baseline
    and *file2* as the updated run. Nothing is returned.
    """
    def _load(path):
        with open(path, 'r') as handle:
            return json.load(handle)

    baseline = _load(file1)
    updated = _load(file2)

    rule = "=" * 70

    print("\n" + rule)
    print("🔍 BENCHMARK COMPARISON")
    print(rule)
    print(f"Baseline: {file1}")
    print(f"Updated:  {file2}")
    print(rule)

    for section in ('logic', 'math', 'causal', 'overall'):
        before = baseline['summary'][section]
        after = updated['summary'][section]

        # Accuracy is reported in percentage points, time in seconds.
        accuracy_delta = (after['accuracy'] - before['accuracy']) * 100
        time_delta = after['avg_response_time'] - before['avg_response_time']

        if accuracy_delta > 0:
            accuracy_note = f"(+{accuracy_delta:.1f}% ✅)"
        elif accuracy_delta < 0:
            accuracy_note = f"({accuracy_delta:.1f}% ❌)"
        else:
            accuracy_note = "(no change)"

        if time_delta < 0:
            time_note = f"({time_delta:.2f}s faster ✅)"
        elif time_delta > 0:
            time_note = f"(+{time_delta:.2f}s slower ⚠️)"
        else:
            time_note = "(no change)"

        print(f"\n{section.upper()}:")
        print(
            f"  Accuracy:  {before['accuracy']*100:.1f}% → {after['accuracy']*100:.1f}% "
            + accuracy_note
        )
        print(
            f"  Avg Time:  {before['avg_response_time']:.2f}s → {after['avg_response_time']:.2f}s "
            + time_note
        )

    overall_change = (
        updated['summary']['overall']['accuracy']
        - baseline['summary']['overall']['accuracy']
    ) * 100

    # Pick the verdict from the overall change in percentage points.
    if overall_change >= 5:
        verdict = f"✅ SIGNIFICANT IMPROVEMENT: +{overall_change:.1f}% accuracy"
    elif overall_change >= 2:
        verdict = f"⚠️  MODEST IMPROVEMENT: +{overall_change:.1f}% accuracy"
    elif overall_change > -2:
        verdict = f"➡️  NO MEANINGFUL CHANGE: {overall_change:+.1f}% accuracy"
    else:
        verdict = f"❌ PERFORMANCE DECREASED: {overall_change:.1f}% accuracy"

    print("\n" + rule)
    print("VERDICT:")
    print(verdict)
    print(rule)

if __name__ == "__main__":
    import sys

    # Everything after the script name selects the mode.
    cli_args = sys.argv[1:]

    if not cli_args:
        # No arguments: run the full suite against the default endpoint.
        benchmark = EdenReasoningBenchmark()
        benchmark.run_full_benchmark()
    elif cli_args[0] == "compare" and len(cli_args) == 3:
        # "compare <baseline.json> <updated.json>"
        compare_benchmarks(cli_args[1], cli_args[2])
    else:
        # Any other argument combination prints the usage text.
        print("Usage:")
        print("  python3 eden_reasoning_benchmark.py              # Run benchmark")
        print("  python3 eden_reasoning_benchmark.py compare file1.json file2.json")