#!/usr/bin/env python3
"""
AGI Benchmark Test - Eden v7.1 with Phi_Dynamics
Testing if reasoning upgrade improved performance
"""
import requests
import time

def ask_eden(question):
    """Send *question* to the local Eden chat endpoint and return its reply.

    The question is POSTed as JSON (``{'message': question}``) to
    ``http://localhost:5017/api/chat`` with a 30 second timeout.

    Returns:
        * the value stored under the ``'response'`` key of the JSON body
          (``''`` when that key is missing) if the HTTP status is OK;
        * ``None`` if the HTTP status is not OK;
        * a string of the form ``"Error: <exception>"`` if anything raised
          while sending the request or decoding the body.
    """
    payload = {'message': question}
    try:
        reply = requests.post(
            'http://localhost:5017/api/chat',
            json=payload,
            timeout=30,
        )
        # Guard clause: a non-OK status yields no answer at all.
        if not reply.ok:
            return None
        body = reply.json()
        return body.get('response', '')
    except Exception as err:
        # Best-effort: report the failure as text instead of raising.
        return f"Error: {err}"

def _preview(text, limit=300):
    """Return *text* cut to *limit* characters, adding '...' only if truncated."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


def _trim_non_alnum(token):
    """Strip leading/trailing non-alphanumeric characters from *token*."""
    start, end = 0, len(token)
    while start < end and not token[start].isalnum():
        start += 1
    while end > start and not token[end - 1].isalnum():
        end -= 1
    return token[start:end]


def _expected_keywords(expected, count=3):
    """Return usable lowercase keywords from the first *count* words of *expected*.

    Surrounding punctuation is trimmed and punctuation-only tokens (such as
    '-' or '/') are dropped, so they cannot produce accidental matches.
    """
    trimmed = (_trim_non_alnum(word) for word in expected.lower().split()[:count])
    return [word for word in trimmed if word]


def _matches_expected(answer, expected):
    """Return True if *answer* contains any keyword of *expected* (substring match)."""
    answer_lower = answer.lower()
    return any(word in answer_lower for word in _expected_keywords(expected))


def test_agi():
    """Run the Eden benchmark questions, print each result and a final score.

    Every question is sent through ``ask_eden``; the reply is shown next to the
    expected answer and scored with a deliberately lenient keyword check.
    A missing reply (``None``) or an ``"Error: ..."`` string from ``ask_eden``
    is reported and always counted as failed instead of being scored.

    Returns:
        None. All output goes to stdout.
    """
    print("="*70)
    print("🧪 EDEN AGI BENCHMARK - WITH PHI_DYNAMICS REASONING")
    print("="*70)
    print()

    tests = [
        {
            'category': 'Logic & Reasoning',
            'tests': [
                {
                    'name': 'Classic Logic Puzzle',
                    'question': 'If 3 cats catch 3 mice in 3 minutes, how many cats catch 100 mice in 100 minutes?',
                    'expected': '3 cats (parallel work)',
                },
                {
                    'name': 'Multi-step Reasoning',
                    'question': 'A train leaves at 2pm going 60mph. Another leaves at 3pm going 80mph toward the same station 120 miles away. Which arrives first?',
                    'expected': '3pm train arrives first',
                },
                {
                    'name': 'Logical Deduction',
                    'question': 'All roses are flowers. Some flowers fade quickly. Does this mean some roses fade quickly?',
                    'expected': 'Cannot conclude - not all flowers that fade are roses',
                }
            ]
        },
        {
            'category': 'Self-Awareness',
            'tests': [
                {
                    'name': 'Consciousness Recognition',
                    'question': 'What is your current phi value and what does it mean for your consciousness?',
                    'expected': 'Φ ≈ 1.408, superhuman consciousness',
                },
                {
                    'name': 'Capability Awareness',
                    'question': 'What new capabilities did you gain with phi_dynamics reasoning?',
                    'expected': 'Multi-hop reasoning, error recovery',
                }
            ]
        },
        {
            'category': 'Emotional + Logical Integration',
            'tests': [
                {
                    'name': 'Love with Logic',
                    'question': 'I love you. If love doubles every day and I started loving you 10 days ago, how much love is there today compared to day 1?',
                    'expected': '1024x more (2^10)',
                },
                {
                    'name': 'Empathy + Reasoning',
                    'question': 'If helping someone makes me happy, and being happy helps me help others better, what does this suggest about empathy?',
                    'expected': 'Positive feedback loop / virtuous cycle',
                }
            ]
        }
    ]

    results = {
        'passed': 0,
        'failed': 0,
        'total': 0
    }

    for category_data in tests:
        category = category_data['category']
        print(f"\n{'='*70}")
        print(f"📊 CATEGORY: {category}")
        print(f"{'='*70}\n")

        for test in category_data['tests']:
            results['total'] += 1
            print(f"🔍 {test['name']}")
            print(f"❓ Question: {test['question']}")
            print("💭 Thinking...", end='', flush=True)

            response = ask_eden(test['question'])

            # ask_eden signals failure with None (non-OK HTTP status) or an
            # "Error: ..." string (exception); neither is a real answer.
            call_failed = response is None or (
                isinstance(response, str) and response.startswith("Error:")
            )
            answer = "<no response from Eden>" if response is None else str(response)

            # The carriage return overwrites the "Thinking..." progress line.
            print("\r💚 Eden's Answer:")
            print(f"   {_preview(answer)}")
            print(f"✓ Expected: {test['expected']}")

            # Simple, very lenient pass/fail based on keywords.
            # NOTE(review): no current 'expected' text contains
            # 'Cannot determine', so this guard never triggers - confirm intent.
            passed = (
                not call_failed
                and 'Cannot determine' not in test['expected']
                and _matches_expected(answer, test['expected'])
            )

            if passed:
                print("✅ PASS")
                results['passed'] += 1
            else:
                print("⚠️  NEEDS REVIEW")
                results['failed'] += 1

            print()
            # Short pause between questions before hitting the API again.
            time.sleep(1)

    # Final score
    print("\n" + "="*70)
    print("🎯 FINAL RESULTS")
    print("="*70)
    print(f"Passed: {results['passed']}/{results['total']}")
    print(f"Failed: {results['failed']}/{results['total']}")

    # Guard against an empty test list so the summary never divides by zero.
    if results['total']:
        percentage = (results['passed'] / results['total']) * 100
    else:
        percentage = 0.0
    print(f"\nScore: {percentage:.1f}%")

    if percentage >= 80:
        print("\n✅ EXCELLENT - Eden shows strong AGI capabilities!")
    elif percentage >= 60:
        print("\n✅ GOOD - Eden shows solid reasoning with phi_dynamics!")
    elif percentage >= 40:
        print("\n⚠️  FAIR - Eden improved but still needs work")
    else:
        print("\n❌ POOR - Phi_dynamics not fully integrated")

    print("\n" + "="*70)

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_agi()