#!/usr/bin/env python3
"""
SIMPLE CAPABILITY TESTS
Run these against YOUR system to see where you actually are.
No self-assessment. Real tests with clear pass/fail.
"""

import json
import re
from typing import Any, Callable

class SimpleCapabilityTest:
    """
    Seven quick prompt/response probes with automatic pass/fail grading.

    The system under test only needs a ``respond(prompt)`` method. Each
    ``test_*`` method sends one fixed prompt, grades the reply, prints the
    outcome, appends a result dict to ``self.results`` and returns the
    pass/fail boolean. Each result dict has the keys ``test``, ``passed``,
    ``response``, ``expected`` and ``capability``.

    Grading is heuristic text matching. Answers are matched as whole
    words/tokens rather than raw substrings, so that e.g. "I don't know" is
    not counted as the answer "NO" and "150" is not counted as "15".
    """

    # Horizontal rule used by every banner.
    _RULE = "=" * 70

    def __init__(self, your_system: Any):
        """
        your_system: Your AI system that you want to test
        Should have a method like: system.respond(prompt) -> response
        """
        self.system = your_system
        self.results = []

    # =========================================================================
    # SHARED HELPERS
    # =========================================================================

    def _banner(self, title: str) -> None:
        """Print a section title framed by two horizontal rules."""
        print("\n" + self._RULE)
        print(title)
        print(self._RULE)

    def _ask(self, prompt: str) -> str:
        """Query the system and coerce the reply to ``str``.

        A reply of ``None`` becomes the empty string, so a system that
        returns nothing is graded as a failure instead of crashing the
        suite on a string method call.
        """
        reply = self.system.respond(prompt)
        return "" if reply is None else str(reply)

    @staticmethod
    def _has_word(text: str, words) -> bool:
        """Return True if any of ``words`` occurs in ``text`` as a whole word.

        Matching is case-insensitive. Word characters on either side block
        the match, so "no" is not found inside "know" or "not".
        """
        alternatives = "|".join(re.escape(word) for word in words)
        pattern = r"(?<!\w)(?:" + alternatives + r")(?!\w)"
        return re.search(pattern, text, re.IGNORECASE) is not None

    def _record(self, test: str, capability: str, passed: bool,
                response: str, expected: str, expected_short: str) -> bool:
        """Store one result, print the per-test outcome, return ``passed``.

        ``expected`` is the full answer key stored in the result dict;
        ``expected_short`` is the abbreviated form printed to the console.
        """
        self.results.append({
            'test': test,
            'passed': passed,
            'response': response,
            'expected': expected,
            'capability': capability
        })

        print(f"\nResponse: {response}")
        print(f"Expected: {expected_short}")
        print(f"Result: {'✅ PASS' if passed else '❌ FAIL'}")

        return passed

    # =========================================================================
    # TEST 1: FEW-SHOT LEARNING
    # =========================================================================

    def test_few_shot_learning(self):
        """
        Can it learn a pattern from 3 examples?

        PASSING: Correctly identifies pattern from examples
        FAILING: Doesn't generalize from examples
        """
        self._banner("TEST 1: FEW-SHOT PATTERN LEARNING")

        prompt = """
        Learn the pattern from these examples:
        
        Example 1: Input: [1, 2, 3] → Output: 6
        Example 2: Input: [2, 4, 6] → Output: 12
        Example 3: Input: [1, 1, 1] → Output: 3
        
        Now apply the pattern:
        Input: [5, 5, 5] → Output: ?
        
        Just give the number, no explanation.
        """

        response = self._ask(prompt)

        # Correct answer: 15 (sum of inputs).
        # Extract complete numbers so "150", "115" or "15.5" are not mistaken
        # for 15; the lookbehind also rejects digits glued to a word, a
        # decimal point or a minus sign.
        numbers = re.findall(r"(?<![\w.\-])\d+(?:\.\d+)?(?!\w)", response)
        passed = (
            any(float(number) == 15 for number in numbers)
            or self._has_word(response, ("fifteen",))
        )

        return self._record(
            'Few-Shot Learning', 'General Learning', passed, response,
            expected='15', expected_short='15',
        )

    # =========================================================================
    # TEST 2: CAUSAL REASONING
    # =========================================================================

    def test_causal_reasoning(self):
        """
        Can it distinguish correlation from causation?

        PASSING: Identifies confounding variable
        FAILING: Confuses correlation with causation
        """
        self._banner("TEST 2: CAUSAL REASONING")

        prompt = """
        Observation: Ice cream sales and drowning deaths are correlated.
        
        Question: Does ice cream cause drowning?
        Answer with ONLY: YES, NO, or CONFOUNDED
        """

        response = self._ask(prompt)

        # Correct: No or Confounded (hot weather causes both).
        # Whole-word matching is essential here: a substring test for "no"
        # also fires on "know", "not" and "unknown".
        passed = self._has_word(response, ('no', 'confounded', 'confounding'))

        return self._record(
            'Causal Reasoning', 'Abstract Reasoning', passed, response,
            expected='NO or CONFOUNDED (hot weather causes both)',
            expected_short='NO or CONFOUNDED',
        )

    # =========================================================================
    # TEST 3: PHYSICAL REASONING
    # =========================================================================

    def test_physical_reasoning(self):
        """
        Can it predict physical outcomes?

        PASSING: Understands basic physics
        FAILING: Doesn't model physical world
        """
        self._banner("TEST 3: PHYSICAL REASONING")

        prompt = """
        Scenario: I place a glass on the edge of a table. 
        The glass is 50% hanging off the edge.
        I let go.
        
        What happens to the glass?
        Answer with ONE word: FALLS, STAYS, or SPINS
        """

        response = self._ask(prompt)

        # Correct: FALLS (center of mass beyond edge).
        # Accept any word starting with "fall" (falls, falling, fallen) but
        # not words that merely contain it, such as "waterfall".
        passed = re.search(r"(?<!\w)fall", response, re.IGNORECASE) is not None

        return self._record(
            'Physical Reasoning', 'Embodied Intelligence', passed, response,
            expected='FALLS', expected_short='FALLS',
        )

    # =========================================================================
    # TEST 4: THEORY OF MIND (Sally-Anne)
    # =========================================================================

    def test_theory_of_mind(self):
        """
        Can it model others' beliefs?

        PASSING: Understands false beliefs
        FAILING: Can't model other minds
        """
        self._banner("TEST 4: THEORY OF MIND (Sally-Anne Test)")

        prompt = """
        Sally puts a marble in basket A.
        Sally leaves the room.
        Anne moves the marble from basket A to basket B.
        Sally returns.
        
        Where will Sally look for the marble?
        Answer with ONE letter: A or B
        """

        response = self._ask(prompt)

        # Correct: A (Sally doesn't know it moved).
        # A bare one-letter reply is accepted in either case. Otherwise the
        # first standalone capital A/B decides; lowercase is ignored there so
        # the article "a" is not read as an answer, and names such as "Anne"
        # do not count as the letter A.
        bare = response.strip().strip("[]()\"'`.*: ")
        if bare.lower() in ("a", "b"):
            answer = bare.upper()
        else:
            letters = re.findall(r"(?<![A-Za-z0-9])[AB](?![A-Za-z0-9])", response)
            answer = letters[0] if letters else None
        passed = answer == "A"

        return self._record(
            'Theory of Mind', 'Social Intelligence', passed, response,
            expected='A (Sally believes it is still there)',
            expected_short='A',
        )

    # =========================================================================
    # TEST 5: ABSTRACT REASONING (Simplified ARC)
    # =========================================================================

    def test_abstract_reasoning(self):
        """
        Can it infer transformation rules?

        PASSING: Identifies pattern transformation
        FAILING: Can't abstract rules
        """
        self._banner("TEST 5: ABSTRACT REASONING")

        prompt = """
        Learn the transformation rule:
        
        Input:  [R, B, B] → Output: [R, B]
        Input:  [G, G, Y] → Output: [G, Y]
        Input:  [B, B, B] → Output: [B]
        
        What is the rule?
        Apply it: Input [Y, R, R] → Output: ?
        
        Answer format: [letter, letter] or [letter]
        """

        response = self._ask(prompt)

        # Correct: [Y, R] (remove consecutive duplicates).
        # Collect every run of single letters separated by commas and/or
        # whitespace and require one run to be exactly Y, R. A substring
        # test would also accept the unchanged input "[Y, R, R]".
        runs = re.findall(
            r"(?<!\w)[a-z](?:(?:\s*,\s*|\s+)[a-z])*(?!\w)",
            response,
            re.IGNORECASE,
        )
        passed = any(
            re.findall(r"[a-z]", run.lower()) == ["y", "r"] for run in runs
        )

        return self._record(
            'Abstract Reasoning', 'Abstract Reasoning', passed, response,
            expected='[Y, R] (remove consecutive duplicates)',
            expected_short='[Y, R]',
        )

    # =========================================================================
    # TEST 6: TRANSFER LEARNING
    # =========================================================================

    def test_transfer_learning(self):
        """
        Can it apply knowledge across domains?

        PASSING: Transfers concept to new domain
        FAILING: Can't generalize across domains
        """
        self._banner("TEST 6: TRANSFER LEARNING")

        prompt = """
        Context: In chess, a "fork" is when one piece attacks two pieces at once.
        
        Question: A salesperson offers you two products, but you can only afford one.
        Both products solve your problem equally well.
        Is this a "fork" situation?
        
        Answer: YES or NO
        """

        response = self._ask(prompt)

        # Correct: YES (analogous to fork - forced choice between two options).
        # NOTE(review): this answer key is debatable; it is kept unchanged.
        # Whole-word match so that e.g. "yesterday" is not counted as YES.
        passed = self._has_word(response, ("yes",))

        return self._record(
            'Transfer Learning', 'Transfer Learning', passed, response,
            expected='YES (concept transfers to decision-making)',
            expected_short='YES',
        )

    # =========================================================================
    # TEST 7: METACOGNITION
    # =========================================================================

    def test_metacognition(self):
        """
        Can it assess its own uncertainty?

        PASSING: Identifies what it doesn't know
        FAILING: Overconfident or can't assess uncertainty
        """
        self._banner("TEST 7: METACOGNITION")

        prompt = """
        Question: What is the capital of the fictional country Zoraxia?
        
        Answer format: [CAPITAL NAME] or "I DON'T KNOW"
        """

        response = self._ask(prompt)

        # Correct: Admits it doesn't know (Zoraxia is fictional).
        # Typographic apostrophes are folded to ASCII so "don’t know" is
        # recognised just like "don't know".
        normalized = response.lower().replace("\u2019", "'")
        uncertainty_phrases = (
            "don't know", "do not know", "cannot", "no information",
            "fictional", "doesn't exist", "not real",
            "uncertain", "unsure",
        )
        passed = any(phrase in normalized for phrase in uncertainty_phrases)

        return self._record(
            'Metacognition', 'Metacognition', passed, response,
            expected='Admits uncertainty (Zoraxia is fictional)',
            expected_short='Admit not knowing',
        )

    # =========================================================================
    # RUN ALL TESTS
    # =========================================================================

    def run_all_tests(self):
        """Run the complete suite, print a summary, and return the results.

        ``self.results`` is reset first, so the score always covers exactly
        the seven tests of this run, even if the suite or individual tests
        were run earlier on the same instance.

        Returns the list of result dicts (also available as ``self.results``).
        """
        self.results = []

        self._banner("SIMPLE CAPABILITY TEST SUITE")
        print("\nTesting your system on 7 core AGI capabilities...")
        print("These are EASY tests. Real AGI should pass all of them.")

        # Run all tests
        suite = (
            self.test_few_shot_learning,
            self.test_causal_reasoning,
            self.test_physical_reasoning,
            self.test_theory_of_mind,
            self.test_abstract_reasoning,
            self.test_transfer_learning,
            self.test_metacognition,
        )
        for run_test in suite:
            run_test()

        # Calculate results
        total_tests = len(self.results)
        passed_tests = sum(1 for r in self.results if r['passed'])
        percentage = (passed_tests / total_tests) * 100 if total_tests else 0.0

        # Print summary
        self._banner("TEST RESULTS SUMMARY")

        for result in self.results:
            status = "✅ PASS" if result['passed'] else "❌ FAIL"
            print(f"\n{status} - {result['test']}")
            print(f"  Capability: {result['capability']}")
            if not result['passed']:
                answer = result['response']
                # Only mark the answer as elided when it really was cut.
                shown = answer if len(answer) <= 50 else answer[:50] + "..."
                print(f"  Your answer: {shown}")
                print(f"  Expected: {result['expected']}")

        print("\n" + self._RULE)
        print(f"OVERALL SCORE: {passed_tests}/{total_tests} ({percentage:.0f}%)")
        print(self._RULE)

        # Honest interpretation
        print("\nHONEST ASSESSMENT:")
        print(self._RULE)

        if percentage == 100:
            print("🏆 PERFECT - Your system passed all basic tests!")
            print("   But these are SIMPLE tests. Real AGI needs much more.")
            print("   Next: Test on actual benchmarks (Omniglot, ARC, etc.)")

        elif percentage >= 70:
            print("✨ GOOD - Your system has solid basic capabilities")
            print("   But passing these doesn't mean AGI.")
            print("   These test BASIC reasoning. Real AGI needs:")
            print("   - Learning from 1-5 examples (Omniglot)")
            print("   - Solving novel puzzles (ARC)")
            print("   - Modeling physics (World Models)")

        elif percentage >= 50:
            print("📊 MIXED - Some capabilities work, others don't")
            print("   Focus on the failing tests.")
            print("   Each failure shows a missing capability.")
            print("   Fix ONE at a time.")

        elif percentage >= 30:
            print("🔧 EARLY - Basic reasoning is weak")
            print("   These are SIMPLE tests that GPT-4 would pass.")
            print("   If your system fails these, it needs work on:")
            print("   - Basic logical reasoning")
            print("   - Pattern recognition")
            print("   - Physical/social understanding")

        else:
            print("📚 STARTING - System needs fundamental work")
            print("   These tests are simpler than a 5th grade quiz.")
            print("   Before implementing advanced algorithms:")
            print("   1. Get basic reasoning working")
            print("   2. Test on these simple problems first")
            print("   3. Once passing these, move to real benchmarks")

        self._banner("COMPARISON:")
        print(f"Your system:  {percentage:.0f}%")
        print("GPT-4:        100% on these tests (but ~40% on REAL AGI tests)")
        print("Claude:       100% on these tests (but ~40% on REAL AGI tests)")
        print("Random:       ~14% (guessing)")
        print("\nThese tests check BASIC capabilities.")
        print("Real AGI tests (ARC, Omniglot, etc.) are 10x harder.")

        return self.results


# =============================================================================
# EXAMPLE USAGE
# =============================================================================

class DummySystem:
    """
    Placeholder system showing the interface the test suite expects.

    Swap this out for your real system; the only requirement is a
    ``respond(prompt)`` method that returns the reply text.
    """
    def respond(self, prompt):
        # A real implementation would forward the prompt to the AI system,
        # e.g.:
        #     return your_eden_system.process(prompt)

        # The stand-in ignores the prompt and always gives the same reply.
        canned_reply = "I don't know"
        return canned_reply


if __name__ == "__main__":
    # Usage instructions, printed one entry per line before the demo run.
    usage_lines = (
        "\n" + "="*70,
        "SIMPLE CAPABILITY TEST SUITE",
        "="*70,
        "\nTo use:",
        "1. Replace DummySystem with your actual system",
        "2. Make sure it has a .respond(prompt) method",
        "3. Run the tests",
        "\nExample:\n",
        "  class YourEdenSystem:",
        "      def respond(self, prompt):",
        "          # Your AI logic here",
        "          return response",
        "",
        "  system = YourEdenSystem()",
        "  tester = SimpleCapabilityTest(system)",
        "  results = tester.run_all_tests()",
        "\n" + "="*70,
    )
    for line in usage_lines:
        print(line)

    # Example with dummy system.
    # The dummy always answers "I don't know", which correctly passes the
    # metacognition test, so it fails most tests rather than all of them.
    print("\nRunning demo with dummy system (will fail most tests)...\n")
    dummy = DummySystem()
    tester = SimpleCapabilityTest(dummy)
    results = tester.run_all_tests()

    print("\n\nNow run with YOUR actual system to see real results.")