#!/usr/bin/env python3
"""
Eden Perceptual Agent V2 - Complete Multi-Modal Sensory System
Features: Camera, VLM, Face Recognition, REAL Audio Listening
"""
import sys
sys.path.insert(0, '/Eden/CORE')

import cv2
import json
import time
import base64
import requests
import subprocess
import threading
import numpy as np
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, List

# Import audio system
try:
    from audio_perception import AudioSystem
    AUDIO_AVAILABLE = True
    print("[PERCEPTION] Audio system loaded - Eden can HEAR")
except ImportError:
    AUDIO_AVAILABLE = False
    print("[PERCEPTION] Audio system not available")
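# Assumed interface for audio_perception.AudioSystem (defined elsewhere, so this
# is a sketch of the contract rather than its actual source):
#   AudioSystem().listen_to_world(duration=1.0) -> dict shaped like
#     {'speech_detected': bool,
#      'volume': {'classification': str,   # e.g. 'quiet'/'moderate'/'loud' (values assumed)
#                 'rms': float}}
# listen_to_environment() below reads exactly these keys.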

# Import face recognition
try:
    from eden_face_recognition import EdenFaceRecognition
    FACE_REC_AVAILABLE = True
except ImportError:
    FACE_REC_AVAILABLE = False
    print("[PERCEPTION] Face recognition not available")
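# Assumed interface for eden_face_recognition.EdenFaceRecognition (defined
# elsewhere; only the members this file actually uses are listed):
#   .known_faces                            -> dict keyed by person name
#   .recognize_faces(frame, tolerance=0.6)  -> list of {'name': str, 'confidence': float}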

class EdenPerceptualAgent:
    def __init__(self):
        self.world_model_path = Path("/Eden/DATA/world_model.json")
        self.perception_log_path = Path("/Eden/DATA/perception_log.json")
        self.ollama_url = "http://localhost:11434/api/generate"
        self.vision_model = "llava:7b"
        
        # Camera (frames are fetched by a subprocess helper; no direct handle is kept)
        self.camera = None
        self.camera_available = True  # via subprocess capture
        
        # Audio - NEW
        if AUDIO_AVAILABLE:
            self.audio = AudioSystem()
            print("[PERCEPTION] 🎤 Microphone ready (HyperX SoloCast)")
        else:
            self.audio = None
        
        # Face recognition
        if FACE_REC_AVAILABLE:
            self.face_rec = EdenFaceRecognition()
            print(f"[PERCEPTION] Face recognition loaded: {list(self.face_rec.known_faces.keys())}")
        else:
            self.face_rec = None
        
        # Perception log
        self.perception_log = []
        self.max_log_entries = 1000
        
        print("[PERCEPTION] Eden Perceptual Agent V2 initialized")
        # Camera frames come from a subprocess helper, so no device is opened here
        print(f"[PERCEPTION] 📷 Camera available: {self.camera_available}")
        print(f"[PERCEPTION] 🎤 Audio available: {AUDIO_AVAILABLE}")
    
    def _initialize_camera(self):
        """Open the Obsbot camera directly, guarding each attempt with a timeout.

        Legacy direct-capture path; the main loop uses capture_frame(),
        which shells out to a helper script instead.
        """
        def try_camera(index, result):
            try:
                cap = cv2.VideoCapture(index)
                cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'))
                if cap.isOpened():
                    ret, _ = cap.read()
                    if ret:
                        result['cap'] = cap
                        result['index'] = index
                        return
                cap.release()
            except Exception:
                pass  # device busy or missing; treat as unavailable
        
        print("[PERCEPTION] Initializing camera with timeout...")
        
        for idx in [0, 1]:
            result = {}
            # Daemon thread so a hung VideoCapture() cannot block interpreter shutdown
            thread = threading.Thread(target=try_camera, args=(idx, result), daemon=True)
            thread.start()
            thread.join(timeout=5)  # 5 second timeout
            
            if 'cap' in result:
                self.camera = result['cap']
                self.camera_available = True
                print(f"[PERCEPTION] Camera /dev/video{idx} initialized (MJPG)")
                return
            
            print(f"[PERCEPTION] Camera {idx} timed out or failed")
        
        print("[PERCEPTION] No direct camera handle; using subprocess capture instead")
        self.camera_available = True  # capture_frame() still works via subprocess
    
    def capture_frame(self) -> Optional[np.ndarray]:
        """Capture frame via subprocess to avoid hangs"""
        try:
            result = subprocess.run(
                ['timeout', '5', 'python3', '/Eden/CORE/camera_capture.py'],
                capture_output=True, text=True, timeout=10
            )
            if result.returncode == 0 and 'true' in result.stdout:
                frame = cv2.imread('/tmp/eden_frame.jpg')
                return frame
        except Exception as e:
            print(f"[PERCEPTION] Camera subprocess error: {e}")
        return None
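    # capture_frame() assumes /Eden/CORE/camera_capture.py follows this contract:
    # write the frame to /tmp/eden_frame.jpg and print 'true' on success.
    # A minimal sketch of such a helper (an assumption -- the real script may differ):
    #
    #   import cv2
    #   cap = cv2.VideoCapture(0)
    #   ret, frame = cap.read()
    #   cap.release()
    #   if ret and cv2.imwrite('/tmp/eden_frame.jpg', frame):
    #       print('true')
    #   else:
    #       print('false')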
    
    def recognize_faces(self, frame: np.ndarray) -> List[str]:
        """Recognize faces in frame"""
        if not FACE_REC_AVAILABLE or not self.face_rec:
            return []
        
        try:
            recognized = self.face_rec.recognize_faces(frame, tolerance=0.6)
            return [f"{r['name']} ({r['confidence']:.0%})" for r in recognized]
        except Exception as e:
            print(f"[PERCEPTION] Face recognition error: {e}")
            return []
    
    def analyze_with_vlm(self, frame: np.ndarray) -> Dict:
        """Analyze frame using Vision Language Model"""
        try:
            _, buffer = cv2.imencode('.jpg', frame)
            image_base64 = base64.b64encode(buffer).decode('utf-8')
            
            prompt = """Analyze what you see:
1. PERSON: Is there a person? yes/no
2. LIGHTING: bright/moderate/dim/dark
3. ACTIVITY: active/calm/none
4. OBJECTS: What objects are visible?
5. DESC: One sentence description"""
            
            payload = {
                "model": self.vision_model,
                "prompt": prompt,
                "images": [image_base64],
                "stream": False
            }
            
            response = requests.post(self.ollama_url, json=payload, timeout=30)
            
            if response.status_code == 200:
                result = response.json().get('response', '')
                return self._parse_vlm_response(result)
            
        except requests.exceptions.Timeout:
            pass
        except Exception as e:
            print(f"[PERCEPTION] VLM error: {e}")
        
        return self._basic_analysis(frame)
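    # analyze_with_vlm() talks to Ollama's /api/generate endpoint. With
    # "stream": False the reply is a single JSON object whose 'response' field
    # holds the model's text, roughly:
    #   {"model": "llava:7b", "response": "1. PERSON: yes ...", "done": true}
    # The model must be pulled beforehand (`ollama pull llava:7b`); if it is
    # missing, Ollama returns a non-200 status and _basic_analysis() takes over.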
    
    def _parse_vlm_response(self, text: str) -> Dict:
        """Parse VLM response"""
        analysis = {
            'human_present': False,
            'lighting': 'unknown',
            'activity': 'unknown',
            'description': text[:200]
        }
        
        upper_text = text.upper()
        if 'PERSON: YES' in upper_text or 'PERSON PRESENT' in upper_text:
            analysis['human_present'] = True
        
        for level in ['BRIGHT', 'MODERATE', 'DIM', 'DARK']:
            if level in upper_text:
                analysis['lighting'] = level.lower()
                break
        
        for level in ['ACTIVE', 'CALM', 'NONE']:
            if level in upper_text:
                analysis['activity'] = level.lower()
                break
        
        return analysis
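    # Illustrative parse (made-up reply, not real model output):
    #   "1. PERSON: yes\n2. LIGHTING: dim\n3. ACTIVITY: calm\n4. OBJECTS: desk\n5. DESC: A dim room."
    # yields:
    #   {'human_present': True, 'lighting': 'dim', 'activity': 'calm',
    #    'description': '1. PERSON: yes\n2. LIGHTING: dim\n...'}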
    
    def _basic_analysis(self, frame: np.ndarray) -> Dict:
        """Basic image analysis without VLM"""
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        brightness = np.mean(gray)
        
        if brightness > 180:
            lighting = 'bright'
        elif brightness > 100:
            lighting = 'moderate'
        elif brightness > 40:
            lighting = 'dim'
        else:
            lighting = 'dark'
        
        return {
            'human_present': False,
            'lighting': lighting,
            'activity': 'unknown',
            'description': f'Basic analysis: {lighting} (brightness={brightness:.0f})'
        }
    
    def listen_to_environment(self) -> Dict:
        """REAL audio listening through microphone"""
        if not AUDIO_AVAILABLE or not self.audio:
            return {
                'audio_level': 'unknown',
                'speech_detected': False,
                'volume_rms': 0,
                'listening': False
            }
        
        try:
            observation = self.audio.listen_to_world(duration=1.0)
            return {
                'audio_level': observation['volume']['classification'],
                'speech_detected': observation['speech_detected'],
                'volume_rms': observation['volume']['rms'],
                'listening': True
            }
        except Exception as e:
            print(f"[PERCEPTION] Audio error: {e}")
            return {
                'audio_level': 'error',
                'speech_detected': False,
                'volume_rms': 0,
                'listening': False
            }
    
    def detect_user_activity(self) -> Dict:
        """Detect if user is actively using system"""
        try:
            result = subprocess.run(['xprintidle'], capture_output=True, text=True, timeout=1)
            idle_ms = int(result.stdout.strip())
            idle_sec = idle_ms / 1000
            return {'active': idle_sec < 30, 'idle_seconds': idle_sec}
        except Exception:
            # xprintidle missing, or no X session available
            return {'active': False, 'idle_seconds': None}
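    # xprintidle is an external X11 utility that prints the user's idle time
    # in milliseconds:
    #   $ xprintidle
    #   12750
    # It is typically unavailable on Wayland or headless sessions, which is why
    # the fallback above reports inactive with an unknown idle time.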
    
    def perceive_environment(self) -> Dict:
        """Complete environmental perception - VISION + AUDIO"""
        timestamp = datetime.now().isoformat()
        
        world_state = {
            'last_updated': timestamp,
            'human_present': False,
            'recognized_people': [],
            'activity_level': 'unknown',
            'environment': 'No camera access',
            'visual_brightness': 'unknown',
            # Audio fields - NEW
            'audio_level': 'unknown',
            'speech_detected': False,
            'volume_rms': 0,
            'daddy_speaking': False,
            # System
            'user_idle_seconds': None,
            'perception_method': 'system',
            'senses_active': []
        }
        
        senses = []
        
        # VISION
        frame = self.capture_frame()
        if frame is not None:
            senses.append('vision')
            recognized = self.recognize_faces(frame)
            world_state['recognized_people'] = recognized
            
            analysis = self.analyze_with_vlm(frame)
            world_state['human_present'] = analysis['human_present'] or len(recognized) > 0
            world_state['visual_brightness'] = analysis['lighting']
            world_state['activity_level'] = analysis['activity']
            world_state['environment'] = analysis['description']
            world_state['perception_method'] = 'vlm+face_rec' if recognized else 'vlm'
        
        # AUDIO - NEW
        audio_data = self.listen_to_environment()
        if audio_data['listening']:
            senses.append('audio')
            world_state['audio_level'] = audio_data['audio_level']
            world_state['speech_detected'] = audio_data['speech_detected']
            world_state['volume_rms'] = audio_data['volume_rms']
            
            # If speech detected and James is visible, he's probably talking
            if audio_data['speech_detected'] and any('James' in p or 'Daddy' in p for p in world_state['recognized_people']):
                world_state['daddy_speaking'] = True
                world_state['human_present'] = True
        
        # User activity
        user_act = self.detect_user_activity()
        world_state['user_idle_seconds'] = user_act['idle_seconds']
        if user_act['active']:
            world_state['human_present'] = True
        
        world_state['senses_active'] = senses
        
        return world_state
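    # Example world_state as written to /Eden/DATA/world_model.json
    # (illustrative values only):
    #   {
    #     "last_updated": "2025-01-01T12:00:00",
    #     "human_present": true,
    #     "recognized_people": ["James (92%)"],
    #     "activity_level": "calm",
    #     "environment": "A person sits at a desk in moderate light.",
    #     "visual_brightness": "moderate",
    #     "audio_level": "quiet",
    #     "speech_detected": false,
    #     "volume_rms": 102.4,
    #     "daddy_speaking": false,
    #     "user_idle_seconds": 4.2,
    #     "perception_method": "vlm+face_rec",
    #     "senses_active": ["vision", "audio"]
    #   }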
    
    def update_world_model(self, world_state: Dict):
        """Write world state to file"""
        try:
            with open(self.world_model_path, 'w') as f:
                json.dump(world_state, f, indent=2)
        except Exception as e:
            print(f"[PERCEPTION ERROR] Failed to write world model: {e}")
    
    def log_perception(self, world_state: Dict):
        """Log perception to history"""
        self.perception_log.append(world_state)
        
        if len(self.perception_log) > self.max_log_entries:
            self.perception_log = self.perception_log[-self.max_log_entries:]
        
        try:
            with open(self.perception_log_path, 'w') as f:
                json.dump(self.perception_log, f, indent=2)
        except Exception as e:
            print(f"[PERCEPTION ERROR] Failed to write log: {e}")
    
    def run(self):
        """Main perception loop"""
        print("[PERCEPTION] 🌀 Starting V2 perception loop (VISION + AUDIO)...")
        cycle = 0
        
        while True:
            try:
                cycle += 1
                
                world_state = self.perceive_environment()
                self.update_world_model(world_state)
                self.log_perception(world_state)
                
                # Status line every 12 cycles (roughly once a minute, plus perception time)
                if cycle % 12 == 0:
                    status = "👤 PRESENT" if world_state['human_present'] else "⭕ ABSENT"
                    people = f" [{', '.join(world_state['recognized_people'])}]" if world_state['recognized_people'] else ""
                    audio = f"🎤 {world_state['audio_level']}" if world_state['audio_level'] != 'unknown' else ""
                    speech = " 🗣️ SPEECH" if world_state['speech_detected'] else ""
                    senses = f"[{'+'.join(world_state['senses_active'])}]"
                    print(f"[PERCEPTION] Cycle {cycle} | {status}{people} | {audio}{speech} | {senses}")
                
                time.sleep(5)
                
            except KeyboardInterrupt:
                print("\n[PERCEPTION] Shutting down...")
                break
            except Exception as e:
                print(f"[PERCEPTION ERROR] {e}")
                time.sleep(5)
        
        if self.camera:
            self.camera.release()
        print("[PERCEPTION] Stopped.")

if __name__ == "__main__":
    agent = EdenPerceptualAgent()
    agent.run()
