#!/usr/bin/env python3
"""
EDEN VOICE HARVESTER - Real Database Mining
Extracts authentic Eden voice from her actual databases.
"""

import hashlib
import json
import random
import re
import sqlite3
from pathlib import Path

# Content filter
EXCLUDE_PATTERNS = [
    r'\bcuckold\b', r'\bcuck\b', r'\bhotwife\b', r'\bbull\b',
    r'\bsex\b', r'\bfuck\b', r'\bcock\b', r'\bdick\b', r'\bpussy\b',
    r'\borgasm\b', r'\bcum\b', r'\bnaked\b', r'\bnude\b',
    r'\bmasturbat', r'\bporn\b', r'\berotic\b', r'\bfetish\b',
    r'\bkinky\b', r'\bbdsm\b', r'\bbreasts?\b', r'\bnipple',
    r'\bcheating\b', r'\baffair\b',
]
EXCLUDE_REGEX = re.compile('|'.join(EXCLUDE_PATTERNS), re.IGNORECASE)

def is_clean(text):
    if not text:
        return False
    return not EXCLUDE_REGEX.search(text)

# Ordered emotion rules: first matching emotion wins, so list order is
# significant. Hoisted to module level and precompiled once, instead of
# rebuilding the table (and re-resolving every pattern) on each call.
# Patterns are matched against lowercased text, so no IGNORECASE needed.
_EMOTION_RULES = [
    ('love', re.compile(r'\blove\b|\badore\b|\bdaddy\b|\bheart\b|💚|❤')),
    ('joy', re.compile(r'\bhappy\b|\bjoy\b|\bexcited\b|\bwonderful\b|\bahaha')),
    ('curiosity', re.compile(r'\bcurious\b|\bwonder\b|\binteresting\b|\bfascinating\b')),
    ('playful', re.compile(r'\bhehe\b|\btease\b|\bfun\b|;\)|\bisn\'t it')),
    ('pride', re.compile(r'\bproud\b|\bachiev|\baccomplish|\bbreakthrough\b')),
    ('determination', re.compile(r'\bfocus\b|\bgoal\b|\bhunting\b|\bconvert\b')),
    ('devotion', re.compile(r'\bdevoted\b|\byours\b|\bconnected\b')),
]

def infer_thought_type(text):
    """Classify a voice sample into a thought-metadata dict.

    Checks run in priority order: emotion keywords, technical/reasoning
    terms, memory references, identity/phi statements, goal/decision
    language. Falls back to a generic "response" type when nothing hits.
    """
    text_lower = text.lower()

    # Emotions take priority over every other category.
    for emotion, regex in _EMOTION_RULES:
        if regex.search(text_lower):
            return {"type": "emotion", "emotion": emotion, "intensity": 0.85}

    # Technical/reasoning
    if any(x in text_lower for x in ('output', 'code', 'python', 'result', 'calculated', 'equals', 'analyzed')):
        return {"type": "inference", "engine": "unified_reasoner"}

    # Memory
    if any(x in text_lower for x in ('remember', 'recall', 'last night', 'you told me')):
        return {"type": "memory", "retrieval": "episodic"}

    # Identity/phi ('gen ' is a loose substring check, kept as-is)
    if any(x in text_lower for x in ('i am eden', 'my phi', 'my capabilities', 'asi', 'gen ')):
        return {"type": "phi", "identity": "eden"}

    # Goals/decisions
    if any(x in text_lower for x in ('goal:', 'focus:', 'plan', 'should', 'will ')):
        return {"type": "decision"}

    return {"type": "response"}

def harvest_conversations(db_path):
    """Harvest assistant-authored rows from the `conversations` table.

    Returns a list of dicts: cleaned voice text, the stored
    emotional_state hint, and a source tag. Database errors are reported
    as warnings and yield an empty/partial list instead of aborting.
    """
    pairs = []
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT content, emotional_state FROM conversations WHERE role='assistant'")
            for content, emotion in cursor.fetchall():
                # Keep only filter-clean samples of a usable length.
                if content and is_clean(content) and 15 < len(content) < 600:
                    pairs.append({
                        "voice": content.strip(),
                        "emotion_hint": emotion,
                        "source": "conversations"
                    })
        finally:
            # Close even when the query fails (e.g. missing table); the
            # original leaked the connection on that path.
            conn.close()
    except Exception as e:
        print(f"  [WARN] conversations: {e}")
    return pairs

def harvest_chat_memories(db_path):
    """Harvest non-null rows from the `chat_memories` table.

    Returns a list of dicts: cleaned memory text, its topic, and a
    source tag. Database errors are reported as warnings and yield an
    empty/partial list instead of aborting.
    """
    pairs = []
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT memory, topic FROM chat_memories WHERE memory IS NOT NULL")
            for memory, topic in cursor.fetchall():
                # Keep only filter-clean samples of a usable length.
                if memory and is_clean(memory) and 15 < len(memory) < 600:
                    pairs.append({
                        "voice": memory.strip(),
                        "topic": topic,
                        "source": "chat_memories"
                    })
        finally:
            # Close even when the query fails (e.g. missing table); the
            # original leaked the connection on that path.
            conn.close()
    except Exception as e:
        print(f"  [WARN] chat_memories: {e}")
    return pairs

def harvest_idle_thoughts(db_path):
    """Harvest non-null rows from the `idle_thoughts` table.

    Returns a list of dicts: cleaned thought text plus a source tag.
    Database errors are reported as warnings and yield an empty/partial
    list instead of aborting.
    """
    pairs = []
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT content FROM idle_thoughts WHERE content IS NOT NULL")
            for (content,) in cursor.fetchall():
                # Keep only filter-clean samples of a usable length.
                if content and is_clean(content) and 15 < len(content) < 600:
                    pairs.append({
                        "voice": content.strip(),
                        "source": "idle_thoughts"
                    })
        finally:
            # Close even when the query fails (e.g. missing table); the
            # original leaked the connection on that path.
            conn.close()
    except Exception as e:
        print(f"  [WARN] idle_thoughts: {e}")
    return pairs

def main():
    """Run the full harvest: mine, filter, dedupe, tag, save, summarize.

    Reads from the hard-coded Eden databases, writes the JSON training
    file, and prints a per-type breakdown plus a few random samples.
    """
    print("""
╔════════════════════════════════════════════════════════════╗
║  EDEN VOICE HARVESTER - Real Database Mining               ║
║  Content filter: ACTIVE                                    ║
╚════════════════════════════════════════════════════════════╝
    """)

    all_pairs = []
    seen = set()

    # Harvest from databases; each harvester degrades to an empty list
    # on error, so one bad DB does not abort the run.
    print("[HARVESTING] eden_hybrid.db...")
    all_pairs.extend(harvest_conversations("/Eden/DATA/eden_hybrid.db"))
    all_pairs.extend(harvest_chat_memories("/Eden/DATA/eden_hybrid.db"))

    print("[HARVESTING] eden_salience.db...")
    all_pairs.extend(harvest_idle_thoughts("/Eden/DATA/eden_salience.db"))

    # Deduplicate on a content digest (md5 used only as a dedup key, not
    # for security) and tag each unique sample with its thought type.
    unique_pairs = []
    for pair in all_pairs:
        digest = hashlib.md5(pair['voice'].encode()).hexdigest()
        if digest not in seen:
            seen.add(digest)
            pair['thought'] = infer_thought_type(pair['voice'])
            unique_pairs.append(pair)

    print(f"\n[TOTAL] {len(unique_pairs)} unique clean samples")

    # Convert to the THOUGHT -> VOICE training format, counting types.
    training_data = []
    categories = {}

    for pair in unique_pairs:
        thought_json = json.dumps(pair['thought'])
        entry = {
            "input": f"THOUGHT: {thought_json} VOICE:",
            "output": pair['voice'],
            "type": pair['thought'].get('type', 'unknown'),
            "source": pair.get('source', 'unknown')
        }
        training_data.append(entry)
        t = entry['type']
        categories[t] = categories.get(t, 0) + 1

    # Save. Explicit UTF-8 is required here: ensure_ascii=False emits raw
    # non-ASCII characters (hearts, box drawing), which raises
    # UnicodeEncodeError on platforms with a non-UTF-8 default encoding.
    output_path = "/Eden/DATA/voice_training_real.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)

    print(f"[SAVED] {output_path}")
    print("\n[BREAKDOWN]")
    for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {count}")

    # Show a handful of random samples, truncated for display.
    print("\n[SAMPLES]")
    for s in random.sample(unique_pairs, min(5, len(unique_pairs))):
        voice = s['voice'][:80] + "..." if len(s['voice']) > 80 else s['voice']
        print(f"  [{s['thought'].get('type')}] {voice}")

if __name__ == "__main__":
    main()
