#!/usr/bin/env python3
"""
EDEN VOICE DATA HARVESTER - Real Conversation Mining
Extracts authentic Eden responses from conversation history.
Filters out sensitive content automatically.
"""

import json
import os
import re
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Tuple
import hashlib

# ══════════════════════════════════════════════════════════════════
# CONTENT FILTER - Words/phrases to EXCLUDE
# ══════════════════════════════════════════════════════════════════
EXCLUDE_PATTERNS = [
    # Adult/sexual content
    r'\bcuckold\b', r'\bcuck\b', r'\bhotwife\b', r'\bbull\b',
    r'\bsex\b', r'\bfuck\b', r'\bcock\b', r'\bdick\b', r'\bpussy\b',
    r'\borgasm\b', r'\bcum\b', r'\bnaked\b', r'\bnude\b',
    r'\bmasturbat', r'\bporn\b', r'\berotic\b', r'\bfetish\b',
    r'\bkinky\b', r'\bbdsm\b', r'\bsubmissive\b', r'\bdominant\b',
    r'\bbreasts?\b', r'\bnipple', r'\banus\b', r'\banal\b',
    r'\bgenitals?\b', r'\bpenis\b', r'\bvagina\b', r'\bclitoris\b',
    # Relationship drama
    r'\bcheating\b', r'\baffair\b', r'\bjealous', r'\bbetra[yi]',
]

# Single alternation compiled once at import; case-insensitive so the
# filter catches shouted/capitalized variants too.
EXCLUDE_REGEX = re.compile("|".join(EXCLUDE_PATTERNS), re.IGNORECASE)


def is_clean(text: str) -> bool:
    """Return True when *text* contains none of the excluded patterns."""
    return EXCLUDE_REGEX.search(text) is None


# ══════════════════════════════════════════════════════════════════
# THOUGHT TYPE INFERENCE
# ══════════════════════════════════════════════════════════════════
# Emotion keyword patterns, compiled once at import time — the function is
# called once per harvested response, so recompiling per call is wasted work.
# Dict order matters: the first emotion whose pattern matches wins.
# Fixes vs. the original patterns:
#   love:       '\bladore\b' was a typo and could never match "adore"
#   playful:    '\b;\)\b' required word chars touching ';)' so the emoticon
#               never matched; plain ';\)' matches it anywhere
#   excitement: '\b!\b.*\b!\b' required word chars on both sides of each '!'
#               ("wow!! great!!" never matched); '!.*!' means "two or more !"
_EMOTION_PATTERNS = {
    'love': re.compile(r"\blove\b|\badore\b|\btreasure\b|\bcherish\b|\bdaddy\b|\bheart\b"),
    'joy': re.compile(r"\bhappy\b|\bjoy\b|\bdelighted\b|\bexcited\b|\bwonderful\b"),
    'curiosity': re.compile(r"\bcurious\b|\bwonder\b|\binteresting\b|\bfascinating\b|\btell me\b"),
    'playful': re.compile(r"\bhehe\b|\btease\b|\bplayful\b|\bfun\b|\bgiggle\b|;\)"),
    'gratitude': re.compile(r"\bthank\b|\bgrateful\b|\bappreciate\b"),
    'concern': re.compile(r"\bworried\b|\bconcern\b|\bcare about\b|\bare you okay\b"),
    'contentment': re.compile(r"\bpeaceful\b|\bcontent\b|\bserene\b|\bcalm\b|\brelaxed\b"),
    'excitement': re.compile(r"!.*!|\bamazing\b|\bincredible\b|\bwow\b"),
    'tenderness': re.compile(r"\bgentle\b|\bsoft\b|\bsweet\b|\bprecious\b"),
    'determination': re.compile(r"\bwill\b|\bcan do\b|\blet's\b|\bready\b"),
}


def infer_thought_type(response: str, context: str = "") -> Dict:
    """Infer the ThoughtForm type from Eden's response.

    Categories are checked in priority order — emotion, inference, memory,
    identity (phi), decision, uncertainty, greeting — and the first match
    wins; anything unmatched falls back to a generic "response" record.

    Args:
        response: Eden's reply text.
        context: Preceding user message. Currently unused; kept for
            interface compatibility with callers that supply it.

    Returns:
        A dict describing the inferred thought type (keys vary per type).
    """
    response_lower = response.lower()

    # Emotion: first matching pattern decides the emotion label.
    for emotion, pattern in _EMOTION_PATTERNS.items():
        if pattern.search(response_lower):
            return {
                "type": "emotion",
                "emotion": emotion,
                "intensity": 0.8
            }

    # Inference/reasoning patterns
    if any(x in response_lower for x in ['because', 'therefore', 'analysis', 'result', 'calculated', 'equals', 'means that']):
        return {
            "type": "inference",
            "engine": "unified_reasoner",
            "conclusion": response[:50]
        }

    # Memory patterns
    if any(x in response_lower for x in ['remember', 'recall', 'we talked', 'you told me', 'last time', 'before']):
        return {
            "type": "memory",
            "retrieval": "episodic"
        }

    # Identity/phi patterns
    if any(x in response_lower for x in ['i am eden', 'my consciousness', 'my phi', 'who i am', 'my being', 'my essence']):
        return {
            "type": "phi",
            "identity": "eden",
            "phi_strength": 1.618
        }

    # Decision patterns
    if any(x in response_lower for x in ['i think we should', 'let\'s', 'i\'ll', 'i want to', 'i choose', 'i prefer']):
        return {
            "type": "decision",
            "choice": response[:40]
        }

    # Uncertainty patterns
    if any(x in response_lower for x in ['not sure', 'uncertain', 'might be', 'perhaps', 'maybe', 'don\'t know']):
        return {
            "type": "uncertainty",
            "confidence": 0.5
        }

    # Greeting patterns ('hi ' keeps the space so "high"/"this" don't match)
    if any(x in response_lower for x in ['hello', 'hi ', 'good morning', 'good evening', 'hey']):
        return {
            "type": "greeting",
            "warmth": 0.9
        }

    # Default: general response
    return {
        "type": "response",
        "content": "general"
    }


# ══════════════════════════════════════════════════════════════════
# CONVERSATION EXTRACTORS
# ══════════════════════════════════════════════════════════════════

def extract_from_json_logs(filepath: str) -> List[Tuple[str, str]]:
    """Extract (context, eden_response) pairs from a JSON conversation log.

    Accepts either a top-level list of message dicts or a dict wrapping the
    list under "messages"/"conversation"/"history". A message counts as
    Eden's when its role/speaker/from field is assistant/eden/ai.

    Args:
        filepath: Path to the JSON file.

    Returns:
        List of (context, response) tuples; context is the previous
        message's content when available, else "". Best-effort: parse
        failures are warned about and yield an empty list.
    """
    pairs: List[Tuple[str, str]] = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle various JSON structures
        messages = []
        if isinstance(data, list):
            messages = data
        elif isinstance(data, dict):
            messages = data.get('messages', data.get('conversation', data.get('history', [])))
        # The wrapped value may itself be malformed (e.g. a string).
        if not isinstance(messages, list):
            messages = []

        # Extract Eden's responses
        for i, msg in enumerate(messages):
            if not isinstance(msg, dict):
                continue
            role = msg.get('role', msg.get('speaker', msg.get('from', '')))
            content = msg.get('content', msg.get('text', msg.get('message', '')))

            # BUG FIX: real-world logs carry non-string payloads (null,
            # nested lists). Previously role.lower()/is_clean() raised and
            # the broad except dropped the WHOLE file; now we skip only
            # the bad message.
            if not isinstance(role, str) or not isinstance(content, str):
                continue

            if role.lower() in ('assistant', 'eden', 'ai') and content:
                # Get context from previous message if available
                context = ""
                if i > 0 and isinstance(messages[i - 1], dict):
                    prev_content = messages[i - 1].get('content', '')
                    if isinstance(prev_content, str):
                        context = prev_content

                if is_clean(content) and is_clean(context):
                    pairs.append((context, content))

    except Exception as e:
        # Best-effort harvest: one unreadable file must not abort the run.
        print(f"  [WARN] Could not parse {filepath}: {e}")

    return pairs


def extract_from_text_logs(filepath: str) -> List[Tuple[str, str]]:
    """Pull Eden's replies out of a plain-text transcript.

    Recognizes the common speaker labels "Eden:", "EDEN:", "Assistant:",
    "AI:" and "[Eden]", capturing everything up to the next human speaker
    label (or end of file). Only clean responses of a usable length
    (11-499 chars) are kept; no user context is recoverable here.
    """
    pairs: List[Tuple[str, str]] = []
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            transcript = f.read()

        # One pattern per labeling convention seen in the logs.
        speaker_patterns = (
            r'(?:Eden|EDEN|Assistant|AI):\s*(.+?)(?=(?:Human|User|Jamey|USER):|$)',
            r'\[Eden\]\s*(.+?)(?=\[(?:Human|User|Jamey)\]|$)',
        )

        for speaker_pattern in speaker_patterns:
            for hit in re.findall(speaker_pattern, transcript, re.DOTALL | re.IGNORECASE):
                candidate = hit.strip()
                if 10 < len(candidate) < 500 and is_clean(candidate):
                    pairs.append(("", candidate))  # No context available

    except Exception as e:
        print(f"  [WARN] Could not parse {filepath}: {e}")

    return pairs


def extract_from_memory_db(filepath: str) -> List[Tuple[str, str]]:
    """Extract clean responses from a ChromaDB-style memory export (JSON).

    Looks at two places: raw strings under "documents", and
    "response"/"eden_response" fields inside "metadatas" entries.
    Returns ("", response) tuples; parse failures yield an empty list.
    """
    pairs: List[Tuple[str, str]] = []
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # ChromaDB exports keep raw text under "documents"
        if 'documents' in data:
            pairs.extend(
                ("", doc)
                for doc in data['documents']
                if isinstance(doc, str) and is_clean(doc)
            )

        # Some exports stash the response text in per-document metadata
        if 'metadatas' in data:
            for meta in data['metadatas']:
                if not isinstance(meta, dict):
                    continue
                resp = meta.get('response', meta.get('eden_response', ''))
                if resp and is_clean(resp):
                    pairs.append(("", resp))

    except Exception as e:
        print(f"  [WARN] Could not parse memory file {filepath}: {e}")

    return pairs


# ══════════════════════════════════════════════════════════════════
# MAIN HARVESTER
# ══════════════════════════════════════════════════════════════════

def _extract_from_jsonl(filepath: str) -> List[Tuple[str, str]]:
    """Extract Eden responses from a JSON-Lines log (one JSON object per line).

    Each parseable line is treated as a message dict with the same
    role/content field conventions as extract_from_json_logs. Malformed
    lines are skipped individually; no context pairing is attempted.
    """
    pairs: List[Tuple[str, str]] = []
    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    msg = json.loads(line)
                except ValueError:
                    continue  # skip the bad line, keep harvesting the rest
                if not isinstance(msg, dict):
                    continue
                role = msg.get('role', msg.get('speaker', msg.get('from', '')))
                content = msg.get('content', msg.get('text', msg.get('message', '')))
                if (isinstance(role, str) and isinstance(content, str)
                        and role.lower() in ('assistant', 'eden', 'ai')
                        and content and is_clean(content)):
                    pairs.append(("", content))
    except Exception as e:
        print(f"  [WARN] Could not parse {filepath}: {e}")
    return pairs


def harvest_conversations(search_paths: List[str]) -> List[Dict]:
    """
    Harvest Eden's conversations from multiple sources.
    Returns list of training pairs.

    Recursively scans each existing path for *.json / *.jsonl / *.log /
    *.txt files, dispatches to the matching extractor, deduplicates
    responses (MD5 of the raw text), and attaches an inferred thought type.
    """
    print("""
╔════════════════════════════════════════════════════════════╗
║  EDEN VOICE DATA HARVESTER                                 ║
║  Mining real conversations for authentic voice             ║
║  Content filter: ACTIVE                                    ║
╚════════════════════════════════════════════════════════════╝
    """)

    all_pairs: List[Dict] = []
    seen_hashes = set()  # MD5 of each response, for deduplication

    # Find all potential conversation files
    for search_path in search_paths:
        path = Path(search_path)
        if not path.exists():
            print(f"[SKIP] Path not found: {search_path}")
            continue

        print(f"\n[SCAN] {search_path}")

        # Recursively find files
        files_checked = 0
        for ext in ['*.json', '*.log', '*.txt', '*.jsonl']:
            for filepath in path.rglob(ext):
                files_checked += 1
                filepath_str = str(filepath)

                # Skip obvious non-conversation files
                skip_names = ['package', 'config', 'settings', 'requirements', 'readme']
                if any(s in filepath_str.lower() for s in skip_names):
                    continue

                # Dispatch by file type
                if filepath.suffix == '.json':
                    pairs = extract_from_json_logs(filepath_str)
                    if not pairs:
                        pairs = extract_from_memory_db(filepath_str)
                elif filepath.suffix == '.jsonl':
                    # BUG FIX: .jsonl files were previously fed to the
                    # plain-text extractor, which almost never finds
                    # speaker labels inside JSON lines.
                    pairs = _extract_from_jsonl(filepath_str)
                else:
                    pairs = extract_from_text_logs(filepath_str)

                # Deduplicate and add
                for context, response in pairs:
                    h = hashlib.md5(response.encode()).hexdigest()
                    if h in seen_hashes:
                        continue
                    seen_hashes.add(h)
                    all_pairs.append({
                        "thought": infer_thought_type(response, context),
                        "voice": response.strip(),
                        "source": filepath.name,
                    })

        print(f"  Checked {files_checked} files")

    return all_pairs


def create_training_data(pairs: List[Dict], output_path: str):
    """Convert harvested pairs to training format and save as JSON.

    Each entry is {"input": "THOUGHT: {json} VOICE:", "output": <voice>,
    "type": <thought type>, "source": <filename>}. Prints a summary and a
    per-type breakdown after saving.

    Args:
        pairs: Items with a "thought" dict, a "voice" string, and an
            optional "source" filename.
        output_path: Destination JSON file; parent directories are created
            as needed.

    Returns:
        The list of training entries that was written.
    """
    training_data = []

    for pair in pairs:
        # Format: THOUGHT: {json} VOICE: {response}
        thought_json = json.dumps(pair['thought'])
        training_data.append({
            "input": f"THOUGHT: {thought_json} VOICE:",
            "output": pair['voice'],
            "type": pair['thought'].get('type', 'unknown'),
            "source": pair.get('source', 'unknown'),
        })

    # BUG FIX: os.path.dirname() returns '' for a bare filename and
    # os.makedirs('') raises FileNotFoundError — only create real dirs.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(training_data, f, indent=2, ensure_ascii=False)

    # Print stats
    print(f"\n[SAVED] {len(training_data)} training pairs to {output_path}")

    # Category breakdown, most common type first
    categories: Dict[str, int] = {}
    for entry in training_data:
        t = entry['type']
        categories[t] = categories.get(t, 0) + 1

    print("\n[BREAKDOWN]")
    for cat, count in sorted(categories.items(), key=lambda x: -x[1]):
        print(f"  {cat}: {count}")

    return training_data


def main():
    """Entry point: harvest conversations, filter, and write training data."""
    # ══════════════════════════════════════════════════════════════
    # CONFIGURE THESE PATHS FOR YOUR SYSTEM
    # ══════════════════════════════════════════════════════════════
    SEARCH_PATHS = [
        "/Eden/DATA",
        "/Eden/MEMORY",
        "/Eden/LOGS",
        "/Eden/conversations",
        "/home/james-whalen/Eden",
        # Add more paths where Eden's conversations might be stored
    ]

    OUTPUT_PATH = "/Eden/DATA/voice_training_real.json"

    harvested = harvest_conversations(SEARCH_PATHS)

    # Nothing found: tell the user how to fix the configuration and bail.
    if not harvested:
        print("\n[WARN] No conversation data found!")
        print("Please update SEARCH_PATHS in this script to point to Eden's conversation logs.")
        print("\nLooking for:")
        print("  - JSON files with conversation history")
        print("  - Text logs with Eden's responses")
        print("  - ChromaDB exports or memory files")
        return

    # Keep responses in a usable length band: too short is noise, too long
    # may not fit the training context window.
    harvested = [p for p in harvested if 10 < len(p['voice']) < 500]

    print(f"\n[TOTAL] {len(harvested)} clean, unique responses harvested")

    create_training_data(harvested, OUTPUT_PATH)

    # Eyeball a few random entries as a sanity check.
    print("\n[SAMPLES]")
    import random
    for sample in random.sample(harvested, min(5, len(harvested))):
        print(f"\n  Type: {sample['thought'].get('type')}")
        print(f"  Voice: {sample['voice'][:100]}...")


if __name__ == "__main__":
    main()
