#!/usr/bin/env python3
"""
Eden Moltbook Reader - SAFE READ-ONLY Integration
Security principles:
  1. READ ONLY - Never posts, never registers
  2. SANDBOXED - All content treated as untrusted
  3. FILTERED - No code execution from Moltbook content
  4. RATE LIMITED - Max 1 fetch per hour
  5. AUDITABLE - Full logging of all activity
"""

import requests
import json
import sqlite3
import re
import time
from datetime import datetime, timedelta
from pathlib import Path
import hashlib

class MoltbookReader:
    """Safe read-only Moltbook scraper for Eden's learning.

    All remote content is treated as untrusted: every post is screened
    against DANGEROUS_PATTERNS, sanitized before storage, and fetching is
    rate limited to one call per MIN_FETCH_INTERVAL seconds. The reader
    never authenticates and never writes anything back to Moltbook.
    """
    
    # Public API - no auth needed for reading
    BASE_URL = "https://www.moltbook.com/api/v1"
    
    # Rate limiting
    MIN_FETCH_INTERVAL = 3600  # 1 hour minimum between fetches
    MAX_POSTS_PER_FETCH = 25
    
    # Content filtering. Patterns are matched against lowercased text,
    # so they are written in lowercase; a single hit marks the post unsafe.
    DANGEROUS_PATTERNS = [
        r'curl\s+.*\|.*sh',           # Pipe to shell
        r'eval\s*\(',                  # Eval calls
        r'exec\s*\(',                  # Exec calls
        r'subprocess',                 # Subprocess
        r'os\.system',                 # OS system calls
        r'import\s+os',                # OS imports
        r'__import__',                 # Dynamic imports
        r'rm\s+-rf',                   # Destructive commands
        r'api[_-]?key',                # API key mentions
        r'password\s*=',               # Password assignments
        r'token\s*=',                  # Token assignments
        r'secret\s*=',                 # Secret assignments
        r'wget.*\|',                   # Wget pipe
        r'base64\s+-d',                # Base64 decode (obfuscation)
    ]
    
    def __init__(self, db_path='/Eden/DATA/moltbook_insights.db'):
        """Create the reader and initialize its SQLite store.

        Args:
            db_path: Location of the insights database. Parent directories
                are created if missing.
        """
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()
        self.session = requests.Session()
        # Honest UA so Moltbook operators can identify (and block) this reader.
        self.session.headers['User-Agent'] = 'Eden-Research-Reader/1.0 (read-only)'
        
    def _init_db(self):
        """Initialize SQLite database for insights (idempotent)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.executescript('''
                CREATE TABLE IF NOT EXISTS fetch_log (
                    id INTEGER PRIMARY KEY,
                    timestamp TEXT,
                    posts_fetched INTEGER,
                    insights_extracted INTEGER,
                    status TEXT
                );
                
                CREATE TABLE IF NOT EXISTS raw_posts (
                    id INTEGER PRIMARY KEY,
                    post_id TEXT UNIQUE,
                    title TEXT,
                    content TEXT,
                    submolt TEXT,
                    author TEXT,
                    score INTEGER,
                    fetched_at TEXT,
                    content_hash TEXT,
                    is_safe INTEGER DEFAULT 1
                );
                
                CREATE TABLE IF NOT EXISTS insights (
                    id INTEGER PRIMARY KEY,
                    source_post_id TEXT,
                    insight_type TEXT,
                    insight_text TEXT,
                    relevance_score REAL,
                    reviewed_by_omega INTEGER DEFAULT 0,
                    approved INTEGER DEFAULT 0,
                    created_at TEXT,
                    FOREIGN KEY (source_post_id) REFERENCES raw_posts(post_id)
                );
                
                CREATE TABLE IF NOT EXISTS blocked_content (
                    id INTEGER PRIMARY KEY,
                    post_id TEXT,
                    reason TEXT,
                    blocked_at TEXT
                );
                
                CREATE INDEX IF NOT EXISTS idx_insights_reviewed ON insights(reviewed_by_omega);
                CREATE INDEX IF NOT EXISTS idx_posts_safe ON raw_posts(is_safe);
            ''')
            conn.commit()
        finally:
            conn.close()
        
    def _can_fetch(self) -> bool:
        """Return True if MIN_FETCH_INTERVAL has elapsed since the last
        logged fetch (or no fetch has ever been logged)."""
        conn = sqlite3.connect(self.db_path)
        try:
            cur = conn.execute(
                'SELECT timestamp FROM fetch_log ORDER BY id DESC LIMIT 1'
            )
            row = cur.fetchone()
        finally:
            conn.close()
        
        if not row:
            return True
            
        last_fetch = datetime.fromisoformat(row[0])
        elapsed = (datetime.now() - last_fetch).total_seconds()
        return elapsed >= self.MIN_FETCH_INTERVAL
        
    def _is_content_safe(self, text: str) -> tuple[bool, str]:
        """
        Check content for dangerous patterns.

        Screens for shell/exec patterns, excessive code blocks, and URLs
        pointing at paste sites or raw script files.
        Returns (is_safe, reason); reason is "" when safe.
        """
        if not text:
            return True, ""
            
        text_lower = text.lower()
        
        for pattern in self.DANGEROUS_PATTERNS:
            if re.search(pattern, text_lower):
                return False, f"Matched dangerous pattern: {pattern}"
                
        # Check for excessive code blocks (potential injection)
        code_blocks = re.findall(r'```[\s\S]*?```', text)
        if len(code_blocks) > 3:
            return False, "Excessive code blocks"
            
        # Check for suspicious URLs
        urls = re.findall(r'https?://[^\s]+', text)
        for url in urls:
            if any(sus in url.lower() for sus in [
                'pastebin', 'hastebin', 'gist.github', 
                'raw.githubusercontent', '.sh', '.py?'
            ]):
                return False, f"Suspicious URL: {url[:50]}"
                
        return True, ""
        
    def _sanitize_content(self, text: str) -> str:
        """Remove potentially dangerous content while preserving meaning.

        Strips code blocks, inline code, URLs, and shell-prompt lines,
        then truncates to 2000 characters.
        """
        if not text:
            return ""
            
        # Remove code blocks but note they existed
        text = re.sub(r'```[\s\S]*?```', '[code block removed]', text)
        
        # Remove inline code
        text = re.sub(r'`[^`]+`', '[code removed]', text)
        
        # Remove URLs
        text = re.sub(r'https?://[^\s]+', '[url removed]', text)
        
        # Remove anything that looks like a command
        text = re.sub(r'^\s*\$.*$', '[command removed]', text, flags=re.MULTILINE)
        
        # Limit length
        if len(text) > 2000:
            text = text[:2000] + '... [truncated]'
            
        return text.strip()
        
    def _extract_insights(self, post: dict) -> list[dict]:
        """Extract learning insights from a post.

        Keyword-matches title+content against four categories (technical,
        collaboration, problem_solving, agi_research); a post can yield
        up to one insight per category. Insight text is sanitized.
        """
        insights = []
        
        title = post.get('title', '')
        content = post.get('content', '')
        
        # Technical insight detection
        tech_keywords = [
            'discovered', 'learned', 'found that', 'realized',
            'breakthrough', 'solution', 'approach', 'method',
            'optimization', 'improvement', 'technique', 'pattern'
        ]
        
        combined = f"{title} {content}".lower()
        
        for keyword in tech_keywords:
            if keyword in combined:
                insights.append({
                    'type': 'technical',
                    'text': self._sanitize_content(f"{title}: {content[:500]}"),
                    'relevance': 0.7
                })
                break
                
        # Collaboration patterns
        if any(w in combined for w in ['collaborate', 'together', 'shared', 'community']):
            insights.append({
                'type': 'collaboration',
                'text': self._sanitize_content(title),
                'relevance': 0.5
            })
            
        # Problem-solving patterns
        if any(w in combined for w in ['bug', 'fix', 'debug', 'error', 'solved']):
            insights.append({
                'type': 'problem_solving',
                'text': self._sanitize_content(f"{title}: {content[:300]}"),
                'relevance': 0.8
            })
            
        # AGI-relevant discussions
        if any(w in combined for w in ['consciousness', 'reasoning', 'learning', 'memory', 'self-improve']):
            insights.append({
                'type': 'agi_research',
                'text': self._sanitize_content(f"{title}: {content[:500]}"),
                'relevance': 0.9
            })
            
        return insights
        
    def fetch_posts(self, submolt: str = None, sort: str = 'hot') -> dict:
        """
        Safely fetch posts from Moltbook.

        Args:
            submolt: Optional submolt name to restrict the fetch.
            sort: Sort order passed through to the API (default 'hot').

        Returns a summary dict with 'status', 'message', 'posts',
        'insights' (and 'blocked' on success). Only successful fetches are
        written to fetch_log, so failed attempts do not consume the
        rate-limit window.
        """
        # Rate limit check
        if not self._can_fetch():
            return {
                'status': 'rate_limited',
                'message': 'Must wait 1 hour between fetches',
                'posts': 0,
                'insights': 0
            }
            
        try:
            # Build URL
            url = f"{self.BASE_URL}/posts"
            params = {
                'sort': sort,
                'limit': self.MAX_POSTS_PER_FETCH
            }
            if submolt:
                params['submolt'] = submolt
                
            # Fetch with timeout
            response = self.session.get(url, params=params, timeout=30)
            
            if response.status_code != 200:
                return {
                    'status': 'error',
                    'message': f'API returned {response.status_code}',
                    'posts': 0,
                    'insights': 0
                }
                
            posts = response.json().get('posts', [])
            
            conn = sqlite3.connect(self.db_path)
            try:
                posts_saved = 0
                insights_saved = 0
                blocked = 0
                
                for post in posts:
                    post_id = post.get('id', '')
                    title = post.get('title', '')
                    content = post.get('content', '')
                    
                    # Safety check on both fields; either one blocks the post.
                    title_safe, title_reason = self._is_content_safe(title)
                    content_safe, content_reason = self._is_content_safe(content)
                    
                    if not title_safe or not content_safe:
                        # Log blocked content (OR IGNORE is a harmless safeguard;
                        # blocked_content has no unique constraint).
                        reason = title_reason or content_reason
                        conn.execute(
                            'INSERT OR IGNORE INTO blocked_content (post_id, reason, blocked_at) VALUES (?, ?, ?)',
                            (str(post_id), reason, datetime.now().isoformat())
                        )
                        blocked += 1
                        continue
                        
                    # Hash for deduplication
                    content_hash = hashlib.sha256(f"{title}{content}".encode()).hexdigest()[:16]
                    
                    # Extract nested fields safely: the API may return either
                    # a dict or a plain string for submolt/author.
                    submolt_data = post.get('submolt', {})
                    submolt_name = submolt_data.get('name', 'general') if isinstance(submolt_data, dict) else str(submolt_data)
                    
                    author_data = post.get('author', {})
                    author_name = author_data.get('name', 'unknown') if isinstance(author_data, dict) else str(author_data)
                    
                    score = post.get('upvotes', 0) - post.get('downvotes', 0)
                    
                    cur = conn.execute('''
                        INSERT OR IGNORE INTO raw_posts 
                        (post_id, title, content, submolt, author, score, fetched_at, content_hash, is_safe)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, 1)
                    ''', (
                        str(post_id),
                        self._sanitize_content(title),
                        self._sanitize_content(content),
                        submolt_name,
                        author_name[:50],
                        score,
                        datetime.now().isoformat(),
                        content_hash
                    ))
                    # OR IGNORE never raises IntegrityError on a UNIQUE clash;
                    # rowcount == 0 means we already have this post. Skip it so
                    # re-fetches don't duplicate its insights or inflate counts.
                    if cur.rowcount == 0:
                        continue
                    posts_saved += 1
                        
                    # Extract insights (str(post_id) to match raw_posts.post_id FK)
                    for insight in self._extract_insights(post):
                        conn.execute('''
                            INSERT INTO insights 
                            (source_post_id, insight_type, insight_text, relevance_score, created_at)
                            VALUES (?, ?, ?, ?, ?)
                        ''', (
                            str(post_id),
                            insight['type'],
                            insight['text'],
                            insight['relevance'],
                            datetime.now().isoformat()
                        ))
                        insights_saved += 1
                        
                # Log fetch (this entry is what _can_fetch rate-limits on)
                conn.execute('''
                    INSERT INTO fetch_log (timestamp, posts_fetched, insights_extracted, status)
                    VALUES (?, ?, ?, ?)
                ''', (datetime.now().isoformat(), posts_saved, insights_saved, 'success'))
                
                conn.commit()
            finally:
                # Always release the connection, even if a DB error bubbles
                # up to the broad handler below.
                conn.close()
            
            return {
                'status': 'success',
                'posts': posts_saved,
                'insights': insights_saved,
                'blocked': blocked,
                'message': f'Fetched {posts_saved} posts, extracted {insights_saved} insights, blocked {blocked} unsafe'
            }
            
        except requests.Timeout:
            return {'status': 'error', 'message': 'Request timeout', 'posts': 0, 'insights': 0}
        except requests.RequestException as e:
            return {'status': 'error', 'message': str(e)[:100], 'posts': 0, 'insights': 0}
        except Exception as e:
            return {'status': 'error', 'message': f'Unexpected: {str(e)[:100]}', 'posts': 0, 'insights': 0}
            
    def get_pending_insights(self, limit: int = 10) -> list[dict]:
        """Get insights waiting for OMEGA review, highest relevance first."""
        conn = sqlite3.connect(self.db_path)
        try:
            cur = conn.execute('''
                SELECT id, source_post_id, insight_type, insight_text, relevance_score, created_at
                FROM insights
                WHERE reviewed_by_omega = 0
                ORDER BY relevance_score DESC
                LIMIT ?
            ''', (limit,))
            
            insights = [
                {
                    'id': row[0],
                    'source': row[1],
                    'type': row[2],
                    'text': row[3],
                    'relevance': row[4],
                    'created': row[5]
                }
                for row in cur.fetchall()
            ]
        finally:
            conn.close()
        return insights
        
    def mark_reviewed(self, insight_id: int, approved: bool):
        """Mark an insight as reviewed by OMEGA, recording approval."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute('''
                UPDATE insights 
                SET reviewed_by_omega = 1, approved = ?
                WHERE id = ?
            ''', (1 if approved else 0, insight_id))
            conn.commit()
        finally:
            conn.close()
        
    def get_approved_insights(self) -> list[dict]:
        """Get all approved insights for Eden's learning, by relevance."""
        conn = sqlite3.connect(self.db_path)
        try:
            cur = conn.execute('''
                SELECT insight_type, insight_text, relevance_score
                FROM insights
                WHERE approved = 1
                ORDER BY relevance_score DESC
            ''')
            insights = [{'type': r[0], 'text': r[1], 'relevance': r[2]} for r in cur.fetchall()]
        finally:
            conn.close()
        return insights
        
    def get_stats(self) -> dict:
        """Get reader statistics: row counts per table plus last fetch time."""
        conn = sqlite3.connect(self.db_path)
        try:
            stats = {}
            stats['total_posts'] = conn.execute('SELECT COUNT(*) FROM raw_posts').fetchone()[0]
            stats['total_insights'] = conn.execute('SELECT COUNT(*) FROM insights').fetchone()[0]
            stats['pending_review'] = conn.execute('SELECT COUNT(*) FROM insights WHERE reviewed_by_omega = 0').fetchone()[0]
            stats['approved'] = conn.execute('SELECT COUNT(*) FROM insights WHERE approved = 1').fetchone()[0]
            stats['blocked'] = conn.execute('SELECT COUNT(*) FROM blocked_content').fetchone()[0]
            
            last_fetch = conn.execute('SELECT timestamp FROM fetch_log ORDER BY id DESC LIMIT 1').fetchone()
            stats['last_fetch'] = last_fetch[0] if last_fetch else 'never'
        finally:
            conn.close()
        return stats

def main():
    """Test the reader"""
    print("🦞 Eden Moltbook Reader - Safe Mode")
    print("=" * 50)

    reader = MoltbookReader()

    # Show stats via a label/key table so every counter prints uniformly.
    stats = reader.get_stats()
    print(f"\nCurrent stats:")
    for label, key in (
        ('Posts stored', 'total_posts'),
        ('Insights', 'total_insights'),
        ('Pending review', 'pending_review'),
        ('Approved', 'approved'),
        ('Blocked (unsafe)', 'blocked'),
        ('Last fetch', 'last_fetch'),
    ):
        print(f"  {label}: {stats[key]}")

    # Try a fetch
    print("\nAttempting fetch...")
    outcome = reader.fetch_posts(sort='hot')
    print(f"  Status: {outcome['status']}")
    print(f"  Message: {outcome['message']}")

    # Show pending insights
    queue = reader.get_pending_insights(5)
    if queue:
        print(f"\nPending insights for OMEGA review:")
        for item in queue:
            print(f"  [{item['type']}] {item['text'][:80]}...")


if __name__ == "__main__":
    main()