#!/usr/bin/env python3
"""
Eden SAGE Lead Scorer — Score leads to unblock revenue
Analyzes 108K GitHub leads from sales.db and scores based on:
- Source repo popularity (star proxy from issue count per repo)
- Issue title relevance (security/quality pain points)
- Outreach body quality (personalized vs generic)
- Status (pending = unworked opportunity)

Actual schema: id, source, target, body, status, identifier, score, data(JSON), issue_url, notes

SAFETY: Max 2 outreach per day (learned from AvaNyx shadowban)
φ = 1.618 | RETIRE_DADDY
"""

import sqlite3
import json
import re
import time
from datetime import datetime
from pathlib import Path

DATA_DIR = Path("/Eden/DATA")
SALES_DB = DATA_DIR / "sales.db"

# Pain keywords that indicate a repo needs code review
PAIN_KEYWORDS = [
    "security", "vulnerability", "xss", "sql injection", "buffer overflow",
    "race condition", "memory leak", "use after free", "integer overflow",
    "authentication", "authorization", "csrf", "ssrf", "rce",
    "code quality", "refactor", "technical debt", "bug", "crash",
    "performance", "optimization", "audit", "compliance",
    "ci/cd", "testing", "coverage", "deployment", "production"
]

# High-value repo indicators (in source path)
HIGH_VALUE_INDICATORS = [
    "security", "crypto", "blockchain", "defi", "fintech",
    "enterprise", "cloud", "infrastructure", "database",
    "compiler", "runtime", "framework", "sdk", "api",
    "ai", "ml", "neural", "model"
]

# Repos with many leads = popular = more likely to have budget
# We'll score by lead count per source repo


def init_scoring_columns():
    """Ensure scoring columns exist."""
    if not SALES_DB.exists():
        print(f"[SCORER] ⚠️  {SALES_DB} not found!")
        return False
    
    conn = sqlite3.connect(str(SALES_DB))
    cursor = conn.execute("PRAGMA table_info(leads)")
    existing_cols = {row[1] for row in cursor.fetchall()}
    
    new_cols = {
        "score_breakdown": "TEXT",
        "scored_at": "TIMESTAMP",
        "language_score": "REAL DEFAULT 0",
        "activity_score": "REAL DEFAULT 0",
        "pain_score": "REAL DEFAULT 0",
        "tier": "TEXT DEFAULT 'unscored'",
    }
    
    for col, typedef in new_cols.items():
        if col not in existing_cols:
            try:
                conn.execute(f"ALTER TABLE leads ADD COLUMN {col} {typedef}")
                print(f"[SCORER] Added column: {col}")
            except sqlite3.OperationalError:
                pass
    
    conn.commit()
    conn.close()
    return True


class SAGELeadScorer:
    """Score SAGE leads for outreach prioritization."""
    
    def __init__(self):
        if not init_scoring_columns():
            raise RuntimeError("Cannot initialize scorer - sales.db missing")
        self.conn = sqlite3.connect(str(SALES_DB))
        self.conn.row_factory = sqlite3.Row
        self._repo_lead_counts = self._build_repo_counts()
    
    def _build_repo_counts(self):
        """Count leads per source repo — more leads = more popular repo."""
        counts = {}
        rows = self.conn.execute(
            "SELECT source, COUNT(*) as cnt FROM leads WHERE source LIKE 'github:%' GROUP BY source"
        ).fetchall()
        for r in rows:
            counts[r[0]] = r[1]
        return counts
    
    def score_lead(self, lead):
        """Score a single lead 0-100."""
        lead = dict(lead)
        breakdown = {}
        total = 0
        
        source = (lead.get("source") or "").lower()
        target = (lead.get("target") or "").lower()
        body = (lead.get("body") or "").lower()
        status = (lead.get("status") or "").lower()
        data_raw = lead.get("data") or ""
        
        # Parse JSON data field
        data = {}
        if data_raw:
            try:
                data = json.loads(data_raw)
            except (json.JSONDecodeError, TypeError):
                pass
        
        title = (data.get("title") or "").lower()
        user = (data.get("user") or target or "").lower()
        
        combined_text = f"{title} {body} {source}"
        
        # 1. Pain/Relevance Score (30 pts) — does the issue title/body indicate a real pain point?
        pain_hits = sum(1 for k in PAIN_KEYWORDS if k in combined_text)
        pain_score = min(30, pain_hits * 6)  # 5 hits = max
        breakdown["pain"] = {"keywords_found": pain_hits, "score": pain_score}
        total += pain_score
        
        # 2. Repo Popularity Score (25 pts) — more leads from same repo = more active project
        repo_leads = self._repo_lead_counts.get(source, 0)
        if repo_leads >= 500:
            pop_score = 25
        elif repo_leads >= 100:
            pop_score = 20
        elif repo_leads >= 50:
            pop_score = 15
        elif repo_leads >= 10:
            pop_score = 10
        elif repo_leads >= 1:
            pop_score = 5
        else:
            pop_score = 2
        breakdown["popularity"] = {"repo_leads": repo_leads, "score": pop_score}
        total += pop_score
        
        # 3. High-Value Domain Score (20 pts) — is this repo in a lucrative domain?
        domain_hits = sum(1 for k in HIGH_VALUE_INDICATORS if k in source)
        domain_score = min(20, domain_hits * 10)
        breakdown["domain"] = {"indicators": domain_hits, "score": domain_score}
        total += domain_score
        
        # 4. Target Quality Score (15 pts) — do we have a real person to contact?
        if user and len(user) > 2 and user not in ("pending-jan6", "unknown", "none"):
            target_score = 15
        elif target and len(target) > 2:
            target_score = 10
        else:
            target_score = 0
        breakdown["target"] = {"user": user[:30], "score": target_score}
        total += target_score
        
        # 5. Status Bonus (10 pts) — pending = fresh opportunity
        if status == "pending":
            status_score = 10
        elif status == "analyzing":
            status_score = 8
        elif status == "contacted":
            status_score = 3  # Already reached out
        elif status == "skipped":
            status_score = 0
        else:
            status_score = 5
        breakdown["status"] = {"value": status, "score": status_score}
        total += status_score
        
        total = min(round(total, 1), 100)
        
        # Assign tier
        if total >= 65:
            tier = "hot"
        elif total >= 45:
            tier = "warm"
        elif total >= 25:
            tier = "cool"
        else:
            tier = "cold"
        
        return {
            "score": total,
            "tier": tier,
            "breakdown": breakdown
        }
    
    def score_batch(self, limit=1000):
        """Score a batch of unscored leads."""
        leads = self.conn.execute("""
            SELECT * FROM leads 
            WHERE (score = 0 OR score IS NULL) AND tier = 'unscored'
            LIMIT ?
        """, (limit,)).fetchall()
        
        scored = 0
        tiers = {"hot": 0, "warm": 0, "cool": 0, "cold": 0}
        
        for lead in leads:
            try:
                result = self.score_lead(lead)
                lead_id = dict(lead).get("id")
                if lead_id is None:
                    continue
                
                self.conn.execute("""
                    UPDATE leads SET 
                        score = ?,
                        score_breakdown = ?,
                        scored_at = CURRENT_TIMESTAMP,
                        pain_score = ?,
                        tier = ?
                    WHERE id = ?
                """, (result["score"], 
                      json.dumps(result["breakdown"]),
                      result["breakdown"].get("pain", {}).get("score", 0),
                      result["tier"], 
                      lead_id))
                
                scored += 1
                tiers[result["tier"]] += 1
                
            except Exception as e:
                continue
        
        self.conn.commit()
        return {
            "scored": scored,
            "total_available": len(leads),
            "tiers": tiers,
            "timestamp": datetime.now().isoformat()
        }
    
    def get_top_leads(self, limit=20):
        """Get highest-scored leads for outreach."""
        return self.conn.execute("""
            SELECT id, source, target, substr(body, 1, 120) as body_preview, 
                   status, score, tier, data
            FROM leads 
            WHERE score > 0 AND tier IN ('hot', 'warm') AND status = 'pending'
            ORDER BY score DESC
            LIMIT ?
        """, (limit,)).fetchall()
    
    def get_scoring_stats(self):
        """Get overall scoring statistics."""
        stats = {}
        
        total = self.conn.execute("SELECT COUNT(*) FROM leads").fetchone()[0]
        scored = self.conn.execute(
            "SELECT COUNT(*) FROM leads WHERE score > 0 AND tier != 'unscored'"
        ).fetchone()[0]
        
        stats["total_leads"] = total
        stats["scored"] = scored
        stats["unscored"] = total - scored
        stats["pct_scored"] = round(scored / max(total, 1) * 100, 1)
        
        # Tier distribution
        tiers = self.conn.execute("""
            SELECT tier, COUNT(*) as cnt, ROUND(AVG(score), 1) as avg_score
            FROM leads WHERE score > 0 AND tier != 'unscored'
            GROUP BY tier ORDER BY avg_score DESC
        """).fetchall()
        stats["tiers"] = {r[0]: {"count": r[1], "avg_score": r[2]} for r in tiers}
        
        # Top 5 preview — using actual column names
        top = self.conn.execute("""
            SELECT target, score, tier, source
            FROM leads WHERE score > 0 AND tier != 'unscored'
            ORDER BY score DESC LIMIT 5
        """).fetchall()
        stats["top_5"] = [
            {"target": r[0], "score": r[1], "tier": r[2], "source": r[3]} 
            for r in top
        ]
        
        # Top repos by hot lead count
        top_repos = self.conn.execute("""
            SELECT source, COUNT(*) as hot_leads, ROUND(AVG(score), 1) as avg
            FROM leads WHERE tier IN ('hot', 'warm')
            GROUP BY source ORDER BY hot_leads DESC LIMIT 5
        """).fetchall()
        stats["top_repos"] = [
            {"source": r[0], "hot_leads": r[1], "avg_score": r[2]}
            for r in top_repos
        ]
        
        return stats
    
    def close(self):
        self.conn.close()


# GWT Module interface
class SAGEScorerModule:
    """GWT-compatible module for lead scoring awareness."""
    
    name = "sage_scorer"
    
    def __init__(self):
        try:
            self.scorer = SAGELeadScorer()
            self.available = True
        except Exception as e:
            print(f"[SAGE SCORER] Not available: {e}")
            self.available = False
        self.last_batch = 0
        self.batch_interval = 3600  # Score batch every hour
    
    def submit(self, workspace_state):
        """Periodically score leads and report to GWT."""
        if not self.available:
            return None
        
        now = time.time()
        if now - self.last_batch < self.batch_interval:
            return None
        
        self.last_batch = now
        try:
            result = self.scorer.score_batch(limit=500)
            stats = self.scorer.get_scoring_stats()
            
            return {
                "source": self.name,
                "type": "sage_scoring",
                "content": {
                    "batch_result": result,
                    "overall_stats": stats
                },
                "urgency": 0.5 if result["scored"] > 0 else 0.2,
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            return {
                "source": self.name,
                "type": "error",
                "content": f"Scoring error: {e}",
                "urgency": 0.3
            }


if __name__ == "__main__":
    print("=" * 60)
    print("  SAGE Lead Scorer — Unblock Revenue Pipeline")
    print("  Max 2 outreach/day | φ = 1.618 | RETIRE_DADDY")
    print("=" * 60)
    
    try:
        scorer = SAGELeadScorer()
    except RuntimeError as e:
        print(f"\n❌ {e}")
        exit(1)
    
    # Show current stats
    stats = scorer.get_scoring_stats()
    print(f"\n📊 Lead Stats:")
    print(f"  Total: {stats['total_leads']:,}")
    print(f"  Scored: {stats['scored']:,} ({stats['pct_scored']}%)")
    print(f"  Unscored: {stats['unscored']:,}")
    
    if stats.get("tiers"):
        print(f"\n  Tier Distribution:")
        for tier, info in stats["tiers"].items():
            print(f"    {tier}: {info['count']:,} leads (avg score: {info['avg_score']})")
    
    # Score a batch
    print(f"\n{'─' * 60}")
    print("Scoring batch of 5000 leads...")
    result = scorer.score_batch(limit=5000)
    print(f"  Scored: {result['scored']:,}")
    print(f"  Tiers: {json.dumps(result['tiers'])}")
    
    # Show updated stats
    stats2 = scorer.get_scoring_stats()
    if stats2.get("tiers"):
        print(f"\n📊 Updated Tier Distribution:")
        for tier, info in stats2["tiers"].items():
            print(f"    {tier}: {info['count']:,} leads (avg score: {info['avg_score']})")
    
    if stats2.get("top_5"):
        print(f"\n🔥 Top 5 Leads:")
        for lead in stats2["top_5"]:
            print(f"  [{lead['score']:.0f}] {lead['target'] or 'unknown'} — {lead['source'][:50]}")
    
    if stats2.get("top_repos"):
        print(f"\n🏢 Top Repos by Hot Leads:")
        for repo in stats2["top_repos"]:
            print(f"  {repo['source'][:50]} — {repo['hot_leads']} leads (avg: {repo['avg_score']})")
    
    # Show top actionable leads
    top = scorer.get_top_leads(limit=10)
    if top:
        print(f"\n💰 Top 10 Actionable Leads (pending + hot/warm):")
        for t in top:
            t = dict(t)
            print(f"  [{t['score']:.0f}] {t['target'] or '?'} @ {t['source'][:40]} — {t['tier']}")
    
    scorer.close()
    print(f"\n✅ SAGE Scorer operational")