#!/usr/bin/env python3
import time
"""
Eden's RAG - Upgraded with Intelligent Chunking
METADATA + MEANINGFUL CHUNKS = Less Hallucination
"""
import chromadb
import requests
import re
from pathlib import Path
from typing import List, Dict, Any, Optional
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    Language,
    MarkdownHeaderTextSplitter,
    HTMLHeaderTextSplitter,
)

PHI = 1.618033988749895

class EdenRAG:
    """Chroma-backed knowledge store with content-aware chunking.

    Documents are split with a splitter matched to their detected type
    (syntax-aware for code, header-aware for markdown/html, recursive
    character splitting otherwise), embedded via a local Ollama server,
    and persisted in a Chroma collection.
    """

    def __init__(self, db_path: str = "/Eden/KNOWLEDGE/vector_db"):
        """Open (or create) the persistent vector store at *db_path*."""
        print("🧠 Initializing Eden RAG with intelligent chunking...")
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self.client = chromadb.PersistentClient(path=str(self.db_path))
        self.collection = self.client.get_or_create_collection("eden_knowledge")
        self._init_splitters()
        print(f"✅ RAG ready ({self.collection.count()} docs) | φ = {PHI}")

    def _init_splitters(self):
        """Build one splitter per supported content type."""
        # Language-aware splitters keep code chunks aligned with syntactic
        # boundaries instead of cutting mid-statement.
        self.code_splitters = {
            'python': RecursiveCharacterTextSplitter.from_language(language=Language.PYTHON, chunk_size=800, chunk_overlap=100),
            'javascript': RecursiveCharacterTextSplitter.from_language(language=Language.JS, chunk_size=800, chunk_overlap=100),
            'rust': RecursiveCharacterTextSplitter.from_language(language=Language.RUST, chunk_size=800, chunk_overlap=100),
            'go': RecursiveCharacterTextSplitter.from_language(language=Language.GO, chunk_size=800, chunk_overlap=100),
        }
        self.md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")])
        self.html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("h1", "h1"), ("h2", "h2"), ("h3", "h3")])
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

    def detect_type(self, content: str, filename: Optional[str] = None) -> str:
        """Guess a content type from the filename extension, else from content.

        The extension takes priority; content heuristics are checked in
        order (python, json, markdown) with 'text' as the fallback.
        """
        if filename:
            ext_map = {'.py': 'python', '.js': 'javascript', '.rs': 'rust', '.go': 'go', '.sql': 'sql', '.md': 'markdown', '.html': 'html', '.json': 'json'}
            ext = Path(filename).suffix.lower()
            if ext in ext_map:
                return ext_map[ext]
        if 'def ' in content and ':' in content:
            return 'python'
        if content.strip().startswith('{'):
            return 'json'
        if content.strip().startswith('# '):
            return 'markdown'
        return 'text'

    def chunk(self, content: str, content_type: Optional[str] = None, filename: Optional[str] = None) -> List[Dict[str, Any]]:
        """Split *content* into dicts of {'text': ..., 'metadata': ...}."""
        if not content_type:
            content_type = self.detect_type(content, filename)
        chunks = []
        base_meta = {'source': filename or 'input', 'type': content_type}

        if content_type in self.code_splitters:
            docs = self.code_splitters[content_type].create_documents([content])
            for i, doc in enumerate(docs):
                chunks.append({'text': doc.page_content, 'metadata': {**base_meta, 'chunk': i, 'lang': content_type}})
        elif content_type == 'markdown':
            # Header metadata (h1/h2/h3) from the splitter is merged into
            # each chunk's metadata.
            for i, doc in enumerate(self.md_splitter.split_text(content)):
                chunks.append({'text': doc.page_content, 'metadata': {**base_meta, **doc.metadata, 'chunk': i}})
        else:
            for i, text in enumerate(self.text_splitter.split_text(content)):
                chunks.append({'text': text, 'metadata': {**base_meta, 'chunk': i}})
        return chunks

    def embed(self, text: str) -> List[float]:
        """Embed *text* via the local Ollama embeddings endpoint.

        Best-effort: on any request/parse failure a zero vector is returned
        (768 dims matches nomic-embed-text) so ingestion never crashes.
        """
        try:
            r = requests.post("http://localhost:11434/api/embeddings", json={"model": "nomic-embed-text", "prompt": text}, timeout=30)
            r.raise_for_status()  # surface HTTP errors instead of a KeyError below
            return r.json()["embedding"]
        except (requests.RequestException, KeyError, ValueError):
            # Was a bare `except:` — keep the zero-vector fallback but stop
            # swallowing SystemExit/KeyboardInterrupt.
            return [0.0] * 768

    def add(self, content: str, filename: Optional[str] = None, content_type: Optional[str] = None):
        """Chunk, embed, and persist *content* into the collection."""
        chunks = self.chunk(content, content_type, filename)
        if not chunks:
            return
        texts = [c['text'] for c in chunks]
        embeddings = [self.embed(t) for t in texts]
        # Hoisted: count() returns the same value for the whole batch but was
        # previously queried once per chunk inside the comprehension.
        base = self.collection.count()
        # NOTE(review): count-based ids collide if documents are ever deleted;
        # consider uuid4-based ids if deletion becomes possible.
        ids = [f"doc_{base + i}" for i in range(len(texts))]
        self.collection.add(documents=texts, embeddings=embeddings, metadatas=[c['metadata'] for c in chunks], ids=ids)
        print(f"✅ Added {len(chunks)} chunks ({content_type or 'auto'})")

    def add_code(self, code: str, language: str = 'python'):
        """Convenience wrapper: ingest a code snippet tagged as *language*."""
        self.add(code, filename=f"code.{language}", content_type=language)

    def query(self, q: str, n: int = 5):
        """Return the *n* nearest chunks to *q* with documents/metadata/distances."""
        return self.collection.query(query_embeddings=[self.embed(q)], n_results=n, include=['documents', 'metadatas', 'distances'])

class SAGECodeAnalyzer:
    """Scans code chunks for risky call patterns (eval/exec/shell/pickle/...)."""

    def __init__(self, rag: Optional["EdenRAG"] = None):
        """Wrap an existing RAG (or build a fresh one) for chunking support."""
        self.rag = rag or EdenRAG()
        # Kept as raw strings so callers can inspect/extend the list;
        # they are compiled per analyze() call.
        self.patterns = [r'eval\s*\(', r'exec\s*\(', r'os\.system', r'subprocess\.', r'pickle\.load', r'sql.*[\'"].*\+', r'password\s*=\s*[\'"]']

    def analyze(self, code: str, language: str = 'python'):
        """Chunk *code* and report every chunk matching a risk pattern.

        Returns {'chunks': <count>, 'findings': [...], 'risk': 'HIGH'|'LOW'}.
        """
        chunks = self.rag.chunk(code, content_type=language)
        # Hoisted: compile each regex once per call instead of re-matching
        # the raw string for every (chunk, pattern) pair.
        compiled = [re.compile(p, re.IGNORECASE) for p in self.patterns]
        findings = []
        for chunk in chunks:
            for rx in compiled:
                if rx.search(chunk['text']):
                    findings.append({'pattern': rx.pattern, 'chunk': chunk['metadata'].get('chunk'), 'preview': chunk['text'][:100]})
        return {'chunks': len(chunks), 'findings': findings, 'risk': 'HIGH' if findings else 'LOW'}

def run_service():
    """Smoke-test the full pipeline: ingest a known-bad snippet, query, analyze."""
    rag = EdenRAG()
    sample = "def bad(x):\n    os.system(x)\n    eval(x)"
    rag.add_code(sample, 'python')
    hits = rag.query("security vulnerability")
    print(hits)
    analyzer = SAGECodeAnalyzer(rag)
    report = analyzer.analyze(sample)
    print(report)

if __name__ == '__main__':
    # Daemon entry point: currently only announces itself and sleeps.
    # NOTE(review): run_service() is defined above but never invoked here —
    # this loop performs no ingestion despite the comment below. Confirm
    # whether ingestion was meant to be triggered each cycle.
    print('🛰️ RAG Daemon active and watching...')
    while True:
        # Ingest strikes/core and wait for next cycle
        time.sleep(60)
