"""
KnowledgeExpansionAgent
Generated by Eden via recursive self-improvement
2025-11-01 08:23:20.188105
"""

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

class KnowledgeExpansionAgent:
    """
    Extracts the highest-weighted TF-IDF terms from a growing corpus of text documents.

    Attributes:
        document_texts (list): Every document passed to process_documents so far, in call order.
        tfidf_vectorizer: The TfidfVectorizer instance that is refit on the whole corpus on each call.
        extracted_keywords (dict): Maps a document's position in document_texts to a dict with the
            keys 'document' (the text), 'keywords' (list of terms) and 'scores' (list of floats).

    Methods:
        process_documents(self, document_texts, top_n=5):
            Adds documents to the corpus and recomputes the top keywords of every document.

        display_extracted_keywords(self):
            Prints each document followed by its keywords and their scores.
    """

    def __init__(self):
        self.document_texts = []
        self.tfidf_vectorizer = TfidfVectorizer()
        self.extracted_keywords = {}

    def process_documents(self, document_texts, top_n=5):
        """
        Adds documents to the corpus and recomputes the top TF-IDF keywords of every document.

        The vectorizer is refit on the full accumulated corpus, so the scores of previously
        processed documents change as well; extracted_keywords is therefore rebuilt for all
        documents, keyed by each document's position in document_texts.

        Parameters:
            document_texts (iterable of str | str): New documents to add. A single string is
                treated as one document.
            top_n (int): Maximum number of keywords kept per document. Defaults to 5.

        Raises:
            ValueError: If top_n is negative.
        """
        if top_n < 0:
            raise ValueError("top_n must be non-negative")

        # list.extend on a bare string would add one "document" per character.
        if isinstance(document_texts, str):
            document_texts = [document_texts]

        self.document_texts.extend(document_texts)
        if not self.document_texts:
            # Nothing to vectorize yet; leave the (empty) results untouched.
            return

        tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.document_texts)
        feature_names = self.tfidf_vectorizer.get_feature_names_out()

        refreshed = {}
        for index, text in enumerate(self.document_texts):
            # Densify one row at a time: sparse rows have no argsort, and a flat
            # array lets the ranking run over terms rather than over matrix rows.
            row_scores = tfidf_matrix[index].toarray().ravel()

            # Negating gives descending order; the stable sort breaks ties by
            # feature index so the result is deterministic.
            ranked = (-row_scores).argsort(kind="stable")[:top_n]

            # A zero weight means the term does not occur in this document.
            ranked = [i for i in ranked if row_scores[i] > 0]

            refreshed[index] = {
                'document': text,
                'keywords': [str(feature_names[i]) for i in ranked],
                'scores': [float(row_scores[i]) for i in ranked],
            }

        self.extracted_keywords = refreshed

    def display_extracted_keywords(self):
        """
        Prints each processed document followed by its keywords and their scores.

        Scores are printed with four decimal places, and a blank separator is printed
        after each document.
        """
        for entry in self.extracted_keywords.values():
            print(f"Document: {entry['document']}")
            ranked_pairs = zip(entry['keywords'], entry['scores'])
            for rank, (keyword, score) in enumerate(ranked_pairs, start=1):
                print(f"Keyword #{rank}: {keyword} - Score: {score:.4f}")
            print("\n")

# Example usage
if __name__ == "__main__":
    # Two short demonstration texts, one per topic.
    quantum_text = "Quantum computing has the potential to revolutionize technology by solving complex problems more efficiently."
    ai_text = "Artificial intelligence is rapidly advancing, with new applications emerging in healthcare and finance."
    demo_corpus = [quantum_text, ai_text]

    # Build the agent, feed it the demo corpus, then print what it extracted.
    demo_agent = KnowledgeExpansionAgent()
    demo_agent.process_documents(demo_corpus)
    demo_agent.display_extracted_keywords()