"""
DataFrameDuplicateHandler
Generated by Eden via recursive self-improvement
2025-10-28 08:53:32.239947
"""

class DataFrameDuplicateHandler:
    """
    A class to handle duplicate rows in a pandas DataFrame.

    This class provides methods to detect duplicates, retrieve duplicate rows,
    remove duplicates, and summarize the results.

    Attributes:
        df: pandas DataFrame containing the data to be processed.
        columns_to_check: List of column names to use for detecting duplicates.
        drop_index: Boolean; when True the index is omitted from saved CSV output.
    """

    def __init__(self, dataframe, columns=None):
        """
        Initialize the DataFrameDuplicateHandler with a DataFrame and optional columns.

        Args:
            dataframe: pandas DataFrame containing the data.
            columns: Optional list of column names to check for duplicates.
                     If None, all columns are used.
        """
        self.df = dataframe
        self.columns_to_check = columns if columns is not None else dataframe.columns.tolist()
        self.drop_index = False

    def find_duplicates(self):
        """
        Detect duplicate rows based on the specified columns.

        The first occurrence of each duplicated key is NOT flagged; only the
        subsequent repeats are marked True.

        Returns:
            Boolean Series indicating duplicates (True for duplicates).
        """
        return self.df.duplicated(subset=self.columns_to_check, keep='first')

    def get_duplicate_rows(self):
        """
        Retrieve all rows that are duplicates of an earlier row.

        Returns:
            DataFrame containing only duplicate rows, sorted by the
            columns used for duplicate detection.
        """
        duplicates_mask = self.find_duplicates()
        return self.df[duplicates_mask].sort_values(by=self.columns_to_check)

    def remove_duplicates(self, keep='first'):
        """
        Remove duplicate rows from the DataFrame.

        Args:
            keep: How to handle duplicates, as accepted by pandas'
                  ``drop_duplicates``: 'first' keeps the first occurrence,
                  'last' keeps the last occurrence, and False drops every
                  row that has any duplicate.

        Returns:
            DataFrame with duplicates removed (the stored DataFrame itself
            is not modified).
        """
        # Record that cleaned output should be saved without the index
        # (see save_clean_data).
        self.drop_index = True
        return self.df.drop_duplicates(subset=self.columns_to_check, keep=keep)

    def summary(self):
        """
        Get a summary of duplicate counts and unique rows.

        Returns:
            Dictionary with 'total_rows', 'unique_rows' (first occurrences
            included), and 'duplicates_count' (rows flagged by
            find_duplicates()).
        """
        total_rows = len(self.df)
        # BUG FIX: the original computed len(df[mask] == False), which is
        # just the masked row count and ignored columns_to_check entirely.
        # Count repeats consistently with find_duplicates() instead.
        duplicates_count = int(self.find_duplicates().sum())
        unique_rows = total_rows - duplicates_count
        return {
            'total_rows': total_rows,
            'unique_rows': unique_rows,
            'duplicates_count': duplicates_count
        }

    def save_clean_data(self, filename):
        """
        Save the cleaned DataFrame (duplicates removed) to a CSV file.

        Args:
            filename: Name of the file to save.
        """
        clean_df = self.remove_duplicates()
        # BUG FIX: drop_index=True means "omit the index"; the original
        # passed the flag straight through and wrote the index instead.
        clean_df.to_csv(filename, index=not self.drop_index)
import pandas as pd


def main():
    """Demonstrate DataFrameDuplicateHandler on a small sample DataFrame."""
    # Sample data: the (A, B) pairs at positional rows 1 and 3 are duplicates.
    data = {
        'A': [1, 2, 3, 2, 4],
        'B': ['a', 'b', 'a', 'b', 'c'],
        'C': [5, 6, 5, 7, 8]
    }
    df = pd.DataFrame(data)

    # Initialize the handler, checking only columns A and B for duplicates.
    handler = DataFrameDuplicateHandler(df, columns=['A', 'B'])

    # Find duplicates.
    duplicates_mask = handler.find_duplicates()
    print("Duplicate mask:\n", duplicates_mask)

    # Get duplicate rows.
    duplicate_rows = handler.get_duplicate_rows()
    print("\nDuplicate rows:\n", duplicate_rows)

    # Remove duplicates and get cleaned data.
    cleaned_df = handler.remove_duplicates(keep='last')
    print("\nCleaned DataFrame:\n", cleaned_df)

    # Summary statistics.
    summary = handler.summary()
    print("\nSummary:", summary)

    # Save cleaned data to CSV.
    handler.save_clean_data('cleaned_data.csv')


# Guard the demo so importing this module does not print or write files.
if __name__ == "__main__":
    main()