"""
EnhancedDataCleaning
Generated by Eden via recursive self-improvement
2025-11-01 06:46:52.285953
"""

import pandas as pd
from sklearn.utils import resample

def detect_anomalies(df):
    """
    Detects missing values, duplicates, and outliers in a DataFrame.

    Outliers are identified with the 1.5 * IQR rule: a value is an outlier
    when it lies strictly below Q1 - 1.5 * IQR or strictly above
    Q3 + 1.5 * IQR. Only numeric columns are analysed, because quantiles
    are undefined for text or other non-numeric data.

    Parameters:
        df (pd.DataFrame): The input DataFrame to analyze.

    Returns:
        dict: A dictionary containing information about detected anomalies:
            'missing_values' - Series with the number of nulls per column.
            'duplicates'     - number of fully duplicated rows.
            'outliers'       - dict mapping each numeric column to a tuple
                               (total_outliers, below_lower, above_upper).
    """
    anomalies = {
        'missing_values': df.isnull().sum(),
        'duplicates': df.duplicated().sum(),
        'outliers': {},
    }

    # Detecting outliers (numeric columns only; 'number' excludes booleans)
    for col in df.select_dtypes(include='number').columns:
        values = df[col]
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)

        # Strict comparisons keep the three counts consistent with each
        # other; with <= / >= a constant column (IQR == 0) would report
        # every row as both "below" and "above".
        below = int((values < lower_bound).sum())
        above = int((values > upper_bound).sum())

        anomalies['outliers'][col] = (below + above, below, above)

    return anomalies

def _iqr_bounds(values):
    """
    Returns the (lower, upper) outlier fences of a numeric Series.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
    """
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    return q1 - (1.5 * iqr), q3 + (1.5 * iqr)


def clean_data(df):
    """
    Cleans the DataFrame by handling missing values, duplicates, and outliers.

    Steps, in order:
        1. Rows containing any missing value are dropped.
        2. Fully duplicated rows are dropped (the first occurrence is kept).
        3. For every numeric column, rows whose value lies strictly outside
           the 1.5 * IQR fences are dropped. The fences for all columns are
           computed on the frame produced by steps 1-2, so removing outliers
           of one column does not shift the fences of another.

    Outlier counts for each numeric column are printed as a side effect.

    Parameters:
        df (pd.DataFrame): The input DataFrame to clean. It is not modified.

    Returns:
        pd.DataFrame: A cleaned version of the input DataFrame.
    """
    # Handling missing values
    if df.isnull().values.any():
        df = df.dropna()
    else:
        print("No missing values found.")

    # Removing duplicates
    df = df.drop_duplicates()

    # Handling outliers: accumulate one row mask across all numeric columns
    is_outlier = pd.Series(False, index=df.index)
    for col in df.select_dtypes(include='number').columns:
        values = df[col]
        lower_bound, upper_bound = _iqr_bounds(values)
        too_low = values < lower_bound
        too_high = values > upper_bound
        below_count = int(too_low.sum())
        above_count = int(too_high.sum())

        print(f"Total Outliers in {col}: {below_count + above_count}")
        print(f"Below Lower Bound: {below_count} | Above Upper Bound: {above_count}")

        # Filter on the value fences themselves, never on the counts.
        is_outlier |= too_low | too_high

    return df[~is_outlier]

def balance_data(df, target_column):
    """
    Balances a binary dataset by down-sampling the larger class.

    Rows are split into class 0 and class 1 according to `target_column`.
    Whichever class has more rows is randomly sampled, without replacement,
    down to the size of the smaller class, so no row is ever duplicated.
    Rows whose target is neither 0 nor 1 are not included in the result.
    If either class is empty the result is empty.

    Sampling uses a fixed seed (42) so repeated calls give the same rows.

    Parameters:
        df (pd.DataFrame): The input DataFrame to balance.
        target_column (str): The name of the column that contains the target variable.

    Returns:
        pd.DataFrame: A balanced version of the input DataFrame, with the
        class-0 rows first followed by the class-1 rows.
    """
    # Separating the two classes
    class_zero = df[df[target_column] == 0]
    class_one = df[df[target_column] == 1]

    n_keep = min(len(class_zero), len(class_one))

    # Only the larger class needs sampling; the smaller one is kept as is.
    if len(class_zero) > n_keep:
        class_zero = class_zero.sample(n=n_keep, replace=False, random_state=42)
    if len(class_one) > n_keep:
        class_one = class_one.sample(n=n_keep, replace=False, random_state=42)

    return pd.concat([class_zero, class_one])

def main():
    """
    Example usage: runs detection, cleaning, and balancing on a small frame.

    The sample data contains one missing value, one duplicated row, a text
    column, and a binary 'target' column holding the class labels.
    """
    data = {'A': [1, 2, 3, None, 5, 5, 4],
            'B': ['a', 'b', 'c', 'd', 'e', 'e', 'f'],
            'C': [7.0, 8.0, 9.0, 10.0, 11.0, 11.0, 10.0],
            'target': [0, 0, 0, 1, 1, 1, 1]}
    df = pd.DataFrame(data)

    # Detect anomalies
    anomalies = detect_anomalies(df)
    print(anomalies)

    # Clean data
    cleaned_df = clean_data(df)
    print(cleaned_df)

    # Balance dataset on the binary 'target' column
    balanced_df = balance_data(cleaned_df, 'target')
    print(balanced_df)


# Run the demo only when executed as a script, so importing this module
# has no side effects.
if __name__ == "__main__":
    main()