import os
import pandas as pd
print(os.getcwd())
# /Users/negritta/Documents/ML-FAirness Lab /fairml-bestpractices-591
def process_datasets(data_columns, directory):
    """
    Process the CSV files in the specified directory, gathering basic
    metadata about each dataset and identifying sensitive features.

    Parameters
    ----------
    data_columns : list
        Column names used to store metadata about the datasets, e.g.
        ['Datasets', 'Features', 'Instances', 'Sensitive_Feature',
         'Categorical Columns', 'Numeric Columns'].
    directory : str
        The directory path containing the CSV files to process.

    Returns
    -------
    result_df : pd.DataFrame
        Metadata about each dataset in the specified directory, with columns:
        - 'Datasets': the name of the dataset (CSV file).
        - 'Features': the number of columns (features) in the dataset.
        - 'Instances': the number of rows (instances) in the dataset.
        - 'Sensitive_Feature': the sensitive features found in the dataset,
          or 'None' if no sensitive feature is found.
        - 'Categorical Columns': the number of categorical (object dtype) columns.
        - 'Numeric Columns': the number of numeric (int64 or float64) columns.

    Side effects
    ------------
    The DataFrame is also saved as a CSV file named
    'sensitive_features_summary.csv'.

    Notes
    -----
    If a dataset cannot be loaded, the error is printed and the file
    is skipped.
    """
    # Initialize a dictionary to store metadata about each dataset
    mult_dict = {col: [] for col in data_columns}
    # List all files in the given directory
    files = os.listdir(directory)
    # Keep only CSV files, ignoring surrounding spaces (.strip())
    # and case issues (.lower()) in the file names
    csv_files = [f for f in files if f.strip().lower().endswith('.csv')]
    # Loop through each CSV file in the directory
    for dataset in csv_files:
        data_path = os.path.join(directory, dataset)
        try:
            # Read the CSV file into a DataFrame (assumes comma-delimited files)
            df = pd.read_csv(data_path, low_memory=False)
        except Exception as e:
            # Report the error and skip the file, so the metadata lists
            # in mult_dict stay the same length
            print(f"Error loading {dataset}: {e}")
            continue
        # Dataset name without the file extension
        dataset_name = os.path.splitext(dataset)[0]
        mult_dict['Datasets'].append(dataset_name)
        # Number of rows and columns (instances and features)
        rows, cols = df.shape
        mult_dict['Features'].append(cols)
        mult_dict['Instances'].append(rows)
        # Count the categorical and numeric columns
        mult_dict['Categorical Columns'].append(len(df.select_dtypes(include=['object']).columns))
        mult_dict['Numeric Columns'].append(len(df.select_dtypes(include=['float64', 'int64']).columns))
        # Clean column names (strip spaces, convert to lowercase) so the
        # sensitive-feature check below is case-insensitive
        df.columns = df.columns.str.strip().str.lower()
        # Sensitive features to look for
        sensitive_feature_names = ['gender', 'age', 'relationship', 'race', 'sex', 'marriage']
        found_sensitive_features = []
        for feature in sensitive_feature_names:
            if feature in df.columns:
                # Standardize the feature values (strip spaces, lowercase)
                df[feature] = df[feature].astype(str).str.strip().str.lower()
                found_sensitive_features.append(feature)
        # Record the found sensitive features (or 'None' if none found)
        mult_dict['Sensitive_Feature'].append(
            ', '.join(found_sensitive_features) if found_sensitive_features else 'None')
    # Convert the dictionary to a pandas DataFrame
    result_df = pd.DataFrame(mult_dict)
    # Explode 'Sensitive_Feature' so each sensitive feature gets its own row;
    # split on ', ' to match the separator used when joining above
    result_df['Sensitive_Feature'] = result_df['Sensitive_Feature'].str.split(', ')
    result_df = result_df.explode('Sensitive_Feature')
    # Save the DataFrame to a CSV file
    result_df.to_csv('sensitive_features_summary.csv', index=False)
    return result_df

data_columns = ['Datasets', 'Features', 'Instances', 'Sensitive_Feature',
                'Categorical Columns', 'Numeric Columns']
result_df = process_datasets(data_columns, 'Datasets')
result_df
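# Quick usage sketch (illustrative only, not part of the pipeline): build a
# tiny throwaway CSV and run process_datasets on it. The 'tmp_datasets'
# directory and the toy column values below are made up for this example;
# note this rerun also rewrites 'sensitive_features_summary.csv'.
os.makedirs('tmp_datasets', exist_ok=True)
pd.DataFrame({
    'Age': [23, 45, 31],
    'Sex': ['M', 'F', 'F'],
    'income': [40000, 52000, 61000],
}).to_csv(os.path.join('tmp_datasets', 'toy.csv'), index=False)
toy_summary = process_datasets(data_columns, 'tmp_datasets')
# Expect one row per sensitive feature found ('age' and 'sex' here),
# because 'Sensitive_Feature' is exploded into one feature per row.
print(toy_summary)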
# Iterate through the CSV files in `datasets_folder`, check each dataset for
# potential sensitive features, and compute value counts for each one found.
# Ages are grouped into decade ranges (0-9, 10-19, ...). The per-dataset
# results are combined into a single DataFrame and saved as 'sensitive.csv'.
datasets_folder = 'Datasets'
potential_sensitive_features = ['gender', 'age', 'relationship', 'race', 'sex', 'marriage']
# Dictionary to store the processed DataFrames, keyed by filename
dataframes = {}
def process_sensitive_features(df_A, sensitive_features, dataset_name):
    """
    Compute value counts for each sensitive feature in a single dataset.

    For the 'age' feature, ages are grouped into decade ranges
    (0-9, 10-19, ..., 90-99) and occurrences are counted per range;
    the feature is then reported as 'age_group'. All other features
    are counted by their unique values.

    Parameters
    ----------
    df_A : pd.DataFrame
        The dataset DataFrame to process.
    sensitive_features : list
        Sensitive feature column names present in the dataset.
    dataset_name : str
        Name of the dataset, typically derived from the filename.

    Returns
    -------
    pd.DataFrame
        Value counts for every sensitive feature in the dataset, with
        columns ['Dataset', 'Sensitive_feature', 'Value', 'Count'].
        An empty DataFrame with those columns is returned if no
        sensitive features were passed in.
    """
    result = []
    for feature in sensitive_features:
        if feature == 'age':  # Special handling for age
            # Convert age to numeric, coercing any non-numeric values to NaN
            numeric_age = pd.to_numeric(df_A[feature], errors='coerce')
            # Decade labels (0-9, 10-19, ..., 90-99)
            labels = [f"{i}-{i+9}" for i in range(0, 100, 10)]
            # Bin the ages; right=False gives intervals like [0, 10),
            # so the bins cover ages 0 through 99
            age_groups = pd.cut(numeric_age,
                                bins=range(0, 101, 10),
                                right=False,
                                labels=labels)
            # Value counts for the age groups
            feature_counts = age_groups.value_counts().reset_index()
            feature_counts.columns = ['Value', 'Count']
        else:
            # Count the occurrences of each unique value in the
            # sensitive feature column
            feature_counts = df_A[feature].value_counts().reset_index()
            feature_counts.columns = ['Value', 'Count']
        # Add metadata about the sensitive feature and dataset
        feature_counts['Sensitive_feature'] = 'age_group' if feature == 'age' else feature
        feature_counts['Dataset'] = dataset_name
        result.append(feature_counts)
    # Return the concatenated results, or an empty DataFrame if no
    # sensitive features were found
    if result:
        return pd.concat(result, ignore_index=True)
    return pd.DataFrame(columns=['Dataset', 'Sensitive_feature', 'Value', 'Count'])
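# Small sketch of the age-binning step above, on made-up values: pd.cut with
# right=False assigns each age to a decade interval [i, i+10); non-numeric
# entries (coerced to NaN) and ages outside 0-99 fall into no bin.
ages = pd.to_numeric(pd.Series(['23', '45', 'unknown', '101', '7']), errors='coerce')
decades = pd.cut(ages,
                 bins=range(0, 101, 10),
                 right=False,
                 labels=[f"{i}-{i+9}" for i in range(0, 100, 10)])
print(decades.tolist())  # ['20-29', '40-49', nan, nan, '0-9']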
for filename in os.listdir(datasets_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(datasets_folder, filename)
        df_A = pd.read_csv(file_path, low_memory=False)
        # Extract the base name (without the file extension)
        dataset_name = os.path.splitext(filename)[0]
        # Clean column names, then look for sensitive features in this dataset
        df_A.columns = df_A.columns.str.strip().str.lower()
        sensitive_features = [col for col in df_A.columns
                              if col in potential_sensitive_features]
        # Process and aggregate the sensitive features
        sensitive_df = process_sensitive_features(df_A, sensitive_features, dataset_name)
        # Store the processed DataFrame in the dictionary
        dataframes[filename] = {'sensitive_data': sensitive_df}

# Combine the sensitive feature DataFrames from all datasets
Combined_sensitive_df = pd.concat(
    [entry['sensitive_data'] for entry in dataframes.values()])
# Reindex the combined DataFrame to ensure a consistent column order
Combined_sensitive_df = Combined_sensitive_df.reindex(
    columns=['Dataset', 'Sensitive_feature', 'Value', 'Count'])
# Save the combined DataFrame to a CSV file
Combined_sensitive_df.to_csv('sensitive.csv', index=False)
Combined_sensitive_df
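# Example follow-up: filter the combined table down to one dataset's
# sensitive-feature distribution, e.g. for plotting or a fairness audit.
# 'adult' is a hypothetical placeholder; substitute any name that actually
# appears in the 'Dataset' column.
adult_counts = Combined_sensitive_df[Combined_sensitive_df['Dataset'] == 'adult']
print(adult_counts.sort_values(['Sensitive_feature', 'Count'],
                               ascending=[True, False]))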