import os
import pandas as pd
print(os.getcwd())
# /Users/negritta/Documents/ML-FAirness Lab /fairml-bestpractices-591
def process_datasets(data_columns, directory):
    """
    Process the CSV files in the specified directory, gathering basic
    metadata about each dataset and identifying sensitive features.

    Parameters
    ----------
    data_columns : list
        Column names used to store metadata about the datasets, e.g.
        ['Datasets', 'Features', 'Instances', 'Sensitive_Feature',
         'Categorical Columns', 'Numeric Columns'].
    directory : str
        The directory path containing the CSV files to process.

    Returns
    -------
    result_df : pd.DataFrame
        Metadata about each dataset in the specified directory, with columns:
        - 'Datasets': the name of the dataset (CSV file).
        - 'Features': the number of columns (features) in the dataset.
        - 'Instances': the number of rows (instances) in the dataset.
        - 'Sensitive_Feature': the sensitive features found in the dataset,
          or 'None' if no sensitive feature is found.
        - 'Categorical Columns': the number of categorical (object dtype) columns.
        - 'Numeric Columns': the number of numeric (int64 or float64) columns.

    Side effects
    ------------
    The DataFrame is also saved as a CSV file named
    'sensitive_features_summary.csv'.

    Notes
    -----
    If a dataset cannot be loaded, the error is printed and the file
    is skipped.
    """
    # Initialize a dictionary to store metadata about each dataset
    mult_dict = {col: [] for col in data_columns}
    # List all files in the given directory
    files = os.listdir(directory)
    # Keep only CSV files, ignoring surrounding spaces (.strip())
    # and case issues (.lower()) in the file names
    csv_files = [f for f in files if f.strip().lower().endswith('.csv')]
    # Loop through each CSV file in the directory
    for dataset in csv_files:
        data_path = os.path.join(directory, dataset)
        try:
            # Read the CSV file into a DataFrame (assumes comma-delimited files)
            df = pd.read_csv(data_path, low_memory=False)
        except Exception as e:
            # Report the error and skip the file, so the metadata lists
            # in mult_dict stay the same length
            print(f"Error loading {dataset}: {e}")
            continue
        # Dataset name without the file extension
        dataset_name = os.path.splitext(dataset)[0]
        mult_dict['Datasets'].append(dataset_name)
        # Number of rows and columns (instances and features)
        rows, cols = df.shape
        mult_dict['Features'].append(cols)
        mult_dict['Instances'].append(rows)
        # Count the categorical and numeric columns
        mult_dict['Categorical Columns'].append(len(df.select_dtypes(include=['object']).columns))
        mult_dict['Numeric Columns'].append(len(df.select_dtypes(include=['float64', 'int64']).columns))
        # Clean column names (strip spaces, convert to lowercase) so the
        # sensitive-feature check below is case-insensitive
        df.columns = df.columns.str.strip().str.lower()
        # Sensitive features to look for
        sensitive_feature_names = ['gender', 'age', 'relationship', 'race', 'sex', 'marriage']
        found_sensitive_features = []
        for feature in sensitive_feature_names:
            if feature in df.columns:
                # Standardize the feature values (strip spaces, lowercase)
                df[feature] = df[feature].astype(str).str.strip().str.lower()
                found_sensitive_features.append(feature)
        # Record the found sensitive features (or 'None' if none found)
        mult_dict['Sensitive_Feature'].append(
            ', '.join(found_sensitive_features) if found_sensitive_features else 'None')
    # Convert the dictionary to a pandas DataFrame
    result_df = pd.DataFrame(mult_dict)
    # Explode 'Sensitive_Feature' so each sensitive feature gets its own row;
    # split on ', ' to match the separator used when joining above
    result_df['Sensitive_Feature'] = result_df['Sensitive_Feature'].str.split(', ')
    result_df = result_df.explode('Sensitive_Feature')
    # Save the DataFrame to a CSV file
    result_df.to_csv('sensitive_features_summary.csv', index=False)
    return result_df

data_columns = ['Datasets', 'Features', 'Instances', 'Sensitive_Feature',
                'Categorical Columns', 'Numeric Columns']
result_df = process_datasets(data_columns, 'Datasets')
result_df
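# Quick usage sketch (illustrative only, not part of the pipeline): build a
# tiny throwaway CSV and run process_datasets on it. The 'tmp_datasets'
# directory and the toy column values below are made up for this example;
# note this rerun also rewrites 'sensitive_features_summary.csv'.
os.makedirs('tmp_datasets', exist_ok=True)
pd.DataFrame({
    'Age': [23, 45, 31],
    'Sex': ['M', 'F', 'F'],
    'income': [40000, 52000, 61000],
}).to_csv(os.path.join('tmp_datasets', 'toy.csv'), index=False)
toy_summary = process_datasets(data_columns, 'tmp_datasets')
# Expect one row per sensitive feature found ('age' and 'sex' here),
# because 'Sensitive_Feature' is exploded into one feature per row.
print(toy_summary)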
# Iterate through the CSV files in `datasets_folder`, check each dataset for
# potential sensitive features, and compute value counts for each one found.
# Ages are grouped into decade ranges (0-9, 10-19, ...). The per-dataset
# results are combined into a single DataFrame and saved as 'sensitive.csv'.
datasets_folder = 'Datasets'
potential_sensitive_features = ['gender', 'age', 'relationship', 'race', 'sex', 'marriage']
# Dictionary to store the processed DataFrames, keyed by filename
dataframes = {}
def process_sensitive_features(df_A, sensitive_features, dataset_name):
    """
    Compute value counts for each sensitive feature in a single dataset.

    For the 'age' feature, ages are grouped into decade ranges
    (0-9, 10-19, ..., 90-99) and occurrences are counted per range;
    the feature is then reported as 'age_group'. All other features
    are counted by their unique values.

    Parameters
    ----------
    df_A : pd.DataFrame
        The dataset DataFrame to process.
    sensitive_features : list
        Sensitive feature column names present in the dataset.
    dataset_name : str
        Name of the dataset, typically derived from the filename.

    Returns
    -------
    pd.DataFrame
        Value counts for every sensitive feature in the dataset, with
        columns ['Dataset', 'Sensitive_feature', 'Value', 'Count'].
        An empty DataFrame with those columns is returned if no
        sensitive features were passed in.
    """
    result = []
    for feature in sensitive_features:
        if feature == 'age':  # Special handling for age
            # Convert age to numeric, coercing any non-numeric values to NaN
            numeric_age = pd.to_numeric(df_A[feature], errors='coerce')
            # Decade labels (0-9, 10-19, ..., 90-99)
            labels = [f"{i}-{i+9}" for i in range(0, 100, 10)]
            # Bin the ages; right=False gives intervals like [0, 10),
            # so the bins cover ages 0 through 99
            age_groups = pd.cut(numeric_age,
                                bins=range(0, 101, 10),
                                right=False,
                                labels=labels)
            # Value counts for the age groups
            feature_counts = age_groups.value_counts().reset_index()
            feature_counts.columns = ['Value', 'Count']
        else:
            # Count the occurrences of each unique value in the
            # sensitive feature column
            feature_counts = df_A[feature].value_counts().reset_index()
            feature_counts.columns = ['Value', 'Count']
        # Add metadata about the sensitive feature and dataset
        feature_counts['Sensitive_feature'] = 'age_group' if feature == 'age' else feature
        feature_counts['Dataset'] = dataset_name
        result.append(feature_counts)
    # Return the concatenated results, or an empty DataFrame if no
    # sensitive features were found
    if result:
        return pd.concat(result, ignore_index=True)
    return pd.DataFrame(columns=['Dataset', 'Sensitive_feature', 'Value', 'Count'])
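# Small sketch of the age-binning step above, on made-up values: pd.cut with
# right=False assigns each age to a decade interval [i, i+10); non-numeric
# entries (coerced to NaN) and ages outside 0-99 fall into no bin.
ages = pd.to_numeric(pd.Series(['23', '45', 'unknown', '101', '7']), errors='coerce')
decades = pd.cut(ages,
                 bins=range(0, 101, 10),
                 right=False,
                 labels=[f"{i}-{i+9}" for i in range(0, 100, 10)])
print(decades.tolist())  # ['20-29', '40-49', nan, nan, '0-9']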
for filename in os.listdir(datasets_folder):
    if filename.endswith('.csv'):
        file_path = os.path.join(datasets_folder, filename)
        df_A = pd.read_csv(file_path, low_memory=False)
        # Extract the base name (without the file extension)
        dataset_name = os.path.splitext(filename)[0]
        # Clean column names, then look for sensitive features in this dataset
        df_A.columns = df_A.columns.str.strip().str.lower()
        sensitive_features = [col for col in df_A.columns
                              if col in potential_sensitive_features]
        # Process and aggregate the sensitive features
        sensitive_df = process_sensitive_features(df_A, sensitive_features, dataset_name)
        # Store the processed DataFrame in the dictionary
        dataframes[filename] = {'sensitive_data': sensitive_df}

# Combine the sensitive feature DataFrames from all datasets
Combined_sensitive_df = pd.concat(
    [entry['sensitive_data'] for entry in dataframes.values()])
# Reindex the combined DataFrame to ensure a consistent column order
Combined_sensitive_df = Combined_sensitive_df.reindex(
    columns=['Dataset', 'Sensitive_feature', 'Value', 'Count'])
# Save the combined DataFrame to a CSV file
Combined_sensitive_df.to_csv('sensitive.csv', index=False)
Combined_sensitive_df
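# Example follow-up: filter the combined table down to one dataset's
# sensitive-feature distribution, e.g. for plotting or a fairness audit.
# 'adult' is a hypothetical placeholder; substitute any name that actually
# appears in the 'Dataset' column.
adult_counts = Combined_sensitive_df[Combined_sensitive_df['Dataset'] == 'adult']
print(adult_counts.sort_values(['Sensitive_feature', 'Count'],
                               ascending=[True, False]))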