Source code for mlsim.bias.populations

import numpy as np
import pandas as pd
from aif360.datasets import  StructuredDataset
from .bias_components import Demographic, Target, Feature, FeatureNoise

# Module-level default parameter mapping. It carries only the 'dem' key (the
# same key Population reads its demographic sampler arguments from), set to
# None. It is not referenced by the definitions visible in this module.
default_params = dict(dem=None)

class Population:
    '''
    Object for describing a population so that sampling from the population
    and biased samples are possible from a sampler type and parameter
    dictionary
    '''

    def __init__(self, demographic_sampler=Demographic,
                 target_sampler=Target,
                 feature_sampler=Feature,
                 feature_noise_sampler=FeatureNoise,
                 parameter_dictionary=None):
        '''
        initialize a population based on the way to sample from it. a
        population object has properties that define the samplers for the
        demographic variables (A,Z) the observed target (Y) and the
        features (X)

        Parameters
        -----------
        demographic_sampler : Demographic
            a sampler class that inherits from mlsim.Demographic; it is
            instantiated here
        target_sampler : Target
            sampler class for the target, instantiated here
        feature_sampler : Feature
            sampler class for the features, instantiated here
        feature_noise_sampler : FeatureNoise
            sampler class for the feature noise, instantiated here
        parameter_dictionary : dictionary or None
            optional constructor arguments for each sampler, under the keys
            'dem', 'target', 'feat' and 'featnoise'. Each value is unpacked
            positionally into the matching sampler class. A missing key, or
            an empty/None value, builds that sampler with no arguments.
            The dictionary is only read, never modified.
        '''
        # A None default replaces the shared mutable ``{}`` default, and
        # ``.get`` replaces writing the missing keys back into the mapping,
        # so neither the default nor the caller's dictionary is mutated.
        if parameter_dictionary is None:
            parameter_dictionary = {}

        # initialize objects for each, with parameters if provided
        self.demographic_sampler = self._build_sampler(
            demographic_sampler, parameter_dictionary.get('dem'))
        self.target_sampler = self._build_sampler(
            target_sampler, parameter_dictionary.get('target'))
        self.feature_sampler = self._build_sampler(
            feature_sampler, parameter_dictionary.get('feat'))
        self.feature_noise_sampler = self._build_sampler(
            feature_noise_sampler, parameter_dictionary.get('featnoise'))

    @staticmethod
    def _build_sampler(sampler_class, params):
        '''
        instantiate sampler_class, unpacking params positionally when they
        are provided and calling it with no arguments otherwise
        '''
        if params:
            return sampler_class(*params)
        return sampler_class()

    def _sample_target_and_features(self, a, z):
        '''
        sample the target and the noisy features for already sampled
        demographic variables; returns the pair (y, x)
        '''
        y = self.target_sampler.sample(a, z)
        x = self.feature_sampler.sample(a, z, y)
        # the noise sampler receives the clean features and its output
        # replaces them
        x = self.feature_noise_sampler.sample(a, z, y, x)
        return y, x

    def sample(self, N, return_as='DataFrame'):
        '''
        sample N members of the population, according to its underlying
        distribution

        Parameters
        -----------
        N : int
            number of samples
        return_as : string, 'DataFrame'
            type to return as, can be pandas 'DataFrame' or IBM AIF360
            'structuredDataset'

        Returns
        --------
        the sampled data, as a pandas DataFrame or an
        aif360.datasets.StructuredDataset depending on return_as

        Raises
        -------
        ValueError
            if return_as is not one of the two supported values
        '''
        # Validate before sampling: previously an unsupported value only
        # surfaced after all the sampling work, as an UnboundLocalError.
        if return_as not in ('DataFrame', 'structuredDataset'):
            raise ValueError(
                "return_as must be 'DataFrame' or 'structuredDataset', "
                "got {!r}".format(return_as))

        a, z = self.demographic_sampler.sample(N)
        y, x = self._sample_target_and_features(a, z)

        if return_as == 'DataFrame':
            return self.make_DataFrame(a, z, y, x)
        return self.make_StructuredDataset(a, z, y, x)

    def sample_unfavorable_outcomes(self, N, rho_z_scale):
        '''
        sample so that the disadvantaged group (a=1) gets the favorable
        outcome (y=1) less often based on the rho_z_scale

        Parameters
        -----------
        N : int
            number of samples
        rho_z_scale : number
            factor multiplied into the second entry of the demographic
            sampler's rho_z; the first entry is left unchanged

        Returns
        --------
        pandas DataFrame of the sampled a, z, y and features
        '''
        # get original demographic parameters
        rho_z0 = self.demographic_sampler.get_rho_z()
        rho_a = self.demographic_sampler.get_rho_a()

        # scale rho_z (only its second entry); rho_a is reused unchanged
        rho_z = [rho_z0[0], rho_z0[1]*rho_z_scale]

        # sample the demographic vars with the new sampler
        # NOTE(review): nothing in the visible source defines a
        # ``DemographicCorrelated`` attribute on Population, so this line
        # raises AttributeError unless a subclass or caller provides it.
        # Presumably a sampler class from bias_components was intended --
        # confirm and import it.
        self.unfavorable_dem = self.DemographicCorrelated(rho_a, rho_z)
        a, z = self.unfavorable_dem.sample(N)

        # sample the rest as usual
        y, x = self._sample_target_and_features(a, z)

        return self.make_DataFrame(a, z, y, x)

    def make_DataFrame(self, a, z, y, x):
        '''
        combine into data frame with labels

        Parameters
        ----------
        a, z, y : list or array
            one value per sample; they are stacked and become the columns
            'a', 'z' and 'y'
        x : array
            two dimensional (its ``shape`` is unpacked into two values),
            one row per sample; its columns become 'x0', 'x1', ...

        Returns
        --------
        pandas DataFrame with columns 'a', 'z', 'y', 'x0', ..., 'x<D-1>'
        '''
        # stack a, z, y as columns, then append the feature columns
        azy = np.vstack([a, z, y]).T
        data = np.concatenate([azy, x], axis=1)

        _, D = x.shape
        labels = ['a', 'z', 'y'] + ['x' + str(i) for i in range(D)]

        return pd.DataFrame(data=data, columns=labels)

    def make_StructuredDataset(self, a, z, y, x):
        '''
        Combines the sampled variables into a dataframe (see make_DataFrame)
        and wraps it in a dataset usable in IBM 360 package

        Parameters
        -----------
        a, z, y, x
            sampled variables, passed unchanged to make_DataFrame

        Returns
        --------
        aif360.datasets.StructuredDataset containing the data with y as the
        target and a as protected attribute.
        '''
        df = self.make_DataFrame(a, z, y, x)
        return StructuredDataset(df, ['y'], ['a'])

    def get_parameter_description(self):
        '''
        Build a string output that describes this object

        Returns
        --------
        description : string
            values of each parameter value grouped by sampler
        '''
        # each header is followed by the string form of that sampler's
        # ``params`` attribute
        sections = [
            ('Demographic Parameters\n', self.demographic_sampler),
            ('\nTarget Parameters \n', self.target_sampler),
            ('\nFeature Parameters \n', self.feature_sampler),
            ('\nFeature Noise Parameters \n', self.feature_noise_sampler),
        ]
        return ''.join(header + str(sampler.params)
                       for header, sampler in sections)
class PopulationInstantiated(Population):
    '''
    To instantiate with either default parameters or instantiated sampler
    objects
    '''

    def __init__(self, demographic_sampler=None,
                 target_sampler=None,
                 feature_sampler=None,
                 feature_noise_sampler=None):
        '''
        initialize a population from already constructed sampler objects

        Parameters
        -----------
        demographic_sampler : Demographic instance or None
            sampler object for the demographic variables; None (default)
            creates a new ``Demographic()``
        target_sampler : Target instance or None
            sampler object for the target; None (default) creates a new
            ``Target()``
        feature_sampler : Feature instance or None
            sampler object for the features; None (default) creates a new
            ``Feature()``
        feature_noise_sampler : FeatureNoise instance or None
            sampler object for the feature noise; None (default) creates a
            new ``FeatureNoise()``
        '''
        # Defaults are built per call. Instances written directly in the
        # signature are created once at definition time and would then be
        # shared by every population constructed without arguments.
        if demographic_sampler is None:
            demographic_sampler = Demographic()
        if target_sampler is None:
            target_sampler = Target()
        if feature_sampler is None:
            feature_sampler = Feature()
        if feature_noise_sampler is None:
            feature_noise_sampler = FeatureNoise()

        # The samplers are stored directly; Population.__init__ is not
        # called because it expects sampler classes rather than instances.
        self.demographic_sampler = demographic_sampler
        self.target_sampler = target_sampler
        self.feature_sampler = feature_sampler
        self.feature_noise_sampler = feature_noise_sampler