Source code for mlsim.bias.populations

import numpy as np
import pandas as pd
from aif360.datasets import  StructuredDataset
from .bias_components import Demographic, Target, Feature, FeatureNoise

default_params = {'dem':None,}


[docs]
class Population():
    '''
    Object for describing a population so that sampling from the population
    and biased samples are possible from a sampler type and parameter dictionary
    '''
    def __init__(self, demographic_sampler= Demographic,
                target_sampler = Target,
                feature_sampler = Feature,
                feature_noise_sampler = FeatureNoise, parameter_dictionary = {}):
        '''
        initialize a population based on the way to sample from it. a population
        object has properties that define the samplers for the demographic
        variables (A,Z) the observed target (Y) and the features (X)

        Parameters:
        -----------
        demographic_sampler : Demographic
            a sampler that inherits from mlsim.Demographic
        target_sampler : Target,
        feature_sampler : Feature,
        feature_noise_sampler : FeatureNoise,
        parameter_dictionary : dictionary default empty
        '''


        required_keys = ['dem','target','feat','featnoise']

        #ensure required keys are set
        for key in required_keys:
            if not(key in parameter_dictionary.keys()):
                parameter_dictionary[key]= None

        dem_params = parameter_dictionary['dem']
        target_params = parameter_dictionary['target']
        feat_params = parameter_dictionary['feat']
        featnoise_params = parameter_dictionary['featnoise']

        # initialize objects for each, with parameters if provided
        if dem_params:
            self.demographic_sampler = demographic_sampler(*dem_params)
        else:
            self.demographic_sampler = demographic_sampler()

        if target_params:
            self.target_sampler = target_sampler(*target_params)
        else:
            self.target_sampler = target_sampler()

        if feat_params:
            self.feature_sampler = feature_sampler(*feat_params)
        else:
            self.feature_sampler = feature_sampler()

        if featnoise_params:
            self.feature_noise_sampler = feature_noise_sampler(*featnoise_params)
        else:
            self.feature_noise_sampler = feature_noise_sampler()



[docs]
    def sample(self, N,return_as = 'DataFrame'):
        '''
        sample N members of the  population, according to its underlying
        distribution

        Parameters
        -----------
        N : int
            number of samples
        return_as : string, 'dataframe'
            type to return as, can be pandas 'DataFrame' or IBM AIF360
            'structuredDataset'
        '''
        a,z = self.demographic_sampler.sample(N)
        y = self.target_sampler.sample(a,z)
        x = self.feature_sampler.sample(a,z,y)
        x = self.feature_noise_sampler.sample(a,z,y,x)

        if return_as == 'DataFrame':
            df = self.make_DataFrame(a,z,y,x)

        elif return_as == 'structuredDataset':
            df = self.make_StructuredDataset(a,z,y,x)


        return df



[docs]
    def sample_unfavorable_outcomes(self,N,rho_z_scale):
        '''
        sample so that the disadvantaged group (a=1) gets the favorable
        outcome (y=1) less often based on the rho_z_scale
        '''
        # get original demographic parameters
        rho_z0 = self.demographic_sampler.get_rho_z()
        rho_a = self.demographic_sampler.get_rho_a()
        # scale rho_a
        rho_z = [rho_z0[0],rho_z0[1]*rho_z_scale]
        # sameple the demongraphic vars with the new sampler
        self.unfavorable_dem = self.DemographicCorrelated(rho_a,rho_z)
        a,z = self.unfavorable_dem.sample(N)

        # sample the rest as usual
        y = self.target_sampler.sample(a,z)
        x = self.feature_sampler.sample(a,z,y)
        x = self.feature_noise_sampler.sample(a,z,y,x)

        return self.make_DataFrame(a,z,y,x)



[docs]
    def make_DataFrame(self,a,z,y,x):
        '''
        combine into data frame with labels

        Parameters
        ----------
        a : list
        '''
        # concatenate the data and p
        azy = np.vstack([a,z,y]).T
        data = np.concatenate([azy,x],axis=1)
        labels =['a','z','y']
        _,D = x.shape
        labels.extend(['x'+str(i) for i in range(D)])

        return pd.DataFrame(data=data, columns = labels)



[docs]
    def make_StructuredDataset(self,a,z,y,x):
        '''
        Converts a dataframe created by one of the above functions into a dataset usable in IBM 360 package

        Parameters
        -----------
        df : pandas dataframe
        label_names : optional, a list of strings describing each label
        protected_attribute_names : optional, a list of strings describing
        features corresponding to      protected attributes

        Returns
        --------
        aif360.datasets.StructuredDataset containing the data with y as the target and a as protected attribute. 

        '''
        df = self.make_DataFrame(a,z,y,x)
        return StructuredDataset(df, ['y'], ['a'])



[docs]
    def get_parameter_description(self):
        '''
        Build a string output that describes this object

        Returns
        --------
        description : string
            values of each parameter value grouped by sampler
        '''
        description = ''


        description += 'Demographic Parameters\n'
        description += self.demographic_sampler.params.__str__()
        description += '\nTarget Parameters \n'
        description += self.target_sampler.params.__str__()
        description += '\nFeature Parameters \n'
        description += self.feature_sampler.params.__str__()
        description += '\nFeature Noise Parameters \n'
        description += self.feature_noise_sampler.params.__str__()

        return description





[docs]
class PopulationInstantiated(Population):
    '''
    To instantiate with either default parameters or instantiated sampler objects
    '''
    def __init__(self, demographic_sampler= Demographic(),
                target_sampler = Target(),
                feature_sampler = Feature(),
                feature_noise_sampler = FeatureNoise()):
        '''
        initialize a population based on the way to sample from it

        Parameters:
        -----------
        population_sampler : function handle
            function to sample from the distribution
        '''

        self.demographic_sampler = demographic_sampler
        self.target_sampler = target_sampler
        self.feature_sampler = feature_sampler
        self.feature_noise_sampler = feature_noise_sampler