Source code for mlsim.bias.bias_components

import numpy as np
import pandas as pd
from collections import namedtuple
from collections.abc import Iterable

DemParams = namedtuple('DemParams',['Pa','Pz_a'])
TargetParams = namedtuple('TargetParams',['Py_az'])
FeatureParams = namedtuple('FeatureParams',['distfunc','theta'])
NoiseParams = namedtuple('NoiseParams',['noisefunc','theta'])

class Sampler():
    '''
    base class for all samplers
    '''
    def __init__(self, param_tuple):
        '''
        Parameters
        ----------
        param_tuple : tuple
            parameters to be wrapped into this sampler's ParamCreator namedtuple
        '''
        self.params = self.ParamCreator(*param_tuple)


    # def sample():
    #     '''
    #     '''
    #     return self.outputs


class Demographic(Sampler):
    '''
    base class for sampling demographics (a = protected attribute, z = true target value)
    '''
    ParamCreator = DemParams

    def __init__(self, rho_a=.5, rho_z=.5):
        '''
        P(A=1) = rho_a
        P(Z=1) = rho_z

        default is independent sampling of a and z
        '''
        Pa = [1-rho_a, rho_a]
        self.A = [0, 1]
        Pz = [1-rho_z, rho_z]
        super().__init__((Pa, [Pz, Pz]))

    def sample(self, N):
        '''
        Sample P(A,Z) = P(Z|A)P(A)

        Parameters
        -----------
        N : integer
            number of samples to return

        Returns
        -------
        a_z_tuple : tuple
            a tuple of length 2 with elements a and z as column np arrays, each of length N
        '''
        a = np.random.choice(self.A, p=self.params.Pa, size=N)
        z = [np.random.choice([0, 1], p=self.params.Pz_a[ai]) for ai in a]
        return np.asarray(a).T, np.asarray(z).T

    def get_rho_a(self):
        '''
        get P(A=1)

        Returns
        -------
        rho_a : float
            probability of being in the disadvantaged group, A=1
        '''
        return self.params.Pa[1]

    def get_rho_z(self):
        '''
        return P(Z=1|A)

        Returns
        -------
        rho_z : np array of floats
            probability of the favorable outcome (z=1) for A=0 and A=1, in that order
        '''
        return np.asarray(self.params.Pz_a)[:, 1]
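

# Illustrative usage sketch (not part of the original module): shows how the
# default Demographic sampler can be used to draw protected attributes and true
# targets; the function name and parameter values below are hypothetical.
def _demo_demographic(N=1000):
    dem = Demographic(rho_a=.3, rho_z=.6)
    a, z = dem.sample(N)
    # empirical rates should be close to the configured rho_a and rho_z
    print('P(A=1) ~', a.mean(), 'target:', dem.get_rho_a())
    print('P(Z=1) ~', z.mean(), 'target:', dem.get_rho_z())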

class DemographicIndependent(Demographic):
    '''
    demographic sampler where A and Z are sampled independently
    '''
    def __init__(self, rho_a=.2, rho_z=.1):
        '''
        P(A=1) = rho_a
        P(Z=1) = rho_z

        default is independent sampling of a and z
        '''
        super().__init__(rho_a, rho_z)

class DemographicCorrelated(Demographic):
    '''
    demographic sampler where Z depends on A
    '''
    def __init__(self, rho_a=.5, rho_z=[.5, .3]):
        '''
        P(A=1) = rho_a or P(A) = rho_a
        P(Z=1|A=i) = rho_z[i]

        Parameters
        ----------
        rho_a : scalar or vector of floats
            probability of A=1 or distribution of A
        rho_z : vector of length 2 or len(rho_a)
            probability of Z=1 for A=i
        '''
        if isinstance(rho_a, Iterable):
            Pa = rho_a
            self.A = list(range(len(rho_a)))
        else:
            Pa = [1-rho_a, rho_a]
            self.A = [0, 1]
        Pz_a = [[1-rho_zi, rho_zi] for rho_zi in rho_z]
        Sampler.__init__(self, (Pa, Pz_a))
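

# Illustrative usage sketch (not part of the original module): draws correlated
# demographics where the favorable outcome rate differs by group; the parameter
# values are hypothetical.
def _demo_demographic_correlated(N=1000):
    dem = DemographicCorrelated(rho_a=.4, rho_z=[.7, .3])
    a, z = dem.sample(N)
    # group-conditional outcome rates should approach .7 (A=0) and .3 (A=1)
    print('P(Z=1|A=0) ~', z[a == 0].mean())
    print('P(Z=1|A=1) ~', z[a == 1].mean())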

class Target(Sampler):
    '''
    base class for sampling the proxy target y from P(Y|A,Z)
    '''
    ParamCreator = TargetParams

    def __init__(self, beta=0.05, N_a=2):
        '''
        P(Y=Z|A,Z) = P(Y=Z) = 1-beta

        make errors with prob beta; beta=0 makes Y=Z
        '''
        pyeqz = [1-beta, beta]
        Py_az = [[pyeqz, pyeqz]]*N_a
        super().__init__((Py_az,))

    def sample(self, a, z):
        '''
        sample P(Y|A,Z) via P(Y=Z|A,Z)

        Parameters
        -----------
        a : list-like, length n
            protected attribute values
        z : list-like, length n
            true target values
        '''
        y = [np.random.choice([zi, 1-zi], p=self.params.Py_az[ai][zi])
             for ai, zi in zip(a, z)]
        return np.asarray(y).T
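

# Illustrative usage sketch (not part of the original module): chains a
# Demographic sampler with a Target sampler so that the proxy target y
# disagrees with the true target z with probability beta.
def _demo_target(N=1000, beta=.1):
    a, z = Demographic().sample(N)
    y = Target(beta=beta).sample(a, z)
    # the disagreement rate between y and z should be close to beta
    print('P(Y != Z) ~', (y != z).mean())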

class TargetDisadvantagedError(Target):
    '''
    target sampler where only the disadvantaged group has label errors
    '''
    def __init__(self, beta=.1, N_a=2):
        '''
        make errors with prob beta for the disadvantaged group (A = N_a-1)

        P(Y=Z|A=1,Z) = P(Y=Z|A=1) = 1-beta
        P(Y=Z|A=0,Z) = P(Y=Z|A=0) = 1
        '''
        pyeqz = [1-beta, beta]
        no_error = [1, 0]
        Py_az = [[no_error, no_error]]*(N_a-1) + [[pyeqz, pyeqz]]
        Sampler.__init__(self, (Py_az,))

class TargetTwoError(Target):
    '''
    target sampler with a separate error rate per group
    '''
    def __init__(self, beta=[0, .1]):
        '''
        make errors with prob beta

        P(Y=Z|A=1,Z) = P(Y=Z|A=1) = 1-beta[1]
        P(Y=Z|A=0,Z) = P(Y=Z|A=0) = 1-beta[0]
        '''
        pyz_a0 = [1-beta[0], beta[0]]
        pyz_a1 = [1-beta[1], beta[1]]
        Py_az = [[pyz_a0, pyz_a0], [pyz_a1, pyz_a1]]
        Sampler.__init__(self, (Py_az,))

class TargetAllAError(Target):
    '''
    target sampler with a separate error rate for every group, for any number of groups
    '''
    def __init__(self, beta=[0, .1]):
        '''
        make errors with prob beta

        P(Y=Z|A=1,Z) = P(Y=Z|A=1) = 1-beta[1]
        P(Y=Z|A=0,Z) = P(Y=Z|A=0) = 1-beta[0]
        '''
        Py_az = [[[1-betaai, betaai]]*2 for betaai in beta]
        Sampler.__init__(self, (Py_az,))


class TargetFlipNegative(Target):
    '''
    target sampler that only flips labels for the Z=1 class
    '''
    def __init__(self, beta=[0, .1]):
        '''
        make errors with prob beta, only for the Z=1 class

        P(Y=Z|A=1,Z=1) = 1-beta[1]
        P(Y=Z|A=0,Z=1) = 1-beta[0]
        P(Y=Z|Z=0) = 1
        '''
        no_error = [1, 0]  # if z=0, P(Y=z) = 1
        Py_az = [[no_error, [1-betaai, betaai]] for betaai in beta]
        Sampler.__init__(self, (Py_az,))


class TargetFlipAllIndep(Target):
    '''
    target sampler with an independent error rate for every (A,Z) combination
    '''
    def __init__(self, beta=[[.05, .1], [.05, .1]]):
        '''
        make errors with prob beta for all possible combinations of A,Z

        P(Y=Z|A=1,Z=1) = 1 - beta[1][1]
        P(Y=Z|A=0,Z=1) = 1 - beta[0][1]
        P(Y=Z|A=1,Z=0) = 1 - beta[1][0]
        P(Y=Z|A=0,Z=0) = 1 - beta[0][0]
        '''
        Py_az = [[[1-b, b] for b in be] for be in beta]
        Sampler.__init__(self, (Py_az,))


mean_only_mvn = lambda mu: np.random.multivariate_normal(mu, np.eye(len(mu)))
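

# Illustrative usage sketch (not part of the original module): uses
# TargetTwoError to inject label noise only into group A=1 and checks the
# per-group disagreement rates; the beta values are hypothetical.
def _demo_group_target_error(N=2000):
    a, z = Demographic().sample(N)
    y = TargetTwoError(beta=[0, .2]).sample(a, z)
    print('P(Y != Z | A=0) ~', (y != z)[a == 0].mean())  # ~ 0
    print('P(Y != Z | A=1) ~', (y != z)[a == 1].mean())  # ~ .2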

class Feature(Sampler):
    '''
    base class for all feature samplers: P(X|A,Z,Y)

    by default creates two-dimensional features with shared parameters across
    groups and good separability of classes

    Attributes
    ----------
    dist : function handle
        function to sample X|parameters where the parameters depend on Z,A,Y
    theta : list-like or list of tuples
        params of dist, one per value of z, a, y
    '''
    ParamCreator = FeatureParams

    def __init__(self, dist=mean_only_mvn, mu=[[5, 2], [2, 5]],
                 param_tuple=None, N_a=2):
        '''
        Parameters
        ----------
        dist : function handle
            function to sample X|parameters where the parameters depend on
            Z,A,Y through theta; default is a mean-only multivariate normal
        mu : list-like
            parameters for dist, one per value of z; inner list can be a
            tuple or list depending on the needs of dist
        N_a : int
            number of values for A
        '''
        if param_tuple:
            # used by subclasses that construct theta themselves
            super().__init__(param_tuple)
        else:
            # default params passed:
            # mu differs for Z=0,1; repeat for all values of A and Y
            N_y = len(mu)  # |Y| = |Z|; z and y have the same number of values
            theta = [[mu]*N_a]*N_y
            super().__init__((dist, theta))

    def sample(self, a, z, y):
        '''
        sample P(X|A,Z,Y) using the distribution and parameters initialized
        for each a,z,y. The vectors a,z,y must be the same shape

        Parameters
        ----------
        a : list-like, length n
            demographic variables
        z : list-like, length n
            true target
        y : list-like, length n
            proxy target

        Returns
        --------
        x : list-like, length n
            features, same length as a,z,y
        '''
        if type(self.params.theta[0][0][0]) == tuple:
            # if a tuple, expand and pass multiple params to dist
            x = [self.params.distfunc(*self.params.theta[yi][ai][zi])
                 for ai, zi, yi in zip(a, z, y)]
        else:
            x = [self.params.distfunc(self.params.theta[yi][ai][zi])
                 for ai, zi, yi in zip(a, z, y)]
        return np.asarray(x)
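

# Illustrative usage sketch (not part of the original module): the full default
# pipeline, sampling demographics, proxy targets, and features.
def _demo_feature_pipeline(N=500):
    a, z = Demographic().sample(N)
    y = Target().sample(a, z)
    x = Feature().sample(a, z, y)
    # the default Feature gives 2-d features whose mean depends only on z
    print('x shape:', x.shape)
    print('class means:', x[z == 0].mean(axis=0), x[z == 1].mean(axis=0))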
mvn = lambda mu,var :np.random.multivariate_normal(mu,var*np.eye(len(mu)))

class FeatureSharedParam(Feature):
    '''
    feature sampler with two total parameters and one parameter shared across Z
    (e.g. shared spread); A and Y have no impact on X
    '''
    def __init__(self, loc, spread, dist=mvn, N_a=2):
        '''
        unique locations per z and a shared spread; no impact of A or Y

        Parameters
        -----------
        dist : function handle
            function to sample X|parameters where the parameters depend on Z,A,Y
        loc : list-like, length |Z|
            location parameter of dist, one per value of z
        spread : scalar
            shared spread parameter of dist
        '''
        theta_z = [(li, spread) for li in loc]
        theta = [[theta_z]*N_a]*len(loc)
        super().__init__(param_tuple=(dist, theta))

class FeatureTwoParams(Feature):
    '''
    feature sampler with two unique parameters per class
    '''
    def __init__(self, loc, spread, dist=mvn, N_a=2):
        '''
        unique locations and spreads per z; no impact of A or Y

        Parameters
        -----------
        dist : function handle
            function to sample X|parameters where the parameters depend on Z,A,Y
        loc : list-like, length |Z|
            location parameter of dist, one per value of z
        spread : list-like, length |Z|
            spread parameter of dist, one per value of z
        '''
        theta_z = [(li, si) for li, si in zip(loc, spread)]
        theta = [[theta_z]*N_a]*2
        super().__init__(param_tuple=(dist, theta))
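

# Illustrative usage sketch (not part of the original module): a feature
# sampler where the Z=1 class has a wider spread than Z=0 but A and Y have no
# effect; the loc and spread values are hypothetical.
def _demo_feature_two_params(N=500):
    a, z = Demographic().sample(N)
    y = Target().sample(a, z)
    x = FeatureTwoParams(loc=[[0, 0], [4, 4]], spread=[1, 3]).sample(a, z, y)
    print('within-class std:', x[z == 0].std(), x[z == 1].std())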

class FeaturePerGroupTwoParam(Feature):
    '''
    feature sampler with two parameters that vary per group
    '''
    def __init__(self, dist, loc, spread):
        '''
        for feature bias where P(X|Z,Y,A=0) != P(X|Z,Y,A=1)

        Parameters
        -----------
        dist : function handle
            function to sample X|parameters where the parameters depend on Z,A,Y
        loc : list-like length |Z| of list-like length |A|
            location parameter of dist, one per value of z,a; for multivariate
            feature spaces, in general each location parameter will be a list
            of length equal to the number of features
        spread : list-like length |Z| of lists length |A|
            spread parameter of dist, one per value of z,a
        '''
        theta_za = [[(lii, sii) for lii, sii in zip(li, si)]
                    for li, si in zip(loc, spread)]
        # repeat so that features do not vary with Y
        theta = [theta_za, theta_za]
        super().__init__(param_tuple=(dist, theta))
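

# Illustrative usage sketch (not part of the original module): feature bias
# where group A=1 has shifted class means relative to A=0; the loc and spread
# values are hypothetical.
def _demo_feature_per_group(N=500):
    a, z = Demographic().sample(N)
    y = Target().sample(a, z)
    # hypothetical per-group, per-class means and spreads
    loc = [[[0, 0], [1, 1]],
           [[4, 4], [5, 5]]]
    spread = [[1, 1], [1, 1]]
    x = FeaturePerGroupTwoParam(mvn, loc, spread).sample(a, z, y)
    print('group means:', x[a == 0].mean(axis=0), x[a == 1].mean(axis=0))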

class FeaturePerGroupSharedParamWithinGroup(Feature):
    '''
    feature sampler where the location varies per group but one parameter of
    dist is shared across groups and unique per class
    '''
    def __init__(self, dist, loc, spread):
        '''
        for feature bias where P(X|Z,Y,A=0) != P(X|Z,Y,A=1) but one parameter
        of dist is shared across groups and unique per class

        Parameters
        -----------
        dist : function handle
            function to sample X|parameters where the parameters depend on Z,A,Y
        loc : list-like length |Z| of lists length 2
            location parameter of dist, one per value of z,a
        spread : list-like, length |Z|
            spread parameter of dist, one per value of z
        '''
        theta_za = [[(laizi, covzi) for laizi, covzi in zip(laiz, spread)]
                    for laiz in loc]
        # same for both values of y
        theta = [theta_za, theta_za]
        super().__init__(param_tuple=(dist, theta))

class FeaturePerGroupSharedParamAcrossGroups(Feature):
    '''
    feature sampler where the location varies per group but one parameter of
    dist is shared across groups and classes
    '''
    def __init__(self, dist, loc, spread):
        '''
        for feature bias where P(X|Z,Y,A=0) != P(X|Z,Y,A=1) but one parameter
        is shared across groups and classes

        Parameters
        -----------
        dist : function handle
            function to sample X|parameters where the parameters depend on Z,A,Y
        loc : list-like length |Z| of lists length 2
            location parameter of dist, one per value of z,a
        spread : scalar
            spread parameter of dist
        '''
        theta_za = [[(laizi, spread) for laizi in laiz] for laiz in loc]
        # same for both values of y
        theta = [theta_za, theta_za]
        super().__init__(param_tuple=(dist, theta))

class FeatureMeasurementQualityProxy(Feature):
    '''
    the measurement locations vary with the true target value z and the
    measurement spreads vary with the measured target value y, allowing for
    error to be present in both the features and the measurements; both may
    also vary with the protected attribute
    '''
    def __init__(self, dist, loc, spread):
        '''
        Parameters
        ----------
        loc : list-like
            one location parameter value per (true value, protected attribute) pair
        spread : list-like
            one spread parameter value per (proxy value, protected attribute) pair
        '''
        theta_yaz = [[[(lii, sii) for lii, sii in zip(li, si)] for li in loc]
                     for si in spread]
        super().__init__(param_tuple=(dist, theta_yaz))
shape_spread_only_mvn = lambda x,cov: x + np.random.multivariate_normal([0]*len(x),cov*np.eye(len(x)))

class FeatureNoise(Sampler):
    '''
    Base class for adding noise to features
    '''
    ParamCreator = NoiseParams

    def __init__(self, dist=shape_spread_only_mvn, sig=1.0, N_a=2):
        '''
        Parameters
        ----------
        dist : function handle
            function that adds noise to a feature vector given a spread parameter
        sig : float or list-like of length N_a
            noise spread, either constant or one value per protected attribute
        '''
        if isinstance(sig, float):
            # constant noise
            theta = [[[sig, sig]]*N_a]*2
        else:
            # different noise per protected attribute
            theta = [[[sigi, sigi] for sigi in sig]]*2
        super().__init__((dist, theta))

    def sample(self, a, z, y, x):
        '''
        add noise to the features, conditioned on a,z,y, so that a groupwise
        noise level can be applied to the feature vectors
        '''
        x = [self.params.noisefunc(xi, self.params.theta[yi][ai][zi])
             for xi, ai, zi, yi in zip(x, a, z, y)]
        return np.asarray(x)
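

# Illustrative usage sketch (not part of the original module): adds more
# feature noise to group A=1 than to group A=0; the sig values are hypothetical.
def _demo_feature_noise(N=500):
    a, z = Demographic().sample(N)
    y = Target().sample(a, z)
    x = Feature().sample(a, z, y)
    x_noisy = FeatureNoise(sig=[.1, 4.0]).sample(a, z, y, x)
    print('noise added (A=0):', np.abs(x_noisy - x)[a == 0].mean())
    print('noise added (A=1):', np.abs(x_noisy - x)[a == 1].mean())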

class FeatureNoiseReplace(FeatureNoise):
    '''
    feature noise that replaces some of the features with noise according to
    mean and covariance attributes
    '''
    def __init__(self, dist, mu=[0, 0, 0], cov=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                 d_shared=1):
        '''
        for subspace bias: keep the same number of total features, but replace
        some with noise. The d_shared features in the middle are valid for both
        groups; the first half (ceiled) of the remaining features is replaced
        with noise for the disadvantaged group and the last portion (floored)
        for the advantaged group

        Parameters
        ----------
        dist : function handle
            noise distribution, called as dist(mu, cov)
        mu : list
            noise mean, default [0, 0, 0]
        cov : list
            noise covariance matrix, default is identity in 3 dimensions
        d_shared : int
            number of shared features that are informative for both groups
        '''
        d = len(mu)
        d_adv_noise = int(np.floor((d-d_shared)/2))  # noise dims per group
        d_dis_noise = int(np.ceil((d-d_shared)/2))
        d_adv_signal = d_shared + d_dis_noise  # informative dims per group
        d_dis_signal = d_shared + d_adv_noise

        # create masks to zero out features or noise as appropriate for adding
        adv_data_mask = np.asarray([1]*d_adv_signal + [0]*d_adv_noise)
        adv_noise_mask = 1 - adv_data_mask
        dis_data_mask = np.asarray([0]*d_dis_noise + [1]*d_dis_signal)
        dis_noise_mask = 1 - dis_data_mask

        theta_adv = (mu, cov, adv_data_mask, adv_noise_mask)
        theta_dis = (mu, cov, dis_data_mask, dis_noise_mask)
        theta_az = [[theta_adv, theta_adv], [theta_dis, theta_dis]]

        self.distfunc = dist  # noise distribution used by noise_replace_func
        noisefunc = lambda x, theta: self.noise_replace_func(x, *theta)
        Sampler.__init__(self, (noisefunc, [theta_az, theta_az]))

    def noise_replace_func(self, x, mu, cov, data_mask, noise_mask):
        # keep the signal dimensions and replace the masked dimensions with noise
        x = x*data_mask + self.distfunc(mu, cov)*noise_mask
        return x
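

# Illustrative usage sketch (not part of the original module): replaces part of
# a hypothetical 3-d feature vector with noise, with different replaced
# dimensions per group.
def _demo_feature_noise_replace(N=500):
    a, z = Demographic().sample(N)
    y = Target().sample(a, z)
    # hypothetical 3-d features whose mean depends on z
    x = Feature(mu=[[5, 2, 2], [2, 5, 5]]).sample(a, z, y)
    noiser = FeatureNoiseReplace(np.random.multivariate_normal, d_shared=1)
    x_noisy = noiser.sample(a, z, y, x)
    # fraction of samples whose value changed, per dimension and group
    print('changed dims (A=0):', (np.abs(x_noisy - x) > 1e-12)[a == 0].mean(axis=0))
    print('changed dims (A=1):', (np.abs(x_noisy - x) > 1e-12)[a == 1].mean(axis=0))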

class FeatureNoiseShift(FeatureNoise):
    '''
    TODO: make work
    '''
    def sample(self, a, z, y, x, dist, theta):
        '''
        for subspace bias: keep d_shared features in the middle aligned for
        both groups, with d total informative features for each group, by
        appending noise at the end of the feature vector for the advantaged
        group and prepending noise and moving the first few features to the
        end for the disadvantaged group
        '''
        d, N = x.shape
        d_shared = theta[0]
        d_noise = d - d_shared  # noise dims per row
        d_total = d + d_noise  # total dims
        # generate the noise
        x_n = np.random.multivariate_normal([0]*d_noise, np.eye(d_noise), N)
        # functions for combining noise and true vectors
        x_a = {0: lambda x, n: np.concatenate((x[:d_noise], n)),
               1: lambda x, n: np.concatenate((n, x[d_shared-1:d], x[:d_noise]))}
        x = [x_a[ai](x_zi, x_ni) for ai, x_zi, x_ni in zip(a, x, x_n)]
        x = np.asarray(x)
        return x


# --------------------------------------------
# need to be incorporated
def feature_proxy(a, z, y, distfunc, theta):
    '''
    some features are related to the ground truth and some are related to
    the proxy

    Parameters
    ----------
    theta : tuple
        (loc, spread) parameters of distfunc, indexed by target value and
        protected attribute
    '''
    loc = theta[0]
    spread = theta[1]
    x_signal = [distfunc(loc[z_i][a_i], spread[z_i][a_i]) for z_i, a_i in zip(z, a)]
    x_signal = np.asarray(x_signal)
    x_proxy = [distfunc(loc[y_i][a_i], spread[y_i][a_i]) for y_i, a_i in zip(y, a)]
    x_proxy = np.asarray(x_proxy)
    # concatenate signal and proxy features per sample
    x = np.concatenate([x_signal, x_proxy], axis=1)
    return x
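

# Illustrative end-to-end sketch (not part of the original module): composes
# the samplers above into a single biased dataset and packs it into a pandas
# DataFrame; the column names and parameter values are hypothetical.
def _demo_build_dataset(N=1000):
    a, z = DemographicCorrelated(rho_a=.4, rho_z=[.7, .4]).sample(N)
    y = TargetTwoError(beta=[0, .15]).sample(a, z)
    x = Feature().sample(a, z, y)
    df = pd.DataFrame(x, columns=['x0', 'x1'])
    df['a'], df['z'], df['y'] = a, z, y
    return df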