Regression -Based Simpson’s Paradox#

#imports
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt


import mlsim 
from mlsim.anomaly import sp_plot

The basic version of SP in the regression form is the clustering model based SP- we use a gaussian mixture model and control parameters of the shape

# setup
r_clusters = -.6  # correlation coefficient of clusters
cluster_spread = .8 # pearson correlation of means
p_sp_clusters = .5 # portion of clusters with SP 
k = 5 # number of clusters
cluster_size = [2,3]
domain_range = [0, 20, 0, 20]
N = 200 # number of points
p_clusters = [1.0/k]*k

We choose the portion of the clusters to have SP and then can draw samples

p_sp_clusters = .9
sp_df2 = mlsim.anomaly.geometric_2d_gmm_sp(r_clusters,cluster_size,cluster_spread,
                                    p_sp_clusters, domain_range,k,N,p_clusters)
sp_plot(sp_df2,'x1','x2','color')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 4
      1 p_sp_clusters = .9
      2 sp_df2 = mlsim.anomaly.geometric_2d_gmm_sp(r_clusters,cluster_size,cluster_spread,
      3                                     p_sp_clusters, domain_range,k,N,p_clusters)
----> 4 sp_plot(sp_df2,'x1','x2','color')

File ~/work/ml-sim/ml-sim/mlsim/anomaly/plot_utils.py:27, in sp_plot(df, x_col, y_col, color_col, ci, domain_range, ax, aggplot, x_jitter, height, legend)
     23 n_markers = df[color_col].unique().shape[0] # number unique
     24 cur_markers = all_markers[:n_markers]
---> 27 sns.lmplot(x_col, y_col, data=df, hue=color_col, ci=ci,
     28                markers =cur_markers, palette="Set1",x_jitter=x_jitter,
     29                height=height,legend=legend)
     30 if aggplot:
     31     # adda whole data regression line, but don't cover the scatter data
     32     sns.regplot(x_col, y_col, data=df, color='black', scatter=False, ci=ci,)

TypeError: lmplot() got multiple values for argument 'data'

We can change the parameters and see variation

# setup
r_clusters = -.4  # correlation coefficient of clusters
cluster_spread = .8 # pearson correlation of means
p_sp_clusters = .6 # portion of clusters with SP 
k = 5 # number of clusters
cluster_size = [4,4]
domain_range = [0, 20, 0, 20]
N = 200 # number of points
p_clusters = [.5, .2, .1, .1, .1]

sp_df3 = mlsim.anomaly.geometric_2d_gmm_sp(r_clusters,cluster_size,cluster_spread,
                                    p_sp_clusters, domain_range,k,N,p_clusters)
sp_plot(sp_df3,'x1','x2','color')

Multiple Views#

The first extension is to add multiple independent views, we have a wrapper function for that

many_sp_df = mlsim.anomaly.geometric_indep_views_gmm_sp(2,r_clusters,cluster_size,cluster_spread,p_sp_clusters,
                domain_range,k,N,p_clusters)

sp_plot(many_sp_df,'x1','x2','A')
sp_plot(many_sp_df,'x3','x4','B')
many_sp_df.head()

The views do not have to have the same parameters though. We can make each parameter a list of values with the length set to the number of views.

# setup
r_clusters = [.8, -.2]  # correlation coefficient of clusters
cluster_spread = [.8, .2] # pearson correlation of means
p_sp_clusters = [.6, 1] # portion of clusters with SP 
k = [5,3] # number of clusters
cluster_size = [4,4]
domain_range = [0, 20, 0, 20]
N = 200 # number of points
p_clusters = [[.5, .2, .1, .1, .1],[1.0/3]*3]


many_sp_df_diff = mlsim.anomaly.geometric_indep_views_gmm_sp(2,r_clusters,cluster_size,cluster_spread,p_sp_clusters,
                domain_range,k,N,p_clusters)

sp_plot(many_sp_df_diff,'x1','x2','A')
sp_plot(many_sp_df_diff,'x3','x4','B')
many_sp_df.head()