import pandas as pd
from warnings import warn
import itertools
import os
from datetime import datetime
import traceback
import subprocess
def clean_string(parameter_val):
    """
    Make a short, filesystem-friendly string out of a parameter value.

    Uses the ``name`` attribute for keras objects and anything else that
    has one, ``__name__`` for functions/classes, and otherwise the
    ``str()`` form with ``()`` removed and ``, `` collapsed to ``-``.
    """
    fallback = str(parameter_val)
    # keras objects are detected by their type's repr; they carry a .name
    if 'keras' in str(type(parameter_val)):
        return parameter_val.name
    if hasattr(parameter_val, 'name'):
        return parameter_val.name
    if hasattr(parameter_val, '__name__'):
        return parameter_val.__name__
    return fallback.replace('()', '').replace(', ', '-')
def get_installed_packages():
    """
    Get a list of all installed Python packages and their versions.

    Returns
    -------
    list of str
        List of strings in the format "package_name==version" for each
        installed package.

    Examples
    --------
    >>> get_installed_packages()  # doctest: +SKIP
    ['numpy==1.21.0', 'pandas==1.3.0', 'scikit-learn==0.24.2']
    """
    # fix: `importlib` was used below but never imported at module level,
    # which made this function raise NameError on every call
    import importlib.metadata
    installed_packages = []
    for distribution in importlib.metadata.distributions():
        name = distribution.metadata['Name']
        version = distribution.version
        # skip distributions with broken/missing metadata fields
        if name and version:
            installed_packages.append(f"{name}=={version}")
    return installed_packages
# Warning messages used by Experiment.run_test to flag experiment-function
# return values that cannot be saved as an ExperimentResult.
NOT_DF_WARNING = 'experiment function does not return a DataFrame, results cannot be saved'
SERIES_WARNING = 'it returns a series, hint: use `to_frame().T` to make the function compatible'
TALL_DF_WARNING = 'the result only has one column, you may want to transpose it (`.T`)'
class Experiment():
    '''
    Run a function over a grid of parameters and save each case's result
    in a structured directory tree with collision-safe names.
    '''
    def __init__(self, experiment_function, parameter_grid, name_func=None,
                 parameter_string=None, name_format=None):
        '''
        create an experiment object

        Parameters
        ----------
        experiment_function : function
            the whole experiment that takes in parameters
        parameter_grid : dictionary
            keys are parameters to the experiment_function, values are the
            values to try for that parameter
        name_func : function or None
            a function that creates a file name from the string versions of
            the parameter values
        parameter_string : dictionary or None
            key for each key in the param grid and the value is a callable
            for how to make that value a string, by default uses casting or
            the name attribute for all
        name_format : string or None
            format string with parameter names as {keys} so that they can
            be filled in
        '''
        self.experiment_function = experiment_function
        # warn on duplicate grid entries so the same case is not run twice
        for cur_key, values in parameter_grid.items():
            if isinstance(values, list):
                try:
                    unique_values = set(values)
                except TypeError:
                    # fix: unhashable values (lists, DataFrames, ...) made
                    # set() raise; skip the duplicate check for those keys
                    continue
                if len(values) != len(unique_values):
                    warn(f"Remove duplicate entries in '{cur_key}'", UserWarning)
        self.expt_result_params = []
        # expand any ExperimentResult values into lists of namedtuples
        for param, val in parameter_grid.items():
            if 'expttools.analysis.ExperimentResult' in str(type(val)):
                self.expt_result_params.append(param)
                # load dataset list, replacing the ExperimentResult object
                parameter_grid[param] = parameter_grid[param].get_named_tuples()
        self.parameter_grid = parameter_grid
        self.num_params = len(parameter_grid.keys())
        # default stringifier for every key, then overlay user-provided ones
        # (fix: default was a mutable default argument `parameter_string={}`)
        self.parameter_string = {k: lambda p: clean_string(p)
                                 for k in parameter_grid.keys()}
        if parameter_string:
            self.parameter_string.update(parameter_string)
        # build a default naming function unless one was passed
        if not name_func:
            if not name_format:
                # join the stringified parameter values with underscores
                self.name_func = lambda cp: '_'.join([self.parameter_string[p](v)
                                                      for p, v in cp.items()])
            else:
                # stringify and fill into the user-supplied format string
                self.name_func = lambda cp: name_format.format(
                    **{p: self.parameter_string[p](v) for p, v in cp.items()})
        else:
            self.name_func = name_func
[docs]
def run_test(self,param_i=None,verbose=False):
'''
Call the function with a single (default first) value of each parameter
and return the result. throws warnings if save format is incompatible with
ExperimentResult
Parameters
----------
param_i : list
list of ints to use as the index for each parameter in order they
appear in the parameter grid
verbose : boolean
display maximum outputs
Returns
-------
cur_name : string
name of the tested case
expt_result : (df)
what the experiment function returns
'''
if not(param_i):
param_i = [0]*self.num_params
cur_params = {k:v[i] for i,(k,v) in zip(param_i,
self.parameter_grid.items())}
cur_name = self.name_func(cur_params)
expt_result = self.experiment_function(**cur_params)
if type(expt_result) == pd.DataFrame:
res_rows, res_cols = expt_result.shape
if res_cols == 1:
warn(TALL_DF_WARNING,category=RuntimeWarning)
elif type(expt_result) == pd.Series:
warn(SERIES_WARNING,category=RuntimeWarning)
else:
warn(NOT_DF_WARNING,category=RuntimeWarning)
return cur_name, expt_result
[docs]
def get_name_list(self):
'''
generate list of all names
Returns
-------
name_list : list of str
names of all folders to be created
'''
parameter_names = self.parameter_grid.keys()
parameter_it = itertools.product(*self.parameter_grid.values())
param_d_list = [{param:param_val for param,param_val in
zip(parameter_names,cur_param_values)}
for cur_param_values in parameter_it]
name_list = [self.name_func(cp) for cp in param_d_list]
return name_list
[docs]
def validate_name_func(self):
'''
validate that the names will be unique
Returns
-------
unique_nams : bool
True if all names are unique
'''
# get names
name_list = self.get_name_list()
# compare list len to set len to check unique
return len(name_list) ==len(set(name_list))
[docs]
def get_max_name_length(self):
'''
get the max length of all names
'''
name_list = self.get_name_list()
name_lens = [len(n) for n in name_list]
return max(name_lens)
[docs]
def restart_batch(self, full_path=None, base_path = None, date_stamp_name=None,
expt_name=None,verbose=False):
'''
restart a run, uses default 'results' dir and most recent one that matches
the experiment_function if no path is named or fills in from parts.
Parameters
----------
full_path : string or path
use this to specify the full path to the results, specifying this
overrides other input parameters of this method
date_stamp_name : string
specify the particular previous run you want to re-start
base_path : string or path
directory to search for previous runs
expt_name : string
name to filter on; uses experiment_function.__name__ if None
'''
# if full path not specified build it
if not(full_path):
# use default base if not specified
if not(base_path):
base_path = 'results'
# infer batch if not specified
if not(date_stamp_name):
# infer expt_name if not stated
if not(expt_name):
expt_name = self.experiment_function.__name__
runs = [run_dir for run_dir in os.listdir(base_path)
if expt_name in run_dir]
date_stamp_name = sorted(runs)[-1]
full_path = os.path.join(base_path,date_stamp_name)
# Add restart logging
restart_log_path = os.path.join(full_path, 'restart_log.txt')
with open(restart_log_path, 'a') as f:
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
f.write(f"Restarted at: {timestamp}\n")
if expt_name:
f.write(f"Experiment name: {expt_name}\n")
f.write("-" * 40 + "\n")
# call run batch with expt_name set to the previous one and datestamp_file
# set to false so that it uses the same directory
return self.run_batch(self, expt_name= date_stamp_name, verbose=verbose,
datestamp_file=False, restart=True, full_path=full_path)
    def run_batch(self, base_path='results',
                  name_func = None,expt_name = None, datestamp_file=True,
                  nested=False,verbose=False,restart=False,full_path=None):
        '''
        run a batch of experiments over the parameter grid and save all results in
        a structured file system with naming conventions that protect from overwriting

        Parameters
        ----------
        base_path : path or string
            base path to create subfolders in for this set of experiments
        name_func : function or None
            a function that creates a file name from parameter values
        expt_name : string, default None
            meaningful name for the batch to prepend to the date to, if None,
            uses the `__name__` attribute of the experiment function
        datestamp_file : boolean, default True
            flag to datestamp or not, typically should not be changed
        nested : boolean, default False
            flag to treat parameters as nested
            NOTE(review): only ``nested=False`` is implemented; ``nested=True``
            never defines ``parameter_it`` and the loop below raises NameError
        verbose : boolean, default False
            flag to display debugging output
        restart : boolean, default False
            is this a restart
        full_path : path or string, default None
            used for restart; generally should not be used

        Returns
        -------
        date_stamp_name : string
            name of the batch directory that was used
        success_list : list of string
            experiment names that succeed
        failed_list : list of strings
            experiment names that threw an exception
        '''
        if not(nested):
            # all by all grid
            parameter_names = self.parameter_grid.keys()
            # create iterator over every combination of parameter values
            parameter_it = itertools.product(*self.parameter_grid.values())
        # create a naming function if not provided
        if name_func:
            self.name_func = name_func
        # use function name if not provided
        if not(expt_name):
            expt_name = self.experiment_function.__name__
        # create datestamp for whole batch
        if datestamp_file:
            date_stamp_name = expt_name+ datetime.today().strftime('%Y_%m_%d_%H_%M_%S_%f')
        else: #used in restart
            date_stamp_name = expt_name
        # create the full path for the whole batch
        if not(full_path):
            full_path = os.path.join(base_path,date_stamp_name)
        # create directory unless restarting
        if not(restart):
            os.mkdir(full_path)
            # snapshot the environment so results can be reproduced later
            with open(os.path.join(full_path, 'dependency_versions.txt'), 'w') as f:
                subprocess.run(['pip', 'freeze'], stdout=f)
        # run experiments & save results
        success_list = []
        failed_list = []
        for cur_param_values in parameter_it:
            # Create function input
            cur_params = {param:param_val for param,param_val in
                          zip(parameter_names,cur_param_values)}
            text_params_in = cur_params.copy()
            # if ExperimentResult was passed, process the namedtuples
            if self.expt_result_params:
                info_df_list = []
                for param in self.expt_result_params:
                    # original except for expt results
                    text_params_in[param] = cur_params[param].name
                    info_df_list.append(cur_params[param].info)
                    cur_params[param] = cur_params[param].dataframe
            # store values as strings before making meta_df
            text_params = {param:self.parameter_string[param](val)
                           for param,val in text_params_in.items()}
            meta_df = pd.Series(text_params)
            # append info_dfs from ExperimentResults if applicable
            if self.expt_result_params:
                meta_df = pd.concat([meta_df] + info_df_list)
            # create a name from the current parameters
            name = self.name_func(text_params_in)
            meta_df['dir_name'] = name
            # create current result dir name
            cur_result_dir = os.path.join(full_path,name)
            # if restart and the directory exists, continue to next
            if restart and os.path.isdir(cur_result_dir):
                # count it as successful
                success_list.append(name)
                # do not rerun, go to next loop
                continue
            # make the directory if not restarting or restart, but it doesn't exist
            try:
                os.mkdir(cur_result_dir)
            except FileExistsError:
                print("the directory", name,"exists already, try using the \
`get_name_list` method to figure out how the name_func is not \
generating unique names")
                raise
            # run experiment and save
            try:
                result_df = self.experiment_function(**cur_params)
                # save
                self.save_result(result_df,meta_df,name,full_path)
                success_list.append(name)
            except Exception as err:
                # NOTE(review): ``err`` is unused; the formatted traceback is
                # written to the log instead
                self.save_failed(meta_df,name,full_path,traceback.format_exc())
                failed_list.append(name)
        return date_stamp_name, success_list, failed_list
# the static methods are helper functions that do not need access to self
[docs]
@staticmethod
def param_string(parameter_value):
'''
return a string based on the type
'''
if callable(parameter_value):
return parameter_value.__name__
else:
return str(parameter_value)
[docs]
@staticmethod
def save_failed(meta_df,name,path,err):
'''
save metadata only when an experiment fails
'''
dir_name = os.path.join(path,name)
meta_df.to_csv(os.path.join(dir_name, 'failed.csv'),header=False)
with open(os.path.join(dir_name,'log.txt'),'w') as f:
f.write(str(err))
return True
[docs]
@staticmethod
def save_result(result_df,meta_df,name,path):
'''
save results with good naming conventions
Parameters
-----------
'''
dir_name = os.path.join(path,name)
result_df.to_csv(os.path.join(dir_name, 'result.csv'),index=False)
meta_df.to_csv(os.path.join(dir_name, 'info.csv'),header=False)
return True