Source code for expttools.batch_run

import pandas as pd
from warnings import warn
import itertools
import os
from datetime import datetime
import traceback
import subprocess
import importlib.metadata

def clean_string(parameter_val):
    default_str = str(parameter_val)
    # use name if keras or __name__ if exists, otherwise treat as string
    if 'keras' in str(type(parameter_val)):
        return parameter_val.name
    elif hasattr(parameter_val,'name'):
        return parameter_val.name
    elif hasattr(parameter_val,'__name__'):
        return parameter_val.__name__
    else:
        return default_str.replace('()','').replace(', ','-')
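
# Illustrative sketch (not part of the module): clean_string maps parameter
# values to filesystem-friendly strings. With hypothetical inputs it behaves
# roughly like this (StandardScaler is only an example object with no
# `name`/`__name__` attribute):
#   clean_string(sorted)            -> 'sorted'          (via __name__)
#   clean_string([1, 2, 3])         -> '[1-2-3]'         (str(), then ', ' -> '-')
#   clean_string(StandardScaler())  -> 'StandardScaler'  (str(), '()' stripped)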

def get_installed_packages():
    """
    Get a list of all installed Python packages and their versions.

    Returns
    -------
    list of str
        List of strings in the format "package_name==version" for each installed package.

    Examples
    --------
    >>> get_installed_packages()
    ['numpy==1.21.0', 'pandas==1.3.0', 'scikit-learn==0.24.2']
    """
    installed_packages = []
    for distribution in importlib.metadata.distributions():
        name = distribution.metadata['Name']
        version = distribution.version
        if name and version:
            installed_packages.append(f"{name}=={version}")
    return installed_packages

## declare warning messages here
NOT_DF_WARNING = 'experiment function does not return a DataFrame, results cannot be saved'
SERIES_WARNING = 'the experiment function returns a Series; hint: use `.to_frame().T` to make it compatible'
TALL_DF_WARNING = 'the result only has one column, you may want to transpose it (`.T`)'


class Experiment():
    def __init__(self, experiment_function, parameter_grid, name_func=None,
                 parameter_string={}, name_format=None):
        '''
        create an experiment object

        Parameters
        ----------
        experiment_function : function
            the whole experiment that takes in parameters
        parameter_grid : dictionary
            keys are parameters to the experiment_function
        name_func : function or None
            a function that creates a file name from the parameter-string
            version of the parameter values
        parameter_string : dictionary
            a key for each key in the param grid; the value is a lambda for
            how to make that value a string. By default, uses casting or the
            name attribute for all
        name_format : string
            format string with parameter names as {keys} so that they can be
            filled in
        '''
        self.experiment_function = experiment_function

        # warn if there are duplicate entries in parameter_grid
        for cur_key, values in parameter_grid.items():
            if isinstance(values, list):
                if len(values) != len(set(values)):
                    warn(f"Remove duplicate entries in '{cur_key}'", UserWarning)

        self.expt_result_params = []
        # check for ExperimentResult objects and expand them
        for param, val in parameter_grid.items():
            if 'expttools.analysis.ExperimentResult' in str(type(val)):
                self.expt_result_params.append(param)
                # load the dataset list and replace the ExperimentResult
                # object with a list of namedtuples
                parameter_grid[param] = parameter_grid[param].get_named_tuples()

        self.parameter_grid = parameter_grid
        self.num_params = len(parameter_grid.keys())

        # set to default for all keys in the grid
        self.parameter_string = {k: lambda p: clean_string(p)
                                 for k in parameter_grid.keys()}
        # update with what is passed
        self.parameter_string.update(parameter_string)

        # make a default or use the passed name_func
        if not(name_func):
            if not(name_format):
                self.name_func = lambda cp: '_'.join(
                    [self.parameter_string[p](v) for p, v in cp.items()])
            else:
                # stringify and format according to the string passed
                self.name_func = lambda cp: name_format.format(
                    **{p: self.parameter_string[p](v) for p, v in cp.items()})
        else:
            self.name_func = name_func
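
    # Illustrative sketch (hypothetical, not part of the module): constructing
    # an Experiment over a small grid. `fit_and_score` and its parameters are
    # made-up placeholders; any function whose keyword arguments match the
    # grid keys would work.
    #
    #   def fit_and_score(alpha, n_samples):
    #       score = alpha * n_samples          # stand-in computation
    #       return pd.DataFrame([{'alpha': alpha,
    #                             'n_samples': n_samples,
    #                             'score': score}])
    #
    #   expt = Experiment(fit_and_score,
    #                     {'alpha': [0.1, 1.0], 'n_samples': [10, 100]},
    #                     name_format='a{alpha}_n{n_samples}')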
    def run_test(self, param_i=None, verbose=False):
        '''
        Call the function with a single (default first) value of each
        parameter and return the result. Throws warnings if the save format
        is incompatible with ExperimentResult.

        Parameters
        ----------
        param_i : list
            list of ints to use as the index for each parameter, in the order
            they appear in the parameter grid
        verbose : boolean
            display maximum outputs

        Returns
        -------
        cur_name : string
            name of the tested case
        expt_result : DataFrame
            what the experiment function returns
        '''
        if not(param_i):
            param_i = [0]*self.num_params

        cur_params = {k: v[i] for i, (k, v) in
                      zip(param_i, self.parameter_grid.items())}
        cur_name = self.name_func(cur_params)
        expt_result = self.experiment_function(**cur_params)

        if type(expt_result) == pd.DataFrame:
            res_rows, res_cols = expt_result.shape
            if res_cols == 1:
                warn(TALL_DF_WARNING, category=RuntimeWarning)
        elif type(expt_result) == pd.Series:
            warn(SERIES_WARNING, category=RuntimeWarning)
        else:
            warn(NOT_DF_WARNING, category=RuntimeWarning)

        return cur_name, expt_result
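
    # Illustrative sketch (hypothetical names, continuing the example above):
    #   name, df = expt.run_test()                 # first value of every parameter
    #   name, df = expt.run_test(param_i=[1, 0])   # second alpha, first n_samples
    # A RuntimeWarning here usually means the experiment function's return
    # value will not save cleanly as a one-row DataFrame.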
    def get_name_list(self):
        '''
        generate a list of all names

        Returns
        -------
        name_list : list of str
            names of all folders to be created
        '''
        parameter_names = self.parameter_grid.keys()
        parameter_it = itertools.product(*self.parameter_grid.values())
        param_d_list = [{param: param_val for param, param_val in
                         zip(parameter_names, cur_param_values)}
                        for cur_param_values in parameter_it]
        name_list = [self.name_func(cp) for cp in param_d_list]
        return name_list
    def validate_name_func(self):
        '''
        validate that the names will be unique

        Returns
        -------
        unique_names : bool
            True if all names are unique
        '''
        # get names
        name_list = self.get_name_list()
        # compare list length to set length to check uniqueness
        return len(name_list) == len(set(name_list))
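
    # Illustrative sketch (continuing the hypothetical example): checking the
    # naming function before committing to a long batch.
    #   expt.get_name_list()        # e.g. ['a0.1_n10', 'a0.1_n100', ...]
    #   expt.validate_name_func()   # True only if every name is unique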
    def get_max_name_length(self):
        '''
        get the max length of all names
        '''
        name_list = self.get_name_list()
        name_lens = [len(n) for n in name_list]
        return max(name_lens)
    def restart_batch(self, full_path=None, base_path=None,
                      date_stamp_name=None, expt_name=None, verbose=False):
        '''
        restart a run; uses the default 'results' dir and the most recent run
        that matches the experiment_function if no path is named, or fills in
        the path from parts.

        Parameters
        ----------
        full_path : string or path
            use this to specify the full path to the results; specifying this
            overrides the other input parameters of this method
        date_stamp_name : string
            specify the particular previous run you want to restart
        base_path : string or path
            directory to search for previous runs
        expt_name : string
            name to filter on; uses experiment_function.__name__ if None
        '''
        # if the full path is not specified, build it
        if not(full_path):
            # use the default base if not specified
            if not(base_path):
                base_path = 'results'

            # infer the batch if not specified
            if not(date_stamp_name):
                # infer expt_name if not stated
                if not(expt_name):
                    expt_name = self.experiment_function.__name__

                runs = [run_dir for run_dir in os.listdir(base_path)
                        if expt_name in run_dir]
                date_stamp_name = sorted(runs)[-1]

            full_path = os.path.join(base_path, date_stamp_name)

        # add restart logging
        restart_log_path = os.path.join(full_path, 'restart_log.txt')
        with open(restart_log_path, 'a') as f:
            timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            f.write(f"Restarted at: {timestamp}\n")
            if expt_name:
                f.write(f"Experiment name: {expt_name}\n")
            f.write("-" * 40 + "\n")

        # call run_batch with expt_name set to the previous one and
        # datestamp_file set to False so that it uses the same directory
        return self.run_batch(expt_name=date_stamp_name, verbose=verbose,
                              datestamp_file=False, restart=True,
                              full_path=full_path)
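
    # Illustrative sketch (hypothetical paths and names): restarting the most
    # recent batch found under the default 'results' directory, or a specific
    # previous run.
    #   expt.restart_batch()
    #   expt.restart_batch(base_path='results',
    #                      date_stamp_name='fit_and_score2024_01_15_09_30_00_000000')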
    def run_batch(self, base_path='results', name_func=None, expt_name=None,
                  datestamp_file=True, nested=False, verbose=False,
                  restart=False, full_path=None):
        '''
        run a batch of experiments over the parameter grid and save all
        results in a structured file system with naming conventions that
        protect from overwriting

        Parameters
        ----------
        base_path : path or string
            base path to create subfolders in for this set of experiments
        name_func : function or None
            a function that creates a file name from parameter values
        expt_name : string, default None
            meaningful name for the batch to prepend to the date; if None,
            uses the `__name__` attribute of the experiment function
        datestamp_file : boolean, default True
            flag to datestamp or not, typically should not be changed
        nested : boolean, default False
            flag to treat parameters as nested
        verbose : boolean, default False
            flag to display debugging output
        restart : boolean, default False
            whether this is a restart
        full_path : path or string, default None
            used for restart; generally should not be set directly

        Returns
        -------
        date_stamp_name : string
            name of the batch directory that was created or reused
        success_list : list of string
            experiment names that succeeded
        failed_list : list of string
            experiment names that threw an exception
        '''
        if not(nested):
            # all-by-all grid
            parameter_names = self.parameter_grid.keys()
            # create the iterator
            parameter_it = itertools.product(*self.parameter_grid.values())

        # use the naming function if provided
        if name_func:
            self.name_func = name_func

        # use the function name if not provided
        if not(expt_name):
            expt_name = self.experiment_function.__name__

        # create a datestamp for the whole batch
        if datestamp_file:
            date_stamp_name = expt_name + datetime.today().strftime('%Y_%m_%d_%H_%M_%S_%f')
        else:
            # used in restart
            date_stamp_name = expt_name

        # create the full path for the whole batch
        if not(full_path):
            full_path = os.path.join(base_path, date_stamp_name)

        # create the directory unless restarting
        if not(restart):
            os.mkdir(full_path)
            with open(os.path.join(full_path, 'dependency_versions.txt'), 'w') as f:
                subprocess.run(['pip', 'freeze'], stdout=f)

        # run experiments & save results
        success_list = []
        failed_list = []
        for cur_param_values in parameter_it:
            # create the function input
            cur_params = {param: param_val for param, param_val in
                          zip(parameter_names, cur_param_values)}
            text_params_in = cur_params.copy()

            # if an ExperimentResult was passed, process the namedtuples
            if self.expt_result_params:
                info_df_list = []
                for param in self.expt_result_params:
                    # original values, except for expt results
                    text_params_in[param] = cur_params[param].name
                    info_df_list.append(cur_params[param].info)
                    cur_params[param] = cur_params[param].dataframe

            # store values as strings before making meta_df
            text_params = {param: self.parameter_string[param](val)
                           for param, val in text_params_in.items()}
            meta_df = pd.Series(text_params)

            # append info_dfs from ExperimentResults if applicable
            if self.expt_result_params:
                meta_df = pd.concat([meta_df] + info_df_list)

            # create a name from the current parameters
            name = self.name_func(text_params_in)
            meta_df['dir_name'] = name

            # create the current result dir name
            cur_result_dir = os.path.join(full_path, name)

            # if restarting and the directory exists, continue to the next case
            if restart and os.path.isdir(cur_result_dir):
                # count it as successful
                success_list.append(name)
                # do not rerun, go to the next loop iteration
                continue

            # make the directory if not restarting, or restarting but it
            # does not exist yet
            try:
                os.mkdir(cur_result_dir)
            except FileExistsError:
                print("the directory", name, "exists already; try using the"
                      " `get_name_list` method to figure out how the"
                      " name_func is not generating unique names")
                raise

            # run the experiment and save
            try:
                result_df = self.experiment_function(**cur_params)
                # save
                self.save_result(result_df, meta_df, name, full_path)
                success_list.append(name)
            except Exception as err:
                self.save_failed(meta_df, name, full_path, traceback.format_exc())
                failed_list.append(name)

        return date_stamp_name, success_list, failed_list
    # the static methods are helper functions that do not need access to self
    @staticmethod
    def param_string(parameter_value):
        '''
        return a string based on the type
        '''
        if callable(parameter_value):
            return parameter_value.__name__
        else:
            return str(parameter_value)
    @staticmethod
    def save_failed(meta_df, name, path, err):
        '''
        save metadata only when an experiment fails
        '''
        dir_name = os.path.join(path, name)
        meta_df.to_csv(os.path.join(dir_name, 'failed.csv'), header=False)
        with open(os.path.join(dir_name, 'log.txt'), 'w') as f:
            f.write(str(err))
        return True
    @staticmethod
    def save_result(result_df, meta_df, name, path):
        '''
        save results with good naming conventions

        Parameters
        ----------
        result_df : DataFrame
            result returned by the experiment function
        meta_df : Series
            parameter metadata for this run
        name : string
            name of the subdirectory for this parameter combination
        path : string or path
            path to the batch directory
        '''
        dir_name = os.path.join(path, name)
        result_df.to_csv(os.path.join(dir_name, 'result.csv'), index=False)
        meta_df.to_csv(os.path.join(dir_name, 'info.csv'), header=False)
        return True