From ecf0a3c15a0a6f5c3a0ee5744c28dc3a64639396 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 13:44:23 -0500 Subject: [PATCH 01/13] initial attempt --- evaluation/__init__.py | 1 + evaluation/precision.py | 34 +++++++++++++++++++ examples/meta-models.py | 70 ++++++++++++++++++++++++++++++++++++++++ tests/__init__.py | 0 tests/test_evaluation.py | 14 ++++++++ 5 files changed, 119 insertions(+) create mode 100644 evaluation/__init__.py create mode 100644 evaluation/precision.py create mode 100644 examples/meta-models.py create mode 100644 tests/__init__.py create mode 100644 tests/test_evaluation.py diff --git a/evaluation/__init__.py b/evaluation/__init__.py new file mode 100644 index 0000000..bc3fde8 --- /dev/null +++ b/evaluation/__init__.py @@ -0,0 +1 @@ +from .precision import cross_validate_surrogate, precision_at_n diff --git a/evaluation/precision.py b/evaluation/precision.py new file mode 100644 index 0000000..60f9997 --- /dev/null +++ b/evaluation/precision.py @@ -0,0 +1,34 @@ +import logging +import numpy as np +import scipy.stats +import sklearn + + +def precision_at_n(y_real, y_hat, top_n): + y_hat_ranks = scipy.stats.rankdata(y_hat, method='average') + test_y_ranks = scipy.stats.rankdata(y_real, method='average') + y_hat_maxargs = y_hat_ranks.argsort() + test_y_maxargs = test_y_ranks.argsort() + cnt = 0 + for entry in y_hat_maxargs[:top_n]: + if entry in test_y_maxargs[:top_n]: + cnt += 1 + return cnt / top_n + + +def cross_validate_surrogate(model, X, y, n_folds, top_n): + kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=42, shuffle=True) + splits = kf.split(X) + + precision_scores_te = [] + precision_scores_tr = [] + for train_idx, test_idx in splits: + train_x, train_y = X[train_idx], y[train_idx] + test_x, test_y = X[test_idx], y[test_idx] + new_model = sklearn.base.clone(model) + new_model.fit(train_x, train_y) + y_hat_te = new_model.predict(test_x) + y_hat_tr = new_model.predict(train_x) + 
precision_scores_te.append(precision_at_n(test_y, y_hat_te, top_n)) + precision_scores_tr.append(precision_at_n(train_y, y_hat_tr, top_n)) + return np.mean(precision_scores_te), np.mean(precision_scores_tr) diff --git a/examples/meta-models.py b/examples/meta-models.py new file mode 100644 index 0000000..0dc4f66 --- /dev/null +++ b/examples/meta-models.py @@ -0,0 +1,70 @@ +import arff +import argparse +import logging +import matplotlib.pyplot as plt +import openmlcontrib +import pandas as pd +import seaborn as sns +import sklearn.linear_model +import sklearn.ensemble +import os + +import evaluation + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--performances_path', type=str, + default=os.path.expanduser('~') + '/projects/sklearn-bot/data/svc.arff') + parser.add_argument('--metafeatures_path', type=str, + default=os.path.expanduser('~') + '/projects/sklearn-bot/data/metafeatures.arff') + parser.add_argument('--output_directory', type=str, + default=os.path.expanduser('~') + '/experiments/meta-models') + args_ = parser.parse_args() + return args_ + + +def run(args): + root = logging.getLogger() + root.setLevel(logging.INFO) + + with open(args.performances_path, 'r') as fp: + arff_performances = arff.load(fp) + performances = openmlcontrib.meta.arff_to_dataframe(arff_performances, None) + with open(args.metafeatures_path, 'r') as fp: + arff_metafeatures = arff.load(fp) + metafeatures = openmlcontrib.meta.arff_to_dataframe(arff_metafeatures, None) + + results = [] + precision_at_n = 20 + cv_iterations = 5 + for idx, task_id in enumerate(performances['task_id'].unique()): + logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(performances['task_id'].unique()))) + frame_task = performances.loc[performances['task_id'] == task_id] + frame_other = performances.loc[performances['task_id'] != task_id] + + X = frame_task[['svc__gamma', 'svc__C']].values + y = frame_task['predictive_accuracy'].values + + poly_transform = 
sklearn.preprocessing.PolynomialFeatures(2) + gamma_complexity_poly = poly_transform.fit_transform(X)[1:] + + quadratic_model = sklearn.linear_model.LinearRegression() + score_te, score_tr = evaluation.cross_validate_surrogate(quadratic_model, gamma_complexity_poly, y, cv_iterations, precision_at_n) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_at_n: score_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train', precision_at_n: score_tr}) + + random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) + score_te, score_tr = evaluation.cross_validate_surrogate(random_forest_model, gamma_complexity_poly, y, cv_iterations, precision_at_n) + results.append({'task_id': task_id, 'strategy': 'rf_surrogate', 'set': 'test', precision_at_n: score_te}) + results.append({'task_id': task_id, 'strategy': 'rf_surrogate', 'set': 'train', precision_at_n: score_tr}) + result_frame = pd.DataFrame(results) + + os.makedirs(args.output_directory, exist_ok=True) + fig, ax = plt.subplots() + sns.boxplot(x="strategy", y=precision_at_n, hue="set", data=result_frame, ax=ax) + plt.savefig(os.path.join(args.output_directory, 'metamodels.png')) + + +if __name__ == '__main__': + run(parse_args()) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py new file mode 100644 index 0000000..0b3cc41 --- /dev/null +++ b/tests/test_evaluation.py @@ -0,0 +1,14 @@ +import evaluation +import unittest + + +class TestStringMethods(unittest.TestCase): + + def test_precision_at_n(self): + real = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + yhat = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] + exp = [0.0, 0.0, 0.0, 0.0, 0.0, 2.0/6.0, 4.0/7.0, 6.0/8.0, 8.0/9.0, 1.0] + + for i in range(len(real)): + result = evaluation.precision_at_n(real, yhat, i+1) + assert exp[i] == result From 
dc1c23df204dada815c51dcba045fb9ac36847d8 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:15:04 -0500 Subject: [PATCH 02/13] incorporated all models --- evaluation/__init__.py | 2 +- evaluation/precision.py | 56 +++++++++++++++------ examples/meta-models.py | 106 ++++++++++++++++++++++++++++++++++------ 3 files changed, 131 insertions(+), 33 deletions(-) diff --git a/evaluation/__init__.py b/evaluation/__init__.py index bc3fde8..b59bf48 100644 --- a/evaluation/__init__.py +++ b/evaluation/__init__.py @@ -1 +1 @@ -from .precision import cross_validate_surrogate, precision_at_n +from .precision import cross_validate_surrogate, evaluate_fold, precision_at_n diff --git a/evaluation/precision.py b/evaluation/precision.py index 60f9997..91ae641 100644 --- a/evaluation/precision.py +++ b/evaluation/precision.py @@ -1,4 +1,3 @@ -import logging import numpy as np import scipy.stats import sklearn @@ -16,19 +15,44 @@ def precision_at_n(y_real, y_hat, top_n): return cnt / top_n -def cross_validate_surrogate(model, X, y, n_folds, top_n): +def evaluate_fold(model: sklearn.base.RegressorMixin, X_tr: np.ndarray, + y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, + top_n: int, use_k: int): + new_model = sklearn.base.clone(model) + new_model.fit(X_tr, y_tr) + experiments = { + 'tr': (X_tr, y_tr), + 'te': (X_te, y_te), + } + + precision_score = dict() + spearman_score = dict() + for exp_type, (X, y) in experiments.items(): + y_hat = new_model.predict(X) + rand_indices = np.random.randint(len(X), size=use_k) + precision_score[exp_type] = precision_at_n(y[rand_indices], y_hat[rand_indices], top_n) + spearman_score[exp_type] = scipy.stats.pearsonr(y[rand_indices], y_hat[rand_indices])[0] + return precision_score['te'], precision_score['tr'], spearman_score['te'], spearman_score['tr'] + + +def cross_validate_surrogate(model, data, targets, n_folds, top_n, use_k): kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=42, shuffle=True) - splits = 
kf.split(X) - - precision_scores_te = [] - precision_scores_tr = [] - for train_idx, test_idx in splits: - train_x, train_y = X[train_idx], y[train_idx] - test_x, test_y = X[test_idx], y[test_idx] - new_model = sklearn.base.clone(model) - new_model.fit(train_x, train_y) - y_hat_te = new_model.predict(test_x) - y_hat_tr = new_model.predict(train_x) - precision_scores_te.append(precision_at_n(test_y, y_hat_te, top_n)) - precision_scores_tr.append(precision_at_n(train_y, y_hat_tr, top_n)) - return np.mean(precision_scores_te), np.mean(precision_scores_tr) + splits = kf.split(data) + + precision_scores_te = list() + precision_scores_tr = list() + spearman_scores_te = list() + spearman_scores_tr = list() + for tr_idx, te_idx in splits: + X_tr, y_tr = data[tr_idx], targets[tr_idx] + X_te, y_te = data[te_idx], targets[te_idx] + prec_te, prec_tr, spearm_te, spearm_tr = evaluate_fold(model, X_tr, y_tr, X_te, y_te, top_n, use_k) + precision_scores_te.append(prec_te) + precision_scores_tr.append(prec_tr) + spearman_scores_te.append(spearm_te) + spearman_scores_tr.append(spearm_tr) + + return np.mean(precision_scores_te), \ + np.mean(precision_scores_tr), \ + np.mean(spearman_scores_te), \ + np.mean(spearman_scores_tr) diff --git a/examples/meta-models.py b/examples/meta-models.py index 0dc4f66..6f7a2ca 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -2,6 +2,7 @@ import argparse import logging import matplotlib.pyplot as plt +import numpy as np import openmlcontrib import pandas as pd import seaborn as sns @@ -33,37 +34,110 @@ def run(args): performances = openmlcontrib.meta.arff_to_dataframe(arff_performances, None) with open(args.metafeatures_path, 'r') as fp: arff_metafeatures = arff.load(fp) - metafeatures = openmlcontrib.meta.arff_to_dataframe(arff_metafeatures, None) + # impute missing meta-features with -1 value + metafeatures = openmlcontrib.meta.arff_to_dataframe(arff_metafeatures, None).set_index('task_id').fillna(-1) + # remove all non-rbf rows 
+ performances = performances.loc[performances['svc__kernel'] == 'rbf'] + # join with meta-features frame, and remove tasks without meta-features + performances = performances.join(metafeatures, on='task_id', how='inner') results = [] precision_at_n = 20 + precision_out_of_k = 100 + precision_name = 'precision_at_%d_out_%d' % (precision_at_n, precision_out_of_k) + spearman_name = 'spearmanr_%d' % precision_out_of_k cv_iterations = 5 + + # sklearn objects + quadratic_model = sklearn.linear_model.LinearRegression() + random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) + poly_transform = sklearn.preprocessing.PolynomialFeatures(2) + for idx, task_id in enumerate(performances['task_id'].unique()): logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(performances['task_id'].unique()))) frame_task = performances.loc[performances['task_id'] == task_id] - frame_other = performances.loc[performances['task_id'] != task_id] + frame_others = performances.loc[performances['task_id'] != task_id] + assert(frame_task.shape[0] > 100) + + # some convenience datasets + param_columns = ['svc__gamma', 'svc__C'] + X_poly_train = poly_transform.fit_transform(frame_others[param_columns].values)[:, 1:] + X_poly_test = poly_transform.fit_transform(frame_task[param_columns].values)[:, 1:] + X_poly_meta_train = np.concatenate((X_poly_train, frame_others[metafeatures.columns.values]), axis=1) + X_poly_meta_test = np.concatenate((X_poly_test, frame_task[metafeatures.columns.values]), axis=1) + + # surrogates + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(quadratic_model, + X_poly_test, + frame_task['predictive_accuracy'].values, + cv_iterations, + precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train-obs', 
precision_name: prec_tr, spearman_name: spearm_tr}) - X = frame_task[['svc__gamma', 'svc__C']].values - y = frame_task['predictive_accuracy'].values + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(random_forest_model, + frame_task[param_columns].values, + frame_task['predictive_accuracy'].values, + cv_iterations, + precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) - poly_transform = sklearn.preprocessing.PolynomialFeatures(2) - gamma_complexity_poly = poly_transform.fit_transform(X)[1:] + # aggregates + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, + X_poly_train, + frame_others['predictive_accuracy'].values, + X_poly_test, + frame_task['predictive_accuracy'].values, + precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) - quadratic_model = sklearn.linear_model.LinearRegression() - score_te, score_tr = evaluation.cross_validate_surrogate(quadratic_model, gamma_complexity_poly, y, cv_iterations, precision_at_n) - results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_at_n: score_te}) - results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train', precision_at_n: score_tr}) + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model, + frame_others[param_columns], + frame_others['predictive_accuracy'].values, + frame_task[param_columns].values, + frame_task['predictive_accuracy'].values, 
+ precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + + # meta-models + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, + X_poly_meta_train, + frame_others['predictive_accuracy'].values, + X_poly_meta_test, + frame_task['predictive_accuracy'].values, + precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + + columns = list(param_columns) + list(metafeatures.columns.values) + prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model, + frame_others[columns], + frame_others['predictive_accuracy'].values, + frame_task[columns].values, + frame_task['predictive_accuracy'].values, + precision_at_n, + precision_out_of_k) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) - random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) - score_te, score_tr = evaluation.cross_validate_surrogate(random_forest_model, gamma_complexity_poly, y, cv_iterations, precision_at_n) - results.append({'task_id': task_id, 'strategy': 'rf_surrogate', 'set': 'test', precision_at_n: score_te}) - results.append({'task_id': task_id, 'strategy': 'rf_surrogate', 'set': 'train', precision_at_n: score_tr}) result_frame = pd.DataFrame(results) os.makedirs(args.output_directory, 
exist_ok=True) fig, ax = plt.subplots() - sns.boxplot(x="strategy", y=precision_at_n, hue="set", data=result_frame, ax=ax) - plt.savefig(os.path.join(args.output_directory, 'metamodels.png')) + sns.boxplot(x="strategy", y=precision_name, hue="set", data=result_frame, ax=ax) + plt.savefig(os.path.join(args.output_directory, '%s.png' % precision_name)) + + fig, ax = plt.subplots() + sns.boxplot(x="strategy", y=spearman_name, hue="set", data=result_frame, ax=ax) + plt.savefig(os.path.join(args.output_directory, '%s.png' % spearman_name)) if __name__ == '__main__': From 7e6e08297492586725bfbe2ce7b67254905faa7f Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:20:38 -0500 Subject: [PATCH 03/13] small updates --- evaluation/precision.py | 5 +++-- tests/test_evaluation.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/evaluation/precision.py b/evaluation/precision.py index 91ae641..489d8ee 100644 --- a/evaluation/precision.py +++ b/evaluation/precision.py @@ -3,7 +3,7 @@ import sklearn -def precision_at_n(y_real, y_hat, top_n): +def precision_at_n(y_real: np.ndarray, y_hat: np.ndarray, top_n: int): y_hat_ranks = scipy.stats.rankdata(y_hat, method='average') test_y_ranks = scipy.stats.rankdata(y_real, method='average') y_hat_maxargs = y_hat_ranks.argsort() @@ -35,7 +35,8 @@ def evaluate_fold(model: sklearn.base.RegressorMixin, X_tr: np.ndarray, return precision_score['te'], precision_score['tr'], spearman_score['te'], spearman_score['tr'] -def cross_validate_surrogate(model, data, targets, n_folds, top_n, use_k): +def cross_validate_surrogate(model: sklearn.base.RegressorMixin, data: np.ndarray, + targets: np.ndarray, n_folds: int, top_n: int, use_k: int): kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=42, shuffle=True) splits = kf.split(data) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 0b3cc41..bf7b3e3 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -2,7 +2,7 @@ 
import unittest -class TestStringMethods(unittest.TestCase): +class TestEvaluationMethods(unittest.TestCase): def test_precision_at_n(self): real = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] From 833b0a83145b174ce21436cc4d93dfa2ff0f73db Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:25:12 -0500 Subject: [PATCH 04/13] small updates --- examples/meta-models.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index 6f7a2ca..e640366 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -21,6 +21,7 @@ def parse_args(): default=os.path.expanduser('~') + '/projects/sklearn-bot/data/metafeatures.arff') parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~') + '/experiments/meta-models') + parser.add_argument('--task_limit', type=int, default=None, help='For fast testing') args_ = parser.parse_args() return args_ @@ -53,8 +54,11 @@ def run(args): random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) poly_transform = sklearn.preprocessing.PolynomialFeatures(2) - for idx, task_id in enumerate(performances['task_id'].unique()): - logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(performances['task_id'].unique()))) + all_tasks = performances['task_id'].unique() + if args.task_limit is not None: + all_tasks = all_tasks[:args.task_limit] + for idx, task_id in enumerate(all_tasks): + logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(all_tasks))) frame_task = performances.loc[performances['task_id'] == task_id] frame_others = performances.loc[performances['task_id'] != task_id] assert(frame_task.shape[0] > 100) @@ -114,8 +118,8 @@ def run(args): frame_task['predictive_accuracy'].values, precision_at_n, precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - 
results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) columns = list(param_columns) + list(metafeatures.columns.values) prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model, @@ -125,8 +129,8 @@ def run(args): frame_task['predictive_accuracy'].values, precision_at_n, precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) result_frame = pd.DataFrame(results) From ccaff71fa04fcff3059ca4e9d9882d558b563dc7 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:29:05 -0500 Subject: [PATCH 05/13] plot detail --- examples/meta-models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index e640366..fe5ae39 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -135,7 +135,7 @@ def run(args): result_frame = pd.DataFrame(results) os.makedirs(args.output_directory, exist_ok=True) - fig, ax = plt.subplots() + fig, ax = plt.subplots(figsize=(16, 6)) sns.boxplot(x="strategy", y=precision_name, hue="set", data=result_frame, ax=ax) plt.savefig(os.path.join(args.output_directory, '%s.png' % 
precision_name)) From 0db147d71c45ba26ca649702ae62c2520baf59af Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:44:49 -0500 Subject: [PATCH 06/13] parameterized poly degree --- examples/meta-models.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index fe5ae39..a7148ab 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -21,6 +21,7 @@ def parse_args(): default=os.path.expanduser('~') + '/projects/sklearn-bot/data/metafeatures.arff') parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~') + '/experiments/meta-models') + parser.add_argument('--poly_degree', type=int, default=2) parser.add_argument('--task_limit', type=int, default=None, help='For fast testing') args_ = parser.parse_args() return args_ @@ -52,7 +53,7 @@ def run(args): # sklearn objects quadratic_model = sklearn.linear_model.LinearRegression() random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) - poly_transform = sklearn.preprocessing.PolynomialFeatures(2) + poly_transform = sklearn.preprocessing.PolynomialFeatures(args.poly_degree) all_tasks = performances['task_id'].unique() if args.task_limit is not None: @@ -139,7 +140,7 @@ def run(args): sns.boxplot(x="strategy", y=precision_name, hue="set", data=result_frame, ax=ax) plt.savefig(os.path.join(args.output_directory, '%s.png' % precision_name)) - fig, ax = plt.subplots() + fig, ax = plt.subplots(figsize=(16, 6)) sns.boxplot(x="strategy", y=spearman_name, hue="set", data=result_frame, ax=ax) plt.savefig(os.path.join(args.output_directory, '%s.png' % spearman_name)) From 628b88e954629354be6dcb64a376f929f6d8e142 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:50:23 -0500 Subject: [PATCH 07/13] parameterized all other important stuff --- examples/meta-models.py | 42 +++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git 
a/examples/meta-models.py b/examples/meta-models.py index a7148ab..52f8f1c 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -22,6 +22,11 @@ def parse_args(): parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~') + '/experiments/meta-models') parser.add_argument('--poly_degree', type=int, default=2) + parser.add_argument('--precision_at_n', type=int, default=20) + parser.add_argument('--precision_out_of_k', type=int, default=100) + parser.add_argument('--cv_iterations', type=int, default=5) + parser.add_argument('--n_estimators', type=int, default=16) + parser.add_argument('--random_seed', type=int, default=42) parser.add_argument('--task_limit', type=int, default=None, help='For fast testing') args_ = parser.parse_args() return args_ @@ -44,15 +49,12 @@ def run(args): performances = performances.join(metafeatures, on='task_id', how='inner') results = [] - precision_at_n = 20 - precision_out_of_k = 100 - precision_name = 'precision_at_%d_out_%d' % (precision_at_n, precision_out_of_k) - spearman_name = 'spearmanr_%d' % precision_out_of_k - cv_iterations = 5 + precision_name = 'precision_at_%d_out_%d' % (args.precision_at_n, args.precision_out_of_k) + spearman_name = 'spearmanr_%d' % args.precision_out_of_k # sklearn objects quadratic_model = sklearn.linear_model.LinearRegression() - random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=16) + random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=args.n_estimators, random_state=args.random_seed) poly_transform = sklearn.preprocessing.PolynomialFeatures(args.poly_degree) all_tasks = performances['task_id'].unique() @@ -75,18 +77,18 @@ def run(args): prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(quadratic_model, X_poly_test, frame_task['predictive_accuracy'].values, - cv_iterations, - precision_at_n, - precision_out_of_k) + args.cv_iterations, + args.precision_at_n, + args.precision_out_of_k) 
results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(random_forest_model, frame_task[param_columns].values, frame_task['predictive_accuracy'].values, - cv_iterations, - precision_at_n, - precision_out_of_k) + args.cv_iterations, + args.precision_at_n, + args.precision_out_of_k) results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) @@ -96,8 +98,8 @@ def run(args): frame_others['predictive_accuracy'].values, X_poly_test, frame_task['predictive_accuracy'].values, - precision_at_n, - precision_out_of_k) + args.precision_at_n, + args.precision_out_of_k) results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) @@ -106,8 +108,8 @@ def run(args): frame_others['predictive_accuracy'].values, frame_task[param_columns].values, frame_task['predictive_accuracy'].values, - precision_at_n, - precision_out_of_k) + args.precision_at_n, + args.precision_out_of_k) results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) @@ -117,8 +119,8 @@ def run(args): frame_others['predictive_accuracy'].values, X_poly_meta_test, 
frame_task['predictive_accuracy'].values, - precision_at_n, - precision_out_of_k) + args.precision_at_n, + args.precision_out_of_k) results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) @@ -128,8 +130,8 @@ def run(args): frame_others['predictive_accuracy'].values, frame_task[columns].values, frame_task['predictive_accuracy'].values, - precision_at_n, - precision_out_of_k) + args.precision_at_n, + args.precision_out_of_k) results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) From bfc4249f2a7e9a4c5a9a4346b9d842d7c82bde6a Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Thu, 13 Dec 2018 18:57:08 -0500 Subject: [PATCH 08/13] style --- tests/test_evaluation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index bf7b3e3..3447566 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,13 +1,14 @@ import evaluation +import numpy as np import unittest class TestEvaluationMethods(unittest.TestCase): def test_precision_at_n(self): - real = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] - yhat = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] - exp = [0.0, 0.0, 0.0, 0.0, 0.0, 2.0/6.0, 4.0/7.0, 6.0/8.0, 8.0/9.0, 1.0] + real = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]) + yhat = np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]) + exp = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 2.0/6.0, 4.0/7.0, 6.0/8.0, 8.0/9.0, 1.0]) for i in range(len(real)): result = evaluation.precision_at_n(real, yhat, i+1) From dd59706d8698e2e330a8afff8e2eb4f09e5c302b 
Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 14 Dec 2018 15:11:52 -0500 Subject: [PATCH 09/13] added meta-model for coefficients --- evaluation/precision.py | 17 ++++----- examples/meta-models.py | 69 ++++++++++++++++++++++++----------- quadratic/__init__.py | 1 + quadratic/quadratic.py | 80 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 29 deletions(-) create mode 100644 quadratic/__init__.py create mode 100644 quadratic/quadratic.py diff --git a/evaluation/precision.py b/evaluation/precision.py index 489d8ee..f48e273 100644 --- a/evaluation/precision.py +++ b/evaluation/precision.py @@ -1,9 +1,10 @@ import numpy as np import scipy.stats import sklearn +import typing -def precision_at_n(y_real: np.ndarray, y_hat: np.ndarray, top_n: int): +def precision_at_n(y_real: np.ndarray, y_hat: np.ndarray, top_n: int) -> float: y_hat_ranks = scipy.stats.rankdata(y_hat, method='average') test_y_ranks = scipy.stats.rankdata(y_real, method='average') y_hat_maxargs = y_hat_ranks.argsort() @@ -17,14 +18,13 @@ def precision_at_n(y_real: np.ndarray, y_hat: np.ndarray, top_n: int): def evaluate_fold(model: sklearn.base.RegressorMixin, X_tr: np.ndarray, y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray, - top_n: int, use_k: int): + top_n: int, use_k: int) -> typing.Tuple[float, float, float, float]: new_model = sklearn.base.clone(model) new_model.fit(X_tr, y_tr) experiments = { 'tr': (X_tr, y_tr), 'te': (X_te, y_te), } - precision_score = dict() spearman_score = dict() for exp_type, (X, y) in experiments.items(): @@ -36,7 +36,8 @@ def evaluate_fold(model: sklearn.base.RegressorMixin, X_tr: np.ndarray, def cross_validate_surrogate(model: sklearn.base.RegressorMixin, data: np.ndarray, - targets: np.ndarray, n_folds: int, top_n: int, use_k: int): + targets: np.ndarray, n_folds: int, top_n: int, + use_k: int) -> typing.Tuple[float, float, float, float]: kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=42, shuffle=True) splits = 
kf.split(data) @@ -53,7 +54,7 @@ def cross_validate_surrogate(model: sklearn.base.RegressorMixin, data: np.ndarra spearman_scores_te.append(spearm_te) spearman_scores_tr.append(spearm_tr) - return np.mean(precision_scores_te), \ - np.mean(precision_scores_tr), \ - np.mean(spearman_scores_te), \ - np.mean(spearman_scores_tr) + return float(np.mean(precision_scores_te)), \ + float(np.mean(precision_scores_tr)), \ + float(np.mean(spearman_scores_te)), \ + float(np.mean(spearman_scores_tr)) diff --git a/examples/meta-models.py b/examples/meta-models.py index 52f8f1c..65cfaed 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -5,12 +5,13 @@ import numpy as np import openmlcontrib import pandas as pd +import scipy.stats import seaborn as sns import sklearn.linear_model import sklearn.ensemble import os -import evaluation +import evaluation, quadratic def parse_args(): @@ -35,7 +36,14 @@ def parse_args(): def run(args): root = logging.getLogger() root.setLevel(logging.INFO) + logging.info('Started meta-models.py') + # some naming declarations + precision_name = 'precision_at_%d_out_%d' % (args.precision_at_n, args.precision_out_of_k) + spearman_name = 'spearmanr_%d' % args.precision_out_of_k + param_columns = ['svc__gamma', 'svc__C'] + + # data loading and management with open(args.performances_path, 'r') as fp: arff_performances = arff.load(fp) performances = openmlcontrib.meta.arff_to_dataframe(arff_performances, None) @@ -47,19 +55,27 @@ def run(args): performances = performances.loc[performances['svc__kernel'] == 'rbf'] # join with meta-features frame, and remove tasks without meta-features performances = performances.join(metafeatures, on='task_id', how='inner') - - results = [] - precision_name = 'precision_at_%d_out_%d' % (args.precision_at_n, args.precision_out_of_k) - spearman_name = 'spearmanr_%d' % args.precision_out_of_k + # coefficients data + coefficients_data = quadratic.generate_coefficients_data(args.poly_degree, performances, 
param_columns).join(metafeatures, how='inner') + logging.info('Generated all datasets') # sklearn objects - quadratic_model = sklearn.linear_model.LinearRegression() - random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=args.n_estimators, random_state=args.random_seed) + quadratic_model = sklearn.linear_model.LinearRegression(fit_intercept=False) + random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=args.n_estimators, + random_state=args.random_seed) + random_forest_coef = quadratic.MetaRandomForestQuadratic(n_estimators=args.n_estimators, + random_seed=args.random_seed, + meta_columns=list(metafeatures.columns.values), + base_columns=param_columns, + poly_degree=args.poly_degree) poly_transform = sklearn.preprocessing.PolynomialFeatures(args.poly_degree) + # determine relevant tasks all_tasks = performances['task_id'].unique() if args.task_limit is not None: all_tasks = all_tasks[:args.task_limit] + + results = [] for idx, task_id in enumerate(all_tasks): logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(all_tasks))) frame_task = performances.loc[performances['task_id'] == task_id] @@ -67,11 +83,10 @@ def run(args): assert(frame_task.shape[0] > 100) # some convenience datasets - param_columns = ['svc__gamma', 'svc__C'] - X_poly_train = poly_transform.fit_transform(frame_others[param_columns].values)[:, 1:] - X_poly_test = poly_transform.fit_transform(frame_task[param_columns].values)[:, 1:] - X_poly_meta_train = np.concatenate((X_poly_train, frame_others[metafeatures.columns.values]), axis=1) - X_poly_meta_test = np.concatenate((X_poly_test, frame_task[metafeatures.columns.values]), axis=1) + X_poly_train = poly_transform.fit_transform(frame_others[param_columns].values) + X_poly_test = poly_transform.fit_transform(frame_task[param_columns].values) + # X_poly_meta_train = np.concatenate((X_poly_train, frame_others[metafeatures.columns.values]), axis=1) + # X_poly_meta_test = np.concatenate((X_poly_test, 
frame_task[metafeatures.columns.values]), axis=1) # surrogates prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(quadratic_model, @@ -114,15 +129,15 @@ def run(args): results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) # meta-models - prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, - X_poly_meta_train, - frame_others['predictive_accuracy'].values, - X_poly_meta_test, - frame_task['predictive_accuracy'].values, - args.precision_at_n, - args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + # prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, + # X_poly_meta_train, + # frame_others['predictive_accuracy'].values, + # X_poly_meta_test, + # frame_task['predictive_accuracy'].values, + # args.precision_at_n, + # args.precision_out_of_k) + # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) columns = list(param_columns) + list(metafeatures.columns.values) prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model, @@ -135,9 +150,21 @@ def run(args): results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + # special case: random forest that predicts coefficients of base task + 
random_forest_coef.fit(coefficients_data[metafeatures.columns.values].values, + coefficients_data[quadratic.get_coefficient_names()].values) + # note that this code is an almost duplicate from the precision module. + y_hat = random_forest_coef.predict(frame_task) + rand_indices = np.random.randint(len(frame_task), size=args.precision_out_of_k) + prec_te = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices], y_hat[rand_indices], args.precision_at_n) + spearm_te = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices], y_hat[rand_indices])[0] + results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + result_frame = pd.DataFrame(results) os.makedirs(args.output_directory, exist_ok=True) + result_frame.to_csv(os.path.join(args.output_directory, 'results.csv')) + fig, ax = plt.subplots(figsize=(16, 6)) sns.boxplot(x="strategy", y=precision_name, hue="set", data=result_frame, ax=ax) plt.savefig(os.path.join(args.output_directory, '%s.png' % precision_name)) diff --git a/quadratic/__init__.py b/quadratic/__init__.py new file mode 100644 index 0000000..84b9bc4 --- /dev/null +++ b/quadratic/__init__.py @@ -0,0 +1 @@ +from .quadratic import MetaRandomForestQuadratic, generate_coefficients_data, get_coefficient_names diff --git a/quadratic/quadratic.py b/quadratic/quadratic.py new file mode 100644 index 0000000..a6d80e6 --- /dev/null +++ b/quadratic/quadratic.py @@ -0,0 +1,80 @@ +import logging +import numpy as np +import pandas as pd +import sklearn.base +import sklearn.linear_model +import sklearn.preprocessing +import typing + + +class MetaRandomForestQuadratic(sklearn.base.RegressorMixin): + + def __init__(self, n_estimators: int, random_seed: int, + meta_columns: typing.List, base_columns: typing.List, poly_degree: int): + if poly_degree != 2: + logging.warning('Polynomial degree of 2 assumed. 
') + self.n_estimators = n_estimators + self.random_seed = random_seed + self.meta_columns = meta_columns + self.base_columns = base_columns + self.poly_degree = 2 + self.feat_trans = sklearn.preprocessing.PolynomialFeatures(self.poly_degree) + self.meta_model = sklearn.ensemble.RandomForestRegressor(n_estimators=self.n_estimators, + random_state=self.random_seed) + + def fit(self, X: np.ndarray, y: np.ndarray): + self.meta_model.fit(X, y) + + def predict(self, X: pd.DataFrame) -> np.ndarray: + """ + Returns a 1D numpy array + """ + predictions = [] + for idx, row in X.iterrows(): + base_model = sklearn.linear_model.LinearRegression(fit_intercept=False) + base_model.intercept_ = 0 + base_model.coef_ = self.meta_model.predict([row[self.meta_columns]])[0] + input = self.feat_trans.fit_transform([row[self.base_columns]]) + prediction = base_model.predict(input)[0] + predictions.append(prediction) + res = np.array(predictions) + return res + + +def get_coefficient_names() -> typing.List: + return [ + 'intercept', + 'coef_gamma', + 'coef_C', + 'coef_gamma_sq', + 'coef_gamma_C', + 'coef_C_sq', + ] + + +def generate_coefficients_data(poly_degree: int, performance_data: pd.DataFrame, param_columns: typing.List) -> pd.DataFrame: + """ + Pre-processes the coefficients for all datasets at once (for speed) + """ + if poly_degree != 2: + logging.warning('Not Implemented: polynomial degree of > 2. 
Will use degree 2 for meta-model') + coef_names = get_coefficient_names() + results = [] + for idx, task_id in enumerate(performance_data['task_id'].unique()): + frame_task = performance_data.loc[performance_data['task_id'] == task_id] + model = sklearn.linear_model.LinearRegression(fit_intercept=False) + poly_feat = sklearn.preprocessing.PolynomialFeatures(2) + X = poly_feat.fit_transform(frame_task[param_columns]) + y = frame_task['predictive_accuracy'] + model.fit(X, y) + result = { + 'task_id': task_id, + coef_names[0]: model.coef_[0], + coef_names[1]: model.coef_[1], + coef_names[2]: model.coef_[2], + coef_names[3]: model.coef_[3], + coef_names[4]: model.coef_[4], + coef_names[5]: model.coef_[5], + } + results.append(result) + return pd.DataFrame(results).set_index('task_id') From f4cbb56b9bc3301dda8fb54089b19dba16c21d48 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 14 Dec 2018 15:22:27 -0500 Subject: [PATCH 10/13] saves coefficients --- examples/meta-models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/meta-models.py b/examples/meta-models.py index 65cfaed..cf0dec4 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -57,6 +57,7 @@ def run(args): performances = performances.join(metafeatures, on='task_id', how='inner') # coefficients data coefficients_data = quadratic.generate_coefficients_data(args.poly_degree, performances, param_columns).join(metafeatures, how='inner') + coefficients_data.to_csv(os.path.join(args.output_directory, 'coefficients.csv')) logging.info('Generated all datasets') # sklearn objects From 02097e5496e2fd23372bf9e72087d58b7d3062f2 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 14 Dec 2018 16:05:14 -0500 Subject: [PATCH 11/13] added train set for RF meta coef, improved train perf --- examples/meta-models.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index cf0dec4..652210e 100644 --- 
a/examples/meta-models.py +++ b/examples/meta-models.py @@ -81,6 +81,7 @@ def run(args): logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(all_tasks))) frame_task = performances.loc[performances['task_id'] == task_id] frame_others = performances.loc[performances['task_id'] != task_id] + coefficients_train_frame = coefficients_data.drop([task_id]) assert(frame_task.shape[0] > 100) # some convenience datasets @@ -96,8 +97,8 @@ def run(args): args.cv_iterations, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(random_forest_model, frame_task[param_columns].values, @@ -105,8 +106,8 @@ def run(args): args.cv_iterations, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) # aggregates prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, @@ -152,14 +153,20 @@ def run(args): results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) # special case: random forest that predicts coefficients of base task - random_forest_coef.fit(coefficients_data[metafeatures.columns.values].values, 
- coefficients_data[quadratic.get_coefficient_names()].values) - # note that this code is an almost duplicate from the precision module. - y_hat = random_forest_coef.predict(frame_task) - rand_indices = np.random.randint(len(frame_task), size=args.precision_out_of_k) - prec_te = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices], y_hat[rand_indices], args.precision_at_n) - spearm_te = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices], y_hat[rand_indices])[0] + random_forest_coef.fit(coefficients_train_frame[metafeatures.columns.values].values, + coefficients_train_frame[quadratic.get_coefficient_names()].values) + # note that this code is an almost duplicate from the precision module (TODO: refactor) + y_hat_te = random_forest_coef.predict(frame_task) + rand_indices_te = np.random.randint(len(frame_task), size=args.precision_out_of_k) + prec_te = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_te],y_hat_te[rand_indices_te], args.precision_at_n) + spearm_te = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_te], y_hat_te[rand_indices_te])[0] results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + # again, duplicate (TODO: refactor) + y_hat_tr = random_forest_coef.predict(frame_task) + rand_indices_tr = np.random.randint(len(frame_task), size=args.precision_out_of_k) + prec_tr = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr], args.precision_at_n) + spearm_tr = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr])[0] + results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'train-task', precision_name: prec_tr, spearman_name: spearm_tr}) result_frame = pd.DataFrame(results) From fe70369fbd3f2931ca2202b9de2dd446e79144af Mon Sep 17 00:00:00 2001 
From: janvanrijn Date: Fri, 14 Dec 2018 16:29:05 -0500 Subject: [PATCH 12/13] typo fix --- examples/meta-models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index 652210e..d997427 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -166,7 +166,7 @@ def run(args): rand_indices_tr = np.random.randint(len(frame_task), size=args.precision_out_of_k) prec_tr = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr], args.precision_at_n) spearm_tr = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr])[0] - results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'train-task', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) result_frame = pd.DataFrame(results) From 5dff2de672a3a54b688612cc534cf73249691c61 Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 14 Dec 2018 16:58:11 -0500 Subject: [PATCH 13/13] improved order of plot --- examples/meta-models.py | 49 ++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/examples/meta-models.py b/examples/meta-models.py index d997427..2ec280b 100644 --- a/examples/meta-models.py +++ b/examples/meta-models.py @@ -90,26 +90,35 @@ def run(args): # X_poly_meta_train = np.concatenate((X_poly_train, frame_others[metafeatures.columns.values]), axis=1) # X_poly_meta_test = np.concatenate((X_poly_test, frame_task[metafeatures.columns.values]), axis=1) - # surrogates + ####################### + # SURROGATES # + ####################### + + # quadratic prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(quadratic_model, X_poly_test, frame_task['predictive_accuracy'].values, args.cv_iterations, 
args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) - results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'x_order': 61, 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'x_order': 60, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + # random forest prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(random_forest_model, frame_task[param_columns].values, frame_task['predictive_accuracy'].values, args.cv_iterations, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) - results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'x_order': 31, 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'x_order': 30, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + + ####################### + # AGGREGATES # + ####################### - # aggregates + # quadratic prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, X_poly_train, frame_others['predictive_accuracy'].values, @@ -117,9 +126,10 @@ def run(args): frame_task['predictive_accuracy'].values, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - 
results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'x_order': 41, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'x_order': 40, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + # random forest prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model, frame_others[param_columns], frame_others['predictive_accuracy'].values, @@ -127,8 +137,8 @@ def run(args): frame_task['predictive_accuracy'].values, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'x_order': 11, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'x_order': 10, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) # meta-models # prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model, @@ -141,6 +151,11 @@ def run(args): # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + ############################ + # META-MODELS # + ############################ + + # random forest columns = list(param_columns) + list(metafeatures.columns.values) prec_te, prec_tr, spearm_te, spearm_tr = 
evaluation.evaluate_fold(random_forest_model, frame_others[columns], @@ -149,8 +164,8 @@ def run(args): frame_task['predictive_accuracy'].values, args.precision_at_n, args.precision_out_of_k) - results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) - results.append({'task_id': task_id, 'strategy': 'RF_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_meta', 'x_order': 21, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_meta', 'x_order': 20, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) # special case: random forest that predicts coefficients of base task random_forest_coef.fit(coefficients_train_frame[metafeatures.columns.values].values, @@ -160,15 +175,19 @@ def run(args): rand_indices_te = np.random.randint(len(frame_task), size=args.precision_out_of_k) prec_te = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_te],y_hat_te[rand_indices_te], args.precision_at_n) spearm_te = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_te], y_hat_te[rand_indices_te])[0] - results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) + results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'x_order': 51, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te}) # again, duplicate (TODO: refactor) y_hat_tr = random_forest_coef.predict(frame_task) rand_indices_tr = np.random.randint(len(frame_task), size=args.precision_out_of_k) prec_tr = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr], args.precision_at_n) spearm_tr = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_tr], 
y_hat_tr[rand_indices_tr])[0] - results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) + results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'x_order': 50, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr}) - result_frame = pd.DataFrame(results) + # x_order is used to trick seaborn plot into using the right order + # general order: first random forest models, then quadratic models + # secondary order: first aggregates, then meta-models, then surrogates + # tertiary order: first train-tasks, then test, then train-obs + result_frame = pd.DataFrame(results).sort_values(['x_order']) os.makedirs(args.output_directory, exist_ok=True) result_frame.to_csv(os.path.join(args.output_directory, 'results.csv'))