Skip to content
1 change: 1 addition & 0 deletions evaluation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .precision import cross_validate_surrogate, evaluate_fold, precision_at_n
60 changes: 60 additions & 0 deletions evaluation/precision.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import numpy as np
import scipy.stats
import sklearn
import typing


def precision_at_n(y_real: np.ndarray, y_hat: np.ndarray, top_n: int) -> float:
    """Return the fraction of overlap between the first `top_n` items of the
    predicted ranking and of the real ranking.

    :param y_real: ground-truth scores
    :param y_hat: predicted scores (same length as `y_real`)
    :param top_n: size of the head of each ranking to compare
    :return: |head(y_hat) intersect head(y_real)| / top_n
    """
    # rankdata (method='average') handles ties by averaging; argsort of the
    # ranks yields item indices ordered by ascending rank
    y_hat_order = scipy.stats.rankdata(y_hat, method='average').argsort()
    y_real_order = scipy.stats.rankdata(y_real, method='average').argsort()
    # set intersection replaces the original O(top_n^2) membership loop
    overlap = set(y_hat_order[:top_n]) & set(y_real_order[:top_n])
    return len(overlap) / top_n


def evaluate_fold(model: sklearn.base.RegressorMixin, X_tr: np.ndarray,
                  y_tr: np.ndarray, X_te: np.ndarray, y_te: np.ndarray,
                  top_n: int, use_k: int) -> typing.Tuple[float, float, float, float]:
    """Fit a clone of `model` on the train split and score precision@n plus
    Spearman rank correlation on `use_k` randomly drawn points of both the
    train and the test split.

    :param model: unfitted sklearn regressor (cloned; the argument is untouched)
    :param X_tr: train features
    :param y_tr: train targets
    :param X_te: test features
    :param y_te: test targets
    :param top_n: head size passed on to precision_at_n
    :param use_k: number of points sampled (with replacement) per split
    :return: (precision_test, precision_train, spearman_test, spearman_train)
    """
    new_model = sklearn.base.clone(model)
    new_model.fit(X_tr, y_tr)
    experiments = {
        'tr': (X_tr, y_tr),
        'te': (X_te, y_te),
    }
    precision_score = dict()
    spearman_score = dict()
    for exp_type, (X, y) in experiments.items():
        y_hat = new_model.predict(X)
        # samples WITH replacement, using the global numpy RNG state
        rand_indices = np.random.randint(len(X), size=use_k)
        precision_score[exp_type] = precision_at_n(y[rand_indices], y_hat[rand_indices], top_n)
        # bug fix: this value is reported downstream under a 'spearmanr_*'
        # label, but was computed with pearsonr; use the rank correlation
        spearman_score[exp_type] = scipy.stats.spearmanr(y[rand_indices], y_hat[rand_indices])[0]
    return precision_score['te'], precision_score['tr'], spearman_score['te'], spearman_score['tr']


def cross_validate_surrogate(model: sklearn.base.RegressorMixin, data: np.ndarray,
                             targets: np.ndarray, n_folds: int, top_n: int,
                             use_k: int) -> typing.Tuple[float, float, float, float]:
    """K-fold cross-validation wrapper around evaluate_fold.

    :param model: unfitted sklearn regressor (cloned per fold)
    :param data: feature matrix
    :param targets: target vector
    :param n_folds: number of (shuffled, seed 42) KFold splits
    :param top_n: head size passed on to precision_at_n
    :param use_k: sample size per split, passed on to evaluate_fold
    :return: per-metric means across folds as
             (precision_test, precision_train, spearman_test, spearman_train)
    """
    # bug fix: `import sklearn` does not expose the model_selection
    # submodule as an attribute; import it explicitly before use
    import sklearn.model_selection
    kf = sklearn.model_selection.KFold(n_splits=n_folds, random_state=42, shuffle=True)

    precision_scores_te = list()
    precision_scores_tr = list()
    spearman_scores_te = list()
    spearman_scores_tr = list()
    for tr_idx, te_idx in kf.split(data):
        X_tr, y_tr = data[tr_idx], targets[tr_idx]
        X_te, y_te = data[te_idx], targets[te_idx]
        prec_te, prec_tr, spearm_te, spearm_tr = evaluate_fold(model, X_tr, y_tr, X_te, y_te, top_n, use_k)
        precision_scores_te.append(prec_te)
        precision_scores_tr.append(prec_tr)
        spearman_scores_te.append(spearm_te)
        spearman_scores_tr.append(spearm_tr)

    return float(np.mean(precision_scores_te)), \
        float(np.mean(precision_scores_tr)), \
        float(np.mean(spearman_scores_te)), \
        float(np.mean(spearman_scores_tr))
205 changes: 205 additions & 0 deletions examples/meta-models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
import arff
import argparse
import logging
import matplotlib.pyplot as plt
import numpy as np
import openmlcontrib
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.linear_model
import sklearn.ensemble
import os

import evaluation, quadratic


def parse_args():
    """Define and parse the command-line interface of the experiment script."""
    home = os.path.expanduser('~')
    parser = argparse.ArgumentParser()
    parser.add_argument('--performances_path', type=str,
                        default=home + '/projects/sklearn-bot/data/svc.arff')
    parser.add_argument('--metafeatures_path', type=str,
                        default=home + '/projects/sklearn-bot/data/metafeatures.arff')
    parser.add_argument('--output_directory', type=str,
                        default=home + '/experiments/meta-models')
    parser.add_argument('--poly_degree', type=int, default=2)
    parser.add_argument('--precision_at_n', type=int, default=20)
    parser.add_argument('--precision_out_of_k', type=int, default=100)
    parser.add_argument('--cv_iterations', type=int, default=5)
    parser.add_argument('--n_estimators', type=int, default=16)
    parser.add_argument('--random_seed', type=int, default=42)
    parser.add_argument('--task_limit', type=int, default=None, help='For fast testing')
    return parser.parse_args()


def run(args):
    """Leave-one-task-out evaluation of surrogate, aggregate, and meta models
    on SVC performance data.

    For every task: trains per-task surrogates (cross-validated), cross-task
    aggregates, and meta-feature-augmented models, collects precision@n and
    a correlation score per model/split, then writes a CSV and two boxplots
    to args.output_directory.
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)
    logging.info('Started meta-models.py')

    # some naming declarations
    precision_name = 'precision_at_%d_out_%d' % (args.precision_at_n, args.precision_out_of_k)
    spearman_name = 'spearmanr_%d' % args.precision_out_of_k
    param_columns = ['svc__gamma', 'svc__C']

    # data loading and management
    with open(args.performances_path, 'r') as fp:
        arff_performances = arff.load(fp)
    performances = openmlcontrib.meta.arff_to_dataframe(arff_performances, None)
    with open(args.metafeatures_path, 'r') as fp:
        arff_metafeatures = arff.load(fp)
    # impute missing meta-features with -1 value
    metafeatures = openmlcontrib.meta.arff_to_dataframe(arff_metafeatures, None).set_index('task_id').fillna(-1)
    # remove all non-rbf rows
    performances = performances.loc[performances['svc__kernel'] == 'rbf']
    # join with meta-features frame, and remove tasks without meta-features
    performances = performances.join(metafeatures, on='task_id', how='inner')
    # coefficients data
    # NOTE(review): this writes into args.output_directory before the
    # os.makedirs(...) call near the end of this function -- if the directory
    # does not exist yet, to_csv fails here; confirm and consider moving
    # makedirs up
    coefficients_data = quadratic.generate_coefficients_data(args.poly_degree, performances, param_columns).join(metafeatures, how='inner')
    coefficients_data.to_csv(os.path.join(args.output_directory, 'coefficients.csv'))
    logging.info('Generated all datasets')

    # sklearn objects
    quadratic_model = sklearn.linear_model.LinearRegression(fit_intercept=False)
    random_forest_model = sklearn.ensemble.RandomForestRegressor(n_estimators=args.n_estimators,
                                                                 random_state=args.random_seed)
    random_forest_coef = quadratic.MetaRandomForestQuadratic(n_estimators=args.n_estimators,
                                                             random_seed=args.random_seed,
                                                             meta_columns=list(metafeatures.columns.values),
                                                             base_columns=param_columns,
                                                             poly_degree=args.poly_degree)
    # NOTE(review): sklearn.preprocessing is not explicitly imported in this
    # file; this relies on another import having loaded the submodule --
    # verify, or add `import sklearn.preprocessing`
    poly_transform = sklearn.preprocessing.PolynomialFeatures(args.poly_degree)

    # determine relevant tasks
    all_tasks = performances['task_id'].unique()
    if args.task_limit is not None:
        all_tasks = all_tasks[:args.task_limit]

    results = []
    for idx, task_id in enumerate(all_tasks):
        logging.info('Processing task %d (%d/%d)' % (task_id, idx+1, len(all_tasks)))
        # split into the held-out task and all other tasks
        frame_task = performances.loc[performances['task_id'] == task_id]
        frame_others = performances.loc[performances['task_id'] != task_id]
        coefficients_train_frame = coefficients_data.drop([task_id])
        # sanity check on data volume; NOTE(review): assert is stripped
        # under `python -O` -- consider raising instead
        assert(frame_task.shape[0] > 100)

        # some convenience datasets
        X_poly_train = poly_transform.fit_transform(frame_others[param_columns].values)
        X_poly_test = poly_transform.fit_transform(frame_task[param_columns].values)
        # X_poly_meta_train = np.concatenate((X_poly_train, frame_others[metafeatures.columns.values]), axis=1)
        # X_poly_meta_test = np.concatenate((X_poly_test, frame_task[metafeatures.columns.values]), axis=1)

        #######################
        #     SURROGATES      #
        #######################

        # quadratic
        prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(quadratic_model,
                                                                                     X_poly_test,
                                                                                     frame_task['predictive_accuracy'].values,
                                                                                     args.cv_iterations,
                                                                                     args.precision_at_n,
                                                                                     args.precision_out_of_k)
        results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'x_order': 61, 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr})
        results.append({'task_id': task_id, 'strategy': 'quadratic_surrogate', 'x_order': 60, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})

        # random forest
        prec_te, prec_tr, spearm_te, spearm_tr = evaluation.cross_validate_surrogate(random_forest_model,
                                                                                     frame_task[param_columns].values,
                                                                                     frame_task['predictive_accuracy'].values,
                                                                                     args.cv_iterations,
                                                                                     args.precision_at_n,
                                                                                     args.precision_out_of_k)
        results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'x_order': 31, 'set': 'train-obs', precision_name: prec_tr, spearman_name: spearm_tr})
        results.append({'task_id': task_id, 'strategy': 'RF_surrogate', 'x_order': 30, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})

        #######################
        #     AGGREGATES      #
        #######################

        # quadratic
        prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model,
                                                                          X_poly_train,
                                                                          frame_others['predictive_accuracy'].values,
                                                                          X_poly_test,
                                                                          frame_task['predictive_accuracy'].values,
                                                                          args.precision_at_n,
                                                                          args.precision_out_of_k)
        results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'x_order': 41, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})
        results.append({'task_id': task_id, 'strategy': 'quadratic_aggregate', 'x_order': 40, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr})

        # random forest
        # NOTE(review): train features passed without `.values`, unlike the
        # test features two lines below -- sklearn accepts a DataFrame, but
        # the inconsistency looks accidental; confirm
        prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model,
                                                                          frame_others[param_columns],
                                                                          frame_others['predictive_accuracy'].values,
                                                                          frame_task[param_columns].values,
                                                                          frame_task['predictive_accuracy'].values,
                                                                          args.precision_at_n,
                                                                          args.precision_out_of_k)
        results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'x_order': 11, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})
        results.append({'task_id': task_id, 'strategy': 'RF_aggregate', 'x_order': 10, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr})

        # meta-models
        # prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(quadratic_model,
        #                                                                   X_poly_meta_train,
        #                                                                   frame_others['predictive_accuracy'].values,
        #                                                                   X_poly_meta_test,
        #                                                                   frame_task['predictive_accuracy'].values,
        #                                                                   args.precision_at_n,
        #                                                                   args.precision_out_of_k)
        # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})
        # results.append({'task_id': task_id, 'strategy': 'quadratic_meta', 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr})

        ############################
        #       META-MODELS        #
        ############################

        # random forest
        columns = list(param_columns) + list(metafeatures.columns.values)
        prec_te, prec_tr, spearm_te, spearm_tr = evaluation.evaluate_fold(random_forest_model,
                                                                          frame_others[columns],
                                                                          frame_others['predictive_accuracy'].values,
                                                                          frame_task[columns].values,
                                                                          frame_task['predictive_accuracy'].values,
                                                                          args.precision_at_n,
                                                                          args.precision_out_of_k)
        results.append({'task_id': task_id, 'strategy': 'RF_meta', 'x_order': 21, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})
        results.append({'task_id': task_id, 'strategy': 'RF_meta', 'x_order': 20, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr})

        # special case: random forest that predicts coefficients of base task
        random_forest_coef.fit(coefficients_train_frame[metafeatures.columns.values].values,
                               coefficients_train_frame[quadratic.get_coefficient_names()].values)
        # note that this code is an almost duplicate from the precision module (TODO: refactor)
        y_hat_te = random_forest_coef.predict(frame_task)
        rand_indices_te = np.random.randint(len(frame_task), size=args.precision_out_of_k)
        prec_te = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_te],y_hat_te[rand_indices_te], args.precision_at_n)
        spearm_te = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_te], y_hat_te[rand_indices_te])[0]
        results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'x_order': 51, 'set': 'test', precision_name: prec_te, spearman_name: spearm_te})
        # again, duplicate (TODO: refactor)
        # NOTE(review): this 'train-tasks' evaluation predicts on frame_task,
        # exactly like the test branch above -- presumably the training frame
        # was intended here; confirm
        y_hat_tr = random_forest_coef.predict(frame_task)
        rand_indices_tr = np.random.randint(len(frame_task), size=args.precision_out_of_k)
        prec_tr = evaluation.precision_at_n(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr], args.precision_at_n)
        spearm_tr = scipy.stats.pearsonr(frame_task['predictive_accuracy'].values[rand_indices_tr], y_hat_tr[rand_indices_tr])[0]
        results.append({'task_id': task_id, 'strategy': 'RF_meta_coeff', 'x_order': 50, 'set': 'train-tasks', precision_name: prec_tr, spearman_name: spearm_tr})

    # x_order is used to trick seaborn plot into using the right order
    # general order: first random forest models, then quadratic models
    # secondary order: first aggregates, then meta-models, then surrogates
    # tertiary order: first train-tasks, then test, then test-obs
    result_frame = pd.DataFrame(results).sort_values(['x_order'])

    os.makedirs(args.output_directory, exist_ok=True)
    result_frame.to_csv(os.path.join(args.output_directory, 'results.csv'))

    # one boxplot per metric, grouped by strategy and colored by split
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.boxplot(x="strategy", y=precision_name, hue="set", data=result_frame, ax=ax)
    plt.savefig(os.path.join(args.output_directory, '%s.png' % precision_name))

    fig, ax = plt.subplots(figsize=(16, 6))
    sns.boxplot(x="strategy", y=spearman_name, hue="set", data=result_frame, ax=ax)
    plt.savefig(os.path.join(args.output_directory, '%s.png' % spearman_name))


if __name__ == '__main__':
    # script entry point: parse CLI options, then run the experiment
    cli_args = parse_args()
    run(cli_args)
1 change: 1 addition & 0 deletions quadratic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .quadratic import MetaRandomForestQuadratic, generate_coefficients_data, get_coefficient_names
80 changes: 80 additions & 0 deletions quadratic/quadratic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import logging
import numpy as np
import pandas as pd
import sklearn.base
import sklearn.linear_model
import sklearn.preprocessing
import typing


class MetaRandomForestQuadratic(sklearn.base.RegressorMixin):
    """Meta-model that predicts the coefficients of a per-task quadratic
    surrogate from meta-features, then evaluates that surrogate on the base
    (hyperparameter) columns to produce a performance prediction per row.
    """

    def __init__(self, n_estimators: int, random_seed: int,
                 meta_columns: typing.List, base_columns: typing.List, poly_degree: int):
        """
        :param n_estimators: forest size of the internal meta random forest
        :param random_seed: random_state for the internal meta random forest
        :param meta_columns: column names holding the meta-features
        :param base_columns: column names holding the base hyperparameters
        :param poly_degree: requested degree; only 2 is implemented, any
                            other value is coerced to 2 (with a warning)
        """
        # bug fix: sklearn.ensemble was never imported in this module, so
        # constructing the forest below raised AttributeError; import the
        # submodule explicitly
        import sklearn.ensemble
        if poly_degree != 2:
            logging.warning('Polynomial degree of 2 assumed. ')
        self.n_estimators = n_estimators
        self.random_seed = random_seed
        self.meta_columns = meta_columns
        self.base_columns = base_columns
        # only degree 2 is supported; coerce regardless of the argument
        self.poly_degree = 2
        self.feat_trans = sklearn.preprocessing.PolynomialFeatures(self.poly_degree)
        self.meta_model = sklearn.ensemble.RandomForestRegressor(n_estimators=self.n_estimators,
                                                                 random_state=self.random_seed)

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Fit the meta random forest on (meta-features, coefficient) pairs.

        :param X: meta-feature matrix, one row per task
        :param y: coefficient matrix, one row per task
        :return: self (sklearn convention; the original returned None)
        """
        self.meta_model.fit(X, y)
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        """Predict a performance value for every row of `X`.

        For each row: predict the surrogate's coefficients from the row's
        meta-features, install them into a LinearRegression, and evaluate it
        on the polynomial expansion of the row's base columns.

        :param X: frame containing both `meta_columns` and `base_columns`
        :return: 1D numpy array of predictions, one entry per row of X
        """
        predictions = []
        for _, row in X.iterrows():
            base_model = sklearn.linear_model.LinearRegression(fit_intercept=False)
            base_model.intercept_ = 0
            # install the meta-predicted coefficients as the linear weights
            base_model.coef_ = self.meta_model.predict([row[self.meta_columns]])[0]
            # renamed from `input` to avoid shadowing the builtin
            features = self.feat_trans.fit_transform([row[self.base_columns]])
            predictions.append(base_model.predict(features)[0])
        return np.array(predictions)


def get_coefficient_names() -> typing.List:
    """Return the six coefficient column names of a degree-2 polynomial in
    (gamma, C), in the column order produced by PolynomialFeatures(2)."""
    terms = ['gamma', 'C', 'gamma_sq', 'gamma_C', 'C_sq']
    return ['intercept'] + ['coef_%s' % term for term in terms]


def generate_coefficients_data(poly_degree: int, performance_data: pd.DataFrame, param_columns: typing.List) -> pd.DataFrame:
    """Pre-process the quadratic-surrogate coefficients for all tasks at once
    (for speed).

    Fits, per task, an intercept-free LinearRegression on the degree-2
    polynomial expansion of `param_columns` against 'predictive_accuracy',
    and collects the six coefficients per task.

    :param poly_degree: requested degree; only 2 is implemented, other values
                        trigger a warning and degree 2 is used
    :param performance_data: frame with 'task_id', 'predictive_accuracy' and
                             the `param_columns`
    :param param_columns: the hyperparameter columns to expand
    :return: frame indexed by task_id with one column per coefficient name
    """
    if poly_degree != 2:
        logging.warning('Not Implemented: polynomial degree of > 2. Will use degree 2 for meta-model')
    coef_names = get_coefficient_names()
    results = []
    for task_id in performance_data['task_id'].unique():
        frame_task = performance_data.loc[performance_data['task_id'] == task_id]
        model = sklearn.linear_model.LinearRegression(fit_intercept=False)
        poly_feat = sklearn.preprocessing.PolynomialFeatures(2)
        X = poly_feat.fit_transform(frame_task[param_columns])
        y = frame_task['predictive_accuracy']
        model.fit(X, y)
        # pair names with coefficients instead of six hand-indexed entries
        result = {'task_id': task_id}
        result.update(zip(coef_names, model.coef_))
        results.append(result)
    return pd.DataFrame(results).set_index('task_id')
Empty file added tests/__init__.py
Empty file.
15 changes: 15 additions & 0 deletions tests/test_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import evaluation
import numpy as np
import unittest


class TestEvaluationMethods(unittest.TestCase):

    def test_precision_at_n(self):
        """precision_at_n on two perfectly anti-correlated score vectors."""
        real = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
        yhat = np.array([1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1])
        expected = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 2.0/6.0, 4.0/7.0, 6.0/8.0, 8.0/9.0, 1.0])

        for top_n, expected_value in enumerate(expected, start=1):
            self.assertEqual(expected_value, evaluation.precision_at_n(real, yhat, top_n))