From 6f8c664cd0f7804072d425d6606d07b9ac480c70 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 20:08:09 +0200
Subject: [PATCH 01/12] Remove the need to pass back the output file since it
is known
---
examples/custom/extensions/GradientBoosting/exec.py | 1 -
examples/custom/extensions/Stacking/exec.py | 1 -
frameworks/AutoGluon/exec.py | 1 -
frameworks/AutoGluon/exec_ts.py | 1 -
frameworks/FEDOT/exec.py | 1 -
frameworks/FEDOT/exec_ts.py | 1 -
frameworks/GAMA/exec.py | 1 -
frameworks/H2OAutoML/exec.py | 1 -
frameworks/MLPlan/exec.py | 1 -
frameworks/NaiveAutoML/exec.py | 1 -
frameworks/RandomForest/exec.py | 1 -
frameworks/SapientML/exec.py | 2 --
frameworks/TPOT/exec.py | 1 -
frameworks/TunedRandomForest/exec.py | 1 -
frameworks/autosklearn/exec.py | 1 -
frameworks/flaml/exec.py | 1 -
frameworks/hyperoptsklearn/exec.py | 1 -
frameworks/lightautoml/exec.py | 1 -
frameworks/mljarsupervised/exec.py | 1 -
frameworks/oboe/exec.py | 1 -
frameworks/shared/callee.py | 3 ++-
21 files changed, 2 insertions(+), 22 deletions(-)
diff --git a/examples/custom/extensions/GradientBoosting/exec.py b/examples/custom/extensions/GradientBoosting/exec.py
index 072eb1944..0a36066db 100644
--- a/examples/custom/extensions/GradientBoosting/exec.py
+++ b/examples/custom/extensions/GradientBoosting/exec.py
@@ -32,7 +32,6 @@ def run(dataset: Dataset, config: TaskConfig):
save_predictions(
dataset=dataset,
- output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py
index c213185f9..365ba41ff 100644
--- a/examples/custom/extensions/Stacking/exec.py
+++ b/examples/custom/extensions/Stacking/exec.py
@@ -133,7 +133,6 @@ def run(dataset, config):
probabilities = estimator.predict_proba(X_test) if is_classification else None
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index f833485ef..ec5b93b3d 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -183,7 +183,6 @@ def inference_time_regression(data: Union[str, pd.DataFrame]):
shutil.rmtree(predictor.path, ignore_errors=True)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
probabilities_labels=prob_labels,
diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py
index 71ecfb3d2..5615633fc 100644
--- a/frameworks/AutoGluon/exec_ts.py
+++ b/frameworks/AutoGluon/exec_ts.py
@@ -95,7 +95,6 @@ def run(dataset, config):
get_reusable_executor().shutdown(wait=True)
return result(
- output_file=config.output_predictions_file,
predictions=predictions_only,
truth=truth_only,
target_is_encoded=False,
diff --git a/frameworks/FEDOT/exec.py b/frameworks/FEDOT/exec.py
index 0baba5010..a180ad0af 100644
--- a/frameworks/FEDOT/exec.py
+++ b/frameworks/FEDOT/exec.py
@@ -53,7 +53,6 @@ def run(dataset, config):
save_artifacts(fedot, config)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y,
probabilities=probabilities,
diff --git a/frameworks/FEDOT/exec_ts.py b/frameworks/FEDOT/exec_ts.py
index e0fb3fc10..d45c5c5f9 100644
--- a/frameworks/FEDOT/exec_ts.py
+++ b/frameworks/FEDOT/exec_ts.py
@@ -104,7 +104,6 @@ def run(dataset, config):
save_artifacts(fedot, config)
return result(
- output_file=config.output_predictions_file,
predictions=all_series_predictions,
truth=truth_only,
target_is_encoded=False,
diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py
index a7f5ce10e..f1607057e 100644
--- a/frameworks/GAMA/exec.py
+++ b/frameworks/GAMA/exec.py
@@ -123,7 +123,6 @@ def infer(data: Union[str, pd.DataFrame]):
probabilities = gama_automl.predict_proba(X_test)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=y_test,
diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py
index 009ed49c7..6fd579311 100644
--- a/frameworks/H2OAutoML/exec.py
+++ b/frameworks/H2OAutoML/exec.py
@@ -203,7 +203,6 @@ def infer(path: str):
save_artifacts(aml, dataset=dataset, config=config)
return result(
- output_file=config.output_predictions_file,
predictions=preds.predictions,
truth=preds.truth,
probabilities=preds.probabilities,
diff --git a/frameworks/MLPlan/exec.py b/frameworks/MLPlan/exec.py
index 6e58fe599..4976e9ef0 100644
--- a/frameworks/MLPlan/exec.py
+++ b/frameworks/MLPlan/exec.py
@@ -125,7 +125,6 @@ def run(dataset, config):
target_encoded = False
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=truth,
probabilities=probabilities,
diff --git a/frameworks/NaiveAutoML/exec.py b/frameworks/NaiveAutoML/exec.py
index 67921ff7c..dae84a4fc 100644
--- a/frameworks/NaiveAutoML/exec.py
+++ b/frameworks/NaiveAutoML/exec.py
@@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
probabilities=probabilities,
truth=dataset.test.y,
diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py
index 4be16ac32..e2f6dc2e5 100644
--- a/frameworks/RandomForest/exec.py
+++ b/frameworks/RandomForest/exec.py
@@ -132,7 +132,6 @@ def infer(data):
log.info("Finished inference time measurements.")
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/SapientML/exec.py b/frameworks/SapientML/exec.py
index 0a80ba883..e58491fb4 100644
--- a/frameworks/SapientML/exec.py
+++ b/frameworks/SapientML/exec.py
@@ -79,7 +79,6 @@ def run(dataset, config):
)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
@@ -88,7 +87,6 @@ def run(dataset, config):
)
else:
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
training_duration=training.duration,
diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py
index 4883a6f3f..e403679e7 100644
--- a/frameworks/TPOT/exec.py
+++ b/frameworks/TPOT/exec.py
@@ -131,7 +131,6 @@ def infer(data):
save_artifacts(tpot, config)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py
index 1fbd6e2af..9a4569a2d 100644
--- a/frameworks/TunedRandomForest/exec.py
+++ b/frameworks/TunedRandomForest/exec.py
@@ -286,7 +286,6 @@ def infer(data):
log.info("Finished inference time measurements.")
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py
index d42eb23fb..23c09a91c 100644
--- a/frameworks/autosklearn/exec.py
+++ b/frameworks/autosklearn/exec.py
@@ -207,7 +207,6 @@ def sample_one_test_row(seed: int):
save_artifacts(auto_sklearn, config)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=dataset.test.y if use_pandas else dataset.test.y_enc,
probabilities=probabilities,
diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py
index 370f93696..a67af2663 100644
--- a/frameworks/flaml/exec.py
+++ b/frameworks/flaml/exec.py
@@ -91,7 +91,6 @@ def infer(data: Union[str, pd.DataFrame]):
log.info(f"Finished predict in {predict.duration}s.")
return result(
- output_file=config.output_predictions_file,
probabilities=probabilities,
predictions=predictions,
truth=y_test,
diff --git a/frameworks/hyperoptsklearn/exec.py b/frameworks/hyperoptsklearn/exec.py
index 48e2217b1..ce90c4c46 100644
--- a/frameworks/hyperoptsklearn/exec.py
+++ b/frameworks/hyperoptsklearn/exec.py
@@ -117,7 +117,6 @@ def default():
probabilities = None
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/lightautoml/exec.py b/frameworks/lightautoml/exec.py
index d2ccf0a21..f25d011e3 100644
--- a/frameworks/lightautoml/exec.py
+++ b/frameworks/lightautoml/exec.py
@@ -99,7 +99,6 @@ def infer(data: Union[str, pd.DataFrame]):
save_artifacts(automl, config)
return result(
- output_file=config.output_predictions_file,
probabilities_labels=probabilities_labels,
probabilities=probabilities,
predictions=predictions,
diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py
index 44003c45c..6d5d4f15c 100644
--- a/frameworks/mljarsupervised/exec.py
+++ b/frameworks/mljarsupervised/exec.py
@@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]):
shutil.rmtree(results_path, ignore_errors=True)
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/oboe/exec.py b/frameworks/oboe/exec.py
index ebb0232a6..3e30b07db 100644
--- a/frameworks/oboe/exec.py
+++ b/frameworks/oboe/exec.py
@@ -123,7 +123,6 @@ def aml_models():
probabilities = None
return result(
- output_file=config.output_predictions_file,
predictions=predictions,
truth=y_test,
probabilities=probabilities,
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 23b41620a..a6f073a3e 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -27,7 +27,6 @@ class FrameworkError(Exception):
def result(
- output_file=None,
predictions=None,
truth=None,
probabilities=None,
@@ -107,6 +106,8 @@ def load_data(name, path, **_):
)
json_dump(inference_measurements, inference_file, style="compact")
res["others"]["inference_times"] = str(inference_file)
+
+ res["output_file"] = config.output_predictions_file
json_dump(res, config.result_file, style="compact")
From 0896e1fa4e38cafb0c890aa0fd4463300eb0b01d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 20:30:24 +0200
Subject: [PATCH 02/12] Serialize numpy object arrays through JSON instead of
pickle
---
amlb/utils/serialization.py | 15 ++++++++++-----
frameworks/shared/callee.py | 2 +-
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py
index 5beff1213..9622eae05 100644
--- a/amlb/utils/serialization.py
+++ b/amlb/utils/serialization.py
@@ -1,4 +1,5 @@
import logging
+
import math
import os
import pickle
@@ -34,8 +35,6 @@ def _import_data_libraries():
# mainly intended to serialize simple data structures like lists.
# allowed=['pickle', 'json']
fallback_serializer="json",
- # if numpy can use pickle to serialize ndarrays,
- numpy_allow_pickle=True,
# format used to serialize pandas dataframes/series between processes.
# allowed=['pickle', 'parquet', 'hdf', 'json']
pandas_serializer="parquet",
@@ -163,8 +162,14 @@ def serialize_data(data, path, config: Optional[ns] = None):
root, ext = os.path.splitext(path)
np, pd, sp = _import_data_libraries()
if np and isinstance(data, np.ndarray):
- path = f"{root}.npy"
- np.save(path, data, allow_pickle=config.numpy_allow_pickle)
+ if data.dtype == "object":
+ # Numpy cannot save object arrays without pickle
+ path = f"{root}.json"
+ data = data.squeeze().tolist()
+ json_dump(data, path, style="compact")
+ else:
+ path = f"{root}.npy"
+ np.save(path, data, allow_pickle=False)
elif sp and isinstance(data, sp.spmatrix):
# use custom extension to recognize sparsed matrices from file name.
# .npz is automatically appended if missing, and can also potentially be used for numpy arrays.
@@ -212,7 +217,7 @@ def deserialize_data(path, config: Optional[ns] = None):
if ext == ".npy":
if np is None:
raise SerializationError(f"Numpy is required to deserialize {path}.")
- return np.load(path, allow_pickle=config.numpy_allow_pickle)
+ return np.load(path)
elif ext == ".npz":
_, ext2 = os.path.splitext(base)
if ext2 == ".spy":
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index a6f073a3e..6aba88c95 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -107,7 +107,7 @@ def load_data(name, path, **_):
json_dump(inference_measurements, inference_file, style="compact")
res["others"]["inference_times"] = str(inference_file)
- res["output_file"] = config.output_predictions_file
+ res.setdefault("output_file", config.output_predictions_file)
json_dump(res, config.result_file, style="compact")
From 9f1047a04a56841598412caa59325e1b2b616692 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 20:50:37 +0200
Subject: [PATCH 03/12] Remove some options I don't want to support (mostly
pickle)
---
amlb/utils/serialization.py | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)
diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py
index 9622eae05..ea01befcc 100644
--- a/amlb/utils/serialization.py
+++ b/amlb/utils/serialization.py
@@ -6,6 +6,7 @@
import re
from typing import Optional
+
from .core import Namespace as ns, json_dump, json_load
from .process import profile
@@ -34,9 +35,10 @@ def _import_data_libraries():
# the serializer to use when there's no specific serializer available.
# mainly intended to serialize simple data structures like lists.
# allowed=['pickle', 'json']
- fallback_serializer="json",
+ # OPTION REMOVED: Only JSON is allowed. Pickle is evil.
+ # fallback_serializer="json",
# format used to serialize pandas dataframes/series between processes.
- # allowed=['pickle', 'parquet', 'hdf', 'json']
+ # allowed=['parquet', 'json']
pandas_serializer="parquet",
# the compression format used when serializing pandas dataframes/series.
# allowed=[None, 'infer', 'bz2', 'gzip']
@@ -182,9 +184,7 @@ def serialize_data(data, path, config: Optional[ns] = None):
# for example, 'true' and 'false' are converted automatically to booleans, even for column names…
data.rename(str, axis="columns", inplace=True)
ser = config.pandas_serializer
- if ser == "pickle":
- data.to_pickle(path, compression=config.pandas_compression)
- elif ser == "parquet":
+ if ser == "parquet":
if isinstance(data, pd.Series):
data = pd.DataFrame({__series__: data})
# parquet serialization doesn't support sparse dataframes
@@ -194,18 +194,15 @@ def serialize_data(data, path, config: Optional[ns] = None):
json_dump(dtypes, f"{path}.dtypes", style="compact")
data = unsparsify(data)
data.to_parquet(path, compression=config.pandas_parquet_compression)
- elif ser == "hdf":
- data.to_hdf(path, os.path.basename(path), mode="w", format="table")
elif ser == "json":
data.to_json(path, compression=config.pandas_compression)
- else: # fallback serializer
- if config.fallback_serializer == "json":
- path = f"{root}.json"
- json_dump(data, path, style="compact")
else:
- path = f"{root}.pkl"
- with open(path, "wb") as f:
- pickle.dump(data, f)
+ raise ValueError(
+ f"Invalid pandas serialization {ser} must be 'parquet' or 'json'"
+ )
+ else: # fallback serializer
+ path = f"{root}.json"
+ json_dump(data, path, style="compact")
return path
From 2759e2ef82f9a323c70d76bf3be3f2f3594861b3 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 21:32:53 +0200
Subject: [PATCH 04/12] Remove pickle serialization tests
---
.../utils/serialization/test_serializers.py | 101 ------------------
1 file changed, 101 deletions(-)
diff --git a/tests/unit/amlb/utils/serialization/test_serializers.py b/tests/unit/amlb/utils/serialization/test_serializers.py
index 4e1cd4fde..157e79070 100644
--- a/tests/unit/amlb/utils/serialization/test_serializers.py
+++ b/tests/unit/amlb/utils/serialization/test_serializers.py
@@ -18,18 +18,6 @@ def test_serialize_list_json(tmpdir):
assert li == reloaded
-@pytest.mark.use_disk
-def test_serialize_list_pickle(tmpdir):
- li = [[1, 2.2, None, 3, 4.4, "foo", True], ["bar", False, 2 / 3]]
- dest = os.path.join(tmpdir, "my_list")
- path = serialize_data(li, dest, config=ns(fallback_serializer="pickle"))
- assert path == f"{dest}.pkl"
-
- reloaded = deserialize_data(path)
- assert isinstance(reloaded, list)
- assert li == reloaded
-
-
@pytest.mark.use_disk
def test_serialize_dict_json(tmpdir):
di = dict(first=[1, 2.2, None, 3, 4.4, "foo", True], second=["bar", False, 2 / 3])
@@ -42,18 +30,6 @@ def test_serialize_dict_json(tmpdir):
assert di == reloaded
-@pytest.mark.use_disk
-def test_serialize_dict_pickle(tmpdir):
- di = dict(first=[1, 2.2, None, 3, 4.4, "foo", True], second=["bar", False, 2 / 3])
- dest = os.path.join(tmpdir, "my_dict")
- path = serialize_data(di, dest, config=ns(fallback_serializer="pickle"))
- assert path == f"{dest}.pkl"
-
- reloaded = deserialize_data(path)
- assert isinstance(reloaded, dict)
- assert di == reloaded
-
-
@pytest.mark.use_disk
def test_serialize_numpy_array(tmpdir):
import numpy as np
@@ -156,83 +132,6 @@ def test_serialize_sparse_matrix_reload_as_array(tmpdir):
assert np.array_equal(mat.toarray(), reloaded, equal_nan=True)
-@pytest.mark.use_disk
-def test_serialize_sparse_dataframe(tmpdir):
- import pandas as pd
-
- ser_config = ns(
- pandas_serializer="pickle", sparse_dataframe_deserialized_format=None
- )
- dfs = pd.DataFrame(
- dict(
- first=[0, 0, 0, 3.3],
- second=[4.4, 0, 0, 0],
- third=[0, pd.NA, 0, 0],
- )
- ).astype("Sparse")
- assert is_sparse(dfs)
- dest = os.path.join(tmpdir, "my_sparse_df")
- path = serialize_data(dfs, dest, config=ser_config)
- assert path == f"{dest}.pd"
-
- reloaded = deserialize_data(path, config=ser_config)
- assert isinstance(reloaded, pd.DataFrame)
- assert is_sparse(reloaded)
- assert dfs.compare(reloaded).empty
-
-
-@pytest.mark.use_disk
-def test_serialize_pandas_dataframe_reload_as_dense(tmpdir):
- import pandas as pd
-
- ser_config = ns(
- pandas_serializer="pickle", sparse_dataframe_deserialized_format="dense"
- )
- dfs = pd.DataFrame(
- dict(
- first=[0, 0, 0, 3.3],
- second=[4.4, 0, 0, 0],
- third=[0, pd.NA, 0, 0],
- # fourth=[None, None, 'a', None]
- )
- ).astype("Sparse")
- assert is_sparse(dfs)
- dest = os.path.join(tmpdir, "my_sparse_df")
- path = serialize_data(dfs, dest, config=ser_config)
- assert path == f"{dest}.pd"
-
- reloaded = deserialize_data(path, config=ser_config)
- assert isinstance(reloaded, pd.DataFrame)
- assert not is_sparse(reloaded)
- assert dfs.compare(reloaded).empty
-
-
-@pytest.mark.use_disk
-def test_serialize_pandas_dataframe_reload_as_array(tmpdir):
- import numpy as np
- import pandas as pd
-
- ser_config = ns(
- pandas_serializer="pickle", sparse_dataframe_deserialized_format="array"
- )
- dfs = pd.DataFrame(
- dict(
- first=[0, 0, 0, 3.3],
- second=[4.4, 0, 0, 0],
- third=[0, pd.NA, 0, 0],
- # fourth=[None, None, 'a', None]
- )
- ).astype("Sparse")
- assert is_sparse(dfs)
- dest = os.path.join(tmpdir, "my_sparse_df")
- path = serialize_data(dfs, dest, config=ser_config)
- assert path == f"{dest}.pd"
-
- reloaded = deserialize_data(path, config=ser_config)
- assert isinstance(reloaded, np.ndarray)
- assert np.array_equal(dfs.to_numpy(), np.asarray(reloaded), equal_nan=True)
-
-
@pytest.mark.use_disk
def test_serialize_sparse_numerical_dataframe_to_parquet(tmpdir):
import pandas as pd
From 79538567f7285924e0438248419730e60e8f490b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 21:33:13 +0200
Subject: [PATCH 05/12] Be more explicit about the script failing with uncaught
exception
---
frameworks/shared/callee.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py
index 6aba88c95..4c3c4b680 100644
--- a/frameworks/shared/callee.py
+++ b/frameworks/shared/callee.py
@@ -93,6 +93,7 @@ def load_data(name, path, **_):
path = os.path.join(config.result_dir, ".".join([name, "data"]))
res[name] = serialize_data(arr, path, config=ser_config)
except BaseException as e:
+ log.error("Integration script failed with uncaught exception:")
log.exception(e)
res = dict(error_message=str(e), models_count=0)
finally:
From a08b1556367e0bda4b1f35bdaa5b37df3cb0d953 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 22:05:44 +0200
Subject: [PATCH 06/12] Add debugging step to job: cache is not restored with
cov in bin
---
.github/workflows/run_all_frameworks.yml | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml
index 07f7e7bfc..3c9f18477 100644
--- a/.github/workflows/run_all_frameworks.yml
+++ b/.github/workflows/run_all_frameworks.yml
@@ -164,6 +164,12 @@ jobs:
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
python -m pip install "coverage[toml]"
+ - name: Check Things
+ run: |
+ ls -lah venv
+ ls -lah venv/bin
+ ls -lah /home/runner/work/automlbenchmark/automlbenchmark/venv/bin
+ echo $(pwd)
- name: Run ${{ matrix.framework }} on ${{ matrix.task }}
run: |
source venv/bin/activate
From c0a9e646a8b7a3d52e29312fd180ef806f998471 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 22:19:09 +0200
Subject: [PATCH 07/12] More debug stuff
---
.github/workflows/run_all_frameworks.yml | 3 +++
1 file changed, 3 insertions(+)
diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml
index 3c9f18477..9f13f0535 100644
--- a/.github/workflows/run_all_frameworks.yml
+++ b/.github/workflows/run_all_frameworks.yml
@@ -173,6 +173,9 @@ jobs:
- name: Run ${{ matrix.framework }} on ${{ matrix.task }}
run: |
source venv/bin/activate
+ ls /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage
+ /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help
+ coverage --help
coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e
coverage xml
env:
From cf8ca91d5c31b0db98da493c13831262dde3c052 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 22:19:28 +0200
Subject: [PATCH 08/12] Only install packages not already installed by
framework
---
frameworks/shared/setup.sh | 20 +++++++++++++-------
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/frameworks/shared/setup.sh b/frameworks/shared/setup.sh
index ddf8a755d..b9460e9ee 100755
--- a/frameworks/shared/setup.sh
+++ b/frameworks/shared/setup.sh
@@ -44,13 +44,19 @@ PIP() {
$pip_exec "$@"
}
-#if [[ -x "$(command -v $PY_VENV/bin/activate)" ]]; then
-# $PY_ROOT/activate
-#fi
-
-#echo "PY=$(command -v PY)"
-#echo "PIP=$(command -v PIP)"
echo "PY=$py_exec"
echo "PIP=$pip_exec"
-PIP install --no-cache-dir -r $SHARED_DIR/requirements.txt
+REQ_FILE="$SHARED_DIR/requirements.txt"
+
+for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
+ pkg=$(echo "$line" | sed -E 's/[=><~!].*$//')
+ # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line
+
+ if ! PY -c "import $pkg" &> /dev/null; then
+ echo "$pkg not found. Installing from requirements.txt..."
+ PIP install --no-cache-dir "$line"
+ else
+ echo "$pkg is already installed by the framework, using that instead."
+ fi
+done
From 89ff448ee09092bf0a2961121dc2ccaa5410e76e Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 22:28:18 +0200
Subject: [PATCH 09/12] Trying to figure out why coverage won't work even though
it's there
---
.github/workflows/run_all_frameworks.yml | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml
index 9f13f0535..5238f44d2 100644
--- a/.github/workflows/run_all_frameworks.yml
+++ b/.github/workflows/run_all_frameworks.yml
@@ -174,6 +174,11 @@ jobs:
run: |
source venv/bin/activate
ls /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage
+ python -m pip list
+ python -m pip install "coverage[toml]"
+ echo $(pwd)
+ which coverage
+ coverage --help
/home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help
coverage --help
coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e
From af9911769256dc6455c0ae134c657a97a34732fb Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 10 Jun 2025 22:55:47 +0200
Subject: [PATCH 10/12] Disable caching
---
.github/workflows/run_all_frameworks.yml | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml
index 5238f44d2..62e7af804 100644
--- a/.github/workflows/run_all_frameworks.yml
+++ b/.github/workflows/run_all_frameworks.yml
@@ -102,6 +102,7 @@ jobs:
- name: Create venv
run: python -m venv venv
- uses: actions/cache@v3
+ if: failure()
id: cache
with:
path: /home/runner/work/automlbenchmark/automlbenchmark/venv
@@ -151,6 +152,7 @@ jobs:
- name: Create venv
run: python -m venv venv
- uses: actions/cache@v3
+ if: failure()
id: cache
with:
path: /home/runner/work/automlbenchmark/automlbenchmark/venv
@@ -173,14 +175,6 @@ jobs:
- name: Run ${{ matrix.framework }} on ${{ matrix.task }}
run: |
source venv/bin/activate
- ls /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage
- python -m pip list
- python -m pip install "coverage[toml]"
- echo $(pwd)
- which coverage
- coverage --help
- /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help
- coverage --help
coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e
coverage xml
env:
From 44225cdd0cdeab1c61922303de15bed735e8526b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 12 Jun 2025 20:22:47 +0200
Subject: [PATCH 11/12] temporarily move installation of deps to ag script
---
frameworks/AutoGluon/setup.sh | 15 +++++++++++++++
frameworks/shared/setup.sh | 14 --------------
2 files changed, 15 insertions(+), 14 deletions(-)
diff --git a/frameworks/AutoGluon/setup.sh b/frameworks/AutoGluon/setup.sh
index bac656d29..f43750fd6 100755
--- a/frameworks/AutoGluon/setup.sh
+++ b/frameworks/AutoGluon/setup.sh
@@ -61,3 +61,18 @@ fi
echo "Finished setup, testing autogluon install..."
PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed"
+
+echo "Installing AMLB dependencies into AutoGluon venv"
+REQ_FILE="${HERE}/../shared/requirements.txt"
+
+for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
+ pkg=$(echo "$line" | sed -E 's/[=><~!].*$//')
+ # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line
+
+ if ! PY -c "import $pkg" &> /dev/null; then
+ echo "$pkg not found. Installing from requirements.txt..."
+ PIP install --no-cache-dir "$line"
+ else
+ echo "$pkg is already installed by the framework, using that instead."
+ fi
+done
diff --git a/frameworks/shared/setup.sh b/frameworks/shared/setup.sh
index b9460e9ee..f8715f89c 100755
--- a/frameworks/shared/setup.sh
+++ b/frameworks/shared/setup.sh
@@ -46,17 +46,3 @@ PIP() {
echo "PY=$py_exec"
echo "PIP=$pip_exec"
-
-REQ_FILE="$SHARED_DIR/requirements.txt"
-
-for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do
- pkg=$(echo "$line" | sed -E 's/[=><~!].*$//')
- # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line
-
- if ! PY -c "import $pkg" &> /dev/null; then
- echo "$pkg not found. Installing from requirements.txt..."
- PIP install --no-cache-dir "$line"
- else
- echo "$pkg is already installed by the framework, using that instead."
- fi
-done
From 0f933485ff5ed62582d01dd8c7b909829eccaf32 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 12 Jun 2025 20:52:59 +0200
Subject: [PATCH 12/12] Set high verbosity
---
frameworks/AutoGluon/exec.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py
index ec5b93b3d..04f5ca7dd 100644
--- a/frameworks/AutoGluon/exec.py
+++ b/frameworks/AutoGluon/exec.py
@@ -100,6 +100,7 @@ def run(dataset, config):
eval_metric=perf_metric.name,
path=models_dir,
problem_type=problem_type,
+ verbosity=4,
).fit(train_data=train_path, time_limit=time_limit, **training_params)
log.info(f"Finished fit in {training.duration}s.")