From 6f8c664cd0f7804072d425d6606d07b9ac480c70 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 20:08:09 +0200 Subject: [PATCH 01/12] Remove the need to pass back the output file since it is known --- examples/custom/extensions/GradientBoosting/exec.py | 1 - examples/custom/extensions/Stacking/exec.py | 1 - frameworks/AutoGluon/exec.py | 1 - frameworks/AutoGluon/exec_ts.py | 1 - frameworks/FEDOT/exec.py | 1 - frameworks/FEDOT/exec_ts.py | 1 - frameworks/GAMA/exec.py | 1 - frameworks/H2OAutoML/exec.py | 1 - frameworks/MLPlan/exec.py | 1 - frameworks/NaiveAutoML/exec.py | 1 - frameworks/RandomForest/exec.py | 1 - frameworks/SapientML/exec.py | 2 -- frameworks/TPOT/exec.py | 1 - frameworks/TunedRandomForest/exec.py | 1 - frameworks/autosklearn/exec.py | 1 - frameworks/flaml/exec.py | 1 - frameworks/hyperoptsklearn/exec.py | 1 - frameworks/lightautoml/exec.py | 1 - frameworks/mljarsupervised/exec.py | 1 - frameworks/oboe/exec.py | 1 - frameworks/shared/callee.py | 3 ++- 21 files changed, 2 insertions(+), 22 deletions(-) diff --git a/examples/custom/extensions/GradientBoosting/exec.py b/examples/custom/extensions/GradientBoosting/exec.py index 072eb1944..0a36066db 100644 --- a/examples/custom/extensions/GradientBoosting/exec.py +++ b/examples/custom/extensions/GradientBoosting/exec.py @@ -32,7 +32,6 @@ def run(dataset: Dataset, config: TaskConfig): save_predictions( dataset=dataset, - output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, diff --git a/examples/custom/extensions/Stacking/exec.py b/examples/custom/extensions/Stacking/exec.py index c213185f9..365ba41ff 100644 --- a/examples/custom/extensions/Stacking/exec.py +++ b/examples/custom/extensions/Stacking/exec.py @@ -133,7 +133,6 @@ def run(dataset, config): probabilities = estimator.predict_proba(X_test) if is_classification else None return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, 
probabilities=probabilities, diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py index f833485ef..ec5b93b3d 100644 --- a/frameworks/AutoGluon/exec.py +++ b/frameworks/AutoGluon/exec.py @@ -183,7 +183,6 @@ def inference_time_regression(data: Union[str, pd.DataFrame]): shutil.rmtree(predictor.path, ignore_errors=True) return result( - output_file=config.output_predictions_file, predictions=predictions, probabilities=probabilities, probabilities_labels=prob_labels, diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index 71ecfb3d2..5615633fc 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -95,7 +95,6 @@ def run(dataset, config): get_reusable_executor().shutdown(wait=True) return result( - output_file=config.output_predictions_file, predictions=predictions_only, truth=truth_only, target_is_encoded=False, diff --git a/frameworks/FEDOT/exec.py b/frameworks/FEDOT/exec.py index 0baba5010..a180ad0af 100644 --- a/frameworks/FEDOT/exec.py +++ b/frameworks/FEDOT/exec.py @@ -53,7 +53,6 @@ def run(dataset, config): save_artifacts(fedot, config) return result( - output_file=config.output_predictions_file, predictions=predictions, truth=dataset.test.y, probabilities=probabilities, diff --git a/frameworks/FEDOT/exec_ts.py b/frameworks/FEDOT/exec_ts.py index e0fb3fc10..d45c5c5f9 100644 --- a/frameworks/FEDOT/exec_ts.py +++ b/frameworks/FEDOT/exec_ts.py @@ -104,7 +104,6 @@ def run(dataset, config): save_artifacts(fedot, config) return result( - output_file=config.output_predictions_file, predictions=all_series_predictions, truth=truth_only, target_is_encoded=False, diff --git a/frameworks/GAMA/exec.py b/frameworks/GAMA/exec.py index a7f5ce10e..f1607057e 100644 --- a/frameworks/GAMA/exec.py +++ b/frameworks/GAMA/exec.py @@ -123,7 +123,6 @@ def infer(data: Union[str, pd.DataFrame]): probabilities = gama_automl.predict_proba(X_test) return result( - output_file=config.output_predictions_file, 
predictions=predictions, probabilities=probabilities, truth=y_test, diff --git a/frameworks/H2OAutoML/exec.py b/frameworks/H2OAutoML/exec.py index 009ed49c7..6fd579311 100644 --- a/frameworks/H2OAutoML/exec.py +++ b/frameworks/H2OAutoML/exec.py @@ -203,7 +203,6 @@ def infer(path: str): save_artifacts(aml, dataset=dataset, config=config) return result( - output_file=config.output_predictions_file, predictions=preds.predictions, truth=preds.truth, probabilities=preds.probabilities, diff --git a/frameworks/MLPlan/exec.py b/frameworks/MLPlan/exec.py index 6e58fe599..4976e9ef0 100644 --- a/frameworks/MLPlan/exec.py +++ b/frameworks/MLPlan/exec.py @@ -125,7 +125,6 @@ def run(dataset, config): target_encoded = False return result( - output_file=config.output_predictions_file, predictions=predictions, truth=truth, probabilities=probabilities, diff --git a/frameworks/NaiveAutoML/exec.py b/frameworks/NaiveAutoML/exec.py index 67921ff7c..dae84a4fc 100644 --- a/frameworks/NaiveAutoML/exec.py +++ b/frameworks/NaiveAutoML/exec.py @@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]): save_artifacts(automl, config) return result( - output_file=config.output_predictions_file, predictions=predictions, probabilities=probabilities, truth=dataset.test.y, diff --git a/frameworks/RandomForest/exec.py b/frameworks/RandomForest/exec.py index 4be16ac32..e2f6dc2e5 100644 --- a/frameworks/RandomForest/exec.py +++ b/frameworks/RandomForest/exec.py @@ -132,7 +132,6 @@ def infer(data): log.info("Finished inference time measurements.") return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/SapientML/exec.py b/frameworks/SapientML/exec.py index 0a80ba883..e58491fb4 100644 --- a/frameworks/SapientML/exec.py +++ b/frameworks/SapientML/exec.py @@ -79,7 +79,6 @@ def run(dataset, config): ) return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, 
probabilities=probabilities, @@ -88,7 +87,6 @@ def run(dataset, config): ) else: return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, training_duration=training.duration, diff --git a/frameworks/TPOT/exec.py b/frameworks/TPOT/exec.py index 4883a6f3f..e403679e7 100644 --- a/frameworks/TPOT/exec.py +++ b/frameworks/TPOT/exec.py @@ -131,7 +131,6 @@ def infer(data): save_artifacts(tpot, config) return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/TunedRandomForest/exec.py b/frameworks/TunedRandomForest/exec.py index 1fbd6e2af..9a4569a2d 100644 --- a/frameworks/TunedRandomForest/exec.py +++ b/frameworks/TunedRandomForest/exec.py @@ -286,7 +286,6 @@ def infer(data): log.info("Finished inference time measurements.") return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py index d42eb23fb..23c09a91c 100644 --- a/frameworks/autosklearn/exec.py +++ b/frameworks/autosklearn/exec.py @@ -207,7 +207,6 @@ def sample_one_test_row(seed: int): save_artifacts(auto_sklearn, config) return result( - output_file=config.output_predictions_file, predictions=predictions, truth=dataset.test.y if use_pandas else dataset.test.y_enc, probabilities=probabilities, diff --git a/frameworks/flaml/exec.py b/frameworks/flaml/exec.py index 370f93696..a67af2663 100644 --- a/frameworks/flaml/exec.py +++ b/frameworks/flaml/exec.py @@ -91,7 +91,6 @@ def infer(data: Union[str, pd.DataFrame]): log.info(f"Finished predict in {predict.duration}s.") return result( - output_file=config.output_predictions_file, probabilities=probabilities, predictions=predictions, truth=y_test, diff --git a/frameworks/hyperoptsklearn/exec.py b/frameworks/hyperoptsklearn/exec.py index 48e2217b1..ce90c4c46 100644 --- 
a/frameworks/hyperoptsklearn/exec.py +++ b/frameworks/hyperoptsklearn/exec.py @@ -117,7 +117,6 @@ def default(): probabilities = None return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/lightautoml/exec.py b/frameworks/lightautoml/exec.py index d2ccf0a21..f25d011e3 100644 --- a/frameworks/lightautoml/exec.py +++ b/frameworks/lightautoml/exec.py @@ -99,7 +99,6 @@ def infer(data: Union[str, pd.DataFrame]): save_artifacts(automl, config) return result( - output_file=config.output_predictions_file, probabilities_labels=probabilities_labels, probabilities=probabilities, predictions=predictions, diff --git a/frameworks/mljarsupervised/exec.py b/frameworks/mljarsupervised/exec.py index 44003c45c..6d5d4f15c 100644 --- a/frameworks/mljarsupervised/exec.py +++ b/frameworks/mljarsupervised/exec.py @@ -105,7 +105,6 @@ def infer(data: Union[str, pd.DataFrame]): shutil.rmtree(results_path, ignore_errors=True) return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/oboe/exec.py b/frameworks/oboe/exec.py index ebb0232a6..3e30b07db 100644 --- a/frameworks/oboe/exec.py +++ b/frameworks/oboe/exec.py @@ -123,7 +123,6 @@ def aml_models(): probabilities = None return result( - output_file=config.output_predictions_file, predictions=predictions, truth=y_test, probabilities=probabilities, diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py index 23b41620a..a6f073a3e 100644 --- a/frameworks/shared/callee.py +++ b/frameworks/shared/callee.py @@ -27,7 +27,6 @@ class FrameworkError(Exception): def result( - output_file=None, predictions=None, truth=None, probabilities=None, @@ -107,6 +106,8 @@ def load_data(name, path, **_): ) json_dump(inference_measurements, inference_file, style="compact") res["others"]["inference_times"] = str(inference_file) + + res["output_file"] = 
config.output_predictions_file json_dump(res, config.result_file, style="compact") From 0896e1fa4e38cafb0c890aa0fd4463300eb0b01d Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 20:30:24 +0200 Subject: [PATCH 02/12] Serialize numpy object arrays through JSON instead of pickle --- amlb/utils/serialization.py | 15 ++++++++++----- frameworks/shared/callee.py | 2 +- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py index 5beff1213..9622eae05 100644 --- a/amlb/utils/serialization.py +++ b/amlb/utils/serialization.py @@ -1,4 +1,5 @@ import logging + import math import os import pickle @@ -34,8 +35,6 @@ def _import_data_libraries(): # mainly intended to serialize simple data structures like lists. # allowed=['pickle', 'json'] fallback_serializer="json", - # if numpy can use pickle to serialize ndarrays, - numpy_allow_pickle=True, # format used to serialize pandas dataframes/series between processes. # allowed=['pickle', 'parquet', 'hdf', 'json'] pandas_serializer="parquet", @@ -163,8 +162,14 @@ def serialize_data(data, path, config: Optional[ns] = None): root, ext = os.path.splitext(path) np, pd, sp = _import_data_libraries() if np and isinstance(data, np.ndarray): - path = f"{root}.npy" - np.save(path, data, allow_pickle=config.numpy_allow_pickle) + if data.dtype == "object": + # Numpy cannot save object arrays without pickle + path = f"{root}.json" + data = data.squeeze().tolist() + json_dump(data, path, style="compact") + else: + path = f"{root}.npy" + np.save(path, data, allow_pickle=False) elif sp and isinstance(data, sp.spmatrix): # use custom extension to recognize sparsed matrices from file name. # .npz is automatically appended if missing, and can also potentially be used for numpy arrays. 
@@ -212,7 +217,7 @@ def deserialize_data(path, config: Optional[ns] = None): if ext == ".npy": if np is None: raise SerializationError(f"Numpy is required to deserialize {path}.") - return np.load(path, allow_pickle=config.numpy_allow_pickle) + return np.load(path) elif ext == ".npz": _, ext2 = os.path.splitext(base) if ext2 == ".spy": diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py index a6f073a3e..6aba88c95 100644 --- a/frameworks/shared/callee.py +++ b/frameworks/shared/callee.py @@ -107,7 +107,7 @@ def load_data(name, path, **_): json_dump(inference_measurements, inference_file, style="compact") res["others"]["inference_times"] = str(inference_file) - res["output_file"] = config.output_predictions_file + res.setdefault("output_file", config.output_predictions_file) json_dump(res, config.result_file, style="compact") From 9f1047a04a56841598412caa59325e1b2b616692 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 20:50:37 +0200 Subject: [PATCH 03/12] Remove some options I don't want to support (mostly pickle) --- amlb/utils/serialization.py | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py index 9622eae05..ea01befcc 100644 --- a/amlb/utils/serialization.py +++ b/amlb/utils/serialization.py @@ -6,6 +6,7 @@ import re from typing import Optional + from .core import Namespace as ns, json_dump, json_load from .process import profile @@ -34,9 +35,10 @@ def _import_data_libraries(): # the serializer to use when there's no specific serializer available. # mainly intended to serialize simple data structures like lists. # allowed=['pickle', 'json'] - fallback_serializer="json", + # OPTION REMOVED: Only JSON is allowed. Pickle is evil. + # fallback_serializer="json", # format used to serialize pandas dataframes/series between processes. 
- # allowed=['pickle', 'parquet', 'hdf', 'json'] + # allowed=['parquet', 'json'] pandas_serializer="parquet", # the compression format used when serializing pandas dataframes/series. # allowed=[None, 'infer', 'bz2', 'gzip'] @@ -182,9 +184,7 @@ def serialize_data(data, path, config: Optional[ns] = None): # for example, 'true' and 'false' are converted automatically to booleans, even for column names… data.rename(str, axis="columns", inplace=True) ser = config.pandas_serializer - if ser == "pickle": - data.to_pickle(path, compression=config.pandas_compression) - elif ser == "parquet": + if ser == "parquet": if isinstance(data, pd.Series): data = pd.DataFrame({__series__: data}) # parquet serialization doesn't support sparse dataframes @@ -194,18 +194,15 @@ def serialize_data(data, path, config: Optional[ns] = None): json_dump(dtypes, f"{path}.dtypes", style="compact") data = unsparsify(data) data.to_parquet(path, compression=config.pandas_parquet_compression) - elif ser == "hdf": - data.to_hdf(path, os.path.basename(path), mode="w", format="table") elif ser == "json": data.to_json(path, compression=config.pandas_compression) - else: # fallback serializer - if config.fallback_serializer == "json": - path = f"{root}.json" - json_dump(data, path, style="compact") else: - path = f"{root}.pkl" - with open(path, "wb") as f: - pickle.dump(data, f) + raise ValueError( + f"Invalid pandas serialization {ser} must be 'parquet' or 'json'" + ) + else: # fallback serializer + path = f"{root}.json" + json_dump(data, path, style="compact") return path From 2759e2ef82f9a323c70d76bf3be3f2f3594861b3 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 21:32:53 +0200 Subject: [PATCH 04/12] Remove pickle serialization tests --- .../utils/serialization/test_serializers.py | 101 ------------------ 1 file changed, 101 deletions(-) diff --git a/tests/unit/amlb/utils/serialization/test_serializers.py b/tests/unit/amlb/utils/serialization/test_serializers.py index 
4e1cd4fde..157e79070 100644 --- a/tests/unit/amlb/utils/serialization/test_serializers.py +++ b/tests/unit/amlb/utils/serialization/test_serializers.py @@ -18,18 +18,6 @@ def test_serialize_list_json(tmpdir): assert li == reloaded -@pytest.mark.use_disk -def test_serialize_list_pickle(tmpdir): - li = [[1, 2.2, None, 3, 4.4, "foo", True], ["bar", False, 2 / 3]] - dest = os.path.join(tmpdir, "my_list") - path = serialize_data(li, dest, config=ns(fallback_serializer="pickle")) - assert path == f"{dest}.pkl" - - reloaded = deserialize_data(path) - assert isinstance(reloaded, list) - assert li == reloaded - - @pytest.mark.use_disk def test_serialize_dict_json(tmpdir): di = dict(first=[1, 2.2, None, 3, 4.4, "foo", True], second=["bar", False, 2 / 3]) @@ -42,18 +30,6 @@ def test_serialize_dict_json(tmpdir): assert di == reloaded -@pytest.mark.use_disk -def test_serialize_dict_pickle(tmpdir): - di = dict(first=[1, 2.2, None, 3, 4.4, "foo", True], second=["bar", False, 2 / 3]) - dest = os.path.join(tmpdir, "my_dict") - path = serialize_data(di, dest, config=ns(fallback_serializer="pickle")) - assert path == f"{dest}.pkl" - - reloaded = deserialize_data(path) - assert isinstance(reloaded, dict) - assert di == reloaded - - @pytest.mark.use_disk def test_serialize_numpy_array(tmpdir): import numpy as np @@ -156,83 +132,6 @@ def test_serialize_sparse_matrix_reload_as_array(tmpdir): assert np.array_equal(mat.toarray(), reloaded, equal_nan=True) -@pytest.mark.use_disk -def test_serialize_sparse_dataframe(tmpdir): - import pandas as pd - - ser_config = ns( - pandas_serializer="pickle", sparse_dataframe_deserialized_format=None - ) - dfs = pd.DataFrame( - dict( - first=[0, 0, 0, 3.3], - second=[4.4, 0, 0, 0], - third=[0, pd.NA, 0, 0], - ) - ).astype("Sparse") - assert is_sparse(dfs) - dest = os.path.join(tmpdir, "my_sparse_df") - path = serialize_data(dfs, dest, config=ser_config) - assert path == f"{dest}.pd" - - reloaded = deserialize_data(path, config=ser_config) - assert 
isinstance(reloaded, pd.DataFrame) - assert is_sparse(reloaded) - assert dfs.compare(reloaded).empty - - -@pytest.mark.use_disk -def test_serialize_pandas_dataframe_reload_as_dense(tmpdir): - import pandas as pd - - ser_config = ns( - pandas_serializer="pickle", sparse_dataframe_deserialized_format="dense" - ) - dfs = pd.DataFrame( - dict( - first=[0, 0, 0, 3.3], - second=[4.4, 0, 0, 0], - third=[0, pd.NA, 0, 0], - # fourth=[None, None, 'a', None] - ) - ).astype("Sparse") - assert is_sparse(dfs) - dest = os.path.join(tmpdir, "my_sparse_df") - path = serialize_data(dfs, dest, config=ser_config) - assert path == f"{dest}.pd" - - reloaded = deserialize_data(path, config=ser_config) - assert isinstance(reloaded, pd.DataFrame) - assert not is_sparse(reloaded) - assert dfs.compare(reloaded).empty - - -@pytest.mark.use_disk -def test_serialize_pandas_dataframe_reload_as_array(tmpdir): - import numpy as np - import pandas as pd - - ser_config = ns( - pandas_serializer="pickle", sparse_dataframe_deserialized_format="array" - ) - dfs = pd.DataFrame( - dict( - first=[0, 0, 0, 3.3], - second=[4.4, 0, 0, 0], - third=[0, pd.NA, 0, 0], - # fourth=[None, None, 'a', None] - ) - ).astype("Sparse") - assert is_sparse(dfs) - dest = os.path.join(tmpdir, "my_sparse_df") - path = serialize_data(dfs, dest, config=ser_config) - assert path == f"{dest}.pd" - - reloaded = deserialize_data(path, config=ser_config) - assert isinstance(reloaded, np.ndarray) - assert np.array_equal(dfs.to_numpy(), np.asarray(reloaded), equal_nan=True) - - @pytest.mark.use_disk def test_serialize_sparse_numerical_dataframe_to_parquet(tmpdir): import pandas as pd From 79538567f7285924e0438248419730e60e8f490b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 21:33:13 +0200 Subject: [PATCH 05/12] Be more explicit about the script failing with uncaught exception --- frameworks/shared/callee.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frameworks/shared/callee.py b/frameworks/shared/callee.py 
index 6aba88c95..4c3c4b680 100644 --- a/frameworks/shared/callee.py +++ b/frameworks/shared/callee.py @@ -93,6 +93,7 @@ def load_data(name, path, **_): path = os.path.join(config.result_dir, ".".join([name, "data"])) res[name] = serialize_data(arr, path, config=ser_config) except BaseException as e: + log.error("Integration script failed with uncaught exception:") log.exception(e) res = dict(error_message=str(e), models_count=0) finally: From a08b1556367e0bda4b1f35bdaa5b37df3cb0d953 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 22:05:44 +0200 Subject: [PATCH 06/12] Add debugging step to job.. cache is not restored with cov in bin --- .github/workflows/run_all_frameworks.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index 07f7e7bfc..3c9f18477 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -164,6 +164,12 @@ jobs: python -m pip install --upgrade pip python -m pip install -r requirements.txt python -m pip install "coverage[toml]" + - name: Check Things + run: | + ls -lah venv + ls -lah venv/bin + ls -lah /home/runner/work/automlbenchmark/automlbenchmark/venv/bin + echo $(pwd) - name: Run ${{ matrix.framework }} on ${{ matrix.task }} run: | source venv/bin/activate From c0a9e646a8b7a3d52e29312fd180ef806f998471 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 22:19:09 +0200 Subject: [PATCH 07/12] More debug stuff --- .github/workflows/run_all_frameworks.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index 3c9f18477..9f13f0535 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -173,6 +173,9 @@ jobs: - name: Run ${{ matrix.framework }} on ${{ matrix.task }} run: | source venv/bin/activate + ls 
/home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage + /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help + coverage --help coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e coverage xml env: From cf8ca91d5c31b0db98da493c13831262dde3c052 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 22:19:28 +0200 Subject: [PATCH 08/12] Only install packages not already installed by framework --- frameworks/shared/setup.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/frameworks/shared/setup.sh b/frameworks/shared/setup.sh index ddf8a755d..b9460e9ee 100755 --- a/frameworks/shared/setup.sh +++ b/frameworks/shared/setup.sh @@ -44,13 +44,19 @@ PIP() { $pip_exec "$@" } -#if [[ -x "$(command -v $PY_VENV/bin/activate)" ]]; then -# $PY_ROOT/activate -#fi - -#echo "PY=$(command -v PY)" -#echo "PIP=$(command -v PIP)" echo "PY=$py_exec" echo "PIP=$pip_exec" -PIP install --no-cache-dir -r $SHARED_DIR/requirements.txt +REQ_FILE="$SHARED_DIR/requirements.txt" + +for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do + pkg=$(echo "$line" | sed -E 's/[=><~!].*$//') + # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line + + if ! PY -c "import $pkg" &> /dev/null; then + echo "$pkg not found. Installing from requirements.txt..." + PIP install --no-cache-dir "$line" + else + echo "$pkg is already installed by the framework, using that instead." 
+ fi +done From 89ff448ee09092bf0a2961121dc2ccaa5410e76e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 22:28:18 +0200 Subject: [PATCH 09/12] Trying to figure out why coverage won't work even though it's there --- .github/workflows/run_all_frameworks.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index 9f13f0535..5238f44d2 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -174,6 +174,11 @@ jobs: run: | source venv/bin/activate ls /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage + python -m pip list + python -m pip install "coverage[toml]" + echo $(pwd) + which coverage + coverage --help /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help coverage --help coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e coverage xml env: From af9911769256dc6455c0ae134c657a97a34732fb Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Tue, 10 Jun 2025 22:55:47 +0200 Subject: [PATCH 10/12] Disable caching --- .github/workflows/run_all_frameworks.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/run_all_frameworks.yml b/.github/workflows/run_all_frameworks.yml index 5238f44d2..62e7af804 100644 --- a/.github/workflows/run_all_frameworks.yml +++ b/.github/workflows/run_all_frameworks.yml @@ -102,6 +102,7 @@ jobs: - name: Create venv run: python -m venv venv - uses: actions/cache@v3 + if: failure() id: cache with: path: /home/runner/work/automlbenchmark/automlbenchmark/venv @@ -151,6 +152,7 @@ jobs: - name: Create venv run: python -m venv venv - uses: actions/cache@v3 + if: failure() id: cache with: path: /home/runner/work/automlbenchmark/automlbenchmark/venv @@ -173,14 +175,6 @@ jobs: - name: Run ${{ matrix.framework }} on ${{ matrix.task }} run: | source venv/bin/activate - ls
/home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage - python -m pip list - python -m pip install "coverage[toml]" - echo $(pwd) - which coverage - coverage --help - /home/runner/work/automlbenchmark/automlbenchmark/venv/bin/coverage --help - coverage --help coverage run -m runbenchmark ${{ matrix.framework }} ${{ matrix.benchmark }} test -f 0 -t ${{ matrix.task }} -e coverage xml env: From 44225cdd0cdeab1c61922303de15bed735e8526b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 12 Jun 2025 20:22:47 +0200 Subject: [PATCH 11/12] temporarily move installation of deps to ag script --- frameworks/AutoGluon/setup.sh | 15 +++++++++++++++ frameworks/shared/setup.sh | 14 -------------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/frameworks/AutoGluon/setup.sh b/frameworks/AutoGluon/setup.sh index bac656d29..f43750fd6 100755 --- a/frameworks/AutoGluon/setup.sh +++ b/frameworks/AutoGluon/setup.sh @@ -61,3 +61,18 @@ fi echo "Finished setup, testing autogluon install..." PY -c "from autogluon.tabular.version import __version__; print(__version__)" >> "${HERE}/.setup/installed" + +echo "Installing AMLB dependencies into AutoGluon venv" +REQ_FILE="${HERE}/../shared/requirements.txt" + +for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do + pkg=$(echo "$line" | sed -E 's/[=><~!].*$//') + # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line + + if ! PY -c "import $pkg" &> /dev/null; then + echo "$pkg not found. Installing from requirements.txt..." + PIP install --no-cache-dir "$line" + else + echo "$pkg is already installed by the framework, using that instead." 
+ fi +done diff --git a/frameworks/shared/setup.sh b/frameworks/shared/setup.sh index b9460e9ee..f8715f89c 100755 --- a/frameworks/shared/setup.sh +++ b/frameworks/shared/setup.sh @@ -46,17 +46,3 @@ PIP() { echo "PY=$py_exec" echo "PIP=$pip_exec" - -REQ_FILE="$SHARED_DIR/requirements.txt" - -for line in $(grep -vE '^\s*#' "$REQ_FILE" | grep -vE '^\s*$'); do - pkg=$(echo "$line" | sed -E 's/[=><~!].*$//') - # In a line like "numpy==1.12.0" then pkg=numpy and line is the whole line - - if ! PY -c "import $pkg" &> /dev/null; then - echo "$pkg not found. Installing from requirements.txt..." - PIP install --no-cache-dir "$line" - else - echo "$pkg is already installed by the framework, using that instead." - fi -done From 0f933485ff5ed62582d01dd8c7b909829eccaf32 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 12 Jun 2025 20:52:59 +0200 Subject: [PATCH 12/12] set high verbosity --- frameworks/AutoGluon/exec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/frameworks/AutoGluon/exec.py b/frameworks/AutoGluon/exec.py index ec5b93b3d..04f5ca7dd 100644 --- a/frameworks/AutoGluon/exec.py +++ b/frameworks/AutoGluon/exec.py @@ -100,6 +100,7 @@ def run(dataset, config): eval_metric=perf_metric.name, path=models_dir, problem_type=problem_type, + verbosity=4, ).fit(train_data=train_path, time_limit=time_limit, **training_params) log.info(f"Finished fit in {training.duration}s.")