From 8c6cb6d0ec361a186fae77567a2eaf6020a0e32e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 20 Jun 2025 10:31:44 +0200 Subject: [PATCH 1/4] Default to not checking for duplicates This makes it so by default people can use this feature without authenticating with OpenML. --- openml/config.py | 2 +- openml/runs/functions.py | 13 ++++++++++--- openml/testing.py | 1 - tests/test_runs/test_run.py | 4 ---- tests/test_runs/test_run_functions.py | 14 +------------- 5 files changed, 12 insertions(+), 22 deletions(-) diff --git a/openml/config.py b/openml/config.py index 706b74060..3dde45bdd 100644 --- a/openml/config.py +++ b/openml/config.py @@ -150,7 +150,7 @@ def _resolve_default_cache_dir() -> Path: "apikey": "", "server": "https://www.openml.org/api/v1/xml", "cachedir": _resolve_default_cache_dir(), - "avoid_duplicate_runs": True, + "avoid_duplicate_runs": False, "retry_policy": "human", "connection_n_retries": 5, "show_progress": False, diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 06fe49662..f8824f55f 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -59,7 +59,7 @@ def run_model_on_task( # noqa: PLR0913 model: Any, task: int | str | OpenMLTask, - avoid_duplicate_runs: bool = True, # noqa: FBT001, FBT002 + avoid_duplicate_runs: bool | None = None, flow_tags: list[str] | None = None, seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 @@ -77,9 +77,10 @@ def run_model_on_task( # noqa: PLR0913 task : OpenMLTask or int or str Task to perform or Task id. This may be a model instead if the first argument is an OpenMLTask. - avoid_duplicate_runs : bool, optional (default=True) + avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. + If not set, it will use the default from your openml configuration. flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -104,6 +105,8 @@ def run_model_on_task( # noqa: PLR0913 flow : OpenMLFlow (optional, only if `return_flow` is True). Flow generated from the model. """ + if avoid_duplicate_runs is None: + avoid_duplicate_runs = openml.config.avoid_duplicate_runs if avoid_duplicate_runs and not config.apikey: warnings.warn( "avoid_duplicate_runs is set to True, but no API key is set. " @@ -175,7 +178,7 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 flow: OpenMLFlow, task: OpenMLTask, - avoid_duplicate_runs: bool = True, # noqa: FBT002, FBT001 + avoid_duplicate_runs: bool | None = None, flow_tags: list[str] | None = None, seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 @@ -198,6 +201,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 avoid_duplicate_runs : bool, optional (default=True) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. + If not set, it will use the default from your openml configuration. flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -221,6 +225,9 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if flow_tags is not None and not isinstance(flow_tags, list): raise ValueError("flow_tags should be a list") + if avoid_duplicate_runs is None: + avoid_duplicate_runs = openml.config.avoid_duplicate_runs + # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow): diff --git a/openml/testing.py b/openml/testing.py index f026c6137..547405df0 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -101,7 +101,6 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None: self.cached = True openml.config.apikey = TestBase.apikey self.production_server = "https://www.openml.org/api/v1/xml" - openml.config.avoid_duplicate_runs = False openml.config.set_root_cache_directory(str(self.workdir)) # Increase the number of retries to avoid spurious server failures diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 88fa1672b..034b731aa 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -130,7 +130,6 @@ def test_to_from_filesystem_vanilla(self): model=model, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=True, ) @@ -174,7 +173,6 @@ def test_to_from_filesystem_search(self): model=model, task=task, add_local_measures=False, - avoid_duplicate_runs=False, ) cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) @@ -311,7 +309,6 @@ def test_publish_with_local_loaded_flow(self): flow=flow, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -351,7 +348,6 @@ def test_offline_and_online_run_identical(self): flow=flow, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=False, ) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 725421d4f..3b9bcee1a 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -181,14 +181,12 @@ def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create run_prime = openml.runs.run_model_on_task( model=model_prime, task=task, - avoid_duplicate_runs=False, seed=seed, ) else: run_prime = openml.runs.run_model_on_task( model=model_prime, task=run.task_id, - avoid_duplicate_runs=False, seed=seed, ) @@ -278,7 +276,6 @@ def _remove_random_state(flow): flow=flow, task=task, seed=seed, - avoid_duplicate_runs=openml.config.avoid_duplicate_runs, ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) @@ -414,7 +411,6 @@ def test_run_regression_on_classif_task(self): openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, ) @pytest.mark.sklearn() @@ -969,7 +965,6 @@ def test_initialize_cv_from_run(self): run = openml.runs.run_model_on_task( model=randomsearch, task=task, - avoid_duplicate_runs=False, seed=1, ) run_ = run.publish() @@ -1026,7 +1021,6 @@ def test_local_run_swapped_parameter_order_model(self): run = openml.runs.run_model_on_task( task, clf, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1055,7 +1049,6 @@ def test_local_run_swapped_parameter_order_flow(self): run = openml.runs.run_flow_on_task( task, flow, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1083,7 +1076,6 @@ def test_local_run_metric_score(self): run = openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1142,7 +1134,6 @@ def test_initialize_model_from_run(self): run = openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) @@ -1251,7 +1242,6 @@ def test_run_with_illegal_flow_id_after_load(self): run = openml.runs.run_flow_on_task( task=task, flow=flow, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1316,7 +1306,6 @@ def test_run_with_illegal_flow_id_1_after_load(self): run = openml.runs.run_flow_on_task( task=task, flow=flow_new, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1664,7 +1653,6 @@ def test_run_flow_on_task_downloaded_flow(self): run = openml.runs.run_flow_on_task( flow=downloaded_flow, task=task, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1913,7 +1901,7 @@ def test_delete_run(self): task = openml.tasks.get_task(32) # diabetes; crossvalidation run = openml.runs.run_model_on_task( - model=clf, task=task, seed=rs, avoid_duplicate_runs=False + model=clf, task=task, seed=rs, ) run.publish() From a4ff70bb3903d3e0dd58357ca1545ef6d314a465 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 20 Jun 2025 11:06:20 +0200 Subject: [PATCH 2/4] Fix broken test --- tests/test_openml/test_config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 53d4abe77..0324545a7 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -175,13 +175,14 @@ def test_configuration_file_not_overwritten_on_load(): def test_configuration_loads_booleans(tmp_path): config_file_content = "avoid_duplicate_runs=true\nshow_progress=false" - with (tmp_path / "config").open("w") as config_file: + tmp_file = tmp_path / "config" + with tmp_file.open("w") as config_file: config_file.write(config_file_content) - read_config = openml.config._parse_config(tmp_path) + read_config = openml.config._parse_config(tmp_file) # Explicit test to avoid truthy/falsy modes of other types - assert True == read_config["avoid_duplicate_runs"] - assert False == read_config["show_progress"] + assert read_config["avoid_duplicate_runs"] is True + assert read_config["show_progress"] is False def test_openml_cache_dir_env_var(tmp_path: Path) -> None: From d1805e9f143759408176ed6d9c0b90a6bbac2c68 Mon Sep 17 00:00:00 2001 From: Pieter Gijsbers Date: Fri, 20 Jun 2025 11:28:39 +0200 Subject: [PATCH 3/4] Update openml/runs/functions.py --- openml/runs/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index f8824f55f..05a386b51 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -198,7 +198,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 all supervised estimators of scikit learn follow this definition of a model. task : OpenMLTask Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask. - avoid_duplicate_runs : bool, optional (default=True) + avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. If not set, it will use the default from your openml configuration. From 377d31cd1a28d4c87cb29600fb7cc749b5253318 Mon Sep 17 00:00:00 2001 From: Pieter Gijsbers Date: Fri, 20 Jun 2025 11:29:38 +0200 Subject: [PATCH 4/4] Apply suggestions from code review --- openml/runs/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 05a386b51..666b75c37 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -80,7 +80,7 @@ def run_model_on_task( # noqa: PLR0913 avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. - If not set, it will use the default from your openml configuration. + If not set, it will use the default from your openml configuration (False if unset). flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -201,7 +201,7 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. - If not set, it will use the default from your openml configuration. + If not set, it will use the default from your openml configuration (False if unset). flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None)