diff --git a/openml/config.py b/openml/config.py index 706b74060..3dde45bdd 100644 --- a/openml/config.py +++ b/openml/config.py @@ -150,7 +150,7 @@ def _resolve_default_cache_dir() -> Path: "apikey": "", "server": "https://www.openml.org/api/v1/xml", "cachedir": _resolve_default_cache_dir(), - "avoid_duplicate_runs": True, + "avoid_duplicate_runs": False, "retry_policy": "human", "connection_n_retries": 5, "show_progress": False, diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 06fe49662..666b75c37 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -59,7 +59,7 @@ def run_model_on_task( # noqa: PLR0913 model: Any, task: int | str | OpenMLTask, - avoid_duplicate_runs: bool = True, # noqa: FBT001, FBT002 + avoid_duplicate_runs: bool | None = None, flow_tags: list[str] | None = None, seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 @@ -77,9 +77,10 @@ def run_model_on_task( # noqa: PLR0913 task : OpenMLTask or int or str Task to perform or Task id. This may be a model instead if the first argument is an OpenMLTask. - avoid_duplicate_runs : bool, optional (default=True) + avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. + If not set, it will use the default from your openml configuration (False if unset). flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -104,6 +105,8 @@ def run_model_on_task( # noqa: PLR0913 flow : OpenMLFlow (optional, only if `return_flow` is True). Flow generated from the model. """ + if avoid_duplicate_runs is None: + avoid_duplicate_runs = openml.config.avoid_duplicate_runs if avoid_duplicate_runs and not config.apikey: warnings.warn( "avoid_duplicate_runs is set to True, but no API key is set. " @@ -175,7 +178,7 @@ def get_task_and_type_conversion(_task: int | str | OpenMLTask) -> OpenMLTask: def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 flow: OpenMLFlow, task: OpenMLTask, - avoid_duplicate_runs: bool = True, # noqa: FBT002, FBT001 + avoid_duplicate_runs: bool | None = None, flow_tags: list[str] | None = None, seed: int | None = None, add_local_measures: bool = True, # noqa: FBT001, FBT002 @@ -195,9 +198,10 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 all supervised estimators of scikit learn follow this definition of a model. task : OpenMLTask Task to perform. This may be an OpenMLFlow instead if the first argument is an OpenMLTask. - avoid_duplicate_runs : bool, optional (default=True) + avoid_duplicate_runs : bool, optional (default=None) If True, the run will throw an error if the setup/task combination is already present on the server. This feature requires an internet connection. + If not set, it will use the default from your openml configuration (False if unset). flow_tags : List[str], optional (default=None) A list of tags that the flow should have at creation. seed: int, optional (default=None) @@ -221,6 +225,9 @@ def run_flow_on_task( # noqa: C901, PLR0912, PLR0915, PLR0913 if flow_tags is not None and not isinstance(flow_tags, list): raise ValueError("flow_tags should be a list") + if avoid_duplicate_runs is None: + avoid_duplicate_runs = openml.config.avoid_duplicate_runs + # TODO: At some point in the future do not allow for arguments in old order (changed 6-2018). # Flexibility currently still allowed due to code-snippet in OpenML100 paper (3-2019). if isinstance(flow, OpenMLTask) and isinstance(task, OpenMLFlow): diff --git a/openml/testing.py b/openml/testing.py index f026c6137..547405df0 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -101,7 +101,6 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None: self.cached = True openml.config.apikey = TestBase.apikey self.production_server = "https://www.openml.org/api/v1/xml" - openml.config.avoid_duplicate_runs = False openml.config.set_root_cache_directory(str(self.workdir)) # Increase the number of retries to avoid spurious server failures diff --git a/tests/test_openml/test_config.py b/tests/test_openml/test_config.py index 53d4abe77..0324545a7 100644 --- a/tests/test_openml/test_config.py +++ b/tests/test_openml/test_config.py @@ -175,13 +175,14 @@ def test_configuration_file_not_overwritten_on_load(): def test_configuration_loads_booleans(tmp_path): config_file_content = "avoid_duplicate_runs=true\nshow_progress=false" - with (tmp_path / "config").open("w") as config_file: + tmp_file = tmp_path / "config" + with tmp_file.open("w") as config_file: config_file.write(config_file_content) - read_config = openml.config._parse_config(tmp_path) + read_config = openml.config._parse_config(tmp_file) # Explicit test to avoid truthy/falsy modes of other types - assert True == read_config["avoid_duplicate_runs"] - assert False == read_config["show_progress"] + assert read_config["avoid_duplicate_runs"] is True + assert read_config["show_progress"] is False def test_openml_cache_dir_env_var(tmp_path: Path) -> None: diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 88fa1672b..034b731aa 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -130,7 +130,6 @@ def test_to_from_filesystem_vanilla(self): model=model, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=True, ) @@ -174,7 +173,6 @@ def test_to_from_filesystem_search(self): model=model, task=task, add_local_measures=False, - avoid_duplicate_runs=False, ) cache_path = os.path.join(self.workdir, "runs", str(random.getrandbits(128))) @@ -311,7 +309,6 @@ def test_publish_with_local_loaded_flow(self): flow=flow, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -351,7 +348,6 @@ def test_offline_and_online_run_identical(self): flow=flow, task=task, add_local_measures=False, - avoid_duplicate_runs=False, upload_flow=False, ) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 725421d4f..3b9bcee1a 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -181,14 +181,12 @@ def _rerun_model_and_compare_predictions(self, run_id, model_prime, seed, create run_prime = openml.runs.run_model_on_task( model=model_prime, task=task, - avoid_duplicate_runs=False, seed=seed, ) else: run_prime = openml.runs.run_model_on_task( model=model_prime, task=run.task_id, - avoid_duplicate_runs=False, seed=seed, ) @@ -278,7 +276,6 @@ def _remove_random_state(flow): flow=flow, task=task, seed=seed, - avoid_duplicate_runs=openml.config.avoid_duplicate_runs, ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run.run_id) @@ -414,7 +411,6 @@ def test_run_regression_on_classif_task(self): openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, ) @pytest.mark.sklearn() @@ -969,7 +965,6 @@ def test_initialize_cv_from_run(self): run = openml.runs.run_model_on_task( model=randomsearch, task=task, - avoid_duplicate_runs=False, seed=1, ) run_ = run.publish() @@ -1026,7 +1021,6 @@ def test_local_run_swapped_parameter_order_model(self): run = openml.runs.run_model_on_task( task, clf, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1055,7 +1049,6 @@ def test_local_run_swapped_parameter_order_flow(self): run = openml.runs.run_flow_on_task( task, flow, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1083,7 +1076,6 @@ def test_local_run_metric_score(self): run = openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1142,7 +1134,6 @@ def test_initialize_model_from_run(self): run = openml.runs.run_model_on_task( model=clf, task=task, - avoid_duplicate_runs=False, ) run_ = run.publish() TestBase._mark_entity_for_removal("run", run_.run_id) @@ -1251,7 +1242,6 @@ def test_run_with_illegal_flow_id_after_load(self): run = openml.runs.run_flow_on_task( task=task, flow=flow, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1316,7 +1306,6 @@ def test_run_with_illegal_flow_id_1_after_load(self): run = openml.runs.run_flow_on_task( task=task, flow=flow_new, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1664,7 +1653,6 @@ def test_run_flow_on_task_downloaded_flow(self): run = openml.runs.run_flow_on_task( flow=downloaded_flow, task=task, - avoid_duplicate_runs=False, upload_flow=False, ) @@ -1913,7 +1901,7 @@ def test_delete_run(self): task = openml.tasks.get_task(32) # diabetes; crossvalidation run = openml.runs.run_model_on_task( - model=clf, task=task, seed=rs, avoid_duplicate_runs=False + model=clf, task=task, seed=rs, ) run.publish()