From ec57d8e5ab9f6a09856e0863fed7386971649b85 Mon Sep 17 00:00:00 2001 From: geetu040 Date: Tue, 6 Jan 2026 12:49:07 +0500 Subject: [PATCH 1/3] add marker --- tests/test_datasets/test_dataset.py | 9 ++- tests/test_datasets/test_dataset_functions.py | 67 ++++++++++++++++--- .../test_evaluation_functions.py | 2 + tests/test_flows/test_flow.py | 13 ++-- tests/test_flows/test_flow_functions.py | 7 +- tests/test_openml/test_api_calls.py | 3 + tests/test_runs/test_run.py | 11 +-- tests/test_runs/test_run_functions.py | 50 ++++++++------ tests/test_setups/test_setup_functions.py | 11 +-- tests/test_study/test_study_functions.py | 5 ++ tests/test_tasks/test_classification_task.py | 7 +- tests/test_tasks/test_clustering_task.py | 2 + tests/test_tasks/test_learning_curve_task.py | 6 +- tests/test_tasks/test_regression_task.py | 4 +- tests/test_tasks/test_supervised_task.py | 1 + tests/test_tasks/test_task.py | 3 +- tests/test_tasks/test_task_functions.py | 16 ++++- tests/test_tasks/test_task_methods.py | 2 + tests/test_utils/test_utils.py | 20 +++--- 19 files changed, 170 insertions(+), 69 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 66e9b8554..6dc4c7d5d 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -278,6 +278,7 @@ def test_equality_comparison(self): self.assertNotEqual(self.titanic, "Wrong_object") +@pytest.mark.uses_test_server() def test_tagging(): dataset = openml.datasets.get_dataset(125, download_data=False) @@ -294,7 +295,7 @@ def test_tagging(): datasets = openml.datasets.list_datasets(tag=tag) assert datasets.empty -@pytest.mark.xfail(reason="failures_issue_1544") +@pytest.mark.uses_test_server() def test_get_feature_with_ontology_data_id_11(): # test on car dataset, which has built-in ontology references dataset = openml.datasets.get_dataset(11) @@ -303,6 +304,7 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 +@pytest.mark.uses_test_server() def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -310,6 +312,7 @@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) +@pytest.mark.uses_test_server() def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) @@ -318,6 +321,7 @@ def test_add_same_ontology_multiple_features(): openml.datasets.functions.data_feature_add_ontology(did, i, ontology) +@pytest.mark.uses_test_server() def test_add_illegal_long_ontology(): did = 1 ontology = "http://www.google.com/" + ("a" * 257) @@ -329,6 +333,7 @@ def test_add_illegal_long_ontology(): +@pytest.mark.uses_test_server() def test_add_illegal_url_ontology(): did = 1 ontology = "not_a_url" + str(time()) @@ -400,6 +405,7 @@ def test_get_sparse_categorical_data_id_395(self): assert len(feature.nominal_values) == 25 +@pytest.mark.uses_test_server() def test__read_features(mocker, workdir, static_cache_dir): """Test we read the features from the xml if no cache pickle is available. 
This test also does some simple checks to verify that the features are read correctly @@ -431,6 +437,7 @@ def test__read_features(mocker, workdir, static_cache_dir): assert pickle_mock.dump.call_count == 1 +@pytest.mark.uses_test_server() def test__read_qualities(static_cache_dir, workdir, mocker): """Test we read the qualities from the xml if no cache pickle is available. This test also does some minor checks to ensure that the qualities are read correctly. diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index f8cb1943c..c41664ba7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -107,6 +107,7 @@ def _check_datasets(self, datasets): for did in datasets: self._check_dataset(datasets[did]) + @pytest.mark.uses_test_server() def test_tag_untag_dataset(self): tag = "test_tag_%d" % random.randint(1, 1000000) all_tags = _tag_entity("data", 1, tag) @@ -114,10 +115,12 @@ def test_tag_untag_dataset(self): all_tags = _tag_entity("data", 1, tag, untag=True) assert tag not in all_tags + @pytest.mark.uses_test_server() def test_list_datasets_length(self): datasets = openml.datasets.list_datasets() assert len(datasets) >= 100 + @pytest.mark.uses_test_server() def test_list_datasets_paginate(self): size = 10 max = 100 @@ -132,6 +135,7 @@ def test_list_datasets_paginate(self): categories=["in_preparation", "active", "deactivated"], ) + @pytest.mark.uses_test_server() def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets(tag="NoOneWouldUseThisTagAnyway") assert datasets.empty @@ -155,6 +159,7 @@ def test_check_datasets_active(self): ) openml.config.server = self.test_server + @pytest.mark.uses_test_server() def test_illegal_character_tag(self): dataset = openml.datasets.get_dataset(1) tag = "illegal_tag&" @@ -164,6 +169,7 @@ def test_illegal_character_tag(self): except openml.exceptions.OpenMLServerException as e: assert e.code == 477 + @pytest.mark.uses_test_server() def test_illegal_length_tag(self): dataset = openml.datasets.get_dataset(1) tag = "a" * 65 @@ -205,6 +211,7 @@ def test__name_to_id_with_multiple_active_error(self): error_if_multiple=True, ) + @pytest.mark.uses_test_server() def test__name_to_id_name_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -214,6 +221,7 @@ def test__name_to_id_name_does_not_exist(self): dataset_name="does_not_exist", ) + @pytest.mark.uses_test_server() def test__name_to_id_version_does_not_exist(self): """With multiple active datasets, retrieve the least recent active.""" self.assertRaisesRegex( @@ -224,6 +232,7 @@ def test__name_to_id_version_does_not_exist(self): version=100000, ) + @pytest.mark.uses_test_server() def test_get_datasets_by_name(self): # did 1 and 2 on the test server: dids = ["anneal", "kr-vs-kp"] @@ -231,6 +240,7 @@ def test_get_datasets_by_name(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets_by_mixed(self): # did 1 and 2 on the test server: dids = ["anneal", 2] @@ -238,12 +248,14 @@ def test_get_datasets_by_mixed(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def test_get_datasets(self): dids = [1, 2] datasets = openml.datasets.get_datasets(dids) assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.uses_test_server() def 
test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -262,6 +274,7 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError + @pytest.mark.uses_test_server() def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -280,7 +293,7 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -310,24 +323,28 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() + @pytest.mark.uses_test_server() def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() assert isinstance(X, pd.DataFrame) assert all(isinstance(col, pd.SparseDtype) for col in X.dtypes) + @pytest.mark.uses_test_server() def test_download_rowid(self): # Smoke test which checks that the dataset has the row-id set correctly did = 44 dataset = openml.datasets.get_dataset(did) assert dataset.row_id_attribute == "Counter" + @pytest.mark.uses_test_server() def test__get_dataset_description(self): description = _get_dataset_description(self.workdir, 2) assert isinstance(description, dict) description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) + @pytest.mark.uses_test_server() def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -391,6 +408,7 @@ def test__download_minio_file_works_with_bucket_subdirectory(self): @mock.patch("openml._api_calls._download_minio_file") + @pytest.mark.uses_test_server() def test__get_dataset_parquet_is_cached(self, patch): openml.config.set_root_cache_directory(self.static_cache_dir) patch.side_effect = RuntimeError( @@ -431,18 +449,21 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n + @pytest.mark.uses_test_server() def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() + @pytest.mark.uses_test_server() def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) qualities_xml_path = self.workdir / "qualities.xml" assert qualities_xml_path.exists() + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -465,6 +486,7 @@ def test_get_dataset_force_refresh_cache(self): did_cache_dir, ) + @pytest.mark.uses_test_server() def test_get_dataset_force_refresh_cache_clean_start(self): did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, @@ -501,12 +523,14 @@ def test_deletion_of_cache_dir(self): # get_dataset_description is the only data guaranteed to be downloaded @mock.patch("openml.datasets.functions._get_dataset_description") + @pytest.mark.uses_test_server() def 
test_deletion_of_cache_dir_faulty_download(self, patch): patch.side_effect = Exception("Boom!") self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1) datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 + @pytest.mark.uses_test_server() def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. openml.datasets.get_dataset(3, download_data=True) @@ -532,6 +556,7 @@ def test_publish_dataset(self): ) assert isinstance(dataset.dataset_id, int) + @pytest.mark.uses_test_server() def test__retrieve_class_labels(self): openml.config.set_root_cache_directory(self.static_cache_dir) labels = openml.datasets.get_dataset(2).retrieve_class_labels() @@ -548,6 +573,7 @@ def test__retrieve_class_labels(self): labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name) assert labels == ["COIL", "SHEET"] + @pytest.mark.uses_test_server() def test_upload_dataset_with_url(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -574,6 +600,7 @@ def _assert_status_of_dataset(self, *, did: int, status: str): assert result[did]["status"] == status @pytest.mark.flaky() + @pytest.mark.uses_test_server() def test_data_status(self): dataset = OpenMLDataset( f"{self._get_sentinel()}-UploadTestWithURL", @@ -665,7 +692,7 @@ def test_attributes_arff_from_df_unknown_dtype(self): with pytest.raises(ValueError, match=err_msg): attributes_arff_from_df(df) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T @@ -699,6 +726,7 @@ def test_create_dataset_numpy(self): ), "Uploaded arff does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" + @pytest.mark.uses_test_server() def test_create_dataset_list(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -753,7 +781,7 @@ def test_create_dataset_list(self): ), "Uploaded ARFF does not match original one" assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix( @@ -856,6 +884,7 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) + @pytest.mark.uses_test_server() def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. 
@@ -871,7 +900,7 @@ def test_get_online_dataset_arff(self): return_type=arff.DENSE if d_format == "arff" else arff.COO, ), "ARFF files are not equal" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_topic_api_error(self): # Check server exception when non-admin accessses apis self.assertRaisesRegex( @@ -890,6 +919,7 @@ def test_topic_api_error(self): topic="business", ) + @pytest.mark.uses_test_server() def test_get_online_dataset_format(self): # Phoneme dataset dataset_id = 77 @@ -899,7 +929,7 @@ def test_get_online_dataset_format(self): dataset_id ), "The format of the ARFF files is different" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_pandas(self): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1124,7 +1154,7 @@ def test_ignore_attributes_dataset(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_fetch_ignore_attribute(self): """Test to upload and retrieve dataset and check ignore_attributes""" data = [ @@ -1243,7 +1273,7 @@ def test_create_dataset_row_id_attribute_error(self): paper_url=paper_url, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_create_dataset_row_id_attribute_inference(self): # meta-information name = f"{self._get_sentinel()}-pandas_testing_dataset" @@ -1334,11 +1364,13 @@ def test_create_dataset_attributes_auto_without_df(self): paper_url=paper_url, ) + @pytest.mark.uses_test_server() def test_list_qualities(self): qualities = openml.datasets.list_qualities() assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ -1354,6 +1386,7 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. 
@@ -1386,6 +1419,7 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.uses_test_server() def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1407,7 +1441,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1434,6 +1468,7 @@ def test_data_edit_critical_field(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)), ) + @pytest.mark.uses_test_server() def test_data_edit_requires_field(self): # Check server exception when no field to edit is provided self.assertRaisesRegex( @@ -1446,6 +1481,7 @@ def test_data_edit_requires_field(self): data_id=64, # blood-transfusion-service-center ) + @pytest.mark.uses_test_server() def test_data_edit_requires_valid_dataset(self): # Check server exception when unknown dataset is provided self.assertRaisesRegex( @@ -1456,7 +1492,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1483,6 +1519,7 @@ def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): # Check server exception when a non-owner or non-admin tries to edit critical fields self.assertRaisesRegex( @@ -1494,6 +1531,7 @@ def test_edit_data_user_cannot_edit_critical_field_of_other_users_dataset(self): default_target_attribute="y", ) + @pytest.mark.uses_test_server() def test_data_fork(self): did = 1 result = fork_dataset(did) @@ -1785,6 +1823,7 @@ def all_datasets(): return openml.datasets.list_datasets() +@pytest.mark.uses_test_server() def test_list_datasets(all_datasets: pd.DataFrame): # We can only perform a smoke test here because we test on dynamic # data from the internet... 
@@ -1793,42 +1832,49 @@ def test_list_datasets(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(all_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_tag(all_datasets: pd.DataFrame): tag_datasets = openml.datasets.list_datasets(tag="study_14") assert 0 < len(tag_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(tag_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_size(): datasets = openml.datasets.list_datasets(size=5) assert len(datasets) == 5 _assert_datasets_have_id_and_valid_status(datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): small_datasets = openml.datasets.list_datasets(number_instances="5..100") assert 0 < len(small_datasets) <= len(all_datasets) _assert_datasets_have_id_and_valid_status(small_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(wide_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): five_class_datasets = openml.datasets.list_datasets(number_classes="5") assert 3 <= len(five_class_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(five_class_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) +@pytest.mark.uses_test_server() def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1901,6 +1947,7 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) +@pytest.mark.uses_test_server() def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): @@ -1927,6 +1974,7 @@ def test_get_dataset_lazy_behavior( ) +@pytest.mark.uses_test_server() def test_get_dataset_with_invalid_id() -> None: INVALID_ID = 123819023109238 # Well, at some point this will probably be valid... with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e: @@ -1954,6 +2002,7 @@ def test_read_features_from_xml_with_whitespace() -> None: assert dict[1].nominal_values == [" - 50000.", " 50000+."] +@pytest.mark.uses_test_server() def test_get_dataset_parquet(requests_mock, test_files_directory): # Parquet functionality is disabled on the test server # There is no parquet-copy of the test server yet. 
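Note: this patch marks each server-dependent test individually, which keeps the selection explicit per test. Where every test in a module needs the test server, pytest's module-level pytestmark would be a more compact alternative; a minimal sketch of that option (an illustration only, not what this patch does):

    # Hypothetical alternative: one module-level marker applies to every test
    # in the file, instead of decorating each function/method individually.
    import pytest

    pytestmark = pytest.mark.uses_test_server
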
diff --git a/tests/test_evaluations/test_evaluation_functions.py b/tests/test_evaluations/test_evaluation_functions.py index ffd3d9f78..7009217d6 100644 --- a/tests/test_evaluations/test_evaluation_functions.py +++ b/tests/test_evaluations/test_evaluation_functions.py @@ -155,6 +155,7 @@ def test_evaluation_list_limit(self): ) assert len(evaluations) == 100 + @pytest.mark.uses_test_server() def test_list_evaluations_empty(self): evaluations = openml.evaluations.list_evaluations("unexisting_measure") if len(evaluations) > 0: @@ -232,6 +233,7 @@ def test_evaluation_list_sort(self): test_output = sorted(unsorted_output, reverse=True) assert test_output[:size] == sorted_output + @pytest.mark.uses_test_server() def test_list_evaluation_measures(self): measures = openml.evaluations.list_evaluation_measures() assert isinstance(measures, list) is True diff --git a/tests/test_flows/test_flow.py b/tests/test_flows/test_flow.py index da719d058..99cee6f87 100644 --- a/tests/test_flows/test_flow.py +++ b/tests/test_flows/test_flow.py @@ -102,6 +102,7 @@ def test_get_structure(self): subflow = flow.get_subflow(structure) assert subflow.flow_id == sub_flow_id + @pytest.mark.uses_test_server() def test_tagging(self): flows = openml.flows.list_flows(size=1) flow_id = flows["id"].iloc[0] @@ -119,6 +120,7 @@ def test_tagging(self): flows = openml.flows.list_flows(tag=tag) assert len(flows) == 0 + @pytest.mark.uses_test_server() def test_from_xml_to_xml(self): # Get the raw xml thing # TODO maybe get this via get_flow(), which would have to be refactored @@ -178,7 +180,7 @@ def test_to_xml_from_xml(self): assert new_flow is not flow @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", @@ -220,7 +222,7 @@ def test_publish_existing_flow(self, flow_exists_mock): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_flow_with_similar_components(self): clf = sklearn.ensemble.VotingClassifier( [("lr", sklearn.linear_model.LogisticRegression(solver="lbfgs"))], @@ -271,7 +273,7 @@ def test_publish_flow_with_similar_components(self): TestBase.logger.info(f"collected from {__file__.split('/')[-1]}: {flow3.flow_id}") @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_semi_legal_flow(self): # TODO: Test if parameters are set correctly! # should not throw error as it contains two differentiable forms of @@ -363,6 +365,7 @@ def test_illegal_flow(self): ) self.assertRaises(ValueError, self.extension.model_to_flow, illegal) + @pytest.mark.uses_test_server() def test_nonexisting_flow_exists(self): def get_sentinel(): # Create a unique prefix for the flow. 
Necessary because the flow @@ -380,7 +383,7 @@ def get_sentinel(): assert not flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_existing_flow_exists(self): # create a flow nb = sklearn.naive_bayes.GaussianNB() @@ -421,7 +424,7 @@ def test_existing_flow_exists(self): assert downloaded_flow_id == flow.flow_id @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_sklearn_to_upload_to_flow(self): iris = sklearn.datasets.load_iris() X = iris.data diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py index 0be65ceac..46bc36a94 100644 --- a/tests/test_flows/test_flow_functions.py +++ b/tests/test_flows/test_flow_functions.py @@ -274,12 +274,12 @@ def test_are_flows_equal_ignore_if_older(self): assert_flows_equal(flow, flow, ignore_parameter_values_on_older_children=None) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OrdinalEncoder introduced in 0.20. " "No known models with list of lists parameters in older versions.", ) + @pytest.mark.uses_test_server() def test_sklearn_to_flow_list_of_lists(self): from sklearn.preprocessing import OrdinalEncoder @@ -308,6 +308,7 @@ def test_get_flow1(self): assert flow.external_version is None @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model(self): model = ensemble.RandomForestClassifier(n_estimators=33) extension = openml.extensions.get_extension_by_model(model) @@ -319,6 +320,7 @@ def test_get_flow_reinstantiate_model(self): downloaded_flow = openml.flows.get_flow(flow.flow_id, reinstantiate=True) assert isinstance(downloaded_flow.model, sklearn.ensemble.RandomForestClassifier) + @pytest.mark.uses_test_server() def test_get_flow_reinstantiate_model_no_extension(self): # Flow 10 is a WEKA flow self.assertRaisesRegex( @@ -389,7 +391,7 @@ def test_get_flow_reinstantiate_flow_not_strict_pre_023(self): assert "sklearn==0.19.1" not in flow.dependencies @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_flow_id(self): if self.long_version: list_all = openml.utils._list_all @@ -424,6 +426,7 @@ def test_get_flow_id(self): pytest.skip(reason="Not sure why there should only be one version of this flow.") assert flow_ids_exact_version_True == flow_ids_exact_version_False + @pytest.mark.uses_test_server() def test_delete_flow(self): flow = openml.OpenMLFlow( name="sklearn.dummy.DummyClassifier", diff --git a/tests/test_openml/test_api_calls.py b/tests/test_openml/test_api_calls.py index da6857b6e..a295259ef 100644 --- a/tests/test_openml/test_api_calls.py +++ b/tests/test_openml/test_api_calls.py @@ -15,12 +15,14 @@ class TestConfig(openml.testing.TestBase): + @pytest.mark.uses_test_server() def test_too_long_uri(self): with pytest.raises(openml.exceptions.OpenMLServerError, match="URI too long!"): openml.datasets.list_datasets(data_id=list(range(10000))) @unittest.mock.patch("time.sleep") @unittest.mock.patch("requests.Session") + @pytest.mark.uses_test_server() def test_retry_on_database_error(self, Session_class_mock, _): response_mock = unittest.mock.Mock() response_mock.text = ( @@ -115,6 +117,7 @@ def test_download_minio_failure(mock_minio, tmp_path: Path) -> None: ("task/42", "delete"), # 460 ], ) 
+@pytest.mark.uses_test_server() def test_authentication_endpoints_requiring_api_key_show_relevant_help_link( endpoint: str, method: str, diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 71651d431..1a66b76c0 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,6 +25,7 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds + @pytest.mark.uses_test_server() def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" @@ -118,7 +119,7 @@ def _check_array(array, type_): assert run_prime_trace_content is None @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_vanilla(self): model = Pipeline( [ @@ -154,7 +155,7 @@ def test_to_from_filesystem_vanilla(self): @pytest.mark.sklearn() @pytest.mark.flaky() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_search(self): model = Pipeline( [ @@ -189,7 +190,7 @@ def test_to_from_filesystem_search(self): ) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_to_from_filesystem_no_model(self): model = Pipeline( [("imputer", SimpleImputer(strategy="mean")), ("classifier", DummyClassifier())], @@ -295,7 +296,7 @@ def assert_run_prediction_data(task, run, model): assert_method(y_test, saved_y_test) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_publish_with_local_loaded_flow(self): """ Publish a run tied to a local flow after it has first been saved to @@ -339,7 +340,7 @@ def test_publish_with_local_loaded_flow(self): openml.runs.get_run(loaded_run.run_id) @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_offline_and_online_run_identical(self): extension = SklearnExtension() diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 305d859d9..98b97045a 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -398,6 +398,7 @@ def _check_sample_evaluations( assert evaluation < max_time_allowed @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_regression_on_classif_task(self): task_id = 259 # collins; crossvalidation; has numeric targets @@ -413,8 +414,8 @@ def test_run_regression_on_classif_task(self): task=task, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_check_erronous_sklearn_flow_fails(self): task_id = 115 # diabetes; crossvalidation task = openml.tasks.get_task(task_id) @@ -882,8 +883,8 @@ def test_run_and_upload_maskedarrays(self): ########################################################################## - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_learning_curve_task_1(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -907,8 +908,8 @@ def test_learning_curve_task_1(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + 
@pytest.mark.uses_test_server() def test_learning_curve_task_2(self): task_id = 801 # diabates dataset num_test_instances = 6144 # for learning curve @@ -944,12 +945,12 @@ def test_learning_curve_task_2(self): ) self._check_sample_evaluations(run.sample_evaluations, num_repeats, num_folds, num_samples) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="Pipelines don't support indexing (used for the assert check)", ) + @pytest.mark.uses_test_server() def test_initialize_cv_from_run(self): randomsearch = Pipeline( [ @@ -1023,8 +1024,8 @@ def _test_local_evaluations(self, run): assert alt_scores[idx] >= 0 assert alt_scores[idx] <= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_model(self): clf = DecisionTreeClassifier() australian_task = 595 # Australian; crossvalidation @@ -1039,12 +1040,12 @@ def test_local_run_swapped_parameter_order_model(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_swapped_parameter_order_flow(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1068,12 +1069,12 @@ def test_local_run_swapped_parameter_order_flow(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_local_run_metric_score(self): # construct sci-kit learn classifier clf = Pipeline( @@ -1106,12 +1107,12 @@ def test_online_run_metric_score(self): self._test_local_evaluations(run) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test_initialize_model_from_run(self): clf = sklearn.pipeline.Pipeline( steps=[ @@ -1168,12 +1169,12 @@ def test_initialize_model_from_run(self): assert flowS.components["Imputer"].parameters["strategy"] == '"most_frequent"' assert flowS.components["VarianceThreshold"].parameters["threshold"] == "0.05" - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_exists(self): # would be better to not sentinel these clfs, # so we do not have to perform the actual runs @@ -1228,8 +1229,8 @@ def test__run_exists(self): run_ids = run_exists(task.task_id, setup_exists) assert run_ids, (run_ids, clf) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id(self): # check the case where the user adds an illegal flow id to a # non-existing flo @@ -1248,8 +1249,8 @@ def test_run_with_illegal_flow_id(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + 
@pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_after_load(self): # Same as `test_run_with_illegal_flow_id`, but test this error is also # caught if the run is stored to and loaded from disk first. @@ -1281,6 +1282,7 @@ def test_run_with_illegal_flow_id_after_load(self): TestBase.logger.info(f"collected from test_run_functions: {loaded_run.run_id}") @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1(self): # Check the case where the user adds an illegal flow id to an existing # flow. Comes to a different value error than the previous test @@ -1305,8 +1307,8 @@ def test_run_with_illegal_flow_id_1(self): avoid_duplicate_runs=True, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_run_with_illegal_flow_id_1_after_load(self): # Same as `test_run_with_illegal_flow_id_1`, but test this error is # also caught if the run is stored to and loaded from disk first. @@ -1344,12 +1346,12 @@ def test_run_with_illegal_flow_id_1_after_load(self): loaded_run.publish, ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="OneHotEncoder cannot handle mixed type DataFrame as input", ) + @pytest.mark.uses_test_server() def test__run_task_get_arffcontent(self): task = openml.tasks.get_task(7) # kr-vs-kp; crossvalidation num_instances = 3196 @@ -1450,6 +1452,7 @@ def test_get_runs_list(self): for run in runs.to_dict(orient="index").values(): self._check_run(run) + @pytest.mark.uses_test_server() def test_list_runs_empty(self): runs = openml.runs.list_runs(task=[0]) assert runs.empty @@ -1572,12 +1575,12 @@ def test_get_runs_list_by_tag(self): runs = openml.runs.list_runs(tag="curves", size=2) assert len(runs) >= 1 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_dataframe(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1609,12 +1612,12 @@ def test_run_on_dataset_with_missing_labels_dataframe(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="columntransformer introduction in 0.20.0", ) + @pytest.mark.uses_test_server() def test_run_on_dataset_with_missing_labels_array(self): # Check that _run_task_get_arffcontent works when one of the class # labels only declared in the arff file, but is not present in the @@ -1653,6 +1656,7 @@ def test_run_on_dataset_with_missing_labels_array(self): # repeat, fold, row_id, 6 confidences, prediction and correct label assert len(row) == 12 + @pytest.mark.uses_test_server() def test_get_cached_run(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.runs.functions._get_cached_run(1) @@ -1662,8 +1666,8 @@ def test_get_uncached_run(self): with pytest.raises(openml.exceptions.OpenMLCacheException): openml.runs.functions._get_cached_run(10) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def 
test_run_flow_on_task_downloaded_flow(self): model = sklearn.ensemble.RandomForestClassifier(n_estimators=33) flow = self.extension.model_to_flow(model) @@ -1694,6 +1698,7 @@ def test_format_prediction_non_supervised(self): ): format_prediction(clustering, *ignored_input) + @pytest.mark.uses_test_server() def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1703,7 +1708,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1714,6 +1719,7 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + @pytest.mark.uses_test_server() def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1724,7 +1730,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} @@ -1732,7 +1738,7 @@ def test_format_prediction_task_learning_curve_sample_not_set(self): with pytest.raises(ValueError, match="`sample` can not be none for LearningCurveTask"): format_prediction(learning_curve, *ignored_input, sample=None, proba=probabilities) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_format_prediction_task_regression(self): task_meta_data = self.TEST_SERVER_TASK_REGRESSION["task_meta_data"] _task_id = check_task_existence(**task_meta_data) @@ -1762,12 +1768,12 @@ def test_format_prediction_task_regression(self): - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), reason="SimpleImputer doesn't handle mixed type DataFrame as input", ) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_delete_run(self): rs = np.random.randint(1, 2**31 - 1) clf = sklearn.pipeline.Pipeline( @@ -1863,12 +1869,12 @@ def test_delete_unknown_run(mock_delete, test_files_directory, test_api_key): @pytest.mark.sklearn() -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), reason="couldn't perform local tests successfully w/o bloating RAM", ) @mock.patch("openml_sklearn.SklearnExtension._prevent_optimize_n_jobs") +@pytest.mark.uses_test_server() def test__run_task_get_arffcontent_2(parallel_mock): """Tests if a run executed in parallel is collated correctly.""" task = openml.tasks.get_task(7) # Supervised Classification on kr-vs-kp @@ -1940,7 +1946,6 @@ def 
test__run_task_get_arffcontent_2(parallel_mock): ) -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.21"), @@ -1960,6 +1965,7 @@ def test__run_task_get_arffcontent_2(parallel_mock): (-1, "threading", 10), # the threading backend does preserve mocks even with parallelizing ] ) +@pytest.mark.uses_test_server() def test_joblib_backends(parallel_mock, n_jobs, backend, call_count): """Tests evaluation of a run using various joblib backends and n_jobs.""" if backend is None: diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index a3b698a37..a0469f9a5 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -34,8 +34,8 @@ def setUp(self): self.extension = SklearnExtension() super().setUp() - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() @@ -82,8 +82,8 @@ def _existing_setup_exists(self, classif): setup_id = openml.setups.setup_exists(flow) assert setup_id == run.setup_id - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_1(self): def side_effect(self): self.var_smoothing = 1e-9 @@ -98,14 +98,14 @@ def side_effect(self): nb = sklearn.naive_bayes.GaussianNB() self._existing_setup_exists(nb) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_exisiting_setup_exists_2(self): # Check a flow with one hyperparameter self._existing_setup_exists(sklearn.naive_bayes.GaussianNB()) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) @pytest.mark.sklearn() + @pytest.mark.uses_test_server() def test_existing_setup_exists_3(self): # Check a flow with many hyperparameters self._existing_setup_exists( @@ -147,6 +147,7 @@ def test_setup_list_filter_flow(self): for setup_id in setups: assert setups[setup_id].flow_id == flow_id + @pytest.mark.uses_test_server() def test_list_setups_empty(self): setups = openml.setups.list_setups(setup=[0]) if len(setups) > 0: @@ -167,6 +168,7 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 + @pytest.mark.uses_test_server() def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) @@ -178,6 +180,7 @@ def test_setuplist_offset(self): assert len(all) == size * 2 + @pytest.mark.uses_test_server() def test_get_cached_setup(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.setups.functions._get_cached_setup(1) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 40026592f..839e74cf3 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -73,6 +73,7 @@ def test_get_suite_error(self): ): openml.study.get_suite(123) + @pytest.mark.uses_test_server() def test_publish_benchmark_suite(self): fixture_alias = None fixture_name = "unit tested benchmark suite" @@ -141,13 +142,16 @@ def _test_publish_empty_study_is_allowed(self, explicit: bool): assert study_downloaded.main_entity_type == "run" assert study_downloaded.runs is None + @pytest.mark.uses_test_server() def test_publish_empty_study_explicit(self): 
self._test_publish_empty_study_is_allowed(explicit=True) + @pytest.mark.uses_test_server() def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() + @pytest.mark.uses_test_server() def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -217,6 +221,7 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res + @pytest.mark.uses_test_server() def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 5528cabf2..fed0c0a00 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -26,14 +26,13 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] -@pytest.mark.xfail(reason="failures_issue_1544", strict=False) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_clustering_task.py b/tests/test_tasks/test_clustering_task.py index dcc024388..2bbb015c6 100644 --- a/tests/test_tasks/test_clustering_task.py +++ b/tests/test_tasks/test_clustering_task.py @@ -28,6 +28,7 @@ def test_get_dataset(self): task.get_dataset() @pytest.mark.production() + @pytest.mark.uses_test_server() def test_download_task(self): # no clustering tasks on test server self.use_production_server() @@ -36,6 +37,7 @@ def test_download_task(self): assert task.task_type_id == TaskType.CLUSTERING assert task.dataset_id == 36 + @pytest.mark.uses_test_server() def test_upload_task(self): compatible_datasets = self._get_compatible_rand_dataset() for i in range(100): diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 5f4b3e0ab..fbcbfe9bf 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,7 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -27,14 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def 
test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 0cd2d96e2..a834cdf0f 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,7 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) @@ -58,7 +58,7 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_numeric_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index e5a17a72b..3f7b06ee4 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -28,6 +28,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task.py b/tests/test_tasks/test_task.py index 67f715d2b..b77782847 100644 --- a/tests/test_tasks/test_task.py +++ b/tests/test_tasks/test_task.py @@ -32,10 +32,11 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.uses_test_server() def test_download_task(self): return get_task(self.task_id) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def test_upload_task(self): # We don't know if the task in question already exists, so we try a few times. 
Checking # beforehand would not be an option because a concurrent unit test could potentially diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 110459711..a46e597bc 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -26,6 +26,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test__get_cached_tasks(self): openml.config.set_root_cache_directory(self.static_cache_dir) tasks = openml.tasks.functions._get_cached_tasks() @@ -33,6 +34,7 @@ def test__get_cached_tasks(self): assert len(tasks) == 3 assert isinstance(next(iter(tasks.values())), OpenMLTask) + @pytest.mark.uses_test_server() def test__get_cached_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.functions._get_cached_task(1) @@ -47,6 +49,7 @@ def test__get_cached_task_not_cached(self): 2, ) + @pytest.mark.uses_test_server() def test__get_estimation_procedure_list(self): estimation_procedures = openml.tasks.functions._get_estimation_procedure_list() assert isinstance(estimation_procedures, list) @@ -69,6 +72,7 @@ def _check_task(self, task): assert isinstance(task["status"], str) assert task["status"] in ["in_preparation", "active", "deactivated"] + @pytest.mark.uses_test_server() def test_list_tasks_by_type(self): num_curves_tasks = 198 # number is flexible, check server if fails ttid = TaskType.LEARNING_CURVE @@ -78,15 +82,18 @@ def test_list_tasks_by_type(self): assert ttid == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_length(self): ttid = TaskType.LEARNING_CURVE tasks = openml.tasks.list_tasks(task_type=ttid) assert len(tasks) > 100 + @pytest.mark.uses_test_server() def test_list_tasks_empty(self): tasks = openml.tasks.list_tasks(tag="NoOneWillEverUseThisTag") assert tasks.empty + @pytest.mark.uses_test_server() def test_list_tasks_by_tag(self): num_basic_tasks = 100 # number is flexible, check server if fails tasks = openml.tasks.list_tasks(tag="OpenML100") @@ -94,12 +101,14 @@ def test_list_tasks_by_tag(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks(self): tasks = openml.tasks.list_tasks() assert len(tasks) >= 900 for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_paginate(self): size = 10 max = 100 @@ -109,6 +118,7 @@ def test_list_tasks_paginate(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.uses_test_server() def test_list_tasks_per_type_paginate(self): size = 40 max = 100 @@ -125,6 +135,7 @@ def test_list_tasks_per_type_paginate(self): assert j == task["ttid"] self._check_task(task) + @pytest.mark.uses_test_server() def test__get_task(self): openml.config.set_root_cache_directory(self.static_cache_dir) openml.tasks.get_task(1882) @@ -139,6 +150,7 @@ def test__get_task_live(self): # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) + @pytest.mark.uses_test_server() def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -152,7 +164,7 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) - @pytest.mark.xfail(reason="failures_issue_1544", strict=False) + @pytest.mark.uses_test_server() def 
test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -193,6 +205,7 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) + @pytest.mark.uses_test_server() def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) @@ -208,6 +221,7 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. openml.tasks.functions.get_task(126033) + @pytest.mark.uses_test_server() def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 540c43de0..6b8804b9f 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -16,6 +16,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.uses_test_server() def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars @@ -31,6 +32,7 @@ def test_tagging(self): tasks = openml.tasks.list_tasks(tag=tag) assert len(tasks) == 0 + @pytest.mark.uses_test_server() def test_get_train_and_test_split_indices(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1882) diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py index 35be84903..a1cdb55ea 100644 --- a/tests/test_utils/test_utils.py +++ b/tests/test_utils/test_utils.py @@ -48,18 +48,18 @@ def _mocked_perform_api_call(call, request_method): return openml._api_calls._download_text_file(url) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all(): openml.utils._list_all(listing_call=openml.tasks.functions._list_tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_tasks(min_number_tasks_on_test_server): tasks = openml.tasks.list_tasks(size=min_number_tasks_on_test_server) assert min_number_tasks_on_test_server == len(tasks) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): # By setting the batch size one lower than the minimum we guarantee at least two # batches and at the same time do as few batches (roundtrips) as possible. 
@@ -72,7 +72,7 @@ def test_list_all_with_multiple_batches(min_number_tasks_on_test_server): assert min_number_tasks_on_test_server <= sum(len(batch) for batch in batches) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_datasets(min_number_datasets_on_test_server): datasets = openml.datasets.list_datasets( size=min_number_datasets_on_test_server, @@ -83,29 +83,29 @@ def test_list_all_for_datasets(min_number_datasets_on_test_server): _check_dataset(dataset) -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_list_all_for_flows(min_number_flows_on_test_server): flows = openml.flows.list_flows(size=min_number_flows_on_test_server) assert min_number_flows_on_test_server == len(flows) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_setups(min_number_setups_on_test_server): # TODO apparently list_setups function does not support kwargs setups = openml.setups.list_setups(size=min_number_setups_on_test_server) assert min_number_setups_on_test_server == len(setups) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_runs(min_number_runs_on_test_server): runs = openml.runs.list_runs(size=min_number_runs_on_test_server) assert min_number_runs_on_test_server == len(runs) -@pytest.mark.server() @pytest.mark.flaky() # Other tests might need to upload runs first +@pytest.mark.uses_test_server() def test_list_all_for_evaluations(min_number_evaluations_on_test_server): # TODO apparently list_evaluations function does not support kwargs evaluations = openml.evaluations.list_evaluations( @@ -115,8 +115,8 @@ def test_list_all_for_evaluations(min_number_evaluations_on_test_server): assert min_number_evaluations_on_test_server == len(evaluations) -@pytest.mark.server() @unittest.mock.patch("openml._api_calls._perform_api_call", side_effect=_mocked_perform_api_call) +@pytest.mark.uses_test_server() def test_list_all_few_results_available(_perform_api_call): datasets = openml.datasets.list_datasets(size=1000, data_name="iris", data_version=1) assert len(datasets) == 1, "only one iris dataset version 1 should be present" @@ -141,7 +141,7 @@ def test__create_cache_directory(config_mock, tmp_path): openml.utils._create_cache_directory("ghi") -@pytest.mark.server() +@pytest.mark.uses_test_server() def test_correct_test_server_download_state(): """This test verifies that the test server downloads the data from the correct source. 
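Note on marker registration: for `-m` selection on uses_test_server to work without PytestUnknownMarkWarning, the marker has to be declared in the project's pytest configuration alongside the existing markers (sklearn, production, flaky, ...). Where exactly openml-python registers its markers is not shown in this series; a minimal conftest.py sketch, assuming programmatic registration:

    # conftest.py -- hypothetical sketch; the project may instead declare the
    # marker in pyproject.toml / setup.cfg under its pytest markers list.
    def pytest_configure(config):
        # Registering the marker documents it and silences
        # PytestUnknownMarkWarning for tests decorated with it.
        config.addinivalue_line(
            "markers",
            "uses_test_server: test requires the OpenML test server",
        )
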
From f5d268073c639019840e71d9cb7f1404bd0ae5a4 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Tue, 6 Jan 2026 22:32:11 +0500
Subject: [PATCH 2/3] deselect tests requiring the test server

---
 .github/workflows/test.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 850abdfe7..d65cc3796 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -98,9 +98,9 @@ jobs:
         fi

         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and not production"
+          marks="sklearn and not production and not uses_test_server"
         else
-          marks="not production"
+          marks="not production and not uses_test_server"
         fi

         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -113,9 +113,9 @@ jobs:
         fi

         if [ "${{ matrix.sklearn-only }}" = "true" ]; then
-          marks="sklearn and production"
+          marks="sklearn and production and not uses_test_server"
         else
-          marks="production"
+          marks="production and not uses_test_server"
         fi

         pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
@@ -123,7 +123,7 @@
     - name: Run tests on Windows
       if: matrix.os == 'windows-latest'
      run: | # we need a separate step because of the bash-specific if-statement in the previous one.
-        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1
+        pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 -m "not uses_test_server"

     - name: Check for files left behind by test
       if: matrix.os != 'windows-latest' && always()

From 6c331a9577f8fd9493231c81a08c589b39f25e75 Mon Sep 17 00:00:00 2001
From: geetu040
Date: Tue, 6 Jan 2026 22:46:07 +0500
Subject: [PATCH 3/3] add remaining missing markers

---
 tests/test_runs/test_run_functions.py   | 15 +++++++--------
 tests/test_tasks/test_task_functions.py |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 98b97045a..db54151d1 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -627,8 +627,8 @@ def _run_and_upload_regression(
             sentinel=sentinel,
         )

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_logistic_regression(self):
         lr = LogisticRegression(solver="lbfgs", max_iter=1000)
         task_id = self.TEST_SERVER_TASK_SIMPLE["task_id"]
         n_missing_vals = self.TEST_SERVER_TASK_SIMPLE["n_missing_vals"]
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_linear_regression(self):
         lr = LinearRegression()
         task_id = self.TEST_SERVER_TASK_REGRESSION["task_id"]
@@ -667,8 +667,8 @@ def test_run_and_upload_linear_regression(self):
         n_missing_vals = self.TEST_SERVER_TASK_REGRESSION["n_missing_vals"]
         n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"]
         self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_pipeline_dummy_pipeline(self):
         pipeline1 = Pipeline(
             steps=[
@@ -681,12 +681,12 @@ def test_run_and_upload_pipeline_dummy_pipeline(self):
         n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"]
         self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501")

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_column_transformer_pipeline(self):
         import sklearn.compose
         import sklearn.impute
@@ -746,7 +746,6 @@ def get_ct_cf(nominal_indices, numeric_indices):
             sentinel=sentinel,
         )

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
     @unittest.skip("https://github.com/openml/OpenML/issues/1180")
     @unittest.skipIf(
         Version(sklearn.__version__) < Version("0.20"),
         reason="columntransformer introduction in 0.20.0",
     )
@@ -799,8 +798,8 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
                 call_count += 1
         assert call_count == 3

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_gridsearch(self):
         estimator_name = (
             "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
         )
@@ -823,7 +822,7 @@ def test_run_and_upload_gridsearch(self):
         assert len(run.trace.trace_iterations) == 9

     @pytest.mark.sklearn()
-    @pytest.mark.skip(reason="failures_issue_1544")
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_randomsearch(self):
         randomsearch = RandomizedSearchCV(
             RandomForestClassifier(n_estimators=5),
@@ -855,8 +854,8 @@ def test_run_and_upload_randomsearch(self):
         trace = openml.runs.get_run_trace(run.run_id)
         assert len(trace.trace_iterations) == 5

-    @pytest.mark.skip(reason="failures_issue_1544")
     @pytest.mark.sklearn()
+    @pytest.mark.uses_test_server()
     def test_run_and_upload_maskedarrays(self):
         # This testcase is important for 2 reasons:
         # 1) it verifies the correct handling of masked arrays (not all
diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py
index a46e597bc..3a2b9ea0a 100644
--- a/tests/test_tasks/test_task_functions.py
+++ b/tests/test_tasks/test_task_functions.py
@@ -187,7 +187,7 @@ def test_get_task_lazy(self):
         )

     @mock.patch("openml.tasks.functions.get_dataset")
-    @pytest.mark.xfail(reason="failures_issue_1544")
+    @pytest.mark.uses_test_server()
     def test_removal_upon_download_failure(self, get_dataset):
         class WeirdException(Exception):
             pass