From 02e1c2da1b058ee3fb65c38af06c80040cc69bed Mon Sep 17 00:00:00 2001 From: taniya-das Date: Thu, 19 Jun 2025 19:32:03 +0200 Subject: [PATCH 1/3] lazy conversion --- tests/conftest.py | 20 ++ tests/test_datasets/test_dataset.py | 409 +++++++++++++++------------- 2 files changed, 240 insertions(+), 189 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 40a801e86..bdffb0eec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -308,3 +308,23 @@ def workdir(tmp_path): os.chdir(tmp_path) yield tmp_path os.chdir(original_cwd) + +@pytest.fixture +def mock_iris_dataset(requests_mock, test_files_directory): + """Fixture to provide the iris dataset.""" + content_file = ( + test_files_directory / "mock_responses" / "datasets" / "61" / "description.xml" + ) + requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text()) + + yield + +@pytest.fixture +def mock_titanic_dataset(requests_mock, test_files_directory): + """Fixture to provide the titanic dataset.""" + content_file = ( + test_files_directory / "mock_responses" / "datasets" / "40945" / "description.xml" + ) + requests_mock.get("https://www.openml.org/api/v1/xml/data/40945", text=content_file.read_text()) + + yield diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index c48086a72..18618f63f 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -72,65 +72,11 @@ def test_repr(self): data = openml.datasets.OpenMLDataset(name="somename", description="a description") str(data) - def test_init_string_validation(self): - with pytest.raises(ValueError, match="Invalid symbols ' ' in name"): - openml.datasets.OpenMLDataset(name="some name", description="a description") - - with pytest.raises(ValueError, match="Invalid symbols 'ï' in description"): - openml.datasets.OpenMLDataset(name="somename", description="a descriptïon") - - with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"): - openml.datasets.OpenMLDataset( - name="somename", - description="a description", - citation="Something by Müller", - ) - - def test__unpack_categories_with_nan_likes(self): - # unpack_categories decodes numeric categorical values according to the header - # Containing a 'non' category in the header shouldn't lead to failure. - categories = ["a", "b", None, float("nan"), np.nan] - series = pd.Series([0, 1, None, float("nan"), np.nan, 1, 0]) - clean_series = OpenMLDataset._unpack_categories(series, categories) - - expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"] - self.assertListEqual(list(clean_series.values), expected_values) - self.assertListEqual(list(clean_series.cat.categories.values), list("ab")) - - def test_get_data_pandas(self): - data, _, _, _ = self.titanic.get_data() - assert isinstance(data, pd.DataFrame) - assert data.shape[1] == len(self.titanic.features) - assert data.shape[0] == 1309 - col_dtype = { - "pclass": "uint8", - "survived": "category", - "name": "object", - "sex": "category", - "age": "float64", - "sibsp": "uint8", - "parch": "uint8", - "ticket": "object", - "fare": "float64", - "cabin": "object", - "embarked": "category", - "boat": "object", - "body": "float64", - "home.dest": "object", - } - for col_name in data.columns: - assert data[col_name].dtype.name == col_dtype[col_name] - - X, y, _, _ = self.titanic.get_data( - target=self.titanic.default_target_attribute, - ) - assert isinstance(X, pd.DataFrame) - assert isinstance(y, pd.Series) - assert X.shape == (1309, 13) - assert y.shape == (1309,) - for col_name in X.columns: - assert X[col_name].dtype.name == col_dtype[col_name] - assert y.dtype.name == col_dtype["survived"] + + + + + @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") def test_get_data_boolean_pandas(self): @@ -143,139 +89,224 @@ def test_get_data_boolean_pandas(self): data, _, _, _ = self.pc4.get_data() assert data["c"].dtype.name == "category" assert set(data["c"].cat.categories) == {True, False} + + +@pytest.mark.production +def test_init_string_validation(): + with pytest.raises(ValueError, match="Invalid symbols ' ' in name"): + openml.datasets.OpenMLDataset(name="some name", description="a description") + + with pytest.raises(ValueError, match="Invalid symbols 'ï' in description"): + openml.datasets.OpenMLDataset(name="somename", description="a descriptïon") + + with pytest.raises(ValueError, match="Invalid symbols 'ü' in citation"): + openml.datasets.OpenMLDataset( + name="somename", + description="a description", + citation="Something by Müller", + ) + +@pytest.mark.production +def test__unpack_categories_with_nan_likes(): + # unpack_categories decodes numeric categorical values according to the header + # Containing a 'non' category in the header shouldn't lead to failure. + categories = ["a", "b", None, float("nan"), np.nan] + series = pd.Series([0, 1, None, float("nan"), np.nan, 1, 0]) + clean_series = OpenMLDataset._unpack_categories(series, categories) + + expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"] + assert list(clean_series.values) == expected_values + assert list(clean_series.cat.categories.values) == list("ab") + - def _check_expected_type(self, dtype, is_cat, col): - if is_cat: - expected_type = "category" - elif not col.isna().any() and (col.astype("uint8") == col).all(): - expected_type = "uint8" - else: - expected_type = "float64" - - assert dtype.name == expected_type - - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") - def test_get_data_with_rowid(self): - self.dataset.row_id_attribute = "condition" - rval, _, categorical, _ = self.dataset.get_data(include_row_id=True) - assert isinstance(rval, pd.DataFrame) - for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): - self._check_expected_type(dtype, is_cat, rval[col]) - assert rval.shape == (898, 39) - assert len(categorical) == 39 - - rval, _, categorical, _ = self.dataset.get_data() - assert isinstance(rval, pd.DataFrame) - for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): - self._check_expected_type(dtype, is_cat, rval[col]) - assert rval.shape == (898, 38) - assert len(categorical) == 38 - - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") - def test_get_data_with_target_pandas(self): - X, y, categorical, attribute_names = self.dataset.get_data(target="class") - assert isinstance(X, pd.DataFrame) - for dtype, is_cat, col in zip(X.dtypes, categorical, X): - self._check_expected_type(dtype, is_cat, X[col]) - assert isinstance(y, pd.Series) - assert y.dtype.name == "category" - - assert X.shape == (898, 38) - assert len(attribute_names) == 38 - assert y.shape == (898,) - - assert "class" not in attribute_names - - def test_get_data_rowid_and_ignore_and_target(self): - self.dataset.ignore_attribute = ["condition"] - self.dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = self.dataset.get_data(target="class") - assert X.shape == (898, 36) - assert len(categorical) == 36 - cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 - self.assertListEqual(categorical, cats) - assert y.shape == (898,) +@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") +@pytest.mark.production +def test_get_data_with_rowid(): + dataset = openml.datasets.get_dataset(2, download_data=False) + dataset.row_id_attribute = "condition" + rval, _, categorical, _ = dataset.get_data(include_row_id=True) + assert isinstance(rval, pd.DataFrame) + for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): + _check_expected_type(dtype, is_cat, rval[col]) + assert rval.shape == (898, 39) + assert len(categorical) == 39 + + rval, _, categorical, _ = dataset.get_data() + assert isinstance(rval, pd.DataFrame) + for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): + _check_expected_type(dtype, is_cat, rval[col]) + assert rval.shape == (898, 38) + assert len(categorical) == 38 + +@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") +@pytest.mark.production +def test_get_data_with_target_pandas(): + dataset = openml.datasets.get_dataset(2, download_data=False) + X, y, categorical, attribute_names = dataset.get_data(target="class") + assert isinstance(X, pd.DataFrame) + for dtype, is_cat, col in zip(X.dtypes, categorical, X): + _check_expected_type(dtype, is_cat, X[col]) + assert isinstance(y, pd.Series) + assert y.dtype.name == "category" + + assert X.shape == (898, 38) + assert len(attribute_names) == 38 + assert y.shape == (898,) + + assert "class" not in attribute_names + +def _check_expected_type(dtype, is_cat, col): + if is_cat: + expected_type = "category" + elif not col.isna().any() and (col.astype("uint8") == col).all(): + expected_type = "uint8" + else: + expected_type = "float64" + + assert dtype.name == expected_type + +@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") +@pytest.mark.production +def test_get_data_with_ignore_attributes(): + dataset = openml.datasets.get_dataset(2, download_data=False) + dataset.ignore_attribute = ["condition"] + rval, _, categorical, _ = dataset.get_data(include_ignore_attribute=True) + for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): + _check_expected_type(dtype, is_cat, rval[col]) + assert rval.shape == (898, 39) + assert len(categorical) == 39 + + rval, _, categorical, _ = dataset.get_data(include_ignore_attribute=False) + for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): + _check_expected_type(dtype, is_cat, rval[col]) + assert rval.shape == (898, 38) + assert len(categorical) == 38 + + +@pytest.mark.production +def test_get_data_pandas(): + titanic = openml.datasets.get_dataset(40945, download_data=False) + data, _, _, _ = titanic.get_data() + assert isinstance(data, pd.DataFrame) + assert data.shape[1] == len(titanic.features) + assert data.shape[0] == 1309 + col_dtype = { + "pclass": "uint8", + "survived": "category", + "name": "object", + "sex": "category", + "age": "float64", + "sibsp": "uint8", + "parch": "uint8", + "ticket": "object", + "fare": "float64", + "cabin": "object", + "embarked": "category", + "boat": "object", + "body": "float64", + "home.dest": "object", + } + for col_name in data.columns: + assert data[col_name].dtype.name == col_dtype[col_name] + + X, y, _, _ = titanic.get_data( + target=titanic.default_target_attribute, + ) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + assert X.shape == (1309, 13) + assert y.shape == (1309,) + for col_name in X.columns: + assert X[col_name].dtype.name == col_dtype[col_name] + assert y.dtype.name == col_dtype["survived"] + +@pytest.mark.production +def test_get_data_rowid_and_ignore_and_target(): + dataset = openml.datasets.get_dataset(2, download_data=False) + dataset.ignore_attribute = ["condition"] + dataset.row_id_attribute = ["hardness"] + X, y, categorical, names = dataset.get_data(target="class") + assert X.shape == (898, 36) + assert len(categorical) == 36 + cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 + assert categorical == cats + assert y.shape == (898,) + +@pytest.mark.production +def test_get_data_with_nonexisting_class(): + # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, + # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to + # indices 4 and 5, and that nothing is mapped to index 3. + dataset = openml.datasets.get_dataset(2, download_data=False) + _, y, _, _ = dataset.get_data("class") + assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] + + +@pytest.mark.production +def test_get_data_corrupt_pickle(): + # Lazy loaded dataset, populate cache. + iris = openml.datasets.get_dataset(61, download_data=False) + iris.get_data() + # Corrupt pickle file, overwrite as empty. + with open(iris.data_pickle_file, "w") as fh: + fh.write("") + # Despite the corrupt file, the data should be loaded from the ARFF file. + # A warning message is written to the python logger. + xy, _, _, _ = iris.get_data() + assert isinstance(xy, pd.DataFrame) + assert xy.shape == (150, 5) + +@pytest.mark.production +def test_lazy_loading_metadata(): + # Initial Setup + did_cache_dir = openml.utils._create_cache_directory_for_id( + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + 2, + ) + _compare_dataset = openml.datasets.get_dataset( + 2, + download_data=False, + download_features_meta_data=True, + download_qualities=True, + ) + change_time = os.stat(did_cache_dir).st_mtime + + # Test with cache + _dataset = openml.datasets.get_dataset( + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, + ) + assert change_time == os.stat(did_cache_dir).st_mtime + assert _dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities + + # -- Test without cache + openml.utils._remove_cache_dir_for_id( + openml.datasets.functions.DATASETS_CACHE_DIR_NAME, + did_cache_dir, + ) - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") - def test_get_data_with_ignore_attributes(self): - self.dataset.ignore_attribute = ["condition"] - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=True) - for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): - self._check_expected_type(dtype, is_cat, rval[col]) - assert rval.shape == (898, 39) - assert len(categorical) == 39 - - rval, _, categorical, _ = self.dataset.get_data(include_ignore_attribute=False) - for dtype, is_cat, col in zip(rval.dtypes, categorical, rval): - self._check_expected_type(dtype, is_cat, rval[col]) - assert rval.shape == (898, 38) - assert len(categorical) == 38 - - def test_get_data_with_nonexisting_class(self): - # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, - # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to - # indices 4 and 5, and that nothing is mapped to index 3. - _, y, _, _ = self.dataset.get_data("class") - assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] - - def test_get_data_corrupt_pickle(self): - # Lazy loaded dataset, populate cache. - self.iris.get_data() - # Corrupt pickle file, overwrite as empty. - with open(self.iris.data_pickle_file, "w") as fh: - fh.write("") - # Despite the corrupt file, the data should be loaded from the ARFF file. - # A warning message is written to the python logger. - xy, _, _, _ = self.iris.get_data() - assert isinstance(xy, pd.DataFrame) - assert xy.shape == (150, 5) - - def test_lazy_loading_metadata(self): - # Initial Setup - did_cache_dir = openml.utils._create_cache_directory_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, - 2, - ) - _compare_dataset = openml.datasets.get_dataset( - 2, - download_data=False, - download_features_meta_data=True, - download_qualities=True, - ) - change_time = os.stat(did_cache_dir).st_mtime - - # Test with cache - _dataset = openml.datasets.get_dataset( - 2, - download_data=False, - download_features_meta_data=False, - download_qualities=False, - ) - assert change_time == os.stat(did_cache_dir).st_mtime - assert _dataset.features == _compare_dataset.features - assert _dataset.qualities == _compare_dataset.qualities - - # -- Test without cache - openml.utils._remove_cache_dir_for_id( - openml.datasets.functions.DATASETS_CACHE_DIR_NAME, - did_cache_dir, - ) + _dataset = openml.datasets.get_dataset( + 2, + download_data=False, + download_features_meta_data=False, + download_qualities=False, + ) + assert ["description.xml"] == os.listdir(did_cache_dir) + assert change_time != os.stat(did_cache_dir).st_mtime + assert _dataset.features == _compare_dataset.features + assert _dataset.qualities == _compare_dataset.qualities - _dataset = openml.datasets.get_dataset( - 2, - download_data=False, - download_features_meta_data=False, - download_qualities=False, - ) - assert ["description.xml"] == os.listdir(did_cache_dir) - assert change_time != os.stat(did_cache_dir).st_mtime - assert _dataset.features == _compare_dataset.features - assert _dataset.qualities == _compare_dataset.qualities - - def test_equality_comparison(self): - self.assertEqual(self.iris, self.iris) - self.assertNotEqual(self.iris, self.titanic) - self.assertNotEqual(self.titanic, "Wrong_object") +@pytest.mark.production +def test_equality_comparison(mock_iris_dataset, mock_titanic_dataset): + + iris = openml.datasets.get_dataset(61, download_data=False) + titanic = openml.datasets.get_dataset(40945, download_data=False) + assert iris == iris + assert iris != titanic + assert titanic != "Wrong_object" def test_tagging(): From 0a8cde679f29c02f4e647fb40ff9eb83c9e5a901 Mon Sep 17 00:00:00 2001 From: taniya-das Date: Thu, 19 Jun 2025 19:33:17 +0200 Subject: [PATCH 2/3] test files --- .../datasets/40945/description.xml | 26 ++++++++++++++++ .../datasets/61/description.xml | 30 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 tests/files/mock_responses/datasets/40945/description.xml create mode 100644 tests/files/mock_responses/datasets/61/description.xml diff --git a/tests/files/mock_responses/datasets/40945/description.xml b/tests/files/mock_responses/datasets/40945/description.xml new file mode 100644 index 000000000..c81d870c7 --- /dev/null +++ b/tests/files/mock_responses/datasets/40945/description.xml @@ -0,0 +1,26 @@ + + 40945 + Titanic + 1 + **Author**: Frank E. Harrell Jr., Thomas Cason +**Source**: [Vanderbilt Biostatistics](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html) +**Please cite**: + +The original Titanic dataset, describing the survival status of individual passengers on the Titanic. The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. The principal source for data about Titanic passengers is the Encyclopedia Titanica. The datasets used here were begun by a variety of researchers. One of the original sources is Eaton & Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay. + +Thomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created. + +For more information about how this dataset was constructed: +http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt + + +### Attribute information + +The variables on our extracted dataset are pclass, survived, name, age, embarked, home.dest, room, ticket, boat, and sex. pclass refers to passenger class (1st, 2nd, 3rd), and is a proxy for socio-economic class. Age is in years, and some infants had fractional values. The titanic2 data frame has no missing data and includes records for the crew, but age is dichotomized at adult vs. child. These data were obtained from Robert Dawson, Saint Mary's University, E-mail. The variables are pclass, age, sex, survived. These data frames are useful for demonstrating many of the functions in Hmisc as well as demonstrating binary logistic regression analysis using the Design library. For more details and references see Simonoff, Jeffrey S (1997): The "unusual episode" and a second statistics course. J Statistics Education, Vol. 5 No. 1. + 7 + ARFF + 2017-10-16T01:17:36 + Public https://api.openml.org/data/v1/download/16826755/Titanic.arff + http://openml1.win.tue.nl/dataset40945/dataset_40945.pq 16826755 survived text_data public http://openml1.win.tue.nl/dataset40945/dataset_40945.pq active + 2018-10-04 07:19:36 60ac7205eee0ba5045c90b3bba95b1c4 + diff --git a/tests/files/mock_responses/datasets/61/description.xml b/tests/files/mock_responses/datasets/61/description.xml new file mode 100644 index 000000000..515cceae8 --- /dev/null +++ b/tests/files/mock_responses/datasets/61/description.xml @@ -0,0 +1,30 @@ + + 61 + iris + 1 + **Author**: R.A. Fisher +**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall +**Please cite**: + +**Iris Plants Database** +This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. + +Predicted attribute: class of iris plant. +This is an exceedingly simple domain. + +### Attribute Information: + 1. sepal length in cm + 2. sepal width in cm + 3. petal length in cm + 4. petal width in cm + 5. class: + -- Iris Setosa + -- Iris Versicolour + -- Iris Virginica + 4 + ARFF + R.A. Fisher 1936 2014-04-06T23:23:39 + English Public https://api.openml.org/data/v1/download/61/iris.arff + http://openml1.win.tue.nl/dataset61/dataset_61.pq 61 class 1 https://archive.ics.uci.edu/ml/citation_policy.html study_1study_25study_4study_41study_50study_52study_7study_86study_88study_89uci public https://archive.ics.uci.edu/ml/datasets/Iris http://digital.library.adelaide.edu.au/dspace/handle/2440/15227 http://openml1.win.tue.nl/dataset61/dataset_61.pq active + 2020-11-20 19:02:18 ad484452702105cbf3d30f8deaba39a9 + From 4fb5ed8ef792106f3a01efae646cc4631c159a18 Mon Sep 17 00:00:00 2001 From: taniya-das Date: Fri, 20 Jun 2025 17:56:42 +0200 Subject: [PATCH 3/3] update tests with mock requests --- tests/conftest.py | 41 +++++- tests/test_datasets/test_dataset.py | 192 +++++++++++++++------------- 2 files changed, 142 insertions(+), 91 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bdffb0eec..dd7226def 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -313,7 +313,7 @@ def workdir(tmp_path): def mock_iris_dataset(requests_mock, test_files_directory): """Fixture to provide the iris dataset.""" content_file = ( - test_files_directory / "mock_responses" / "datasets" / "61" / "description.xml" + test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml" ) requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text()) @@ -323,8 +323,45 @@ def mock_iris_dataset(requests_mock, test_files_directory): def mock_titanic_dataset(requests_mock, test_files_directory): """Fixture to provide the titanic dataset.""" content_file = ( - test_files_directory / "mock_responses" / "datasets" / "40945" / "description.xml" + test_files_directory / "mock_responses" / "datasets" / "data_description_40945.xml" ) requests_mock.get("https://www.openml.org/api/v1/xml/data/40945", text=content_file.read_text()) yield + + +@pytest.fixture +def mock_dataset_id_2(requests_mock, test_files_directory): + """Fixture to provide the dataset ID 2.""" + content_file = ( + test_files_directory / "mock_responses" / "datasets" / "2" / "description.xml" + ) + requests_mock.get("https://www.openml.org/api/v1/xml/data/2", text=content_file.read_text()) + + data_file = ( + test_files_directory / "mock_responses" / "datasets" / "2" / "dataset.arff" + ) + requests_mock.get("https://api.openml.org/data/v1/download/1666876/anneal.arff", text=data_file.read_text()) + + + yield + +@pytest.fixture +def mock_jm1_dataset(requests_mock, test_files_directory): + """Fixture to provide the JM1 dataset.""" + content_file = ( + test_files_directory / "mock_responses" / "datasets" / "data_description_1053.xml" + ) + requests_mock.get("https://www.openml.org/api/v1/xml/data/1053", text=content_file.read_text()) + + yield + +@pytest.fixture +def mock_pc4_dataset(requests_mock, test_files_directory): + """Fixture to provide the PC4 dataset.""" + content_file = ( + test_files_directory / "mock_responses" / "datasets" / "data_description_1049.xml" + ) + requests_mock.get("https://www.openml.org/api/v1/xml/data/1049", text=content_file.read_text()) + + yield \ No newline at end of file diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 18618f63f..3055799cf 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -73,24 +73,6 @@ def test_repr(self): str(data) - - - - - - @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") - def test_get_data_boolean_pandas(self): - # test to check that we are converting properly True and False even - # with some inconsistency when dumping the data on openml - data, _, _, _ = self.jm1.get_data() - assert data["defects"].dtype.name == "category" - assert set(data["defects"].cat.categories) == {True, False} - - data, _, _, _ = self.pc4.get_data() - assert data["c"].dtype.name == "category" - assert set(data["c"].cat.categories) == {True, False} - - @pytest.mark.production def test_init_string_validation(): with pytest.raises(ValueError, match="Invalid symbols ' ' in name"): @@ -105,7 +87,7 @@ def test_init_string_validation(): description="a description", citation="Something by Müller", ) - + @pytest.mark.production def test__unpack_categories_with_nan_likes(): # unpack_categories decodes numeric categorical values according to the header @@ -117,11 +99,75 @@ def test__unpack_categories_with_nan_likes(): expected_values = ["a", "b", np.nan, np.nan, np.nan, "b", "a"] assert list(clean_series.values) == expected_values assert list(clean_series.cat.categories.values) == list("ab") + +# expects downloaded data. +@pytest.mark.production +def test_get_data_pandas(mock_titanic_dataset): + titanic = openml.datasets.get_dataset(40945, download_data=False) + data, _, _, _ = titanic.get_data() + assert isinstance(data, pd.DataFrame) + assert data.shape[1] == len(titanic.features) + assert data.shape[0] == 1309 + col_dtype = { + "pclass": "uint8", + "survived": "category", + "name": "object", + "sex": "category", + "age": "float64", + "sibsp": "uint8", + "parch": "uint8", + "ticket": "object", + "fare": "float64", + "cabin": "object", + "embarked": "category", + "boat": "object", + "body": "float64", + "home.dest": "object", + } + for col_name in data.columns: + assert data[col_name].dtype.name == col_dtype[col_name] + + X, y, _, _ = titanic.get_data( + target=titanic.default_target_attribute, + ) + assert isinstance(X, pd.DataFrame) + assert isinstance(y, pd.Series) + assert X.shape == (1309, 13) + assert y.shape == (1309,) + for col_name in X.columns: + assert X[col_name].dtype.name == col_dtype[col_name] + assert y.dtype.name == col_dtype["survived"] + +# Why download = False and then expecting data? +@pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") +@pytest.mark.production +def test_get_data_boolean_pandas(mock_jm1_dataset, mock_pc4_dataset): + # test to check that we are converting properly True and False even + # with some inconsistency when dumping the data on openml + jm1 = openml.datasets.get_dataset(1053, download_data=False) + pc4 = openml.datasets.get_dataset(1049, download_data=False) + data, _, _, _ = jm1.get_data() + assert data["defects"].dtype.name == "category" + assert set(data["defects"].cat.categories) == {True, False} + + data, _, _, _ = pc4.get_data() + assert data["c"].dtype.name == "category" + assert set(data["c"].cat.categories) == {True, False} + +def _check_expected_type(dtype, is_cat, col): + if is_cat: + expected_type = "category" + elif not col.isna().any() and (col.astype("uint8") == col).all(): + expected_type = "uint8" + else: + expected_type = "float64" + + assert dtype.name == expected_type @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") @pytest.mark.production -def test_get_data_with_rowid(): +def test_get_data_with_rowid(mock_dataset_id_2): dataset = openml.datasets.get_dataset(2, download_data=False) dataset.row_id_attribute = "condition" rval, _, categorical, _ = dataset.get_data(include_row_id=True) @@ -138,9 +184,10 @@ def test_get_data_with_rowid(): assert rval.shape == (898, 38) assert len(categorical) == 38 +# same error with check_expected_type. Verify. @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") @pytest.mark.production -def test_get_data_with_target_pandas(): +def test_get_data_with_target_pandas(mock_dataset_id_2): dataset = openml.datasets.get_dataset(2, download_data=False) X, y, categorical, attribute_names = dataset.get_data(target="class") assert isinstance(X, pd.DataFrame) @@ -155,16 +202,20 @@ def test_get_data_with_target_pandas(): assert "class" not in attribute_names -def _check_expected_type(dtype, is_cat, col): - if is_cat: - expected_type = "category" - elif not col.isna().any() and (col.astype("uint8") == col).all(): - expected_type = "uint8" - else: - expected_type = "float64" - assert dtype.name == expected_type +@pytest.mark.production +def test_get_data_rowid_and_ignore_and_target(mock_dataset_id_2): + dataset = openml.datasets.get_dataset(2, download_data=False) + dataset.ignore_attribute = ["condition"] + dataset.row_id_attribute = ["hardness"] + X, y, categorical, names = dataset.get_data(target="class") + assert X.shape == (898, 36) + assert len(categorical) == 36 + cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 + assert categorical == cats + assert y.shape == (898,) +# _check_expected_type error. Verify @pytest.mark.skip("https://github.com/openml/openml-python/issues/1157") @pytest.mark.production def test_get_data_with_ignore_attributes(): @@ -181,59 +232,10 @@ def test_get_data_with_ignore_attributes(): _check_expected_type(dtype, is_cat, rval[col]) assert rval.shape == (898, 38) assert len(categorical) == 38 - - -@pytest.mark.production -def test_get_data_pandas(): - titanic = openml.datasets.get_dataset(40945, download_data=False) - data, _, _, _ = titanic.get_data() - assert isinstance(data, pd.DataFrame) - assert data.shape[1] == len(titanic.features) - assert data.shape[0] == 1309 - col_dtype = { - "pclass": "uint8", - "survived": "category", - "name": "object", - "sex": "category", - "age": "float64", - "sibsp": "uint8", - "parch": "uint8", - "ticket": "object", - "fare": "float64", - "cabin": "object", - "embarked": "category", - "boat": "object", - "body": "float64", - "home.dest": "object", - } - for col_name in data.columns: - assert data[col_name].dtype.name == col_dtype[col_name] - - X, y, _, _ = titanic.get_data( - target=titanic.default_target_attribute, - ) - assert isinstance(X, pd.DataFrame) - assert isinstance(y, pd.Series) - assert X.shape == (1309, 13) - assert y.shape == (1309,) - for col_name in X.columns: - assert X[col_name].dtype.name == col_dtype[col_name] - assert y.dtype.name == col_dtype["survived"] - -@pytest.mark.production -def test_get_data_rowid_and_ignore_and_target(): - dataset = openml.datasets.get_dataset(2, download_data=False) - dataset.ignore_attribute = ["condition"] - dataset.row_id_attribute = ["hardness"] - X, y, categorical, names = dataset.get_data(target="class") - assert X.shape == (898, 36) - assert len(categorical) == 36 - cats = [True] * 3 + [False, True, True, False] + [True] * 23 + [False] * 3 + [True] * 3 - assert categorical == cats - assert y.shape == (898,) - + + @pytest.mark.production -def test_get_data_with_nonexisting_class(): +def test_get_data_with_nonexisting_class(mock_dataset_id_2): # This class is using the anneal dataset with labels [1, 2, 3, 4, 5, 'U']. However, # label 4 does not exist and we test that the features 5 and 'U' are correctly mapped to # indices 4 and 5, and that nothing is mapped to index 3. @@ -241,9 +243,9 @@ def test_get_data_with_nonexisting_class(): _, y, _, _ = dataset.get_data("class") assert list(y.dtype.categories) == ["1", "2", "3", "4", "5", "U"] - + @pytest.mark.production -def test_get_data_corrupt_pickle(): +def test_get_data_corrupt_pickle(mock_iris_dataset): # Lazy loaded dataset, populate cache. iris = openml.datasets.get_dataset(61, download_data=False) iris.get_data() @@ -255,7 +257,17 @@ def test_get_data_corrupt_pickle(): xy, _, _, _ = iris.get_data() assert isinstance(xy, pd.DataFrame) assert xy.shape == (150, 5) - + iris.get_data() + # Corrupt pickle file, overwrite as empty. + with open(iris.data_pickle_file, "w") as fh: + fh.write("") + # Despite the corrupt file, the data should be loaded from the ARFF file. + # A warning message is written to the python logger. + xy, _, _, _ = iris.get_data() + assert isinstance(xy, pd.DataFrame) + assert xy.shape == (150, 5) + +# check again! @pytest.mark.production def test_lazy_loading_metadata(): # Initial Setup @@ -307,8 +319,11 @@ def test_equality_comparison(mock_iris_dataset, mock_titanic_dataset): assert iris == iris assert iris != titanic assert titanic != "Wrong_object" - - + + + + + def test_tagging(): dataset = openml.datasets.get_dataset(125, download_data=False) @@ -358,7 +373,6 @@ def test_add_illegal_long_ontology(): assert e.code == 1105 - def test_add_illegal_url_ontology(): did = 1 ontology = "not_a_url" + str(time()) @@ -367,8 +381,8 @@ def test_add_illegal_url_ontology(): assert False except openml.exceptions.OpenMLServerException as e: assert e.code == 1106 - - + + @pytest.mark.production() class OpenMLDatasetTestSparse(TestBase): _multiprocess_can_split_ = True