diff --git a/pyproject.toml b/pyproject.toml
index 83f0793f7..202712027 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,7 @@ test=[
   "pytest-rerunfailures",
   "mypy",
   "ruff",
+  "requests-mock",
 ]
 examples=[
   "matplotlib",
diff --git a/tests/files/mock_responses/datasets/data_description_61.xml b/tests/files/mock_responses/datasets/data_description_61.xml
new file mode 100644
index 000000000..fc25e5861
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_description_61.xml
@@ -0,0 +1,30 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>61</oml:id>
+  <oml:name>iris</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: R.A. Fisher
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall
+**Please cite**:
+
+**Iris Plants Database**
+This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.
+This is an exceedingly simple domain.
+
+### Attribute Information:
+ 1. sepal length in cm
+ 2. sepal width in cm
+ 3. petal length in cm
+ 4. petal width in cm
+ 5. class:
+ -- Iris Setosa
+ -- Iris Versicolour
+ -- Iris Virginica</oml:description>
+  <oml:description_version>4</oml:description_version>
+  <oml:format>ARFF</oml:format>
+  <oml:creator>R.A. Fisher</oml:creator> <oml:collection_date>1936</oml:collection_date> <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
+  <oml:language>English</oml:language> <oml:licence>Public</oml:licence> <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
+  <oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url> <oml:file_id>61</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation> <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag> <oml:visibility>public</oml:visibility> <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url> <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url> <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url> <oml:status>active</oml:status>
+  <oml:processing_date>2020-11-20 19:02:18</oml:processing_date> <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 1dc9daab1..33c853f3a 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -17,6 +17,7 @@
 import pandas as pd
 import pytest
 import requests
+import requests_mock
 import scipy.sparse
 from oslo_concurrency import lockutils
 
@@ -1505,16 +1506,6 @@ def test_data_fork(self):
             data_id=999999,
         )
 
-    @pytest.mark.production()
-    def test_get_dataset_parquet(self):
-        # Parquet functionality is disabled on the test server
-        # There is no parquet-copy of the test server yet.
-        openml.config.server = self.production_server
-        dataset = openml.datasets.get_dataset(61, download_data=True)
-        assert dataset._parquet_url is not None
-        assert dataset.parquet_file is not None
-        assert os.path.isfile(dataset.parquet_file)
-        assert dataset.data_file is None  # is alias for arff path
 
     @pytest.mark.production()
     def test_list_datasets_with_high_size_parameter(self):
@@ -1960,3 +1951,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
     features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
     dict = _read_features(features_file)
     assert dict[1].nominal_values == [" - 50000.", " 50000+."]
+
+
+def test_get_dataset_parquet(requests_mock, test_files_directory):
+    # Parquet functionality is disabled on the test server
+    # There is no parquet-copy of the test server yet.
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
+    dataset = openml.datasets.get_dataset(61, download_data=True)
+    assert dataset._parquet_url is not None
+    assert dataset.parquet_file is not None
+    assert os.path.isfile(dataset.parquet_file)
+    assert dataset.data_file is None  # is alias for arff path