From 046442d3ca95d1be5d9c21332f08e8e1463af8d8 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Tue, 17 Jun 2025 10:41:22 +0200
Subject: [PATCH] Mock response from the production server for dataset
description
---
pyproject.toml | 1 +
.../datasets/data_description_61.xml | 30 +++++++++++++++++++
tests/test_datasets/test_dataset_functions.py | 25 +++++++++-------
3 files changed, 46 insertions(+), 10 deletions(-)
create mode 100644 tests/files/mock_responses/datasets/data_description_61.xml
diff --git a/pyproject.toml b/pyproject.toml
index 83f0793f7..202712027 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,7 @@ test=[
"pytest-rerunfailures",
"mypy",
"ruff",
+ "requests-mock",
]
examples=[
"matplotlib",
diff --git a/tests/files/mock_responses/datasets/data_description_61.xml b/tests/files/mock_responses/datasets/data_description_61.xml
new file mode 100644
index 000000000..fc25e5861
--- /dev/null
+++ b/tests/files/mock_responses/datasets/data_description_61.xml
@@ -0,0 +1,30 @@
+
+ 61
+ iris
+ 1
+ **Author**: R.A. Fisher
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall
+**Please cite**:
+
+**Iris Plants Database**
+This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.
+This is an exceedingly simple domain.
+
+### Attribute Information:
+ 1. sepal length in cm
+ 2. sepal width in cm
+ 3. petal length in cm
+ 4. petal width in cm
+ 5. class:
+ -- Iris Setosa
+ -- Iris Versicolour
+ -- Iris Virginica
+ 4
+ ARFF
+ R.A. Fisher 1936 2014-04-06T23:23:39
+ English Public https://api.openml.org/data/v1/download/61/iris.arff
+ https://data.openml.org/datasets/0000/0061/dataset_61.pq 61 class 1 https://archive.ics.uci.edu/ml/citation_policy.html BotanyEcologyKaggleMachine Learningstudy_1study_25study_4study_41study_50study_52study_7study_86study_88study_89uci public https://archive.ics.uci.edu/ml/datasets/Iris http://digital.library.adelaide.edu.au/dspace/handle/2440/15227 https://data.openml.org/datasets/0000/0061/dataset_61.pq active
+ 2020-11-20 19:02:18 ad484452702105cbf3d30f8deaba39a9
+
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 1dc9daab1..33c853f3a 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -17,6 +17,7 @@
import pandas as pd
import pytest
import requests
+import requests_mock
import scipy.sparse
from oslo_concurrency import lockutils
@@ -1505,16 +1506,6 @@ def test_data_fork(self):
data_id=999999,
)
- @pytest.mark.production()
- def test_get_dataset_parquet(self):
- # Parquet functionality is disabled on the test server
- # There is no parquet-copy of the test server yet.
- openml.config.server = self.production_server
- dataset = openml.datasets.get_dataset(61, download_data=True)
- assert dataset._parquet_url is not None
- assert dataset.parquet_file is not None
- assert os.path.isfile(dataset.parquet_file)
- assert dataset.data_file is None # is alias for arff path
@pytest.mark.production()
def test_list_datasets_with_high_size_parameter(self):
@@ -1960,3 +1951,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
dict = _read_features(features_file)
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
+
+
+def test_get_dataset_parquet(requests_mock, test_files_directory):
+    # The test server has no parquet copies, so replay a recorded production response.
+    # Mock the configured server's URL; tests may not point at production by default.
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+    )
+    requests_mock.get(f"{openml.config.server}/data/61", text=content_file.read_text())
+    dataset = openml.datasets.get_dataset(61, download_data=True)
+    assert dataset._parquet_url is not None
+    assert dataset.parquet_file is not None
+    assert os.path.isfile(dataset.parquet_file)
+    assert dataset.data_file is None  # is alias for arff path