Skip to content

Commit 485f903

Browse files
committed
Mock response from the production server for dataset description
1 parent 0560829 commit 485f903

3 files changed

Lines changed: 46 additions & 0 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ test=[
7474
"pytest-rerunfailures",
7575
"mypy",
7676
"ruff",
77+
"requests-mock",
7778
]
7879
examples=[
7980
"matplotlib",
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
<oml:data_set_description xmlns:oml="http://openml.org/openml">
2+
<oml:id>61</oml:id>
3+
<oml:name>iris</oml:name>
4+
<oml:version>1</oml:version>
5+
<oml:description>**Author**: R.A. Fisher
6+
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall
7+
**Please cite**:
8+
9+
**Iris Plants Database**
10+
This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda &amp; Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other.
11+
12+
Predicted attribute: class of iris plant.
13+
This is an exceedingly simple domain.
14+
15+
### Attribute Information:
16+
1. sepal length in cm
17+
2. sepal width in cm
18+
3. petal length in cm
19+
4. petal width in cm
20+
5. class:
21+
-- Iris Setosa
22+
-- Iris Versicolour
23+
-- Iris Virginica</oml:description>
24+
<oml:description_version>4</oml:description_version>
25+
<oml:format>ARFF</oml:format>
26+
<oml:creator>R.A. Fisher</oml:creator> <oml:collection_date>1936</oml:collection_date> <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
27+
<oml:language>English</oml:language> <oml:licence>Public</oml:licence> <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
28+
<oml:parquet_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:parquet_url> <oml:file_id>61</oml:file_id> <oml:default_target_attribute>class</oml:default_target_attribute> <oml:version_label>1</oml:version_label> <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation> <oml:tag>Botany</oml:tag><oml:tag>Ecology</oml:tag><oml:tag>Kaggle</oml:tag><oml:tag>Machine Learning</oml:tag><oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag> <oml:visibility>public</oml:visibility> <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url> <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url> <oml:minio_url>https://data.openml.org/datasets/0000/0061/dataset_61.pq</oml:minio_url> <oml:status>active</oml:status>
29+
<oml:processing_date>2020-11-20 19:02:18</oml:processing_date> <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
30+
</oml:data_set_description>

tests/test_datasets/test_dataset_functions.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import pandas as pd
1818
import pytest
1919
import requests
20+
import requests_mock
2021
import scipy.sparse
2122
from oslo_concurrency import lockutils
2223

@@ -1960,3 +1961,17 @@ def test_read_features_from_xml_with_whitespace() -> None:
19601961
features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
19611962
dict = _read_features(features_file)
19621963
assert dict[1].nominal_values == [" - 50000.", " 50000+."]
1964+
1965+
1966+
def test_get_dataset_parquet(requests_mock, test_files_directory):
    """Fetch dataset 61 against a mocked description and expect a parquet file.

    The description request is answered from a stored XML response because
    parquet functionality is disabled on the test server: there is no
    parquet copy of the test server yet.
    """
    mock_dir = test_files_directory / "mock_responses" / "datasets"
    description_xml = (mock_dir / "data_description_61.xml").read_text()
    requests_mock.get(
        "https://www.openml.org/api/v1/xml/data/61",
        text=description_xml,
    )

    dataset = openml.datasets.get_dataset(61, download_data=True)

    assert dataset._parquet_url is not None
    assert dataset.parquet_file is not None
    assert os.path.isfile(dataset.parquet_file)
    # `data_file` is an alias for the ARFF path
    assert dataset.data_file is None

0 commit comments

Comments
 (0)