openml · Taniya-Das · Jun 19, 2025 · Jun 19, 2025 · Jun 20, 2025
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -308,3 +308,60 @@ def workdir(tmp_path):
     os.chdir(tmp_path)
     yield tmp_path
     os.chdir(original_cwd)
+
+@pytest.fixture
+def mock_iris_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the iris dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
+
+    yield
+
+@pytest.fixture
+def mock_titanic_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the titanic dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_40945.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/40945", text=content_file.read_text())
+
+    yield
+
+
+@pytest.fixture
+def mock_dataset_id_2(requests_mock, test_files_directory):
+    """Fixture to provide the dataset ID 2."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "2" / "description.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/2", text=content_file.read_text())
+
+    data_file = (
+        test_files_directory / "mock_responses" / "datasets" / "2" / "dataset.arff"
+    )
+    requests_mock.get("https://api.openml.org/data/v1/download/1666876/anneal.arff", text=data_file.read_text())
+
+
+    yield
+
+@pytest.fixture
+def mock_jm1_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the JM1 dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_1053.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/1053", text=content_file.read_text())
+
+    yield
+
+@pytest.fixture
+def mock_pc4_dataset(requests_mock, test_files_directory):
+    """Fixture to provide the PC4 dataset."""
+    content_file = (
+        test_files_directory / "mock_responses" / "datasets" / "data_description_1049.xml"
+    )
+    requests_mock.get("https://www.openml.org/api/v1/xml/data/1049", text=content_file.read_text())
+
+    yield
diff --git a/tests/files/mock_responses/datasets/40945/description.xml b/tests/files/mock_responses/datasets/40945/description.xml
@@ -0,0 +1,26 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>40945</oml:id>
+  <oml:name>Titanic</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: Frank E. Harrell Jr., Thomas Cason  
+**Source**: [Vanderbilt Biostatistics](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.html)  
+**Please cite**:   
+
+The original Titanic dataset, describing the survival status of individual passengers on the Titanic. The titanic data does not contain information from the crew, but it does contain actual ages of half of the passengers. The principal source for data about Titanic passengers is the Encyclopedia Titanica. The datasets used here were begun by a variety of researchers. One of the original sources is Eaton &amp; Haas (1994) Titanic: Triumph and Tragedy, Patrick Stephens Ltd, which includes a passenger list created by many researchers and edited by Michael A. Findlay.
+
+Thomas Cason of UVa has greatly updated and improved the titanic data frame using the Encyclopedia Titanica and created the dataset here. Some duplicate passengers have been dropped, many errors corrected, many missing ages filled in, and new variables created. 
+
+For more information about how this dataset was constructed:
+http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3info.txt
+
+
+### Attribute information  
+
+The variables on our extracted dataset are pclass, survived, name, age, embarked, home.dest, room, ticket, boat, and sex. pclass refers to passenger class (1st, 2nd, 3rd), and is a proxy for socio-economic class. Age is in years, and some infants had fractional values. The titanic2 data frame has no missing data and includes records for the crew, but age is dichotomized at adult vs. child. These data were obtained from Robert Dawson, Saint Mary's University, E-mail. The variables are pclass, age, sex, survived. These data frames are useful for demonstrating many of the functions in Hmisc as well as demonstrating binary logistic regression analysis using the Design library. For more details and references see Simonoff, Jeffrey S (1997): The &quot;unusual episode&quot; and a second statistics course. J Statistics Education, Vol. 5 No. 1.</oml:description>
+  <oml:description_version>7</oml:description_version>
+  <oml:format>ARFF</oml:format>
+        <oml:upload_date>2017-10-16T01:17:36</oml:upload_date>
+    <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/16826755/Titanic.arff</oml:url>
+  <oml:parquet_url>http://openml1.win.tue.nl/dataset40945/dataset_40945.pq</oml:parquet_url>  <oml:file_id>16826755</oml:file_id>  <oml:default_target_attribute>survived</oml:default_target_attribute>          <oml:tag>text_data</oml:tag>  <oml:visibility>public</oml:visibility>      <oml:minio_url>http://openml1.win.tue.nl/dataset40945/dataset_40945.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2018-10-04 07:19:36</oml:processing_date>      <oml:md5_checksum>60ac7205eee0ba5045c90b3bba95b1c4</oml:md5_checksum>
+</oml:data_set_description>
diff --git a/tests/files/mock_responses/datasets/61/description.xml b/tests/files/mock_responses/datasets/61/description.xml
@@ -0,0 +1,30 @@
+<oml:data_set_description xmlns:oml="http://openml.org/openml">
+  <oml:id>61</oml:id>
+  <oml:name>iris</oml:name>
+  <oml:version>1</oml:version>
+  <oml:description>**Author**: R.A. Fisher  
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  
+**Please cite**:   
+
+**Iris Plants Database**  
+This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda &amp; Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.
+
+Predicted attribute: class of iris plant.  
+This is an exceedingly simple domain.  
+
+### Attribute Information:
+    1. sepal length in cm
+    2. sepal width in cm
+    3. petal length in cm
+    4. petal width in cm
+    5. class: 
+       -- Iris Setosa
+       -- Iris Versicolour
+       -- Iris Virginica</oml:description>
+  <oml:description_version>4</oml:description_version>
+  <oml:format>ARFF</oml:format>
+  <oml:creator>R.A. Fisher</oml:creator>     <oml:collection_date>1936</oml:collection_date>  <oml:upload_date>2014-04-06T23:23:39</oml:upload_date>
+  <oml:language>English</oml:language>  <oml:licence>Public</oml:licence>  <oml:url>https://api.openml.org/data/v1/download/61/iris.arff</oml:url>
+  <oml:parquet_url>http://openml1.win.tue.nl/dataset61/dataset_61.pq</oml:parquet_url>  <oml:file_id>61</oml:file_id>  <oml:default_target_attribute>class</oml:default_target_attribute>      <oml:version_label>1</oml:version_label>  <oml:citation>https://archive.ics.uci.edu/ml/citation_policy.html</oml:citation>  <oml:tag>study_1</oml:tag><oml:tag>study_25</oml:tag><oml:tag>study_4</oml:tag><oml:tag>study_41</oml:tag><oml:tag>study_50</oml:tag><oml:tag>study_52</oml:tag><oml:tag>study_7</oml:tag><oml:tag>study_86</oml:tag><oml:tag>study_88</oml:tag><oml:tag>study_89</oml:tag><oml:tag>uci</oml:tag>  <oml:visibility>public</oml:visibility>  <oml:original_data_url>https://archive.ics.uci.edu/ml/datasets/Iris</oml:original_data_url>  <oml:paper_url>http://digital.library.adelaide.edu.au/dspace/handle/2440/15227</oml:paper_url>  <oml:minio_url>http://openml1.win.tue.nl/dataset61/dataset_61.pq</oml:minio_url>  <oml:status>active</oml:status>
+  <oml:processing_date>2020-11-20 19:02:18</oml:processing_date>      <oml:md5_checksum>ad484452702105cbf3d30f8deaba39a9</oml:md5_checksum>
+</oml:data_set_description>