diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 55a4a354a..31cdff602 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -96,14 +96,22 @@ jobs:
         echo "Repository status before tests: $git_status"
     - name: Show installed dependencies
       run: python -m pip list
-    - name: Run tests on Ubuntu
+    - name: Run tests on Ubuntu Test
      if: matrix.os == 'ubuntu-latest'
      run: |
        if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
        # Most of the time, running only the scikit-learn tests is sufficient
-       if [ ${{ matrix.sklearn-only }} = 'true' ]; then sklearn='-m sklearn'; fi
-       echo pytest -n 4 --durations=20 --dist load -sv $codecov $sklearn -o log_cli=true
-       pytest -n 4 --durations=20 --dist load -sv $codecov $sklearn -o log_cli=true
+       if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and not production'; else marks='not production'; fi
+       echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+       pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+    - name: Run tests on Ubuntu Production
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+       if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi
+       # Most of the time, running only the scikit-learn tests is sufficient
+       if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and production'; else marks='production'; fi
+       echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
+       pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks"
     - name: Run tests on Windows
      if: matrix.os == 'windows-latest'
      run: | # we need a separate step because of the bash-specific if-statement in the previous one.
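
Note on the marker expressions above: pytest's -m flag filters tests by marker expression, so the split into "Ubuntu Test" and "Ubuntu Production" steps only takes effect for tests that actually carry these marks. A minimal sketch of how a test opts in (test names here are hypothetical, purely for illustration; the real marks are applied throughout the repository's test suite):

    import pytest

    @pytest.mark.production()
    def test_against_production_server():
        # picked up by -m "production" (or "sklearn and production" when it
        # also carries the sklearn mark); excluded by -m "not production"
        ...

    @pytest.mark.sklearn()
    def test_sklearn_extension_flow():
        # picked up by -m "sklearn and not production" in the Test step
        ...
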
diff --git a/openml/testing.py b/openml/testing.py
index a3a5806e8..f026c6137 100644
--- a/openml/testing.py
+++ b/openml/testing.py
@@ -101,7 +101,6 @@ def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
         self.cached = True
         openml.config.apikey = TestBase.apikey
         self.production_server = "https://www.openml.org/api/v1/xml"
-        openml.config.server = TestBase.test_server
         openml.config.avoid_duplicate_runs = False
         openml.config.set_root_cache_directory(str(self.workdir))
 
@@ -120,7 +119,6 @@ def tearDown(self) -> None:
                 # one of the files may still be used by another process
                 raise e
 
-        openml.config.server = self.production_server
         openml.config.connection_n_retries = self.connection_n_retries
         openml.config.retry_policy = self.retry_policy
diff --git a/tests/conftest.py b/tests/conftest.py
index b523117c1..968a5a58a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -268,11 +268,15 @@ def as_robot() -> Iterator[None]:
     openml.config.set_retry_policy(policy, n_retries)
 
 
-@pytest.fixture(autouse=True, scope="session")
-def with_test_server():
-    openml.config.start_using_configuration_for_example()
+@pytest.fixture(autouse=True)
+def with_server(request):
+    if "production" in request.keywords:
+        openml.config.server = "https://www.openml.org/api/v1/xml"
+        yield
+        return
+    openml.config.server = "https://test.openml.org/api/v1/xml"
+    openml.config.apikey = "c0c42819af31e706efe1f4b88c23c6c1"
     yield
-    openml.config.stop_using_configuration_for_example()
 
 
 @pytest.fixture(autouse=True)
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index d6b26d864..ad8bbc693 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -1951,7 +1951,8 @@ def test_get_dataset_parquet(requests_mock, test_files_directory):
     content_file = (
         test_files_directory / "mock_responses" / "datasets" / "data_description_61.xml"
     )
-    requests_mock.get("https://www.openml.org/api/v1/xml/data/61", text=content_file.read_text())
+    # While the mocked example is from production, unit tests by default connect to the test server.
+    requests_mock.get("https://test.openml.org/api/v1/xml/data/61", text=content_file.read_text())
     dataset = openml.datasets.get_dataset(61, download_data=True)
     assert dataset._parquet_url is not None
     assert dataset.parquet_file is not None
diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py
index 7235075c0..5023943d7 100644
--- a/tests/test_runs/test_run_functions.py
+++ b/tests/test_runs/test_run_functions.py
@@ -68,7 +68,7 @@ class TestRun(TestBase):
         "task_meta_data": {
             "task_type": TaskType.SUPERVISED_CLASSIFICATION,
             "dataset_id": 16,  # credit-a
-            "estimation_procedure_id": 1,
+            "estimation_procedure_id": 6,
             "target_name": "class",
         },
     }
@@ -81,7 +81,7 @@ class TestRun(TestBase):
         "task_meta_data": {
             "task_type": TaskType.SUPERVISED_CLASSIFICATION,
             "dataset_id": 20,  # diabetes
-            "estimation_procedure_id": 1,
+            "estimation_procedure_id": 5,
             "target_name": "class",
         },
     }
diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py
index d3553262f..d4f2ed9d7 100644
--- a/tests/test_tasks/test_classification_task.py
+++ b/tests/test_tasks/test_classification_task.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import pandas as pd
+import pytest
 
 from openml.tasks import TaskType, get_task
 
@@ -17,14 +18,6 @@ def setUp(self, n_levels: int = 1):
         self.task_type = TaskType.SUPERVISED_CLASSIFICATION
         self.estimation_procedure = 5
 
-    def test_get_X_and_Y(self):
-        X, Y = super().test_get_X_and_Y()
-        assert X.shape == (768, 8)
-        assert isinstance(X, pd.DataFrame)
-        assert Y.shape == (768,)
-        assert isinstance(Y, pd.Series)
-        assert pd.api.types.is_categorical_dtype(Y)
-
     def test_download_task(self):
         task = super().test_download_task()
         assert task.task_id == self.task_id
@@ -34,4 +27,15 @@ def test_class_labels(self):
 
     def test_class_labels(self):
         task = get_task(self.task_id)
-        assert task.class_labels == ["tested_negative", "tested_positive"]
\ No newline at end of file
+        assert task.class_labels == ["tested_negative", "tested_positive"]
+
+
+@pytest.mark.server()
+def test_get_X_and_Y():
+    task = get_task(119)
+    X, Y = task.get_X_and_y()
+    assert X.shape == (768, 8)
+    assert isinstance(X, pd.DataFrame)
+    assert Y.shape == (768,)
+    assert isinstance(Y, pd.Series)
+    assert pd.api.types.is_categorical_dtype(Y)
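
For context on how the "production" mark interacts with the new autouse with_server fixture in tests/conftest.py: every test is rerouted to test.openml.org unless it carries the production mark, in which case the fixture leaves the client on the production endpoint. A hypothetical illustration of the resulting behavior (test names invented, not part of this diff):

    import pytest
    import openml

    @pytest.mark.production()
    def test_routed_to_production():
        # with_server saw "production" in request.keywords and set the
        # production endpoint before this test body runs
        assert openml.config.server == "https://www.openml.org/api/v1/xml"

    def test_routed_to_test_server():
        # unmarked tests are pointed at the test server with the shared API key
        assert openml.config.server == "https://test.openml.org/api/v1/xml"
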