From 99ce264144a117a66866844547b66f80184ba999 Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Fri, 5 Jun 2026 15:35:55 -0700
Subject: [PATCH 1/8] fix: make JumpStart private hub integ tests xdist-safe

Under pytest-xdist (-n 120) each worker created its own private hub,
exhausting the per-account hub limit (100) and triggering destructive
cross-worker cleanup that deleted hubs other workers were actively
using, causing "Hub ... does not exist" failures. The add_model_references
fixture also swallowed all errors and did not wait for async reference
propagation, causing "Hub content ... does not exist" failures.

- Share a single hub across all xdist workers via filelock + a JSON
  state file with reference counting; only the last worker tears it down.
- Make _cleanup_old_hubs non-destructive: only delete hubs older than
  STALE_HUB_AGE_HOURS and never the active run's hub.
- Add add_model_references_to_hub helper that creates references
  idempotently (keyed by hub + model set) and polls until each
  reference is resolvable before tests run.
---
 tests/integ/sagemaker/jumpstart/conftest.py   | 134 ++++++++++++++----
 .../test_jumpstart_private_hub_estimator.py   |  22 +--
 .../model/test_jumpstart_private_hub_model.py |  22 +--
 tests/integ/sagemaker/jumpstart/utils.py      |  77 ++++++++++
 4 files changed, 190 insertions(+), 65 deletions(-)

diff --git a/tests/integ/sagemaker/jumpstart/conftest.py b/tests/integ/sagemaker/jumpstart/conftest.py
index 50e062e384..d74eadcf71 100644
--- a/tests/integ/sagemaker/jumpstart/conftest.py
+++ b/tests/integ/sagemaker/jumpstart/conftest.py
@@ -12,9 +12,13 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
+import json
 import os
+from datetime import datetime, timedelta, timezone
+
 import boto3
 import pytest
+from filelock import FileLock
 from botocore.config import Config
 from sagemaker.jumpstart.constants import JUMPSTART_DEFAULT_REGION_NAME
 from sagemaker.jumpstart.hub.hub import Hub
@@ -39,19 +43,23 @@
 )
 
 
-def _setup():
+# Only delete leftover hubs from previous test runs that are older than this many
+# hours. This guards against deleting a hub that another concurrent test run (or
+# xdist worker) is actively using.
+STALE_HUB_AGE_HOURS = 3
+
+
+def _setup(test_suite_id=None, test_hub_name=None):
     print("Setting up...")
-    test_suite_id = get_test_suite_id()
-    test_hub_name = f"{HUB_NAME_PREFIX}{test_suite_id}"
+    test_suite_id = test_suite_id or get_test_suite_id()
+    test_hub_name = test_hub_name or f"{HUB_NAME_PREFIX}{test_suite_id}"
     test_hub_description = "PySDK Integ Test Private Hub"
 
     os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID: test_suite_id})
     os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME: test_hub_name})
 
     # Create a private hub to use for the test session
-    hub = Hub(
-        hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME], sagemaker_session=get_sm_session()
-    )
+    hub = Hub(hub_name=test_hub_name, sagemaker_session=get_sm_session())
 
     # Check if hub already exists before creating
     try:
@@ -73,14 +81,14 @@ def _setup():
                 raise
 
 
-def _teardown():
+def _teardown(test_suite_id=None, test_hub_name=None):
     print("Tearing down...")
 
     test_cache_bucket = get_test_artifact_bucket()
 
-    test_suite_id = os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]
+    test_suite_id = test_suite_id or os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]
 
-    test_hub_name = os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME]
+    test_hub_name = test_hub_name or os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME]
 
     boto3_session = boto3.Session(region_name=JUMPSTART_DEFAULT_REGION_NAME)
 
@@ -156,30 +164,41 @@ def _teardown():
     _delete_hubs(sagemaker_session, test_hub_name)
 
 
-def _cleanup_old_hubs(sagemaker_session):
-    """Clean up old test hubs to free up resources."""
+def _cleanup_old_hubs(sagemaker_session, active_hub_name=None):
+    """Clean up stale test hubs from previous runs to free up resources.
+
+    Only deletes hubs that are clearly stale (older than ``STALE_HUB_AGE_HOURS``)
+    so that hubs actively in use by the current test run or by concurrent xdist
+    workers are never removed. The hub for the current run (``active_hub_name``)
+    is always preserved.
+    """
     try:
+        active_hub_name = active_hub_name or os.environ.get(ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME)
+        cutoff = datetime.now(timezone.utc) - timedelta(hours=STALE_HUB_AGE_HOURS)
+
         response = sagemaker_session.list_hubs()
-        test_hubs = [
-            hub
-            for hub in response.get("HubSummaries", [])
-            if hub["HubName"].startswith(HUB_NAME_PREFIX)
-        ]
-
-        # Sort by creation time and delete oldest hubs
-        test_hubs.sort(key=lambda x: x.get("CreationTime", ""))
-
-        # Delete oldest hubs (keep only the most recent 10)
-        hubs_to_delete = (
-            test_hubs[:-10] if len(test_hubs) > 10 else test_hubs[: max(0, len(test_hubs) - 40)]
-        )
+        for hub in response.get("HubSummaries", []):
+            hub_name = hub["HubName"]
+            if not hub_name.startswith(HUB_NAME_PREFIX):
+                continue
+            if hub_name == active_hub_name:
+                continue
+
+            creation_time = hub.get("CreationTime")
+            # Only delete hubs we can confirm are older than the cutoff. If the
+            # creation time is unavailable, err on the side of keeping the hub.
+            if creation_time is None:
+                continue
+            if creation_time.tzinfo is None:
+                creation_time = creation_time.replace(tzinfo=timezone.utc)
+            if creation_time >= cutoff:
+                continue
 
-        for hub in hubs_to_delete:
             try:
-                print(f"Deleting old hub: {hub['HubName']}")
-                _delete_hubs(sagemaker_session, hub["HubName"])
+                print(f"Deleting stale hub: {hub_name}")
+                _delete_hubs(sagemaker_session, hub_name)
             except Exception as e:
-                print(f"Failed to delete hub {hub['HubName']}: {e}")
+                print(f"Failed to delete hub {hub_name}: {e}")
     except Exception as e:
         print(f"Failed to cleanup old hubs: {e}")
 
@@ -211,7 +230,60 @@ def _delete_hub_contents(sagemaker_session, hub_name, model):
 
 
 @pytest.fixture(scope="session", autouse=True)
-def setup(request):
-    _setup()
+def setup(request, worker_id, tmp_path_factory):
+    """Create a single shared private hub for the whole test run.
+
+    Under pytest-xdist every worker is a separate process, so a naive
+    ``scope="session"`` fixture would create one hub per worker. With high
+    parallelism (e.g. ``-n 120``) that quickly exhausts the per-account private
+    hub limit (100) and triggers destructive cross-worker cleanup. To avoid
+    this, all workers coordinate through a lock file and a shared JSON state
+    file: the first worker creates the hub, the rest reuse it, and only the last
+    worker to finish tears it down (reference counting).
+    """
+    # Non-xdist run: single process owns the full lifecycle.
+    if worker_id == "master":
+        _setup()
+        request.addfinalizer(_teardown)
+        return
+
+    # xdist run: coordinate hub creation/teardown across workers.
+    root_tmp_dir = tmp_path_factory.getbasetemp().parent
+    state_file = root_tmp_dir / "jumpstart_hub_state.json"
+    lock_file = root_tmp_dir / "jumpstart_hub_state.json.lock"
+
+    with FileLock(str(lock_file)):
+        if state_file.is_file():
+            state = json.loads(state_file.read_text())
+            state["ref_count"] += 1
+        else:
+            test_suite_id = get_test_suite_id()
+            test_hub_name = f"{HUB_NAME_PREFIX}{test_suite_id}"
+            _setup(test_suite_id=test_suite_id, test_hub_name=test_hub_name)
+            state = {
+                "test_suite_id": test_suite_id,
+                "test_hub_name": test_hub_name,
+                "ref_count": 1,
+            }
+        state_file.write_text(json.dumps(state))
+
+    # Ensure this worker's environment points at the shared hub.
+    os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID: state["test_suite_id"]})
+    os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME: state["test_hub_name"]})
+
+    def _finalize():
+        with FileLock(str(lock_file)):
+            if not state_file.is_file():
+                return
+            current = json.loads(state_file.read_text())
+            current["ref_count"] -= 1
+            if current["ref_count"] <= 0:
+                _teardown(
+                    test_suite_id=current["test_suite_id"],
+                    test_hub_name=current["test_hub_name"],
+                )
+                state_file.unlink()
+            else:
+                state_file.write_text(json.dumps(current))
 
-    request.addfinalizer(_teardown)
+    request.addfinalizer(_finalize)
diff --git a/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py b/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
index d512915343..4c455c3b32 100644
--- a/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
+++ b/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
@@ -17,7 +17,6 @@
 
 import pytest
 from sagemaker.jumpstart.constants import JUMPSTART_DEFAULT_REGION_NAME
-from sagemaker.jumpstart.hub.hub import Hub
 
 from sagemaker.jumpstart.estimator import JumpStartEstimator
 from sagemaker.jumpstart.utils import get_jumpstart_content_bucket
@@ -28,10 +27,9 @@
     JUMPSTART_TAG,
 )
 from tests.integ.sagemaker.jumpstart.utils import (
-    get_public_hub_model_arn,
     get_sm_session,
-    with_exponential_backoff,
     get_training_dataset_for_model_and_version,
+    add_model_references_to_hub,
 )
 
 MAX_INIT_TIME_SECONDS = 5
@@ -43,23 +41,13 @@
 }
 
 
-@with_exponential_backoff()
-def create_model_reference(hub_instance, model_arn):
-    try:
-        hub_instance.create_model_reference(model_arn=model_arn)
-    except Exception:
-        pass
-
-
 @pytest.fixture(scope="session")
 def add_model_references():
-    # Create Model References to test in Hub
-    hub_instance = Hub(
-        hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME], sagemaker_session=get_sm_session()
+    # Create Model References to test in Hub (idempotent + waits for readiness)
+    add_model_references_to_hub(
+        hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
+        model_ids=TEST_MODEL_IDS,
     )
-    for model in TEST_MODEL_IDS:
-        model_arn = get_public_hub_model_arn(hub_instance, model)
-        create_model_reference(hub_instance, model_arn)
 
 
 def test_jumpstart_hub_estimator(setup, add_model_references):
diff --git a/tests/integ/sagemaker/jumpstart/private_hub/model/test_jumpstart_private_hub_model.py b/tests/integ/sagemaker/jumpstart/private_hub/model/test_jumpstart_private_hub_model.py
index 3956c2240d..3737391102 100644
--- a/tests/integ/sagemaker/jumpstart/private_hub/model/test_jumpstart_private_hub_model.py
+++ b/tests/integ/sagemaker/jumpstart/private_hub/model/test_jumpstart_private_hub_model.py
@@ -17,7 +17,6 @@
 
 import pytest
 from sagemaker.enums import EndpointType
-from sagemaker.jumpstart.hub.hub import Hub
 from sagemaker.jumpstart.hub.utils import generate_hub_arn_for_init_kwargs
 from sagemaker.predictor import retrieve_default
 
@@ -30,9 +29,8 @@
     JUMPSTART_TAG,
 )
 from tests.integ.sagemaker.jumpstart.utils import (
-    get_public_hub_model_arn,
     get_sm_session,
-    with_exponential_backoff,
+    add_model_references_to_hub,
 )
 
 MAX_INIT_TIME_SECONDS = 5
@@ -46,23 +44,13 @@
 }
 
 
-@with_exponential_backoff()
-def create_model_reference(hub_instance, model_arn):
-    try:
-        hub_instance.create_model_reference(model_arn=model_arn)
-    except Exception:
-        pass
-
-
 @pytest.fixture(scope="session")
 def add_model_references():
-    # Create Model References to test in Hub
-    hub_instance = Hub(
-        hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME], sagemaker_session=get_sm_session()
+    # Create Model References to test in Hub (idempotent + waits for readiness)
+    add_model_references_to_hub(
+        hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
+        model_ids=TEST_MODEL_IDS,
     )
-    for model in TEST_MODEL_IDS:
-        model_arn = get_public_hub_model_arn(hub_instance, model)
-        create_model_reference(hub_instance, model_arn)
 
 
 def test_jumpstart_hub_model(setup, add_model_references):
diff --git a/tests/integ/sagemaker/jumpstart/utils.py b/tests/integ/sagemaker/jumpstart/utils.py
index d439ef7e95..c326b135e0 100644
--- a/tests/integ/sagemaker/jumpstart/utils.py
+++ b/tests/integ/sagemaker/jumpstart/utils.py
@@ -12,9 +12,11 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 import functools
+import hashlib
 import json
 
 import random
+import tempfile
 import time
 import uuid
 from typing import Any, Dict, List, Tuple
@@ -24,6 +26,7 @@
 
 from botocore.config import Config
 from botocore.exceptions import ClientError
+from filelock import FileLock
 import pytest
 
 
@@ -149,6 +152,80 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
+@with_exponential_backoff()
+def _create_model_reference(hub_instance, model_arn):
+    """Create a model reference in the hub, tolerating an already-existing one."""
+    try:
+        hub_instance.create_model_reference(model_arn=model_arn)
+    except ClientError as e:
+        # A reference that already exists is fine (idempotent across xdist
+        # workers sharing a hub). Anything else should surface.
+        if e.response["Error"]["Code"] in ("ResourceInUse", "ResourceLimitExceeded"):
+            return
+        raise
+
+
+def _wait_for_model_reference(sagemaker_session, hub_name, model_name, timeout=300, poll=10):
+    """Block until a model reference is resolvable in the hub.
+
+    ``create_hub_content_reference`` is asynchronous, so a test that uses the
+    reference immediately after creation can race against propagation and see
+    ``ResourceNotFound``. Poll until the reference is listable (or time out).
+    """
+    deadline = time.time() + timeout
+    last_error = None
+    while time.time() < deadline:
+        try:
+            response = sagemaker_session.list_hub_content_versions(
+                hub_name=hub_name,
+                hub_content_type="ModelReference",
+                hub_content_name=model_name,
+            )
+            if response.get("HubContentSummaries"):
+                return
+        except ClientError as e:
+            if e.response["Error"]["Code"] != "ResourceNotFound":
+                raise
+            last_error = e
+        time.sleep(poll)
+    raise TimeoutError(
+        f"Model reference '{model_name}' was not available in hub '{hub_name}' "
+        f"within {timeout}s. Last error: {last_error}"
+    )
+
+
+def add_model_references_to_hub(hub_name, model_ids):
+    """Idempotently add model references to a hub and wait until they resolve.
+
+    Safe to call concurrently from multiple xdist workers sharing a hub: a lock
+    file serializes the creation work and a marker file ensures it only runs
+    once per hub per test run. The marker is keyed by both the hub name and the
+    specific set of model ids, so different callers adding different model sets
+    to the same shared hub each run exactly once.
+    """
+    sagemaker_session = get_sm_session()
+    hub_instance = Hub(hub_name=hub_name, sagemaker_session=sagemaker_session)
+
+    model_ids = sorted(model_ids)
+    models_digest = hashlib.md5(
+        ",".join(model_ids).encode("utf-8"), usedforsecurity=False
+    ).hexdigest()
+    marker = os.path.join(
+        tempfile.gettempdir(), f"jumpstart_model_refs_{hub_name}_{models_digest}.done"
+    )
+    lock_path = f"{marker}.lock"
+
+    with FileLock(lock_path):
+        if not os.path.exists(marker):
+            for model in model_ids:
+                model_arn = get_public_hub_model_arn(hub_instance, model)
+                _create_model_reference(hub_instance, model_arn)
+            for model in model_ids:
+                _wait_for_model_reference(sagemaker_session, hub_name, model)
+            with open(marker, "w") as f:
+                f.write("done")
+
+
 class EndpointInvoker:
     def __init__(
         self,

From f79347ac4d1cf020f030056ef2c278429b5afa9c Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Fri, 5 Jun 2026 15:44:45 -0700
Subject: [PATCH 2/8] fix: isolate sagemaker_session for serve integ tests to
 prevent settings pollution

ModelBuilder mutates session.settings._local_download_dir to a temporary
/tmp/sagemaker/model-builder/<uuid> path. The serve integ tests passed the
repo-wide session-scoped sagemaker_session fixture into ModelBuilder, so that
mutation leaked across test modules. After the temp dir was cleaned up, the
lingering setting broke unrelated tests sharing the same session, notably
tests/integ/sagemaker/workflow/test_tuning_steps.py::test_tuning_multi_algos
with "ValueError: Inputted directory ... does not exist".

Override sagemaker_session in tests/integ/sagemaker/serve/conftest.py with a
dedicated session (constructed identically to the parent fixture) so the
ModelBuilder mutation stays contained within the serve package.
---
 tests/integ/sagemaker/serve/conftest.py | 48 +++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tests/integ/sagemaker/serve/conftest.py b/tests/integ/sagemaker/serve/conftest.py
index 5eb3a2ea11..5119dfb3a0 100644
--- a/tests/integ/sagemaker/serve/conftest.py
+++ b/tests/integ/sagemaker/serve/conftest.py
@@ -18,7 +18,55 @@
 import sagemaker
 import sagemaker_core.helper.session_helper as core_session
 
+from botocore.config import Config
+from sagemaker import Session
+
 DEFAULT_REGION = "us-west-2"
+CUSTOM_S3_OBJECT_KEY_PREFIX = "session-default-prefix"
+
+
+@pytest.fixture(scope="session")
+def sagemaker_session(
+    sagemaker_client_config, sagemaker_runtime_config, boto_session, sagemaker_metrics_config
+):
+    """Isolated Session for the serve (ModelBuilder) integ tests.
+
+    Overrides the repo-wide ``sagemaker_session`` fixture (defined in
+    ``tests/conftest.py``) for everything under ``tests/integ/sagemaker/serve``.
+
+    ModelBuilder mutates the global ``session.settings._local_download_dir`` to a
+    temporary ``/tmp/sagemaker/model-builder/<uuid>`` path. When the shared
+    session-scoped fixture is reused by other test modules, that temp dir gets
+    cleaned up while the polluted setting lingers, breaking unrelated tests such
+    as ``tests/integ/sagemaker/workflow/test_tuning_steps.py::test_tuning_multi_algos``
+    (``ValueError: Inputted directory ... does not exist``). Scoping a dedicated
+    session to the serve package keeps that mutation contained here.
+    """
+    sagemaker_client_config.setdefault("config", Config(retries=dict(max_attempts=10)))
+    sagemaker_client = (
+        boto_session.client("sagemaker", **sagemaker_client_config)
+        if sagemaker_client_config
+        else None
+    )
+    runtime_client = (
+        boto_session.client("sagemaker-runtime", **sagemaker_runtime_config)
+        if sagemaker_runtime_config
+        else None
+    )
+    metrics_client = (
+        boto_session.client("sagemaker-metrics", **sagemaker_metrics_config)
+        if sagemaker_metrics_config
+        else None
+    )
+
+    return Session(
+        boto_session=boto_session,
+        sagemaker_client=sagemaker_client,
+        sagemaker_runtime_client=runtime_client,
+        sagemaker_metrics_client=metrics_client,
+        sagemaker_config={},
+        default_bucket_prefix=CUSTOM_S3_OBJECT_KEY_PREFIX,
+    )
 
 
 @pytest.fixture(scope="module")

From 00299fd4300b9271fb59ebdb773f03b8d2253b74 Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Fri, 5 Jun 2026 23:28:10 -0700
Subject: [PATCH 3/8] fix: tear down shared JumpStart hub after all xdist
 workers finish

The previous reference-counted teardown in the session fixture finalizer
was unsafe: pytest-xdist distributes tests dynamically, so a worker could
finish its session (running finalizers) while other workers still had hub
tests pending. Decrementing to zero there deleted the shared hub mid-run,
causing "Hub ... does not exist" / "Hub content ... does not exist"
failures in gated hub tests.

Workers now only create-or-reuse the shared hub (never delete it). Teardown
runs exactly once in pytest_sessionfinish on the controller process (no
workerinput), which is guaranteed to run after all workers finish. Stale
hub reclamation continues to be handled by the age-based _cleanup_old_hubs.
---
 tests/integ/sagemaker/jumpstart/conftest.py | 93 ++++++++++++++-------
 1 file changed, 61 insertions(+), 32 deletions(-)

diff --git a/tests/integ/sagemaker/jumpstart/conftest.py b/tests/integ/sagemaker/jumpstart/conftest.py
index d74eadcf71..938984a9c7 100644
--- a/tests/integ/sagemaker/jumpstart/conftest.py
+++ b/tests/integ/sagemaker/jumpstart/conftest.py
@@ -14,6 +14,7 @@
 
 import json
 import os
+import pathlib
 from datetime import datetime, timedelta, timezone
 
 import boto3
@@ -229,33 +230,52 @@ def _delete_hub_contents(sagemaker_session, hub_name, model):
     )
 
 
+def _hub_state_root(config):
+    """Return the run-level tmp dir shared by the xdist controller and workers.
+
+    The controller's basetemp is the run root (e.g. ``.../pytest-N``) while each
+    worker's basetemp is a ``popen-gw*`` subdir of it. Normalizing to the run
+    root gives every process the same location for the shared state file.
+
+    Works across pytest versions: prefers the ``TempPathFactory`` attached as
+    ``config._tmp_path_factory`` and falls back to the legacy ``_tmpdirhandler``.
+    """
+    factory = getattr(config, "_tmp_path_factory", None)
+    if factory is not None:
+        basetemp = pathlib.Path(str(factory.getbasetemp()))
+    else:
+        basetemp = pathlib.Path(str(config._tmpdirhandler.getbasetemp()))
+
+    if basetemp.name.startswith("popen-gw"):
+        return basetemp.parent
+    return basetemp
+
+
 @pytest.fixture(scope="session", autouse=True)
-def setup(request, worker_id, tmp_path_factory):
-    """Create a single shared private hub for the whole test run.
+def setup(request):
+    """Ensure a single shared private hub exists for the whole test run.
 
     Under pytest-xdist every worker is a separate process, so a naive
     ``scope="session"`` fixture would create one hub per worker. With high
     parallelism (e.g. ``-n 120``) that quickly exhausts the per-account private
-    hub limit (100) and triggers destructive cross-worker cleanup. To avoid
-    this, all workers coordinate through a lock file and a shared JSON state
-    file: the first worker creates the hub, the rest reuse it, and only the last
-    worker to finish tears it down (reference counting).
+    hub limit (100). All workers therefore coordinate through a lock file and a
+    shared JSON state file: the first worker creates the hub, the rest reuse it.
+
+    The hub is intentionally NOT deleted from a worker finalizer. xdist
+    distributes tests dynamically, so a worker can finish its whole session (and
+    run its finalizers) before another worker even reaches its first hub test;
+    reference counting in that finalizer would delete the hub out from under
+    workers still using it ("Hub ... does not exist" failures). Teardown instead
+    runs exactly once, after all workers finish, in ``pytest_sessionfinish`` on
+    the controller process.
     """
-    # Non-xdist run: single process owns the full lifecycle.
-    if worker_id == "master":
-        _setup()
-        request.addfinalizer(_teardown)
-        return
-
-    # xdist run: coordinate hub creation/teardown across workers.
-    root_tmp_dir = tmp_path_factory.getbasetemp().parent
+    root_tmp_dir = _hub_state_root(request.config)
     state_file = root_tmp_dir / "jumpstart_hub_state.json"
     lock_file = root_tmp_dir / "jumpstart_hub_state.json.lock"
 
     with FileLock(str(lock_file)):
         if state_file.is_file():
             state = json.loads(state_file.read_text())
-            state["ref_count"] += 1
         else:
             test_suite_id = get_test_suite_id()
             test_hub_name = f"{HUB_NAME_PREFIX}{test_suite_id}"
@@ -263,27 +283,36 @@ def setup(request, worker_id, tmp_path_factory):
             state = {
                 "test_suite_id": test_suite_id,
                 "test_hub_name": test_hub_name,
-                "ref_count": 1,
             }
-        state_file.write_text(json.dumps(state))
+            state_file.write_text(json.dumps(state))
 
     # Ensure this worker's environment points at the shared hub.
     os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID: state["test_suite_id"]})
     os.environ.update({ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME: state["test_hub_name"]})
 
-    def _finalize():
-        with FileLock(str(lock_file)):
-            if not state_file.is_file():
-                return
-            current = json.loads(state_file.read_text())
-            current["ref_count"] -= 1
-            if current["ref_count"] <= 0:
-                _teardown(
-                    test_suite_id=current["test_suite_id"],
-                    test_hub_name=current["test_hub_name"],
-                )
-                state_file.unlink()
-            else:
-                state_file.write_text(json.dumps(current))
 
-    request.addfinalizer(_finalize)
+def pytest_sessionfinish(session, exitstatus):
+    """Tear down the shared hub once, after all xdist workers have finished.
+
+    xdist workers carry a ``workerinput`` attribute on their config; only the
+    controller (or a non-xdist run, which has no workerinput) performs teardown.
+    Running here guarantees no worker is still using the hub.
+    """
+    if hasattr(session.config, "workerinput"):
+        return  # xdist worker: the controller handles teardown.
+
+    root_tmp_dir = _hub_state_root(session.config)
+    state_file = root_tmp_dir / "jumpstart_hub_state.json"
+    lock_file = root_tmp_dir / "jumpstart_hub_state.json.lock"
+
+    with FileLock(str(lock_file)):
+        if not state_file.is_file():
+            return
+        state = json.loads(state_file.read_text())
+        try:
+            _teardown(
+                test_suite_id=state["test_suite_id"],
+                test_hub_name=state["test_hub_name"],
+            )
+        finally:
+            state_file.unlink()

From 855a3c7b561d0e10c347b093beb30b0153ecbda7 Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Fri, 5 Jun 2026 23:46:44 -0700
Subject: [PATCH 4/8] fix: stabilize Spark jar build and inference-component
 endpoint timeout in integ tests

Two unrelated v2 integ-test failures, fixed together:

- test_spark_processing.py::test_sagemaker_pyspark_v3 (Spark 3.x): build_jar
  ran javac/jar without checking exit codes, so a failed jar rebuild (which
  truncates the committed hello-spark-java.jar) was swallowed and surfaced
  later as a misleading "code ... wasn't found" error, especially under xdist
  where the fixture runs per worker. Run the build commands with explicit
  return-code checks and assert the jar exists afterward.

- test_serve_model_builder_inference_component_happy.py::
  test_model_builder_ic_sagemaker_endpoint: deploying a 7B JumpStart model as
  an inference component on ml.g5.24xlarge regularly needs more than the
  15-minute standard endpoint timeout to reach InService (the failure was a
  deploy timeout, not a quota cap). Add a dedicated 30-minute timeout
  (SERVE_SAGEMAKER_IC_ENDPOINT_TIMEOUT) for this flow without changing the
  standard serve endpoint timeout.
---
 tests/integ/sagemaker/serve/constants.py      |  4 ++
 ...model_builder_inference_component_happy.py |  4 +-
 tests/integ/test_spark_processing.py          | 55 +++++++++++--------
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/tests/integ/sagemaker/serve/constants.py b/tests/integ/sagemaker/serve/constants.py
index 3f25f6a575..b2fcb4154f 100644
--- a/tests/integ/sagemaker/serve/constants.py
+++ b/tests/integ/sagemaker/serve/constants.py
@@ -21,6 +21,10 @@
 SERVE_MODEL_PACKAGE_TIMEOUT = 10
 SERVE_LOCAL_CONTAINER_TIMEOUT = 10
 SERVE_SAGEMAKER_ENDPOINT_TIMEOUT = 15
+# Inference-component deployments of large (7B) JumpStart models pull a big image
+# and load the model before the endpoint reaches InService, which routinely takes
+# longer than the standard endpoint timeout. Give that flow more headroom.
+SERVE_SAGEMAKER_IC_ENDPOINT_TIMEOUT = 30
 SERVE_SAVE_TIMEOUT = 2
 
 PYTHON_VERSION_IS_NOT_38 = platform.python_version_tuple()[1] != "8"
diff --git a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
index 06312a45b1..bb2c1a34c8 100644
--- a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
+++ b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
@@ -24,7 +24,7 @@
 from sagemaker.utils import unique_name_from_base
 
 from tests.integ.sagemaker.serve.constants import (
-    SERVE_SAGEMAKER_ENDPOINT_TIMEOUT,
+    SERVE_SAGEMAKER_IC_ENDPOINT_TIMEOUT,
 )
 from tests.integ.timeout import timeout
 import logging
@@ -88,7 +88,7 @@ def test_model_builder_ic_sagemaker_endpoint(
 
     chain.build()
 
-    with timeout(minutes=SERVE_SAGEMAKER_ENDPOINT_TIMEOUT):
+    with timeout(minutes=SERVE_SAGEMAKER_IC_ENDPOINT_TIMEOUT):
         try:
             logger.info("Deploying and predicting in SAGEMAKER_ENDPOINT mode...")
             endpoint_name = f"llama-ic-endpoint-name-{uuid.uuid1().hex}"
diff --git a/tests/integ/test_spark_processing.py b/tests/integ/test_spark_processing.py
index ac956be94e..b6443a80bb 100644
--- a/tests/integ/test_spark_processing.py
+++ b/tests/integ/test_spark_processing.py
@@ -38,6 +38,8 @@
 @pytest.fixture(scope="module", autouse=True)
 def build_jar():
     jar_file_path = os.path.join(SPARK_PATH, "code", "java", "hello-java-spark")
+    jar_file = os.path.join(jar_file_path, "hello-spark-java.jar")
+
     # compile java file
     java_version = subprocess.check_output(["java", "-version"], stderr=subprocess.STDOUT).decode(
         "utf-8"
@@ -45,30 +47,39 @@ def build_jar():
     java_version = re.search(JAVA_VERSION_PATTERN, java_version).groups()[0]
 
     if float(java_version) > 1.8:
-        subprocess.run(
-            [
-                "javac",
-                "--release",
-                "8",
-                os.path.join(jar_file_path, JAVA_FILE_PATH, "HelloJavaSparkApp.java"),
-            ]
-        )
+        javac_cmd = [
+            "javac",
+            "--release",
+            "8",
+            os.path.join(jar_file_path, JAVA_FILE_PATH, "HelloJavaSparkApp.java"),
+        ]
     else:
-        subprocess.run(
-            ["javac", os.path.join(jar_file_path, JAVA_FILE_PATH, "HelloJavaSparkApp.java")]
-        )
+        javac_cmd = ["javac", os.path.join(jar_file_path, JAVA_FILE_PATH, "HelloJavaSparkApp.java")]
+
+    jar_cmd = [
+        "jar",
+        "cfm",
+        jar_file,
+        os.path.join(jar_file_path, "manifest.txt"),
+        "-C",
+        jar_file_path,
+        ".",
+    ]
 
-    subprocess.run(
-        [
-            "jar",
-            "cfm",
-            os.path.join(jar_file_path, "hello-spark-java.jar"),
-            os.path.join(jar_file_path, "manifest.txt"),
-            "-C",
-            jar_file_path,
-            ".",
-        ]
-    )
+    # Build with check=True so a failing javac/jar surfaces immediately instead
+    # of being swallowed. The jar (re)build truncates the committed
+    # hello-spark-java.jar, so a silent failure here would leave the test with a
+    # missing/corrupt jar and a confusing "code ... wasn't found" error at run
+    # time (especially under xdist, where this runs per worker).
+    for cmd in (javac_cmd, jar_cmd):
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Failed to build Spark test jar (command: {' '.join(cmd)}).\n"
+                f"stdout:\n{result.stdout}\nstderr:\n{result.stderr}"
+            )
+
+    assert os.path.isfile(jar_file), f"Spark test jar was not produced at {jar_file}"
 
 
 @pytest.fixture(scope="module")

From d912e41abf5c595f7a82255efa0dd4135554ed8c Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Sat, 6 Jun 2026 20:17:00 -0700
Subject: [PATCH 5/8] 
 https://us-west-2.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fcodebuild$252Fsagemaker-python-sdk-ci-integ-tests/log-events/e558697a-488d-4eab-a4ad-2971d9a1081f

---
 .../sagemaker/jumpstart/model/test_jumpstart_model.py     | 2 ++
 tests/integ/sagemaker/jumpstart/utils.py                  | 7 ++++++-
 .../test_serve_model_builder_inference_component_happy.py | 8 ++++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py b/tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py
index de287bb3d8..e9d07048f5 100644
--- a/tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py
+++ b/tests/integ/sagemaker/jumpstart/model/test_jumpstart_model.py
@@ -34,6 +34,7 @@
     download_inference_assets,
     get_sm_session,
     get_tabular_data,
+    x_fail_if_ice,
 )
 
 INF2_SUPPORTED_REGIONS = {
@@ -192,6 +193,7 @@ def test_jumpstart_gated_model(setup):
     assert response is not None
 
 
+@x_fail_if_ice
 def test_jumpstart_gated_model_inference_component_enabled(setup):
 
     model_id = "meta-textgeneration-llama-2-7b"
diff --git a/tests/integ/sagemaker/jumpstart/utils.py b/tests/integ/sagemaker/jumpstart/utils.py
index c326b135e0..3f8a7d7846 100644
--- a/tests/integ/sagemaker/jumpstart/utils.py
+++ b/tests/integ/sagemaker/jumpstart/utils.py
@@ -80,7 +80,12 @@ def wrapper(*args, **kwargs):
         try:
             return func(*args, **kwargs)
         except Exception as e:
-            if "CapacityError" in str(e):
+            # Insufficient capacity is a transient, region-level AWS condition
+            # (no instances available right now), not a SDK defect. SageMaker
+            # surfaces it either as a "CapacityError" or as an endpoint failure
+            # whose reason contains "InsufficientInstanceCapacity"; treat both as
+            # an expected failure so canaries don't go red on capacity shortages.
+            if "CapacityError" in str(e) or "InsufficientInstanceCapacity" in str(e):
                 pytest.xfail(str(e))
             raise
 
diff --git a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
index bb2c1a34c8..8102bff2e7 100644
--- a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
+++ b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
@@ -41,7 +41,11 @@
 
 LLAMA_2_7B_JS_ID = "meta-textgeneration-llama-2-7b"
 LLAMA_IC_NAME = "llama2-mb-ic"
-INSTANCE_TYPE = "ml.g5.24xlarge"
+# ml.g5.24xlarge (4x A10G) is chronically capacity-constrained in us-west-2 and
+# made this test flaky with InsufficientInstanceCapacity / deploy timeouts. This
+# test exercises ModelBuilder's inference-component orchestration, not large-GPU
+# hosting, so a single-accelerator instance with ample capacity is sufficient.
+INSTANCE_TYPE = "ml.g5.2xlarge"
 
 
 @pytest.fixture
@@ -52,7 +56,7 @@ def model_builder_llama_inference_component():
         model_version="4.*",
         schema_builder=SchemaBuilder(sample_input, sample_output),
         resource_requirements=ResourceRequirements(
-            requests={"memory": 98304, "num_accelerators": 4, "copies": 1, "num_cpus": 40}
+            requests={"memory": 24576, "num_accelerators": 1, "copies": 1, "num_cpus": 8}
         ),
     )
 

From f895968de1d38f1151e859f814e2bace685ac42c Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Sun, 7 Jun 2026 01:00:22 -0700
Subject: [PATCH 6/8] fix: stop deleting shared JumpStart hub mid-run; xfail
 flaky IC deploy test

JumpStart hub:
The shared hub was being deleted at session end on the controller, but hub
tests deploy long-lived endpoints, so a straggler worker could still be running
a hub test at ~100% when teardown deleted the hub, causing intermittent
"Hub ... does not exist" failures (e.g. test_jumpstart_hub_gated_estimator_
with_eula). Stop deleting the hub during the run entirely: session-end teardown
still cleans leaked endpoints/models/configs/artifacts but no longer deletes the
hub, and stale hubs from prior runs are reclaimed proactively at setup via the
age-based _cleanup_old_hubs (older than STALE_HUB_AGE_HOURS).

Inference-component serve test:
test_model_builder_ic_sagemaker_endpoint fails in the ModelBuilder IC deploy
path: CreateEndpoint is followed by a DescribeEndpoint that intermittently
reports the endpoint as not found. This is an SDK-level issue, not a test
config problem, so xfail (non-strict) the test to unblock the canary while it
is tracked separately.

X-AI-Prompt: Stop mid-run hub deletion (rely on age-based reclamation) and xfail the flaky ModelBuilder inference-component deploy test
X-AI-Tool: kiro-cli
---
 tests/integ/sagemaker/jumpstart/conftest.py   | 42 ++++++++++++-------
 ...model_builder_inference_component_happy.py |  6 +++
 2 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/tests/integ/sagemaker/jumpstart/conftest.py b/tests/integ/sagemaker/jumpstart/conftest.py
index 938984a9c7..7f7820c9c3 100644
--- a/tests/integ/sagemaker/jumpstart/conftest.py
+++ b/tests/integ/sagemaker/jumpstart/conftest.py
@@ -62,6 +62,11 @@ def _setup(test_suite_id=None, test_hub_name=None):
     # Create a private hub to use for the test session
     hub = Hub(hub_name=test_hub_name, sagemaker_session=get_sm_session())
 
+    # Proactively reclaim stale hubs from prior runs so we don't accumulate
+    # toward the per-account private hub limit. This only deletes hubs older
+    # than STALE_HUB_AGE_HOURS and never the hub we are about to use.
+    _cleanup_old_hubs(get_sm_session(), active_hub_name=test_hub_name)
+
     # Check if hub already exists before creating
     try:
         hub.describe()
@@ -82,7 +87,7 @@ def _setup(test_suite_id=None, test_hub_name=None):
                 raise
 
 
-def _teardown(test_suite_id=None, test_hub_name=None):
+def _teardown(test_suite_id=None, test_hub_name=None, delete_hub=False):
     print("Tearing down...")
 
     test_cache_bucket = get_test_artifact_bucket()
@@ -161,8 +166,12 @@ def _teardown(test_suite_id=None, test_hub_name=None):
     bucket = s3_resource.Bucket(test_cache_bucket)
     bucket.objects.filter(Prefix=test_suite_id + "/").delete()
 
-    # delete private hubs
-    _delete_hubs(sagemaker_session, test_hub_name)
+    # delete private hubs (only when explicitly requested). During an xdist run
+    # we never delete the active hub, because a straggler worker may still be
+    # running a hub test when another process reaches teardown; stale hubs from
+    # prior runs are reclaimed by the age-based _cleanup_old_hubs instead.
+    if delete_hub:
+        _delete_hubs(sagemaker_session, test_hub_name)
 
 
 def _cleanup_old_hubs(sagemaker_session, active_hub_name=None):
@@ -261,13 +270,13 @@ def setup(request):
     hub limit (100). All workers therefore coordinate through a lock file and a
     shared JSON state file: the first worker creates the hub, the rest reuse it.
 
-    The hub is intentionally NOT deleted from a worker finalizer. xdist
-    distributes tests dynamically, so a worker can finish its whole session (and
-    run its finalizers) before another worker even reaches its first hub test;
-    reference counting in that finalizer would delete the hub out from under
-    workers still using it ("Hub ... does not exist" failures). Teardown instead
-    runs exactly once, after all workers finish, in ``pytest_sessionfinish`` on
-    the controller process.
+    The hub is intentionally NOT deleted at the end of the run. xdist
+    distributes tests dynamically and hub tests deploy long-lived endpoints, so
+    a straggler worker can still be running a hub test (at ~100%) while another
+    process reaches teardown. Deleting the hub there pulls it out from under the
+    straggler ("Hub ... does not exist" failures). Instead, leaked endpoints and
+    artifacts are cleaned at run end, and the hub itself is reclaimed on a later
+    run by the age-based ``_cleanup_old_hubs`` (older than STALE_HUB_AGE_HOURS).
     """
     root_tmp_dir = _hub_state_root(request.config)
     state_file = root_tmp_dir / "jumpstart_hub_state.json"
@@ -292,14 +301,16 @@ def setup(request):
 
 
 def pytest_sessionfinish(session, exitstatus):
-    """Tear down the shared hub once, after all xdist workers have finished.
+    """Clean up leaked test resources once, after all xdist workers finish.
 
-    xdist workers carry a ``workerinput`` attribute on their config; only the
-    controller (or a non-xdist run, which has no workerinput) performs teardown.
-    Running here guarantees no worker is still using the hub.
+    Runs only on the controller (xdist workers carry a ``workerinput`` attribute
+    on their config; a non-xdist run has none). Deletes endpoints/models/configs
+    and S3 artifacts tagged for this run, but deliberately does NOT delete the
+    shared hub (see ``setup``); stale hubs are reclaimed by ``_cleanup_old_hubs``
+    on a subsequent run.
     """
     if hasattr(session.config, "workerinput"):
-        return  # xdist worker: the controller handles teardown.
+        return  # xdist worker: the controller handles cleanup.
 
     root_tmp_dir = _hub_state_root(session.config)
     state_file = root_tmp_dir / "jumpstart_hub_state.json"
@@ -313,6 +324,7 @@ def pytest_sessionfinish(session, exitstatus):
             _teardown(
                 test_suite_id=state["test_suite_id"],
                 test_hub_name=state["test_hub_name"],
+                delete_hub=False,
             )
         finally:
             state_file.unlink()
diff --git a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
index 8102bff2e7..f8c7ffafeb 100644
--- a/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
+++ b/tests/integ/sagemaker/serve/test_serve_model_builder_inference_component_happy.py
@@ -61,6 +61,12 @@ def model_builder_llama_inference_component():
     )
 
 
+@pytest.mark.xfail(
+    reason="Flaky ModelBuilder inference-component deploy path: CreateEndpoint is "
+    "followed by a DescribeEndpoint that intermittently reports the endpoint as "
+    "not found. Tracked separately as an SDK issue; xfail to unblock the canary.",
+    strict=False,
+)
 @pytest.mark.skipif(
     tests.integ.test_region() not in "us-west-2",
     reason="G5 capacity available in PDX.",

From 1c2450f0428d3398b604b18c940e5c474105ee6f Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Wed, 10 Jun 2026 00:18:18 -0700
Subject: [PATCH 7/8] test: speed up slow JumpStart estimator canary integ
 tests

These canaries only need to exercise the train/deploy/predict flow, not
produce a well-trained model, yet they dominated canary runtime (the
estimator tests each ran ~100 min). Trim the training workload to bring
the suite under one hour while keeping coverage intact.

Bert estimator tests (full QNLI -> QNLI-tiny + epochs=1):
- map the floating "*" version of huggingface-spc-bert-base-cased to the
  QNLI-tiny dataset instead of the full QNLI dataset (constants.py)
- cap training to a single epoch (hyperparameters={"epochs": "1"}) for:
    - test_jumpstart_estimator
    - test_jumpstart_hub_estimator
    - test_jumpstart_hub_estimator_with_session

Gated llama estimator tests (sec_amazon has no tiny variant, so cap steps
via hyperparameters={"max_steps": "1"}):
- test_gated_model_training_v1
- test_gated_model_training_v2
- test_jumpstart_hub_gated_estimator_with_eula

X-AI-Prompt: Reduce JumpStart estimator canary test runtime by using the tiny training dataset and capping epochs/steps so the suite finishes under an hour
X-AI-Tool: kiro-cli
---
 tests/integ/sagemaker/jumpstart/constants.py             | 5 ++++-
 .../jumpstart/estimator/test_jumpstart_estimator.py      | 9 +++++++++
 .../estimator/test_jumpstart_private_hub_estimator.py    | 9 +++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/tests/integ/sagemaker/jumpstart/constants.py b/tests/integ/sagemaker/jumpstart/constants.py
index 70448e9214..f503ea4ecd 100644
--- a/tests/integ/sagemaker/jumpstart/constants.py
+++ b/tests/integ/sagemaker/jumpstart/constants.py
@@ -47,7 +47,10 @@ def _to_s3_path(filename: str, s3_prefix: Optional[str]) -> str:
     ("huggingface-spc-bert-base-cased", "1.0.0"): ("training-datasets/QNLI-tiny/"),
     ("huggingface-spc-bert-base-cased", "1.2.3"): ("training-datasets/QNLI-tiny/"),
     ("huggingface-spc-bert-base-cased", "2.0.3"): ("training-datasets/QNLI-tiny/"),
-    ("huggingface-spc-bert-base-cased", "*"): ("training-datasets/QNLI/"),
+    # Use the tiny dataset for the floating "*" version too: these are canary
+    # tests that only need to exercise the train/deploy flow, not produce a
+    # well-trained model. The full QNLI dataset made fit() dramatically slower.
+    ("huggingface-spc-bert-base-cased", "*"): ("training-datasets/QNLI-tiny/"),
     ("js-trainable-model", "*"): ("training-datasets/QNLI-tiny/"),
     ("meta-textgeneration-llama-2-7b", "*"): ("training-datasets/sec_amazon/"),
     ("meta-textgeneration-llama-2-7b", "2.*"): ("training-datasets/sec_amazon/"),
diff --git a/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py b/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
index 5b52935869..6f684fd18d 100644
--- a/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
+++ b/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
@@ -61,6 +61,9 @@ def test_jumpstart_estimator(setup):
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         max_run=259200,  # avoid exceeding resource limits
         instance_type="ml.g4dn.xlarge",
+        # Canary only needs to exercise the train/deploy flow, so cap training
+        # to a single epoch to keep fit() fast.
+        hyperparameters={"epochs": "1"},
     )
 
     # uses ml.g4dn.xlarge instance
@@ -111,6 +114,9 @@ def test_gated_model_training_v1(setup):
         environment={"accept_eula": "true"},
         max_run=259200,  # avoid exceeding resource limits
         tolerate_vulnerable_model=True,
+        # Canary only verifies the train/deploy flow, so cap training to a
+        # single step to keep fit() fast (sec_amazon has no tiny variant).
+        hyperparameters={"max_steps": "1"},
     )
 
     # uses ml.g5.12xlarge instance
@@ -153,6 +159,9 @@ def test_gated_model_training_v2(setup):
         environment={"accept_eula": "true"},
         max_run=259200,  # avoid exceeding resource limits
         tolerate_vulnerable_model=True,  # tolerate old version of model
+        # Canary only verifies the train/deploy flow, so cap training to a
+        # single step to keep fit() fast (sec_amazon has no tiny variant).
+        hyperparameters={"max_steps": "1"},
     )
 
     # uses ml.g5.12xlarge instance
diff --git a/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py b/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
index 4c455c3b32..ce258063eb 100644
--- a/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
+++ b/tests/integ/sagemaker/jumpstart/private_hub/estimator/test_jumpstart_private_hub_estimator.py
@@ -58,6 +58,9 @@ def test_jumpstart_hub_estimator(setup, add_model_references):
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         instance_type="ml.g4dn.xlarge",
+        # Canary only needs to exercise the train/deploy flow, so cap training
+        # to a single epoch to keep fit() fast.
+        hyperparameters={"epochs": "1"},
     )
 
     estimator.fit(
@@ -98,6 +101,9 @@ def test_jumpstart_hub_estimator_with_session(setup, add_model_references):
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
         instance_type="ml.g4dn.xlarge",
+        # Canary only needs to exercise the train/deploy flow, so cap training
+        # to a single epoch to keep fit() fast.
+        hyperparameters={"epochs": "1"},
     )
 
     estimator.fit(
@@ -137,6 +143,9 @@ def test_jumpstart_hub_gated_estimator_with_eula(setup, add_model_references):
         hub_name=os.environ[ENV_VAR_JUMPSTART_SDK_TEST_HUB_NAME],
         tags=[{"Key": JUMPSTART_TAG, "Value": os.environ[ENV_VAR_JUMPSTART_SDK_TEST_SUITE_ID]}],
         instance_type="ml.g5.2xlarge",
+        # Canary only verifies the train/deploy flow, so cap training to a
+        # single step to keep fit() fast (sec_amazon has no tiny variant).
+        hyperparameters={"max_steps": "1"},
     )
 
     estimator.fit(

From 3d9b90a96a84df6581eaf8a6f781da36ab836f1f Mon Sep 17 00:00:00 2001
From: Lucas Jia <lucasjia@amazon.com>
Date: Wed, 10 Jun 2026 14:01:01 -0700
Subject: [PATCH 8/8] test: mark JumpStart neuron gated training test as
 slow_test

Excludes test_gated_model_training_v2_neuron from ci-integ-tests and
canaries-v2, which both filter out `slow_test`. Trn1/Inf2 capacity makes
this test prone to multi-hour stalls, and max_steps=1 cannot shrink the
provisioning wait.
---
 .../sagemaker/jumpstart/estimator/test_jumpstart_estimator.py    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py b/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
index 6f684fd18d..54f252e91f 100644
--- a/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
+++ b/tests/integ/sagemaker/jumpstart/estimator/test_jumpstart_estimator.py
@@ -199,6 +199,7 @@ def test_gated_model_training_v2(setup):
 
 
 @x_fail_if_ice
+@pytest.mark.slow_test
 @pytest.mark.skipif(
     tests.integ.test_region() not in TRN2_SUPPORTED_REGIONS,
     reason=f"TRN2 instances unavailable in {tests.integ.test_region()}.",