Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions sagemaker-mlops/tests/integ/test_feature_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,26 @@ def test_delete_feature_group(feature_group_name, sample_dataframe, bucket, role
fg.wait_for_status("Created")

fg.delete()
time.sleep(2)

with pytest.raises(Exception):
FeatureGroup.get(feature_group_name=feature_group_name)

# FeatureGroup deletion is asynchronous: after delete() returns the group
# stays in "Deleting" status and is still describable for a while, so a
# fixed short sleep + single get() is racy. Poll until get() raises (the
# group is fully gone) or we hit the timeout.
deadline = time.time() + 120
last_exc = None
while time.time() < deadline:
try:
FeatureGroup.get(feature_group_name=feature_group_name)
except Exception as e: # noqa: BLE001 - any error means it's no longer retrievable
last_exc = e
break
time.sleep(5)
else:
pytest.fail(
f"FeatureGroup {feature_group_name} was still retrievable 120s after delete()"
)

assert last_exc is not None


# Test 7: Ingest to both OnlineStore and OfflineStore
Expand Down
36 changes: 31 additions & 5 deletions sagemaker-mlops/tests/integ/test_feature_store_lakeformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,12 @@ def test_create_feature_group_and_enable_lake_formation(s3_uri, role, region):
assert fg.feature_group_status == "Created"

# Enable Lake Formation governance
result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
result = fg.enable_lake_formation(
hybrid_access_mode_enabled=False,
acknowledge_risk=True,
use_service_linked_role=False,
registration_role_arn=role,
)

# Verify all phases completed successfully
assert result["s3_location_registered"] is True
Expand Down Expand Up @@ -198,6 +203,8 @@ def test_create_feature_group_with_lake_formation_enabled(s3_uri, role, region):
enabled=True,
hybrid_access_mode_enabled = False,
acknowledge_risk=True,
use_service_linked_role=False,
registration_role_arn=role,
)

fg = FeatureGroupManager.create(
Expand Down Expand Up @@ -467,8 +474,17 @@ def test_enable_lake_formation_fails_with_nonexistent_role(
# Verify we got an appropriate error
error_msg = str(exc_info.value)
print(exc_info)
# Should mention role-related issues (not found, invalid, access denied, etc.)
assert "EntityNotFoundException" in error_msg
# The registration must fail because the role is not usable. Depending on
# how the build/execution role's iam:PassRole policy is scoped, this surfaces
# either as Lake Formation rejecting the unknown role (EntityNotFoundException)
# or as IAM denying PassRole before the call reaches Lake Formation
# (AccessDeniedException on iam:PassRole). Both are valid "nonexistent / not
# usable role" outcomes for this negative test.
assert (
"EntityNotFoundException" in error_msg
or "AccessDeniedException" in error_msg
or "iam:PassRole" in error_msg
), f"Unexpected error for nonexistent role registration: {error_msg}"


# ============================================================================
Expand Down Expand Up @@ -503,7 +519,12 @@ def test_enable_lake_formation_full_flow_with_policy_output(s3_uri, role, region

# Enable Lake Formation governance
with caplog.at_level(logging.WARNING, logger="sagemaker.mlops.feature_store.feature_group_manager"):
result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
result = fg.enable_lake_formation(
hybrid_access_mode_enabled=False,
acknowledge_risk=True,
use_service_linked_role=False,
registration_role_arn=role,
)

# Verify all phases completed successfully
assert result["s3_location_registered"] is True
Expand Down Expand Up @@ -546,7 +567,12 @@ def test_enable_lake_formation_default_logs_recommended_policy(s3_uri, role, reg

# Enable Lake Formation governance with hybrid_access_mode_enabled=False
with caplog.at_level(logging.WARNING, logger="sagemaker.mlops.feature_store.feature_group_manager"):
result = fg.enable_lake_formation(hybrid_access_mode_enabled=False, acknowledge_risk=True)
result = fg.enable_lake_formation(
hybrid_access_mode_enabled=False,
acknowledge_risk=True,
use_service_linked_role=False,
registration_role_arn=role,
)

# Verify phases completed successfully
assert result["s3_location_registered"] is True
Expand Down
24 changes: 11 additions & 13 deletions sagemaker-serve/src/sagemaker/serve/bedrock_model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,10 +645,9 @@ def _get_checkpoint_uri_from_manifest(self) -> Optional[str]:
"""Get checkpoint URI from manifest.json for Nova models.

Steps:
1. Fetch S3 model artifacts from training job
2. Construct path to manifest.json in the output directory
3. Read and parse manifest.json
4. Return checkpoint_s3_bucket value
1. Build the manifest.json path from the training job output_data_config
2. Read and parse manifest.json
3. Return checkpoint_s3_bucket value

Returns:
Checkpoint URI from manifest.json.
Expand All @@ -660,16 +659,15 @@ def _get_checkpoint_uri_from_manifest(self) -> Optional[str]:
if not isinstance(self.model, TrainingJob):
raise ValueError("Model must be a TrainingJob instance for Nova models")

s3_artifacts = self.model.model_artifacts.s3_model_artifacts
if not s3_artifacts:
raise ValueError("No S3 model artifacts found in training job")
# Nova serverless training jobs have no model_artifacts; the manifest
# lives under the job's output_data_config path.
output_data_config = getattr(self.model, "output_data_config", None)
s3_output_path = getattr(output_data_config, "s3_output_path", None)
if not s3_output_path:
raise ValueError("No S3 output path found in training job output_data_config")

logger.info("S3 artifacts path: %s", s3_artifacts)

# Construct manifest path
# s3://bucket/path/output/model.tar.gz -> s3://bucket/path/output/output/manifest.json
parts = s3_artifacts.rstrip("/").rsplit("/", 1)
manifest_path = parts[0] + "/output/manifest.json"
output_path = s3_output_path.rstrip("/")
manifest_path = f"{output_path}/{self.model.training_job_name}/output/output/manifest.json"

logger.info("Manifest path: %s", manifest_path)

Expand Down
3 changes: 3 additions & 0 deletions sagemaker-serve/src/sagemaker/serve/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4699,6 +4699,9 @@ def _resolve_nova_escrow_uri(self) -> str:
training_job = self.model
elif isinstance(self.model, ModelTrainer):
training_job = self.model._latest_training_job
elif isinstance(self.model, BaseTrainer) and hasattr(self.model, "_latest_training_job"):
# SFTTrainer / RLVRTrainer / DPOTrainer expose the job via _latest_training_job.
training_job = self.model._latest_training_job
else:
raise ValueError("Nova escrow URI resolution requires a TrainingJob or ModelTrainer")

Expand Down
Loading
Loading