Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions .github/workflows/gpu-integ-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,36 @@ jobs:
project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
source-version: refs/heads/master

# Bedrock model-import integ tests. Run serially (concurrency 1) in their own
# CodeBuild project because the "Concurrent model import jobs" Bedrock quota is
# fixed at 1 and not raisable; running them in parallel (as PR checks did)
# makes them collide and flake. us-west-2 only (no us_east_1-marked tests).
# Folded into the same run-level pass/fail metric as the GPU jobs (emitted by
# the report-result job below), so it shares the GpuIntegRunAlarm rather than
# getting a separate alarm.
import-model-integ-tests:
  runs-on: ubuntu-latest
  # Gate: wait for check-prior-success and run only when it did not report
  # that an earlier run already succeeded.
  needs: check-prior-success
  if: needs.check-prior-success.outputs.already_succeeded != 'true'
  steps:
    # Step 1: assume the CI role in us-west-2.
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v4
      with:
        aws-region: us-west-2
        role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
        # 10800 s = 3 h credential lifetime for the build step that follows.
        role-duration-seconds: 10800

    # Step 2: start the dedicated CodeBuild project against master.
    - name: Run Bedrock Model-Import Integ Tests
      uses: aws-actions/aws-codebuild-run-build@v1
      with:
        source-version: refs/heads/master
        project-name: sagemaker-python-sdk-ci-health-import-model-integ-tests

# Run-level result: a run is successful only if BOTH region jobs AND the
# import-model job succeeded.
# Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
# us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
# cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
# short-circuited today's run (an earlier run already succeeded).
report-result:
needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1, import-model-integ-tests]
# Only emit the daily alarm metric for scheduled runs that actually executed
# the test jobs:
# - check-prior-success.result == 'success': if the gate job itself failed,
Expand All @@ -111,12 +134,13 @@ jobs:
exit 0
fi
if [ "${{ needs.gpu-integ-tests.result }}" == "success" ] && \
[ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ]; then
[ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ] && \
[ "${{ needs.import-model-integ-tests.result }}" == "success" ]; then
value=0
echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
echo "All region/import jobs succeeded; emitting GpuIntegRunFailure=0"
else
value=1
echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
echo "At least one region/import job did not succeed; emitting GpuIntegRunFailure=1"
fi
aws cloudwatch put-metric-data \
--namespace GpuIntegRunMetrics \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ def _setup_model_files(s3_artifacts_uri, s3_client):


@pytest.mark.serial
@pytest.mark.import_model
class TestBedrockImportJobPolling:
"""Test import job polling for OSS models (Option C: deploy only waits for import)."""

Expand Down Expand Up @@ -236,6 +237,7 @@ def test_deploy_oss_model_waits_for_import_completion(


@pytest.mark.serial
@pytest.mark.import_model
class TestBedrockProvisionedThroughput:
"""Test create_provisioned_throughput as a standalone method.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,7 @@ def test_bedrock_model_builder_creation(self, training_job):
f"BedrockModelBuilder creation failed: {str(e)}. This might be due to sagemaker-core integration issues.")

@pytest.mark.slow
@pytest.mark.import_model
def test_bedrock_job_created(self, deployed_model_arn):
"""Test that Bedrock import job was created successfully."""
assert deployed_model_arn is not None
Expand All @@ -583,6 +584,7 @@ def test_bedrock_job_created(self, deployed_model_arn):
# Documentation recommends retries: https://docs.aws.amazon.com/bedrock/latest/userguide/invoke-imported-model.html#handle-model-not-ready-exception.
# TODO: Fix using provisioned throughput or better wait mechanism
@pytest.mark.slow
@pytest.mark.import_model
def test_bedrock_model_invoke(self, deployed_model_arn, bedrock_runtime):
logger.warning(
"This test is known to be flaky due to 'model not ready' exceptions from Bedrock. "
Expand Down
1 change: 1 addition & 0 deletions sagemaker-serve/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ markers =
timeout: mark a test as a timeout.
gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks).
us_east_1: mark a test that requires us-east-1 test account credentials (784379639078).
import_model: mark a test that creates a Bedrock model import job. Concurrent model import jobs are capped at 1 by a non-raisable Bedrock service quota, so these run serially in a dedicated scheduled CI run, not in PR checks.

[testenv]
setenv =
Expand Down
Loading