From 7025c5c33ff3f858fb5559cbfe8c5f275f2690ac Mon Sep 17 00:00:00 2001
From: Lucas Jia
Date: Tue, 9 Jun 2026 16:38:05 -0700
Subject: [PATCH 1/3] chore: Change GPU integ tests schedule to daily 10 PM US
 Pacific

Replace the every-8-hours cron ("0 */8 * * *") with a daily run at
06:00 UTC ("0 6 * * *"), which corresponds to 10:00 PM US Pacific (PST).
---
 .github/workflows/gpu-integ-tests.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
index 9c9fb3e01b..19faabdc1c 100644
--- a/.github/workflows/gpu-integ-tests.yml
+++ b/.github/workflows/gpu-integ-tests.yml
@@ -1,7 +1,8 @@
 name: GPU Integ Tests
 on:
   schedule:
-    - cron: "0 */8 * * *"
+    # 10:00 PM US Pacific (PST, UTC-8) = 06:00 UTC next day
+    - cron: "0 6 * * *"
   workflow_dispatch:
 
 permissions:

From de503debbffc12544eb19bebea1bcfb925a535a4 Mon Sep 17 00:00:00 2001
From: Lucas Jia
Date: Wed, 10 Jun 2026 00:01:45 -0700
Subject: [PATCH 2/3] Add daily schedule, skip-on-success gate, and run-level
 alarm metric for GPU integ tests

Run at 12am/2am/4am PST; skip later runs once one succeeds; emit a run-level
pass/fail metric (both regions must pass) to CloudWatch for alarming.

X-AI-Prompt: Reschedule gpu integ tests to 3 nightly runs with skip-on-success and emit a run-level metric for a sev2.5 alarm when all runs fail
X-AI-Tool: kiro-cli
---
 .github/workflows/gpu-integ-tests.yml | 77 +++++++++++++++++++++++++--
 1 file changed, 74 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
index 19faabdc1c..caed9b28ff 100644
--- a/.github/workflows/gpu-integ-tests.yml
+++ b/.github/workflows/gpu-integ-tests.yml
@@ -1,15 +1,53 @@
 name: GPU Integ Tests
 on:
   schedule:
-    # 10:00 PM US Pacific (PST, UTC-8) = 06:00 UTC next day
-    - cron: "0 6 * * *"
+    # US Pacific (PST, UTC-8): 12:00 AM / 2:00 AM / 4:00 AM -> 08/10/12 UTC.
+    # All three fire within the same UTC day so the run-level CloudWatch metric
+    # (GpuIntegRunFailure) aggregates correctly per day.
+    - cron: "0 8 * * *"
+    - cron: "0 10 * * *"
+    - cron: "0 12 * * *"
   workflow_dispatch:
 
 permissions:
-  id-token: write # This is required for requesting the JWT
+  id-token: write  # This is required for requesting the JWT
+  actions: read    # required for the gate job to query prior runs of this workflow
 
 jobs:
+  # Gate: if an earlier scheduled run already succeeded today, skip the rest of
+  # today's scheduled runs. Manual (workflow_dispatch) runs always proceed.
+  check-prior-success:
+    runs-on: ubuntu-latest
+    outputs:
+      already_succeeded: ${{ steps.check.outputs.already_succeeded }}
+    steps:
+      - name: Check for a successful scheduled run earlier today
+        id: check
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          if [ "${{ github.event_name }}" != "schedule" ]; then
+            echo "Not a scheduled run; proceeding."
+            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          today=$(date -u +%Y-%m-%d)
+          count=$(gh api -X GET \
+            "/repos/${{ github.repository }}/actions/workflows/gpu-integ-tests.yml/runs" \
+            -f event=schedule \
+            -f status=success \
+            -f "created=>=${today}T00:00:00Z" \
+            --jq '.workflow_runs | length')
+          echo "Successful scheduled runs today: $count"
+          if [ "$count" -gt 0 ]; then
+            echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
+          fi
+
   gpu-integ-tests:
+    needs: check-prior-success
+    if: needs.check-prior-success.outputs.already_succeeded != 'true'
     runs-on: ubuntu-latest
     steps:
       - name: Configure AWS Credentials
@@ -25,6 +63,8 @@ jobs:
           source-version: refs/heads/master
 
   gpu-integ-tests-us-east-1:
+    needs: check-prior-success
+    if: needs.check-prior-success.outputs.already_succeeded != 'true'
     runs-on: ubuntu-latest
     steps:
       - name: Configure AWS Credentials (us-east-1)
@@ -38,3 +78,34 @@ jobs:
         with:
           project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
           source-version: refs/heads/master
+
+  # Run-level result: a run is successful only if BOTH region jobs succeeded.
+  # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
+  # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
+  # cuts a daytime sev2 when all of the day's runs failed. Skipped when the gate
+  # short-circuited today's run (an earlier run already succeeded).
+  report-result:
+    needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
+    if: always() && needs.check-prior-success.outputs.already_succeeded != 'true'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
+          aws-region: us-west-2
+      - name: Emit run-level pass/fail metric
+        run: |
+          if [ "${{ needs.gpu-integ-tests.result }}" == "success" ] && \
+             [ "${{ needs.gpu-integ-tests-us-east-1.result }}" == "success" ]; then
+            value=0
+            echo "Both region jobs succeeded; emitting GpuIntegRunFailure=0"
+          else
+            value=1
+            echo "At least one region job did not succeed; emitting GpuIntegRunFailure=1"
+          fi
+          aws cloudwatch put-metric-data \
+            --namespace GpuIntegRunMetrics \
+            --metric-name GpuIntegRunFailure \
+            --value "$value" \
+            --unit Count

From 27c2f29fbb4f1491e22c841546e36d05d0c0c859 Mon Sep 17 00:00:00 2001
From: Lucas Jia
Date: Thu, 11 Jun 2026 16:25:17 -0700
Subject: [PATCH 3/3] Harden gpu-integ-tests metric emission for daily failure
 alarm

Tighten the report-result gate so the daily GpuIntegRunFailure metric is
only emitted for scheduled runs that actually executed the test jobs:

- require check-prior-success to have succeeded, so skipped test jobs
  from a failed gate are not misread as a failure under always()
- skip the report-result job entirely for manual workflow_dispatch runs
  (job-level github.event_name check) so they do not count toward the
  "all of today's scheduled runs failed" alarm and never assume the
  monitoring role
---
Reviewer note: amended in review. The manual-run check moved from the shell
step to the job-level `if:`. The commit id in the mbox header and the
post-image blob id on the `index` line still refer to the pre-review
revision; regenerate with `git format-patch` after committing.

 .github/workflows/gpu-integ-tests.yml | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu-integ-tests.yml b/.github/workflows/gpu-integ-tests.yml
index caed9b28ff..00fcb611fc 100644
--- a/.github/workflows/gpu-integ-tests.yml
+++ b/.github/workflows/gpu-integ-tests.yml
@@ -86,7 +86,21 @@ jobs:
   # short-circuited today's run (an earlier run already succeeded).
   report-result:
     needs: [check-prior-success, gpu-integ-tests, gpu-integ-tests-us-east-1]
-    if: always() && needs.check-prior-success.outputs.already_succeeded != 'true'
+    # Only emit the daily alarm metric for scheduled runs that actually executed
+    # the test jobs:
+    # - github.event_name == 'schedule': manual (workflow_dispatch) runs must not
+    #   count toward the "all of today's scheduled runs failed" alarm; gating at
+    #   the job level also avoids assuming the monitoring role for them.
+    # - check-prior-success.result == 'success': if the gate job itself failed,
+    #   the test jobs are skipped; without this guard always() would still run
+    #   report-result and read those skips as a (false) failure -> emit 1.
+    # - already_succeeded != 'true': an earlier run today already passed, so the
+    #   gate short-circuited this run; nothing to report.
+    if: >-
+      always()
+      && github.event_name == 'schedule'
+      && needs.check-prior-success.result == 'success'
+      && needs.check-prior-success.outputs.already_succeeded != 'true'
     runs-on: ubuntu-latest
     steps:
       - name: Configure AWS Credentials