-
Notifications
You must be signed in to change notification settings - Fork 1.3k
149 lines (143 loc) · 6.49 KB
/
Copy pathgpu-integ-tests.yml
File metadata and controls
149 lines (143 loc) · 6.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
name: GPU Integ Tests
on:
  schedule:
    # Three scheduled attempts per day at 06:00, 09:00 and 12:00 UTC, which is
    # 10:00 PM / 1:00 AM / 4:00 AM US Pacific (PST, UTC-8).
    # All three land inside the same UTC day so that the run-level CloudWatch
    # metric (GpuIntegRunFailure) aggregates correctly per day.
    - cron: '0 6 * * *'
    - cron: '0 9 * * *'
    - cron: '0 12 * * *'
  # Manual trigger; takes no inputs.
  workflow_dispatch:
permissions:
  id-token: write  # needed so jobs can request the JWT
  actions: read  # lets the gate job query prior runs of this workflow
jobs:
  # Gate: if an earlier scheduled run already succeeded today (UTC), skip the
  # rest of today's scheduled runs. Manual (workflow_dispatch) runs always
  # proceed.
  check-prior-success:
    runs-on: ubuntu-latest
    outputs:
      # 'true' when a successful scheduled run was already found for today,
      # 'false' otherwise (including every non-scheduled run).
      already_succeeded: ${{ steps.check.outputs.already_succeeded }}
    steps:
      - name: Check for a successful scheduled run earlier today
        id: check
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # GITHUB_EVENT_NAME and GITHUB_REPOSITORY are provided by the runner;
          # reading them from the environment keeps workflow expressions out
          # of the shell source.
          if [ "$GITHUB_EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run; proceeding."
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          # Start of the current UTC day; only runs created since then count.
          today=$(date -u +%Y-%m-%d)
          # -X GET turns the -f fields into query-string filters.
          count=$(gh api -X GET \
            "/repos/${GITHUB_REPOSITORY}/actions/workflows/gpu-integ-tests.yml/runs" \
            -f event=schedule \
            -f status=success \
            -f "created=>=${today}T00:00:00Z" \
            --jq '.workflow_runs | length')
          echo "Successful scheduled runs today: $count"
          if [ "$count" -gt 0 ]; then
            echo "already_succeeded=true" >> "$GITHUB_OUTPUT"
          else
            echo "already_succeeded=false" >> "$GITHUB_OUTPUT"
          fi

  # GPU integ tests via CodeBuild in us-west-2.
  gpu-integ-tests:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
          aws-region: us-west-2
          # 3-hour session for the assumed role.
          role-duration-seconds: 10800
      - name: Run GPU Integ Tests
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # GPU integ tests via CodeBuild in us-east-1 (same project name, separate
  # role).
  gpu-integ-tests-us-east-1:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials (us-east-1)
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_US_EAST_1_ARN }}
          aws-region: us-east-1
          # 3-hour session for the assumed role.
          role-duration-seconds: 10800
      - name: Run GPU Integ Tests (us-east-1)
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-gpu-integ-tests
          source-version: refs/heads/master

  # Bedrock model-import integ tests. Run serially (concurrency 1) in their own
  # CodeBuild project because the "Concurrent model import jobs" Bedrock quota
  # is fixed at 1 and not raisable; running them in parallel (as PR checks did)
  # makes them collide and flake. us-west-2 only (no us_east_1-marked tests).
  # Folded into the same run-level pass/fail metric as the GPU jobs above, so
  # it shares the GpuIntegRunAlarm rather than getting a separate alarm.
  import-model-integ-tests:
    needs: check-prior-success
    if: needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.CI_AWS_ROLE_ARN }}
          aws-region: us-west-2
          # 3-hour session for the assumed role.
          role-duration-seconds: 10800
      - name: Run Bedrock Model-Import Integ Tests
        uses: aws-actions/aws-codebuild-run-build@v1
        with:
          project-name: sagemaker-python-sdk-ci-health-import-model-integ-tests
          source-version: refs/heads/master

  # Run-level result: a run is successful only if ALL THREE test jobs succeeded
  # (both GPU region jobs and the Bedrock model-import job).
  # Emits GpuIntegRunFailure = 1 (failed) / 0 (succeeded) to CloudWatch in
  # us-west-2. The CDK alarm (GpuIntegRunAlarm) sums this over a UTC day and
  # cuts a daytime sev2 when all of the day's runs failed. Skipped when the
  # gate short-circuited today's run (an earlier run already succeeded).
  report-result:
    needs:
      - check-prior-success
      - gpu-integ-tests
      - gpu-integ-tests-us-east-1
      - import-model-integ-tests
    # Only report for runs that actually executed the test jobs:
    #   - check-prior-success.result == 'success': if the gate job itself
    #     failed, the test jobs are skipped; without this guard always() would
    #     still run report-result and read those skips as a (false) failure
    #     -> emit 1.
    #   - already_succeeded != 'true': an earlier run today already passed, so
    #     the gate short-circuited this run; nothing to report.
    # The scheduled-vs-manual distinction is handled inside the metric step.
    if: always() && needs.check-prior-success.result == 'success' && needs.check-prior-success.outputs.already_succeeded != 'true'
    runs-on: ubuntu-latest
    steps:
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.MONITORING_ROLE_ARN }}
          aws-region: us-west-2
      - name: Emit run-level pass/fail metric
        env:
          # Job results are handed to the script as data instead of being
          # spliced into the shell source.
          GPU_WEST_RESULT: ${{ needs.gpu-integ-tests.result }}
          GPU_EAST_RESULT: ${{ needs.gpu-integ-tests-us-east-1.result }}
          IMPORT_MODEL_RESULT: ${{ needs.import-model-integ-tests.result }}
        run: |
          # Manual (workflow_dispatch) runs must not contribute to the daily
          # GpuIntegRunFailure count that drives GpuIntegRunAlarm; only scheduled
          # runs count toward the "all of today's scheduled runs failed" alarm.
          if [ "$GITHUB_EVENT_NAME" != "schedule" ]; then
            echo "Not a scheduled run (${GITHUB_EVENT_NAME}); skipping metric emission."
            exit 0
          fi
          # Any result other than "success" (failure, cancelled, skipped)
          # counts as a failed run.
          if [ "$GPU_WEST_RESULT" = "success" ] && \
             [ "$GPU_EAST_RESULT" = "success" ] && \
             [ "$IMPORT_MODEL_RESULT" = "success" ]; then
            value=0
            echo "All region/import jobs succeeded; emitting GpuIntegRunFailure=0"
          else
            value=1
            echo "At least one region/import job did not succeed; emitting GpuIntegRunFailure=1"
          fi
          aws cloudwatch put-metric-data \
            --namespace GpuIntegRunMetrics \
            --metric-name GpuIntegRunFailure \
            --value "$value" \
            --unit Count