Skip to content

Commit f45a0ce

Browse files
committed
Optimize NVIDIA GPU operator jobs
* Move arm64 tests to the respective version files
* Skip gather steps on success
* Mark job as successful even if a best effort step failed
* Use smaller instance types
* Run arm64 jobs on amd64 test farms
* Remove unused gather steps
1 parent 9a700a8 commit f45a0ce

13 files changed

Lines changed: 1043 additions & 878 deletions

ci-operator/config/rh-ecosystem-edge/nvidia-ci/.config.prowgen

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,6 @@ slack_reporter:
1414
{{end}}
1515
{{end}}
1616
job_names:
17-
- nvidia-gpu-operator-e2e-24-3-x
18-
- nvidia-gpu-operator-e2e-24-6-x
19-
- nvidia-gpu-operator-e2e-24-9-x
20-
- nvidia-gpu-operator-e2e-24-12-x
2117
- nvidia-gpu-operator-e2e-25-3-x
2218
- nvidia-gpu-operator-e2e-25-10-x
2319
- nvidia-gpu-operator-e2e-master
@@ -26,3 +22,4 @@ slack_reporter:
2622
- nvidia-network-operator-shared-device-eth-rdma
2723
- mirror-vcsref-image
2824
- nvidia-dra-gpu-e2e
25+
- nvidia-gpu-operator-e2e-arm64

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.14-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.14.0-0
16+
stream: 4-stable
17+
upper: 4.15.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.15-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.15.0-0
16+
stream: 4-stable
17+
upper: 4.16.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.16-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.16.0-0
16+
stream: 4-stable
17+
upper: 4.17.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.17-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.17.0-0
16+
stream: 4-stable
17+
upper: 4.18.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.18-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.18.0-0
16+
stream: 4-stable
17+
upper: 4.19.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.19-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.19.0-0
16+
stream: 4-stable
17+
upper: 4.20.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.20-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.20.0-0
16+
stream: 4-stable
17+
upper: 4.21.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.21-stable.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.21.0-0
16+
stream: 4-stable
17+
upper: 4.22.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -46,6 +54,19 @@ tests:
4654
BASE_DOMAIN: edge-sro.rhecoeng.com
4755
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
4856
workflow: nvidia-gpu-operator-e2e-aws
57+
- always_run: false
58+
as: nvidia-gpu-operator-e2e-arm64
59+
steps:
60+
cluster_profile: aws-edge-infra
61+
dependencies:
62+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
63+
env:
64+
AWS_REGION_OVERWRITE: us-east-1
65+
BASE_DOMAIN: edge-sro.rhecoeng.com
66+
COMPUTE_NODE_TYPE: g5g.2xlarge
67+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
68+
OCP_ARCH: arm64
69+
workflow: nvidia-gpu-operator-e2e-aws
4970
zz_generated_metadata:
5071
branch: main
5172
org: rh-ecosystem-edge

ci-operator/config/rh-ecosystem-edge/nvidia-ci/rh-ecosystem-edge-nvidia-ci-main__4.22-stable.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ images:
77
- dockerfile_path: Containerfile
88
to: nvidia-ci
99
releases:
10+
arm64-latest:
11+
prerelease:
12+
architecture: arm64
13+
product: ocp
14+
version_bounds:
15+
lower: 4.22.0-0
16+
stream: 4-stable
17+
upper: 4.23.0-0
1018
latest:
1119
prerelease:
1220
product: ocp
@@ -51,6 +59,21 @@ tests:
5159
NVIDIAGPU_GPU_FALLBACK_CATALOGSOURCE_INDEX_IMAGE: registry.redhat.io/redhat/certified-operator-index:v4.21
5260
NVIDIAGPU_SUBSCRIPTION_CHANNEL: v25.3
5361
workflow: nvidia-gpu-operator-e2e-aws
62+
- always_run: false
63+
as: nvidia-gpu-operator-e2e-arm64
64+
steps:
65+
cluster_profile: aws-edge-infra
66+
dependencies:
67+
OPENSHIFT_INSTALL_RELEASE_IMAGE_OVERRIDE: release:arm64-latest
68+
env:
69+
AWS_REGION_OVERWRITE: us-east-1
70+
BASE_DOMAIN: edge-sro.rhecoeng.com
71+
COMPUTE_NODE_TYPE: g5g.2xlarge
72+
CONTROL_PLANE_INSTANCE_TYPE: m6g.xlarge
73+
NFD_FALLBACK_CATALOGSOURCE_INDEX_IMAGE: registry.redhat.io/redhat/redhat-operator-index:v4.21
74+
NVIDIAGPU_GPU_FALLBACK_CATALOGSOURCE_INDEX_IMAGE: registry.redhat.io/redhat/certified-operator-index:v4.21
75+
OCP_ARCH: arm64
76+
workflow: nvidia-gpu-operator-e2e-aws
5477
zz_generated_metadata:
5578
branch: main
5679
org: rh-ecosystem-edge

0 commit comments

Comments
 (0)