From 062f0066ca836f07020ba8b54435f1d40b2faf64 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain
Date: Sat, 6 Dec 2025 08:56:20 -0800
Subject: [PATCH 1/4] terraform: Add tier-based GPU selection for Lambda Labs

Add support for tier-based GPU instance selection for Lambda Labs,
similar to the existing DataCrunch implementation. This allows users
to specify a maximum GPU tier; the system then automatically selects
the highest available GPU within that tier.

The implementation adds capacity-checking and tier-selection scripts
that query the Lambda Labs API to find available instances. Single-GPU
tier groups fall back from GH200 through H100, A100, A6000, and
RTX6000 down to A10. Multi-GPU tier groups fall back from 8x B200
through 8x H100, 8x A100-80GB, and 8x A100 down to 8x V100.

New Kconfig options provide tier-based selections like H100_OR_LESS
and 8X_H100_OR_LESS. The terraform Ansible tasks detect these wildcard
types and invoke the tier selection script to find available capacity
before provisioning.

Defconfigs are provided for common tier combinations to simplify
usage. Users can now run:

    make defconfig-lambdalabs-h100-or-less

to get the best available single GPU up to the H100 tier.
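
For illustration, the tier selection script prints a single
"instance_type region" pair on stdout, which the Ansible tasks then
parse; the region below is hypothetical, since results depend on live
capacity:

    $ scripts/lambdalabs_select_tier.py 8x-h100-or-less
    gpu_8x_h100_sxm5 us-east-1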

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain
Signed-off-by: Chuck Lever
---
 defconfigs/lambdalabs-8x-b200-or-less         |  14 +
 defconfigs/lambdalabs-8x-h100-or-less         |  14 +
 defconfigs/lambdalabs-a100-or-less            |  14 +
 defconfigs/lambdalabs-gh200-or-less           |  14 +
 defconfigs/lambdalabs-h100-or-less            |  14 +
 playbooks/roles/terraform/tasks/main.yml      | 105 +++++-
 scripts/lambdalabs_check_capacity.py          | 124 +++++++
 scripts/lambdalabs_select_tier.py             | 319 +++++++++++++++++
 terraform/lambdalabs/kconfigs/Kconfig.compute |  68 ++++
 9 files changed, 685 insertions(+), 1 deletion(-)
 create mode 100644 defconfigs/lambdalabs-8x-b200-or-less
 create mode 100644 defconfigs/lambdalabs-8x-h100-or-less
 create mode 100644 defconfigs/lambdalabs-a100-or-less
 create mode 100644 defconfigs/lambdalabs-gh200-or-less
 create mode 100644 defconfigs/lambdalabs-h100-or-less
 create mode 100755 scripts/lambdalabs_check_capacity.py
 create mode 100755 scripts/lambdalabs_select_tier.py

diff --git a/defconfigs/lambdalabs-8x-b200-or-less b/defconfigs/lambdalabs-8x-b200-or-less
new file mode 100644
index 000000000..5205d2b4f
--- /dev/null
+++ b/defconfigs/lambdalabs-8x-b200-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs 8-GPU with tier-based fallback (B200 maximum tier)
+# Uses 8X_B200_OR_LESS for best available 8-GPU up to B200
+# Fallback order: 8x B200 → 8x H100 → 8x A100-80 → 8x A100 → 8x V100
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-8x-h100-or-less b/defconfigs/lambdalabs-8x-h100-or-less
new file mode 100644
index 000000000..09bb5b6e0
--- /dev/null
+++ b/defconfigs/lambdalabs-8x-h100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs 8-GPU with tier-based fallback (H100 maximum tier)
+# Uses 8X_H100_OR_LESS for best available 8-GPU up to H100
+# Fallback order: 8x H100 → 8x A100-80 → 8x A100 → 8x V100
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-a100-or-less b/defconfigs/lambdalabs-a100-or-less
new file mode 100644
index 000000000..57ddc9a05
--- /dev/null
+++ b/defconfigs/lambdalabs-a100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (A100 maximum tier)
+# Uses A100_OR_LESS for best available single GPU up to A100
+# Fallback order: A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-gh200-or-less b/defconfigs/lambdalabs-gh200-or-less
new file mode 100644
index 000000000..17263d3da
--- /dev/null
+++ b/defconfigs/lambdalabs-gh200-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (GH200 maximum tier)
+# Uses GH200_OR_LESS for best available single GPU up to GH200
+# Fallback order: GH200 → H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-h100-or-less b/defconfigs/lambdalabs-h100-or-less
new file mode 100644
index 000000000..8af59c837
--- /dev/null
+++ b/defconfigs/lambdalabs-h100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (H100 maximum tier)
+# Uses H100_OR_LESS for best available single GPU up to H100
+# Fallback order: H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/playbooks/roles/terraform/tasks/main.yml b/playbooks/roles/terraform/tasks/main.yml
index f2aa248fe..c8605feaa 100644
--- a/playbooks/roles/terraform/tasks/main.yml
+++ b/playbooks/roles/terraform/tasks/main.yml
@@ -91,11 +91,111 @@
     - destroy
     - status

+- name: Auto-select Lambda Labs instance type for tier-based wildcards
+  ansible.builtin.shell:
+    cmd: |
+      case "{{ terraform_lambdalabs_instance_type }}" in
+        GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS)
+          # Use tier-based selection script for single GPU
+          tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+          {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose
+          ;;
+        8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS)
+          # Use tier-based selection script for 8x GPU
+          tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+          {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose
+          ;;
+        *)
+          echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}"
+          exit 1
+          ;;
+      esac
+  register: lambdalabs_auto_instance_type
+  failed_when: false
+  changed_when: false
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+  tags:
+    - bringup
+
+- name: Fail if no Lambda Labs instances available for wildcard selection
+  ansible.builtin.fail:
+    msg: |
+      No GPU instances available for {{ terraform_lambdalabs_instance_type }}
+
+      {{ lambdalabs_auto_instance_type.stderr }}
+
+      Try:
+      - Wait and retry (capacity changes frequently)
+      - Check Lambda Labs dashboard: https://cloud.lambdalabs.com
+      - Use a different tier group via menuconfig
+      - Check capacity manually: scripts/lambdalabs_check_capacity.py
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc != 0
+  tags:
+    - bringup
+
+- name: Parse Lambda Labs auto-selected instance type and region
+  ansible.builtin.set_fact:
+    lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}"
+    lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Report Lambda Labs auto-selected instance type for wildcards
+  ansible.builtin.debug:
+    msg: "Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} in region: {{ lambdalabs_auto_selected_region }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Update Lambda Labs terraform vars with auto-selected instance type
+  ansible.builtin.lineinfile:
+    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
+    regexp: '^lambdalabs_instance_type\s*='
+    line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"'
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Update Lambda Labs terraform vars with auto-selected region
+  ansible.builtin.lineinfile:
+    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
+    regexp: '^lambdalabs_region\s*='
+    line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"'
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Set Lambda Labs resolved instance type for subsequent tasks
+  ansible.builtin.set_fact:
+    lambdalabs_resolved_instance_type: "{{ lambdalabs_auto_selected_instance if (terraform_lambdalabs_instance_type in ['GH200_OR_LESS', 'H100_OR_LESS', 'A100_OR_LESS', 'A6000_OR_LESS', '8X_B200_OR_LESS', '8X_H100_OR_LESS', '8X_A100_OR_LESS'] and lambdalabs_auto_instance_type.rc == 0) else terraform_lambdalabs_instance_type }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+  tags:
+    - bringup
+
 - name: Check Lambda Labs capacity before provisioning (if using Lambda Labs)
   ansible.builtin.shell:
     cmd: |
       {{ topdir_path }}/scripts/lambda-cli --output json check-availability \
-        {{ terraform_lambdalabs_instance_type }} {{ terraform_lambdalabs_region }} | \
+        {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \
       python3 -c "
       import sys, json
       data = json.load(sys.stdin)
@@ -113,6 +213,7 @@
   changed_when: false
   when:
     - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type not in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
   tags:
     - bringup

@@ -121,6 +222,8 @@
     msg: "{{ capacity_check.stdout }}"
   when:
     - kdevops_terraform_provider == "lambdalabs"
+    - capacity_check is defined
+    - capacity_check.rc is defined
    - capacity_check.rc != 0
   tags:
     - bringup
diff --git a/scripts/lambdalabs_check_capacity.py b/scripts/lambdalabs_check_capacity.py
new file mode 100755
index 000000000..b6c05641e
--- /dev/null
+++ b/scripts/lambdalabs_check_capacity.py
@@ -0,0 +1,124 @@
+""" + +import argparse +import json +import os +import sys + +# Import our Lambda Labs API module +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, SCRIPT_DIR) + +from lambdalabs_api import get_api_key, get_instance_types_with_capacity + + +def check_availability(instance_type=None, json_output=False, pick_first=False): + """Check instance availability across all regions.""" + api_key = get_api_key() + if not api_key: + sys.stderr.write("Error: Lambda Labs API key not found\n") + sys.stderr.write("Set LAMBDALABS_API_KEY or create ~/.lambdalabs/credentials\n") + sys.exit(1) + + instance_data, capacity_map = get_instance_types_with_capacity(api_key) + + if not capacity_map: + sys.stderr.write("Error: Could not fetch instance availability\n") + sys.exit(1) + + if instance_type: + # Check specific instance type + regions = capacity_map.get(instance_type, []) + if pick_first: + if regions: + print(regions[0]) + return 0 + return 1 + + if json_output: + result = [{"instance_type": instance_type, "regions": regions}] + print(json.dumps(result, indent=2)) + else: + if regions: + print(f"{instance_type}:") + for region in regions: + print(f" • {region}") + else: + print(f"{instance_type}: No capacity available") + return 0 if regions else 1 + else: + # Show all GPU instances with capacity + results = [] + gpu_instances = { + k: v for k, v in capacity_map.items() if k.startswith("gpu_") and v + } + + if json_output: + # Format for tier selection script compatibility + # Group by region for consistency with DataCrunch format + region_map = {} + for inst_type, regions in gpu_instances.items(): + for region in regions: + if region not in region_map: + region_map[region] = [] + region_map[region].append(inst_type) + + results = [ + {"location": region, "instances": instances} + for region, instances in sorted(region_map.items()) + ] + print(json.dumps(results, indent=2)) + else: + print("GPU Instance Availability:\n") + + # Group by region + region_map = {} + for inst_type, regions in gpu_instances.items(): + for region in regions: + if region not in region_map: + region_map[region] = [] + region_map[region].append(inst_type) + + for region in sorted(region_map.keys()): + print(f"📍 {region}:") + for inst in sorted(region_map[region]): + print(f" • {inst}") + print() + + if not region_map: + print("No GPU instances currently available") + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="Check Lambda Labs instance availability" + ) + parser.add_argument( + "--instance-type", + "-i", + help="Check specific instance type (e.g., gpu_1x_h100_sxm5)", + ) + parser.add_argument( + "--json", "-j", action="store_true", help="Output in JSON format" + ) + parser.add_argument( + "--pick-first", + action="store_true", + help="Return first available region (for scripts)", + ) + + args = parser.parse_args() + sys.exit(check_availability(args.instance_type, args.json, args.pick_first)) + + +if __name__ == "__main__": + main() diff --git a/scripts/lambdalabs_select_tier.py b/scripts/lambdalabs_select_tier.py new file mode 100755 index 000000000..305b26aec --- /dev/null +++ b/scripts/lambdalabs_select_tier.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: copyleft-next-0.3.1 +""" +Lambda Labs GPU tier-based instance selection. + +This script implements a tiered fallback system for selecting GPU instances +based on availability. 
+"""
+
+import argparse
+import json
+import os
+import sys
+from typing import Dict, List, Optional
+
+# Import our Lambda Labs API module
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, SCRIPT_DIR)
+
+from lambdalabs_api import get_api_key, get_instance_types_with_capacity
+
+
+# GPU tier definitions for single GPU instances (highest to lowest performance)
+GPU_TIERS_1X = {
+    "gh200": [
+        "gpu_1x_gh200",  # NVIDIA GH200 Grace Hopper Superchip
+    ],
+    "h100-sxm": [
+        "gpu_1x_h100_sxm5",  # H100 SXM5 (highest bandwidth)
+    ],
+    "h100-pcie": [
+        "gpu_1x_h100_pcie",  # H100 PCIe
+    ],
+    "a100-sxm": [
+        "gpu_1x_a100_sxm4",  # A100 SXM4
+    ],
+    "a100": [
+        "gpu_1x_a100",  # A100 (PCIe variant)
+    ],
+    "a6000": [
+        "gpu_1x_a6000",  # RTX A6000
+    ],
+    "rtx6000": [
+        "gpu_1x_rtx6000",  # RTX 6000
+    ],
+    "a10": [
+        "gpu_1x_a10",  # A10 (budget option)
+    ],
+}
+
+# Tier ordering for single GPU (highest to lowest)
+TIER_ORDER_1X = [
+    "gh200",
+    "h100-sxm",
+    "h100-pcie",
+    "a100-sxm",
+    "a100",
+    "a6000",
+    "rtx6000",
+    "a10",
+]
+
+# GPU tier definitions for 8x GPU instances
+GPU_TIERS_8X = {
+    "b200": [
+        "gpu_8x_b200_sxm6",  # 8x B200 (Blackwell)
+    ],
+    "h100": [
+        "gpu_8x_h100_sxm5",  # 8x H100 SXM5
+    ],
+    "a100-80": [
+        "gpu_8x_a100_80gb_sxm4",  # 8x A100 80GB
+    ],
+    "a100": [
+        "gpu_8x_a100",  # 8x A100
+    ],
+    "v100": [
+        "gpu_8x_v100",  # 8x V100 (legacy)
+    ],
+}
+
+TIER_ORDER_8X = [
+    "b200",
+    "h100",
+    "a100-80",
+    "a100",
+    "v100",
+]
+
+# Pre-defined tier groups for single GPU
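+# Slicing from the cap tier's index keeps that tier and everything below it.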
+TIER_GROUPS_1X = {
+    "gh200-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("gh200") :],
+    "h100-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("h100-sxm") :],
+    "a100-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("a100-sxm") :],
+    "a6000-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("a6000") :],
+}
+
+# Pre-defined tier groups for 8x GPU
+TIER_GROUPS_8X = {
+    "8x-b200-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("b200") :],
+    "8x-h100-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("h100") :],
+    "8x-a100-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("a100-80") :],
+}
+
+# Combined tier groups
+TIER_GROUPS = {**TIER_GROUPS_1X, **TIER_GROUPS_8X}
+
+
+def get_capacity_map(api_key: str) -> Dict[str, List[str]]:
+    """
+    Get GPU instance capacity map.
+
+    Returns:
+        Dictionary mapping instance_type to list of available regions
+        Example: {"gpu_1x_h100_sxm5": ["us-west-1", "us-east-1"]}
+    """
+    _, capacity_map = get_instance_types_with_capacity(api_key)
+    return capacity_map
+
+
+def check_instance_availability(
+    instance_type: str, capacity_map: Dict[str, List[str]]
+) -> Optional[str]:
+    """
+    Check if a specific instance type has available capacity in any region.
+
+    Returns:
+        The region where the instance is available, or None if not available
+    """
+    regions = capacity_map.get(instance_type, [])
+    return regions[0] if regions else None
+
+
+def select_instance_from_tiers(
+    tier_group: str, verbose: bool = False
+) -> Optional[Dict[str, str]]:
+    """
+    Select the highest-tier available instance from a tier group.
+
+    Returns:
+        Dictionary with 'instance_type' and 'region' keys, or None if unavailable
+        Example: {"instance_type": "gpu_1x_h100_sxm5", "region": "us-west-1"}
+    """
+    if tier_group not in TIER_GROUPS:
+        if verbose:
+            print(f"Error: Unknown tier group '{tier_group}'", file=sys.stderr)
+            print(
+                f"Available tier groups: {', '.join(sorted(TIER_GROUPS.keys()))}",
+                file=sys.stderr,
+            )
+        return None
+
+    api_key = get_api_key()
+    if not api_key:
+        if verbose:
+            print("Error: Lambda Labs API key not found", file=sys.stderr)
+        return None
+
+    capacity_map = get_capacity_map(api_key)
+
+    if verbose and capacity_map:
+        gpu_capacity = {
+            k: v for k, v in capacity_map.items() if v and k.startswith("gpu_")
+        }
+        if gpu_capacity:
+            print("Available GPU capacity:", file=sys.stderr)
+            for inst_type, regions in sorted(gpu_capacity.items()):
+                print(f"  {inst_type}: {', '.join(regions)}", file=sys.stderr)
+            print("", file=sys.stderr)
+
+    tiers_to_check = TIER_GROUPS[tier_group]
+
+    # Determine which GPU_TIERS dict to use
+    is_8x = tier_group.startswith("8x-")
+    gpu_tiers = GPU_TIERS_8X if is_8x else GPU_TIERS_1X
+
+    if verbose:
+        print(f"Checking tier group: {tier_group}", file=sys.stderr)
+        print(
+            f"Tiers to check (highest to lowest): {', '.join(tiers_to_check)}",
+            file=sys.stderr,
+        )
+        print("", file=sys.stderr)
+
+    for tier in tiers_to_check:
+        if tier not in gpu_tiers:
+            continue
+
+        instance_types = gpu_tiers[tier]
+
+        if verbose:
+            print(
+                f"Checking tier '{tier}': {', '.join(instance_types)}", file=sys.stderr
+            )
+
+        for instance_type in instance_types:
+            if verbose:
+                print(f"  Checking {instance_type}...", end=" ", file=sys.stderr)
+
+            region = check_instance_availability(instance_type, capacity_map)
+            if region:
+                if verbose:
+                    print(f"✓ AVAILABLE in {region}", file=sys.stderr)
+                    print("", file=sys.stderr)
+                    print(
+                        f"Selected: {instance_type} in {region} (tier: {tier})",
+                        file=sys.stderr,
+                    )
+                return {"instance_type": instance_type, "region": region}
+
+            if verbose:
+                print("✗ not available", file=sys.stderr)
+
+        if verbose:
+            print("", file=sys.stderr)
+
+    if verbose:
+        print(
+            "Error: No instances available in any tier",
+            file=sys.stderr,
+        )
+
+    return None
+
+
+def list_tier_groups():
+    """Print available tier groups and their contents."""
+    print("Available tier groups:\n")
+
+    print("Single GPU (1x) tiers:")
+    for group_name in sorted(TIER_GROUPS_1X.keys()):
+        tiers = TIER_GROUPS_1X[group_name]
+        print(f"  {group_name}:")
+        for tier in tiers:
+            instance_types = GPU_TIERS_1X.get(tier, [])
+            print(f"    - {tier}: {', '.join(instance_types)}")
+    print("")
+
+    print("Multi-GPU (8x) tiers:")
+    for group_name in sorted(TIER_GROUPS_8X.keys()):
+        tiers = TIER_GROUPS_8X[group_name]
+        print(f"  {group_name}:")
+        for tier in tiers:
+            instance_types = GPU_TIERS_8X.get(tier, [])
+            print(f"    - {tier}: {', '.join(instance_types)}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Select Lambda Labs GPU instance using tier-based fallback",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Select best available single GPU up to H100
+  %(prog)s h100-or-less
+
+  # Select best available single GPU up to GH200
+  %(prog)s gh200-or-less
+
+  # Select best available 8x GPU up to H100
+  %(prog)s 8x-h100-or-less
+
+  # List all tier groups
+  %(prog)s --list-tiers
+
+  # Verbose mode to see selection process
+  %(prog)s h100-or-less --verbose
+""",
+    )
+
+    parser.add_argument(
+        "tier_group",
+        nargs="?",
+        help="Tier group to select from (e.g., h100-or-less, gh200-or-less)",
+    )
+
+    parser.add_argument(
+        "--list-tiers",
+        action="store_true",
+        help="List all available tier groups and exit",
+    )
+
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Print detailed selection process",
+    )
+
+    args = parser.parse_args()
+
+    if args.list_tiers:
+        list_tier_groups()
+        return 0
+
+    if not args.tier_group:
+        parser.print_help()
+        return 1
+
+    result = select_instance_from_tiers(args.tier_group, args.verbose)
+
+    if result:
+        # Output format: instance_type region
+        print(f"{result['instance_type']} {result['region']}")
+        return 0
+    else:
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/terraform/lambdalabs/kconfigs/Kconfig.compute b/terraform/lambdalabs/kconfigs/Kconfig.compute
index 579e7207d..f761eafc0 100644
--- a/terraform/lambdalabs/kconfigs/Kconfig.compute
+++ b/terraform/lambdalabs/kconfigs/Kconfig.compute
@@ -10,12 +10,80 @@ endif # TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST
 # Include dynamically generated instance types when not using smart cheapest selection
 if !TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST
 source "terraform/lambdalabs/kconfigs/Kconfig.compute.generated"
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS
+	bool "GH200_OR_LESS - Best available up to GH200 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: GH200 → H100-SXM → H100-PCIe → A100-SXM →
+	  A100 → A6000 → RTX6000 → A10.
+
+	  Use this for maximum single-GPU performance with automatic fallback
+	  when top-tier options are unavailable.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS
+	bool "H100_OR_LESS - Best available up to H100 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: H100-SXM → H100-PCIe → A100-SXM → A100 →
+	  A6000 → RTX6000 → A10.
+
+	  Use when you want the best available single GPU up to H100.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS
+	bool "A100_OR_LESS - Best available up to A100 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: A100-SXM → A100 → A6000 → RTX6000 → A10.
+
+	  Use for cost-effective GPU provisioning with A100 as the maximum tier.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A6000_OR_LESS
+	bool "A6000_OR_LESS - Best available up to A6000 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: A6000 → RTX6000 → A10.
+
+	  Budget-friendly option with A6000 as maximum tier.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS
+	bool "8X_B200_OR_LESS - Best available 8-GPU up to B200 (tier-based)"
+	help
+	  Tier-based selection for 8-GPU instances.
+	  Tries tiers in order: 8x B200 → 8x H100 → 8x A100-80 → 8x A100 → 8x V100.
+
+	  Use for maximum multi-GPU performance with automatic fallback.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS
+	bool "8X_H100_OR_LESS - Best available 8-GPU up to H100 (tier-based)"
+	help
+	  Tier-based selection for 8-GPU instances.
+	  Tries tiers in order: 8x H100 → 8x A100-80 → 8x A100 → 8x V100.
+
+	  Use when you need 8 GPUs with H100 as the maximum tier.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_A100_OR_LESS
+	bool "8X_A100_OR_LESS - Best available 8-GPU up to A100 (tier-based)"
+	help
+	  Tier-based selection for 8-GPU instances.
+	  Tries tiers in order: 8x A100-80 → 8x A100 → 8x V100.
+
+	  Cost-effective 8-GPU option with A100 as maximum tier.
+
 endif

 config TERRAFORM_LAMBDALABS_INSTANCE_TYPE
 	string
 	output yaml
 	default $(shell, python3 scripts/lambdalabs_smart_inference.py instance) if TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST
+	# Tier-based instance type mappings
+	default "GH200_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS
+	default "H100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS
+	default "A100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS
+	default "A6000_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A6000_OR_LESS
+	default "8X_B200_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS
+	default "8X_H100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS
+	default "8X_A100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_A100_OR_LESS
 	# Dynamically generated mappings for all instance types
 	default "cpu_4x_general" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_CPU_4X_GENERAL
 	default "gpu_1x_a10" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GPU_1X_A10

From 76422a22ca27c5afb8c6c0db08efbda43c2a0c7c Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sun, 14 Dec 2025 15:58:48 -0500
Subject: [PATCH 2/4] lambdalabs: Fix error handling in check_availability
 function

Address review feedback regarding inconsistent error handling in the
check_availability function. The function contract implies returning
int values for both success and failure, but error paths were calling
sys.exit() directly.

Change the error handling to return non-zero integers instead of
calling sys.exit(), making the function consistent and easier to test.
Remove the unused instance_data binding from
get_instance_types_with_capacity() since only capacity_map is used.
Add exception handling around the API call so the function never
raises unhandled exceptions.
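
As an illustration, callers can now treat the return value as a plain
exit code; a hypothetical test might do:

    rc = check_availability("gpu_1x_a10", pick_first=True)
    assert rc in (0, 1)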

Generated-by: Claude AI
Signed-off-by: Chuck Lever
---
 scripts/lambdalabs_check_capacity.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/lambdalabs_check_capacity.py b/scripts/lambdalabs_check_capacity.py
index b6c05641e..219f12df4 100755
--- a/scripts/lambdalabs_check_capacity.py
+++ b/scripts/lambdalabs_check_capacity.py
@@ -25,13 +25,17 @@ def check_availability(instance_type=None, json_output=False, pick_first=False):
     if not api_key:
         sys.stderr.write("Error: Lambda Labs API key not found\n")
         sys.stderr.write("Set LAMBDALABS_API_KEY or create ~/.lambdalabs/credentials\n")
-        sys.exit(1)
+        return 1

-    instance_data, capacity_map = get_instance_types_with_capacity(api_key)
+    try:
+        _, capacity_map = get_instance_types_with_capacity(api_key)
+    except Exception as e:
+        sys.stderr.write(f"Error: Failed to fetch instance availability: {e}\n")
+        return 1

     if not capacity_map:
         sys.stderr.write("Error: Could not fetch instance availability\n")
-        sys.exit(1)
+        return 1

     if instance_type:
         # Check specific instance type

From eebb25dcefda7d6ede857eb66d680f406a55cf4e Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Sun, 14 Dec 2025 16:02:01 -0500
Subject: [PATCH 3/4] lambdalabs: Extract region-map building logic into
 helper function

Address review feedback about duplicate code in check_availability().
The logic to build a region_map dictionary from gpu_instances appeared
twice identically, violating the DRY principle.

Extract this common pattern into a private _build_region_map() helper
function that takes gpu_instances and returns the region-to-instance-type
mapping. Both the JSON output and text output code paths now call this
helper instead of duplicating the iteration logic.
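
For example, given hypothetical capacity data:

    gpu_instances = {"gpu_1x_a10": ["us-east-1", "us-west-1"],
                     "gpu_8x_h100_sxm5": ["us-east-1"]}

_build_region_map(gpu_instances) returns:

    {"us-east-1": ["gpu_1x_a10", "gpu_8x_h100_sxm5"],
     "us-west-1": ["gpu_1x_a10"]}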

Generated-by: Claude AI
Signed-off-by: Chuck Lever
---
 scripts/lambdalabs_check_capacity.py | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/scripts/lambdalabs_check_capacity.py b/scripts/lambdalabs_check_capacity.py
index 219f12df4..aa93b47e5 100755
--- a/scripts/lambdalabs_check_capacity.py
+++ b/scripts/lambdalabs_check_capacity.py
@@ -19,6 +19,17 @@
 from lambdalabs_api import get_api_key, get_instance_types_with_capacity


+def _build_region_map(gpu_instances):
+    """Build a mapping from regions to available instance types."""
+    region_map = {}
+    for inst_type, regions in gpu_instances.items():
+        for region in regions:
+            if region not in region_map:
+                region_map[region] = []
+            region_map[region].append(inst_type)
+    return region_map
+
+
 def check_availability(instance_type=None, json_output=False, pick_first=False):
     """Check instance availability across all regions."""
     api_key = get_api_key()
@@ -67,12 +78,7 @@ def check_availability(instance_type=None, json_output=False, pick_first=False):
         if json_output:
             # Format for tier selection script compatibility
             # Group by region for consistency with DataCrunch format
-            region_map = {}
-            for inst_type, regions in gpu_instances.items():
-                for region in regions:
-                    if region not in region_map:
-                        region_map[region] = []
-                    region_map[region].append(inst_type)
+            region_map = _build_region_map(gpu_instances)

             results = [
                 {"location": region, "instances": instances}
@@ -83,12 +89,7 @@ def check_availability(instance_type=None, json_output=False, pick_first=False):
             print("GPU Instance Availability:\n")

             # Group by region
-            region_map = {}
-            for inst_type, regions in gpu_instances.items():
-                for region in regions:
-                    if region not in region_map:
-                        region_map[region] = []
-                    region_map[region].append(inst_type)
+            region_map = _build_region_map(gpu_instances)

             for region in sorted(region_map.keys()):
                 print(f"📍 {region}:")

From dbccf26ddd20d482dda009b4dedf4252d613cab6 Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Tue, 16 Dec 2025 11:17:29 -0500
Subject: [PATCH 4/4] lambdalabs: Validate tier selection script output format

The tier selection script outputs "instance_type region", which is
then parsed by splitting on whitespace and accessing indices [0] and
[1]. If the script produces unexpected output, such as an empty line
or a single word, the split operation produces a list with fewer than
two elements, causing Ansible to fail with a cryptic index error.

Add an explicit validation task using ansible.builtin.assert to verify
the output contains exactly two whitespace-separated values before
attempting to parse it. This provides a clear error message showing
the actual output when the format is invalid, making debugging easier.
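
For example, a valid selection result looks like:

    gpu_1x_h100_sxm5 us-west-1

whereas an empty line or a lone "gpu_1x_h100_sxm5" now trips the
assertion, which reports the offending output instead of an opaque
"list index out of range" from the split()[1] access.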

Generated-by: Claude AI
Signed-off-by: Chuck Lever
---
 playbooks/roles/terraform/tasks/main.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/playbooks/roles/terraform/tasks/main.yml b/playbooks/roles/terraform/tasks/main.yml
index c8605feaa..9561b837b 100644
--- a/playbooks/roles/terraform/tasks/main.yml
+++ b/playbooks/roles/terraform/tasks/main.yml
@@ -138,6 +138,21 @@
   tags:
     - bringup

+- name: Validate Lambda Labs tier selection output format
+  ansible.builtin.assert:
+    that:
+      - lambdalabs_auto_instance_type.stdout.split() | length == 2
+    fail_msg: |
+      Invalid output from tier selection script.
+      Expected format: "instance_type region"
+      Got: "{{ lambdalabs_auto_instance_type.stdout }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
 - name: Parse Lambda Labs auto-selected instance type and region
   ansible.builtin.set_fact:
     lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}"