diff --git a/defconfigs/lambdalabs-8x-b200-or-less b/defconfigs/lambdalabs-8x-b200-or-less
new file mode 100644
index 000000000..5205d2b4f
--- /dev/null
+++ b/defconfigs/lambdalabs-8x-b200-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs 8-GPU with tier-based fallback (B200 maximum tier)
+# Uses 8X_B200_OR_LESS for best available 8-GPU up to B200
+# Fallback order: 8x B200 → 8x H100 → 8x A100-80 → 8x A100 → 8x V100
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-8x-h100-or-less b/defconfigs/lambdalabs-8x-h100-or-less
new file mode 100644
index 000000000..09bb5b6e0
--- /dev/null
+++ b/defconfigs/lambdalabs-8x-h100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs 8-GPU with tier-based fallback (H100 maximum tier)
+# Uses 8X_H100_OR_LESS for best available 8-GPU up to H100
+# Fallback order: 8x H100 → 8x A100-80 → 8x A100 → 8x V100
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-a100-or-less b/defconfigs/lambdalabs-a100-or-less
new file mode 100644
index 000000000..57ddc9a05
--- /dev/null
+++ b/defconfigs/lambdalabs-a100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (A100 maximum tier)
+# Uses A100_OR_LESS for best available single GPU up to A100
+# Fallback order: A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-gh200-or-less b/defconfigs/lambdalabs-gh200-or-less
new file mode 100644
index 000000000..17263d3da
--- /dev/null
+++ b/defconfigs/lambdalabs-gh200-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (GH200 maximum tier)
+# Uses GH200_OR_LESS for best available single GPU up to GH200
+# Fallback order: GH200 → H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
diff --git a/defconfigs/lambdalabs-h100-or-less b/defconfigs/lambdalabs-h100-or-less
new file mode 100644
index 000000000..8af59c837
--- /dev/null
+++ b/defconfigs/lambdalabs-h100-or-less
@@ -0,0 +1,14 @@
+# Lambda Labs GPU with tier-based fallback (H100 maximum tier)
+# Uses H100_OR_LESS for best available single GPU up to H100
+# Fallback order: H100-SXM → H100-PCIe → A100-SXM → A100 → A6000 → RTX6000 → A10
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_LAMBDALABS=y
+CONFIG_TERRAFORM_LAMBDALABS_REGION_SMART_INFER=y
+CONFIG_TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_OVERWRITE=y
+CONFIG_TERRAFORM_SSH_CONFIG_GENKEY_EMPTY_PASSPHRASE=y
+CONFIG_WORKFLOWS=y
+CONFIG_WORKFLOWS_TESTS=y
+CONFIG_WORKFLOWS_LINUX_TESTS=y
+CONFIG_WORKFLOWS_DEDICATED_WORKFLOW=y
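With one of these defconfigs applied, bringup stays the usual two-step kdevops flow. A minimal sketch, assuming the project's standard defconfig-<name> and bringup make targets:

    make defconfig-lambdalabs-h100-or-less
    make bringup

The defconfig records only the wildcard (here H100_OR_LESS); the concrete instance type and region are resolved at bringup time by the terraform role tasks below.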
diff --git a/playbooks/roles/terraform/tasks/main.yml b/playbooks/roles/terraform/tasks/main.yml
index f2aa248fe..9561b837b 100644
--- a/playbooks/roles/terraform/tasks/main.yml
+++ b/playbooks/roles/terraform/tasks/main.yml
@@ -91,11 +91,126 @@
     - destroy
     - status
 
+- name: Auto-select Lambda Labs instance type for tier-based wildcards
+  ansible.builtin.shell:
+    cmd: |
+      case "{{ terraform_lambdalabs_instance_type }}" in
+      GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS|8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS)
+        # Map the wildcard to its tier group (e.g. 8X_H100_OR_LESS → 8x-h100-or-less);
+        # the selection script handles both single-GPU and 8x groups.
+        tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-')
+        {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose
+        ;;
+      *)
+        echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}"
+        exit 1
+        ;;
+      esac
+  register: lambdalabs_auto_instance_type
+  failed_when: false
+  changed_when: false
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+  tags:
+    - bringup
+
+- name: Fail if no Lambda Labs instances available for wildcard selection
+  ansible.builtin.fail:
+    msg: |
+      No GPU instances available for {{ terraform_lambdalabs_instance_type }}
+
+      {{ lambdalabs_auto_instance_type.stderr }}
+
+      Try:
+      - Wait and retry (capacity changes frequently)
+      - Check Lambda Labs dashboard: https://cloud.lambdalabs.com
+      - Use a different tier group via menuconfig
+      - Check capacity manually: scripts/lambdalabs_check_capacity.py
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc != 0
+  tags:
+    - bringup
+
+- name: Validate Lambda Labs tier selection output format
+  ansible.builtin.assert:
+    that:
+      - lambdalabs_auto_instance_type.stdout.split() | length == 2
+    fail_msg: |
+      Invalid output from tier selection script.
+      Expected format: "instance_type region"
+      Got: "{{ lambdalabs_auto_instance_type.stdout }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Parse Lambda Labs auto-selected instance type and region
+  ansible.builtin.set_fact:
+    lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}"
+    lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Report Lambda Labs auto-selected instance type for wildcards
+  ansible.builtin.debug:
+    msg: "Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} in region: {{ lambdalabs_auto_selected_region }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Update Lambda Labs terraform vars with auto-selected instance type
+  ansible.builtin.lineinfile:
+    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
+    regexp: '^lambdalabs_instance_type\s*='
+    line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"'
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Update Lambda Labs terraform vars with auto-selected region
+  ansible.builtin.lineinfile:
+    path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars"
+    regexp: '^lambdalabs_region\s*='
+    line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"'
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
+    - lambdalabs_auto_instance_type.rc == 0
+  tags:
+    - bringup
+
+- name: Set Lambda Labs resolved instance type for subsequent tasks
+  ansible.builtin.set_fact:
+    lambdalabs_resolved_instance_type: "{{ lambdalabs_auto_selected_instance if (terraform_lambdalabs_instance_type in ['GH200_OR_LESS', 'H100_OR_LESS', 'A100_OR_LESS', 'A6000_OR_LESS', '8X_B200_OR_LESS', '8X_H100_OR_LESS', '8X_A100_OR_LESS'] and lambdalabs_auto_instance_type.rc == 0) else terraform_lambdalabs_instance_type }}"
+  when:
+    - kdevops_terraform_provider == "lambdalabs"
+  tags:
+    - bringup
+
 - name: Check Lambda Labs capacity before provisioning (if using Lambda Labs)
   ansible.builtin.shell:
     cmd: |
       {{ topdir_path }}/scripts/lambda-cli --output json check-availability \
-        {{ terraform_lambdalabs_instance_type }} {{ terraform_lambdalabs_region }} | \
+        {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \
         python3 -c "
         import sys, json
         data = json.load(sys.stdin)
@@ -113,6 +228,7 @@
   changed_when: false
   when:
     - kdevops_terraform_provider == "lambdalabs"
+    - terraform_lambdalabs_instance_type not in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"]
   tags:
     - bringup
 
@@ -121,6 +237,8 @@
     msg: "{{ capacity_check.stdout }}"
   when:
     - kdevops_terraform_provider == "lambdalabs"
+    - capacity_check is defined
+    - capacity_check.rc is defined
    - capacity_check.rc != 0
   tags:
     - bringup
diff --git a/scripts/lambdalabs_check_capacity.py b/scripts/lambdalabs_check_capacity.py
new file mode 100755
index 000000000..aa93b47e5
--- /dev/null
+++ b/scripts/lambdalabs_check_capacity.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: copyleft-next-0.3.1
+"""
+Check Lambda Labs instance availability across all regions.
+
+This script queries the Lambda Labs API to find where specific instance types
+are available, helping users avoid provisioning failures.
+"""
+
+import argparse
+import json
+import os
+import sys
+
+# Import our Lambda Labs API module
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, SCRIPT_DIR)
+
+from lambdalabs_api import get_api_key, get_instance_types_with_capacity
+
+
+def _build_region_map(gpu_instances):
+    """Build a mapping from regions to available instance types."""
+    region_map = {}
+    for inst_type, regions in gpu_instances.items():
+        for region in regions:
+            if region not in region_map:
+                region_map[region] = []
+            region_map[region].append(inst_type)
+    return region_map
+
+
+def check_availability(instance_type=None, json_output=False, pick_first=False):
+    """Check instance availability across all regions."""
+    api_key = get_api_key()
+    if not api_key:
+        sys.stderr.write("Error: Lambda Labs API key not found\n")
+        sys.stderr.write("Set LAMBDALABS_API_KEY or create ~/.lambdalabs/credentials\n")
+        return 1
+
+    try:
+        _, capacity_map = get_instance_types_with_capacity(api_key)
+    except Exception as e:
+        sys.stderr.write(f"Error: Failed to fetch instance availability: {e}\n")
+        return 1
+
+    if not capacity_map:
+        sys.stderr.write("Error: Could not fetch instance availability\n")
+        return 1
+
+    if instance_type:
+        # Check specific instance type
+        regions = capacity_map.get(instance_type, [])
+        if pick_first:
+            if regions:
+                print(regions[0])
+                return 0
+            return 1
+
+        if json_output:
+            result = [{"instance_type": instance_type, "regions": regions}]
+            print(json.dumps(result, indent=2))
+        else:
+            if regions:
+                print(f"{instance_type}:")
+                for region in regions:
+                    print(f"  • {region}")
+            else:
+                print(f"{instance_type}: No capacity available")
+        return 0 if regions else 1
+    else:
+        # Show all GPU instances with capacity
+        gpu_instances = {
+            k: v for k, v in capacity_map.items() if k.startswith("gpu_") and v
+        }
+
+        if json_output:
+            # Format for tier selection script compatibility
+            # Group by region for consistency with DataCrunch format
+            region_map = _build_region_map(gpu_instances)
+
+            results = [
+                {"location": region, "instances": instances}
+                for region, instances in sorted(region_map.items())
+            ]
+            print(json.dumps(results, indent=2))
+        else:
+            print("GPU Instance Availability:\n")
+
+            # Group by region
+            region_map = _build_region_map(gpu_instances)
+
+            for region in sorted(region_map.keys()):
+                print(f"📍 {region}:")
+                for inst in sorted(region_map[region]):
+                    print(f"  • {inst}")
+                print()
+
+            if not region_map:
+                print("No GPU instances currently available")
+
+    return 0
+
+
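+# Example invocation (illustrative only; live capacity, and therefore the
+# region printed, changes frequently):
+#
+#   $ scripts/lambdalabs_check_capacity.py -i gpu_1x_h100_sxm5 --pick-first
+#   us-east-1
+#
+# With --pick-first only the first region with capacity is printed and the
+# exit code (0 when found, 1 when not) reports availability, so shell
+# callers can branch on it without parsing output.
+
+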
+def main(): + parser = argparse.ArgumentParser( + description="Check Lambda Labs instance availability" + ) + parser.add_argument( + "--instance-type", + "-i", + help="Check specific instance type (e.g., gpu_1x_h100_sxm5)", + ) + parser.add_argument( + "--json", "-j", action="store_true", help="Output in JSON format" + ) + parser.add_argument( + "--pick-first", + action="store_true", + help="Return first available region (for scripts)", + ) + + args = parser.parse_args() + sys.exit(check_availability(args.instance_type, args.json, args.pick_first)) + + +if __name__ == "__main__": + main() diff --git a/scripts/lambdalabs_select_tier.py b/scripts/lambdalabs_select_tier.py new file mode 100755 index 000000000..305b26aec --- /dev/null +++ b/scripts/lambdalabs_select_tier.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: copyleft-next-0.3.1 +""" +Lambda Labs GPU tier-based instance selection. + +This script implements a tiered fallback system for selecting GPU instances +based on availability. Users can specify a maximum tier (e.g., "h100" or "gh200") +and the script will try to provision the highest tier available, falling back +to lower tiers if necessary. +""" + +import argparse +import json +import os +import sys +from typing import Dict, List, Optional + +# Import our Lambda Labs API module +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, SCRIPT_DIR) + +from lambdalabs_api import get_api_key, get_instance_types_with_capacity + + +# GPU tier definitions for single GPU instances (highest to lowest performance) +GPU_TIERS_1X = { + "gh200": [ + "gpu_1x_gh200", # NVIDIA GH200 Grace Hopper Superchip + ], + "h100-sxm": [ + "gpu_1x_h100_sxm5", # H100 SXM5 (highest bandwidth) + ], + "h100-pcie": [ + "gpu_1x_h100_pcie", # H100 PCIe + ], + "a100-sxm": [ + "gpu_1x_a100_sxm4", # A100 SXM4 + ], + "a100": [ + "gpu_1x_a100", # A100 (PCIe variant) + ], + "a6000": [ + "gpu_1x_a6000", # RTX A6000 + ], + "rtx6000": [ + "gpu_1x_rtx6000", # RTX 6000 + ], + "a10": [ + "gpu_1x_a10", # A10 (budget option) + ], +} + +# Tier ordering for single GPU (highest to lowest) +TIER_ORDER_1X = [ + "gh200", + "h100-sxm", + "h100-pcie", + "a100-sxm", + "a100", + "a6000", + "rtx6000", + "a10", +] + +# GPU tier definitions for 8x GPU instances +GPU_TIERS_8X = { + "b200": [ + "gpu_8x_b200_sxm6", # 8x B200 (Blackwell) + ], + "h100": [ + "gpu_8x_h100_sxm5", # 8x H100 SXM5 + ], + "a100-80": [ + "gpu_8x_a100_80gb_sxm4", # 8x A100 80GB + ], + "a100": [ + "gpu_8x_a100", # 8x A100 + ], + "v100": [ + "gpu_8x_v100", # 8x V100 (legacy) + ], +} + +TIER_ORDER_8X = [ + "b200", + "h100", + "a100-80", + "a100", + "v100", +] + +# Pre-defined tier groups for single GPU +TIER_GROUPS_1X = { + "gh200-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("gh200") :], + "h100-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("h100-sxm") :], + "a100-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("a100-sxm") :], + "a6000-or-less": TIER_ORDER_1X[TIER_ORDER_1X.index("a6000") :], +} + +# Pre-defined tier groups for 8x GPU +TIER_GROUPS_8X = { + "8x-b200-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("b200") :], + "8x-h100-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("h100") :], + "8x-a100-or-less": TIER_ORDER_8X[TIER_ORDER_8X.index("a100-80") :], +} + +# Combined tier groups +TIER_GROUPS = {**TIER_GROUPS_1X, **TIER_GROUPS_8X} + + +def get_capacity_map(api_key: str) -> Dict[str, List[str]]: + """ + Get GPU instance capacity map. 
+
+    Returns:
+        Dictionary mapping instance_type to list of available regions
+        Example: {"gpu_1x_h100_sxm5": ["us-west-1", "us-east-1"]}
+    """
+    _, capacity_map = get_instance_types_with_capacity(api_key)
+    return capacity_map
+
+
+def check_instance_availability(
+    instance_type: str, capacity_map: Dict[str, List[str]]
+) -> Optional[str]:
+    """
+    Check if a specific instance type has available capacity in any region.
+
+    Returns:
+        The region where the instance is available, or None if not available
+    """
+    regions = capacity_map.get(instance_type, [])
+    return regions[0] if regions else None
+
+
+def select_instance_from_tiers(
+    tier_group: str, verbose: bool = False
+) -> Optional[Dict[str, str]]:
+    """
+    Select the highest-tier available instance from a tier group.
+
+    Returns:
+        Dictionary with 'instance_type' and 'region' keys, or None if unavailable
+        Example: {"instance_type": "gpu_1x_h100_sxm5", "region": "us-west-1"}
+    """
+    if tier_group not in TIER_GROUPS:
+        if verbose:
+            print(f"Error: Unknown tier group '{tier_group}'", file=sys.stderr)
+            print(
+                f"Available tier groups: {', '.join(sorted(TIER_GROUPS.keys()))}",
+                file=sys.stderr,
+            )
+        return None
+
+    api_key = get_api_key()
+    if not api_key:
+        if verbose:
+            print("Error: Lambda Labs API key not found", file=sys.stderr)
+        return None
+
+    capacity_map = get_capacity_map(api_key)
+
+    if verbose and capacity_map:
+        gpu_capacity = {
+            k: v for k, v in capacity_map.items() if v and k.startswith("gpu_")
+        }
+        if gpu_capacity:
+            print("Available GPU capacity:", file=sys.stderr)
+            for inst_type, regions in sorted(gpu_capacity.items()):
+                print(f"  {inst_type}: {', '.join(regions)}", file=sys.stderr)
+            print("", file=sys.stderr)
+
+    tiers_to_check = TIER_GROUPS[tier_group]
+
+    # Determine which GPU_TIERS dict to use
+    is_8x = tier_group.startswith("8x-")
+    gpu_tiers = GPU_TIERS_8X if is_8x else GPU_TIERS_1X
+
+    if verbose:
+        print(f"Checking tier group: {tier_group}", file=sys.stderr)
+        print(
+            f"Tiers to check (highest to lowest): {', '.join(tiers_to_check)}",
+            file=sys.stderr,
+        )
+        print("", file=sys.stderr)
+
+    for tier in tiers_to_check:
+        if tier not in gpu_tiers:
+            continue
+
+        instance_types = gpu_tiers[tier]
+
+        if verbose:
+            print(
+                f"Checking tier '{tier}': {', '.join(instance_types)}", file=sys.stderr
+            )
+
+        for instance_type in instance_types:
+            if verbose:
+                print(f"  Checking {instance_type}...", end=" ", file=sys.stderr)
+
+            region = check_instance_availability(instance_type, capacity_map)
+            if region:
+                if verbose:
+                    print(f"✓ AVAILABLE in {region}", file=sys.stderr)
+                    print("", file=sys.stderr)
+                    print(
+                        f"Selected: {instance_type} in {region} (tier: {tier})",
+                        file=sys.stderr,
+                    )
+                return {"instance_type": instance_type, "region": region}
+
+            if verbose:
+                print("✗ not available", file=sys.stderr)
+
+        if verbose:
+            print("", file=sys.stderr)
+
+    if verbose:
+        print(
+            "Error: No instances available in any tier",
+            file=sys.stderr,
+        )
+
+    return None
+
+
+def list_tier_groups():
+    """Print available tier groups and their contents."""
+    print("Available tier groups:\n")
+
+    print("Single GPU (1x) tiers:")
+    for group_name in sorted(TIER_GROUPS_1X.keys()):
+        tiers = TIER_GROUPS_1X[group_name]
+        print(f"  {group_name}:")
+        for tier in tiers:
+            instance_types = GPU_TIERS_1X.get(tier, [])
+            print(f"    - {tier}: {', '.join(instance_types)}")
+    print("")
+
+    print("Multi-GPU (8x) tiers:")
+    for group_name in sorted(TIER_GROUPS_8X.keys()):
+        tiers = TIER_GROUPS_8X[group_name]
+        print(f"  {group_name}:")
+        for tier in tiers:
+            instance_types = GPU_TIERS_8X.get(tier, [])
+            print(f"    - {tier}: {', '.join(instance_types)}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Select Lambda Labs GPU instance using tier-based fallback",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Select best available single GPU up to H100
+  %(prog)s h100-or-less
+
+  # Select best available single GPU up to GH200
+  %(prog)s gh200-or-less
+
+  # Select best available 8x GPU up to H100
+  %(prog)s 8x-h100-or-less
+
+  # List all tier groups
+  %(prog)s --list-tiers
+
+  # Verbose mode to see selection process
+  %(prog)s h100-or-less --verbose
+""",
+    )
+
+    parser.add_argument(
+        "tier_group",
+        nargs="?",
+        help="Tier group to select from (e.g., h100-or-less, gh200-or-less)",
+    )
+
+    parser.add_argument(
+        "--list-tiers",
+        action="store_true",
+        help="List all available tier groups and exit",
+    )
+
+    parser.add_argument(
+        "--verbose",
+        "-v",
+        action="store_true",
+        help="Print detailed selection process",
+    )
+
+    args = parser.parse_args()
+
+    if args.list_tiers:
+        list_tier_groups()
+        return 0
+
+    if not args.tier_group:
+        parser.print_help()
+        return 1
+
+    result = select_instance_from_tiers(args.tier_group, args.verbose)
+
+    if result:
+        # Output format: instance_type region
+        print(f"{result['instance_type']} {result['region']}")
+        return 0
+    else:
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/terraform/lambdalabs/kconfigs/Kconfig.compute b/terraform/lambdalabs/kconfigs/Kconfig.compute
index 579e7207d..f761eafc0 100644
--- a/terraform/lambdalabs/kconfigs/Kconfig.compute
+++ b/terraform/lambdalabs/kconfigs/Kconfig.compute
@@ -10,12 +10,80 @@ endif # TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST
 
 # Include dynamically generated instance types when not using smart cheapest selection
 if !TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST
 source "terraform/lambdalabs/kconfigs/Kconfig.compute.generated"
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS
+	bool "GH200_OR_LESS - Best available up to GH200 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: GH200 → H100-SXM → H100-PCIe → A100-SXM →
+	  A100 → A6000 → RTX6000 → A10.
+
+	  Use this for maximum single-GPU performance with automatic fallback
+	  when top-tier options are unavailable.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS
+	bool "H100_OR_LESS - Best available up to H100 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: H100-SXM → H100-PCIe → A100-SXM → A100 →
+	  A6000 → RTX6000 → A10.
+
+	  Use when you want the best available single GPU up to H100.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS
+	bool "A100_OR_LESS - Best available up to A100 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: A100-SXM → A100 → A6000 → RTX6000 → A10.
+
+	  Use for cost-effective GPU provisioning with A100 as the maximum tier.
+
+config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A6000_OR_LESS
+	bool "A6000_OR_LESS - Best available up to A6000 (tier-based)"
+	help
+	  Tier-based selection: provision the highest-tier single GPU available.
+	  Tries tiers in order: A6000 → RTX6000 → A10.
+
+	  Budget-friendly option with A6000 as maximum tier.
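+
+# Note: unlike the concrete instance types below, the *_OR_LESS symbols do
+# not name real Lambda Labs instance types. They are resolved at bringup
+# time: the terraform role maps the symbol to a tier group (for example
+# H100_OR_LESS → h100-or-less) and scripts/lambdalabs_select_tier.py picks
+# the best available "instance_type region" pair, which is then written to
+# terraform.tfvars.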
+ +config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS + bool "8X_B200_OR_LESS - Best available 8-GPU up to B200 (tier-based)" + help + Tier-based selection for 8-GPU instances. + Tries tiers in order: 8x B200 → 8x H100 → 8x A100-80 → 8x A100 → 8x V100. + + Use for maximum multi-GPU performance with automatic fallback. + +config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS + bool "8X_H100_OR_LESS - Best available 8-GPU up to H100 (tier-based)" + help + Tier-based selection for 8-GPU instances. + Tries tiers in order: 8x H100 → 8x A100-80 → 8x A100 → 8x V100. + + Use when you need 8 GPUs with H100 as the maximum tier. + +config TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_A100_OR_LESS + bool "8X_A100_OR_LESS - Best available 8-GPU up to A100 (tier-based)" + help + Tier-based selection for 8-GPU instances. + Tries tiers in order: 8x A100-80 → 8x A100 → 8x V100. + + Cost-effective 8-GPU option with A100 as maximum tier. + endif config TERRAFORM_LAMBDALABS_INSTANCE_TYPE string output yaml default $(shell, python3 scripts/lambdalabs_smart_inference.py instance) if TERRAFORM_LAMBDALABS_REGION_SMART_CHEAPEST + # Tier-based instance type mappings + default "GH200_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GH200_OR_LESS + default "H100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_H100_OR_LESS + default "A100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A100_OR_LESS + default "A6000_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_A6000_OR_LESS + default "8X_B200_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_B200_OR_LESS + default "8X_H100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_H100_OR_LESS + default "8X_A100_OR_LESS" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_8X_A100_OR_LESS # Dynamically generated mappings for all instance types default "cpu_4x_general" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_CPU_4X_GENERAL default "gpu_1x_a10" if TERRAFORM_LAMBDALABS_INSTANCE_TYPE_GPU_1X_A10