From 6b14c47eca1514efbdaaac779cd84f81dd0eac88 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Dec 2025 10:35:25 -0500 Subject: [PATCH 1/2] terraform: refactor tasks into provider-specific modules The terraform role's main.yml had grown to nearly 800 lines with Lambda Labs and DataCrunch provider-specific logic interleaved throughout the file. This made maintenance difficult and required understanding the entire file to make changes for a single provider. This refactoring creates a plugin-like architecture where each cloud provider's unique requirements are isolated in dedicated task files. The new structure separates concerns by operation (bringup, destroy) and by provider (lambdalabs, datacrunch, generic), making it straightforward to add support for new providers or modify existing ones without risk of breaking other providers. Common functionality shared across all providers, such as SSH configuration and status reporting, is now cleanly separated in a common directory. This follows patterns already established in other roles like bootlinux and guestfs. 
Generated-by: Claude AI Signed-off-by: Chuck Lever --- .../terraform/tasks/bringup/datacrunch.yml | 441 ++++++++++ .../roles/terraform/tasks/bringup/generic.yml | 12 + .../terraform/tasks/bringup/lambdalabs.yml | 222 +++++ .../roles/terraform/tasks/bringup/main.yml | 27 + .../terraform/tasks/common/ssh-config.yml | 37 + .../roles/terraform/tasks/common/status.yml | 35 + .../terraform/tasks/destroy/datacrunch.yml | 67 ++ .../roles/terraform/tasks/destroy/generic.yml | 12 + .../roles/terraform/tasks/destroy/main.yml | 26 + playbooks/roles/terraform/tasks/main.yml | 779 +----------------- 10 files changed, 893 insertions(+), 765 deletions(-) create mode 100644 playbooks/roles/terraform/tasks/bringup/datacrunch.yml create mode 100644 playbooks/roles/terraform/tasks/bringup/generic.yml create mode 100644 playbooks/roles/terraform/tasks/bringup/lambdalabs.yml create mode 100644 playbooks/roles/terraform/tasks/bringup/main.yml create mode 100644 playbooks/roles/terraform/tasks/common/ssh-config.yml create mode 100644 playbooks/roles/terraform/tasks/common/status.yml create mode 100644 playbooks/roles/terraform/tasks/destroy/datacrunch.yml create mode 100644 playbooks/roles/terraform/tasks/destroy/generic.yml create mode 100644 playbooks/roles/terraform/tasks/destroy/main.yml diff --git a/playbooks/roles/terraform/tasks/bringup/datacrunch.yml b/playbooks/roles/terraform/tasks/bringup/datacrunch.yml new file mode 100644 index 00000000..7e0ca15c --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/datacrunch.yml @@ -0,0 +1,441 @@ +--- +# DataCrunch provider bringup tasks +# Provider installation, tier-based instance selection, capacity checking, terraform apply + +- name: Set DataCrunch provider architecture + ansible.builtin.set_fact: + datacrunch_provider_arch: "{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" + tags: + - bringup + - destroy + - status + +- name: Check if DataCrunch terraform provider is already installed + 
ansible.builtin.stat: + path: >- + ~/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/0.0.3/{{ + ansible_system | lower }}_{{ datacrunch_provider_arch }}/terraform-provider-datacrunch_v0.0.3 + register: datacrunch_provider_installed + tags: + - bringup + - destroy + - status + +- name: Download and install DataCrunch terraform provider from GitHub releases + ansible.builtin.shell: + cmd: | + PROVIDER_VERSION="0.0.3" + PROVIDER_OS="{{ ansible_system | lower }}" + PROVIDER_ARCH="{{ datacrunch_provider_arch }}" + PROVIDER_DIR="$HOME/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch" + PROVIDER_DIR="${PROVIDER_DIR}/${PROVIDER_VERSION}/${PROVIDER_OS}_${PROVIDER_ARCH}" + PROVIDER_FILE="terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}" + GITHUB_URL="https://github.com/linux-kdevops/terraform-provider-datacrunch/releases" + + mkdir -p "${PROVIDER_DIR}" + cd "${PROVIDER_DIR}" + + # Download the provider binary + wget -q "${GITHUB_URL}/download/v${PROVIDER_VERSION}/${PROVIDER_FILE}.zip" + + # Extract the binary + unzip -o "${PROVIDER_FILE}.zip" + + # Clean up zip file + rm "${PROVIDER_FILE}.zip" + + # Make it executable + chmod +x terraform-provider-datacrunch_v${PROVIDER_VERSION} + + echo "DataCrunch provider v${PROVIDER_VERSION} installed to ${PROVIDER_DIR}" + changed_when: true + when: + - not datacrunch_provider_installed.stat.exists + tags: + - bringup + - destroy + - status + +- name: Auto-select instance type for tier-based wildcards + ansible.builtin.shell: + cmd: | + set -o pipefail + case "{{ terraform_datacrunch_instance_type }}" in + B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) + # Use tier-based selection script + tier_group=$(echo "{{ terraform_datacrunch_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/datacrunch_select_tier.py "$tier_group" --verbose + ;; + ANY_1H100) + # Legacy H100 variant selection - check all regions 
+ for variant in 1H100.80S.30V 1H100.80S.32V; do + result=$({{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type "$variant" --json 2>/dev/null || echo "[]") + location=$(echo "$result" | python3 -c " + import sys, json + data=json.load(sys.stdin) + print(data[0]['location']) if data and len(data) > 0 else '' + " 2>/dev/null) + if [ -n "$location" ]; then + echo "$variant $location" + exit 0 + fi + done + echo "No single H100 variants available" >&2 + exit 1 + ;; + *) + echo "Unknown wildcard type: {{ terraform_datacrunch_instance_type }}" + exit 1 + ;; + esac + register: datacrunch_auto_instance_type + failed_when: false + changed_when: false + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + tags: + - bringup + +- name: Fail if no instances available for wildcard selection + ansible.builtin.fail: + msg: | + No GPU instances available for {{ terraform_datacrunch_instance_type }} + + {{ datacrunch_auto_instance_type.stderr }} + + Try: + - Wait and retry (capacity changes frequently) + - Check DataCrunch dashboard: https://cloud.datacrunch.io + - Use a different tier group via menuconfig + - Check capacity manually: scripts/datacrunch_check_capacity.py --instance-type + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc != 0 + tags: + - bringup + +- name: Parse auto-selected instance type and location + ansible.builtin.set_fact: + auto_selected_instance: "{{ datacrunch_auto_instance_type.stdout.split()[0] }}" + auto_selected_location: "{{ datacrunch_auto_instance_type.stdout.split()[1] }}" + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: 
Report auto-selected instance type for wildcards + ansible.builtin.debug: + msg: >- + Auto-selected instance type: {{ auto_selected_instance }} + in region: {{ auto_selected_location }} + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update terraform vars with auto-selected instance type + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^datacrunch_instance_type\s*=' + line: 'datacrunch_instance_type = "{{ auto_selected_instance }}"' + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Define DataCrunch tier wildcards list + ansible.builtin.set_fact: + datacrunch_tier_wildcards: + - B300_OR_LESS + - B200_OR_LESS + - H100_OR_LESS + - A100_80_OR_LESS + - A100_40_OR_LESS + - ANY_1H100 + tags: + - bringup + +- name: Set resolved instance type for subsequent tasks + ansible.builtin.set_fact: + resolved_instance_type: >- + {{ auto_selected_instance + if (terraform_datacrunch_instance_type in datacrunch_tier_wildcards + and datacrunch_auto_instance_type.rc == 0) + else terraform_datacrunch_instance_type }} + tags: + - bringup + +- name: Check DataCrunch capacity before provisioning + ansible.builtin.shell: + cmd: | + set -o pipefail + {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type {{ resolved_instance_type }} \ + --json | \ + python3 -c " + import sys, json + data = json.load(sys.stdin) + if data and len(data) > 0: + locations = [item['location'] for item in data] + print(f'Instance {{ resolved_instance_type }} is available in: ' + ', '.join(locations)) + sys.exit(0) + else: + print('Error: Instance {{ resolved_instance_type 
}} is not available in any location') + print('Check available instances with: scripts/datacrunch_check_capacity.py') + sys.exit(1) + " + register: datacrunch_capacity_check + failed_when: false + changed_when: false + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + tags: + - bringup + +- name: Report DataCrunch capacity check result + ansible.builtin.fail: + msg: "{{ datacrunch_capacity_check.stdout }}" + when: + - datacrunch_capacity_check is defined + - datacrunch_capacity_check.rc is defined + - datacrunch_capacity_check.rc != 0 + tags: + - bringup + +- name: Auto-select DataCrunch location for explicit instance types + ansible.builtin.shell: + cmd: | + {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type {{ resolved_instance_type }} \ + --pick-first + register: datacrunch_auto_location + changed_when: false + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + tags: + - bringup + +- name: Use tier-selected location for wildcard instance types + ansible.builtin.set_fact: + datacrunch_final_location: "{{ auto_selected_location }}" + when: + - terraform_datacrunch_instance_type in datacrunch_tier_wildcards + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Use auto-selected location for explicit instance types + ansible.builtin.set_fact: + datacrunch_final_location: "{{ datacrunch_auto_location.stdout }}" + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + - datacrunch_auto_location.rc == 0 + tags: + - bringup + +- name: Update terraform vars with final location + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^datacrunch_location\s*=' + line: 'datacrunch_location = "{{ datacrunch_final_location }}"' + when: + - datacrunch_final_location is defined + tags: + - bringup + +- name: Display final location + ansible.builtin.debug: + msg: "Selected DataCrunch 
location: {{ datacrunch_final_location }}" + when: + - datacrunch_final_location is defined + tags: + - bringup + +- name: Check if terraform state already has resources + ansible.builtin.command: + cmd: terraform state list + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_state_check + failed_when: false + changed_when: false + tags: + - bringup + +- name: Set flag for existing terraform resources + ansible.builtin.set_fact: + terraform_resources_exist: "{{ terraform_state_check.stdout_lines | default([]) | length > 0 }}" + tags: + - bringup + +- name: Report that infrastructure is already provisioned + ansible.builtin.debug: + msg: "Infrastructure already provisioned ({{ terraform_state_check.stdout_lines | default([]) | length }} resources in state). Skipping terraform apply." + when: + - terraform_resources_exist | default(false) + tags: + - bringup + +- name: Initialize external provider for DataCrunch (workaround for dev_overrides) + ansible.builtin.shell: + cmd: | + # Hide all terraform files that reference datacrunch resources + # so that terraform init only sees the external provider requirement + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file" ]; then + mv "$file" "$file.bak" + fi + done + + # Create minimal terraform config with only external provider + cat > provider_init.tf << 'EOF' + terraform { + required_version = ">= 1.0" + required_providers { + external = { + source = "hashicorp/external" + version = "~> 2.3" + } + } + } + EOF + + # Initialize to get external provider and generate lock file + # Suppress color output to avoid ANSI codes in logs + terraform init -no-color > /dev/null 2>&1 || terraform init -no-color + + # Preserve the generated lock file for the external provider + # This is needed because dev_overrides prevents normal init of datacrunch provider + # but we still need the lock file for other providers + if [ -f .terraform.lock.hcl ]; then + cp 
.terraform.lock.hcl .terraform.lock.hcl.generated + fi + + # Clean up temporary file and restore original terraform files + rm provider_init.tf + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file.bak" ]; then + mv "$file.bak" "$file" + fi + done + + # Restore the lock file after putting all files back + if [ -f .terraform.lock.hcl.generated ]; then + mv .terraform.lock.hcl.generated .terraform.lock.hcl + fi + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + when: + - not (terraform_resources_exist | default(false)) + changed_when: false + tags: + - bringup + +- name: Bring up terraform resources (DataCrunch with tier fallback on failure) + ansible.builtin.shell: + cmd: | + set -o pipefail + MAX_RETRIES=5 + EXCLUDED_INSTANCES="" + TIER_GROUP="{{ terraform_datacrunch_instance_type | lower | replace('_', '-') }}" + + # Check if using tier-based selection + is_tier_based() { + case "{{ terraform_datacrunch_instance_type }}" in + B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) + return 0 + ;; + *) + return 1 + ;; + esac + } + + for attempt in $(seq 1 $MAX_RETRIES); do + echo "=== Attempt $attempt of $MAX_RETRIES ===" + + # Get current instance type from tfvars + CURRENT_INSTANCE=$(grep '^datacrunch_instance_type' terraform.tfvars | \ + sed 's/.*= *"\([^"]*\)".*/\1/') + echo "Trying instance type: $CURRENT_INSTANCE" + + # Attempt terraform apply and capture output + APPLY_OUTPUT=$(terraform apply -auto-approve -no-color 2>&1) && { + echo "$APPLY_OUTPUT" + echo "Terraform apply succeeded!" + exit 0 + } + + # Apply failed - check what kind of error + echo "$APPLY_OUTPUT" + + # Check if this is a 503 or deployment error that we can retry with fallback + if echo "$APPLY_OUTPUT" | grep -q "API returned status 503\|Error deploying instance"; then + echo "" + echo "Deployment failed for $CURRENT_INSTANCE - checking if tier fallback is available..." + + if ! 
is_tier_based; then + echo "Not using tier-based selection, cannot fall back to different instance type." + exit 1 + fi + + # Add current instance to exclusion list + if [ -n "$EXCLUDED_INSTANCES" ]; then + EXCLUDED_INSTANCES="$EXCLUDED_INSTANCES --exclude $CURRENT_INSTANCE" + else + EXCLUDED_INSTANCES="--exclude $CURRENT_INSTANCE" + fi + + echo "Excluded instances so far: $EXCLUDED_INSTANCES" + + # Try to select next available instance + echo "Selecting next available instance from tier group: $TIER_GROUP" + NEXT_SELECTION=$({{ topdir_path }}/scripts/datacrunch_select_tier.py \ + "$TIER_GROUP" --verbose $EXCLUDED_INSTANCES 2>&1) || { + echo "No more instances available in tier group $TIER_GROUP" + echo "$NEXT_SELECTION" + exit 1 + } + + # Parse new instance and location + NEW_INSTANCE=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $1}') + NEW_LOCATION=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $2}') + + if [ -z "$NEW_INSTANCE" ] || [ -z "$NEW_LOCATION" ]; then + echo "Failed to parse new instance selection" + exit 1 + fi + + echo "" + echo "Falling back to: $NEW_INSTANCE in $NEW_LOCATION" + + # Update terraform.tfvars with new instance type and location + sed -i "s/^datacrunch_instance_type.*/datacrunch_instance_type = \"$NEW_INSTANCE\"/" \ + terraform.tfvars + sed -i "s/^datacrunch_location.*/datacrunch_location = \"$NEW_LOCATION\"/" \ + terraform.tfvars + + echo "Updated terraform.tfvars, retrying..." 
+ echo "" + else + echo "Terraform failed with non-recoverable error" + exit 1 + fi + done + + echo "Exhausted all $MAX_RETRIES retry attempts" + exit 1 + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: true + when: + - not (terraform_resources_exist | default(false)) + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/generic.yml b/playbooks/roles/terraform/tasks/bringup/generic.yml new file mode 100644 index 00000000..2e5a9290 --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/generic.yml @@ -0,0 +1,12 @@ +--- +# Generic terraform bringup for providers without special handling +# Used for: AWS, Azure, GCE, OCI, and other standard providers + +- name: Bring up terraform resources + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + state: present + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml b/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml new file mode 100644 index 00000000..8aa9048a --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml @@ -0,0 +1,222 @@ +--- +# Lambda Labs provider bringup tasks +# API key validation, tier-based instance selection, capacity checking + +- name: Define Lambda Labs tier wildcards list + ansible.builtin.set_fact: + lambdalabs_tier_wildcards: + - GH200_OR_LESS + - H100_OR_LESS + - A100_OR_LESS + - A6000_OR_LESS + - 8X_B200_OR_LESS + - 8X_H100_OR_LESS + - 8X_A100_OR_LESS + tags: + - bringup + +- name: Check Lambda Labs API key configuration + ansible.builtin.command: + cmd: "python3 {{ topdir_path }}/scripts/lambdalabs_credentials.py check" + register: api_key_check + failed_when: false + changed_when: false + tags: + - bringup + - destroy + - status + +- name: Report Lambda Labs API key configuration status + ansible.builtin.fail: + msg: | + ERROR: Lambda Labs API key is not 
configured! + + To fix this, configure your Lambda Labs API key using one of these methods: + + Use the kdevops credentials management tool: + python3 scripts/lambdalabs_credentials.py set 'your-actual-api-key-here' + + Or manually create the credentials file: + mkdir -p ~/.lambdalabs + echo "[default]" > ~/.lambdalabs/credentials + echo "lambdalabs_api_key=your-actual-api-key-here" >> ~/.lambdalabs/credentials + chmod 600 ~/.lambdalabs/credentials + + Get your API key from: https://cloud.lambdalabs.com + when: + - api_key_check.rc != 0 + tags: + - bringup + - destroy + - status + +- name: Display Lambda Labs API key configuration status + ansible.builtin.debug: + msg: "{{ api_key_check.stdout }}" + when: + - api_key_check.rc == 0 + tags: + - bringup + - destroy + - status + +- name: Auto-select Lambda Labs instance type for tier-based wildcards + ansible.builtin.shell: + cmd: | + set -o pipefail + case "{{ terraform_lambdalabs_instance_type }}" in + GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS) + # Use tier-based selection script for single GPU + tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose + ;; + 8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS) + # Use tier-based selection script for 8x GPU + tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose + ;; + *) + echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}" + exit 1 + ;; + esac + register: lambdalabs_auto_instance_type + failed_when: false + changed_when: false + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + tags: + - bringup + +- name: Fail if no Lambda Labs instances available for wildcard selection + ansible.builtin.fail: + msg: | + No GPU instances available for {{ 
terraform_lambdalabs_instance_type }} + + {{ lambdalabs_auto_instance_type.stderr }} + + Try: + - Wait and retry (capacity changes frequently) + - Check Lambda Labs dashboard: https://cloud.lambdalabs.com + - Use a different tier group via menuconfig + - Check capacity manually: scripts/lambdalabs_check_capacity.py + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc != 0 + tags: + - bringup + +- name: Validate Lambda Labs tier selection output format + ansible.builtin.assert: + that: + - lambdalabs_auto_instance_type.stdout.split() | length == 2 + fail_msg: | + Invalid output from tier selection script. + Expected format: "instance_type region" + Got: "{{ lambdalabs_auto_instance_type.stdout }}" + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Parse Lambda Labs auto-selected instance type and region + ansible.builtin.set_fact: + lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}" + lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}" + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Report Lambda Labs auto-selected instance type for wildcards + ansible.builtin.debug: + msg: >- + Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} + in region: {{ lambdalabs_auto_selected_region }} + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update Lambda Labs terraform vars with auto-selected instance type + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^lambdalabs_instance_type\s*=' + line: 'lambdalabs_instance_type = "{{ 
lambdalabs_auto_selected_instance }}"' + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update Lambda Labs terraform vars with auto-selected region + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^lambdalabs_region\s*=' + line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"' + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Set Lambda Labs resolved instance type for subsequent tasks + ansible.builtin.set_fact: + lambdalabs_resolved_instance_type: >- + {{ lambdalabs_auto_selected_instance + if (terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + and lambdalabs_auto_instance_type.rc == 0) + else terraform_lambdalabs_instance_type }} + tags: + - bringup + +- name: Check Lambda Labs capacity before provisioning + ansible.builtin.shell: + cmd: | + set -o pipefail + {{ topdir_path }}/scripts/lambda-cli --output json check-availability \ + {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} \ + {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \ + python3 -c " + import sys, json + data = json.load(sys.stdin) + if data.get('available'): + print(data.get('message', 'Instance available')) + sys.exit(0) + else: + print(data.get('error', 'Instance not available')) + if 'available_regions' in data: + print(f' Available in: ' + ', '.join(data['available_regions'])) + sys.exit(1) + " + register: capacity_check + failed_when: false + changed_when: false + when: + - terraform_lambdalabs_instance_type not in lambdalabs_tier_wildcards + tags: + - bringup + +- name: Report Lambda Labs capacity check result + ansible.builtin.fail: + msg: "{{ capacity_check.stdout }}" + when: + - capacity_check is defined + - 
capacity_check.rc is defined + - capacity_check.rc != 0 + tags: + - bringup + +- name: Bring up terraform resources (Lambda Labs) + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + state: present + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/main.yml b/playbooks/roles/terraform/tasks/bringup/main.yml new file mode 100644 index 00000000..5723740a --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/main.yml @@ -0,0 +1,27 @@ +--- +# Terraform bringup orchestration +# Includes provider-specific tasks based on kdevops_terraform_provider + +- name: Include Lambda Labs bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/lambdalabs.yml" + when: + - kdevops_terraform_provider == "lambdalabs" + tags: + - bringup + +- name: Include DataCrunch bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/datacrunch.yml" + when: + - kdevops_terraform_provider == "datacrunch" + tags: + - bringup + +- name: Include generic bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/generic.yml" + when: + - kdevops_terraform_provider not in ["lambdalabs", "datacrunch"] + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/common/ssh-config.yml b/playbooks/roles/terraform/tasks/common/ssh-config.yml new file mode 100644 index 00000000..c1c4d0a9 --- /dev/null +++ b/playbooks/roles/terraform/tasks/common/ssh-config.yml @@ -0,0 +1,37 @@ +--- +# SSH configuration generation for terraform-provisioned nodes +# This is common to all terraform providers + +- name: Retrieve the controller_ip_map from terraform + cloud.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + format: json + name: controller_ip_map + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_output + 
tags: + - ssh + +- name: Add each target node's ssh Host entry on the control host + ansible.builtin.blockinfile: + block: "{{ lookup('template', 'ssh_config.j2') }}" + create: true + dest: "{{ kdevops_ssh_config }}" + insertafter: "EOF" + marker: "# {mark} host configuration for {{ item.key }}" + mode: "u=rw,g=r,o=r" + loop: "{{ terraform_output.value | dict2items }}" + tags: + - ssh + +- name: Ensure the Include directive is present on the controller + ansible.builtin.blockinfile: + path: "{{ sshconfig }}" + insertbefore: BOF + append_newline: true + create: true + marker: "# {mark} Managed by kdevops" + mode: "u=rw,g=r,o=r" + block: "Include {{ kdevops_ssh_config_prefix }}*" + tags: + - ssh diff --git a/playbooks/roles/terraform/tasks/common/status.yml b/playbooks/roles/terraform/tasks/common/status.yml new file mode 100644 index 00000000..cfa68670 --- /dev/null +++ b/playbooks/roles/terraform/tasks/common/status.yml @@ -0,0 +1,35 @@ +--- +# Terraform status reporting +# Common to all terraform providers + +- name: Report terraform status + tags: + - status + block: + - name: Retrieve the controller_ip_map from terraform + cloud.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + format: json + name: controller_ip_map + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_output + + - name: End play -- terraform state file is empty or missing + ansible.builtin.meta: end_play + when: + - terraform_output.warnings is defined + + - name: Count active resources + ansible.builtin.command: + cmd: "terraform state list" + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_state + changed_when: false + + - name: Show status + ansible.builtin.debug: + msg: "Active resources: {{ terraform_state.stdout_lines | length }}" + + - name: Show controller IP map + ansible.builtin.debug: + var: terraform_output.value diff --git 
a/playbooks/roles/terraform/tasks/destroy/datacrunch.yml b/playbooks/roles/terraform/tasks/destroy/datacrunch.yml new file mode 100644 index 00000000..c2bc1342 --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/datacrunch.yml @@ -0,0 +1,67 @@ +--- +# DataCrunch provider destroy tasks +# Handles dev_overrides workaround for external provider initialization + +- name: Initialize external provider for DataCrunch before destroy + ansible.builtin.shell: + cmd: | + # Hide all terraform files that reference datacrunch resources + # so that terraform init only sees the external provider requirement + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file" ]; then + mv "$file" "$file.bak" + fi + done + + # Create minimal terraform config with only external provider + cat > provider_init.tf << 'EOF' + terraform { + required_version = ">= 1.0" + required_providers { + external = { + source = "hashicorp/external" + version = "~> 2.3" + } + } + } + EOF + + # Initialize to get external provider and generate lock file + terraform init + + # Preserve the generated lock file for the external provider + if [ -f .terraform.lock.hcl ]; then + cp .terraform.lock.hcl .terraform.lock.hcl.generated + fi + + # Clean up temporary file and restore original terraform files + rm provider_init.tf + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file.bak" ]; then + mv "$file.bak" "$file" + fi + done + + # Restore the lock file after putting all files back + if [ -f .terraform.lock.hcl.generated ]; then + mv .terraform.lock.hcl.generated .terraform.lock.hcl + fi + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: false + tags: + - destroy + +- name: Destroy terraform resources (DataCrunch with dev overrides) + ansible.builtin.command: + cmd: terraform destroy -auto-approve -no-color + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: true + tags: + - destroy + +- 
name: Remove terraform lock file for DataCrunch after destroy + ansible.builtin.file: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/.terraform.lock.hcl" + state: absent + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/destroy/generic.yml b/playbooks/roles/terraform/tasks/destroy/generic.yml new file mode 100644 index 00000000..7df6390e --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/generic.yml @@ -0,0 +1,12 @@ +--- +# Generic terraform destroy for providers without special handling +# Used for: AWS, Azure, GCE, OCI, Lambda Labs, and other standard providers + +- name: Destroy terraform resources + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + state: absent + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/destroy/main.yml b/playbooks/roles/terraform/tasks/destroy/main.yml new file mode 100644 index 00000000..f882ceab --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/main.yml @@ -0,0 +1,26 @@ +--- +# Terraform destroy orchestration +# Includes provider-specific tasks based on kdevops_terraform_provider + +- name: Remove the ephemeral ssh config file on the control host + ansible.builtin.file: + path: "{{ kdevops_ssh_config }}" + state: absent + tags: + - destroy + +- name: Include DataCrunch destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/datacrunch.yml" + when: + - kdevops_terraform_provider == "datacrunch" + tags: + - destroy + +- name: Include generic destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/generic.yml" + when: + - kdevops_terraform_provider != "datacrunch" + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/main.yml b/playbooks/roles/terraform/tasks/main.yml index 9561b837..f57652f1 100644 --- a/playbooks/roles/terraform/tasks/main.yml +++ 
b/playbooks/roles/terraform/tasks/main.yml @@ -1,778 +1,27 @@ --- -- name: Check Lambda Labs API key configuration (if using Lambda Labs) - ansible.builtin.command: - cmd: "python3 {{ topdir_path }}/scripts/lambdalabs_credentials.py check" - register: api_key_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - tags: - - bringup - - destroy - - status - -- name: Report Lambda Labs API key configuration status - ansible.builtin.fail: - msg: | - ERROR: Lambda Labs API key is not configured! - - To fix this, configure your Lambda Labs API key using one of these methods: - - Use the kdevops credentials management tool: - python3 scripts/lambdalabs_credentials.py set 'your-actual-api-key-here' - - Or manually create the credentials file: - mkdir -p ~/.lambdalabs - echo "[default]" > ~/.lambdalabs/credentials - echo "lambdalabs_api_key=your-actual-api-key-here" >> ~/.lambdalabs/credentials - chmod 600 ~/.lambdalabs/credentials - - Get your API key from: https://cloud.lambdalabs.com - when: - - kdevops_terraform_provider == "lambdalabs" - - api_key_check.rc != 0 - tags: - - bringup - - destroy - - status - -- name: Display Lambda Labs API key configuration status - ansible.builtin.debug: - msg: "{{ api_key_check.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - api_key_check.rc == 0 - tags: - - bringup - - destroy - - status - -- name: Check if DataCrunch terraform provider is already installed - ansible.builtin.stat: - path: "~/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/0.0.3/{{ ansible_system | lower }}_{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}/terraform-provider-datacrunch_v0.0.3" - register: datacrunch_provider_installed - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - - destroy - - status - -- name: Download and install DataCrunch terraform provider from GitHub releases - ansible.builtin.shell: - cmd: | - 
PROVIDER_VERSION="0.0.3" - PROVIDER_OS="{{ ansible_system | lower }}" - PROVIDER_ARCH="{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" - PROVIDER_DIR="$HOME/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/${PROVIDER_VERSION}/${PROVIDER_OS}_${PROVIDER_ARCH}" - - mkdir -p "${PROVIDER_DIR}" - cd "${PROVIDER_DIR}" - - # Download the provider binary - wget -q "https://github.com/linux-kdevops/terraform-provider-datacrunch/releases/download/v${PROVIDER_VERSION}/terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Extract the binary - unzip -o "terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Clean up zip file - rm "terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Make it executable - chmod +x terraform-provider-datacrunch_v${PROVIDER_VERSION} - - echo "DataCrunch provider v${PROVIDER_VERSION} installed to ${PROVIDER_DIR}" - when: - - kdevops_terraform_provider == "datacrunch" - - not datacrunch_provider_installed.stat.exists - tags: - - bringup - - destroy - - status - -- name: Auto-select Lambda Labs instance type for tier-based wildcards - ansible.builtin.shell: - cmd: | - case "{{ terraform_lambdalabs_instance_type }}" in - GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS) - # Use tier-based selection script for single GPU - tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose - ;; - 8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS) - # Use tier-based selection script for 8x GPU - tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose - ;; - *) - echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}" - exit 1 - ;; - esac - 
register: lambdalabs_auto_instance_type - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - tags: - - bringup - -- name: Fail if no Lambda Labs instances available for wildcard selection - ansible.builtin.fail: - msg: | - No GPU instances available for {{ terraform_lambdalabs_instance_type }} - - {{ lambdalabs_auto_instance_type.stderr }} - - Try: - - Wait and retry (capacity changes frequently) - - Check Lambda Labs dashboard: https://cloud.lambdalabs.com - - Use a different tier group via menuconfig - - Check capacity manually: scripts/lambdalabs_check_capacity.py - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc != 0 - tags: - - bringup - -- name: Validate Lambda Labs tier selection output format - ansible.builtin.assert: - that: - - lambdalabs_auto_instance_type.stdout.split() | length == 2 - fail_msg: | - Invalid output from tier selection script. 
- Expected format: "instance_type region" - Got: "{{ lambdalabs_auto_instance_type.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Parse Lambda Labs auto-selected instance type and region - set_fact: - lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}" - lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Report Lambda Labs auto-selected instance type for wildcards - ansible.builtin.debug: - msg: "Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} in region: {{ lambdalabs_auto_selected_region }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update Lambda Labs terraform vars with auto-selected instance type - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^lambdalabs_instance_type\s*=' - line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"' - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - 
lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update Lambda Labs terraform vars with auto-selected region - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^lambdalabs_region\s*=' - line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"' - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Set Lambda Labs resolved instance type for subsequent tasks - set_fact: - lambdalabs_resolved_instance_type: "{{ lambdalabs_auto_selected_instance if (terraform_lambdalabs_instance_type in ['GH200_OR_LESS', 'H100_OR_LESS', 'A100_OR_LESS', 'A6000_OR_LESS', '8X_B200_OR_LESS', '8X_H100_OR_LESS', '8X_A100_OR_LESS'] and lambdalabs_auto_instance_type.rc == 0) else terraform_lambdalabs_instance_type }}" - when: - - kdevops_terraform_provider == "lambdalabs" - tags: - - bringup - -- name: Check Lambda Labs capacity before provisioning (if using Lambda Labs) - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/lambda-cli --output json check-availability \ - {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \ - python3 -c " - import sys, json - data = json.load(sys.stdin) - if data.get('available'): - print(data.get('message', 'Instance available')) - sys.exit(0) - else: - print(data.get('error', 'Instance not available')) - if 'available_regions' in data: - print(f' Available in: ' + ', '.join(data['available_regions'])) - sys.exit(1) - " - register: capacity_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type not in 
["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - tags: - - bringup - -- name: Report Lambda Labs capacity check result - ansible.builtin.fail: - msg: "{{ capacity_check.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - capacity_check is defined - - capacity_check.rc is defined - - capacity_check.rc != 0 - tags: - - bringup - -- name: Auto-select instance type for tier-based wildcards - ansible.builtin.shell: - cmd: | - case "{{ terraform_datacrunch_instance_type }}" in - B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) - # Use tier-based selection script - tier_group=$(echo "{{ terraform_datacrunch_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/datacrunch_select_tier.py "$tier_group" --verbose - ;; - ANY_1H100) - # Legacy H100 variant selection - check all regions - for variant in 1H100.80S.30V 1H100.80S.32V; do - result=$({{ topdir_path }}/scripts/datacrunch_check_capacity.py --instance-type "$variant" --json 2>/dev/null || echo "[]") - location=$(echo "$result" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data[0]['location']) if data and len(data) > 0 else ''" 2>/dev/null) - if [ -n "$location" ]; then - echo "$variant $location" - exit 0 - fi - done - echo "No single H100 variants available" >&2 - exit 1 - ;; - *) - echo "Unknown wildcard type: {{ terraform_datacrunch_instance_type }}" - exit 1 - ;; - esac - register: datacrunch_auto_instance_type - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - tags: - - bringup - -- name: Fail if no instances available for wildcard selection - ansible.builtin.fail: - msg: | - No GPU instances available for {{ terraform_datacrunch_instance_type }} - - {{ 
datacrunch_auto_instance_type.stderr }} - - Try: - - Wait and retry (capacity changes frequently) - - Check DataCrunch dashboard: https://cloud.datacrunch.io - - Use a different tier group via menuconfig - - Check capacity manually: scripts/datacrunch_check_capacity.py --instance-type - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc != 0 - tags: - - bringup - -- name: Parse auto-selected instance type and location - set_fact: - auto_selected_instance: "{{ datacrunch_auto_instance_type.stdout.split()[0] }}" - auto_selected_location: "{{ datacrunch_auto_instance_type.stdout.split()[1] }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Report auto-selected instance type for wildcards - ansible.builtin.debug: - msg: "Auto-selected instance type: {{ auto_selected_instance }} in region: {{ auto_selected_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update terraform vars with auto-selected instance type - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^datacrunch_instance_type\s*=' - line: 'datacrunch_instance_type = "{{ auto_selected_instance }}"' - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - 
datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Set resolved instance type for subsequent tasks - set_fact: - resolved_instance_type: "{{ auto_selected_instance if (terraform_datacrunch_instance_type in ['B300_OR_LESS', 'B200_OR_LESS', 'H100_OR_LESS', 'A100_80_OR_LESS', 'A100_40_OR_LESS', 'ANY_1H100'] and datacrunch_auto_instance_type.rc == 0) else terraform_datacrunch_instance_type }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup +# Terraform role entry point +# Orchestrates provider-specific and common tasks via includes -- name: Check DataCrunch capacity before provisioning (if using DataCrunch) - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ - --instance-type {{ resolved_instance_type }} \ - --json | \ - python3 -c " - import sys, json - data = json.load(sys.stdin) - if data and len(data) > 0: - locations = [item['location'] for item in data] - print(f'Instance {{ resolved_instance_type }} is available in: ' + ', '.join(locations)) - sys.exit(0) - else: - print('Error: Instance {{ resolved_instance_type }} is not available in any location') - print('Check available instances with: scripts/datacrunch_check_capacity.py') - sys.exit(1) - " - register: datacrunch_capacity_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] +- name: Include bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/main.yml" tags: - bringup -- name: Report DataCrunch capacity check result - ansible.builtin.fail: - msg: "{{ datacrunch_capacity_check.stdout }}" - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_capacity_check is defined - - datacrunch_capacity_check.rc is defined - - datacrunch_capacity_check.rc != 0 - tags: - - bringup - -- name: 
Auto-select DataCrunch location for explicit instance types - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ - --instance-type {{ resolved_instance_type }} \ - --pick-first - register: datacrunch_auto_location - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - tags: - - bringup - -- name: Use tier-selected location for wildcard instance types - set_fact: - datacrunch_final_location: "{{ auto_selected_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Use auto-selected location for explicit instance types - set_fact: - datacrunch_final_location: "{{ datacrunch_auto_location.stdout }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_location.rc == 0 - tags: - - bringup - -- name: Update terraform vars with final location - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^datacrunch_location\s*=' - line: 'datacrunch_location = "{{ datacrunch_final_location }}"' - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_final_location is defined - tags: - - bringup - -- name: Display final location - ansible.builtin.debug: - msg: "Selected DataCrunch location: {{ datacrunch_final_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_final_location is defined - tags: - - bringup - -# No longer needed - terraform reads directly from 
credentials file - -- name: Check if terraform state already has resources - ansible.builtin.command: - cmd: terraform state list - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_state_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - -- name: Set flag for existing terraform resources - set_fact: - terraform_resources_exist: "{{ terraform_state_check.stdout_lines | default([]) | length > 0 }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - -- name: Report that infrastructure is already provisioned - ansible.builtin.debug: - msg: "Infrastructure already provisioned ({{ terraform_state_check.stdout_lines | default([]) | length }} resources in state). Skipping terraform apply." - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_resources_exist | default(false) - tags: - - bringup - -- name: Initialize external provider for DataCrunch (workaround for dev_overrides) - ansible.builtin.shell: - cmd: | - # Hide all terraform files that reference datacrunch resources - # so that terraform init only sees the external provider requirement - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file" ]; then - mv "$file" "$file.bak" - fi - done - - # Create minimal terraform config with only external provider - cat > provider_init.tf << 'EOF' - terraform { - required_version = ">= 1.0" - required_providers { - external = { - source = "hashicorp/external" - version = "~> 2.3" - } - } - } - EOF - - # Initialize to get external provider and generate lock file - # Suppress color output to avoid ANSI codes in logs - terraform init -no-color > /dev/null 2>&1 || terraform init -no-color - - # Preserve the generated lock file for the external provider - # This is needed because dev_overrides prevents normal init of datacrunch provider - # but we still need the lock file for other providers - if [ 
-f .terraform.lock.hcl ]; then - cp .terraform.lock.hcl .terraform.lock.hcl.generated - fi - - # Clean up temporary file and restore original terraform files - rm provider_init.tf - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file.bak" ]; then - mv "$file.bak" "$file" - fi - done - - # Restore the lock file after putting all files back - if [ -f .terraform.lock.hcl.generated ]; then - mv .terraform.lock.hcl.generated .terraform.lock.hcl - fi - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - - not (terraform_resources_exist | default(false)) - changed_when: false - tags: - - bringup - -- name: Bring up terraform resources (DataCrunch with tier fallback on failure) - ansible.builtin.shell: - cmd: | - MAX_RETRIES=5 - EXCLUDED_INSTANCES="" - TIER_GROUP="{{ terraform_datacrunch_instance_type | lower | replace('_', '-') }}" - - # Check if using tier-based selection - is_tier_based() { - case "{{ terraform_datacrunch_instance_type }}" in - B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) - return 0 - ;; - *) - return 1 - ;; - esac - } - - for attempt in $(seq 1 $MAX_RETRIES); do - echo "=== Attempt $attempt of $MAX_RETRIES ===" - - # Get current instance type from tfvars - CURRENT_INSTANCE=$(grep '^datacrunch_instance_type' terraform.tfvars | sed 's/.*= *"\([^"]*\)".*/\1/') - echo "Trying instance type: $CURRENT_INSTANCE" - - # Attempt terraform apply and capture output - APPLY_OUTPUT=$(terraform apply -auto-approve -no-color 2>&1) && { - echo "$APPLY_OUTPUT" - echo "Terraform apply succeeded!" 
- exit 0 - } - - # Apply failed - check what kind of error - echo "$APPLY_OUTPUT" - - # Check if this is a 503 or deployment error that we can retry with fallback - if echo "$APPLY_OUTPUT" | grep -q "API returned status 503\|Error deploying instance"; then - echo "" - echo "Deployment failed for $CURRENT_INSTANCE - checking if tier fallback is available..." - - if ! is_tier_based; then - echo "Not using tier-based selection, cannot fall back to different instance type." - exit 1 - fi - - # Add current instance to exclusion list - if [ -n "$EXCLUDED_INSTANCES" ]; then - EXCLUDED_INSTANCES="$EXCLUDED_INSTANCES --exclude $CURRENT_INSTANCE" - else - EXCLUDED_INSTANCES="--exclude $CURRENT_INSTANCE" - fi - - echo "Excluded instances so far: $EXCLUDED_INSTANCES" - - # Try to select next available instance - echo "Selecting next available instance from tier group: $TIER_GROUP" - NEXT_SELECTION=$({{ topdir_path }}/scripts/datacrunch_select_tier.py "$TIER_GROUP" --verbose $EXCLUDED_INSTANCES 2>&1) || { - echo "No more instances available in tier group $TIER_GROUP" - echo "$NEXT_SELECTION" - exit 1 - } - - # Parse new instance and location - NEW_INSTANCE=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $1}') - NEW_LOCATION=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $2}') - - if [ -z "$NEW_INSTANCE" ] || [ -z "$NEW_LOCATION" ]; then - echo "Failed to parse new instance selection" - exit 1 - fi - - echo "" - echo "Falling back to: $NEW_INSTANCE in $NEW_LOCATION" - - # Update terraform.tfvars with new instance type and location - sed -i "s/^datacrunch_instance_type.*/datacrunch_instance_type = \"$NEW_INSTANCE\"/" terraform.tfvars - sed -i "s/^datacrunch_location.*/datacrunch_location = \"$NEW_LOCATION\"/" terraform.tfvars - - echo "Updated terraform.tfvars, retrying..." 
- echo "" - else - echo "Terraform failed with non-recoverable error" - exit 1 - fi - done - - echo "Exhausted all $MAX_RETRIES retry attempts" - exit 1 - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - - not (terraform_resources_exist | default(false)) - tags: - - bringup - -- name: Bring up terraform resources (other providers) - cloud.terraform.terraform: - binary_path: "{{ terraform_binary_path }}" - force_init: true - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - state: present - when: - - kdevops_terraform_provider != "datacrunch" - tags: - - bringup - -- name: Retrieve the controller_ip_map from terraform - cloud.terraform.terraform_output: - binary_path: "{{ terraform_binary_path }}" - format: json - name: controller_ip_map - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_output - tags: - - ssh - -- name: Add each target node's ssh Host entry on the control host - ansible.builtin.blockinfile: - block: "{{ lookup('template', 'ssh_config.j2') }}" - create: true - dest: "{{ kdevops_ssh_config }}" - insertafter: "EOF" - marker: "# {mark} host configuration for {{ item.key }}" - mode: "u=rw,g=r,o=r" - loop: "{{ terraform_output.value | dict2items }}" - tags: - - ssh - -- name: Ensure the Include directive is present on the controller - ansible.builtin.blockinfile: - path: "{{ sshconfig }}" - insertbefore: BOF - append_newline: true - create: true - marker: "# {mark} Managed by kdevops" - mode: "u=rw,g=r,o=r" - block: "Include {{ kdevops_ssh_config_prefix }}*" +- name: Include SSH configuration tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/common/ssh-config.yml" tags: - ssh -- name: Report terraform status +- name: Include status reporting tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/common/status.yml" tags: - status - block: - - name: Retrieve the 
controller_ip_map from terraform - cloud.terraform.terraform_output: - binary_path: "{{ terraform_binary_path }}" - format: json - name: controller_ip_map - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_output - - - name: End play -- terraform state file is empty or missing - ansible.builtin.meta: end_play - when: - - terraform_output.warnings is defined - - - name: Count active resources - ansible.builtin.command: - cmd: "terraform state list" - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_state - changed_when: false - - - name: Show status - ansible.builtin.debug: - msg: "Active resources: {{ terraform_state.stdout_lines | length }}" - - - name: Show controller IP map - ansible.builtin.debug: - var: terraform_output.value - -- name: Remove the ephemeral ssh config file on the control host - ansible.builtin.file: - path: "{{ kdevops_ssh_config }}" - state: absent - tags: - - destroy - -- name: Initialize external provider for DataCrunch before destroy - ansible.builtin.shell: - cmd: | - # Hide all terraform files that reference datacrunch resources - # so that terraform init only sees the external provider requirement - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file" ]; then - mv "$file" "$file.bak" - fi - done - - # Create minimal terraform config with only external provider - cat > provider_init.tf << 'EOF' - terraform { - required_version = ">= 1.0" - required_providers { - external = { - source = "hashicorp/external" - version = "~> 2.3" - } - } - } - EOF - - # Initialize to get external provider and generate lock file - terraform init - - # Preserve the generated lock file for the external provider - if [ -f .terraform.lock.hcl ]; then - cp .terraform.lock.hcl .terraform.lock.hcl.generated - fi - - # Clean up temporary file and restore original terraform files - rm provider_init.tf - for file in provider.tf main.tf output.tf 
vars.tf nodes.tf; do - if [ -f "$file.bak" ]; then - mv "$file.bak" "$file" - fi - done - - # Restore the lock file after putting all files back - if [ -f .terraform.lock.hcl.generated ]; then - mv .terraform.lock.hcl.generated .terraform.lock.hcl - fi - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - changed_when: false - tags: - - destroy - -- name: Destroy terraform resources (DataCrunch with dev overrides) - ansible.builtin.command: - cmd: terraform destroy -auto-approve -no-color - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - destroy - -- name: Remove terraform lock file for DataCrunch after destroy - ansible.builtin.file: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/.terraform.lock.hcl" - state: absent - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - destroy -- name: Destroy terraform resources (other providers) - cloud.terraform.terraform: - binary_path: "{{ terraform_binary_path }}" - force_init: true - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - state: absent - when: - - kdevops_terraform_provider != "datacrunch" +- name: Include destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/main.yml" tags: - destroy From fac2c3147b49d15db96d10643eeb0e693a4bcfc5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 17 Dec 2025 10:47:20 -0500 Subject: [PATCH 2/2] MAINTAINERS: Update my email address The cel@kernel address now points to an email service that handles plaintext email properly. 
Signed-off-by: Chuck Lever <cel@kernel.org> --- MAINTAINERS | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cb43ebfa..65bf5716 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -99,7 +99,7 @@ F: scripts/refs.Makefile F: workflows/linux/refs/* TERRAFORM -M: Chuck Lever <chuck.lever@oracle.com> +M: Chuck Lever <cel@kernel.org> R: Luis Chamberlain L: kdevops@lists.linux.dev S: Maintained @@ -112,7 +112,7 @@ F: scripts/terraform.Makefile F: terraform/ GITR WORKFLOW -M: Chuck Lever <chuck.lever@oracle.com> +M: Chuck Lever <cel@kernel.org> L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -120,7 +120,7 @@ F: workflows/gitr/ F: playbooks/roles/gitr/ LTP WORKFLOW -M: Chuck Lever <chuck.lever@oracle.com> +M: Chuck Lever <cel@kernel.org> L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -128,7 +128,7 @@ F: workflows/ltp/ F: playbooks/roles/ltp/ NFSTEST WORKFLOW -M: Chuck Lever <chuck.lever@oracle.com> +M: Chuck Lever <cel@kernel.org> L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -136,7 +136,7 @@ F: workflows/nfstest/ F: playbooks/roles/nfstest/ PYNFS WORKFLOW -M: Chuck Lever <chuck.lever@oracle.com> +M: Chuck Lever <cel@kernel.org> L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git