diff --git a/MAINTAINERS b/MAINTAINERS index cb43ebfa..65bf5716 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -99,7 +99,7 @@ F: scripts/refs.Makefile F: workflows/linux/refs/* TERRAFORM -M: Chuck Lever +M: Chuck Lever R: Luis Chamberlain L: kdevops@lists.linux.dev S: Maintained @@ -112,7 +112,7 @@ F: scripts/terraform.Makefile F: terraform/ GITR WORKFLOW -M: Chuck Lever +M: Chuck Lever L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -120,7 +120,7 @@ F: workflows/gitr/ F: playbooks/roles/gitr/ LTP WORKFLOW -M: Chuck Lever +M: Chuck Lever L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -128,7 +128,7 @@ F: workflows/ltp/ F: playbooks/roles/ltp/ NFSTEST WORKFLOW -M: Chuck Lever +M: Chuck Lever L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git @@ -136,7 +136,7 @@ F: workflows/nfstest/ F: playbooks/roles/nfstest/ PYNFS WORKFLOW -M: Chuck Lever +M: Chuck Lever L: kdevops@lists.linux.dev S: Maintained T: git https://github.com/linux-kdevops/kdevops.git diff --git a/playbooks/roles/terraform/tasks/bringup/datacrunch.yml b/playbooks/roles/terraform/tasks/bringup/datacrunch.yml new file mode 100644 index 00000000..7e0ca15c --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/datacrunch.yml @@ -0,0 +1,441 @@ +--- +# DataCrunch provider bringup tasks +# Provider installation, tier-based instance selection, capacity checking, terraform apply + +- name: Set DataCrunch provider architecture + ansible.builtin.set_fact: + datacrunch_provider_arch: "{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" + tags: + - bringup + - destroy + - status + +- name: Check if DataCrunch terraform provider is already installed + ansible.builtin.stat: + path: >- + ~/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/0.0.3/{{ + ansible_system | lower }}_{{ datacrunch_provider_arch }}/terraform-provider-datacrunch_v0.0.3 + register: datacrunch_provider_installed + tags: + - bringup + - destroy + - status + +- name: Download and install DataCrunch terraform provider from GitHub releases + ansible.builtin.shell: + cmd: | + PROVIDER_VERSION="0.0.3" + PROVIDER_OS="{{ ansible_system | lower }}" + PROVIDER_ARCH="{{ datacrunch_provider_arch }}" + PROVIDER_DIR="$HOME/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch" + PROVIDER_DIR="${PROVIDER_DIR}/${PROVIDER_VERSION}/${PROVIDER_OS}_${PROVIDER_ARCH}" + PROVIDER_FILE="terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}" + GITHUB_URL="https://github.com/linux-kdevops/terraform-provider-datacrunch/releases" + + mkdir -p "${PROVIDER_DIR}" + cd "${PROVIDER_DIR}" + + # Download the provider binary + wget -q "${GITHUB_URL}/download/v${PROVIDER_VERSION}/${PROVIDER_FILE}.zip" + + # Extract the binary + unzip -o "${PROVIDER_FILE}.zip" + + # Clean up zip file + rm "${PROVIDER_FILE}.zip" + + # Make it executable + chmod +x terraform-provider-datacrunch_v${PROVIDER_VERSION} + + echo "DataCrunch provider v${PROVIDER_VERSION} installed to ${PROVIDER_DIR}" + changed_when: true + when: + - not datacrunch_provider_installed.stat.exists + tags: + - bringup + - destroy + - status + +- name: Auto-select instance type for tier-based wildcards + ansible.builtin.shell: + cmd: | + set -o pipefail + case "{{ terraform_datacrunch_instance_type }}" in + B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) + # Use tier-based selection script + tier_group=$(echo 
"{{ terraform_datacrunch_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/datacrunch_select_tier.py "$tier_group" --verbose + ;; + ANY_1H100) + # Legacy H100 variant selection - check all regions + for variant in 1H100.80S.30V 1H100.80S.32V; do + result=$({{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type "$variant" --json 2>/dev/null || echo "[]") + location=$(echo "$result" | python3 -c " + import sys, json + data=json.load(sys.stdin) + print(data[0]['location']) if data and len(data) > 0 else '' + " 2>/dev/null) + if [ -n "$location" ]; then + echo "$variant $location" + exit 0 + fi + done + echo "No single H100 variants available" >&2 + exit 1 + ;; + *) + echo "Unknown wildcard type: {{ terraform_datacrunch_instance_type }}" + exit 1 + ;; + esac + register: datacrunch_auto_instance_type + failed_when: false + changed_when: false + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + tags: + - bringup + +- name: Fail if no instances available for wildcard selection + ansible.builtin.fail: + msg: | + No GPU instances available for {{ terraform_datacrunch_instance_type }} + + {{ datacrunch_auto_instance_type.stderr }} + + Try: + - Wait and retry (capacity changes frequently) + - Check DataCrunch dashboard: https://cloud.datacrunch.io + - Use a different tier group via menuconfig + - Check capacity manually: scripts/datacrunch_check_capacity.py --instance-type + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc != 0 + tags: + - bringup + +- name: Parse auto-selected instance type and location + ansible.builtin.set_fact: + auto_selected_instance: "{{ datacrunch_auto_instance_type.stdout.split()[0] }}" + auto_selected_location: "{{ datacrunch_auto_instance_type.stdout.split()[1] }}" + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Report auto-selected instance type for wildcards + ansible.builtin.debug: + msg: >- + Auto-selected instance type: {{ auto_selected_instance }} + in region: {{ auto_selected_location }} + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update terraform vars with auto-selected instance type + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^datacrunch_instance_type\s*=' + line: 'datacrunch_instance_type = "{{ auto_selected_instance }}"' + when: + - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", + "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Define DataCrunch tier wildcards list + ansible.builtin.set_fact: + datacrunch_tier_wildcards: + - B300_OR_LESS + - B200_OR_LESS + - H100_OR_LESS + - A100_80_OR_LESS + - A100_40_OR_LESS + - ANY_1H100 + tags: + - bringup + +- name: Set resolved instance type for subsequent tasks + ansible.builtin.set_fact: + resolved_instance_type: >- + {{ auto_selected_instance + if 
(terraform_datacrunch_instance_type in datacrunch_tier_wildcards + and datacrunch_auto_instance_type.rc == 0) + else terraform_datacrunch_instance_type }} + tags: + - bringup + +- name: Check DataCrunch capacity before provisioning + ansible.builtin.shell: + cmd: | + set -o pipefail + {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type {{ resolved_instance_type }} \ + --json | \ + python3 -c " + import sys, json + data = json.load(sys.stdin) + if data and len(data) > 0: + locations = [item['location'] for item in data] + print(f'Instance {{ resolved_instance_type }} is available in: ' + ', '.join(locations)) + sys.exit(0) + else: + print('Error: Instance {{ resolved_instance_type }} is not available in any location') + print('Check available instances with: scripts/datacrunch_check_capacity.py') + sys.exit(1) + " + register: datacrunch_capacity_check + failed_when: false + changed_when: false + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + tags: + - bringup + +- name: Report DataCrunch capacity check result + ansible.builtin.fail: + msg: "{{ datacrunch_capacity_check.stdout }}" + when: + - datacrunch_capacity_check is defined + - datacrunch_capacity_check.rc is defined + - datacrunch_capacity_check.rc != 0 + tags: + - bringup + +- name: Auto-select DataCrunch location for explicit instance types + ansible.builtin.shell: + cmd: | + {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ + --instance-type {{ resolved_instance_type }} \ + --pick-first + register: datacrunch_auto_location + changed_when: false + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + tags: + - bringup + +- name: Use tier-selected location for wildcard instance types + ansible.builtin.set_fact: + datacrunch_final_location: "{{ auto_selected_location }}" + when: + - terraform_datacrunch_instance_type in datacrunch_tier_wildcards + - datacrunch_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Use auto-selected location for explicit instance types + ansible.builtin.set_fact: + datacrunch_final_location: "{{ datacrunch_auto_location.stdout }}" + when: + - terraform_datacrunch_instance_type not in datacrunch_tier_wildcards + - datacrunch_auto_location.rc == 0 + tags: + - bringup + +- name: Update terraform vars with final location + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^datacrunch_location\s*=' + line: 'datacrunch_location = "{{ datacrunch_final_location }}"' + when: + - datacrunch_final_location is defined + tags: + - bringup + +- name: Display final location + ansible.builtin.debug: + msg: "Selected DataCrunch location: {{ datacrunch_final_location }}" + when: + - datacrunch_final_location is defined + tags: + - bringup + +- name: Check if terraform state already has resources + ansible.builtin.command: + cmd: terraform state list + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_state_check + failed_when: false + changed_when: false + tags: + - bringup + +- name: Set flag for existing terraform resources + ansible.builtin.set_fact: + terraform_resources_exist: "{{ terraform_state_check.stdout_lines | default([]) | length > 0 }}" + tags: + - bringup + +- name: Report that infrastructure is already provisioned + ansible.builtin.debug: + msg: "Infrastructure already provisioned ({{ terraform_state_check.stdout_lines | default([]) | length }} resources in state). Skipping terraform apply." 
+ when: + - terraform_resources_exist | default(false) + tags: + - bringup + +- name: Initialize external provider for DataCrunch (workaround for dev_overrides) + ansible.builtin.shell: + cmd: | + # Hide all terraform files that reference datacrunch resources + # so that terraform init only sees the external provider requirement + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file" ]; then + mv "$file" "$file.bak" + fi + done + + # Create minimal terraform config with only external provider + cat > provider_init.tf << 'EOF' + terraform { + required_version = ">= 1.0" + required_providers { + external = { + source = "hashicorp/external" + version = "~> 2.3" + } + } + } + EOF + + # Initialize to get external provider and generate lock file + # Suppress color output to avoid ANSI codes in logs + terraform init -no-color > /dev/null 2>&1 || terraform init -no-color + + # Preserve the generated lock file for the external provider + # This is needed because dev_overrides prevents normal init of datacrunch provider + # but we still need the lock file for other providers + if [ -f .terraform.lock.hcl ]; then + cp .terraform.lock.hcl .terraform.lock.hcl.generated + fi + + # Clean up temporary file and restore original terraform files + rm provider_init.tf + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file.bak" ]; then + mv "$file.bak" "$file" + fi + done + + # Restore the lock file after putting all files back + if [ -f .terraform.lock.hcl.generated ]; then + mv .terraform.lock.hcl.generated .terraform.lock.hcl + fi + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + when: + - not (terraform_resources_exist | default(false)) + changed_when: false + tags: + - bringup + +- name: Bring up terraform resources (DataCrunch with tier fallback on failure) + ansible.builtin.shell: + cmd: | + set -o pipefail + MAX_RETRIES=5 + EXCLUDED_INSTANCES="" + TIER_GROUP="{{ terraform_datacrunch_instance_type | lower | replace('_', '-') }}" + + # Check if using tier-based selection + is_tier_based() { + case "{{ terraform_datacrunch_instance_type }}" in + B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) + return 0 + ;; + *) + return 1 + ;; + esac + } + + for attempt in $(seq 1 $MAX_RETRIES); do + echo "=== Attempt $attempt of $MAX_RETRIES ===" + + # Get current instance type from tfvars + CURRENT_INSTANCE=$(grep '^datacrunch_instance_type' terraform.tfvars | \ + sed 's/.*= *"\([^"]*\)".*/\1/') + echo "Trying instance type: $CURRENT_INSTANCE" + + # Attempt terraform apply and capture output + APPLY_OUTPUT=$(terraform apply -auto-approve -no-color 2>&1) && { + echo "$APPLY_OUTPUT" + echo "Terraform apply succeeded!" + exit 0 + } + + # Apply failed - check what kind of error + echo "$APPLY_OUTPUT" + + # Check if this is a 503 or deployment error that we can retry with fallback + if echo "$APPLY_OUTPUT" | grep -q "API returned status 503\|Error deploying instance"; then + echo "" + echo "Deployment failed for $CURRENT_INSTANCE - checking if tier fallback is available..." + + if ! is_tier_based; then + echo "Not using tier-based selection, cannot fall back to different instance type." 
+ exit 1 + fi + + # Add current instance to exclusion list + if [ -n "$EXCLUDED_INSTANCES" ]; then + EXCLUDED_INSTANCES="$EXCLUDED_INSTANCES --exclude $CURRENT_INSTANCE" + else + EXCLUDED_INSTANCES="--exclude $CURRENT_INSTANCE" + fi + + echo "Excluded instances so far: $EXCLUDED_INSTANCES" + + # Try to select next available instance + echo "Selecting next available instance from tier group: $TIER_GROUP" + NEXT_SELECTION=$({{ topdir_path }}/scripts/datacrunch_select_tier.py \ + "$TIER_GROUP" --verbose $EXCLUDED_INSTANCES 2>&1) || { + echo "No more instances available in tier group $TIER_GROUP" + echo "$NEXT_SELECTION" + exit 1 + } + + # Parse new instance and location + NEW_INSTANCE=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $1}') + NEW_LOCATION=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $2}') + + if [ -z "$NEW_INSTANCE" ] || [ -z "$NEW_LOCATION" ]; then + echo "Failed to parse new instance selection" + exit 1 + fi + + echo "" + echo "Falling back to: $NEW_INSTANCE in $NEW_LOCATION" + + # Update terraform.tfvars with new instance type and location + sed -i "s/^datacrunch_instance_type.*/datacrunch_instance_type = \"$NEW_INSTANCE\"/" \ + terraform.tfvars + sed -i "s/^datacrunch_location.*/datacrunch_location = \"$NEW_LOCATION\"/" \ + terraform.tfvars + + echo "Updated terraform.tfvars, retrying..." + echo "" + else + echo "Terraform failed with non-recoverable error" + exit 1 + fi + done + + echo "Exhausted all $MAX_RETRIES retry attempts" + exit 1 + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: true + when: + - not (terraform_resources_exist | default(false)) + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/generic.yml b/playbooks/roles/terraform/tasks/bringup/generic.yml new file mode 100644 index 00000000..2e5a9290 --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/generic.yml @@ -0,0 +1,12 @@ +--- +# Generic terraform bringup for providers without special handling +# Used for: AWS, Azure, GCE, OCI, and other standard providers + +- name: Bring up terraform resources + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + state: present + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml b/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml new file mode 100644 index 00000000..8aa9048a --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/lambdalabs.yml @@ -0,0 +1,222 @@ +--- +# Lambda Labs provider bringup tasks +# API key validation, tier-based instance selection, capacity checking + +- name: Define Lambda Labs tier wildcards list + ansible.builtin.set_fact: + lambdalabs_tier_wildcards: + - GH200_OR_LESS + - H100_OR_LESS + - A100_OR_LESS + - A6000_OR_LESS + - 8X_B200_OR_LESS + - 8X_H100_OR_LESS + - 8X_A100_OR_LESS + tags: + - bringup + +- name: Check Lambda Labs API key configuration + ansible.builtin.command: + cmd: "python3 {{ topdir_path }}/scripts/lambdalabs_credentials.py check" + register: api_key_check + failed_when: false + changed_when: false + tags: + - bringup + - destroy + - status + +- name: Report Lambda Labs API key configuration status + ansible.builtin.fail: + msg: | + ERROR: Lambda Labs API key is not configured! 
+ + To fix this, configure your Lambda Labs API key using one of these methods: + + Use the kdevops credentials management tool: + python3 scripts/lambdalabs_credentials.py set 'your-actual-api-key-here' + + Or manually create the credentials file: + mkdir -p ~/.lambdalabs + echo "[default]" > ~/.lambdalabs/credentials + echo "lambdalabs_api_key=your-actual-api-key-here" >> ~/.lambdalabs/credentials + chmod 600 ~/.lambdalabs/credentials + + Get your API key from: https://cloud.lambdalabs.com + when: + - api_key_check.rc != 0 + tags: + - bringup + - destroy + - status + +- name: Display Lambda Labs API key configuration status + ansible.builtin.debug: + msg: "{{ api_key_check.stdout }}" + when: + - api_key_check.rc == 0 + tags: + - bringup + - destroy + - status + +- name: Auto-select Lambda Labs instance type for tier-based wildcards + ansible.builtin.shell: + cmd: | + set -o pipefail + case "{{ terraform_lambdalabs_instance_type }}" in + GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS) + # Use tier-based selection script for single GPU + tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose + ;; + 8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS) + # Use tier-based selection script for 8x GPU + tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | \ + tr '[:upper:]' '[:lower:]' | tr '_' '-') + {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose + ;; + *) + echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}" + exit 1 + ;; + esac + register: lambdalabs_auto_instance_type + failed_when: false + changed_when: false + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + tags: + - bringup + +- name: Fail if no Lambda Labs instances available for wildcard selection + ansible.builtin.fail: + msg: | + No GPU instances available for {{ terraform_lambdalabs_instance_type }} + + {{ lambdalabs_auto_instance_type.stderr }} + + Try: + - Wait and retry (capacity changes frequently) + - Check Lambda Labs dashboard: https://cloud.lambdalabs.com + - Use a different tier group via menuconfig + - Check capacity manually: scripts/lambdalabs_check_capacity.py + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc != 0 + tags: + - bringup + +- name: Validate Lambda Labs tier selection output format + ansible.builtin.assert: + that: + - lambdalabs_auto_instance_type.stdout.split() | length == 2 + fail_msg: | + Invalid output from tier selection script. 
+ Expected format: "instance_type region" + Got: "{{ lambdalabs_auto_instance_type.stdout }}" + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Parse Lambda Labs auto-selected instance type and region + ansible.builtin.set_fact: + lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}" + lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}" + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Report Lambda Labs auto-selected instance type for wildcards + ansible.builtin.debug: + msg: >- + Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} + in region: {{ lambdalabs_auto_selected_region }} + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update Lambda Labs terraform vars with auto-selected instance type + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^lambdalabs_instance_type\s*=' + line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"' + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Update Lambda Labs terraform vars with auto-selected region + ansible.builtin.lineinfile: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" + regexp: '^lambdalabs_region\s*=' + line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"' + when: + - terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + - lambdalabs_auto_instance_type.rc == 0 + tags: + - bringup + +- name: Set Lambda Labs resolved instance type for subsequent tasks + ansible.builtin.set_fact: + lambdalabs_resolved_instance_type: >- + {{ lambdalabs_auto_selected_instance + if (terraform_lambdalabs_instance_type in lambdalabs_tier_wildcards + and lambdalabs_auto_instance_type.rc == 0) + else terraform_lambdalabs_instance_type }} + tags: + - bringup + +- name: Check Lambda Labs capacity before provisioning + ansible.builtin.shell: + cmd: | + set -o pipefail + {{ topdir_path }}/scripts/lambda-cli --output json check-availability \ + {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} \ + {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \ + python3 -c " + import sys, json + data = json.load(sys.stdin) + if data.get('available'): + print(data.get('message', 'Instance available')) + sys.exit(0) + else: + print(data.get('error', 'Instance not available')) + if 'available_regions' in data: + print(f' Available in: ' + ', '.join(data['available_regions'])) + sys.exit(1) + " + register: capacity_check + failed_when: false + changed_when: false + when: + - terraform_lambdalabs_instance_type not in lambdalabs_tier_wildcards + tags: + - bringup + +- name: Report Lambda Labs capacity check result + ansible.builtin.fail: + msg: "{{ capacity_check.stdout }}" + when: + - capacity_check is defined + - capacity_check.rc is defined + - capacity_check.rc != 0 + tags: + - bringup + +- name: Bring up terraform resources (Lambda Labs) + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ 
kdevops_terraform_provider }}" + state: present + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/bringup/main.yml b/playbooks/roles/terraform/tasks/bringup/main.yml new file mode 100644 index 00000000..5723740a --- /dev/null +++ b/playbooks/roles/terraform/tasks/bringup/main.yml @@ -0,0 +1,27 @@ +--- +# Terraform bringup orchestration +# Includes provider-specific tasks based on kdevops_terraform_provider + +- name: Include Lambda Labs bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/lambdalabs.yml" + when: + - kdevops_terraform_provider == "lambdalabs" + tags: + - bringup + +- name: Include DataCrunch bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/datacrunch.yml" + when: + - kdevops_terraform_provider == "datacrunch" + tags: + - bringup + +- name: Include generic bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/generic.yml" + when: + - kdevops_terraform_provider not in ["lambdalabs", "datacrunch"] + tags: + - bringup diff --git a/playbooks/roles/terraform/tasks/common/ssh-config.yml b/playbooks/roles/terraform/tasks/common/ssh-config.yml new file mode 100644 index 00000000..c1c4d0a9 --- /dev/null +++ b/playbooks/roles/terraform/tasks/common/ssh-config.yml @@ -0,0 +1,37 @@ +--- +# SSH configuration generation for terraform-provisioned nodes +# This is common to all terraform providers + +- name: Retrieve the controller_ip_map from terraform + cloud.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + format: json + name: controller_ip_map + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_output + tags: + - ssh + +- name: Add each target node's ssh Host entry on the control host + ansible.builtin.blockinfile: + block: "{{ lookup('template', 'ssh_config.j2') }}" + create: true + dest: "{{ kdevops_ssh_config }}" + insertafter: "EOF" + marker: "# {mark} host configuration for {{ item.key }}" + mode: "u=rw,g=r,o=r" + loop: "{{ terraform_output.value | dict2items }}" + tags: + - ssh + +- name: Ensure the Include directive is present on the controller + ansible.builtin.blockinfile: + path: "{{ sshconfig }}" + insertbefore: BOF + append_newline: true + create: true + marker: "# {mark} Managed by kdevops" + mode: "u=rw,g=r,o=r" + block: "Include {{ kdevops_ssh_config_prefix }}*" + tags: + - ssh diff --git a/playbooks/roles/terraform/tasks/common/status.yml b/playbooks/roles/terraform/tasks/common/status.yml new file mode 100644 index 00000000..cfa68670 --- /dev/null +++ b/playbooks/roles/terraform/tasks/common/status.yml @@ -0,0 +1,35 @@ +--- +# Terraform status reporting +# Common to all terraform providers + +- name: Report terraform status + tags: + - status + block: + - name: Retrieve the controller_ip_map from terraform + cloud.terraform.terraform_output: + binary_path: "{{ terraform_binary_path }}" + format: json + name: controller_ip_map + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_output + + - name: End play -- terraform state file is empty or missing + ansible.builtin.meta: end_play + when: + - terraform_output.warnings is defined + + - name: Count active resources + ansible.builtin.command: + cmd: "terraform state list" + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + register: terraform_state + changed_when: false + + - name: Show status + ansible.builtin.debug: + msg: "Active resources: {{ 
terraform_state.stdout_lines | length }}" + + - name: Show controller IP map + ansible.builtin.debug: + var: terraform_output.value diff --git a/playbooks/roles/terraform/tasks/destroy/datacrunch.yml b/playbooks/roles/terraform/tasks/destroy/datacrunch.yml new file mode 100644 index 00000000..c2bc1342 --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/datacrunch.yml @@ -0,0 +1,67 @@ +--- +# DataCrunch provider destroy tasks +# Handles dev_overrides workaround for external provider initialization + +- name: Initialize external provider for DataCrunch before destroy + ansible.builtin.shell: + cmd: | + # Hide all terraform files that reference datacrunch resources + # so that terraform init only sees the external provider requirement + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file" ]; then + mv "$file" "$file.bak" + fi + done + + # Create minimal terraform config with only external provider + cat > provider_init.tf << 'EOF' + terraform { + required_version = ">= 1.0" + required_providers { + external = { + source = "hashicorp/external" + version = "~> 2.3" + } + } + } + EOF + + # Initialize to get external provider and generate lock file + terraform init + + # Preserve the generated lock file for the external provider + if [ -f .terraform.lock.hcl ]; then + cp .terraform.lock.hcl .terraform.lock.hcl.generated + fi + + # Clean up temporary file and restore original terraform files + rm provider_init.tf + for file in provider.tf main.tf output.tf vars.tf nodes.tf; do + if [ -f "$file.bak" ]; then + mv "$file.bak" "$file" + fi + done + + # Restore the lock file after putting all files back + if [ -f .terraform.lock.hcl.generated ]; then + mv .terraform.lock.hcl.generated .terraform.lock.hcl + fi + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: false + tags: + - destroy + +- name: Destroy terraform resources (DataCrunch with dev overrides) + ansible.builtin.command: + cmd: terraform destroy -auto-approve -no-color + chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + changed_when: true + tags: + - destroy + +- name: Remove terraform lock file for DataCrunch after destroy + ansible.builtin.file: + path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/.terraform.lock.hcl" + state: absent + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/destroy/generic.yml b/playbooks/roles/terraform/tasks/destroy/generic.yml new file mode 100644 index 00000000..7df6390e --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/generic.yml @@ -0,0 +1,12 @@ +--- +# Generic terraform destroy for providers without special handling +# Used for: AWS, Azure, GCE, OCI, Lambda Labs, and other standard providers + +- name: Destroy terraform resources + cloud.terraform.terraform: + binary_path: "{{ terraform_binary_path }}" + force_init: true + project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" + state: absent + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/destroy/main.yml b/playbooks/roles/terraform/tasks/destroy/main.yml new file mode 100644 index 00000000..f882ceab --- /dev/null +++ b/playbooks/roles/terraform/tasks/destroy/main.yml @@ -0,0 +1,26 @@ +--- +# Terraform destroy orchestration +# Includes provider-specific tasks based on kdevops_terraform_provider + +- name: Remove the ephemeral ssh config file on the control host + ansible.builtin.file: + path: "{{ kdevops_ssh_config }}" + state: absent + tags: + - destroy + +- name: Include DataCrunch 
destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/datacrunch.yml" + when: + - kdevops_terraform_provider == "datacrunch" + tags: + - destroy + +- name: Include generic destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/generic.yml" + when: + - kdevops_terraform_provider != "datacrunch" + tags: + - destroy diff --git a/playbooks/roles/terraform/tasks/main.yml b/playbooks/roles/terraform/tasks/main.yml index 9561b837..f57652f1 100644 --- a/playbooks/roles/terraform/tasks/main.yml +++ b/playbooks/roles/terraform/tasks/main.yml @@ -1,778 +1,27 @@ --- -- name: Check Lambda Labs API key configuration (if using Lambda Labs) - ansible.builtin.command: - cmd: "python3 {{ topdir_path }}/scripts/lambdalabs_credentials.py check" - register: api_key_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - tags: - - bringup - - destroy - - status - -- name: Report Lambda Labs API key configuration status - ansible.builtin.fail: - msg: | - ERROR: Lambda Labs API key is not configured! - - To fix this, configure your Lambda Labs API key using one of these methods: - - Use the kdevops credentials management tool: - python3 scripts/lambdalabs_credentials.py set 'your-actual-api-key-here' - - Or manually create the credentials file: - mkdir -p ~/.lambdalabs - echo "[default]" > ~/.lambdalabs/credentials - echo "lambdalabs_api_key=your-actual-api-key-here" >> ~/.lambdalabs/credentials - chmod 600 ~/.lambdalabs/credentials - - Get your API key from: https://cloud.lambdalabs.com - when: - - kdevops_terraform_provider == "lambdalabs" - - api_key_check.rc != 0 - tags: - - bringup - - destroy - - status - -- name: Display Lambda Labs API key configuration status - ansible.builtin.debug: - msg: "{{ api_key_check.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - api_key_check.rc == 0 - tags: - - bringup - - destroy - - status - -- name: Check if DataCrunch terraform provider is already installed - ansible.builtin.stat: - path: "~/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/0.0.3/{{ ansible_system | lower }}_{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}/terraform-provider-datacrunch_v0.0.3" - register: datacrunch_provider_installed - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - - destroy - - status - -- name: Download and install DataCrunch terraform provider from GitHub releases - ansible.builtin.shell: - cmd: | - PROVIDER_VERSION="0.0.3" - PROVIDER_OS="{{ ansible_system | lower }}" - PROVIDER_ARCH="{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" - PROVIDER_DIR="$HOME/.terraform.d/plugins/registry.terraform.io/linux-kdevops/datacrunch/${PROVIDER_VERSION}/${PROVIDER_OS}_${PROVIDER_ARCH}" - - mkdir -p "${PROVIDER_DIR}" - cd "${PROVIDER_DIR}" - - # Download the provider binary - wget -q "https://github.com/linux-kdevops/terraform-provider-datacrunch/releases/download/v${PROVIDER_VERSION}/terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Extract the binary - unzip -o "terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Clean up zip file - rm "terraform-provider-datacrunch_${PROVIDER_VERSION}_${PROVIDER_OS}_${PROVIDER_ARCH}.zip" - - # Make it executable - chmod +x terraform-provider-datacrunch_v${PROVIDER_VERSION} - - echo "DataCrunch provider v${PROVIDER_VERSION} installed to 
${PROVIDER_DIR}" - when: - - kdevops_terraform_provider == "datacrunch" - - not datacrunch_provider_installed.stat.exists - tags: - - bringup - - destroy - - status - -- name: Auto-select Lambda Labs instance type for tier-based wildcards - ansible.builtin.shell: - cmd: | - case "{{ terraform_lambdalabs_instance_type }}" in - GH200_OR_LESS|H100_OR_LESS|A100_OR_LESS|A6000_OR_LESS) - # Use tier-based selection script for single GPU - tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose - ;; - 8X_B200_OR_LESS|8X_H100_OR_LESS|8X_A100_OR_LESS) - # Use tier-based selection script for 8x GPU - tier_group=$(echo "{{ terraform_lambdalabs_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/lambdalabs_select_tier.py "$tier_group" --verbose - ;; - *) - echo "Unknown wildcard type: {{ terraform_lambdalabs_instance_type }}" - exit 1 - ;; - esac - register: lambdalabs_auto_instance_type - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - tags: - - bringup - -- name: Fail if no Lambda Labs instances available for wildcard selection - ansible.builtin.fail: - msg: | - No GPU instances available for {{ terraform_lambdalabs_instance_type }} - - {{ lambdalabs_auto_instance_type.stderr }} - - Try: - - Wait and retry (capacity changes frequently) - - Check Lambda Labs dashboard: https://cloud.lambdalabs.com - - Use a different tier group via menuconfig - - Check capacity manually: scripts/lambdalabs_check_capacity.py - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc != 0 - tags: - - bringup - -- name: Validate Lambda Labs tier selection output format - ansible.builtin.assert: - that: - - lambdalabs_auto_instance_type.stdout.split() | length == 2 - fail_msg: | - Invalid output from tier selection script. 
- Expected format: "instance_type region" - Got: "{{ lambdalabs_auto_instance_type.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Parse Lambda Labs auto-selected instance type and region - set_fact: - lambdalabs_auto_selected_instance: "{{ lambdalabs_auto_instance_type.stdout.split()[0] }}" - lambdalabs_auto_selected_region: "{{ lambdalabs_auto_instance_type.stdout.split()[1] }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Report Lambda Labs auto-selected instance type for wildcards - ansible.builtin.debug: - msg: "Auto-selected instance type: {{ lambdalabs_auto_selected_instance }} in region: {{ lambdalabs_auto_selected_region }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update Lambda Labs terraform vars with auto-selected instance type - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^lambdalabs_instance_type\s*=' - line: 'lambdalabs_instance_type = "{{ lambdalabs_auto_selected_instance }}"' - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update Lambda Labs terraform vars with auto-selected region - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^lambdalabs_region\s*=' - line: 'lambdalabs_region = "{{ lambdalabs_auto_selected_region }}"' - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - - lambdalabs_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Set Lambda Labs resolved instance type for subsequent tasks - set_fact: - lambdalabs_resolved_instance_type: "{{ lambdalabs_auto_selected_instance if (terraform_lambdalabs_instance_type in ['GH200_OR_LESS', 'H100_OR_LESS', 'A100_OR_LESS', 'A6000_OR_LESS', '8X_B200_OR_LESS', '8X_H100_OR_LESS', '8X_A100_OR_LESS'] and lambdalabs_auto_instance_type.rc == 0) else terraform_lambdalabs_instance_type }}" - when: - - kdevops_terraform_provider == "lambdalabs" - tags: - - bringup - -- name: Check Lambda Labs capacity before provisioning (if using Lambda Labs) - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/lambda-cli --output json check-availability \ - {{ lambdalabs_resolved_instance_type | default(terraform_lambdalabs_instance_type) }} {{ lambdalabs_auto_selected_region | default(terraform_lambdalabs_region) }} | \ - python3 -c " - import sys, json - data = json.load(sys.stdin) - if data.get('available'): - 
print(data.get('message', 'Instance available')) - sys.exit(0) - else: - print(data.get('error', 'Instance not available')) - if 'available_regions' in data: - print(f' Available in: ' + ', '.join(data['available_regions'])) - sys.exit(1) - " - register: capacity_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "lambdalabs" - - terraform_lambdalabs_instance_type not in ["GH200_OR_LESS", "H100_OR_LESS", "A100_OR_LESS", "A6000_OR_LESS", "8X_B200_OR_LESS", "8X_H100_OR_LESS", "8X_A100_OR_LESS"] - tags: - - bringup - -- name: Report Lambda Labs capacity check result - ansible.builtin.fail: - msg: "{{ capacity_check.stdout }}" - when: - - kdevops_terraform_provider == "lambdalabs" - - capacity_check is defined - - capacity_check.rc is defined - - capacity_check.rc != 0 - tags: - - bringup - -- name: Auto-select instance type for tier-based wildcards - ansible.builtin.shell: - cmd: | - case "{{ terraform_datacrunch_instance_type }}" in - B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) - # Use tier-based selection script - tier_group=$(echo "{{ terraform_datacrunch_instance_type }}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') - {{ topdir_path }}/scripts/datacrunch_select_tier.py "$tier_group" --verbose - ;; - ANY_1H100) - # Legacy H100 variant selection - check all regions - for variant in 1H100.80S.30V 1H100.80S.32V; do - result=$({{ topdir_path }}/scripts/datacrunch_check_capacity.py --instance-type "$variant" --json 2>/dev/null || echo "[]") - location=$(echo "$result" | python3 -c "import sys, json; data=json.load(sys.stdin); print(data[0]['location']) if data and len(data) > 0 else ''" 2>/dev/null) - if [ -n "$location" ]; then - echo "$variant $location" - exit 0 - fi - done - echo "No single H100 variants available" >&2 - exit 1 - ;; - *) - echo "Unknown wildcard type: {{ terraform_datacrunch_instance_type }}" - exit 1 - ;; - esac - register: datacrunch_auto_instance_type - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - tags: - - bringup - -- name: Fail if no instances available for wildcard selection - ansible.builtin.fail: - msg: | - No GPU instances available for {{ terraform_datacrunch_instance_type }} - - {{ datacrunch_auto_instance_type.stderr }} - - Try: - - Wait and retry (capacity changes frequently) - - Check DataCrunch dashboard: https://cloud.datacrunch.io - - Use a different tier group via menuconfig - - Check capacity manually: scripts/datacrunch_check_capacity.py --instance-type - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc != 0 - tags: - - bringup - -- name: Parse auto-selected instance type and location - set_fact: - auto_selected_instance: "{{ datacrunch_auto_instance_type.stdout.split()[0] }}" - auto_selected_location: "{{ datacrunch_auto_instance_type.stdout.split()[1] }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Report auto-selected instance type for wildcards - ansible.builtin.debug: - msg: "Auto-selected 
instance type: {{ auto_selected_instance }} in region: {{ auto_selected_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Update terraform vars with auto-selected instance type - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^datacrunch_instance_type\s*=' - line: 'datacrunch_instance_type = "{{ auto_selected_instance }}"' - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Set resolved instance type for subsequent tasks - set_fact: - resolved_instance_type: "{{ auto_selected_instance if (terraform_datacrunch_instance_type in ['B300_OR_LESS', 'B200_OR_LESS', 'H100_OR_LESS', 'A100_80_OR_LESS', 'A100_40_OR_LESS', 'ANY_1H100'] and datacrunch_auto_instance_type.rc == 0) else terraform_datacrunch_instance_type }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup +# Terraform role entry point +# Orchestrates provider-specific and common tasks via includes -- name: Check DataCrunch capacity before provisioning (if using DataCrunch) - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ - --instance-type {{ resolved_instance_type }} \ - --json | \ - python3 -c " - import sys, json - data = json.load(sys.stdin) - if data and len(data) > 0: - locations = [item['location'] for item in data] - print(f'Instance {{ resolved_instance_type }} is available in: ' + ', '.join(locations)) - sys.exit(0) - else: - print('Error: Instance {{ resolved_instance_type }} is not available in any location') - print('Check available instances with: scripts/datacrunch_check_capacity.py') - sys.exit(1) - " - register: datacrunch_capacity_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] +- name: Include bringup tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/bringup/main.yml" tags: - bringup -- name: Report DataCrunch capacity check result - ansible.builtin.fail: - msg: "{{ datacrunch_capacity_check.stdout }}" - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_capacity_check is defined - - datacrunch_capacity_check.rc is defined - - datacrunch_capacity_check.rc != 0 - tags: - - bringup - -- name: Auto-select DataCrunch location for explicit instance types - ansible.builtin.shell: - cmd: | - {{ topdir_path }}/scripts/datacrunch_check_capacity.py \ - --instance-type {{ resolved_instance_type }} \ - --pick-first - register: datacrunch_auto_location - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - tags: - - bringup - -- name: Use tier-selected location for wildcard instance types - set_fact: - datacrunch_final_location: "{{ auto_selected_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - 
terraform_datacrunch_instance_type in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_instance_type.rc == 0 - tags: - - bringup - -- name: Use auto-selected location for explicit instance types - set_fact: - datacrunch_final_location: "{{ datacrunch_auto_location.stdout }}" - when: - - kdevops_terraform_provider == "datacrunch" - - terraform_datacrunch_instance_type not in ["B300_OR_LESS", "B200_OR_LESS", "H100_OR_LESS", "A100_80_OR_LESS", "A100_40_OR_LESS", "ANY_1H100"] - - datacrunch_auto_location.rc == 0 - tags: - - bringup - -- name: Update terraform vars with final location - ansible.builtin.lineinfile: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/terraform.tfvars" - regexp: '^datacrunch_location\s*=' - line: 'datacrunch_location = "{{ datacrunch_final_location }}"' - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_final_location is defined - tags: - - bringup - -- name: Display final location - ansible.builtin.debug: - msg: "Selected DataCrunch location: {{ datacrunch_final_location }}" - when: - - kdevops_terraform_provider == "datacrunch" - - datacrunch_final_location is defined - tags: - - bringup - -# No longer needed - terraform reads directly from credentials file - -- name: Check if terraform state already has resources - ansible.builtin.command: - cmd: terraform state list - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_state_check - failed_when: false - changed_when: false - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - -- name: Set flag for existing terraform resources - set_fact: - terraform_resources_exist: "{{ terraform_state_check.stdout_lines | default([]) | length > 0 }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - bringup - -- name: Report that infrastructure is already provisioned - ansible.builtin.debug: - msg: "Infrastructure already provisioned ({{ terraform_state_check.stdout_lines | default([]) | length }} resources in state). Skipping terraform apply." 
- when: - - kdevops_terraform_provider == "datacrunch" - - terraform_resources_exist | default(false) - tags: - - bringup - -- name: Initialize external provider for DataCrunch (workaround for dev_overrides) - ansible.builtin.shell: - cmd: | - # Hide all terraform files that reference datacrunch resources - # so that terraform init only sees the external provider requirement - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file" ]; then - mv "$file" "$file.bak" - fi - done - - # Create minimal terraform config with only external provider - cat > provider_init.tf << 'EOF' - terraform { - required_version = ">= 1.0" - required_providers { - external = { - source = "hashicorp/external" - version = "~> 2.3" - } - } - } - EOF - - # Initialize to get external provider and generate lock file - # Suppress color output to avoid ANSI codes in logs - terraform init -no-color > /dev/null 2>&1 || terraform init -no-color - - # Preserve the generated lock file for the external provider - # This is needed because dev_overrides prevents normal init of datacrunch provider - # but we still need the lock file for other providers - if [ -f .terraform.lock.hcl ]; then - cp .terraform.lock.hcl .terraform.lock.hcl.generated - fi - - # Clean up temporary file and restore original terraform files - rm provider_init.tf - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file.bak" ]; then - mv "$file.bak" "$file" - fi - done - - # Restore the lock file after putting all files back - if [ -f .terraform.lock.hcl.generated ]; then - mv .terraform.lock.hcl.generated .terraform.lock.hcl - fi - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - - not (terraform_resources_exist | default(false)) - changed_when: false - tags: - - bringup - -- name: Bring up terraform resources (DataCrunch with tier fallback on failure) - ansible.builtin.shell: - cmd: | - MAX_RETRIES=5 - EXCLUDED_INSTANCES="" - TIER_GROUP="{{ terraform_datacrunch_instance_type | lower | replace('_', '-') }}" - - # Check if using tier-based selection - is_tier_based() { - case "{{ terraform_datacrunch_instance_type }}" in - B300_OR_LESS|B200_OR_LESS|H100_OR_LESS|A100_80_OR_LESS|A100_40_OR_LESS) - return 0 - ;; - *) - return 1 - ;; - esac - } - - for attempt in $(seq 1 $MAX_RETRIES); do - echo "=== Attempt $attempt of $MAX_RETRIES ===" - - # Get current instance type from tfvars - CURRENT_INSTANCE=$(grep '^datacrunch_instance_type' terraform.tfvars | sed 's/.*= *"\([^"]*\)".*/\1/') - echo "Trying instance type: $CURRENT_INSTANCE" - - # Attempt terraform apply and capture output - APPLY_OUTPUT=$(terraform apply -auto-approve -no-color 2>&1) && { - echo "$APPLY_OUTPUT" - echo "Terraform apply succeeded!" - exit 0 - } - - # Apply failed - check what kind of error - echo "$APPLY_OUTPUT" - - # Check if this is a 503 or deployment error that we can retry with fallback - if echo "$APPLY_OUTPUT" | grep -q "API returned status 503\|Error deploying instance"; then - echo "" - echo "Deployment failed for $CURRENT_INSTANCE - checking if tier fallback is available..." - - if ! is_tier_based; then - echo "Not using tier-based selection, cannot fall back to different instance type." 
- exit 1 - fi - - # Add current instance to exclusion list - if [ -n "$EXCLUDED_INSTANCES" ]; then - EXCLUDED_INSTANCES="$EXCLUDED_INSTANCES --exclude $CURRENT_INSTANCE" - else - EXCLUDED_INSTANCES="--exclude $CURRENT_INSTANCE" - fi - - echo "Excluded instances so far: $EXCLUDED_INSTANCES" - - # Try to select next available instance - echo "Selecting next available instance from tier group: $TIER_GROUP" - NEXT_SELECTION=$({{ topdir_path }}/scripts/datacrunch_select_tier.py "$TIER_GROUP" --verbose $EXCLUDED_INSTANCES 2>&1) || { - echo "No more instances available in tier group $TIER_GROUP" - echo "$NEXT_SELECTION" - exit 1 - } - - # Parse new instance and location - NEW_INSTANCE=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $1}') - NEW_LOCATION=$(echo "$NEXT_SELECTION" | tail -1 | awk '{print $2}') - - if [ -z "$NEW_INSTANCE" ] || [ -z "$NEW_LOCATION" ]; then - echo "Failed to parse new instance selection" - exit 1 - fi - - echo "" - echo "Falling back to: $NEW_INSTANCE in $NEW_LOCATION" - - # Update terraform.tfvars with new instance type and location - sed -i "s/^datacrunch_instance_type.*/datacrunch_instance_type = \"$NEW_INSTANCE\"/" terraform.tfvars - sed -i "s/^datacrunch_location.*/datacrunch_location = \"$NEW_LOCATION\"/" terraform.tfvars - - echo "Updated terraform.tfvars, retrying..." - echo "" - else - echo "Terraform failed with non-recoverable error" - exit 1 - fi - done - - echo "Exhausted all $MAX_RETRIES retry attempts" - exit 1 - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - - not (terraform_resources_exist | default(false)) - tags: - - bringup - -- name: Bring up terraform resources (other providers) - cloud.terraform.terraform: - binary_path: "{{ terraform_binary_path }}" - force_init: true - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - state: present - when: - - kdevops_terraform_provider != "datacrunch" - tags: - - bringup - -- name: Retrieve the controller_ip_map from terraform - cloud.terraform.terraform_output: - binary_path: "{{ terraform_binary_path }}" - format: json - name: controller_ip_map - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_output - tags: - - ssh - -- name: Add each target node's ssh Host entry on the control host - ansible.builtin.blockinfile: - block: "{{ lookup('template', 'ssh_config.j2') }}" - create: true - dest: "{{ kdevops_ssh_config }}" - insertafter: "EOF" - marker: "# {mark} host configuration for {{ item.key }}" - mode: "u=rw,g=r,o=r" - loop: "{{ terraform_output.value | dict2items }}" - tags: - - ssh - -- name: Ensure the Include directive is present on the controller - ansible.builtin.blockinfile: - path: "{{ sshconfig }}" - insertbefore: BOF - append_newline: true - create: true - marker: "# {mark} Managed by kdevops" - mode: "u=rw,g=r,o=r" - block: "Include {{ kdevops_ssh_config_prefix }}*" +- name: Include SSH configuration tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/common/ssh-config.yml" tags: - ssh -- name: Report terraform status +- name: Include status reporting tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/common/status.yml" tags: - status - block: - - name: Retrieve the controller_ip_map from terraform - cloud.terraform.terraform_output: - binary_path: "{{ terraform_binary_path }}" - format: json - name: controller_ip_map - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - 
register: terraform_output - - - name: End play -- terraform state file is empty or missing - ansible.builtin.meta: end_play - when: - - terraform_output.warnings is defined - - - name: Count active resources - ansible.builtin.command: - cmd: "terraform state list" - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - register: terraform_state - changed_when: false - - - name: Show status - ansible.builtin.debug: - msg: "Active resources: {{ terraform_state.stdout_lines | length }}" - - - name: Show controller IP map - ansible.builtin.debug: - var: terraform_output.value - -- name: Remove the ephemeral ssh config file on the control host - ansible.builtin.file: - path: "{{ kdevops_ssh_config }}" - state: absent - tags: - - destroy - -- name: Initialize external provider for DataCrunch before destroy - ansible.builtin.shell: - cmd: | - # Hide all terraform files that reference datacrunch resources - # so that terraform init only sees the external provider requirement - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file" ]; then - mv "$file" "$file.bak" - fi - done - - # Create minimal terraform config with only external provider - cat > provider_init.tf << 'EOF' - terraform { - required_version = ">= 1.0" - required_providers { - external = { - source = "hashicorp/external" - version = "~> 2.3" - } - } - } - EOF - - # Initialize to get external provider and generate lock file - terraform init - - # Preserve the generated lock file for the external provider - if [ -f .terraform.lock.hcl ]; then - cp .terraform.lock.hcl .terraform.lock.hcl.generated - fi - - # Clean up temporary file and restore original terraform files - rm provider_init.tf - for file in provider.tf main.tf output.tf vars.tf nodes.tf; do - if [ -f "$file.bak" ]; then - mv "$file.bak" "$file" - fi - done - - # Restore the lock file after putting all files back - if [ -f .terraform.lock.hcl.generated ]; then - mv .terraform.lock.hcl.generated .terraform.lock.hcl - fi - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - changed_when: false - tags: - - destroy - -- name: Destroy terraform resources (DataCrunch with dev overrides) - ansible.builtin.command: - cmd: terraform destroy -auto-approve -no-color - chdir: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - destroy - -- name: Remove terraform lock file for DataCrunch after destroy - ansible.builtin.file: - path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}/.terraform.lock.hcl" - state: absent - when: - - kdevops_terraform_provider == "datacrunch" - tags: - - destroy -- name: Destroy terraform resources (other providers) - cloud.terraform.terraform: - binary_path: "{{ terraform_binary_path }}" - force_init: true - project_path: "{{ topdir_path }}/terraform/{{ kdevops_terraform_provider }}" - state: absent - when: - - kdevops_terraform_provider != "datacrunch" +- name: Include destroy tasks + ansible.builtin.include_tasks: + file: "{{ role_path }}/tasks/destroy/main.yml" tags: - destroy
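
Note on extending the new layout: bringup and destroy are now dispatched from tasks/bringup/main.yml and tasks/destroy/main.yml based on kdevops_terraform_provider, so wiring in another provider with special handling only needs a new task file plus an include there, and the provider name added to the generic include's "not in" condition so it is not applied twice. A minimal sketch of such an include, assuming a hypothetical provider named examplecloud (the provider name and task file are illustrative only, not part of this change):

- name: Include Examplecloud bringup tasks
  ansible.builtin.include_tasks:
    file: "{{ role_path }}/tasks/bringup/examplecloud.yml"
  when:
    - kdevops_terraform_provider == "examplecloud"
  tags:
    - bringup

Providers without dedicated files continue to go through tasks/bringup/generic.yml and tasks/destroy/generic.yml, which is why those includes key off the list of providers that do have special handling.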