Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion deploy/aws-hypervisor/instance.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,18 @@ export RHEL_HOST_ARCHITECTURE=x86_64
export REGION=us-west-2
export EC2_INSTANCE_TYPE="c5n.metal"
export AWS_DEFAULT_REGION=us-west-2
# Availability Zone override (optional) - if set, the instance will be placed in this AZ
# Useful when a specific AZ has better capacity or pricing (e.g. for spot instances)
# Example: export AVAILABILITY_ZONE=us-west-2b
# export AVAILABILITY_ZONE=

# EC2 Capacity Reservation Settings
# Spot Instance Settings
# Set to 'true' to use spot instances for significant cost savings (up to 90% off on-demand)
# Spot instances may be stopped by AWS when capacity is needed, but will NOT be terminated
# (uses persistent spot with stop interruption behavior)
# export USE_SPOT_INSTANCE=false

# EC2 Capacity Reservation Settings (ignored when USE_SPOT_INSTANCE=true)
# Enable capacity reservation check before instance creation (recommended)
# Set to 'false' to disable the pre-flight capacity check
export ENABLE_CAPACITY_RESERVATION=true
Expand Down
7 changes: 6 additions & 1 deletion deploy/aws-hypervisor/scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@ export RHEL_HOST_ARCHITECTURE="${RHEL_HOST_ARCHITECTURE:-x86_64}"
export EC2_INSTANCE_TYPE="${EC2_INSTANCE_TYPE:-c5n.metal}"
export RHEL_VERSION="${RHEL_VERSION:-9.6}"

# Capacity reservation defaults
# Spot instance defaults
export USE_SPOT_INSTANCE="${USE_SPOT_INSTANCE:-false}"
# Availability zone override (optional)
export AVAILABILITY_ZONE="${AVAILABILITY_ZONE:-}"

# Capacity reservation defaults (ignored when USE_SPOT_INSTANCE=true)
export ENABLE_CAPACITY_RESERVATION="${ENABLE_CAPACITY_RESERVATION:-true}"
export CAPACITY_RESERVATION_DURATION_MINUTES="${CAPACITY_RESERVATION_DURATION_MINUTES:-60}"

Expand Down
14 changes: 12 additions & 2 deletions deploy/aws-hypervisor/scripts/create.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,19 @@ echo "ec2-user" > "${SCRIPT_DIR}/../${SHARED_DIR}/ssh_user"

echo -e "AMI ID: $RHEL_HOST_AMI"
echo -e "Machine Type: $EC2_INSTANCE_TYPE"
echo -e "Spot Instance: ${USE_SPOT_INSTANCE:-false}"

# A spot launch and a capacity reservation are mutually exclusive, so an
# explicit spot request switches the reservation step off for this run.
# USE_SPOT falls back to "false" when USE_SPOT_INSTANCE is unset or empty.
USE_SPOT="${USE_SPOT_INSTANCE:-false}"
case "${USE_SPOT}" in
  true)
    msg_info "Spot instance requested - skipping capacity reservation (mutually exclusive)"
    ENABLE_CAPACITY_RESERVATION="false"
    ;;
esac

# Create capacity reservation to validate and guarantee instance availability
CAPACITY_RESERVATION_ID=""
AVAILABILITY_ZONE=""
# Preserve user-provided AVAILABILITY_ZONE from instance.env
AVAILABILITY_ZONE="${AVAILABILITY_ZONE:-}"

if [[ "${ENABLE_CAPACITY_RESERVATION}" == "true" ]]; then
if reservation_result=$(create_capacity_reservation "${EC2_INSTANCE_TYPE}" "${REGION}"); then
Expand Down Expand Up @@ -100,7 +109,8 @@ aws --region "$REGION" cloudformation create-stack --stack-name "${STACK_NAME}"
"ParameterKey=EC2Type,ParameterValue=${ec2Type}" \
"ParameterKey=PublicKeyString,ParameterValue=$(cat "${SSH_PUBLIC_KEY}")" \
"ParameterKey=CapacityReservationId,ParameterValue=${CAPACITY_RESERVATION_ID}" \
"ParameterKey=AvailabilityZone,ParameterValue=${AVAILABILITY_ZONE}"
"ParameterKey=AvailabilityZone,ParameterValue=${AVAILABILITY_ZONE}" \
"ParameterKey=UseSpot,ParameterValue=$( [[ "${USE_SPOT}" == "true" ]] && echo "Yes" || echo "No" )"

echo "Created stack"

Expand Down
17 changes: 17 additions & 0 deletions deploy/aws-hypervisor/scripts/destroy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ if [[ -f "${reservation_file}" ]]; then
rm -f "${instance_data_dir}/availability-zone"
fi

# If the recorded instance was launched from a persistent spot request,
# cancel that request as part of teardown. Both AWS calls are best-effort:
# a failed lookup is treated as "no spot request", and a failed cancel only
# produces a warning.
instance_id_file="${instance_data_dir}/aws-instance-id"
if [[ -f "${instance_id_file}" ]]; then
  instance_id=$(<"${instance_id_file}")

  # Lookup errors are silenced on purpose; an empty result means "skip".
  spot_request_id=$(
    aws --region "${REGION}" ec2 describe-instances \
      --instance-ids "${instance_id}" \
      --query 'Reservations[0].Instances[0].SpotInstanceRequestId' \
      --output text --no-cli-pager 2>/dev/null || true
  )

  case "${spot_request_id}" in
    "" | "None" | "null")
      # No spot request is attached to this instance; nothing to cancel.
      ;;
    *)
      msg_info "Canceling persistent spot request ${spot_request_id}..."
      aws --region "${REGION}" ec2 cancel-spot-instance-requests \
        --spot-instance-request-ids "${spot_request_id}" \
        --no-cli-pager >/dev/null 2>&1 \
        || msg_warning "Failed to cancel spot request (may already be canceled)"
      ;;
  esac
fi

# Delete the CloudFormation stack
echo "Deleting CloudFormation stack '${STACK_NAME}'..."
aws --region "$REGION" cloudformation delete-stack --stack-name "${STACK_NAME}"
Expand Down
16 changes: 13 additions & 3 deletions deploy/aws-hypervisor/scripts/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,24 @@ fi
INSTANCE_ID=$(cat "${SCRIPT_DIR}/../${SHARED_DIR}/aws-instance-id")
echo "Starting instance ${INSTANCE_ID}..."

# Check current instance state
# Check current instance state and lifecycle
# shellcheck disable=SC2153 # REGION is sourced from instance.env via common.sh, not a misspelling of local 'region'
INSTANCE_STATE=$(aws --region "${REGION}" ec2 describe-instances --instance-ids "${INSTANCE_ID}" --query 'Reservations[0].Instances[0].State.Name' --output text --no-cli-pager)
INSTANCE_LIFECYCLE=$(aws --region "${REGION}" ec2 describe-instances --instance-ids "${INSTANCE_ID}" --query 'Reservations[0].Instances[0].InstanceLifecycle' --output text --no-cli-pager || echo "unknown")
echo "Current instance state: ${INSTANCE_STATE}"
if [[ "${INSTANCE_LIFECYCLE}" == "spot" ]]; then
msg_info "Instance is a spot instance"
fi

case "${INSTANCE_STATE}" in
"running")
echo "Instance is already running."
;;
"stopped")
ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
# Spot instances don't support capacity reservation attributes
if [[ "${INSTANCE_LIFECYCLE}" != "spot" ]]; then
ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
fi
echo "Starting instance..."
aws --region "${REGION}" ec2 start-instances --instance-ids "${INSTANCE_ID}" --no-cli-pager > /dev/null
echo "Waiting for instance to start..."
Expand All @@ -60,7 +67,10 @@ case "${INSTANCE_STATE}" in
"stopping")
echo "Instance is currently stopping. Waiting for it to stop completely..."
aws --region "${REGION}" ec2 wait instance-stopped --instance-ids "${INSTANCE_ID}" --no-cli-pager
ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
# Spot instances don't support capacity reservation attributes
if [[ "${INSTANCE_LIFECYCLE}" != "spot" ]]; then
ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
fi
echo "Now starting instance..."
aws --region "${REGION}" ec2 start-instances --instance-ids "${INSTANCE_ID}" --no-cli-pager > /dev/null
echo "Waiting for instance to start..."
Expand Down
32 changes: 28 additions & 4 deletions deploy/aws-hypervisor/templates/rhel-instance.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Conditions:
AddSecondaryVolume: !Not [!Equals [!Ref EC2Type, 'MetalMachine']]
UseCapacityReservation: !Not [!Equals [!Ref CapacityReservationId, '']]
UseSpecificAZ: !Not [!Equals [!Ref AvailabilityZone, '']]
UseSpotInstance: !Equals [!Ref UseSpot, 'Yes']

Mappings:
VolumeSize:
Expand Down Expand Up @@ -54,6 +55,13 @@ Parameters:
Type: String
Description: Specific AZ for instance placement (optional)
Default: ""
UseSpot:
Type: String
Description: Whether to use spot instances (Yes/No)
Default: "No"
AllowedValues:
- "Yes"
- "No"

Metadata:
AWS::CloudFormation::Interface:
Expand Down Expand Up @@ -238,6 +246,18 @@ Resources:
CapacityReservationTarget:
CapacityReservationId: !Ref CapacityReservationId

# Launch Template for Spot Instances (only created when spot is used)
RHELSpotLaunchTemplate:
Type: AWS::EC2::LaunchTemplate
Condition: UseSpotInstance
Properties:
LaunchTemplateData:
InstanceMarketOptions:
MarketType: spot
SpotOptions:
SpotInstanceType: persistent
InstanceInterruptionBehavior: stop

Comment on lines +249 to +260
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Look for any spot-request cancellation logic across the deploy tree
rg -nP -C2 'cancel-spot-instance-requests|describe-spot-instance-requests|SpotInstanceRequestId' 
fd -t f 'destroy|delete|cleanup|teardown' deploy/aws-hypervisor | xargs -I{} sh -c 'echo "=== {} ==="; cat "{}"'

Repository: openshift-eng/two-node-toolbox

Length of output: 2729


🌐 Web query:

When an EC2 instance is launched via CloudFormation AWS::EC2::Instance using a LaunchTemplate with InstanceMarketOptions MarketType spot and SpotInstanceType persistent, is the associated spot instance request cancelled automatically when the CloudFormation stack is deleted?

💡 Result:

No, the associated Spot Instance request is not cancelled automatically when the CloudFormation stack is deleted. CloudFormation deletes the EC2 instance by terminating it, but for persistent Spot requests, termination causes the request to become open again, potentially launching a new instance. Multiple user reports confirm the Spot request persists after stack deletion, requiring manual cancellation.

Citations:


Add spot instance request cancellation to teardown process.

The destroy.sh script handles capacity reservation cleanup but does not cancel the persistent Spot Instance Request created by SpotInstanceType: persistent. When the CloudFormation stack deletes the EC2 instance, the persistent Spot request remains open and can relaunch a replacement instance, leading to orphaned instances and continued billing.

The teardown script must explicitly cancel all Spot Instance Requests associated with the stack before or after CloudFormation stack deletion. Reference the Spot request ID stored during instance launch and use aws ec2 cancel-spot-instance-requests to clean it up during the destroy phase.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@deploy/aws-hypervisor/templates/rhel-instance.yaml` around lines 249 - 260,
The teardown lacks cancellation of persistent Spot Instance Requests created by
the RHELSpotLaunchTemplate (SpotOptions -> SpotInstanceType: persistent); update
the destroy.sh script to locate the Spot Request ID recorded when the instance
was launched (or query for requests tied to the CloudFormation stack/instance
tags) and call aws ec2 cancel-spot-instance-requests --spot-instance-request-ids
<ids> as part of teardown (before/after stack delete) to ensure persistent
requests are removed and no replacement instances are launched.

# EC2 Instance
RHELInstance:
Type: AWS::EC2::Instance
Expand All @@ -251,10 +271,14 @@ Resources:
IamInstanceProfile: !Ref RHELInstanceProfile
InstanceType: !Ref HostInstanceType
LaunchTemplate: !If
- UseCapacityReservation
- LaunchTemplateId: !Ref RHELLaunchTemplate
Version: !GetAtt RHELLaunchTemplate.LatestVersionNumber
- !Ref AWS::NoValue
- UseSpotInstance
- LaunchTemplateId: !Ref RHELSpotLaunchTemplate
Version: !GetAtt RHELSpotLaunchTemplate.LatestVersionNumber
- !If
- UseCapacityReservation
- LaunchTemplateId: !Ref RHELLaunchTemplate
Version: !GetAtt RHELLaunchTemplate.LatestVersionNumber
- !Ref AWS::NoValue
NetworkInterfaces:
- AssociatePublicIpAddress: "False"
DeviceIndex: "0"
Expand Down