diff --git a/deploy/aws-hypervisor/instance.env.template b/deploy/aws-hypervisor/instance.env.template
index 6fa028a..45ac143 100644
--- a/deploy/aws-hypervisor/instance.env.template
+++ b/deploy/aws-hypervisor/instance.env.template
@@ -6,8 +6,18 @@ export RHEL_HOST_ARCHITECTURE=x86_64
 export REGION=us-west-2
 export EC2_INSTANCE_TYPE="c5n.metal"
 export AWS_DEFAULT_REGION=us-west-2
+# Availability Zone override (optional) - if set, the instance will be placed in this AZ
+# Useful when a specific AZ has better capacity or pricing (e.g. for spot instances)
+# Example: export AVAILABILITY_ZONE=us-west-2b
+# export AVAILABILITY_ZONE=
 
-# EC2 Capacity Reservation Settings
+# Spot Instance Settings
+# Set to 'true' to use spot instances for significant cost savings (up to 90% off on-demand)
+# Spot instances may be stopped by AWS when capacity is needed, but will NOT be terminated
+# (uses persistent spot with stop interruption behavior)
+# export USE_SPOT_INSTANCE=false
+
+# EC2 Capacity Reservation Settings (ignored when USE_SPOT_INSTANCE=true)
 # Enable capacity reservation check before instance creation (recommended)
 # Set to 'false' to disable the pre-flight capacity check
 export ENABLE_CAPACITY_RESERVATION=true
diff --git a/deploy/aws-hypervisor/scripts/common.sh b/deploy/aws-hypervisor/scripts/common.sh
index 8096f89..0f36274 100755
--- a/deploy/aws-hypervisor/scripts/common.sh
+++ b/deploy/aws-hypervisor/scripts/common.sh
@@ -10,7 +10,12 @@ export RHEL_HOST_ARCHITECTURE="${RHEL_HOST_ARCHITECTURE:-x86_64}"
 export EC2_INSTANCE_TYPE="${EC2_INSTANCE_TYPE:-c5n.metal}"
 export RHEL_VERSION="${RHEL_VERSION:-9.6}"
 
-# Capacity reservation defaults
+# Spot instance defaults
+export USE_SPOT_INSTANCE="${USE_SPOT_INSTANCE:-false}"
+# Availability zone override (optional)
+export AVAILABILITY_ZONE="${AVAILABILITY_ZONE:-}"
+
+# Capacity reservation defaults (ignored when USE_SPOT_INSTANCE=true)
 export ENABLE_CAPACITY_RESERVATION="${ENABLE_CAPACITY_RESERVATION:-true}"
 export CAPACITY_RESERVATION_DURATION_MINUTES="${CAPACITY_RESERVATION_DURATION_MINUTES:-60}"
 
diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh
index 75e85b7..f2ce657 100755
--- a/deploy/aws-hypervisor/scripts/create.sh
+++ b/deploy/aws-hypervisor/scripts/create.sh
@@ -55,10 +55,23 @@ echo "ec2-user" > "${SCRIPT_DIR}/../${SHARED_DIR}/ssh_user"
 
 echo -e "AMI ID: $RHEL_HOST_AMI"
 echo -e "Machine Type: $EC2_INSTANCE_TYPE"
+echo -e "Spot Instance: ${USE_SPOT_INSTANCE:-false}"
+
+# Spot and capacity reservations are mutually exclusive, so a spot request
+# switches the capacity reservation step off for this run.
+USE_SPOT="${USE_SPOT_INSTANCE:-false}"
+# Yes/No form of the flag, as required by the stack's UseSpot parameter
+USE_SPOT_PARAM="No"
+if [[ "${USE_SPOT}" == "true" ]]; then
+    msg_info "Spot instance requested - skipping capacity reservation (mutually exclusive)"
+    ENABLE_CAPACITY_RESERVATION="false"
+    USE_SPOT_PARAM="Yes"
+fi
 
 # Create capacity reservation to validate and guarantee instance availability
 CAPACITY_RESERVATION_ID=""
-AVAILABILITY_ZONE=""
+# Preserve user-provided AVAILABILITY_ZONE from instance.env
+AVAILABILITY_ZONE="${AVAILABILITY_ZONE:-}"
 
 if [[ "${ENABLE_CAPACITY_RESERVATION}" == "true" ]]; then
     if reservation_result=$(create_capacity_reservation "${EC2_INSTANCE_TYPE}" "${REGION}"); then
@@ -100,7 +113,8 @@ aws --region "$REGION" cloudformation create-stack --stack-name "${STACK_NAME}"
     "ParameterKey=EC2Type,ParameterValue=${ec2Type}" \
     "ParameterKey=PublicKeyString,ParameterValue=$(cat "${SSH_PUBLIC_KEY}")" \
     "ParameterKey=CapacityReservationId,ParameterValue=${CAPACITY_RESERVATION_ID}" \
-    "ParameterKey=AvailabilityZone,ParameterValue=${AVAILABILITY_ZONE}"
+    "ParameterKey=AvailabilityZone,ParameterValue=${AVAILABILITY_ZONE}" \
+    "ParameterKey=UseSpot,ParameterValue=${USE_SPOT_PARAM}"
 
 echo "Created stack"
 
diff --git a/deploy/aws-hypervisor/scripts/destroy.sh b/deploy/aws-hypervisor/scripts/destroy.sh
index fba37c4..89189f0 100755
--- a/deploy/aws-hypervisor/scripts/destroy.sh
+++ b/deploy/aws-hypervisor/scripts/destroy.sh
@@ -50,6 +50,29 @@ if [[ -f "${reservation_file}" ]]; then
     rm -f "${instance_data_dir}/availability-zone"
 fi
 
+# Cancel the persistent spot request (if any) before deleting the stack.
+# A persistent request left open after its instance is terminated can launch
+# a replacement instance outside the stack - see the EC2 Spot request docs.
+instance_id_file="${instance_data_dir}/aws-instance-id"
+if [[ -f "${instance_id_file}" ]]; then
+    instance_id=$(cat "${instance_id_file}")
+    if ! spot_request_id=$(aws --region "${REGION}" ec2 describe-instances \
+        --instance-ids "${instance_id}" \
+        --query 'Reservations[0].Instances[0].SpotInstanceRequestId' \
+        --output text --no-cli-pager 2>/dev/null); then
+        # Lookup failed: say so instead of silently assuming "not a spot instance"
+        spot_request_id=""
+        msg_warning "Could not look up spot request for instance ${instance_id}; if it is a spot instance, cancel its spot request manually"
+    fi
+
+    if [[ -n "${spot_request_id}" && "${spot_request_id}" != "None" && "${spot_request_id}" != "null" ]]; then
+        msg_info "Canceling persistent spot request ${spot_request_id}..."
+        aws --region "${REGION}" ec2 cancel-spot-instance-requests \
+            --spot-instance-request-ids "${spot_request_id}" \
+            --no-cli-pager >/dev/null 2>&1 || msg_warning "Failed to cancel spot request (may already be canceled)"
+    fi
+fi
+
 # Delete the CloudFormation stack
 echo "Deleting CloudFormation stack '${STACK_NAME}'..."
 aws --region "$REGION" cloudformation delete-stack --stack-name "${STACK_NAME}"
diff --git a/deploy/aws-hypervisor/scripts/start.sh b/deploy/aws-hypervisor/scripts/start.sh
index 3f35931..ce53816 100755
--- a/deploy/aws-hypervisor/scripts/start.sh
+++ b/deploy/aws-hypervisor/scripts/start.sh
@@ -39,17 +39,24 @@ fi
 INSTANCE_ID=$(cat "${SCRIPT_DIR}/../${SHARED_DIR}/aws-instance-id")
 echo "Starting instance ${INSTANCE_ID}..."
 
-# Check current instance state
+# Check current instance state and lifecycle
 # shellcheck disable=SC2153 # REGION is sourced from instance.env via common.sh, not a misspelling of local 'region'
 INSTANCE_STATE=$(aws --region "${REGION}" ec2 describe-instances --instance-ids "${INSTANCE_ID}" --query 'Reservations[0].Instances[0].State.Name' --output text --no-cli-pager)
+INSTANCE_LIFECYCLE=$(aws --region "${REGION}" ec2 describe-instances --instance-ids "${INSTANCE_ID}" --query 'Reservations[0].Instances[0].InstanceLifecycle' --output text --no-cli-pager || echo "unknown")
 echo "Current instance state: ${INSTANCE_STATE}"
+if [[ "${INSTANCE_LIFECYCLE}" == "spot" ]]; then
+    msg_info "Instance is a spot instance"
+fi
 
 case "${INSTANCE_STATE}" in
     "running")
         echo "Instance is already running."
         ;;
     "stopped")
-        ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
+        # Spot instances don't support capacity reservation attributes
+        if [[ "${INSTANCE_LIFECYCLE}" != "spot" ]]; then
+            ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
+        fi
         echo "Starting instance..."
         aws --region "${REGION}" ec2 start-instances --instance-ids "${INSTANCE_ID}" --no-cli-pager > /dev/null
         echo "Waiting for instance to start..."
@@ -60,7 +67,10 @@ case "${INSTANCE_STATE}" in
     "stopping")
         echo "Instance is currently stopping. Waiting for it to stop completely..."
         aws --region "${REGION}" ec2 wait instance-stopped --instance-ids "${INSTANCE_ID}" --no-cli-pager
-        ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
+        # Spot instances don't support capacity reservation attributes
+        if [[ "${INSTANCE_LIFECYCLE}" != "spot" ]]; then
+            ensure_open_capacity_preference "${INSTANCE_ID}" "${REGION}"
+        fi
         echo "Now starting instance..."
         aws --region "${REGION}" ec2 start-instances --instance-ids "${INSTANCE_ID}" --no-cli-pager > /dev/null
         echo "Waiting for instance to start..."
diff --git a/deploy/aws-hypervisor/templates/rhel-instance.yaml b/deploy/aws-hypervisor/templates/rhel-instance.yaml
index 9c71969..21bfe35 100644
--- a/deploy/aws-hypervisor/templates/rhel-instance.yaml
+++ b/deploy/aws-hypervisor/templates/rhel-instance.yaml
@@ -6,6 +6,7 @@ Conditions:
   AddSecondaryVolume: !Not [!Equals [!Ref EC2Type, 'MetalMachine']]
   UseCapacityReservation: !Not [!Equals [!Ref CapacityReservationId, '']]
   UseSpecificAZ: !Not [!Equals [!Ref AvailabilityZone, '']]
+  UseSpotInstance: !Equals [!Ref UseSpot, 'Yes']  # true only when the UseSpot parameter is exactly 'Yes'
 
 Mappings:
   VolumeSize:
@@ -54,6 +55,13 @@ Parameters:
     Type: String
     Description: Specific AZ for instance placement (optional)
     Default: ""
+  UseSpot:  # drives the UseSpotInstance condition
+    Type: String
+    Description: Whether to use spot instances (Yes/No)
+    Default: "No"  # on-demand unless the caller opts in
+    AllowedValues:
+      - "Yes"  # quoted - presumably so YAML keeps Yes/No as strings, not booleans
+      - "No"
 
 Metadata:
   AWS::CloudFormation::Interface:
@@ -238,6 +246,18 @@ Resources:
           CapacityReservationTarget:
             CapacityReservationId: !Ref CapacityReservationId
 
+  # Launch template for spot instances: created only under the UseSpotInstance condition; sets only the spot market options
+  RHELSpotLaunchTemplate:
+    Type: AWS::EC2::LaunchTemplate
+    Condition: UseSpotInstance  # resource is not created for on-demand stacks
+    Properties:
+      LaunchTemplateData:
+        InstanceMarketOptions:
+          MarketType: spot
+          SpotOptions:
+            SpotInstanceType: persistent  # persistent request, paired with the 'stop' behavior below
+            InstanceInterruptionBehavior: stop  # an interruption stops the instance instead of terminating it
+
   # EC2 Instance
   RHELInstance:
     Type: AWS::EC2::Instance
@@ -251,10 +271,14 @@ Resources:
       IamInstanceProfile: !Ref RHELInstanceProfile
       InstanceType: !Ref HostInstanceType
       LaunchTemplate: !If
-        - UseCapacityReservation
-        - LaunchTemplateId: !Ref RHELLaunchTemplate
-          Version: !GetAtt RHELLaunchTemplate.LatestVersionNumber
-        - !Ref AWS::NoValue
+        - UseSpotInstance  # checked first, so spot wins over a capacity reservation
+        - LaunchTemplateId: !Ref RHELSpotLaunchTemplate
+          Version: !GetAtt RHELSpotLaunchTemplate.LatestVersionNumber
+        - !If  # not spot: fall back to the capacity-reservation template when one is configured
+          - UseCapacityReservation
+          - LaunchTemplateId: !Ref RHELLaunchTemplate
+            Version: !GetAtt RHELLaunchTemplate.LatestVersionNumber
+          - !Ref AWS::NoValue  # neither applies: launch without a launch template
       NetworkInterfaces:
         - AssociatePublicIpAddress: "False"
           DeviceIndex: "0"