From 591292a3620d466507368771763aba95009ec112 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 13 Aug 2025 22:54:05 +0000 Subject: [PATCH 1/4] fix --- .../cluster/test_distributed_training.py | 49 +++++++++++++------ 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/tests/unittests/cluster/test_distributed_training.py b/tests/unittests/cluster/test_distributed_training.py index 4c9a7ce..9fc6ef6 100644 --- a/tests/unittests/cluster/test_distributed_training.py +++ b/tests/unittests/cluster/test_distributed_training.py @@ -40,20 +40,37 @@ def test_distributed_training(test_helper, framework_version): image_uri = test_helper.get_custom_image_uri(framework_version, type="training", gpu=False) - cp.fit( - predictor_init_args=predictor_init_args, - predictor_fit_args=predictor_fit_args, - custom_image_uri=image_uri, - framework_version=framework_version, - backend_kwargs={ - "initialization_commands": [ - "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com", - "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com", - ] - }, - ) + try: + cp.fit( + predictor_init_args=predictor_init_args, + predictor_fit_args=predictor_fit_args, + custom_image_uri=image_uri, + framework_version=framework_version, + backend_kwargs={ + "initialization_commands": [ + "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com", + "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com", + # Auto-terminate after 20 minutes as safety for CI + "echo '#!/bin/bash' > /tmp/auto_terminate.sh", + "echo 'sleep 1200' >> /tmp/auto_terminate.sh", # 20 minutes + 'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh', + "echo 'INSTANCE_ID=$(curl -H \"X-aws-ec2-metadata-token: \$TOKEN\" -s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh", + "echo 'aws ec2 terminate-instances --instance-ids \$INSTANCE_ID --region us-east-1' >> /tmp/auto_terminate.sh", + "chmod +x /tmp/auto_terminate.sh", + "nohup /tmp/auto_terminate.sh > /tmp/auto_terminate.log 2>&1 &", + ] + }, + ) - model_artifact_path = cp.get_fit_job_output_path() - bucket, prefix = s3_path_to_bucket_prefix(model_artifact_path) - s3_client = boto3.client("s3") - s3_client.head_object(Bucket=bucket, Key=prefix) + model_artifact_path = cp.get_fit_job_output_path() + bucket, prefix = s3_path_to_bucket_prefix(model_artifact_path) + s3_client = boto3.client("s3") + s3_client.head_object(Bucket=bucket, Key=prefix) + except Exception as e: + # In case of any failure, try to cleanup the Ray cluster + try: + if hasattr(cp.backend, "_cluster_manager") and cp.backend._cluster_manager is not None: + cp.backend._cluster_manager.down() + except Exception as cleanup_error: + print(f"Warning: Failed to cleanup cluster after test failure: {cleanup_error}") + raise e From 6be6081a18fe2ce77e8c81e49b9c0447c575be5c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 13 Aug 2025 22:58:35 +0000 Subject: [PATCH 2/4] fix lint --- .../cluster/test_distributed_training.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/unittests/cluster/test_distributed_training.py b/tests/unittests/cluster/test_distributed_training.py index 9fc6ef6..7d0a43c 100644 --- a/tests/unittests/cluster/test_distributed_training.py +++ b/tests/unittests/cluster/test_distributed_training.py @@ -48,14 +48,19 @@ def test_distributed_training(test_helper, framework_version): framework_version=framework_version, backend_kwargs={ "initialization_commands": [ - "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com", - "aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com", + "aws ecr get-login-password --region us-east-1 | docker login --username AWS " + "--password-stdin 763104351884.dkr.ecr.us-east-1.amazonaws.com", + "aws ecr get-login-password --region us-east-1 | docker login --username AWS " + "--password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com", # Auto-terminate after 20 minutes as safety for CI "echo '#!/bin/bash' > /tmp/auto_terminate.sh", "echo 'sleep 1200' >> /tmp/auto_terminate.sh", # 20 minutes - 'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh', - "echo 'INSTANCE_ID=$(curl -H \"X-aws-ec2-metadata-token: \$TOKEN\" -s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh", - "echo 'aws ec2 terminate-instances --instance-ids \$INSTANCE_ID --region us-east-1' >> /tmp/auto_terminate.sh", + 'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" ' + '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh', + "echo 'INSTANCE_ID=$(curl -H \"X-aws-ec2-metadata-token: \\$TOKEN\" " + "-s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh", + "echo 'aws ec2 terminate-instances --instance-ids \\$INSTANCE_ID " + "--region us-east-1' >> /tmp/auto_terminate.sh", "chmod +x /tmp/auto_terminate.sh", "nohup /tmp/auto_terminate.sh > /tmp/auto_terminate.log 2>&1 &", ] From 67564bc1a220643332537479c7675a9ef00b0d25 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 13 Aug 2025 23:00:27 +0000 Subject: [PATCH 3/4] fix lint --- tests/unittests/cluster/test_distributed_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittests/cluster/test_distributed_training.py b/tests/unittests/cluster/test_distributed_training.py index 7d0a43c..cd8996d 100644 --- a/tests/unittests/cluster/test_distributed_training.py +++ b/tests/unittests/cluster/test_distributed_training.py @@ -57,7 +57,7 @@ def test_distributed_training(test_helper, framework_version): "echo 'sleep 1200' >> /tmp/auto_terminate.sh", # 20 minutes 'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" ' '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh', - "echo 'INSTANCE_ID=$(curl -H \"X-aws-ec2-metadata-token: \\$TOKEN\" " + 'echo \'INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: \\$TOKEN" ' "-s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh", "echo 'aws ec2 terminate-instances --instance-ids \\$INSTANCE_ID " "--region us-east-1' >> /tmp/auto_terminate.sh", From 26e61ec5557d03ea512fcd8996bf27fd30dd1a65 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 13 Aug 2025 23:46:37 +0000 Subject: [PATCH 4/4] test --- .../cluster/test_distributed_training.py | 81 +++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/tests/unittests/cluster/test_distributed_training.py b/tests/unittests/cluster/test_distributed_training.py index cd8996d..aae54dd 100644 --- a/tests/unittests/cluster/test_distributed_training.py +++ b/tests/unittests/cluster/test_distributed_training.py @@ -40,6 +40,49 @@ def test_distributed_training(test_helper, framework_version): image_uri = test_helper.get_custom_image_uri(framework_version, type="training", gpu=False) + import threading + import time + import boto3 + + # Create a safety timer that will terminate any instances from this test after 5 minutes + def safety_termination(): + time.sleep(300) # 5 minutes for testing + try: + print("Safety termination triggered - checking for running instances...") + ec2 = boto3.client("ec2", region_name="us-east-1") + + # Find instances that might be from this test run + response = ec2.describe_instances( + Filters=[ + {"Name": "instance-state-name", "Values": ["running", "pending"]}, + {"Name": "tag:ray-cluster-name", "Values": ["*ag_ray_aws_default*"]}, + ] + ) + + instance_ids = [] + for reservation in response["Reservations"]: + for instance in reservation["Instances"]: + # Only terminate recent instances (launched within last 10 minutes) + launch_time = instance["LaunchTime"].timestamp() + current_time = time.time() + if (current_time - launch_time) < 600: # 10 minutes + instance_ids.append(instance["InstanceId"]) + + if instance_ids: + print(f"Safety termination: Found {len(instance_ids)} instances to terminate: {instance_ids}") + ec2.terminate_instances(InstanceIds=instance_ids) + print("Safety termination: Instances terminated successfully") + else: + print("Safety termination: No matching instances found") + + except Exception as e: + print(f"Safety termination failed: {e}") + + # Start the safety timer in background + safety_thread = threading.Thread(target=safety_termination, daemon=True) + safety_thread.start() + print("Started safety termination timer (5 minutes)") + try: cp.fit( predictor_init_args=predictor_init_args, @@ -54,15 +97,41 @@ def test_distributed_training(test_helper, framework_version): "--password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com", # Auto-terminate after 20 minutes as safety for CI "echo '#!/bin/bash' > /tmp/auto_terminate.sh", - "echo 'sleep 1200' >> /tmp/auto_terminate.sh", # 20 minutes + "echo 'exec > >(tee -a /tmp/auto_terminate.log) 2>&1' >> /tmp/auto_terminate.sh", # Log everything + "echo 'echo \"[$(date)] Auto-termination script started. Will terminate in 5 minutes.\"' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] Process ID: $$\"' >> /tmp/auto_terminate.sh", + "echo 'sleep 300' >> /tmp/auto_terminate.sh", # 5 minutes for testing + "echo 'echo \"[$(date)] 5 minutes elapsed. Initiating termination...\"' >> /tmp/auto_terminate.sh", 'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" ' - '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh', - 'echo \'INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: \\$TOKEN" ' - "-s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh", + '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null || echo "failed")\' >> /tmp/auto_terminate.sh', + 'echo \'if [ "$TOKEN" = "failed" ]; then\' >> /tmp/auto_terminate.sh', + "echo ' echo \"[$(date)] Failed to get metadata token, trying without token...\"' >> /tmp/auto_terminate.sh", + "echo ' INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo \"unknown\")' >> /tmp/auto_terminate.sh", + "echo 'else' >> /tmp/auto_terminate.sh", + 'echo \' INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" ' + '-s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")\' >> /tmp/auto_terminate.sh', + "echo 'fi' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] Instance ID: \\$INSTANCE_ID\"' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] Instance ID: \\$INSTANCE_ID\"' >> /tmp/auto_terminate.sh", + 'echo \'if [ "\\$INSTANCE_ID" != "unknown" ] && [ -n "\\$INSTANCE_ID" ]; then\' >> /tmp/auto_terminate.sh', + "echo 'echo \"[$(date)] Checking IAM permissions...\"' >> /tmp/auto_terminate.sh", + "echo 'aws sts get-caller-identity 2>&1' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] Attempting termination...\"' >> /tmp/auto_terminate.sh", "echo 'aws ec2 terminate-instances --instance-ids \\$INSTANCE_ID " - "--region us-east-1' >> /tmp/auto_terminate.sh", + "--region us-east-1 2>&1' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] Termination command sent.\"' >> /tmp/auto_terminate.sh", + "echo 'else' >> /tmp/auto_terminate.sh", + "echo 'echo \"[$(date)] ERROR: Could not determine instance ID, cannot terminate\"' >> /tmp/auto_terminate.sh", + "echo 'fi' >> /tmp/auto_terminate.sh", "chmod +x /tmp/auto_terminate.sh", - "nohup /tmp/auto_terminate.sh > /tmp/auto_terminate.log 2>&1 &", + # Create a systemd-style service for better persistence + "echo 'Starting auto-termination script...'", + "setsid /tmp/auto_terminate.sh &", # Use setsid instead of nohup for better process isolation + "echo 'Auto-termination script PID:' $!", + "sleep 2", # Give script time to start + "ps aux | grep auto_terminate | grep -v grep || echo 'Warning: auto_terminate script not found in process list'", + "ls -la /tmp/auto_terminate.*", + "echo 'Check /tmp/auto_terminate.log for status'", ] }, )