Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions sagemaker-train/tests/integ/ai_registry/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,9 @@ def test_create_dataset_from_s3_oss_dpo(self, unique_name, test_bucket, cleanup_
assert dataset.name == unique_name
assert dataset.customization_technique == CustomizationTechnique.DPO

@pytest.mark.us_east_1
def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup_list):
"""Test creating RLVR dataset from S3 URI."""
"""Test creating Nova SFT dataset from S3 URI."""
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_sft_train.jsonl"
dataset = DataSet.create(
name=unique_name,
Expand All @@ -92,8 +93,9 @@ def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup
assert dataset.name == unique_name
assert dataset.customization_technique == CustomizationTechnique.SFT

@pytest.mark.us_east_1
def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup_list):
"""Test creating RLVR dataset from S3 URI."""
"""Test creating Nova DPO dataset from S3 URI."""
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_dpo_train.jsonl"
dataset = DataSet.create(

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there no cleanup for the dataset created by DataSet.create()?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is cleanup — the cleanup_list fixture handles it. Each test appends the created dataset to cleanup_list, and the fixture's teardown calls resource.delete() on all tracked resources (see conftest.py, lines ~108-113).

name=unique_name,
Expand All @@ -105,8 +107,9 @@ def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup
assert dataset.name == unique_name
assert dataset.customization_technique == CustomizationTechnique.DPO

@pytest.mark.us_east_1
def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup_list):
"""Test creating RLVR dataset from S3 URI."""
"""Test creating Nova RFT dataset from S3 URI."""
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_rft_train.jsonl"
dataset = DataSet.create(
name=unique_name,
Expand All @@ -118,8 +121,9 @@ def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup
assert dataset.name == unique_name
assert dataset.customization_technique == CustomizationTechnique.RLVR

@pytest.mark.us_east_1
def test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanup_list):
"""Test creating RLVR dataset from S3 URI."""
"""Test creating Nova eval dataset from S3 URI."""
s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_eval.jsonl"
dataset = DataSet.create(
name=unique_name,
Expand Down
40 changes: 29 additions & 11 deletions sagemaker-train/tests/integ/train/test_benchmark_evaluator.py

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we running these tests in a different account, e.g. 784379639078 instead of 729646638167?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, @mujtaba1747 set that up earlier.

Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,11 @@
"region": "us-west-2",
}

# Nova model evaluation configuration (uses our own test account in us-east-1)
# Nova model evaluation configuration (uses dedicated test account in us-east-1)
NOVA_CONFIG = {
"model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65",
"dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl",
"s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/",
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models",
"dataset_s3_uri": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl",
"s3_output_path": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/",
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models",
"region": "us-east-1",
}

Expand Down Expand Up @@ -339,25 +338,44 @@ def test_benchmark_evaluation_base_model_only(self):
assert execution.status.overall_status == "Succeeded"
logger.info("Base model only evaluation completed successfully")

@pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account")
@pytest.mark.gpu_intensive
@pytest.mark.us_east_1
def test_benchmark_evaluation_nova_model(self):
"""
Test benchmark evaluation with Nova model.

This test uses a Nova fine-tuned model package in us-east-1 region.
Configuration from commented section in benchmark_demo.ipynb.

Note: This test is currently skipped pending us-east-1 test infra migration.
Note: This test requires a model package to exist in the model package group.
It should be run after a successful SFT or RLVR training job has produced one.
"""
import boto3

# Get benchmarks
Benchmark = get_benchmarks()

# Dynamically find the latest model package in the group
sm_client = boto3.client("sagemaker", region_name=NOVA_CONFIG["region"])
packages = sm_client.list_model_packages(
ModelPackageGroupName="sdk-test-finetuned-models",
SortBy="CreationTime",
SortOrder="Descending",
MaxResults=1,
)

if not packages["ModelPackageSummaryList"]:
pytest.skip("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first.")

model_package_arn = packages["ModelPackageSummaryList"][0]["ModelPackageArn"]
logger.info(f"Using model package: {model_package_arn}")

logger.info("Creating BenchmarkEvaluator with Nova model")

# Create evaluator with Nova model package
evaluator = BenchMarkEvaluator(
benchmark=Benchmark.MMLU,
model=NOVA_CONFIG["model_package_arn"],
model=model_package_arn,
s3_output_path=NOVA_CONFIG["s3_output_path"],
model_package_group=NOVA_CONFIG["model_package_group_arn"],
base_eval_name="integ-test-nova-eval",
Expand All @@ -367,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self):
# Verify evaluator was created
assert evaluator is not None
assert evaluator.benchmark == Benchmark.MMLU
assert evaluator.model == NOVA_CONFIG["model_package_arn"]
assert evaluator.model == model_package_arn
assert evaluator.region == NOVA_CONFIG["region"]

logger.info(f"Created evaluator: {evaluator.base_eval_name}")
Expand Down Expand Up @@ -397,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self):
logger.info(f"Status after refresh: {execution.status.overall_status}")

# Wait for completion
logger.info("Waiting for evaluation to complete (timeout: 1 hour)")
execution.wait(target_status="Succeeded", poll=30, timeout=3600)
logger.info("Waiting for evaluation to complete (timeout: 3 hours)")
execution.wait(target_status="Succeeded", poll=30, timeout=10800)

# Verify completion
assert execution.status.overall_status == "Succeeded"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session):


@pytest.mark.gpu_intensive
@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1")
@pytest.mark.us_east_1
def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1):
"""Test RLVR training workflow with Nova model."""
# sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region)
Expand All @@ -113,18 +113,18 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1):
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-nova-rlvr-finetuned-models-exp",
mlflow_run_name="test-nova-rlvr-finetuned-models-run",
training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl",
validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1",
training_dataset="s3://sagemaker-us-east-1-784379639078/input_data/rlvr-nova/grpo-64-sample.jsonl",
validation_dataset="s3://sagemaker-us-east-1-784379639078/input_data/rlvr-nova/grpo-64-sample.jsonl",
s3_output_path="s3://sagemaker-us-east-1-784379639078/output/",
custom_reward_function="arn:aws:sagemaker:us-east-1:784379639078:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1",
accept_eula=True,
sagemaker_session=sagemaker_session_us_east_1,
base_job_name=f"rlvr-nova-integ-{unique_id}",
)
training_job = rlvr_trainer.train(wait=False)

# Manual wait loop
max_wait_time = 3600
max_wait_time = 10800 # 3 hour timeout (Nova training takes >1 hour)
poll_interval = 30
start_time = time.time()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session):


@pytest.mark.gpu_intensive
@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1")
@pytest.mark.us_east_1
def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1):
"""Test SFT trainer with Nova model."""
# sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region)
Expand All @@ -108,8 +108,8 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1):
model_package_group="sdk-test-finetuned-models",
mlflow_experiment_name="test-nova-finetuned-models-exp",
mlflow_run_name="test-nova-finetuned-models-run",
training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl",
s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/",
training_dataset="s3://sagemaker-us-east-1-784379639078/input_data/sft-nova/sft_200_samples.jsonl",
s3_output_path="s3://sagemaker-us-east-1-784379639078/output/",
sagemaker_session=sagemaker_session_us_east_1,
base_job_name=f"sft-nova-integ-{unique_id}",
)
Expand All @@ -118,7 +118,7 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1):
training_job = sft_trainer_nova.train(wait=False)

# Manual wait loop
max_wait_time = 3600 # 1 hour timeout
max_wait_time = 10800 # 3 hour timeout (Nova training takes >1 hour)
poll_interval = 30 # Check every 30 seconds
start_time = time.time()

Expand Down
1 change: 1 addition & 0 deletions sagemaker-train/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ markers =
release
image_uris_unit_test
gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks).
us_east_1: mark a test that requires us-east-1 test account credentials (784379639078).
timeout: mark a test as a timeout.
serial: marks tests that must run serially (not in parallel)

Expand Down
Loading