From ee75350631550011ceb865dbfc24882d862bb9d5 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Thu, 28 May 2026 16:54:14 -0700 Subject: [PATCH 1/3] test: unskip nova tests --- .../tests/integ/ai_registry/test_dataset.py | 12 ++++--- .../integ/train/test_benchmark_evaluator.py | 36 ++++++++++++++----- .../train/test_rlvr_trainer_integration.py | 10 +++--- .../train/test_sft_trainer_integration.py | 6 ++-- sagemaker-train/tox.ini | 1 + 5 files changed, 44 insertions(+), 21 deletions(-) diff --git a/sagemaker-train/tests/integ/ai_registry/test_dataset.py b/sagemaker-train/tests/integ/ai_registry/test_dataset.py index cfefcdb85e..46f5ce987a 100644 --- a/sagemaker-train/tests/integ/ai_registry/test_dataset.py +++ b/sagemaker-train/tests/integ/ai_registry/test_dataset.py @@ -79,8 +79,9 @@ def test_create_dataset_from_s3_oss_dpo(self, unique_name, test_bucket, cleanup_ assert dataset.name == unique_name assert dataset.customization_technique == CustomizationTechnique.DPO + @pytest.mark.us_east_1 def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup_list): - """Test creating RLVR dataset from S3 URI.""" + """Test creating Nova SFT dataset from S3 URI.""" s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_sft_train.jsonl" dataset = DataSet.create( name=unique_name, @@ -92,8 +93,9 @@ def test_create_dataset_from_s3_nova_sft(self, unique_name, test_bucket, cleanup assert dataset.name == unique_name assert dataset.customization_technique == CustomizationTechnique.SFT + @pytest.mark.us_east_1 def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup_list): - """Test creating RLVR dataset from S3 URI.""" + """Test creating Nova DPO dataset from S3 URI.""" s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_dpo_train.jsonl" dataset = DataSet.create( name=unique_name, @@ -105,8 +107,9 @@ def test_create_dataset_from_s3_nova_dpo(self, unique_name, test_bucket, cleanup assert dataset.name == unique_name assert dataset.customization_technique == CustomizationTechnique.DPO + @pytest.mark.us_east_1 def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup_list): - """Test creating RLVR dataset from S3 URI.""" + """Test creating Nova RFT dataset from S3 URI.""" s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_rft_train.jsonl" dataset = DataSet.create( name=unique_name, @@ -118,8 +121,9 @@ def test_create_dataset_from_s3_nova_rft(self, unique_name, test_bucket, cleanup assert dataset.name == unique_name assert dataset.customization_technique == CustomizationTechnique.RLVR + @pytest.mark.us_east_1 def test_create_dataset_from_s3_nova_eval(self, unique_name, test_bucket, cleanup_list): - """Test creating RLVR dataset from S3 URI.""" + """Test creating Nova eval dataset from S3 URI.""" s3_uri = f"s3://{test_bucket}/test_datasets/Nova/nova_eval.jsonl" dataset = DataSet.create( name=unique_name, diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index ed3c79c937..c6dcd8933d 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -61,12 +61,11 @@ "region": "us-west-2", } -# Nova model evaluation configuration (uses our own test account in us-east-1) +# Nova model evaluation configuration (uses dedicated test account in us-east-1) NOVA_CONFIG = { - "model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65", - "dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl", - "s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/", - "model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models", + "dataset_s3_uri": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/zc_test.jsonl", + "s3_output_path": "s3://sagemaker-us-east-1-784379639078/model-customization/eval/", + "model_package_group_arn": "arn:aws:sagemaker:us-east-1:784379639078:model-package-group/sdk-test-finetuned-models", "region": "us-east-1", } @@ -339,7 +338,8 @@ def test_benchmark_evaluation_base_model_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Base model only evaluation completed successfully") - @pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account") + @pytest.mark.gpu_intensive + @pytest.mark.us_east_1 def test_benchmark_evaluation_nova_model(self): """ Test benchmark evaluation with Nova model. @@ -347,17 +347,35 @@ def test_benchmark_evaluation_nova_model(self): This test uses a Nova fine-tuned model package in us-east-1 region. Configuration from commented section in benchmark_demo.ipynb. - Note: This test is currently skipped pending us-east-1 test infra migration. + Note: This test requires a model package to exist in the model package group. + It should be run after a successful SFT or RLVR training job has produced one. """ + import boto3 + # Get benchmarks Benchmark = get_benchmarks() + # Dynamically find the latest model package in the group + sm_client = boto3.client("sagemaker", region_name=NOVA_CONFIG["region"]) + packages = sm_client.list_model_packages( + ModelPackageGroupName="sdk-test-finetuned-models", + SortBy="CreationTime", + SortOrder="Descending", + MaxResults=1, + ) + + if not packages["ModelPackageSummaryList"]: + pytest.skip("No model packages available in sdk-test-finetuned-models group. Run SFT/RLVR training first.") + + model_package_arn = packages["ModelPackageSummaryList"][0]["ModelPackageArn"] + logger.info(f"Using model package: {model_package_arn}") + logger.info("Creating BenchmarkEvaluator with Nova model") # Create evaluator with Nova model package evaluator = BenchMarkEvaluator( benchmark=Benchmark.MMLU, - model=NOVA_CONFIG["model_package_arn"], + model=model_package_arn, s3_output_path=NOVA_CONFIG["s3_output_path"], model_package_group=NOVA_CONFIG["model_package_group_arn"], base_eval_name="integ-test-nova-eval", @@ -367,7 +385,7 @@ def test_benchmark_evaluation_nova_model(self): # Verify evaluator was created assert evaluator is not None assert evaluator.benchmark == Benchmark.MMLU - assert evaluator.model == NOVA_CONFIG["model_package_arn"] + assert evaluator.model == model_package_arn assert evaluator.region == NOVA_CONFIG["region"] logger.info(f"Created evaluator: {evaluator.base_eval_name}") diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index e8320c29ac..679d5e636f 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -102,7 +102,7 @@ def test_rlvr_trainer_with_custom_reward_function(sagemaker_session): @pytest.mark.gpu_intensive -@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +@pytest.mark.us_east_1 def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): """Test RLVR training workflow with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) @@ -113,10 +113,10 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-rlvr-finetuned-models-exp", mlflow_run_name="test-nova-rlvr-finetuned-models-run", - training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", - validation_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/rlvr-nova/grpo-64-sample.jsonl", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", - custom_reward_function="arn:aws:sagemaker:us-east-1:729646638167:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", + training_dataset="s3://sagemaker-us-east-1-784379639078/input_data/rlvr-nova/grpo-64-sample.jsonl", + validation_dataset="s3://sagemaker-us-east-1-784379639078/input_data/rlvr-nova/grpo-64-sample.jsonl", + s3_output_path="s3://sagemaker-us-east-1-784379639078/output/", + custom_reward_function="arn:aws:sagemaker:us-east-1:784379639078:hub-content/sdktest/JsonDoc/rlvr-nova-test-rf/0.0.1", accept_eula=True, sagemaker_session=sagemaker_session_us_east_1, base_job_name=f"rlvr-nova-integ-{unique_id}", diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 93be84a738..0ad7d49ecc 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -96,7 +96,7 @@ def test_sft_trainer_with_validation_dataset(sagemaker_session): @pytest.mark.gpu_intensive -@pytest.mark.skip(reason="TODO: Nova test to be enabled in us-east-1") +@pytest.mark.us_east_1 def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): """Test SFT trainer with Nova model.""" # sagemaker_session_us_east_1 fixture is defined in conftest.py (us-east-1 region) @@ -108,8 +108,8 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): model_package_group="sdk-test-finetuned-models", mlflow_experiment_name="test-nova-finetuned-models-exp", mlflow_run_name="test-nova-finetuned-models-run", - training_dataset="s3://mc-flows-sdk-testing-us-east-1/input_data/sft-nova/sft_200_samples.jsonl", - s3_output_path="s3://mc-flows-sdk-testing-us-east-1/output/", + training_dataset="s3://sagemaker-us-east-1-784379639078/input_data/sft-nova/sft_200_samples.jsonl", + s3_output_path="s3://sagemaker-us-east-1-784379639078/output/", sagemaker_session=sagemaker_session_us_east_1, base_job_name=f"sft-nova-integ-{unique_id}", ) diff --git a/sagemaker-train/tox.ini b/sagemaker-train/tox.ini index 028925d95f..06f4fc31cc 100644 --- a/sagemaker-train/tox.ini +++ b/sagemaker-train/tox.ini @@ -63,6 +63,7 @@ markers = release image_uris_unit_test gpu_intensive: mark a test as GPU resource intensive (runs on scheduled CI, not PR checks). + us_east_1: mark a test that requires us-east-1 test account credentials (784379639078). timeout: mark a test as a timeout. serial: marks tests that must run serially (not in parallel) From 4bd0fcf76f5dd69671d96f42e171c0027a755bd7 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Thu, 28 May 2026 22:05:15 -0700 Subject: [PATCH 2/3] test: extend max_wait_time --- .../tests/integ/train/test_rlvr_trainer_integration.py | 2 +- .../tests/integ/train/test_sft_trainer_integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py index 679d5e636f..677b126840 100644 --- a/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_rlvr_trainer_integration.py @@ -124,7 +124,7 @@ def test_rlvr_trainer_nova_workflow(sagemaker_session_us_east_1): training_job = rlvr_trainer.train(wait=False) # Manual wait loop - max_wait_time = 3600 + max_wait_time = 10800 # 3 hour timeout (Nova training takes >1 hour) poll_interval = 30 start_time = time.time() diff --git a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py index 0ad7d49ecc..68446991c4 100644 --- a/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py +++ b/sagemaker-train/tests/integ/train/test_sft_trainer_integration.py @@ -118,7 +118,7 @@ def test_sft_trainer_nova_workflow(sagemaker_session_us_east_1): training_job = sft_trainer_nova.train(wait=False) # Manual wait loop - max_wait_time = 3600 # 1 hour timeout + max_wait_time = 10800 # 3 hour timeout (Nova training takes >1 hour) poll_interval = 30 # Check every 30 seconds start_time = time.time() From df5dd1f0b353d668e55523d6beb7bee39d3b83f1 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Fri, 29 May 2026 04:34:44 -0700 Subject: [PATCH 3/3] extend timeout for test_benchmark_evaluation_nova_model --- sagemaker-train/tests/integ/train/test_benchmark_evaluator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index c6dcd8933d..992130c3bd 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -415,8 +415,8 @@ def test_benchmark_evaluation_nova_model(self): logger.info(f"Status after refresh: {execution.status.overall_status}") # Wait for completion - logger.info("Waiting for evaluation to complete (timeout: 1 hour)") - execution.wait(target_status="Succeeded", poll=30, timeout=3600) + logger.info("Waiting for evaluation to complete (timeout: 3 hours)") + execution.wait(target_status="Succeeded", poll=30, timeout=10800) # Verify completion assert execution.status.overall_status == "Succeeded"