From d6aff1076b3d087277458b8a70a82f4bff5c0b3b Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Tue, 26 May 2026 15:51:53 -0700 Subject: [PATCH 1/4] tests: unskip three tests --- sagemaker-train/tests/integ/train/test_benchmark_evaluator.py | 4 ++-- sagemaker-train/tests/integ/train/test_model_trainer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index 0db9b856d0..311492d7d4 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -286,7 +286,7 @@ def test_benchmark_subtasks_validation(self): logger.info("Subtask validation tests passed") - @pytest.mark.skip(reason="Pipeline creation fails - under investigation") + # @pytest.mark.skip(reason="Pipeline creation fails - under investigation") def test_benchmark_evaluation_base_model_only(self): """ Test benchmark evaluation with base model only (no fine-tuned model). @@ -339,7 +339,7 @@ def test_benchmark_evaluation_base_model_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Base model only evaluation completed successfully") - @pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5") + # @pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5") def test_benchmark_evaluation_nova_model(self): """ Test benchmark evaluation with Nova model. 
diff --git a/sagemaker-train/tests/integ/train/test_model_trainer.py b/sagemaker-train/tests/integ/train/test_model_trainer.py index 1589143112..63bbfc52bb 100644 --- a/sagemaker-train/tests/integ/train/test_model_trainer.py +++ b/sagemaker-train/tests/integ/train/test_model_trainer.py @@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session): # skip this test for now as requirments.txt is not resolved -@pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes") +# @pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes") def test_hp_contract_mpi_script(sagemaker_session): compute = Compute(instance_type="ml.m5.xlarge", instance_count=2) model_trainer = ModelTrainer( From 42736a9b52cd9bcfb63e9dd86c68507c9e0c8a30 Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Tue, 26 May 2026 23:36:44 -0700 Subject: [PATCH 2/4] fix: use relative imports in mpi_driver.py for container compatibility The MPI driver script used package-qualified absolute imports (from sagemaker.train.container_drivers...) which fail at runtime in the training container because sagemaker-train is not installed there. The driver scripts are copied to /opt/ml/input/data/sm_drivers/ and executed directly by the container entrypoint. Changed to imports resolved via sys.path: the parent of the distributed_drivers/ directory is inserted at the front of sys.path and the modules are imported as distributed_drivers.mpi_utils and common.utils (these are still absolute imports, not PEP 328 relative imports). This matches the pattern used by torchrun_driver.py, which works correctly in the container environment. 
--- .../container_drivers/distributed_drivers/mpi_driver.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py b/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py index 8ffe1f4318..3c9c383406 100644 --- a/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py +++ b/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py @@ -17,7 +17,11 @@ import sys import json -from sagemaker.train.container_drivers.distributed_drivers.mpi_utils import ( +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from distributed_drivers.mpi_utils import ( # noqa: E402 # pylint: disable=C0413,E0611 start_sshd_daemon, bootstrap_master_node, bootstrap_worker_node, @@ -27,7 +31,7 @@ ) -from sagemaker.train.container_drivers.common.utils import ( +from common.utils import ( # noqa: E402 # pylint: disable=C0413,E0611 logger, hyperparameters_to_cli_args, get_process_count, From 4014d2f6ca4675b850bcbc55131399103f29910f Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Tue, 26 May 2026 23:57:50 -0700 Subject: [PATCH 3/4] test: migrate nova eval test to use own account resources Remove cross-account dependency in test_benchmark_evaluation_nova_model by replacing resources from account 052150106756 with our test account (729646638167) in us-east-1. Also removed mlflow_tracking_server_arn since no MLflow server exists in us-east-1. Test remains skipped pending us-east-1 test infrastructure migration to a dedicated test account. 
--- .../integ/train/test_benchmark_evaluator.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index 311492d7d4..73a26ccaa7 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -61,13 +61,12 @@ "region": "us-west-2", } -# Nova model evaluation configuration (from commented section in notebook) +# Nova model evaluation configuration (uses our own test account in us-east-1) NOVA_CONFIG = { - "model_package_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3", - "dataset_s3_uri": "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl", - "s3_output_path": "s3://mufi-test-serverless-iad/eval/", - "mlflow_tracking_server_arn": "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server", - "model_package_group_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models", + "model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65", + "dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl", + "s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/", + "model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models", "region": "us-east-1", } @@ -339,7 +338,7 @@ def test_benchmark_evaluation_base_model_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Base model only evaluation completed successfully") - # @pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5") + @pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to 
dedicated test account") def test_benchmark_evaluation_nova_model(self): """ Test benchmark evaluation with Nova model. @@ -347,8 +346,7 @@ def test_benchmark_evaluation_nova_model(self): This test uses a Nova fine-tuned model package in us-east-1 region. Configuration from commented section in benchmark_demo.ipynb. - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. + Note: This test is currently skipped pending us-east-1 test infra migration. """ # Get benchmarks Benchmark = get_benchmarks() @@ -360,7 +358,6 @@ def test_benchmark_evaluation_nova_model(self): benchmark=Benchmark.MMLU, model=NOVA_CONFIG["model_package_arn"], s3_output_path=NOVA_CONFIG["s3_output_path"], - mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"], model_package_group=NOVA_CONFIG["model_package_group_arn"], base_eval_name="integ-test-nova-eval", region=NOVA_CONFIG["region"], From 39cdcc3b6f9e0e86578661b7bc39e2934ea31c4a Mon Sep 17 00:00:00 2001 From: Lucas Jia Date: Wed, 27 May 2026 15:55:10 -0700 Subject: [PATCH 4/4] test: remove module level skip --- sagemaker-train/tests/integ/train/test_benchmark_evaluator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index 174cd1b135..ed3c79c937 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -23,8 +23,6 @@ EvaluationPipelineExecution, ) -pytestmark = pytest.mark.gpu_intensive - # Configure logging logging.basicConfig( level=logging.INFO,