diff --git a/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py b/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py index 8ffe1f4318..3c9c383406 100644 --- a/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py +++ b/sagemaker-train/src/sagemaker/train/container_drivers/distributed_drivers/mpi_driver.py @@ -17,7 +17,11 @@ import sys import json -from sagemaker.train.container_drivers.distributed_drivers.mpi_utils import ( +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from distributed_drivers.mpi_utils import ( # noqa: E402 # pylint: disable=C0413,E0611 start_sshd_daemon, bootstrap_master_node, bootstrap_worker_node, @@ -27,7 +31,7 @@ ) -from sagemaker.train.container_drivers.common.utils import ( +from common.utils import ( # noqa: E402 # pylint: disable=C0413,E0611 logger, hyperparameters_to_cli_args, get_process_count, diff --git a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py index ad92523aec..ed3c79c937 100644 --- a/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py +++ b/sagemaker-train/tests/integ/train/test_benchmark_evaluator.py @@ -23,8 +23,6 @@ EvaluationPipelineExecution, ) -pytestmark = pytest.mark.gpu_intensive - # Configure logging logging.basicConfig( level=logging.INFO, @@ -63,13 +61,12 @@ "region": "us-west-2", } -# Nova model evaluation configuration (from commented section in notebook) +# Nova model evaluation configuration (uses our own test account in us-east-1) NOVA_CONFIG = { - "model_package_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3", - "dataset_s3_uri": "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl", - "s3_output_path": "s3://mufi-test-serverless-iad/eval/", - "mlflow_tracking_server_arn": "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server", - "model_package_group_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models", + "model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65", + "dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl", + "s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/", + "model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models", "region": "us-east-1", } @@ -288,7 +285,7 @@ def test_benchmark_subtasks_validation(self): logger.info("Subtask validation tests passed") - @pytest.mark.skip(reason="Pipeline creation fails - under investigation") + # @pytest.mark.skip(reason="Pipeline creation fails - under investigation") @pytest.mark.gpu_intensive def test_benchmark_evaluation_base_model_only(self): """ @@ -342,7 +339,7 @@ def test_benchmark_evaluation_base_model_only(self): assert execution.status.overall_status == "Succeeded" logger.info("Base model only evaluation completed successfully") - @pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5") + @pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account") def test_benchmark_evaluation_nova_model(self): """ Test benchmark evaluation with Nova model. @@ -350,8 +347,7 @@ def test_benchmark_evaluation_nova_model(self): This test uses a Nova fine-tuned model package in us-east-1 region. Configuration from commented section in benchmark_demo.ipynb. - Note: This test is currently skipped. Remove the @pytest.mark.skip decorator - when you want to enable it. + Note: This test is currently skipped pending us-east-1 test infra migration. """ # Get benchmarks Benchmark = get_benchmarks() @@ -363,7 +359,6 @@ def test_benchmark_evaluation_nova_model(self): benchmark=Benchmark.MMLU, model=NOVA_CONFIG["model_package_arn"], s3_output_path=NOVA_CONFIG["s3_output_path"], - mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"], model_package_group=NOVA_CONFIG["model_package_group_arn"], base_eval_name="integ-test-nova-eval", region=NOVA_CONFIG["region"], diff --git a/sagemaker-train/tests/integ/train/test_model_trainer.py b/sagemaker-train/tests/integ/train/test_model_trainer.py index 1589143112..63bbfc52bb 100644 --- a/sagemaker-train/tests/integ/train/test_model_trainer.py +++ b/sagemaker-train/tests/integ/train/test_model_trainer.py @@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session): # skip this test for now as requirments.txt is not resolved -@pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes") +# @pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes") def test_hp_contract_mpi_script(sagemaker_session): compute = Compute(instance_type="ml.m5.xlarge", instance_count=2) model_trainer = ModelTrainer(