Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this failure of sagemaker.train not being available caused by sagemaker (or the correct version of sagemaker) not being installed within the training container? I'm not sure relative paths are the correct fix here, although feel free to correct me if I'm wrong.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! The driver scripts (mpi_driver.py, torchrun_driver.py, etc.) are not meant to run as part of an installed sagemaker-train package. They're uploaded as standalone files to /opt/ml/input/data/sm_drivers/ in the training container and executed directly by the entrypoint bash script. The container uses a standard AWS DLC image (e.g., pytorch-training:2.0.0-cpu-py310) which doesn't have sagemaker-train installed — and it shouldn't need to.

torchrun_driver.py already uses sys.path.insert + relative module imports and works correctly. mpi_driver.py was the only driver using absolute imports, which was inconsistent with the rest of the codebase. I'm actually mirroring what we have already done here, which is running fine.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, I see in PySDK V2 that a similar approach is being used here too, thanks for the clarification!

PySDK V2 link: https://github.com/aws/sagemaker-python-sdk/blob/master-v2/src/sagemaker/modules/train/container_drivers/distributed_drivers/mpi_driver.py#L31

Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
import sys
import json

from sagemaker.train.container_drivers.distributed_drivers.mpi_utils import (
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from distributed_drivers.mpi_utils import ( # noqa: E402 # pylint: disable=C0413,E0611
start_sshd_daemon,
bootstrap_master_node,
bootstrap_worker_node,
Expand All @@ -27,7 +31,7 @@
)


from sagemaker.train.container_drivers.common.utils import (
from common.utils import ( # noqa: E402 # pylint: disable=C0413,E0611
logger,
hyperparameters_to_cli_args,
get_process_count,
Expand Down
21 changes: 8 additions & 13 deletions sagemaker-train/tests/integ/train/test_benchmark_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@
EvaluationPipelineExecution,
)

pytestmark = pytest.mark.gpu_intensive

# Configure logging
logging.basicConfig(
level=logging.INFO,
Expand Down Expand Up @@ -63,13 +61,12 @@
"region": "us-west-2",
}

# Nova model evaluation configuration (from commented section in notebook)
# Nova model evaluation configuration (uses our own test account in us-east-1)
NOVA_CONFIG = {
"model_package_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package/test-nova-finetuned-models/3",
"dataset_s3_uri": "s3://sagemaker-us-east-1-052150106756/studio-users/d20251107t195443/datasets/2025-11-07T19-55-37-609Z/zc_test.jsonl",
"s3_output_path": "s3://mufi-test-serverless-iad/eval/",
"mlflow_tracking_server_arn": "arn:aws:sagemaker:us-east-1:052150106756:mlflow-tracking-server/mlflow-prod-server",
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:052150106756:model-package-group/test-nova-finetuned-models",
"model_package_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package/sdk-test-finetuned-models/65",
"dataset_s3_uri": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/zc_test.jsonl",
"s3_output_path": "s3://sagemaker-us-east-1-729646638167/model-customization/eval/",
"model_package_group_arn": "arn:aws:sagemaker:us-east-1:729646638167:model-package-group/sdk-test-finetuned-models",
"region": "us-east-1",
}

Expand Down Expand Up @@ -288,7 +285,7 @@ def test_benchmark_subtasks_validation(self):

logger.info("Subtask validation tests passed")

@pytest.mark.skip(reason="Pipeline creation fails - under investigation")
# @pytest.mark.skip(reason="Pipeline creation fails - under investigation")
@pytest.mark.gpu_intensive
def test_benchmark_evaluation_base_model_only(self):
"""
Expand Down Expand Up @@ -342,16 +339,15 @@ def test_benchmark_evaluation_base_model_only(self):
assert execution.status.overall_status == "Succeeded"
logger.info("Base model only evaluation completed successfully")

@pytest.mark.skip(reason="Requires us-east-1 test infrastructure - tracked in AI-5")
@pytest.mark.skip(reason="Pending us-east-1 test infrastructure migration to dedicated test account")
def test_benchmark_evaluation_nova_model(self):
"""
Test benchmark evaluation with Nova model.

This test uses a Nova fine-tuned model package in us-east-1 region.
Configuration from commented section in benchmark_demo.ipynb.

Note: This test is currently skipped. Remove the @pytest.mark.skip decorator
when you want to enable it.
Note: This test is currently skipped pending us-east-1 test infra migration.
"""
# Get benchmarks
Benchmark = get_benchmarks()
Expand All @@ -363,7 +359,6 @@ def test_benchmark_evaluation_nova_model(self):
benchmark=Benchmark.MMLU,
model=NOVA_CONFIG["model_package_arn"],
s3_output_path=NOVA_CONFIG["s3_output_path"],
mlflow_resource_arn=NOVA_CONFIG["mlflow_tracking_server_arn"],
model_package_group=NOVA_CONFIG["model_package_group_arn"],
base_eval_name="integ-test-nova-eval",
region=NOVA_CONFIG["region"],
Expand Down
2 changes: 1 addition & 1 deletion sagemaker-train/tests/integ/train/test_model_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def test_hp_contract_basic_sh_script(sagemaker_session):


# skip this test for now as requirements.txt is not resolved
@pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes")
# @pytest.mark.skip(reason="MPI distributed training does not resolve requirements.txt on worker nodes")
def test_hp_contract_mpi_script(sagemaker_session):
compute = Compute(instance_type="ml.m5.xlarge", instance_count=2)
model_trainer = ModelTrainer(
Expand Down
Loading