Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
change: Simplify exp plus integ test configuration (aws#694)
Browse files Browse the repository at this point in the history
Co-authored-by: Dewen Qi <qidewen@amazon.com>
qidewenwhen and Dewen Qi committed Dec 14, 2022
1 parent 17d7f4d commit 147bdf0
Showing 15 changed files with 648 additions and 340 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -30,8 +30,8 @@ env/
.vscode/
**/tmp
.python-version
**/_repack_model.py
**/_repack_script_launcher.sh
tests/data/experiment/docker/boto
tests/data/experiment/docker/sagemaker-dev.tar.gz
tests/data/**/_repack_model.py
tests/data/experiment/resources/sagemaker-beta-1.0.tar.gz
16 changes: 16 additions & 0 deletions src/sagemaker/experiments/_api_types.py
Original file line number Diff line number Diff line change
@@ -224,3 +224,19 @@ class TrialComponentSearchResult(_base_types.ApiObject):
source_detail = None
tags = None
parents = None


class TrialSummary(_base_types.ApiObject):
    """Summary view of a single trial.

    Attributes:
        trial_arn (str): ARN identifying the trial.
        trial_name (str): Name of the trial.
        creation_time (datetime): Timestamp at which the trial was created.
        last_modified_time (datetime): Timestamp of the most recent modification.
    """

    # Every field defaults to None at class level.
    trial_arn = None
    trial_name = None
    creation_time = None
    last_modified_time = None
75 changes: 75 additions & 0 deletions src/sagemaker/experiments/experiment.py
Original file line number Diff line number Diff line change
@@ -13,7 +13,11 @@
"""Contains the SageMaker Experiment class."""
from __future__ import absolute_import

import time

from sagemaker.apiutils import _base_types
from sagemaker.experiments.trial import _Trial
from sagemaker.experiments.trial_component import _TrialComponent


class _Experiment(_base_types.Record):
@@ -44,6 +48,8 @@ class _Experiment(_base_types.Record):
_boto_update_members = ["experiment_name", "description", "display_name"]
_boto_delete_members = ["experiment_name"]

_MAX_DELETE_ALL_ATTEMPTS = 3

def save(self):
"""Save the state of this Experiment to SageMaker.
@@ -160,3 +166,72 @@ def _load_or_create(
sagemaker_session=sagemaker_session,
)
return experiment

def list_trials(self, created_before=None, created_after=None, sort_by=None, sort_order=None):
    """Return the trials of this experiment, optionally filtered and sorted.

    Delegates to ``_Trial.list`` using this experiment's name and session.

    Args:
        created_before (datetime.datetime): Only return trials created before this
            instant (default: None).
        created_after (datetime.datetime): Only return trials created after this
            instant (default: None).
        sort_by (str): Property to sort results by. One of 'Name', 'CreationTime'
            (default: None).
        sort_order (str): One of 'Ascending', or 'Descending' (default: None).

    Returns:
        collections.Iterator[experiments._api_types.TrialSummary]:
            An iterator over trials matching the criteria.
    """
    list_kwargs = {
        "experiment_name": self.experiment_name,
        "created_before": created_before,
        "created_after": created_after,
        "sort_by": sort_by,
        "sort_order": sort_order,
        "sagemaker_session": self.sagemaker_session,
    }
    return _Trial.list(**list_kwargs)

def delete_all(self, action):
    """Delete this experiment together with its trials and trial components.

    For every trial of the experiment, each trial component is deleted (with
    forced disassociation), then the trial itself; finally the experiment is
    deleted. The whole sequence is attempted up to ``_MAX_DELETE_ALL_ATTEMPTS``
    times, restarting from the trial listing after any failure.

    Args:
        action (str): Must be the string '--force', confirming the recursive
            deletion of the experiment and all its trials and trial components.

    Raises:
        ValueError: If ``action`` is not '--force'.
        Exception: If every attempt failed; chained from the last error seen.
    """
    if action != "--force":
        raise ValueError(
            "Must confirm with string '--force' in order to delete the experiment and "
            "associated trials, trial components."
        )

    failure = None
    for _ in range(self._MAX_DELETE_ALL_ATTEMPTS):
        try:
            for summary in self.list_trials():
                trial_obj = _Trial.load(
                    sagemaker_session=self.sagemaker_session,
                    trial_name=summary.trial_name,
                )
                tc_summaries = trial_obj.list_trial_components()  # pylint: disable=no-member
                for tc_summary in tc_summaries:
                    component = _TrialComponent.load(
                        sagemaker_session=self.sagemaker_session,
                        trial_component_name=tc_summary.trial_component_name,
                    )
                    component.delete(force_disassociate=True)
                    # to prevent throttling
                    time.sleep(1.2)
                trial_obj.delete()  # pylint: disable=no-member
                # to prevent throttling
                time.sleep(1.2)
            self.delete()
        except Exception as ex:  # pylint: disable=broad-except
            # Remember the error and start the next attempt from scratch.
            failure = ex
        else:
            return
    raise Exception("Failed to delete, please try again.") from failure
47 changes: 47 additions & 0 deletions src/sagemaker/experiments/trial.py
Original file line number Diff line number Diff line change
@@ -14,6 +14,7 @@
from __future__ import absolute_import

from sagemaker.apiutils import _base_types
from sagemaker.experiments import _api_types
from sagemaker.experiments.trial_component import _TrialComponent


@@ -117,6 +118,52 @@ def create(
)
return trial

@classmethod
def list(
    cls,
    experiment_name=None,
    trial_component_name=None,
    created_before=None,
    created_after=None,
    sort_by=None,
    sort_order=None,
    sagemaker_session=None,
):
    """Iterate over the trials that match the given criteria.

    Args:
        experiment_name (str): Name of the experiment. If specified, only trials in
            the experiment will be returned (default: None).
        trial_component_name (str): Name of the trial component. If specified, only
            trials with this trial component name will be returned (default: None).
        created_before (datetime.datetime): Return trials created before this instant
            (default: None).
        created_after (datetime.datetime): Return trials created after this instant
            (default: None).
        sort_by (str): Which property to sort results by. One of 'Name', 'CreationTime'
            (default: None).
        sort_order (str): One of 'Ascending', or 'Descending' (default: None).
        sagemaker_session (sagemaker.session.Session): Session object which
            manages interactions with Amazon SageMaker APIs and any other
            AWS services needed. If not specified, one is created using the
            default AWS configuration chain.

    Returns:
        collections.Iterator[experiments._api_types.TrialSummary]: An iterator over
            trials matching the specified criteria.
    """
    criteria = {
        "experiment_name": experiment_name,
        "trial_component_name": trial_component_name,
        "created_before": created_before,
        "created_after": created_after,
        "sort_by": sort_by,
        "sort_order": sort_order,
        "sagemaker_session": sagemaker_session,
    }
    # Positional arguments: boto method name, summary converter, response list key.
    return super(_Trial, cls)._list(
        "list_trials",
        _api_types.TrialSummary.from_boto,
        "TrialSummaries",
        **criteria,
    )

def add_trial_component(self, trial_component):
"""Add the specified trial component to this trial.
6 changes: 4 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -18,10 +18,12 @@
# TODO-experiment-plus: Remove this block, which loads the internal boto models.
# The corresponding model jsons were generated from the coral model package and should
# be updated regularly.
# NOTE: os.system only returns the exit status; a failed registration does not raise here.
normal_json = "file://./tests/data/experiment/resources/sagemaker-2017-07-24.normal.json"
os.system(f"aws configure add-model --service-model {normal_json} --service-name sagemaker")

metrics_model_json = (
    "file://./tests/data/experiment/resources/sagemaker-metrics-2022-09-30.normal.json"
)
os.system(
    f"aws configure add-model --service-model {metrics_model_json} --service-name sagemaker-metrics"
)
50 changes: 0 additions & 50 deletions tests/data/experiment/docker/Dockerfile

This file was deleted.

11 changes: 11 additions & 0 deletions tests/data/experiment/scripts/launcher.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

# Download and install AWS CLI v2 so the `aws configure` calls below are available.
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
./aws/install

# Register the bundled service model files with the AWS CLI.
# TODO: remove the boto model files once the Run API is released.
aws configure add-model --service-model file://resources/sagemaker-metrics-2022-09-30.normal.json --service-name sagemaker-metrics
aws configure add-model --service-model file://resources/sagemaker-2017-07-24.normal.json --service-name sagemaker

# Install the SDK tarball shipped under resources/, then run the training job script.
pip install resources/sagemaker-beta-1.0.tar.gz
python train_job_script_for_run_clz.py
102 changes: 7 additions & 95 deletions tests/integ/sagemaker/experiments/conftest.py
Original file line number Diff line number Diff line change
@@ -12,28 +12,20 @@
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import base64
import glob
import logging
import os
import shutil
import subprocess
import sys
import tempfile
import time
import uuid

import boto3
import pytest

import docker

from tests.integ import lock
from tests.integ.utils import create_repository
from tests.integ import DATA_DIR

from sagemaker.experiments import trial_component, trial, experiment
from sagemaker.s3 import S3Uploader
from sagemaker.utils import retry_with_backoff
from tests.integ.sagemaker.experiments.helpers import name, names

@@ -137,96 +129,16 @@ def tempdir():
shutil.rmtree(temp_dir)


@pytest.fixture(scope="module")
def bucket(sagemaker_session):
    """Module-scoped fixture returning ``sagemaker_session.default_bucket()``."""
    default_bucket = sagemaker_session.default_bucket()
    return default_bucket


@pytest.fixture(scope="module")
def training_input_s3_uri(sagemaker_session, tempdir, bucket):
    """Write a small text file under ``tempdir`` and upload it as training input.

    Returns:
        The value returned by ``S3Uploader.upload`` for the uploaded file.
    """
    local_file = os.path.join(tempdir, name())
    with open(local_file, "w") as handle:
        handle.write("Hello World!")
    # A second generated name is used for the S3 key, independent of the local file name.
    s3_uri = f"s3://{bucket}/experiments/training-input/{name()}"
    uploaded = S3Uploader.upload(
        local_path=local_file, desired_s3_uri=s3_uri, sagemaker_session=sagemaker_session
    )
    return uploaded


@pytest.fixture(scope="module")
def training_output_s3_uri(bucket):
    """Module-scoped fixture returning the training-output S3 prefix inside ``bucket``."""
    output_uri = f"s3://{bucket}/experiments/training-output/"
    return output_uri


# TODO we should remove the boto model file once the Run API changes release
# Local paths of the boto service model JSON files under DATA_DIR/experiment.
BOTO_MODEL_LOCAL_PATH = os.path.join(DATA_DIR, "experiment", "sagemaker-2017-07-24.normal.json")
METRICS_MODEL_LOCAL_PATH = os.path.join(
    DATA_DIR, "experiment", "sagemaker-metrics-2022-09-30.normal.json"
)
# Repository name and version that the docker_image fixture combines into the image tag.
IMAGE_REPO_NAME = "sagemaker-experiments-test"
IMAGE_VERSION = "1.0.92"  # We should bump it up if need to update the docker image
# File name / relative paths used for artifacts placed in the docker build directory.
SM_SDK_TAR_NAME_IN_IMAGE = "sagemaker-dev.tar.gz"
SM_BOTO_MODEL_PATH_IN_IMAGE = "boto/sagemaker-2017-07-24.normal.json"
SM_METRICS_MODEL_PATH_IN_IMAGE = "boto/sagemaker-metrics-2022-09-30.normal.json"
# File name under which job_resource_dir stores the freshly built SDK sdist.
_EXP_PLUS_SDK_TAR = "sagemaker-beta-1.0.tar.gz"


@pytest.fixture(scope="module")
def docker_image(sagemaker_session):
    """Return the tag of the experiments test image, building and pushing it if absent.

    The image is pulled from ECR first; only when the pull reports ``NotFound`` is
    the image built from ``DATA_DIR/experiment/docker`` and pushed. The repository
    setup, pull, build and push all run while holding ``lock.lock()``.

    Returns:
        str: The image tag (registry/repository:version).
    """
    # requires docker to be running
    client = docker.from_env()
    ecr = sagemaker_session.boto_session.client("ecr")

    auth_data = ecr.get_authorization_token()["authorizationData"][0]
    decoded_token = base64.b64decode(auth_data["authorizationToken"]).decode()
    username, password = decoded_token.split(":")
    registry = auth_data["proxyEndpoint"]
    # [8:] drops the first 8 characters — presumably the "https://" scheme; verify.
    tag = "{}/{}:{}".format(registry, IMAGE_REPO_NAME, IMAGE_VERSION)[8:]
    docker_dir = os.path.join(DATA_DIR, "experiment", "docker")
    credentials = {"username": username, "password": password}

    with lock.lock():
        # initialize the docker image repository
        create_repository(ecr, IMAGE_REPO_NAME)

        # pull existing image for layer cache
        try:
            client.images.pull(tag, auth_config=credentials)
        except docker.errors.NotFound:
            print("Docker image with tag {} does not exist. Will create one.".format(tag))
        else:
            print("Docker image with tag {} already exists.".format(tag))
            return tag

        # copy boto models under docker dir
        os.makedirs(os.path.join(docker_dir, "boto"), exist_ok=True)
        model_copies = (
            (BOTO_MODEL_LOCAL_PATH, SM_BOTO_MODEL_PATH_IN_IMAGE),
            (METRICS_MODEL_LOCAL_PATH, SM_METRICS_MODEL_PATH_IN_IMAGE),
        )
        for local_path, path_in_image in model_copies:
            shutil.copy(local_path, os.path.join(docker_dir, path_in_image))

        # generate sdk tar file from package and put it under docker dir
        subprocess.check_call([sys.executable, "setup.py", "sdist"])
        newest_sdist = max(glob.glob("dist/sagemaker-*"), key=os.path.getctime)
        shutil.copy(newest_sdist, os.path.join(docker_dir, SM_SDK_TAR_NAME_IN_IMAGE))

        build_args = {
            "library": SM_SDK_TAR_NAME_IN_IMAGE,
            "botomodel": SM_BOTO_MODEL_PATH_IN_IMAGE,
            "script": "scripts/train_job_script_for_run_clz.py",
            "metricsmodel": SM_METRICS_MODEL_PATH_IN_IMAGE,
        }
        client.images.build(
            path=docker_dir,
            dockerfile="Dockerfile",
            tag=tag,
            cache_from=[tag],
            buildargs=build_args,
        )
        client.images.push(tag, auth_config=credentials)
        return tag
def job_resource_dir():
    """Build an sdist of the SDK and stage it in the experiment resources directory.

    Returns:
        str: Path of ``<DATA_DIR>/experiment/resources``, which now contains the
            newest ``dist/sagemaker-*`` archive copied as ``_EXP_PLUS_SDK_TAR``.

    Raises:
        subprocess.CalledProcessError: If ``setup.py sdist`` exits with a non-zero status.
    """
    # Imported locally so this block does not depend on the module's import list.
    import subprocess
    import sys

    resource_dir = os.path.join(DATA_DIR, "experiment/resources")
    # Run the build with the current interpreter and fail loudly: os.system ignored the
    # exit status, so a broken build could silently stage a stale archive from dist/.
    subprocess.check_call([sys.executable, "setup.py", "sdist"])
    sdist_path = max(glob.glob("dist/sagemaker-*"), key=os.path.getctime)
    shutil.copy(sdist_path, os.path.join(resource_dir, _EXP_PLUS_SDK_TAR))
    return resource_dir


def _delete_associations(arn, sagemaker_session):
13 changes: 13 additions & 0 deletions tests/integ/sagemaker/experiments/helpers.py
Original file line number Diff line number Diff line change
@@ -12,7 +12,10 @@
# language governing permissions and limitations under the License.
from __future__ import absolute_import

from contextlib import contextmanager

from sagemaker import utils
from sagemaker.experiments.experiment import _Experiment

EXP_INTEG_TEST_NAME_PREFIX = "experiments-integ"

@@ -27,3 +30,13 @@ def names():

def to_seconds(dt):
    """Return ``dt.timestamp()`` truncated to an integer number of seconds."""
    timestamp = dt.timestamp()
    return int(timestamp)


@contextmanager
def cleanup_exp_resources(exp_names, sagemaker_session):
    """Context manager that force-deletes the named experiments on exit.

    Args:
        exp_names: Iterable of experiment names to load and delete.
        sagemaker_session: Session passed to ``_Experiment.load``.
    """
    try:
        yield
    finally:
        # Runs whether or not the managed block raised.
        for experiment_name in exp_names:
            _Experiment.load(
                experiment_name=experiment_name, sagemaker_session=sagemaker_session
            ).delete_all(action="--force")
383 changes: 191 additions & 192 deletions tests/integ/sagemaker/experiments/test_run.py

Large diffs are not rendered by default.

196 changes: 196 additions & 0 deletions tests/unit/sagemaker/experiments/test_experiment.py
Original file line number Diff line number Diff line change
@@ -20,6 +20,7 @@

from sagemaker import Session
from sagemaker.experiments import experiment
from sagemaker.experiments._api_types import TrialSummary


@pytest.fixture
@@ -131,3 +132,198 @@ def test_load_or_create_when_not_exist(mock_create, mock_load):
tags=None,
sagemaker_session=sagemaker_session,
)


def test_list_trials_empty(sagemaker_session):
    """An empty ``TrialSummaries`` response yields no trials."""
    client = sagemaker_session.sagemaker_client
    client.list_trials.return_value = {"TrialSummaries": []}
    exp = experiment._Experiment(sagemaker_session=sagemaker_session)
    listed = list(exp.list_trials())
    assert listed == []


def test_list_trials_single(sagemaker_session, datetime_obj):
    """A single summary in the response is converted to one ``TrialSummary``."""
    experiment_obj = experiment._Experiment(sagemaker_session=sagemaker_session)
    # Use the "TrialName" key so the mocked payload populates ``TrialSummary.trial_name``,
    # matching the attribute the class declares and the payloads used by the _Trial tests.
    sagemaker_session.sagemaker_client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": "trial-foo",
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            }
        ]
    }

    assert list(experiment_obj.list_trials()) == [
        TrialSummary(
            trial_name="trial-foo", creation_time=datetime_obj, last_modified_time=datetime_obj
        )
    ]


def test_list_trials_two_values(sagemaker_session, datetime_obj):
    """Two summaries in the response are converted in order."""
    experiment_obj = experiment._Experiment(sagemaker_session=sagemaker_session)
    # "TrialName" (not "Name") so the payload maps onto ``TrialSummary.trial_name``.
    sagemaker_session.sagemaker_client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": "trial-foo-1",
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            },
            {
                "TrialName": "trial-foo-2",
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            },
        ]
    }

    assert list(experiment_obj.list_trials()) == [
        TrialSummary(
            trial_name="trial-foo-1", creation_time=datetime_obj, last_modified_time=datetime_obj
        ),
        TrialSummary(
            trial_name="trial-foo-2", creation_time=datetime_obj, last_modified_time=datetime_obj
        ),
    ]


def test_next_token(sagemaker_session, datetime_obj):
    """Listing follows ``NextToken`` and concatenates the pages in order."""
    experiment_obj = experiment._Experiment(sagemaker_session)
    client = sagemaker_session.sagemaker_client
    # "TrialName" (not "Name") so each payload maps onto ``TrialSummary.trial_name``.
    client.list_trials.side_effect = [
        {
            "TrialSummaries": [
                {
                    "TrialName": "trial-foo-1",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
                {
                    "TrialName": "trial-foo-2",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
            ],
            "NextToken": "foo",
        },
        {
            "TrialSummaries": [
                {
                    "TrialName": "trial-foo-3",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                }
            ]
        },
    ]

    assert list(experiment_obj.list_trials()) == [
        TrialSummary(
            trial_name="trial-foo-1", creation_time=datetime_obj, last_modified_time=datetime_obj
        ),
        TrialSummary(
            trial_name="trial-foo-2", creation_time=datetime_obj, last_modified_time=datetime_obj
        ),
        TrialSummary(
            trial_name="trial-foo-3", creation_time=datetime_obj, last_modified_time=datetime_obj
        ),
    ]

    # First page is requested without arguments, the second with the returned token.
    client.list_trials.assert_any_call(**{})
    client.list_trials.assert_any_call(NextToken="foo")


def test_list_trials_call_args(sagemaker_session):
    """Creation-time filters are forwarded to the client as ``CreatedBefore``/``CreatedAfter``."""
    upper_bound = datetime.datetime(1999, 10, 12, 0, 0, 0)
    lower_bound = datetime.datetime(1990, 10, 12, 0, 0, 0)
    client = sagemaker_session.sagemaker_client
    exp = experiment._Experiment(sagemaker_session=sagemaker_session)
    client.list_trials.return_value = {}

    listed = list(exp.list_trials(created_after=lower_bound, created_before=upper_bound))

    assert [] == listed
    client.list_trials.assert_called_with(CreatedBefore=upper_bound, CreatedAfter=lower_bound)


def test_delete_all_with_incorrect_action_name(sagemaker_session):
    """``delete_all`` rejects any action other than '--force' with a ValueError."""
    obj = experiment._Experiment(sagemaker_session, experiment_name="foo", description="bar")
    with pytest.raises(ValueError) as err:
        obj.delete_all(action="abc")

    # Inspect the raised exception itself; ``err`` is pytest's ExceptionInfo wrapper.
    assert "Must confirm with string '--force'" in str(err.value)


def test_delete_all(sagemaker_session, datetime_obj):
    """``delete_all`` deletes every trial component, every trial, then the experiment."""
    # ``datetime_obj`` is requested as a fixture so the payloads below carry its value.
    obj = experiment._Experiment(sagemaker_session, experiment_name="foo", description="bar")
    client = sagemaker_session.sagemaker_client
    client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": "trial-1",
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            },
            {
                "TrialName": "trial-2",
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            },
        ]
    }
    # Key spelled "TrialName", consistent with the list_trials payload above.
    client.describe_trial.side_effect = [
        {"TrialName": "trial-1", "ExperimentName": "experiment-name-value"},
        {"TrialName": "trial-2", "ExperimentName": "experiment-name-value"},
    ]
    # One list_trial_components response per trial, in listing order.
    client.list_trial_components.side_effect = [
        {
            "TrialComponentSummaries": [
                {
                    "TrialComponentName": "trial-component-1",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
                {
                    "TrialComponentName": "trial-component-2",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
            ]
        },
        {
            "TrialComponentSummaries": [
                {
                    "TrialComponentName": "trial-component-3",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
                {
                    "TrialComponentName": "trial-component-4",
                    "CreationTime": datetime_obj,
                    "LastModifiedTime": datetime_obj,
                },
            ]
        },
    ]

    client.describe_trial_component.side_effect = [
        {"TrialComponentName": "trial-component-1"},
        {"TrialComponentName": "trial-component-2"},
        {"TrialComponentName": "trial-component-3"},
        {"TrialComponentName": "trial-component-4"},
    ]

    client.delete_trial_component.return_value = {}
    client.delete_trial.return_value = {}
    client.delete_experiment.return_value = {}

    obj.delete_all(action="--force")

    client.delete_experiment.assert_called_with(ExperimentName="foo")

    delete_trial_expected_calls = [
        unittest.mock.call(TrialName="trial-1"),
        unittest.mock.call(TrialName="trial-2"),
    ]
    assert delete_trial_expected_calls == client.delete_trial.mock_calls

    delete_trial_component_expected_calls = [
        unittest.mock.call(TrialComponentName="trial-component-1"),
        unittest.mock.call(TrialComponentName="trial-component-2"),
        unittest.mock.call(TrialComponentName="trial-component-3"),
        unittest.mock.call(TrialComponentName="trial-component-4"),
    ]
    assert delete_trial_component_expected_calls == client.delete_trial_component.mock_calls


def test_delete_all_fail(sagemaker_session):
    """When listing trials always fails, ``delete_all`` raises the generic failure error."""
    client = sagemaker_session.sagemaker_client
    obj = experiment._Experiment(sagemaker_session, experiment_name="foo", description="bar")
    client.list_trials.side_effect = Exception

    with pytest.raises(Exception) as excinfo:
        obj.delete_all(action="--force")

    message = str(excinfo.value)
    assert message == "Failed to delete, please try again."
87 changes: 87 additions & 0 deletions tests/unit/sagemaker/experiments/test_trial.py
Original file line number Diff line number Diff line change
@@ -20,6 +20,7 @@
from mock.mock import patch

from sagemaker import Session
from sagemaker.experiments._api_types import TrialSummary
from sagemaker.experiments.trial import _Trial
from sagemaker.experiments.trial_component import _TrialComponent

@@ -211,3 +212,89 @@ def test_load_or_create_when_not_exist(mock_create, mock_load):
tags=None,
sagemaker_session=sagemaker_session,
)


def test_list_trials_without_experiment_name(sagemaker_session, datetime_obj):
    """``_Trial.list`` without filters calls ``list_trials`` with no arguments."""
    trial_names = ("trial-1", "trial-2")
    client = sagemaker_session.sagemaker_client
    client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": trial_name,
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            }
            for trial_name in trial_names
        ]
    }
    expected = [
        TrialSummary(
            trial_name=trial_name, creation_time=datetime_obj, last_modified_time=datetime_obj
        )
        for trial_name in trial_names
    ]

    listed = list(_Trial.list(sagemaker_session=sagemaker_session))

    assert expected == listed
    client.list_trials.assert_called_with(**{})


def test_list_trials_with_experiment_name(sagemaker_session, datetime_obj):
    """``experiment_name`` is forwarded to the client as ``ExperimentName``."""
    trial_names = ("trial-1", "trial-2")
    client = sagemaker_session.sagemaker_client
    client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": trial_name,
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            }
            for trial_name in trial_names
        ]
    }
    expected = [
        TrialSummary(
            trial_name=trial_name, creation_time=datetime_obj, last_modified_time=datetime_obj
        )
        for trial_name in trial_names
    ]

    listed = list(_Trial.list(experiment_name="foo", sagemaker_session=sagemaker_session))

    assert expected == listed
    client.list_trials.assert_called_with(ExperimentName="foo")


def test_list_trials_with_trial_component_name(sagemaker_session, datetime_obj):
    """``trial_component_name`` is forwarded to the client as ``TrialComponentName``."""
    trial_names = ("trial-1", "trial-2")
    client = sagemaker_session.sagemaker_client
    client.list_trials.return_value = {
        "TrialSummaries": [
            {
                "TrialName": trial_name,
                "CreationTime": datetime_obj,
                "LastModifiedTime": datetime_obj,
            }
            for trial_name in trial_names
        ]
    }
    expected = [
        TrialSummary(
            trial_name=trial_name, creation_time=datetime_obj, last_modified_time=datetime_obj
        )
        for trial_name in trial_names
    ]

    listed = list(_Trial.list(trial_component_name="tc-foo", sagemaker_session=sagemaker_session))

    assert expected == listed
    client.list_trials.assert_called_with(TrialComponentName="tc-foo")

0 comments on commit 147bdf0

Please sign in to comment.