
[AWS SageMaker] Integration tests automation #3768

Merged

Commits (31)
092661d
# This is a combination of 5 commits.
May 13, 2020
b3d51d7
Add initial scripts
May 15, 2020
003322b
Remove debugging mark
May 15, 2020
3013bfc
Update example EKS cluster name
May 15, 2020
fbeadb3
Remove quiet from Docker build
May 15, 2020
d0a1a16
Manually pass env
May 15, 2020
21dc284
Update env list vars as string
May 15, 2020
c5761ee
Update use array directly
May 15, 2020
46ef129
Update variable array to export
May 15, 2020
6750d1e
Update to using read for splitting
May 15, 2020
46c87d3
Move to helper script
May 15, 2020
1a301e6
Update export from CodeBuild
May 15, 2020
68cfa0d
Add wait for minio
May 15, 2020
2f20e46
Update kubectl wait timeout
May 16, 2020
fd2e758
Update minor changes for PR
May 18, 2020
5aba2ba
Update integration test buildspec to quiet build
May 18, 2020
97c63ab
Add region to delete EKS
May 18, 2020
aeb4709
Add wait for pods
May 18, 2020
86c9436
Updated README
May 18, 2020
e2b2665
Add fixed interval wait
May 18, 2020
3907aac
Fix CodeBuild step order
May 18, 2020
fa4d941
Add file lock for experiment ID
May 18, 2020
ecf9cf1
Fix missing pytest parameter
May 18, 2020
fdfbcf8
Update run create only once
May 19, 2020
02ee94f
Add filelock to conda env
May 19, 2020
4c91d73
Update experiment name ensuring creation each time
May 19, 2020
951a63c
Add try/catch with create experiment
May 19, 2020
c1e0d83
Remove caching from KFP deployment
May 19, 2020
4ae6389
Remove disable KFP caching
May 19, 2020
5414ba3
Move .gitignore changes to inside component
May 19, 2020
3a4c00d
Add blank line to default .gitignore
May 19, 2020
2 changes: 2 additions & 0 deletions components/aws/sagemaker/.gitignore
@@ -0,0 +1,2 @@
# Any environment variable files
**/*/.env
16 changes: 13 additions & 3 deletions components/aws/sagemaker/codebuild/integration-test.buildspec.yml
@@ -1,14 +1,24 @@
version: 0.2

env:
variables:
CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION"

phases:
build:
commands:
- cd components/aws
- docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image --quiet

- cd sagemaker/codebuild/scripts && export CONTAINER_VARIABLE_ARGUMENTS="$(./construct_environment_array.sh)"

# Run the container and copy the results to /tmp
# Passes all host environment variables through to the container
- docker run --name integration-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/integration-test-image
- docker cp integration-test-container:/app/tests/integration_tests/integration_tests.log /tmp/results.xml
# Passes all listed host environment variables through to the container
- docker run --name integration-test-container $(echo $CONTAINER_VARIABLE_ARGUMENTS) amazon/integration-test-image

post_build:
commands:
- docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml
- docker rm -f integration-test-container

reports:
10 changes: 10 additions & 0 deletions components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# This script breaks up a string of environment variable names into a list of
# parameters that `docker run` accepts. It has to live in a separate bash
# script because these commands do not run in dash - the default shell on the
# CodeBuild standard images.

IFS=' ' read -a variable_array <<< $CONTAINER_VARIABLES
printf -v CONTAINER_VARIABLE_ARGUMENTS -- "--env %s " "${variable_array[@]}"
echo $CONTAINER_VARIABLE_ARGUMENTS
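
For illustration, a minimal sketch of what the helper produces (the variable names below are placeholders):

```bash
#!/usr/bin/env bash
# Hypothetical invocation of construct_environment_array.sh.
export CONTAINER_VARIABLES="REGION S3_DATA_BUCKET"   # names only, no values
IFS=' ' read -a variable_array <<< "$CONTAINER_VARIABLES"
printf -v CONTAINER_VARIABLE_ARGUMENTS -- "--env %s " "${variable_array[@]}"
echo "$CONTAINER_VARIABLE_ARGUMENTS"
# Prints: --env REGION --env S3_DATA_BUCKET
```

Passing `--env NAME` without a value tells `docker run` to copy that variable's value from the host environment, which is how the buildspec forwards only the listed variables into the test container.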
3 changes: 3 additions & 0 deletions components/aws/sagemaker/codebuild/unit-test.buildspec.yml
@@ -8,6 +8,9 @@ phases:
# Run the container and copy the results to /tmp
# Passes all host environment variables through to the container
- docker run --name unit-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/unit-test-image

post_build:
commands:
- docker cp unit-test-container:/app/tests/unit_tests/unit_tests.log /tmp/results.xml
- docker rm -f unit-test-container

12 changes: 12 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/.env.example
@@ -0,0 +1,12 @@
# If you would like to override the credentials for the container
# AWS_ACCESS_KEY_ID=
# AWS_SECRET_ACCESS_KEY=
# AWS_SESSION_TOKEN=

REGION=us-east-1

SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-Example
S3_DATA_BUCKET=my-data-bucket

# If you want to use an existing EKS cluster rather than creating a new one:
# EKS_EXISTING_CLUSTER=my-eks-cluster
43 changes: 43 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/Dockerfile
@@ -0,0 +1,43 @@
FROM continuumio/miniconda:4.7.12

RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
wget \
git \
jq

# Install eksctl
RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.19.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp \
&& mv /tmp/eksctl /usr/local/bin

# Install aws-iam-authenticator
RUN curl -S -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.16.8/2020-04-16/bin/linux/amd64/aws-iam-authenticator \
&& chmod +x /usr/local/bin/aws-iam-authenticator

# Install Kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.0/bin/linux/amd64/kubectl \
&& chmod +x ./kubectl \
&& mv ./kubectl /usr/local/bin/kubectl

# Install Argo CLI
RUN curl -sSL -o /usr/local/bin/argo https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64 \
&& chmod +x /usr/local/bin/argo

# Copy conda environment early to avoid cache busting
COPY ./sagemaker/tests/integration_tests/environment.yml environment.yml

# Create conda environment for running tests and set as start-up environment
RUN conda env create -f environment.yml
RUN echo "source activate kfp_test_env" > ~/.bashrc
ENV PATH "/opt/conda/envs/kfp_test_env/bin":$PATH

# Environment variables to be used by tests
ENV REGION="us-west-2"
ENV SAGEMAKER_EXECUTION_ROLE_ARN="arn:aws:iam::1234567890:role/sagemaker-role"
ENV S3_DATA_BUCKET="kfp-test-data"
ENV MINIO_LOCAL_PORT=9000
ENV KFP_NAMESPACE="kubeflow"

COPY ./sagemaker/ .
Contributor: Can we copy the samples directory as well into this? We will be adding tests for the examples soon. Leave it for now if the change is time-consuming.

Contributor Author: Will leave for now - can we add this as a subtask of adding samples to our integration tests?

ENTRYPOINT [ "/bin/bash", "./tests/integration_tests/scripts/run_integration_tests" ]
51 changes: 15 additions & 36 deletions components/aws/sagemaker/tests/integration_tests/README.md
@@ -1,42 +1,21 @@
## Requirements
1. [Conda](https://docs.conda.io/en/latest/miniconda.html)
1. [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
1. Argo CLI: [Mac](https://github.com/argoproj/homebrew-tap), [Linux](https://eksworkshop.com/advanced/410_batch/install/)
1. K8s cluster with Kubeflow pipelines > 0.4.0 installed
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and S3FullAccess
1. IAM User credentials with SageMakerFullAccess permissions
1. [Docker](https://www.docker.com/)
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with the SageMakerFullAccess and AmazonS3FullAccess policies
1. IAM User credentials with SageMakerFullAccess, AWSCloudFormationFullAccess, IAMFullAccess, AmazonEC2FullAccess, AmazonS3FullAccess permissions

## Creating S3 buckets with datasets

Change the bucket name and run the python script `[s3_sample_data_creator.py](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset)` to create S3 buckets with mnist dataset in the region where you want to run the tests
Change the bucket name inside [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) and run the script to create an S3 bucket with the sample MNIST dataset in the region where you want to run the tests.
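
A hedged sketch of that step (the path is taken from the linked sample; edit the bucket name inside the script first):

```bash
# Assumes a checkout of the kubeflow/pipelines repository and AWS credentials
# that can create S3 buckets in the target region.
cd samples/contrib/aws-samples/mnist-kmeans-sagemaker
python s3_sample_data_creator.py
```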

## Steps to run integration tests
1. Configure AWS credentials with access to EKS cluster
1. Fetch kubeconfig to `~/.kube/config` or set `KUBECONFIG` environment variable to point to kubeconfig of the cluster
1. Create a [secret](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/) named `aws-secret` in kubeflow namespace with credentials of IAM User for SageMakerFullAccess
```yaml
apiVersion: v1
kind: Secret
metadata:
name: aws-secret
namespace: kubeflow
type: Opaque
data:
AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
```

> Note: To get base64 string, run `echo -n $AWS_ACCESS_KEY_ID | base64`
1. Create conda environment using environment.yml for running tests. Run `conda env create -f environment.yml`
1. Activate the conda environment `conda activate kfp_test_env`
1. Run port-forward to minio service in background. Example: `kubectl port-forward svc/minio-service 9000:9000 -n kubeflow &`
1. Provide the following arguments to pytest:
1. `region`: AWS region where test will run. Default - us-west-2
1. `role-arn`: SageMaker execution IAM role ARN
1. `s3-data-bucket`: Regional S3 bucket in which test data is hosted
1. `minio-service-port`: Localhost port to which minio service is mapped to. Default - 9000
1. `kfp-namespace`: Cluster namespace where kubeflow pipelines is installed. Default - Kubeflow
1. cd into this directory and run
```
pytest --region <> --role-arn <> --s3-data-bucket <> --minio-service-port <> --kfp-namespace <>
```
1. Copy the `.env.example` file to `.env` and in the following steps modify the fields of this new file:
1. Configure the AWS credentials fields with those of your IAM User.
1. Update the `SAGEMAKER_EXECUTION_ROLE_ARN` with that of your role created earlier.
1. Update the `S3_DATA_BUCKET` parameter with the name of the bucket created earlier.
1. (Optional) If you have already created an EKS cluster for testing, replace the `EKS_EXISTING_CLUSTER` field with its name.
1. Build the image by doing the following:
1. Navigate to the `components/aws` directory.
1. Run `docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test`
1. Run the image, injecting your environment variable files:
1. Navigate to the `components/aws` directory.
1. Run `docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test`
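
Putting those steps together, a minimal end-to-end sketch (all values are placeholders for your own configuration):

```bash
# From the repository root.
cd components/aws
cp sagemaker/tests/integration_tests/.env.example sagemaker/tests/integration_tests/.env
# Edit .env: fill in the AWS credentials, SAGEMAKER_EXECUTION_ROLE_ARN,
# S3_DATA_BUCKET, and optionally EKS_EXISTING_CLUSTER.
docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test
docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test
```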
26 changes: 22 additions & 4 deletions components/aws/sagemaker/tests/integration_tests/conftest.py
@@ -5,6 +5,7 @@
import utils

from datetime import datetime
from filelock import FileLock


def pytest_addoption(parser):
@@ -86,12 +87,29 @@ def kfp_client():
kfp_installed_namespace = utils.get_kfp_namespace()
return kfp.Client(namespace=kfp_installed_namespace)


@pytest.fixture(scope="session")
def experiment_id(kfp_client):
exp_name = datetime.now().strftime("%Y-%m-%d")
def get_experiment_id(kfp_client):
exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M")
try:
experiment = kfp_client.get_experiment(experiment_name=exp_name)
except ValueError:
experiment = kfp_client.create_experiment(name=exp_name)
return experiment.id

@pytest.fixture(scope="session")
def experiment_id(kfp_client, tmp_path_factory, worker_id):
if not worker_id:
return get_experiment_id(kfp_client)

# Locking pattern taken from
# https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once
# get the temp directory shared by all workers
root_tmp_dir = tmp_path_factory.getbasetemp().parent

fn = root_tmp_dir / "experiment_id"
with FileLock(str(fn) + ".lock"):
if fn.is_file():
data = fn.read_text()
else:
data = get_experiment_id(kfp_client)
fn.write_text(data)
return data
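
For context, the lock matters when the suite runs under pytest-xdist: each worker resolves the session-scoped fixture independently, so without the lock several workers could race to create the same experiment. A hedged sketch of such a run (flag values are placeholders; `-n` requires the pytest-xdist plugin):

```bash
# Two xdist workers share one experiment_id via the file lock in the
# common temp directory.
pytest -n 2 --region us-west-2 \
  --role-arn arn:aws:iam::123456789012:role/sagemaker-role \
  --s3-data-bucket my-data-bucket
```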
1 change: 1 addition & 0 deletions components/aws/sagemaker/tests/integration_tests/environment.yml
@@ -12,6 +12,7 @@ dependencies:
- pyyaml=5.3.*
- flake8=3.7.*
- flake8-black=0.1.*
- filelock=3.0.*
- pip:
- kubernetes==11.0.*
- kfp==0.5.*
@@ -15,6 +15,7 @@ Arguments:
variant_name_1: variant-1
instance_type_1: ml.m4.xlarge
initial_instance_count_1: 1
initial_variant_weight_1: 1.0
network_isolation: "True"
role: ((ROLE_ARN))

@@ -34,7 +34,7 @@ def create_endpoint_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)

sagemaker_deploy_op(
region=region,
@@ -46,7 +46,7 @@
instance_type_1=instance_type_1,
initial_instance_count_1=initial_instance_count_1,
initial_variant_weight_1=initial_variant_weight_1,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
@@ -26,7 +26,7 @@ def create_model_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
@@ -56,7 +56,7 @@ def hpo_pipeline(
network_isolation=network_isolation,
max_wait_time=max_wait_time,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
@@ -46,7 +46,7 @@ def training_pipeline(
max_wait_time=max_wait_time,
checkpoint_config=checkpoint_config,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
@@ -40,7 +40,7 @@ def batch_transform_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)

sagemaker_batch_transform_op(
region=region,
Expand All @@ -57,7 +57,7 @@ def batch_transform_pipeline(
split_type=split_type,
compression_type=compression_type,
output_location=output_location,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)

Contributor: This is awesome, thanks for adding it :)


if __name__ == "__main__":
68 changes: 68 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Helper script to generate an IAM Role needed to install role-based authentication to a KFP service account.
#
# Run as:
# $ ./generate_iam_role ${cluster_arn/cluster_name} ${role_name} ${cluster_region} [optional: ${service_namespace} ${service_account}]
#

CLUSTER_ARN="${1}"
ROLE_NAME="${2}"
CLUSTER_REGION="${3:-us-east-1}"
SERVICE_NAMESPACE="${4:-kubeflow}"
SERVICE_ACCOUNT="${5:-pipeline-runner}"
aws_account=$(aws sts get-caller-identity --query Account --output text)
trustfile="trust.json"

cwd=$(dirname $(realpath $0))

# If using an existing cluster, extract the cluster name from the cluster ARN
# e.g. cluster_arn=arn:aws:eks:us-east-1:12345678910:cluster/test -> cluster name "test"
cluster_name=$(echo ${CLUSTER_ARN} | cut -d'/' -f2)

# A function to get the OIDC_ID associated with an EKS cluster
function get_oidc_id {
# TODO: Ideally this should be based on version compatibility instead of command failure
eksctl utils associate-iam-oidc-provider --cluster ${cluster_name} --region ${CLUSTER_REGION} --approve
if [[ $? -ge 1 ]]; then
eksctl utils associate-iam-oidc-provider --name ${cluster_name} --region ${CLUSTER_REGION} --approve
fi

local oidc=$(aws eks describe-cluster --name ${cluster_name} --region ${CLUSTER_REGION} --query cluster.identity.oidc.issuer --output text)
oidc_id=$(echo ${oidc} | rev | cut -d'/' -f1 | rev)
}

# A function that generates an IAM role for the given account, cluster, namespace, region
# Parameter:
# $1: Name of the trust file to generate.
function create_namespaced_iam_role {
local trustfile="${1}"
# Check if role already exists
aws iam get-role --role-name ${ROLE_NAME}
if [[ $? -eq 0 ]]; then
echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding."
else
echo "IAM Role does not exist, creating a new Role for the cluster"
aws iam create-role --role-name ${ROLE_NAME} --assume-role-policy-document file://${trustfile} --output=text --query "Role.Arn"
aws iam attach-role-policy --role-name ${ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
fi
}

# Remove the generated trust file
# Parameter:
# $1: Name of the trust file to delete.
function delete_generated_file {
rm "${1}"
}

echo "Get the OIDC ID for the cluster"
get_oidc_id
echo "Delete the trust json file if it already exists"
delete_generated_file "${trustfile}"
echo "Generate a trust json"
"$cwd"/generate_trust_policy ${CLUSTER_REGION} ${aws_account} ${oidc_id} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > "${trustfile}"
echo "Create the IAM Role using these values"
create_namespaced_iam_role "${trustfile}"
echo "Cleanup for the next run"
delete_generated_file "${trustfile}"
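
A hedged usage sketch (the ARN, role name, and region below are placeholders; the namespace and service account fall back to the script defaults of `kubeflow` and `pipeline-runner`):

```bash
# Create (or reuse) an IAM role assumable by the pipeline-runner service
# account through the cluster's OIDC identity provider.
./generate_iam_role \
  arn:aws:eks:us-east-1:123456789012:cluster/my-test-cluster \
  kfp-example-pod-role \
  us-east-1
```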
