Skip to content

Commit

Permalink
[AWS SageMaker] Integration tests automation (#3768)
Browse files Browse the repository at this point in the history
* # This is a combination of 5 commits.
# This is the 1st commit message:

Add initial scripts

# This is the commit message #2:

Add working pytest script

# This is the commit message #3:

Add initial scripts

# This is the commit message #4:

Add environment variable files

# This is the commit message #5:

Remove old cluster script

* Add initial scripts

Add working pytest script

Add initial scripts

Add environment variable files

Remove old cluster script

Update pipeline credentials to OIDC

Add initial scripts

Add working pytest script

Add initial scripts

Add working pytest script

* Remove debugging mark

* Update example EKS cluster name

* Remove quiet from Docker build

* Manually pass env

* Update env list vars as string

* Update use array directly

* Update variable array to export

* Update to using read for splitting

* Move to helper script

* Update export from CodeBuild

* Add wait for minio

* Update kubectl wait timeout

* Update minor changes for PR

* Update integration test buildspec to quiet build

* Add region to delete EKS

* Add wait for pods

* Updated README

* Add fixed interval wait

* Fix CodeBuild step order

* Add file lock for experiment ID

* Fix missing pytest parameter

* Update run create only once

* Add filelock to conda env

* Update experiment name ensuring creation each time

* Add try/catch with create experiment

* Remove caching from KFP deployment

* Remove disable KFP caching

* Move .gitignore changes to inside component

* Add blank line to default .gitignore
  • Loading branch information
RedbackThomson authored May 20, 2020
1 parent 4a961ce commit f2a860b
Show file tree
Hide file tree
Showing 18 changed files with 404 additions and 50 deletions.
2 changes: 2 additions & 0 deletions components/aws/sagemaker/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Any environment variable files
**/*/.env
16 changes: 13 additions & 3 deletions components/aws/sagemaker/codebuild/integration-test.buildspec.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,24 @@
version: 0.2

env:
variables:
CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION"

phases:
build:
commands:
- cd components/aws
- docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image --quiet

- cd sagemaker/codebuild/scripts && export CONTAINER_VARIABLE_ARGUMENTS="$(./construct_environment_array.sh)"

# Run the container and copy the results to /tmp
# Passes all host environment variables through to the container
- docker run --name integration-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/integration-test-image
- docker cp integration-test-container:/app/tests/integration_tests/integration_tests.log /tmp/results.xml
# Passes all listed host environment variables through to the container
- docker run --name integration-test-container $(echo $CONTAINER_VARIABLE_ARGUMENTS) amazon/integration-test-image

post_build:
commands:
- docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml
- docker rm -f integration-test-container

reports:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# Converts the space-separated list of environment variable names held in
# $CONTAINER_VARIABLES into `--env NAME` parameters that `docker run` accepts
# and prints them on a single line.
#
# This needs to be its own script for CodeBuild because the array handling
# below does not run in dash - the default shell on the CodeBuild standard
# images.
#
# Environment:
#   CONTAINER_VARIABLES - space-separated variable names (may be unset/empty)
# Outputs:
#   "--env A --env B ..." on stdout; an empty line when no names are given.

function construct_environment_array {
  local -a variable_array=()
  local arguments=""

  # -r keeps backslashes literal; the quoted here-string passes the value
  # through untouched.
  IFS=' ' read -r -a variable_array <<< "${CONTAINER_VARIABLES:-}"

  # With an empty array printf would still emit the format once, producing a
  # dangling `--env` that swallows the next `docker run` argument.
  if (( ${#variable_array[@]} > 0 )); then
    printf -v arguments -- '--env %s ' "${variable_array[@]}"
  fi

  # Drop the trailing separator left behind by the repeated format.
  printf '%s\n' "${arguments% }"
}

construct_environment_array
3 changes: 3 additions & 0 deletions components/aws/sagemaker/codebuild/unit-test.buildspec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ phases:
# Run the container and copy the results to /tmp
# Passes all host environment variables through to the container
- docker run --name unit-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/unit-test-image

post_build:
commands:
- docker cp unit-test-container:/app/tests/unit_tests/unit_tests.log /tmp/results.xml
- docker rm -f unit-test-container

Expand Down
12 changes: 12 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# If you would like to override the credentials for the container
# AWS_ACCESS_KEY_ID=
# AWS_SECRET_ACCESS_KEY=
# AWS_SESSION_TOKEN=

REGION=us-east-1

SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-Example
S3_DATA_BUCKET=my-data-bucket

# If you hope to use an existing EKS cluster, rather than creating a new one.
# EKS_EXISTING_CLUSTER=my-eks-cluster
43 changes: 43 additions & 0 deletions components/aws/sagemaker/tests/integration_tests/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
FROM continuumio/miniconda:4.7.12

RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
wget \
git \
jq

# Install eksctl
RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.19.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp \
&& mv /tmp/eksctl /usr/local/bin

# Install aws-iam-authenticator
RUN curl -S -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.16.8/2020-04-16/bin/linux/amd64/aws-iam-authenticator \
&& chmod +x /usr/local/bin/aws-iam-authenticator

# Install Kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.0/bin/linux/amd64/kubectl \
&& chmod +x ./kubectl \
&& mv ./kubectl /usr/local/bin/kubectl

# Install Argo CLI
RUN curl -sSL -o /usr/local/bin/argo https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64 \
&& chmod +x /usr/local/bin/argo

# Copy conda environment early to avoid cache busting
COPY ./sagemaker/tests/integration_tests/environment.yml environment.yml

# Create conda environment for running tests and set as start-up environment
RUN conda env create -f environment.yml
RUN echo "source activate kfp_test_env" > ~/.bashrc
ENV PATH "/opt/conda/envs/kfp_test_env/bin":$PATH

# Environment variables to be used by tests
ENV REGION="us-west-2"
ENV SAGEMAKER_EXECUTION_ROLE_ARN="arn:aws:iam::1234567890:role/sagemaker-role"
ENV S3_DATA_BUCKET="kfp-test-data"
ENV MINIO_LOCAL_PORT=9000
ENV KFP_NAMESPACE="kubeflow"

COPY ./sagemaker/ .

ENTRYPOINT [ "/bin/bash", "./tests/integration_tests/scripts/run_integration_tests" ]
51 changes: 15 additions & 36 deletions components/aws/sagemaker/tests/integration_tests/README.md
Original file line number Diff line number Diff line change
@@ -1,42 +1,21 @@
## Requirements
1. [Conda](https://docs.conda.io/en/latest/miniconda.html)
1. [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/)
1. Argo CLI: [Mac](https://github.com/argoproj/homebrew-tap), [Linux](https://eksworkshop.com/advanced/410_batch/install/)
1. K8s cluster with Kubeflow pipelines > 0.4.0 installed
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and S3FullAccess
1. IAM User credentials with SageMakerFullAccess permissions
1. [Docker](https://www.docker.com/)
1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with SageMakerFullAccess and AmazonS3FullAccess permissions
1. IAM User credentials with SageMakerFullAccess, AWSCloudFormationFullAccess, IAMFullAccess, AmazonEC2FullAccess, AmazonS3FullAccess permissions

## Creating S3 buckets with datasets

Change the bucket name and run the python script `[s3_sample_data_creator.py](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset)` to create S3 buckets with mnist dataset in the region where you want to run the tests
Change the bucket name in the [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) Python script, then run it to create an S3 bucket with the sample mnist dataset in the region where you want to run the tests.

## Steps to run integration tests
1. Configure AWS credentials with access to EKS cluster
1. Fetch kubeconfig to `~/.kube/config` or set `KUBECONFIG` environment variable to point to kubeconfig of the cluster
1. Create a [secret](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/) named `aws-secret` in kubeflow namespace with credentials of IAM User for SageMakerFullAccess
```yaml
apiVersion: v1
kind: Secret
metadata:
name: aws-secret
namespace: kubeflow
type: Opaque
data:
AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
```
> Note: To get base64 string, run `echo -n $AWS_ACCESS_KEY_ID | base64`
1. Create conda environment using environment.yml for running tests. Run `conda env create -f environment.yml`
1. Activate the conda environment `conda activate kfp_test_env`
1. Run port-forward to minio service in background. Example: `kubectl port-forward svc/minio-service 9000:9000 -n kubeflow &`
1. Provide the following arguments to pytest:
1. `region`: AWS region where test will run. Default - us-west-2
1. `role-arn`: SageMaker execution IAM role ARN
1. `s3-data-bucket`: Regional S3 bucket in which test data is hosted
1. `minio-service-port`: Localhost port to which minio service is mapped to. Default - 9000
1. `kfp-namespace`: Cluster namespace where kubeflow pipelines is installed. Default - Kubeflow
1. cd into this directory and run
```
pytest --region <> --role-arn <> --s3-data-bucket <> --minio-service-port <> --kfp-namespace <>
```
1. Copy the `.env.example` file to `.env` and in the following steps modify the fields of this new file:
1. Configure the AWS credentials fields with those of your IAM User.
1. Update the `SAGEMAKER_EXECUTION_ROLE_ARN` with that of your role created earlier.
1. Update the `S3_DATA_BUCKET` parameter with the name of the bucket created earlier.
1. (Optional) If you have already created an EKS cluster for testing, replace the `EKS_EXISTING_CLUSTER` field with its name.
1. Build the image by doing the following:
1. Navigate to the `components/aws` directory.
1. Run `docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test`
1. Run the image, injecting your environment variable files:
1. Navigate to the `components/aws` directory.
1. Run `docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test`
26 changes: 22 additions & 4 deletions components/aws/sagemaker/tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import utils

from datetime import datetime
from filelock import FileLock


def pytest_addoption(parser):
Expand Down Expand Up @@ -86,12 +87,29 @@ def kfp_client():
kfp_installed_namespace = utils.get_kfp_namespace()
return kfp.Client(namespace=kfp_installed_namespace)


@pytest.fixture(scope="session")
def experiment_id(kfp_client):
exp_name = datetime.now().strftime("%Y-%m-%d")
# Return the ID of the KFP experiment named after the current timestamp.
# The name has minute resolution ("%Y-%m-%d-%H-%M"). The experiment is looked
# up first and only created when the lookup raises ValueError.
def get_experiment_id(kfp_client):
exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M")
try:
experiment = kfp_client.get_experiment(experiment_name=exp_name)
except ValueError:
# No experiment with this name yet, so create it.
experiment = kfp_client.create_experiment(name=exp_name)
return experiment.id

@pytest.fixture(scope="session")
def experiment_id(kfp_client, tmp_path_factory, worker_id):
    """Session-scoped fixture providing one shared KFP experiment ID.

    When the tests are not distributed the experiment is resolved directly.
    Under pytest-xdist each worker runs its own session, so the first worker
    to take the file lock resolves the experiment and writes its ID to a file;
    the other workers read the ID back from that file.
    """
    # pytest-xdist documents the worker_id fixture as returning "master" for
    # non-distributed runs, so a plain falsiness check never matches.
    if not worker_id or worker_id == "master":
        return get_experiment_id(kfp_client)

    # Locking pattern taken from the example at
    # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once
    # get the temp directory shared by all workers
    root_tmp_dir = tmp_path_factory.getbasetemp().parent

    fn = root_tmp_dir / "experiment_id"
    with FileLock(str(fn) + ".lock"):
        if fn.is_file():
            # Another worker already resolved the experiment.
            data = fn.read_text()
        else:
            data = get_experiment_id(kfp_client)
            fn.write_text(data)
    return data
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies:
- pyyaml=5.3.*
- flake8=3.7.*
- flake8-black=0.1.*
- filelock=3.0.*
- pip:
- kubernetes==11.0.*
- kfp==0.5.*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Arguments:
variant_name_1: variant-1
instance_type_1: ml.m4.xlarge
initial_instance_count_1: 1
initial_variant_weight_1: 1.0
network_isolation: "True"
role: ((ROLE_ARN))

Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def create_endpoint_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)

sagemaker_deploy_op(
region=region,
Expand All @@ -46,7 +46,7 @@ def create_endpoint_pipeline(
instance_type_1=instance_type_1,
initial_instance_count_1=initial_instance_count_1,
initial_variant_weight_1=initial_variant_weight_1,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def create_model_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def hpo_pipeline(
network_isolation=network_isolation,
max_wait_time=max_wait_time,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def training_pipeline(
max_wait_time=max_wait_time,
checkpoint_config=checkpoint_config,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def batch_transform_pipeline(
model_artifact_url=model_artifact_url,
network_isolation=network_isolation,
role=role,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)

sagemaker_batch_transform_op(
region=region,
Expand All @@ -57,7 +57,7 @@ def batch_transform_pipeline(
split_type=split_type,
compression_type=compression_type,
output_location=output_location,
).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"))
)


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash

# Helper script to generate an IAM Role needed to install role-based authentication to a KFP service account.
#
# Run as:
# $ ./generate_iam_role ${cluster_arn/cluster_name} ${role_name} ${cluster_region} [optional: ${service_namespace} ${service_account}]
#

# Positional arguments; everything after the role name falls back to a default.
CLUSTER_ARN="${1}"
ROLE_NAME="${2}"
CLUSTER_REGION="${3:-us-east-1}"
SERVICE_NAMESPACE="${4:-kubeflow}"
SERVICE_ACCOUNT="${5:-pipeline-runner}"
# Account ID reported by STS for the credentials currently in use.
aws_account=$(aws sts get-caller-identity --query Account --output text)
trustfile="trust.json"

# Directory holding this script, used later to locate generate_trust_policy.
# Every level is quoted so that a path containing spaces stays intact.
cwd="$(dirname "$(realpath "$0")")"

# if using an existing cluster, use the cluster arn to get the region and cluster name
# example, cluster_arn=arn:aws:eks:us-east-1:12345678910:cluster/test
# A value without a '/' (a bare cluster name) passes through cut unchanged.
cluster_name=$(printf '%s\n' "${CLUSTER_ARN}" | cut -d'/' -f2)

# A function to get the OIDC_ID associated with an EKS cluster.
# Associates an IAM OIDC provider with the cluster, then reads the issuer URL
# from `aws eks describe-cluster` and keeps its final path segment.
# Globals:
#   cluster_name, CLUSTER_REGION (read); oidc_id (written)
# Returns:
#   0 on success, 1 (with oidc_id left empty) when the cluster lookup fails.
function get_oidc_id {
  # TODO: Ideally this should be based on version compatibility instead of command failure
  # Try the `--cluster` flag first and fall back to `--name` if that fails.
  if ! eksctl utils associate-iam-oidc-provider --cluster "${cluster_name}" --region "${CLUSTER_REGION}" --approve; then
    if ! eksctl utils associate-iam-oidc-provider --name "${cluster_name}" --region "${CLUSTER_REGION}" --approve; then
      # Best effort, as before: carry on to the lookup, but say so.
      echo "Warning: could not associate an IAM OIDC provider with ${cluster_name}" >&2
    fi
  fi

  # Declared separately so that `local` does not hide the exit status of aws.
  local oidc
  if ! oidc=$(aws eks describe-cluster --name "${cluster_name}" --region "${CLUSTER_REGION}" --query cluster.identity.oidc.issuer --output text); then
    oidc_id=""
    return 1
  fi

  # Everything after the last '/' of the issuer URL.
  oidc_id="${oidc##*/}"
}

# A function that generates an IAM role for the given account, cluster, namespace, region.
# An existing role named ${ROLE_NAME} is reused as-is; otherwise the role is
# created from the trust file and AmazonSageMakerFullAccess is attached to it.
# Globals:
#   ROLE_NAME (read)
# Parameter:
#   $1: Name of the trust file holding the assume-role policy document.
# Returns:
#   0 when the role already exists or was created with the policy attached;
#   the failing aws command's status otherwise.
function create_namespaced_iam_role {
  local trustfile="${1}"

  # Check if role already exists
  if aws iam get-role --role-name "${ROLE_NAME}"; then
    echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding."
    return 0
  fi

  echo "IAM Role does not exist, creating a new Role for the cluster"
  # Do not try to attach a policy to a role that could not be created.
  aws iam create-role --role-name "${ROLE_NAME}" --assume-role-policy-document "file://${trustfile}" --output=text --query "Role.Arn" || return
  aws iam attach-role-policy --role-name "${ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
}

# Remove the generated trust file.
# A file that does not exist is not an error: the script calls this before
# the trust file has been generated as well as afterwards.
# Parameter:
#   $1: Name of the trust file to delete.
function delete_generated_file {
  # -f stays quiet about a missing file; -- protects names starting with '-'.
  rm -f -- "${1}"
}

# Main flow: resolve the OIDC ID, render the trust policy into ${trustfile},
# create the role from it, then remove the generated file again.
echo "Get the OIDC ID for the cluster"
get_oidc_id
# An empty OIDC ID would give the trust policy a blank issuer, so stop before
# creating a role from it.
if [[ -z "${oidc_id}" ]]; then
  echo "Unable to determine the OIDC ID for cluster '${cluster_name}'" >&2
  exit 1
fi
echo "Delete the trust json file if it already exists"
delete_generated_file "${trustfile}"
echo "Generate a trust json"
# Each argument is quoted so an empty or spaced value cannot shift the others.
"$cwd"/generate_trust_policy "${CLUSTER_REGION}" "${aws_account}" "${oidc_id}" "${SERVICE_NAMESPACE}" "${SERVICE_ACCOUNT}" > "${trustfile}"
echo "Create the IAM Role using these values"
create_namespaced_iam_role "${trustfile}"
echo "Cleanup for the next run"
delete_generated_file "${trustfile}"

Loading

0 comments on commit f2a860b

Please sign in to comment.