diff --git a/components/aws/sagemaker/.gitignore b/components/aws/sagemaker/.gitignore new file mode 100644 index 00000000000..58c9068fc5c --- /dev/null +++ b/components/aws/sagemaker/.gitignore @@ -0,0 +1,2 @@ +# Any environment variable files +**/*/.env \ No newline at end of file diff --git a/components/aws/sagemaker/codebuild/integration-test.buildspec.yml b/components/aws/sagemaker/codebuild/integration-test.buildspec.yml index 0ca12b06c61..09dafe53cd4 100644 --- a/components/aws/sagemaker/codebuild/integration-test.buildspec.yml +++ b/components/aws/sagemaker/codebuild/integration-test.buildspec.yml @@ -1,14 +1,24 @@ version: 0.2 + +env: + variables: + CONTAINER_VARIABLES: "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI EKS_PRIVATE_SUBNETS EKS_PUBLIC_SUBNETS PYTEST_ADDOPTS S3_DATA_BUCKET EKS_EXISTING_CLUSTER SAGEMAKER_EXECUTION_ROLE_ARN REGION" + phases: build: commands: - cd components/aws - docker build . -f ./sagemaker/tests/integration_tests/Dockerfile -t amazon/integration-test-image --quiet + - cd sagemaker/codebuild/scripts && export CONTAINER_VARIABLE_ARGUMENTS="$(./construct_environment_array.sh)" + # Run the container and copy the results to /tmp - # Passes all host environment variables through to the container - - docker run --name integration-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/integration-test-image - - docker cp integration-test-container:/app/tests/integration_tests/integration_tests.log /tmp/results.xml + # Passes all listed host environment variables through to the container + - docker run --name integration-test-container $(echo $CONTAINER_VARIABLE_ARGUMENTS) amazon/integration-test-image + + post_build: + commands: + - docker cp integration-test-container:/tests/integration_tests/integration_tests.log /tmp/results.xml - docker rm -f integration-test-container reports: diff --git a/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh 
b/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh new file mode 100755 index 00000000000..249108d8bf5 --- /dev/null +++ b/components/aws/sagemaker/codebuild/scripts/construct_environment_array.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +# This script breaks up a string of environment variable names into a list of +# parameters that `docker run` accepts. This needs to be made into a script +# for CodeBuild because these commands do not run in dash - the default terminal +# on the CodeBuild standard images. + +IFS=' ' read -a variable_array <<< $CONTAINER_VARIABLES +printf -v CONTAINER_VARIABLE_ARGUMENTS -- "--env %s " "${variable_array[@]}" +echo $CONTAINER_VARIABLE_ARGUMENTS \ No newline at end of file diff --git a/components/aws/sagemaker/codebuild/unit-test.buildspec.yml b/components/aws/sagemaker/codebuild/unit-test.buildspec.yml index a366094bfa4..4d688494470 100644 --- a/components/aws/sagemaker/codebuild/unit-test.buildspec.yml +++ b/components/aws/sagemaker/codebuild/unit-test.buildspec.yml @@ -8,6 +8,9 @@ phases: # Run the container and copy the results to /tmp # Passes all host environment variables through to the container - docker run --name unit-test-container $(env | cut -f1 -d= | sed 's/^/-e /') amazon/unit-test-image + + post_build: + commands: - docker cp unit-test-container:/app/tests/unit_tests/unit_tests.log /tmp/results.xml - docker rm -f unit-test-container diff --git a/components/aws/sagemaker/tests/integration_tests/.env.example b/components/aws/sagemaker/tests/integration_tests/.env.example new file mode 100644 index 00000000000..33c04cd60f8 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/.env.example @@ -0,0 +1,12 @@ +# If you would like to override the credentials for the container +# AWS_ACCESS_KEY_ID= +# AWS_SECRET_ACCESS_KEY= +# AWS_SESSION_TOKEN= + +REGION=us-east-1 + +SAGEMAKER_EXECUTION_ROLE_ARN=arn:aws:iam::123456789012:role/service-role/AmazonSageMaker-ExecutionRole-Example 
+S3_DATA_BUCKET=my-data-bucket + +# If you hope to use an existing EKS cluster, rather than creating a new one. +# EKS_EXISTING_CLUSTER=my-eks-cluster \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/Dockerfile b/components/aws/sagemaker/tests/integration_tests/Dockerfile new file mode 100644 index 00000000000..75c66f8c1bd --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/Dockerfile @@ -0,0 +1,43 @@ +FROM continuumio/miniconda:4.7.12 + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + wget \ + git \ + jq + +# Install eksctl +RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.19.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp \ + && mv /tmp/eksctl /usr/local/bin + +# Install aws-iam-authenticator +RUN curl -S -o /usr/local/bin/aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.16.8/2020-04-16/bin/linux/amd64/aws-iam-authenticator \ + && chmod +x /usr/local/bin/aws-iam-authenticator + +# Install Kubectl +RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.18.0/bin/linux/amd64/kubectl \ + && chmod +x ./kubectl \ + && mv ./kubectl /usr/local/bin/kubectl + +# Install Argo CLI +RUN curl -sSL -o /usr/local/bin/argo https://github.com/argoproj/argo/releases/download/v2.8.0/argo-linux-amd64 \ + && chmod +x /usr/local/bin/argo + +# Copy conda environment early to avoid cache busting +COPY ./sagemaker/tests/integration_tests/environment.yml environment.yml + +# Create conda environment for running tests and set as start-up environment +RUN conda env create -f environment.yml +RUN echo "source activate kfp_test_env" > ~/.bashrc +ENV PATH "/opt/conda/envs/kfp_test_env/bin":$PATH + +# Environment variables to be used by tests +ENV REGION="us-west-2" +ENV SAGEMAKER_EXECUTION_ROLE_ARN="arn:aws:iam::1234567890:role/sagemaker-role" +ENV S3_DATA_BUCKET="kfp-test-data" +ENV MINIO_LOCAL_PORT=9000 +ENV KFP_NAMESPACE="kubeflow" + 
+COPY ./sagemaker/ . + +ENTRYPOINT [ "/bin/bash", "./tests/integration_tests/scripts/run_integration_tests" ] \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/README.md b/components/aws/sagemaker/tests/integration_tests/README.md index 898d666d61a..e43fbe525b1 100644 --- a/components/aws/sagemaker/tests/integration_tests/README.md +++ b/components/aws/sagemaker/tests/integration_tests/README.md @@ -1,42 +1,21 @@ ## Requirements -1. [Conda](https://docs.conda.io/en/latest/miniconda.html) -1. [Kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) -1. Argo CLI: [Mac](https://github.com/argoproj/homebrew-tap), [Linux](https://eksworkshop.com/advanced/410_batch/install/) -1. K8s cluster with Kubeflow pipelines > 0.4.0 installed -1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and S3FullAccess -1. IAM User credentials with SageMakerFullAccess permissions +1. [Docker](https://www.docker.com/) +1. [IAM Role](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) with a SageMakerFullAccess and AmazonS3FullAccess +1. IAM User credentials with SageMakerFullAccess, AWSCloudFormationFullAccess, IAMFullAccess, AmazonEC2FullAccess, AmazonS3FullAccess permissions ## Creating S3 buckets with datasets -Change the bucket name and run the python script `[s3_sample_data_creator.py](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset)` to create S3 buckets with mnist dataset in the region where you want to run the tests +In the following Python script, change the bucket name and run the [`s3_sample_data_creator.py`](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/mnist-kmeans-sagemaker#the-sample-dataset) to create an S3 bucket with the sample mnist dataset in the region where you want to run the tests. ## Step to run integration tests -1. 
Configure AWS credentials with access to EKS cluster -1. Fetch kubeconfig to `~/.kube/config` or set `KUBECONFIG` environment variable to point to kubeconfig of the cluster -1. Create a [secret](https://kubernetes.io/docs/tasks/inject-data-application/distribute-credentials-secure/) named `aws-secret` in kubeflow namespace with credentials of IAM User for SageMakerFullAccess - ```yaml - apiVersion: v1 - kind: Secret - metadata: - name: aws-secret - namespace: kubeflow - type: Opaque - data: - AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY - AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS - ``` - - > Note: To get base64 string, run `echo -n $AWS_ACCESS_KEY_ID | base64` -1. Create conda environment using environment.yml for running tests. Run `conda env create -f environment.yml` -1. Activate the conda environment `conda activate kfp_test_env` -1. Run port-forward to minio service in background. Example: `kubectl port-forward svc/minio-service 9000:9000 -n kubeflow &` -1. Provide the following arguments to pytest: - 1. `region`: AWS region where test will run. Default - us-west-2 - 1. `role-arn`: SageMaker execution IAM role ARN - 1. `s3-data-bucket`: Regional S3 bucket in which test data is hosted - 1. `minio-service-port`: Localhost port to which minio service is mapped to. Default - 9000 - 1. `kfp-namespace`: Cluster namespace where kubeflow pipelines is installed. Default - Kubeflow -1. cd into this directory and run - ``` - pytest --region <> --role-arn <> --s3-data-bucket <> --minio-service-port <> --kfp-namespace <> - ``` +1. Copy the `.env.example` file to `.env` and in the following steps modify the fields of this new file: + 1. Configure the AWS credentials fields with those of your IAM User. + 1. Update the `SAGEMAKER_EXECUTION_ROLE_ARN` with that of your role created earlier. + 1. Update the `S3_DATA_BUCKET` parameter with the name of the bucket created earlier. + 1. 
(Optional) If you have already created an EKS cluster for testing, replace the `EKS_EXISTING_CLUSTER` field with its name. +1. Build the image by doing the following: + 1. Navigate to the `components/aws` directory. + 1. Run `docker build . -f sagemaker/tests/integration_tests/Dockerfile -t amazon/integration_test` +1. Run the image, injecting your environment variable files: + 1. Navigate to the `components/aws` directory. + 1. Run `docker run --env-file sagemaker/tests/integration_tests/.env amazon/integration_test` \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/conftest.py b/components/aws/sagemaker/tests/integration_tests/conftest.py index 47e6cb9ea40..52c29656cc2 100644 --- a/components/aws/sagemaker/tests/integration_tests/conftest.py +++ b/components/aws/sagemaker/tests/integration_tests/conftest.py @@ -5,6 +5,7 @@ import utils from datetime import datetime +from filelock import FileLock def pytest_addoption(parser): @@ -86,12 +87,29 @@ def kfp_client(): kfp_installed_namespace = utils.get_kfp_namespace() return kfp.Client(namespace=kfp_installed_namespace) - -@pytest.fixture(scope="session") -def experiment_id(kfp_client): - exp_name = datetime.now().strftime("%Y-%m-%d") +def get_experiment_id(kfp_client): + exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M") try: experiment = kfp_client.get_experiment(experiment_name=exp_name) except ValueError: experiment = kfp_client.create_experiment(name=exp_name) return experiment.id + +@pytest.fixture(scope="session") +def experiment_id(kfp_client, tmp_path_factory, worker_id): + if not worker_id: + return get_experiment_id(kfp_client) + + # Locking taken as an example from + # https://github.com/pytest-dev/pytest-xdist#making-session-scoped-fixtures-execute-only-once + # get the temp directory shared by all workers + root_tmp_dir = tmp_path_factory.getbasetemp().parent + + fn = root_tmp_dir / "experiment_id" + with FileLock(str(fn) + ".lock"): + if fn.is_file(): + data = 
fn.read_text() + else: + data = get_experiment_id(kfp_client) + fn.write_text(data) + return data \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/environment.yml b/components/aws/sagemaker/tests/integration_tests/environment.yml index 565777dc8db..90c7645bc61 100644 --- a/components/aws/sagemaker/tests/integration_tests/environment.yml +++ b/components/aws/sagemaker/tests/integration_tests/environment.yml @@ -12,6 +12,7 @@ dependencies: - pyyaml=5.3.* - flake8=3.7.* - flake8-black=0.1.* + - filelock=3.0.* - pip: - kubernetes==11.0.* - kfp==0.5.* diff --git a/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml b/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml index e961a588b97..f4a413c8281 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml +++ b/components/aws/sagemaker/tests/integration_tests/resources/config/kmeans-mnist-endpoint/config.yaml @@ -15,6 +15,7 @@ Arguments: variant_name_1: variant-1 instance_type_1: ml.m4.xlarge initial_instance_count_1: 1 + initial_variant_weight_1: 1.0 network_isolation: "True" role: ((ROLE_ARN)) \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py index 801b3458f41..8b28e52eac8 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_endpoint_pipeline.py @@ -34,7 +34,7 @@ def create_endpoint_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) sagemaker_deploy_op( region=region, @@ 
-46,7 +46,7 @@ def create_endpoint_pipeline( instance_type_1=instance_type_1, initial_instance_count_1=initial_instance_count_1, initial_variant_weight_1=initial_variant_weight_1, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py index a7fa0afe057..75f4f6a26e3 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/create_model_pipeline.py @@ -26,7 +26,7 @@ def create_model_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py index 721658355e8..cd1a50fb57a 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/hpo_pipeline.py @@ -56,7 +56,7 @@ def hpo_pipeline( network_isolation=network_isolation, max_wait_time=max_wait_time, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py index e69d103e564..ad8eab23bff 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py +++ 
b/components/aws/sagemaker/tests/integration_tests/resources/definition/training_pipeline.py @@ -46,7 +46,7 @@ def training_pipeline( max_wait_time=max_wait_time, checkpoint_config=checkpoint_config, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py b/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py index 8ac879f81c1..e8b38697f3a 100644 --- a/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py +++ b/components/aws/sagemaker/tests/integration_tests/resources/definition/transform_job_pipeline.py @@ -40,7 +40,7 @@ def batch_transform_pipeline( model_artifact_url=model_artifact_url, network_isolation=network_isolation, role=role, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) sagemaker_batch_transform_op( region=region, @@ -57,7 +57,7 @@ def batch_transform_pipeline( split_type=split_type, compression_type=compression_type, output_location=output_location, - ).apply(use_aws_secret("aws-secret", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")) + ) if __name__ == "__main__": diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role b/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role new file mode 100755 index 00000000000..7e4d1e9b143 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/generate_iam_role @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# Helper script to generate an IAM Role needed to install role-based authentication to a KFP service account. 
+# +# Run as: +# $ ./generate_iam_role ${cluster_arn/cluster_name} ${role_name} ${cluster_region} [optional: ${service_namespace} ${service_account}] +# + +CLUSTER_ARN="${1}" +ROLE_NAME="${2}" +CLUSTER_REGION="${3:-us-east-1}" +SERVICE_NAMESPACE="${4:-kubeflow}" +SERVICE_ACCOUNT="${5:-pipeline-runner}" +aws_account=$(aws sts get-caller-identity --query Account --output text) +trustfile="trust.json" + +cwd=$(dirname $(realpath $0)) + +# if using an existing cluster, use the cluster arn to get the region and cluster name +# example, cluster_arn=arn:aws:eks:us-east-1:12345678910:cluster/test +cluster_name=$(echo ${CLUSTER_ARN} | cut -d'/' -f2) + +# A function to get the OIDC_ID associated with an EKS cluster +function get_oidc_id { + # TODO: Ideally this should be based on version compatibility instead of command failure + eksctl utils associate-iam-oidc-provider --cluster ${cluster_name} --region ${CLUSTER_REGION} --approve + if [[ $? -ge 1 ]]; then + eksctl utils associate-iam-oidc-provider --name ${cluster_name} --region ${CLUSTER_REGION} --approve + fi + + local oidc=$(aws eks describe-cluster --name ${cluster_name} --region ${CLUSTER_REGION} --query cluster.identity.oidc.issuer --output text) + oidc_id=$(echo ${oidc} | rev | cut -d'/' -f1 | rev) +} + +# A function that generates an IAM role for the given account, cluster, namespace, region +# Parameter: +# $1: Name of the trust file to generate. +function create_namespaced_iam_role { + local trustfile="${1}" + # Check if role already exists + aws iam get-role --role-name ${ROLE_NAME} + if [[ $? -eq 0 ]]; then + echo "A role for this cluster and namespace already exists in this account, assuming sagemaker access and proceeding." 
+ else + echo "IAM Role does not exist, creating a new Role for the cluster" + aws iam create-role --role-name ${ROLE_NAME} --assume-role-policy-document file://${trustfile} --output=text --query "Role.Arn" + aws iam attach-role-policy --role-name ${ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess + fi +} + +# Remove the generated trust file +# Parameter: +# $1: Name of the trust file to delete. +function delete_generated_file { + rm "${1}" +} + +echo "Get the OIDC ID for the cluster" +get_oidc_id +echo "Delete the trust json file if it already exists" +delete_generated_file "${trustfile}" +echo "Generate a trust json" +"$cwd"/generate_trust_policy ${CLUSTER_REGION} ${aws_account} ${oidc_id} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > "${trustfile}" +echo "Create the IAM Role using these values" +create_namespaced_iam_role "${trustfile}" +echo "Cleanup for the next run" +delete_generated_file "${trustfile}" + diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy b/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy new file mode 100755 index 00000000000..1c10fa10fe8 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/generate_trust_policy @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# Helper script to generate the trust policy needed to assign role-based authentication to a KFP service account. +# +# Run as: +# $ ./generate_trust_policy ${EKS_CLUSTER_REGION} ${AWS_ACCOUNT_ID} ${OIDC_ID} ${SERVICE_NAMESPACE} ${SERVICE_ACCOUNT} > trust.json +# +# For example: +# $ ./generate_trust_policy us-west-2 123456789012 D48675832CA65BD10A532F597OIDCID > trust.json +# This will create a file `trust.json` containing a role policy that enables the KFP service runner in an EKS cluster to assume AWS roles. +# +# The SERVICE_NAMESPACE parameter is for when you want to run Kubeflow in a custom namespace other than "kubeflow". 
+# The SERVICE_ACCOUNT parameter is for when you want to give permissions to a service account other than the default "pipeline-runner". + +cluster_region="$1" +account_number="$2" +oidc_id="$3" +service_namespace="${4}" +service_account="${5}" + +printf '{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": { + "Federated": "arn:aws:iam::'"${account_number}"':oidc-provider/oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"'" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':aud": "sts.amazonaws.com", + "oidc.eks.'"${cluster_region}"'.amazonaws.com/id/'"${oidc_id}"':sub": "system:serviceaccount:'"${service_namespace}"':'"${service_account}"'" + } + } + } + ] +} +' \ No newline at end of file diff --git a/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests b/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests new file mode 100755 index 00000000000..6ad3fb9db28 --- /dev/null +++ b/components/aws/sagemaker/tests/integration_tests/scripts/run_integration_tests @@ -0,0 +1,168 @@ +#!/usr/bin/env bash + +set -u +set -o pipefail + +usage(){ + echo "Usage: $0 -n [-r ]" + exit 1 +} + +cwd=$(dirname $(realpath $0)) + +### Input parameters +DEPLOY_NAME="sagemaker-kfp-"$(date '+%Y-%m-%d-%H-%M-%S')"" # The name given to the entire deployment (tagging all resources) +REGION=${REGION:-"$(aws configure get region)"} # Deployment region + +### Configuration parameters +EKS_EXISTING_CLUSTER=${EKS_EXISTING_CLUSTER:-""} # Use an existing EKS cluster +EKS_CLUSTER_VERSION=${EKS_CLUSTER_VERSION:-"1.15"} # EKS cluster K8s version +EKS_NODE_COUNT=${EKS_NODE_COUNT:-"1"} # The initial node count of the EKS cluster +EKS_PUBLIC_SUBNETS=${EKS_PUBLIC_SUBNETS:-""} +EKS_PRIVATE_SUBNETS=${EKS_PRIVATE_SUBNETS:-""} + +### Testing parameters +MINIO_LOCAL_PORT=${MINIO_LOCAL_PORT:-9000} 
+KFP_NAMESPACE=${KFP_NAMESPACE:-"kubeflow"} +KFP_SERVICE_ACCOUNT=${KFP_SERVICE_ACCOUNT:-"pipeline-runner"} + +PYTEST_MARKER=${PYTEST_MARKER:-""} +S3_DATA_BUCKET=${S3_DATA_BUCKET:-""} +SAGEMAKER_EXECUTION_ROLE_ARN=${SAGEMAKER_EXECUTION_ROLE_ARN:-""} + +while getopts ":n:r:s:" opt; do + case $opt in + n) + DEPLOY_NAME="$OPTARG" + ;; + s) + S3_DATA_BUCKET="$OPTARG" + ;; + r) + REGION="$OPTARG" + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + :) + echo "Option -$OPTARG requires an argument." >&2 + exit 1 + ;; + esac +done + +# Ensure a deployment name was specified +if [ "$DEPLOY_NAME" == "" ]; then + echo "Missing deployment name" + usage + exit 1 +fi + +if [ "$S3_DATA_BUCKET" == "" ]; then + echo "Missing S3 data bucket name" + usage + exit 1 +fi + +function cleanup() { + set +e + + cleanup_kfp + delete_generated_role + + if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then + delete_eks + fi +} + +# Set the trap to clean up resources in the case of an error +trap cleanup EXIT +set -e + +function launch_eks() { + EKS_CLUSTER_NAME="${DEPLOY_NAME}-eks-cluster" + + echo "[Creating EKS] Launching EKS cluster $EKS_CLUSTER_NAME" + + eksctl_args=( --managed --nodes "${EKS_NODE_COUNT}" --node-type=c5.xlarge --timeout=30m --region "${REGION}" --auto-kubeconfig --version "${EKS_CLUSTER_VERSION}" ) + [ ! -z "${EKS_PUBLIC_SUBNETS}" ] && eksctl_args+=( --vpc-public-subnets="${EKS_PUBLIC_SUBNETS}" ) + [ ! 
-z "${EKS_PRIVATE_SUBNETS}" ] && eksctl_args+=( --vpc-private-subnets="${EKS_PRIVATE_SUBNETS}" ) + + eksctl create cluster "${EKS_CLUSTER_NAME}" "${eksctl_args[@]}" + + aws eks update-kubeconfig --name "$EKS_CLUSTER_NAME" --region "$REGION" + + echo "[Creating EKS] $EKS_CLUSTER_NAME launched" +} + +function delete_eks() { + eksctl delete cluster --name "${EKS_CLUSTER_NAME}" --region "${REGION}" +} + +function install_kfp() { + echo "[Installing KFP] Applying KFP manifests" + + PIPELINE_VERSION=0.5.1 + kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/cluster-scoped-resources?ref=$PIPELINE_VERSION + kubectl wait --for condition=established --timeout=60s crd/applications.app.k8s.io + kubectl apply -k github.com/kubeflow/pipelines/manifests/kustomize/env/dev?ref=$PIPELINE_VERSION + + echo "[Installing KFP] Port-forwarding Minio" + + kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=minio --timeout=5m + kubectl port-forward -n kubeflow svc/minio-service $MINIO_LOCAL_PORT:9000 & + MINIO_PID=$! 
+ + echo "[Installing KFP] Minio port-forwarded to ${MINIO_LOCAL_PORT}" + + echo "[Installing KFP] Waiting for pods to stand up" + + kubectl wait --for=condition=ready -n "${KFP_NAMESPACE}" pod -l app=ml-pipeline --timeout=5m + + # TODO: Replace with calculated waits + # For the moment we don't know which pods will be slower, so we are just relying on a fixed interval + sleep 3m + + echo "[Installing KFP] Pipeline pods are ready" +} + +function generate_iam_role_name() { + OIDC_ROLE_NAME="$(echo "${DEPLOY_NAME}-kubeflow-role" | cut -c1-64)" + OIDC_ROLE_ARN="arn:aws:iam::$(aws sts get-caller-identity --query=Account --output=text):role/${OIDC_ROLE_NAME}" +} + +function install_generated_role() { + kubectl patch serviceaccount -n ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT} --patch '{"metadata": {"annotations": {"eks.amazonaws.com/role-arn": "'"${OIDC_ROLE_ARN}"'"}}}' +} + +function delete_generated_role() { + # Delete the role associated with the cluster thats being deleted + aws iam detach-role-policy --role-name "${OIDC_ROLE_NAME}" --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess + aws iam delete-role --role-name "${OIDC_ROLE_NAME}" +} + +function cleanup_kfp() { + # Clean up Minio + if [ ! -z "${MINIO_PID}" ]; then + kill -9 $MINIO_PID || true + fi +} + +if [[ -z "${EKS_EXISTING_CLUSTER}" ]]; then + launch_eks +else + aws eks update-kubeconfig --name "${EKS_EXISTING_CLUSTER}" --region "$REGION" + EKS_CLUSTER_NAME="${EKS_EXISTING_CLUSTER}" + DEPLOY_NAME="${EKS_EXISTING_CLUSTER}" +fi + +generate_iam_role_name +"$cwd"/generate_iam_role ${EKS_CLUSTER_NAME} ${OIDC_ROLE_NAME} ${REGION} ${KFP_NAMESPACE} ${KFP_SERVICE_ACCOUNT} +install_kfp +install_generated_role + +pytest_args=( --region "${REGION}" --role-arn "${SAGEMAKER_EXECUTION_ROLE_ARN}" --s3-data-bucket "${S3_DATA_BUCKET}" --minio-service-port "${MINIO_LOCAL_PORT}" --kfp-namespace "${KFP_NAMESPACE}" ) +[ ! 
-z "${PYTEST_MARKER}" ] && pytest_args+=( -m "${PYTEST_MARKER}" ) + +cd tests/integration_tests && python -m pytest "${pytest_args[@]}" --junitxml ./integration_tests.log -n $(nproc) \ No newline at end of file