diff --git a/Makefile b/Makefile index 6840509..447e1ea 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,12 @@ include settings.mk +.PHONY: publish-container + +# For creating an AMI. packer-image: docker build -t rastervision/packer -f Dockerfile.packer . -validate-packer-template: +validate-packer-template: packer-image docker run --rm -it \ -v ${PWD}/:/usr/local/src \ -v ${HOME}/.aws:/root/.aws:ro \ @@ -14,49 +17,33 @@ validate-packer-template: rastervision/packer \ validate packer/template-gpu.json -create-image: validate-packer-template +create-ami: validate-packer-template docker run --rm -it \ -v ${PWD}/:/usr/local/src \ -v ${HOME}/.aws:/root/.aws:ro \ -e AWS_PROFILE=${AWS_PROFILE} \ -e AWS_BATCH_BASE_AMI=${AWS_BATCH_BASE_AMI} \ -e AWS_ROOT_BLOCK_DEVICE_SIZE=${AWS_ROOT_BLOCK_DEVICE_SIZE} \ + -e AWS_REGION=${AWS_REGION} \ -w /usr/local/src \ rastervision/packer \ build packer/template-gpu.json -terraform-init: - cd terraform && \ - terraform init; - - -plan: terraform-init - cd terraform && \ - terraform plan \ - -var="batch_ami_id=${AMI_ID}" \ - -var="aws_key_name=${KEY_PAIR_NAME}" \ - -var="aws_region=${AWS_REGION}" \ - -var="ecr_image_tag=${ECR_IMAGE_TAG}" \ - -var="subnet_ids=${SUBNET_IDS}" \ - -out="raster-vision.tfplan"; - -apply: - cd terraform && \ - terraform apply "raster-vision.tfplan"; - -destroy: - cd terraform && \ - terraform destroy \ - -var="batch_ami_id=${AMI_ID}" \ - -var="aws_key_name=${KEY_PAIR_NAME}" \ - -var="aws_region=${AWS_REGION}" \ - -var="ecr_image_tag=${ECR_IMAGE_TAG}" \ - -var="subnet_ids=${SUBNET_IDS}"; +# For publishing a Docker image to ECR. 
+publish-container-gpu: + $(eval ACCOUNT_ID=$(shell aws sts get-caller-identity --output text --query 'Account')) + aws ecr get-login --no-include-email --region ${AWS_REGION} | bash; + docker tag ${RASTER_VISION_GPU_IMAGE} \ + ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_GPU_IMAGE}:${ECR_IMAGE_TAG} + docker push \ + ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_GPU_IMAGE}:${ECR_IMAGE_TAG} -publish-container: +publish-container-cpu: $(eval ACCOUNT_ID=$(shell aws sts get-caller-identity --output text --query 'Account')) aws ecr get-login --no-include-email --region ${AWS_REGION} | bash; - docker tag ${RASTER_VISION_IMAGE} \ - ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_IMAGE}:${ECR_IMAGE_TAG} + docker tag ${RASTER_VISION_CPU_IMAGE} \ + ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_CPU_IMAGE}:${ECR_IMAGE_TAG} docker push \ - ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_IMAGE}:${ECR_IMAGE_TAG} + ${ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_CPU_IMAGE}:${ECR_IMAGE_TAG} + +publish-container: publish-container-cpu publish-container-gpu diff --git a/README.md b/README.md index 733c855..29f74a0 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,21 @@ # Raster Vision AWS Batch runner setup -This repository contains the deployment code that sets up the necessary AWS resources to utilize the AWS Batch runner in [Raster Vision](https://rastervision.io). Deployment can be driven either by [Terraform](https://terraform.io/) and the [AWS Command Line Interface (CLI)](http://aws.amazon.com/cli/) through a local Docker Compose environment, or via the AWS console using a [CloudFormation template](https://aws.amazon.com/cloudformation/aws-cloudformation-templates/). +This repository contains the deployment code that sets up the necessary AWS resources to utilize the AWS Batch runner in [Raster Vision](https://rastervision.io). 
Using Batch is advantageous because it starts and stops instances automatically and runs jobs sequentially or in parallel according to the dependencies between them. In addition, this deployment sets up distinct CPU and GPU resources and utilizes spot instances, which is more cost-effective than always using a GPU on-demand instance. Deployment is driven via the AWS console using a [CloudFormation template](https://aws.amazon.com/cloudformation/aws-cloudformation-templates/). This AWS Batch setup is an "advanced" option that assumes some familiarity with [Docker](https://docs.docker.com/), AWS [IAM](https://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html), [named profiles](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html), [availability zones](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html), [EC2](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/concepts.html), [ECR](https://docs.aws.amazon.com/AmazonECR/latest/userguide/what-is-ecr.html), [CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/Welcome.html), and [Batch](https://docs.aws.amazon.com/batch/latest/userguide/what-is-batch.html). ## Table of Contents ## +* [AWS Account Setup](#aws-account-setup) * [AWS Credentials](#aws-credentials) -* [Packer Image](#packer-docker-image) * [AMI Creation](#ami-creation) * [Deploying Batch resources](#deploying-batch-resources) +* [Update Raster Vision configuration](#update-raster-vision-configuration) + +## AWS Account Setup ## + +In order to set up Batch using this repo, you will need to set up your AWS account so that: +* you have either root access to your AWS account, or an IAM user with admin permissions. It may be possible with fewer permissions, but we haven't figured out how to do this yet after some experimentation. +* you have the ability to launch P2 or P3 instances which have GPUs. 
In the past, it was necessary to open a support ticket to request access to these instances. You will know if this is the case if the Packer job fails when trying to launch the instance. +* you have requested permission from AWS to use availability zones outside the USA if you would like to use them. (New AWS accounts can't launch EC2 instances in other AZs by default.) If you are in doubt, just use us-east-1. ## AWS Credentials ## @@ -21,33 +29,21 @@ Default region name [us-east-1]: us-east-1 Default output format [None]: ``` -You will be prompted to enter your AWS credentials, along with a default region. These credentials will be used to authenticate calls to the AWS API when using Terraform and the AWS CLI. - -## Packer Docker Image ## - -You must ensure that you have the `rastervision/packer` Docker image. -From within the root directory of the repository, type `make packer-image` to build it. +You will be prompted to enter your AWS credentials, along with a default region. The Access Key ID and Secret Access Key can be retrieved from the IAM console. These credentials will be used to authenticate calls to the AWS API when using Packer and the AWS CLI. ## AMI Creation ## -This step uses packer to install nvidia-docker on the base ECS AMI -in order to run GPU jobs on AWS Batch. +This step uses packer to install nvidia-docker on the base ECS AMI in order to run GPU jobs on AWS Batch. Note that there is an occasional [issue](https://github.com/azavea/raster-vision-cloudformation/issues/9) with using Packer. ### Configure the settings ### -Copy the `settings.mk.template` file to `settings.mk`, and fill out the options shown in the table below. -Please note that some of the entries must be filled-out before you run the `make create-image` command (e.g. `AWS_BATCH_BASE_AMI`) -and some of them can only be filled-out afterwards (e.g. `AMI_ID`). +Copy the `settings.mk.template` file to `settings.mk`, and fill out the options shown in the table below. 
*Remaining variables in the settings file will be filled in later.* -| `AWS_BATCH_BASE_AMI` | The AMI of the Deep Learning Base AMI (Amazon Linux) to use. | +| Variable | Description | |------------------------------|------------------------------------------------------------------------------| +| `AWS_BATCH_BASE_AMI` | The AMI of the Deep Learning Base AMI (Amazon Linux) to use. | | `AWS_ROOT_BLOCK_DEVICE_SIZE` | The size of the volume, in GiB, of the root device for the AMI. | -| `AMI_ID` | The AMI ID that comes from the `make create-image` step | -| `KEY_PAIR_NAME` | The key pair name for the batch EC2 instances | | `AWS_REGION` | The AWS region to use. | -| `RASTER_VISION_IMAGE` | The raster vision image to use. e.g. quay.io/azavea/raster-vision:gpu-latest | -| `ECR_IMAGE` | The name for the ECR image | -| `ECR_IMAGE_TAG` | The ECR image tag to use, that is the tag in ECR_IMAGE | To find the latest Deep Learning Base AMI, search in the AMI section of your EC2 AWS console for `Deep Learning Base AMI (Amazon Linux)`. @@ -74,60 +70,57 @@ so you should go through the three steps rather than copying from the screenshot ### Create the Custom AMI ### -Ensure that the AWS profile for the account you want to create the AMI in is set in your `AWS_PROFILE` -environment variable setting. +**Ensure that the AWS profile for the account you want to create the AMI in is set in your `AWS_PROFILE` environment variable setting.** If you skip this step, Packer will freeze. Then run: ```shell -> make create-image +> make create-ami ``` -This will run packer, which will spin up an EC2 instance, install the necessary resources, create an AMI -off of the instance, and shut the instance down. - -### Record the AMI ID ### - -Be sure to record the AMI ID, which will be given in the last line of the output for `make create-image` -on a successful run. Put this in the `settings.mk` as `AMI_ID`. 
+This will run Packer, which will spin up an EC2 instance, install the necessary resources, create an AMI +off of the instance, and shut the instance down. Be sure to record the AMI ID, which will be given in the last line of the output. ## Deploying Batch resources ## -Once you have an AMI ready, you have two options for deploying Batch resources: you can use Terraform and the command line, or you can use the AWS CloudFormation console. - -### Option A: Terraform and the command line - -Create the AWS Batch compute environment, queue, and more by doing: - -```shell -> make plan -> make apply -``` - -### Option B: AWS CloudFormation console - To deploy AWS Batch resources using AWS CloudFormation, start by logging into your AWS console. Then, follow the steps below: - Navigate to `CloudFormation > Create Stack` - In the `Choose a template field`, select `Upload a template to Amazon S3` and upload the template in `cloudformation/template.yml` +- `Prefix`: If you are setting up multiple RV stacks within an AWS account, you need to set a prefix for namespacing resources. Otherwise, there will be name collisions with any resources that were created as part of another stack. - Specify the following required parameters: - `Stack Name`: The name of your CloudFormation stack - `VPC`: The ID of the Virtual Private Cloud in which to deploy your resource. Your account should have at least one by default. - - `Subnets`: The ID of any subnets that you want to deploy your resources into. Your account should have at least two by default; make sure that the subnets you select are in the VPC that you chose by using the AWS VPC console, or else CloudFormation will throw an error. (Subnets are tied to availability zones, and so affect spot prices.) - - `SSH Key Name`: The name of the SSH key pair you want to be able to use to shell into your Batch instances. 
If you've created an EC2 instance before, you should already have one you can use; otherwise, you can create one in the EC2 console. - - `AMI`: Provide the ID of the AMI that you created above. -- Adjust any preset parameters that you want to change (the defaults should be fine for most users) and click `Next` + - `Subnets`: The ID of any subnets that you want to deploy your resources into. Your account should have at least two by default; make sure that the subnets you select are in the VPC that you chose by using the AWS VPC console, or else CloudFormation will throw an error. (Subnets are tied to availability zones, and so affect spot prices.) In addition, you need to choose subnets that are available for the instance type you have chosen. To find which subnets are available, go to Spot Pricing History in the EC2 console and select the instance type. Then look up the availability zones that are present in the VPC console to find the corresponding subnets. ![spot availability zones for p3 instances](/docs/images/spot-azs.png) + - `SSH Key Name`: The name of the SSH key pair you want to be able to use to shell into your Batch instances. If you've created an EC2 instance before, you should already have one you can use; otherwise, you can create one in the EC2 console. *Note: If you decide to create a new one, you will need to log out and then back in to the console before creating a Cloudformation stack using this key.* + - `AMI`: For the GPU AMI, provide the ID of the AMI that you created above. **For the CPU AMI, you need to use the ECS-optimized AMI.** You can find the AMI ID for your availability zone [here](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs-optimized_AMI.html). 
If you use the same AMI for both, CPU jobs will fail with the following error: + ``` + Container messageCannotStartContainerError: Error response from daemon: OCI runtime create failed: container_linux.go:348: starting container process caused "process_linux.go:402: container init caused \"process_linux.go:385: running prestart hook 0 caused \\\"error runni ) + ``` + - `Instance Types`: Provide the instance types you would like to use. (For GPUs, `p3.2xlarge` is approximately 4 times the speed for 4 times the price.) +- Adjust any preset parameters that you want to change (the defaults should be fine for most users) and click `Next`. + - Advanced users: If you plan on modifying Raster Vision and would like to publish a custom image to run on Batch, you will need to specify (CPU and GPU) ECR repo names and a tag name to use for both. Note that the repo names cannot be the same as the Stack name (the first field in the UI) and cannot be the same as any existing ECR repo names. If you are in a team environment where you are sharing the AWS account, the repo names should contain an identifier such as your username. - Accept all default options on the `Options` page and click `Next` - Accept `I acknowledge that AWS CloudFormation might create IAM resources with custom names` on the `Review` page and click `Create` - Watch your resources get deployed! -### Publish the Raster Vision container to ECS +### Optional: Publish local Raster Vision images to ECR -Once you've deployed Batch resources with either Terraform or the AWS CloudFormation console, the last step is to publish your Raster Vision container to ECS where Batch can pull it. +If you set up ECR repositories during the CloudFormation setup (the "advanced user" option), then you will need to follow this step, which publishes local Raster Vision images to those ECR repositories. Every time you make a change to your local Raster Vision images and want to use those on Batch, you will need to run this step. 
-Use +Run `./docker/build` in the main Raster Vision repo to build local copies of the CPU and GPU images. -```shell -> make publish-container -``` +In `settings.mk`, fill out the options shown in the table below. + +| Variable | Description | +|------------------------------|------------------------------------------------------------------------------| +| `RASTER_VISION_CPU_IMAGE` | The local Raster Vision CPU image to use. | +| `RASTER_VISION_GPU_IMAGE` | The local Raster Vision GPU image to use. | +| `ECR_CPU_IMAGE` | The name of the ECR CPU image | +| `ECR_GPU_IMAGE` | The name of the ECR GPU image | +| `ECR_IMAGE_TAG` | The ECR image tag to use, that is the tag in ECR_CPU_IMAGE and ECR_GPU_IMAGE | + +Run `make publish-container` to publish the CPU and GPU images to your ECR repositories. + +## Update Raster Vision configuration ## -to publish the raster-vision container to your ECR repository. +Finally, make sure to update your [Raster Vision configuration](https://docs.rastervision.io/en/latest/setup.html#setting-up-aws-batch) with the Batch resources that were created. 
diff --git a/cloudformation/template.yml b/cloudformation/template.yml index f5b0668..40d572d 100644 --- a/cloudformation/template.yml +++ b/cloudformation/template.yml @@ -15,12 +15,12 @@ Metadata: - VPC - SubnetIds - KeyName + - GpuAMI + - CpuAMI - Label: - default: AMIs and Instance Types (Advanced) + default: Instance Types (Advanced) Parameters: - - GpuAMI - - CpuAMI - GpuInstanceTypes - CpuInstanceTypes - @@ -39,8 +39,10 @@ Metadata: - GpuRepositoryName - CpuRepositoryName - ImageTag - - InstanceVCPUs - - InstanceMemory + - GPUInstanceVCPUs + - GPUInstanceMemory + - CPUInstanceVCPUs + - CPUInstanceMemory ParameterLabels: Prefix: default: Prefix @@ -64,10 +66,14 @@ Metadata: default: Instance Types (GPU) CpuInstanceTypes: default: Instance Types (CPU) - InstanceVCPUs: - default: vCPU Limit - InstanceMemory: - default: Memory Limit + GPUInstanceVCPUs: + default: GPU vCPU Limit + GPUInstanceMemory: + default: GPU Memory Limit + CPUInstanceVCPUs: + default: CPU vCPU Limit + CPUInstanceMemory: + default: CPU Memory Limit CpuRepositoryName: default: Repository Name (CPU) GpuRepositoryName: @@ -85,19 +91,18 @@ Parameters: Default: "" Description: > Optional lowercase identifier to use for namespacing your resources (e.g. - RasterVisionIamRole becomes yournameRasterVisionIamRole) + RasterVisionIamRole becomes yournameRasterVisionIamRole). You need to do this if you + are setting up multiple RV stacks within one account. 
MaxLength: 12 AllowedPattern: ^[a-z0-9]*$ ConstraintDescription: must only contain lowercase letters and numbers GpuAMI: Type: AWS::EC2::Image::Id - Default: ami-08169a3f3a0be41cd Description: Amazon Machine Image to use for the GPU-enabled compute environment CpuAMI: Type: AWS::EC2::Image::Id - Default: ami-08169a3f3a0be41cd Description: Amazon Machine Image to use for the CPU-enabled compute environment KeyName: @@ -150,15 +155,25 @@ Parameters: A comma-separated list of instance types that may be launched with the CPU-enabled AMI - InstanceVCPUs: + GPUInstanceVCPUs: Type: Number Default: 4 - Description: Number of vCPUs reserved for the container by the task definition + Description: Number of vCPUs reserved for the container by the task definition for GPU instances - InstanceMemory: + GPUInstanceMemory: Type: Number Default: 40000 - Description: The hard limit (in MB) of memory to present to the container + Description: The hard limit (in MB) of memory to present to the container for GPU instances + + CPUInstanceVCPUs: + Type: Number + Default: 1 + Description: Number of vCPUs reserved for the container by the task definition for CPU instances + + CPUInstanceMemory: + Type: Number + Default: 1250 + Description: The hard limit (in MB) of memory to present to the container for CPU instances GpuRepositoryName: Type: String @@ -180,7 +195,7 @@ Parameters: Type: String Default: "" Description: > - (Optional) Tag of the container image to retrieve from ECR -- required + (Optional) Tag of the CPU and GPU container image to retrieve from ECR -- required if CpuRepositoryName or GpuRepositoryName is not empty VPC: @@ -380,8 +395,8 @@ Resources: JobDefinitionName: !Join ['', [!Ref Prefix, 'RasterVisionCustomGpuJobDefinition']] ContainerProperties: Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${GpuRepositoryName}:${ImageTag}" - Vcpus: !Ref InstanceVCPUs - Memory: !Ref InstanceMemory + Vcpus: !Ref GPUInstanceVCPUs + Memory: !Ref GPUInstanceMemory 
Volumes: - Host: @@ -403,8 +418,8 @@ Resources: JobDefinitionName: !Join ['', [!Ref Prefix, 'RasterVisionHostedGpuJobDefinition']] ContainerProperties: Image: quay.io/azavea/raster-vision:gpu-latest - Vcpus: !Ref InstanceVCPUs - Memory: !Ref InstanceMemory + Vcpus: !Ref GPUInstanceVCPUs + Memory: !Ref GPUInstanceMemory Volumes: - Host: @@ -426,8 +441,8 @@ Resources: JobDefinitionName: !Join ['', [!Ref Prefix, 'RasterVisionCustomCpuJobDefinition']] ContainerProperties: Image: !Sub "${AWS::AccountId}.dkr.ecr.${AWS::Region}.amazonaws.com/${CpuRepositoryName}:${ImageTag}" - Vcpus: !Ref InstanceVCPUs - Memory: !Ref InstanceMemory + Vcpus: !Ref CPUInstanceVCPUs + Memory: !Ref CPUInstanceMemory Volumes: - Host: @@ -449,8 +464,8 @@ Resources: JobDefinitionName: !Join ['', [!Ref Prefix, 'RasterVisionHostedCpuJobDefinition']] ContainerProperties: Image: quay.io/azavea/raster-vision:cpu-latest - Vcpus: !Ref InstanceVCPUs - Memory: !Ref InstanceMemory + Vcpus: !Ref CPUInstanceVCPUs + Memory: !Ref CPUInstanceMemory Volumes: - Host: diff --git a/docker-compose.yml b/docker-compose.yml deleted file mode 100644 index c55db40..0000000 --- a/docker-compose.yml +++ /dev/null @@ -1,22 +0,0 @@ -version: '2.1' -services: - terraform: - image: quay.io/azavea/terraform:0.10.8 - volumes: - - ./:/usr/local/src - - $HOME/.aws:/root/.aws:ro - environment: - - AWS_PROFILE=$AWS_PROFILE - - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-east-1} - working_dir: /usr/local/src - entrypoint: bash - packer: - image: hashicorp/packer:1.3.1 - volumes: - - ./:/usr/local/src - - $HOME/.aws:/root/.aws:ro - environment: - - AWS_PROFILE=$AWS_PROFILE - - AWS_DEFAULT_REGION=${AWS_DEFAULT_REGION:-us-east-1} - working_dir: /usr/local/src - entrypoint: bash diff --git a/docs/images/spot-azs.png b/docs/images/spot-azs.png new file mode 100644 index 0000000..ba958c1 Binary files /dev/null and b/docs/images/spot-azs.png differ diff --git a/packer/scripts/configure-gpu.sh b/packer/scripts/configure-gpu.sh index 
8973488..a4d6ae5 100755 --- a/packer/scripts/configure-gpu.sh +++ b/packer/scripts/configure-gpu.sh @@ -11,7 +11,7 @@ sudo yum install -y $PACKAGES sudo pkill -SIGHUP dockerd # Run test container to verify installation -sudo docker run --privileged --runtime=nvidia --rm nvidia/cuda nvidia-smi +sudo docker run --privileged --runtime=nvidia --rm nvidia/cuda:9.0 nvidia-smi # Update Docker daemon.json to user nvidia-container-runtime by default sudo tee /etc/docker/daemon.json <