Skip to content

Commit

Permalink
Retry on docker pull
Browse files Browse the repository at this point in the history
  • Loading branch information
parasj committed Mar 22, 2022
1 parent 42ad665 commit 9ec8ce0
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 4 deletions.
2 changes: 1 addition & 1 deletion skylark/compute/aws/aws_cloud_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def provision_instance(
if name is None:
name = f"skylark-aws-{str(uuid.uuid4()).replace('-', '')}"
iam_instance_profile_name = f"{name}_profile"
iam = self.auth.get_boto3_client("iam")
iam = self.auth.get_boto3_client("iam", region)
ec2 = self.auth.get_boto3_resource("ec2", region)
vpc = self.get_vpc(region)
assert vpc is not None, "No VPC found"
Expand Down
11 changes: 9 additions & 2 deletions skylark/compute/server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import partial
import json
import subprocess
from enum import Enum, auto
Expand Down Expand Up @@ -197,6 +198,13 @@ def install_docker(self):
if not docker_version.startswith("Success"):
raise RuntimeError(f"Failed to install Docker on {self.region_tag}, {self.public_ip()}: OUT {out}\nERR {err}")

def pull_docker(self, gateway_docker_image):
    """Pull ``gateway_docker_image`` on this server with ``sudo docker pull``.

    The command is issued through ``self.run_command``; the first value it
    returns is searched for one of Docker's success status lines.

    Args:
        gateway_docker_image: Image reference interpolated verbatim into the
            ``docker pull`` command line.

    Raises:
        RuntimeError: If neither success status line is present in the
            command output. The message includes the region tag, the public
            IP and both values returned by ``run_command``.
    """
    # Either status line means the image is now available locally.
    success_markers = ("Status: Downloaded newer image", "Status: Image is up to date")
    docker_out, docker_err = self.run_command(f"sudo docker pull {gateway_docker_image}")
    if any(marker in docker_out for marker in success_markers):
        return
    raise RuntimeError(
        f"Failed to pull docker image {gateway_docker_image} on {self.region_tag}, {self.public_ip()}: OUT {docker_out}\nERR {docker_err}"
    )

def start_gateway(
self,
outgoing_ports: Dict[str, int], # maps ip to number of connections along route
Expand Down Expand Up @@ -226,8 +234,7 @@ def check_stderr(tup):

# pull docker image and start container
with Timer(f"{desc_prefix}: Docker pull"):
docker_out, docker_err = self.run_command(f"sudo docker pull {gateway_docker_image}")
assert "Status: Downloaded newer image" in docker_out or "Status: Image is up to date" in docker_out, (docker_out, docker_err)
retry_backoff(partial(self.pull_docker, gateway_docker_image), exception_class=RuntimeError)
logger.debug(f"{desc_prefix}: Starting gateway container")
docker_run_flags = f"-d --log-driver=local --log-opt max-file=16 --ipc=host --network=host --ulimit nofile={1024 * 1024}"
docker_run_flags += " --mount type=tmpfs,dst=/skylark,tmpfs-size=$(($(free -b | head -n2 | tail -n1 | awk '{print $2}')/2))"
Expand Down
5 changes: 4 additions & 1 deletion skylark/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ def retry_backoff(
if i == max_retries - 1:
raise e
else:
logger.warning(f"Retrying function due to {e} (attempt {i + 1}/{max_retries})")
# don't log a warning for retries caused by IAM instance profile propagation delay
if "Invalid IAM Instance Profile name" not in str(e):
fn_name = fn.__name__ if hasattr(fn, "__name__") else "unknown function"
logger.warning(f"Retrying {fn_name} due to: {e} (attempt {i + 1}/{max_retries})")
time.sleep(backoff)
backoff = min(backoff * 2, max_backoff)

0 comments on commit 9ec8ce0

Please sign in to comment.