Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry docker pull on error #218

Merged
merged 1 commit into from
Mar 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion skylark/compute/aws/aws_cloud_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def provision_instance(
if name is None:
name = f"skylark-aws-{str(uuid.uuid4()).replace('-', '')}"
iam_instance_profile_name = f"{name}_profile"
iam = self.auth.get_boto3_client("iam")
iam = self.auth.get_boto3_client("iam", region)
ec2 = self.auth.get_boto3_resource("ec2", region)
vpc = self.get_vpc(region)
assert vpc is not None, "No VPC found"
Expand Down
11 changes: 9 additions & 2 deletions skylark/compute/server.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import partial
import json
import subprocess
from enum import Enum, auto
Expand Down Expand Up @@ -197,6 +198,13 @@ def install_docker(self):
if not docker_version.startswith("Success"):
raise RuntimeError(f"Failed to install Docker on {self.region_tag}, {self.public_ip()}: OUT {out}\nERR {err}")

def pull_docker(self, gateway_docker_image):
    """Pull ``gateway_docker_image`` on this server via ``sudo docker pull``.

    The pull is treated as successful only when the command's stdout
    contains one of Docker's terminal status lines ("Downloaded newer
    image" or "Image is up to date").

    Args:
        gateway_docker_image: Image reference handed to ``docker pull``.

    Raises:
        RuntimeError: If neither success status appears in stdout. The
            message carries the captured stdout/stderr so that a caller
            retrying on ``RuntimeError`` can log the reason.
    """
    # Local import keeps this block self-contained.
    import shlex

    # The reference is spliced into a shell command line run under sudo, so
    # quote it. shlex.quote leaves ordinary image references (alphanumerics
    # and "@%+=:,./-_") untouched and only wraps strings containing shell
    # metacharacters.
    pull_cmd = f"sudo docker pull {shlex.quote(gateway_docker_image)}"
    docker_out, docker_err = self.run_command(pull_cmd)

    success_markers = ("Status: Downloaded newer image", "Status: Image is up to date")
    if not any(marker in docker_out for marker in success_markers):
        raise RuntimeError(
            f"Failed to pull docker image {gateway_docker_image} on {self.region_tag}, {self.public_ip()}: OUT {docker_out}\nERR {docker_err}"
        )

def start_gateway(
self,
outgoing_ports: Dict[str, int], # maps ip to number of connections along route
Expand Down Expand Up @@ -226,8 +234,7 @@ def check_stderr(tup):

# pull docker image and start container
with Timer(f"{desc_prefix}: Docker pull"):
docker_out, docker_err = self.run_command(f"sudo docker pull {gateway_docker_image}")
assert "Status: Downloaded newer image" in docker_out or "Status: Image is up to date" in docker_out, (docker_out, docker_err)
retry_backoff(partial(self.pull_docker, gateway_docker_image), exception_class=RuntimeError)
logger.debug(f"{desc_prefix}: Starting gateway container")
docker_run_flags = f"-d --log-driver=local --log-opt max-file=16 --ipc=host --network=host --ulimit nofile={1024 * 1024}"
docker_run_flags += " --mount type=tmpfs,dst=/skylark,tmpfs-size=$(($(free -b | head -n2 | tail -n1 | awk '{print $2}')/2))"
Expand Down
5 changes: 4 additions & 1 deletion skylark/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ def retry_backoff(
if i == max_retries - 1:
raise e
else:
logger.warning(f"Retrying function due to {e} (attempt {i + 1}/{max_retries})")
# ignore retries due to IAM instance profile propagation
if "Invalid IAM Instance Profile name" not in str(e):
fn_name = fn.__name__ if hasattr(fn, "__name__") else "unknown function"
logger.warning(f"Retrying {fn_name} due to: {e} (attempt {i + 1}/{max_retries})")
time.sleep(backoff)
backoff = min(backoff * 2, max_backoff)