Skip to content

Commit

Permalink
Add automated retries on retryable condition for building images in CI (
Browse files Browse the repository at this point in the history
#24006)

Pushing cache images to ghcr.io is flaky, so we add automated
retries for image builds that fail intermittently.

The root cause of the problem is tracked in containerd:
containerd/containerd#5978
  • Loading branch information
potiuk authored May 29, 2022
1 parent ae343fa commit 7cf0e43
Show file tree
Hide file tree
Showing 9 changed files with 252 additions and 174 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
run: >
breeze build-image
--prepare-buildx-cache
--max-retries 3
--platform linux/amd64,linux/arm64
env:
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}
Expand Down Expand Up @@ -1722,6 +1723,7 @@ ${{ hashFiles('.pre-commit-config.yaml') }}"
--install-packages-from-context
--prepare-buildx-cache
--disable-airflow-repo-cache
--max-retries 3
--platform linux/amd64,linux/arm64
env:
PYTHON_MAJOR_MINOR_VERSION: ${{ matrix.python-version }}
Expand Down
40 changes: 33 additions & 7 deletions dev/breeze/src/airflow_breeze/commands/ci_image_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import sys
from pathlib import Path
from subprocess import CompletedProcess
from typing import List, Optional, Tuple, Union

import click
Expand Down Expand Up @@ -50,6 +51,7 @@
option_image_name,
option_image_tag,
option_install_providers_from_sources,
option_max_retries,
option_parallelism,
option_platform,
option_prepare_buildx_cache,
Expand Down Expand Up @@ -87,6 +89,7 @@
instruct_build_image,
is_repo_rebased,
run_command,
run_result_contains,
)

CI_IMAGE_TOOLS_COMMANDS = {
Expand All @@ -110,6 +113,7 @@
"--tag-as-latest",
"--docker-cache",
"--force-build",
"--max-retries",
],
},
{
Expand Down Expand Up @@ -202,6 +206,7 @@
@option_docker_cache
@option_image_tag
@option_prepare_buildx_cache
@option_max_retries
@option_push_image
@option_empty_image
@option_install_providers_from_sources
Expand Down Expand Up @@ -477,15 +482,34 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
)
else:
get_console().print(f"\n[info]Building CI Image for Python {ci_image_params.python}\n")
build_command_result = run_command(
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, text=True, check=False
)
if build_command_result.returncode == 0:
if ci_image_params.prepare_buildx_cache:
num_tries = 1 if ci_image_params.max_retries is None else ci_image_params.max_retries
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
while num_tries > 0:
build_command_result = run_command(
cmd,
verbose=verbose,
dry_run=dry_run,
cwd=AIRFLOW_SOURCES_ROOT,
check=False,
text=True,
capture_output=True,
)
if ci_image_params.prepare_buildx_cache and build_command_result.returncode == 0:
build_command_result = build_cache(
image_params=ci_image_params, dry_run=dry_run, verbose=verbose
)

if build_command_result.returncode == 0:
break
num_tries -= 1
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
if num_tries > 0:
get_console().print(
"[info]Retrying failed command on retryable condition. "
f"There are {num_tries} left[/]"
)
continue
else:
break
if not ci_image_params.prepare_buildx_cache:
if not dry_run:
if build_command_result.returncode == 0:
Expand All @@ -504,7 +528,9 @@ def build_ci_image(verbose: bool, dry_run: bool, ci_image_params: BuildCiParams)
f"Image build: {ci_image_params.python}",
)
else:
get_console().print("[info]Not updating build cache because we are in `dry_run` mode.[/]")
get_console().print(
"[info]Not tagging/marking image as refreshed because we are in `dry_run` mode.[/]"
)
return build_command_result.returncode, f"Image build: {ci_image_params.python}"


Expand Down
53 changes: 41 additions & 12 deletions dev/breeze/src/airflow_breeze/commands/production_image_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import contextlib
import os
import sys
from subprocess import CompletedProcess
from typing import Optional, Tuple

import click
Expand Down Expand Up @@ -49,6 +50,7 @@
option_image_name,
option_image_tag,
option_install_providers_from_sources,
option_max_retries,
option_parallelism,
option_platform,
option_prepare_buildx_cache,
Expand Down Expand Up @@ -77,7 +79,12 @@
from airflow_breeze.utils.python_versions import get_python_version_list
from airflow_breeze.utils.registry import login_to_github_docker_registry
from airflow_breeze.utils.run_tests import verify_an_image
from airflow_breeze.utils.run_utils import filter_out_none, fix_group_permissions, run_command
from airflow_breeze.utils.run_utils import (
filter_out_none,
fix_group_permissions,
run_command,
run_result_contains,
)

PRODUCTION_IMAGE_TOOLS_COMMANDS = {
"name": "Production Image tools",
Expand All @@ -99,6 +106,7 @@
"--image-tag",
"--tag-as-latest",
"--docker-cache",
"--max-retries",
],
},
{
Expand Down Expand Up @@ -206,6 +214,7 @@
@option_docker_cache
@option_image_tag
@option_prepare_buildx_cache
@option_max_retries
@option_push_image
@option_empty_image
@option_airflow_constraints_mode_prod
Expand Down Expand Up @@ -517,16 +526,36 @@ def build_production_image(
image_params=prod_image_params,
verbose=verbose,
)
build_command_result = run_command(
cmd, verbose=verbose, dry_run=dry_run, cwd=AIRFLOW_SOURCES_ROOT, check=False, text=True
)
if build_command_result.returncode == 0:
if prod_image_params.prepare_buildx_cache:
build_command_result = build_cache(
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
)
num_tries = 1 if prod_image_params.max_retries is None else prod_image_params.max_retries
build_command_result = CompletedProcess(args=[], returncode=1, stdout="This should never happen.")
while num_tries > 0:
build_command_result = run_command(
cmd,
verbose=verbose,
dry_run=dry_run,
cwd=AIRFLOW_SOURCES_ROOT,
check=False,
text=True,
capture_output=True,
)
if build_command_result.returncode == 0:
if prod_image_params.prepare_buildx_cache:
build_command_result = build_cache(
image_params=prod_image_params, dry_run=dry_run, verbose=verbose
)
else:
if prod_image_params.tag_as_latest:
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)
if build_command_result.returncode == 0:
break
num_tries -= 1
if run_result_contains(build_command_result, "cannot reuse body, request must be retried"):
if num_tries > 0:
get_console().print(
"[info]Retrying failed command on retryable condition. "
f"There are {num_tries} left[/]"
)
continue
else:
if prod_image_params.tag_as_latest:
build_command_result = tag_image_as_latest(prod_image_params, dry_run, verbose)

break
return build_command_result.returncode, f"Image build: {prod_image_params.python}"
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class _CommonBuildParams:
github_token: str = os.environ.get('GITHUB_TOKEN', "")
github_username: str = ""
image_tag: Optional[str] = None
max_retries: Optional[int] = None
install_providers_from_sources: bool = False
platform: str = f"linux/{os.uname().machine}"
prepare_buildx_cache: bool = False
Expand Down
6 changes: 6 additions & 0 deletions dev/breeze/src/airflow_breeze/utils/common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,12 @@
is_flag=True,
envvar='PREPARE_BUILDX_CACHE',
)
# Reusable Click option ``--max-retries``; the value can also be supplied via
# the MAX_RETRIES environment variable. Only integers >= 2 are accepted.
# NOTE(review): the image build loops appear to use this value as the total
# number of attempts (the first run included), not as the number of *extra*
# attempts - confirm that the help text matches the intended meaning.
option_max_retries = click.option(
'--max-retries',
help='Maximum number of retries for the operation for "retryable" intermittent problems.',
type=click.IntRange(min=2),
envvar='MAX_RETRIES',
)
option_push_image = click.option(
'--push-image',
help='Push image after building it.',
Expand Down
8 changes: 8 additions & 0 deletions dev/breeze/src/airflow_breeze/utils/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,3 +405,11 @@ def get_runnable_ci_image(verbose: bool, dry_run: bool) -> str:
instruction=f"breeze build-image --python {python_version}",
)
return airflow_image


def run_result_contains(result: RunCommandResult, message: str) -> bool:
    """Tell whether ``message`` occurs in the captured output of a finished command.

    Standard output is searched first, then standard error. A stream that was
    not captured (``None``) or that is empty never matches.

    :param result: finished command result exposing ``stdout`` and ``stderr``.
    :param message: text to look for in the captured output.
    :return: True when ``message`` is found in either stream, False otherwise.
    """
    for stream_name in ("stdout", "stderr"):
        captured = getattr(result, stream_name)
        if not captured:
            continue
        if isinstance(captured, bytes):
            # subprocess captures bytes unless text mode was requested, and
            # ``str in bytes`` raises TypeError - decode before searching.
            captured = captured.decode(errors="replace")
        if message in captured:
            return True
    return False
Loading

0 comments on commit 7cf0e43

Please sign in to comment.