Skip to content

Commit

Permalink
Vastly speed up provider documentation publishing (#33792)
Browse files Browse the repository at this point in the history
* Vastly speed up provider documentation publishing

Publishing documentation is quite slow when preparing multiple
provider documentation - and this is mostly because we are serializing
copying of directories and checking if the directories are present.
However, we can easily speed it up by parallelising the publishing
per-package.

This will speed up both CI and release-manager's docs publishing
step (vastly).

Co-authored-by: Elad Kalif <[email protected]>
  • Loading branch information
potiuk and eladkal authored Aug 27, 2023
1 parent c96ae2b commit 8227db3
Show file tree
Hide file tree
Showing 27 changed files with 479 additions and 342 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -643,7 +643,7 @@ jobs:
- name: "Publish docs"
run: >
breeze release-management publish-docs
--override-versioned
--override-versioned --run-in-parallel
${{ needs.build-info.outputs.docs-filter-list-as-string }}
- name: "Generate back references for providers"
run: breeze release-management add-back-references all-providers
Expand Down
3 changes: 3 additions & 0 deletions BREEZE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2068,6 +2068,9 @@ while publishing the documentation.
The flag ``--airflow-site-directory`` takes the path of the cloned ``airflow-site``. The command will
not proceed if this is an invalid path.
When you have a multi-processor machine, docs publishing can be vastly sped up by using the ``--run-in-parallel`` option when
publishing docs for multiple providers.
These are all available flags of ``release-management publish-docs`` command:
.. image:: ./images/breeze/output_release-management_publish-docs.svg
Expand Down
6 changes: 4 additions & 2 deletions dev/README_RELEASE_PROVIDER_PACKAGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,15 +382,17 @@ If you have providers as list of provider ids because you just released them, yo

- Copy the documentation to the ``airflow-site`` repository

All providers (including overriding documentation for doc-only changes):
All providers (including overriding documentation for doc-only changes) - note that publishing is
way faster on multi-cpu machines when you are publishing multiple providers:


```shell script
cd "${AIRFLOW_REPO_ROOT}"

breeze release-management publish-docs \
--package-filter apache-airflow-providers \
--package-filter 'apache-airflow-providers-*' \
--override-versioned
--override-versioned --run-in-parallel

breeze release-management add-back-references all-providers
```
Expand Down
109 changes: 90 additions & 19 deletions dev/breeze/src/airflow_breeze/commands/release_management_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
option_airflow_constraints_mode_update,
option_airflow_constraints_reference,
option_airflow_extras,
option_airflow_site_directory,
option_answer,
option_commit_sha,
option_debug_resources,
Expand Down Expand Up @@ -119,7 +120,7 @@
run_command,
run_compile_www_assets,
)
from airflow_breeze.utils.shared_options import get_dry_run, get_forced_answer
from airflow_breeze.utils.shared_options import get_dry_run, get_forced_answer, get_verbose
from airflow_breeze.utils.suspended_providers import get_suspended_provider_ids

option_debug_release_management = click.option(
Expand Down Expand Up @@ -769,30 +770,95 @@ def alias_image(image_from: str, image_to: str):
)


def run_docs_publishing(
    package_name: str,
    airflow_site_directory: str,
    override_versioned: bool,
    verbose: bool,
    output: Output | None,
) -> tuple[int, str]:
    """Publish the documentation of a single package.

    Creates a ``PublishDocsBuilder`` for ``package_name`` and calls its ``publish`` method.

    :param package_name: name of the documentation package to publish
    :param airflow_site_directory: passed to ``publish`` as ``airflow_site_dir``
    :param override_versioned: passed through to ``publish``
    :param verbose: passed to the builder
    :param output: passed to the builder; ``None`` is used by the sequential caller
    :return: ``(0, "Docs published: <package_name>")`` once ``publish`` has returned
    """
    publisher = PublishDocsBuilder(package_name=package_name, output=output, verbose=verbose)
    publisher.publish(airflow_site_dir=airflow_site_directory, override_versioned=override_versioned)
    # Reaching this point means publish() did not raise, so the return code is always 0.
    return_code = 0
    message = f"Docs published: {package_name}"
    return return_code, message


PUBLISHING_DOCS_PROGRESS_MATCHER = r"Publishing docs|Copy directory"


def run_publish_docs_in_parallel(
package_list: list[str],
airflow_site_directory: str,
override_versioned: bool,
include_success_outputs: bool,
parallelism: int,
skip_cleanup: bool,
debug_resources: bool,
):
"""Run docs publishing in parallel.

Submits one ``run_docs_publishing`` call per entry of ``package_list`` to the pool
provided by ``run_with_pool`` and hands the async results to ``check_async_run_results``.

:param package_list: names of the packages to publish; one task is submitted per name
:param airflow_site_directory: forwarded to every ``run_docs_publishing`` call
:param override_versioned: forwarded to every ``run_docs_publishing`` call
:param include_success_outputs: forwarded to ``check_async_run_results``
:param parallelism: forwarded to ``run_with_pool``
:param skip_cleanup: forwarded to ``check_async_run_results``
:param debug_resources: forwarded to ``run_with_pool``
"""
with ci_group("Publishing docs for packages"):
# One label per package, passed to run_with_pool as ``all_params``.
all_params = [f"Publishing docs {package_name}" for package_name in package_list]
with run_with_pool(
parallelism=parallelism,
all_params=all_params,
debug_resources=debug_resources,
# The regexp matches the "Publishing docs" / "Copy directory" lines.
progress_matcher=GenericRegexpProgressMatcher(
regexp=PUBLISHING_DOCS_PROGRESS_MATCHER, lines_to_search=6
),
) as (pool, outputs):
# ``outputs`` is indexed in the same order as ``package_list``: task ``index`` gets ``outputs[index]``.
results = [
pool.apply_async(
run_docs_publishing,
kwds={
"package_name": package_name,
"airflow_site_directory": airflow_site_directory,
"override_versioned": override_versioned,
"output": outputs[index],
# get_verbose() is evaluated here, at submission time, and passed on as a plain value.
"verbose": get_verbose(),
},
)
for index, package_name in enumerate(package_list)
]
# Hand the async results and their outputs over for checking.
check_async_run_results(
results=results,
success="All package documentation published.",
outputs=outputs,
include_success_outputs=include_success_outputs,
skip_cleanup=skip_cleanup,
summarize_on_ci=SummarizeAfter.NO_SUMMARY,
)


@release_management.command(
name="publish-docs",
help="Command to publish generated documentation to airflow-site",
)
@click.option("-s", "--override-versioned", help="Overrides versioned directories.", is_flag=True)
@click.option(
"-a",
"--airflow-site-directory",
envvar="AIRFLOW_SITE_DIRECTORY",
help="Local directory path of cloned airflow-site repo.",
required=True,
)
@option_airflow_site_directory
@click.option(
"--package-filter",
help="List of packages to consider.",
type=NotVerifiedBetterChoice(get_available_documentation_packages()),
multiple=True,
)
@option_run_in_parallel
@option_parallelism
@option_debug_resources
@option_include_success_outputs
@option_skip_cleanup
@option_verbose
@option_dry_run
def publish_docs(
override_versioned: bool,
airflow_site_directory: str,
package_filter: tuple[str],
run_in_parallel: bool,
parallelism: int,
debug_resources: bool,
include_success_outputs: bool,
skip_cleanup: bool,
):
"""Publishes documentation to airflow-site."""
if not os.path.isdir(airflow_site_directory):
Expand All @@ -809,23 +875,28 @@ def publish_docs(
for pkg in current_packages:
print(f" - {pkg}")
print()
for package_name in current_packages:
builder = PublishDocsBuilder(package_name=package_name)
builder.publish(override_versioned=override_versioned, airflow_site_dir=airflow_site_directory)
if run_in_parallel:
run_publish_docs_in_parallel(
package_list=current_packages,
parallelism=parallelism,
skip_cleanup=skip_cleanup,
debug_resources=debug_resources,
include_success_outputs=True,
airflow_site_directory=airflow_site_directory,
override_versioned=override_versioned,
)
else:
for package_name in current_packages:
run_docs_publishing(
package_name, airflow_site_directory, override_versioned, verbose=get_verbose(), output=None
)


@release_management.command(
name="add-back-references",
help="Command to add back references for documentation to make it backward compatible.",
)
@click.option(
"-a",
"--airflow-site-directory",
envvar="AIRFLOW_SITE_DIRECTORY",
type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
help="Local directory path of cloned airflow-site repo.",
required=True,
)
@option_airflow_site_directory
@argument_packages_plus_all_providers
@option_verbose
@option_dry_run
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,16 @@
"--airflow-site-directory",
],
},
{
"name": "Parallel running",
"options": [
"--run-in-parallel",
"--parallelism",
"--skip-cleanup",
"--debug-resources",
"--include-success-outputs",
],
},
],
"breeze release-management add-back-references": [
{
Expand Down
10 changes: 9 additions & 1 deletion dev/breeze/src/airflow_breeze/utils/common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def _set_default_from_parent(ctx: click.core.Context, option: click.core.Option,
)
option_run_in_parallel = click.option(
"--run-in-parallel",
help="Run the operation in parallel on all or selected subset of Python versions.",
help="Run the operation in parallel on all or selected subset of parameters.",
is_flag=True,
envvar="RUN_IN_PARALLEL",
)
Expand Down Expand Up @@ -607,3 +607,11 @@ def _set_default_from_parent(ctx: click.core.Context, option: click.core.Option,
help="Optional additional requirements to upgrade eagerly to avoid backtracking "
"(see `breeze ci find-backtracking-candidates`).",
)
# Shared ``-a`` / ``--airflow-site-directory`` option. It is required, can also be set
# via the AIRFLOW_SITE_DIRECTORY environment variable, and must point at an existing
# directory (not a file); the path is resolved by click.
option_airflow_site_directory = click.option(
"-a",
"--airflow-site-directory",
envvar="AIRFLOW_SITE_DIRECTORY",
type=click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True),
help="Local directory path of cloned airflow-site repo.",
required=True,
)
49 changes: 28 additions & 21 deletions dev/breeze/src/airflow_breeze/utils/publish_docs_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,26 @@
from pathlib import Path
from subprocess import run

from rich.console import Console

from airflow_breeze.global_constants import get_airflow_version
from airflow_breeze.utils.console import Output, get_console
from airflow_breeze.utils.docs_errors import DocBuildError, parse_sphinx_warnings
from airflow_breeze.utils.helm_chart_utils import chart_version
from airflow_breeze.utils.publish_docs_helpers import CONSOLE_WIDTH, load_package_data, pretty_format_path
from airflow_breeze.utils.publish_docs_helpers import load_package_data, pretty_format_path
from airflow_breeze.utils.spelling_checks import SpellingError, parse_spelling_warnings

PROCESS_TIMEOUT = 15 * 60

ROOT_PROJECT_DIR = Path(__file__).parents[5].resolve()
DOCS_DIR = os.path.join(ROOT_PROJECT_DIR, "docs")

console = Console(force_terminal=True, color_system="standard", width=CONSOLE_WIDTH)


class PublishDocsBuilder:
"""Documentation builder for Airflow Docs Publishing."""

def __init__(self, package_name: str):
def __init__(self, package_name: str, output: Output | None, verbose: bool):
self.package_name = package_name
self.output = output
self.verbose = verbose

@property
def _doctree_dir(self) -> str:
Expand Down Expand Up @@ -153,11 +152,13 @@ def check_spelling(self, verbose: bool) -> list[SpellingError]:
env = os.environ.copy()
env["AIRFLOW_PACKAGE_NAME"] = self.package_name
if verbose:
console.print(
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] Executing cmd: ",
" ".join(shlex.quote(c) for c in build_cmd),
)
console.print(f"[info]{self.package_name:60}:[/] The output is hidden until an error occurs.")
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] The output is hidden until an error occurs."
)
with open(self.log_spelling_filename, "w") as output:
completed_proc = run(
build_cmd,
Expand Down Expand Up @@ -186,14 +187,16 @@ def check_spelling(self, verbose: bool) -> list[SpellingError]:
warning_text += spelling_file.read()

spelling_errors.extend(parse_spelling_warnings(warning_text, self._src_dir))
console.print(f"[info]{self.package_name:60}:[/] [red]Finished spell-checking with errors[/]")
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] [red]Finished spell-checking with errors[/]"
)
else:
if spelling_errors:
console.print(
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] [yellow]Finished spell-checking with warnings[/]"
)
else:
console.print(
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] [green]Finished spell-checking successfully[/]"
)
return spelling_errors
Expand Down Expand Up @@ -226,12 +229,12 @@ def build_sphinx_docs(self, verbose: bool) -> list[DocBuildError]:
env = os.environ.copy()
env["AIRFLOW_PACKAGE_NAME"] = self.package_name
if verbose:
console.print(
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] Executing cmd: ",
" ".join(shlex.quote(c) for c in build_cmd),
)
else:
console.print(
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] Running sphinx. "
f"The output is hidden until an error occurs."
)
Expand Down Expand Up @@ -259,32 +262,36 @@ def build_sphinx_docs(self, verbose: bool) -> list[DocBuildError]:
warning_text = re.sub(r"\x1B[@-_][0-?]*[ -/]*[@-~]", "", warning_text)
build_errors.extend(parse_sphinx_warnings(warning_text, self._src_dir))
if build_errors:
console.print(f"[info]{self.package_name:60}:[/] [red]Finished docs building with errors[/]")
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] [red]Finished docs building with errors[/]"
)
else:
console.print(f"[info]{self.package_name:60}:[/] [green]Finished docs building successfully[/]")
get_console(output=self.output).print(
f"[info]{self.package_name:60}:[/] [green]Finished docs building successfully[/]"
)
return build_errors

def publish(self, override_versioned: bool, airflow_site_dir: str):
"""Copy documentation packages files to airflow-site repository."""
console.print(f"Publishing docs for {self.package_name}")
get_console(output=self.output).print(f"Publishing docs for {self.package_name}")
output_dir = os.path.join(airflow_site_dir, self._publish_dir)
pretty_source = pretty_format_path(self._build_dir, os.getcwd())
pretty_target = pretty_format_path(output_dir, airflow_site_dir)
console.print(f"Copy directory: {pretty_source} => {pretty_target}")
get_console(output=self.output).print(f"Copy directory: {pretty_source} => {pretty_target}")
if os.path.exists(output_dir):
if self.is_versioned:
if override_versioned:
console.print(f"Overriding previously existing {output_dir}! ")
get_console(output=self.output).print(f"Overriding previously existing {output_dir}! ")
else:
console.print(
get_console(output=self.output).print(
f"Skipping previously existing {output_dir}! "
f"Delete it manually if you want to regenerate it!"
)
console.print()
get_console(output=self.output).print()
return
shutil.rmtree(output_dir)
shutil.copytree(self._build_dir, output_dir)
if self.is_versioned:
with open(os.path.join(output_dir, "..", "stable.txt"), "w") as stable_file:
stable_file.write(self._current_version)
console.print()
get_console(output=self.output).print()
Loading

0 comments on commit 8227db3

Please sign in to comment.