apply_fixes can parse report files from a DWYU execution log

For large workspaces, discovering the DWYU report files by crawling the bazel-out directory can be quite slow due to the enormous number of files and directories present. To work around this, we enable the apply_fixes script to parse a log file containing the command line output of executing the DWYU aspect. The DWYU report paths are then deduced from this execution log.
martis42 · May 26, 2024 · 56815ea · 56815ea
1 parent c8be082
commit 56815ea
Show file tree

Hide file tree

Showing 17 changed files with 276 additions and 67 deletions.
diff --git a/README.md b/README.md
@@ -248,6 +248,10 @@ You can see the full command line interface and more information about the scrip
 If the `apply_fixes` tool is not able to discover the report files, this can be caused by the `bazel-bin` convenience symlink at the workspace root not existing or not pointing to the output directory which was used to generate the report files.
 The tool offers options to control how the output directory is discovered.
 
+Discovering the DWYU report files automatically can take a large amount of time if the `bazel-bin` directory is too large.
+In such cases you can pipe the command line output of executing the DWYU aspect into a file and forward this file to the apply_fixes script via the `--dwyu-log-file` option.
+The apply_fixes script will then deduce the DWYU report file locations without crawling through the whole `bazel-bin` directory.
+
 Unfortunately, the tool cannot promise perfect results due to various constraints:
 
 - If alias targets are involved, this cannot be processed properly.

diff --git a/src/analyze_includes/main.py b/src/analyze_includes/main.py
@@ -99,6 +99,7 @@ def main(args: Namespace) -> int:
         system_under_inspection=system_under_inspection,
         ensure_private_deps=args.implementation_deps_available,
     )
+    result.report = args.report
 
     args.report.parent.mkdir(parents=True, exist_ok=True)
     with args.report.open(mode="w", encoding="utf-8") as report:

diff --git a/src/analyze_includes/result.py b/src/analyze_includes/result.py
@@ -6,18 +6,22 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
+    from pathlib import Path
+
     from src.analyze_includes.parse_source import Include
 
 
 @dataclass
 class Result:
     target: str
+    report: Path | None = None
+    use_impl_deps: bool = False
+
     public_includes_without_dep: list[Include] = field(default_factory=list)
     private_includes_without_dep: list[Include] = field(default_factory=list)
     unused_deps: list[str] = field(default_factory=list)
     unused_impl_deps: list[str] = field(default_factory=list)
     deps_which_should_be_private: list[str] = field(default_factory=list)
-    use_impl_deps: bool = False
 
     def is_ok(self) -> bool:
         return (
@@ -47,6 +51,9 @@ def to_str(self) -> str:
         if self.deps_which_should_be_private:
             msg += "\nPublic dependencies which are used only in private code:\n"
             msg += "\n".join(f"  Dependency='{dep}'" for dep in self.deps_which_should_be_private)
+
+        msg += f"\n\nDWYU Report: {self.report}"
+
         return self._framed_msg(msg)
 
     def to_json(self) -> str:

diff --git a/src/analyze_includes/test/result_test.py b/src/analyze_includes/test/result_test.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import unittest
 from pathlib import Path
 
@@ -7,14 +9,16 @@
 
 class TestResult(unittest.TestCase):
     @staticmethod
-    def _expected_msg(target: str, errors: str = "") -> str:
+    def _expected_msg(target: str, errors: str = "", report: str | None = None) -> str:
         border = 80 * "="
         msg = f"DWYU analyzing: '{target}'\n\n"
         if errors:
             msg += "Result: FAILURE\n\n"
+            report = f"\n\nDWYU Report: {report}\n"
         else:
             msg += "Result: SUCCESS"
-        return border + "\n" + msg + errors + "\n" + border
+            report = "\n"
+        return border + "\n" + msg + errors + report + border
 
     def test_is_ok(self) -> None:
         unit = Result("//foo:bar")
@@ -33,6 +37,43 @@ def test_is_ok(self) -> None:
   "deps_which_should_be_private": [],
   "use_implementation_deps": false
 }
+""".lstrip(),
+        )
+
+    def test_is_ok_fails_and_prints_report(self) -> None:
+        unit = Result(
+            target="//foo:bar",
+            private_includes_without_dep=[Include(file=Path("foo"), include="missing")],
+        )
+        unit.report = Path("some/report.json")
+
+        self.assertFalse(unit.is_ok())
+        self.assertEqual(
+            unit.to_str(),
+            self._expected_msg(
+                target="//foo:bar",
+                errors="Includes which are not available from the direct dependencies:"
+                "\n  File='foo', include='missing'",
+                report="some/report.json",
+            ),
+        )
+        # The report is not mentioned in the json file as it would be redundant
+        self.assertEqual(
+            unit.to_json(),
+            """
+{
+  "analyzed_target": "//foo:bar",
+  "public_includes_without_dep": {},
+  "private_includes_without_dep": {
+    "foo": [
+      "missing"
+    ]
+  },
+  "unused_deps": [],
+  "unused_implementation_deps": [],
+  "deps_which_should_be_private": [],
+  "use_implementation_deps": false
+}
 """.lstrip(),
         )
 

diff --git a/src/apply_fixes/BUILD b/src/apply_fixes/BUILD
@@ -6,6 +6,7 @@ py_library(
         "apply_fixes.py",
         "bazel_query.py",
         "buildozer_executor.py",
+        "get_dwyu_reports.py",
         "search_missing_deps.py",
         "summary.py",
         "utils.py",

diff --git a/src/apply_fixes/apply_fixes.py b/src/apply_fixes/apply_fixes.py
@@ -2,16 +2,15 @@
 
 import json
 import logging
-import shlex
-import sys
-from os import environ, walk
+from os import environ
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 from src.apply_fixes.bazel_query import BazelQuery
 from src.apply_fixes.buildozer_executor import BuildozerExecutor
+from src.apply_fixes.get_dwyu_reports import gather_reports, get_reports_search_dir
 from src.apply_fixes.search_missing_deps import search_missing_deps
-from src.apply_fixes.utils import execute_and_capture
+from src.apply_fixes.utils import args_string_to_list
 
 if TYPE_CHECKING:
     from argparse import Namespace
@@ -29,10 +28,6 @@ def __init__(self, main_args: Namespace) -> None:
         self.add_missing_deps = main_args.fix_missing_deps or main_args.fix_all
 
 
-def args_string_to_list(args: str | None) -> list[str]:
-    return shlex.split(args) if args else []
-
-
 def get_workspace(main_args: Namespace) -> Path | None:
     if main_args.workspace:
         return Path(main_args.workspace)
@@ -43,45 +38,6 @@ def get_workspace(main_args: Namespace) -> Path | None:
     return Path(workspace_root)
 
 
-def get_reports_search_dir(main_args: Namespace, workspace_root: Path) -> Path:
-    """
-    Unless a dedicated search directory is provided, try to deduce the 'bazel-bin' dir.
-    """
-    if main_args.search_path:
-        return Path(main_args.search_path)
-
-    if main_args.use_bazel_info:
-        process = execute_and_capture(
-            cmd=[
-                "bazel",
-                *args_string_to_list(main_args.bazel_startup_args),
-                "info",
-                *args_string_to_list(main_args.bazel_args),
-                "bazel-bin",
-            ],
-            cwd=workspace_root,
-        )
-        return Path(process.stdout.strip())
-
-    bazel_bin_link = workspace_root / "bazel-bin"
-    if not bazel_bin_link.is_dir():
-        logging.fatal(f"ERROR: convenience symlink '{bazel_bin_link}' does not exist or is not a symlink.")
-        sys.exit(1)
-    return bazel_bin_link.resolve()
-
-
-def gather_reports(search_path: Path) -> list[Path]:
-    """
-    We explicitly use os.walk() as it has better performance than Path.glob() in large and deeply nested file trees.
-    """
-    reports = []
-    for root, _, files in walk(search_path):
-        for file in files:
-            if file.endswith("_dwyu_report.json"):
-                reports.append(Path(root) / file)  # noqa: PERF401
-    return reports
-
-
 def add_discovered_deps(
     discovered_public_deps: list[str],
     discovered_private_deps: list[str],
@@ -160,7 +116,7 @@ def main(args: Namespace) -> int:
     reports_search_dir = get_reports_search_dir(main_args=args, workspace_root=workspace)
     logging.debug(f"Reports search directory: '{reports_search_dir}'")
 
-    reports = gather_reports(reports_search_dir)
+    reports = gather_reports(main_args=args, search_path=reports_search_dir)
     if not reports:
         logging.fatal(
             """

diff --git a/src/apply_fixes/get_dwyu_reports.py b/src/apply_fixes/get_dwyu_reports.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import logging
+import sys
+from os import walk
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from src.apply_fixes.utils import args_string_to_list, execute_and_capture
+
+if TYPE_CHECKING:
+    import argparse
+
+
+def gather_reports(main_args: argparse.Namespace, search_path: Path) -> list[Path]:
+    """
+    Collect the paths of the DWYU report files.
+
+    If 'main_args.dwyu_log_file' is set, the report paths are extracted from this DWYU execution log. For each logged
+    path everything behind the first '/bin/' is interpreted relative to 'search_path'. Logged paths without such a
+    part are skipped with a warning. Without a log file, 'search_path' is crawled recursively for files whose name
+    ends with '_dwyu_report.json'.
+
+    :param main_args: Parsed command line arguments, only 'dwyu_log_file' is read by this function
+    :param search_path: Directory below which the report files are located
+    :return: Paths of all discovered report files
+    """
+    if main_args.dwyu_log_file:
+        bin_dir_marker = "/bin/"
+        reports = []
+        for logged_report in parse_dwyu_execution_log(main_args.dwyu_log_file):
+            # partition() yields an empty tail if the marker is missing, whereas indexing the result of split()
+            # would abort the whole script with an IndexError for a single unexpected log line.
+            relative_report = logged_report.partition(bin_dir_marker)[2]
+            if not relative_report:
+                logging.warning(f"Ignoring DWYU report path without a '{bin_dir_marker}' part: '{logged_report}'")
+                continue
+            reports.append(search_path / relative_report)
+        return reports
+
+    reports = []
+    # We explicitly use os.walk() as it has better performance than Path.glob() in large and deeply nested file trees.
+    for root, _, files in walk(search_path):
+        for file in files:
+            if file.endswith("_dwyu_report.json"):
+                reports.append(Path(root) / file)  # noqa: PERF401
+    return reports
+
+
+def parse_dwyu_execution_log(log_file: Path) -> list[str]:
+    """
+    Extract the DWYU report paths from a file containing the command line output of executing the DWYU aspect.
+
+    Only lines starting with 'DWYU Report: ' are considered, all other lines are ignored.
+
+    :param log_file: File containing the DWYU execution log
+    :return: The report paths as they are written in the log, in order of appearance
+    """
+    anchor = "DWYU Report: "
+    report_paths = []
+    with log_file.open() as log:
+        for raw_line in log:
+            if not raw_line.startswith(anchor):
+                continue
+            report_paths.append(raw_line.strip().split(anchor)[1])
+    return report_paths
+
+
+def get_reports_search_dir(main_args: argparse.Namespace, workspace_root: Path) -> Path:
+    """
+    Determine the directory in which the DWYU report files are searched.
+
+    The following methods are tried in this order:
+    1. An explicitly provided 'main_args.search_path' is used as is.
+    2. If 'main_args.use_bazel_info' is set, the output of 'bazel info bazel-bin' executed in the workspace is used.
+    3. Otherwise the 'bazel-bin' convenience symlink at the workspace root is resolved. The script exits with an
+       error if it is not a directory.
+
+    :param main_args: Parsed command line arguments
+    :param workspace_root: Root directory of the Bazel workspace
+    :return: Directory expected to contain the DWYU report files
+    """
+    if main_args.search_path:
+        return Path(main_args.search_path)
+
+    if main_args.use_bazel_info:
+        bazel_info_cmd = ["bazel"]
+        bazel_info_cmd.extend(args_string_to_list(main_args.bazel_startup_args))
+        bazel_info_cmd.append("info")
+        bazel_info_cmd.extend(args_string_to_list(main_args.bazel_args))
+        bazel_info_cmd.append("bazel-bin")
+        bazel_info = execute_and_capture(cmd=bazel_info_cmd, cwd=workspace_root)
+        return Path(bazel_info.stdout.strip())
+
+    bazel_bin_link = workspace_root / "bazel-bin"
+    link_is_usable = bazel_bin_link.is_dir()
+    if not link_is_usable:
+        logging.fatal(f"ERROR: convenience symlink '{bazel_bin_link}' does not exist.")
+        sys.exit(1)
+    return bazel_bin_link.resolve()
diff --git a/src/apply_fixes/main.py b/src/apply_fixes/main.py
@@ -1,6 +1,7 @@
 import logging
 import sys
 from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter
+from pathlib import Path
 
 from src.apply_fixes.apply_fixes import main
 
@@ -79,6 +80,19 @@ def cli() -> Namespace:
         deduce the Bazel output directory containing the DWYU report files. Or if you want to search only in a sub tree
         of the Bazel output directories.""",
     )
+    parser.add_argument(
+        "--dwyu-log-file",
+        metavar="PATH",
+        type=Path,
+        help="""
+        If discovering the DWYU report files in the bazel-bin is not feasible, one can instead pipe the command line
+        output of executing the DWYU aspect into a log file and tell this script to extract the DWYU report paths from
+        this execution log. This can be helpful when your workspace is so large, that crawling the corresponding
+        'bazel-bin' directory is too slow for a satisfactory user experience. This script still has to be able to
+        discover the location of the 'bazel-bin' directory. Meaning, the 'bazel-bin' convenience symlink at the
+        workspace root should exists or if it is not available one of the following options should be used:
+        ['--use-bazel-info', '--search-path'].""",
+    )
     parser.add_argument(
         "--use-cquery",
         action="store_true",
@@ -140,6 +154,10 @@ def cli() -> Namespace:
         logging.fatal("Please choose at least one of the 'fix-..' options")
         sys.exit(1)
 
+    if args.use_bazel_info and args.search_path:
+        logging.fatal("Please choose only one options controlling the 'bazel-bin' directory discovery.")
+        sys.exit(1)
+
     return args
 
 

diff --git a/src/apply_fixes/test/BUILD b/src/apply_fixes/test/BUILD
@@ -12,6 +12,12 @@ py_test(
     deps = ["//src/apply_fixes:lib"],
 )
 
+py_test(
+    name = "get_dwyu_reports_test",
+    srcs = ["get_dwyu_reports_test.py"],
+    deps = ["//src/apply_fixes:lib"],
+)
+
 py_test(
     name = "search_missing_deps",
     srcs = ["search_missing_deps.py"],
@@ -23,3 +29,9 @@ py_test(
     srcs = ["summary_test.py"],
     deps = ["//src/apply_fixes:lib"],
 )
+
+py_test(
+    name = "utils_test",
+    srcs = ["utils_test.py"],
+    deps = ["//src/apply_fixes:lib"],
+)
diff --git a/src/apply_fixes/test/get_dwyu_reports_test.py b/src/apply_fixes/test/get_dwyu_reports_test.py
@@ -0,0 +1,35 @@
+import unittest
+from pathlib import Path
+
+from src.apply_fixes.get_dwyu_reports import parse_dwyu_execution_log
+
+
+class TestParseDwyuExecutionLog(unittest.TestCase):
+    """Tests for extracting the DWYU report paths from an execution log file."""
+
+    def _write_test_log(self, content: str) -> Path:
+        """Create the log file used as test input and make sure it is removed again after the test."""
+        test_log = Path("test_log.txt")
+        with test_log.open(mode="wt") as fp:
+            fp.write(content)
+        # Without the cleanup the file would stay behind in the working directory and leak into other tests
+        self.addCleanup(test_log.unlink)
+        return test_log
+
+    def test_parse_dwyu_execution_log(self) -> None:
+        test_log = self._write_test_log(
+            """
+Some unrelated stuff
+DWYU Report: bazel-out/opt/bin/some/target_dwyu_report.json
+ERROR: Unrelated error
+DWYU Report: bazel-out/opt/bin/root_target_dwyu_report.json
+""".strip()
+        )
+
+        logs = parse_dwyu_execution_log(test_log)
+        self.assertEqual(
+            logs, ["bazel-out/opt/bin/some/target_dwyu_report.json", "bazel-out/opt/bin/root_target_dwyu_report.json"]
+        )
+
+    def test_parse_dwyu_execution_log_empty(self) -> None:
+        test_log = self._write_test_log("")
+
+        logs = parse_dwyu_execution_log(test_log)
+        self.assertEqual(logs, [])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/apply_fixes/test/utils_test.py b/src/apply_fixes/test/utils_test.py
@@ -0,0 +1,19 @@
+import unittest
+
+from src.apply_fixes.utils import args_string_to_list
+
+
+class TestArgsStringToList(unittest.TestCase):
+    """Tests for splitting a command line argument string into a list of separate arguments."""
+
+    def test_no_args(self) -> None:
+        # Both a missing and an empty argument string yield no arguments
+        for empty_input in (None, ""):
+            self.assertEqual(args_string_to_list(empty_input), [])
+
+    def test_single_arg(self) -> None:
+        parsed = args_string_to_list("foo")
+        self.assertEqual(parsed, ["foo"])
+
+    def test_multiple_args(self) -> None:
+        expected = ["--foo", "--bar=42", "baz", "1337"]
+        self.assertEqual(args_string_to_list("--foo --bar=42 baz 1337"), expected)
+
+
+if __name__ == "__main__":
+    unittest.main()