Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the REUSE.toml performance regression (+ do a whole lot of refactoring) #1047

Merged
merged 10 commits into from
Sep 26, 2024
2 changes: 2 additions & 0 deletions changelog.d/changed/reuse-toml-ignored.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- If `REUSE.toml` is ignored by VCS, the linter now also ignores this file.
(#1047)
3 changes: 3 additions & 0 deletions changelog.d/fixed/performance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Performance greatly improved for projects with large directories ignored by
VCS. (#1047)
- Performance slightly improved for large projects. (#1047)
3 changes: 3 additions & 0 deletions changelog.d/fixed/source-bug.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- In some scenarios, where a user has multiple `REUSE.toml` files and one of
those files could not be parsed, the wrong `REUSE.toml` was signalled as being
unparseable. This is now fixed. (#1047)
10 changes: 2 additions & 8 deletions src/reuse/_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from ._format import INDENT, fill_all, fill_paragraph
from ._util import PathType, setup_logging
from .global_licensing import GlobalLicensingParseError
from .project import GlobalLicensingConflict, GlobalLicensingFound, Project
from .project import GlobalLicensingConflict, Project
from .vcs import find_root

_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -267,18 +267,12 @@ def main(args: Optional[List[str]] = None, out: IO[str] = sys.stdout) -> int:
project = Project.from_directory(root)
# FileNotFoundError and NotADirectoryError don't need to be caught because
# argparse already made sure of these things.
except UnicodeDecodeError:
found = cast(GlobalLicensingFound, Project.find_global_licensing(root))
main_parser.error(
_("'{path}' could not be decoded as UTF-8.").format(path=found.path)
)
except GlobalLicensingParseError as error:
found = cast(GlobalLicensingFound, Project.find_global_licensing(root))
main_parser.error(
_(
"'{path}' could not be parsed. We received the following error"
" message: {message}"
).format(path=found.path, message=str(error))
).format(path=error.source, message=str(error))
)
except GlobalLicensingConflict as error:
main_parser.error(str(error))
Expand Down
135 changes: 135 additions & 0 deletions src/reuse/covered_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. <https://fsfe.org>
#
# SPDX-License-Identifier: GPL-3.0-or-later

"""The REUSE Specification has a concept called Covered Files; files which must
contain licensing information. Some files in a project are not Covered Files,
and thus needn't contain licensing information. This module contains all that
logic.
"""

import contextlib
import logging
import os
from pathlib import Path
from typing import Collection, Generator, Optional, Set, cast

from . import (
_IGNORE_DIR_PATTERNS,
_IGNORE_FILE_PATTERNS,
_IGNORE_MESON_PARENT_DIR_PATTERNS,
)
from ._util import StrPath, is_relative_to
from .vcs import VCSStrategy

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)


def is_path_ignored(
    path: Path,
    subset_files: Optional[Collection[StrPath]] = None,
    include_submodules: bool = False,
    include_meson_subprojects: bool = False,
    vcs_strategy: Optional[VCSStrategy] = None,
) -> bool:
    """Is *path* ignored by some mechanism?

    Symlinks are always ignored. Files and directories are then checked
    against the subset filter and the module's ignore patterns; finally,
    *vcs_strategy* (if any) is asked whether it ignores the path.

    Args:
        path: The file or directory to check.
        subset_files: If not None, a file is ignored unless its resolved path
            is in this collection, and a directory is ignored unless at least
            one entry is relative to the resolved directory. Membership is
            tested with ``path.resolve()``, so entries are expected to be
            resolved paths (:func:`iter_files` prepares them this way).
        include_submodules: If False, directories that *vcs_strategy* reports
            as submodules are ignored.
        include_meson_subprojects: If False, directories whose parent
            directory name matches a Meson parent-directory pattern are
            ignored.
        vcs_strategy: Optional VCS strategy consulted for submodules and
            VCS-ignored paths.

    Returns:
        True if *path* should be ignored, False otherwise.
    """
    # pylint: disable=too-many-return-statements,too-many-branches
    name = path.name
    parent_parts = path.parent.parts
    parent_dir = parent_parts[-1] if parent_parts else ""

    if path.is_symlink():
        _LOGGER.debug("skipping symlink '%s'", path)
        return True

    if path.is_file():
        if subset_files is not None and path.resolve() not in subset_files:
            return True
        if any(pattern.match(name) for pattern in _IGNORE_FILE_PATTERNS):
            return True
        # Suppressing this error because I simply don't want to deal
        # with that here.
        with contextlib.suppress(OSError):
            if path.stat().st_size == 0:
                _LOGGER.debug("skipping 0-sized file '%s'", path)
                return True

    elif path.is_dir():
        if subset_files is not None:
            # Resolve the directory once instead of once per subset entry;
            # resolving touches the filesystem and the result is the same
            # for every entry.
            resolved_dir = path.resolve()
            if not any(
                is_relative_to(Path(file_), resolved_dir)
                for file_ in subset_files
            ):
                return True
        if any(pattern.match(name) for pattern in _IGNORE_DIR_PATTERNS):
            return True
        if not include_meson_subprojects and any(
            pattern.match(parent_dir)
            for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS
        ):
            _LOGGER.info(
                "ignoring '%s' because it is a Meson subproject", path
            )
            return True
        if (
            not include_submodules
            and vcs_strategy
            and vcs_strategy.is_submodule(path)
        ):
            _LOGGER.info("ignoring '%s' because it is a submodule", path)
            return True

    if vcs_strategy and vcs_strategy.is_ignored(path):
        return True

    return False


def iter_files(
    directory: StrPath,
    subset_files: Optional[Collection[StrPath]] = None,
    include_submodules: bool = False,
    include_meson_subprojects: bool = False,
    vcs_strategy: Optional[VCSStrategy] = None,
) -> Generator[Path, None, None]:
    """Walk *directory* recursively and yield every Covered File, as defined
    by the REUSE Specification.

    Ignored directories are pruned from the walk, so nothing beneath them is
    visited. All filtering is delegated to :func:`is_path_ignored`, which
    receives the remaining arguments unchanged (except that *subset_files* is
    first converted to a set of resolved paths).
    """
    top = Path(directory)
    if subset_files is not None:
        subset_files = cast(
            Set[Path], {Path(entry).resolve() for entry in subset_files}
        )

    def _skip(candidate: Path) -> bool:
        """Return True (and log it) if *candidate* must be ignored."""
        ignored = is_path_ignored(
            candidate,
            subset_files=subset_files,
            include_submodules=include_submodules,
            include_meson_subprojects=include_meson_subprojects,
            vcs_strategy=vcs_strategy,
        )
        if ignored:
            _LOGGER.debug("ignoring '%s'", candidate)
        return ignored

    for current_str, subdirs, filenames in os.walk(top):
        current = Path(current_str)
        _LOGGER.debug("currently walking in '%s'", current)

        # Prune in place so that os.walk does not descend into ignored
        # directories.
        subdirs[:] = [sub for sub in subdirs if not _skip(current / sub)]

        for filename in filenames:
            candidate = current / filename
            if _skip(candidate):
                continue

            _LOGGER.debug("yielding '%s'", candidate)
            yield candidate
Loading
Loading