From d9a2b5120532704c5ce641129a191149cf1df17b Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 14:41:58 +0200 Subject: [PATCH 01/10] Remove bidirectional aggregation between VCSStrategy and Project Hooray! This had always been bad design. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/project.py | 12 ++++++------ src/reuse/report.py | 2 +- src/reuse/vcs.py | 46 ++++++++++++++++++++------------------------ 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/src/reuse/project.py b/src/reuse/project.py index 8ab6d37c4..f21b86b5f 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -80,7 +80,7 @@ class Project: def __init__( self, root: StrPath, - vcs_strategy: Optional[Type[VCSStrategy]] = None, + vcs_strategy: Optional[VCSStrategy] = None, license_map: Optional[Dict[str, Dict]] = None, licenses: Optional[Dict[str, Path]] = None, global_licensing: Optional[GlobalLicensing] = None, @@ -90,8 +90,8 @@ def __init__( self.root = Path(root) if vcs_strategy is None: - vcs_strategy = VCSStrategyNone - self.vcs_strategy = vcs_strategy(self) + vcs_strategy = VCSStrategyNone(root) + self.vcs_strategy = vcs_strategy if license_map is None: license_map = LICENSE_MAP @@ -530,13 +530,13 @@ def _find_licenses(self) -> Dict[str, Path]: return license_files @classmethod - def _detect_vcs_strategy(cls, root: StrPath) -> Type[VCSStrategy]: + def _detect_vcs_strategy(cls, root: StrPath) -> VCSStrategy: """For each supported VCS, check if the software is available and if the directory is a repository. If not, return :class:`VCSStrategyNone`. """ for strategy in all_vcs_strategies(): if strategy.EXE and strategy.in_repo(root): - return strategy + return strategy(root) _LOGGER.info( _( @@ -544,4 +544,4 @@ def _detect_vcs_strategy(cls, root: StrPath) -> Type[VCSStrategy]: " software is not installed" ).format(root) ) - return VCSStrategyNone + return VCSStrategyNone(root) diff --git a/src/reuse/report.py b/src/reuse/report.py index e9267b280..70032d0b3 100644 --- a/src/reuse/report.py +++ b/src/reuse/report.py @@ -59,7 +59,7 @@ def __init__( # pickled. new_project = Project( project.root, - vcs_strategy=project.vcs_strategy.__class__, + vcs_strategy=project.vcs_strategy, license_map=project.license_map, licenses=project.licenses.copy(), # TODO: adjust this method/class to account for REUSE.toml as diff --git a/src/reuse/vcs.py b/src/reuse/vcs.py index 924703071..daca12da8 100644 --- a/src/reuse/vcs.py +++ b/src/reuse/vcs.py @@ -24,6 +24,7 @@ PIJUL_EXE, StrPath, execute_command, + relative_from_root, ) if TYPE_CHECKING: @@ -37,9 +38,8 @@ class VCSStrategy(ABC): EXE: str | None = None - @abstractmethod - def __init__(self, project: Project): - self.project = project + def __init__(self, root: StrPath): + self.root = Path(root) @abstractmethod def is_ignored(self, path: StrPath) -> bool: @@ -72,10 +72,6 @@ def find_root(cls, cwd: Optional[StrPath] = None) -> Optional[Path]: class VCSStrategyNone(VCSStrategy): """Strategy that is used when there is no VCS.""" - def __init__(self, project: Project): - # pylint: disable=useless-super-delegation - super().__init__(project) - def is_ignored(self, path: StrPath) -> bool: return False @@ -96,8 +92,8 @@ class VCSStrategyGit(VCSStrategy): EXE = GIT_EXE - def __init__(self, project: Project): - super().__init__(project) + def __init__(self, root: StrPath): + super().__init__(root) if not self.EXE: raise FileNotFoundError("Could not find binary for Git") self._all_ignored_files = self._find_all_ignored_files() @@ -121,7 +117,7 @@ def _find_all_ignored_files(self) -> Set[Path]: # Separate output with \0 instead of \n. "-z", ] - result = execute_command(command, _LOGGER, cwd=self.project.root) + result = execute_command(command, _LOGGER, cwd=self.root) all_files = result.stdout.decode("utf-8").split("\0") return {Path(file_) for file_ in all_files} @@ -135,7 +131,7 @@ def _find_submodules(self) -> Set[Path]: "--get-regexp", r"\.path$", ] - result = execute_command(command, _LOGGER, cwd=self.project.root) + result = execute_command(command, _LOGGER, cwd=self.root) # The final element may be an empty string. Filter it. submodule_entries = [ entry @@ -146,12 +142,12 @@ def _find_submodules(self) -> Set[Path]: return {Path(entry.splitlines()[1]) for entry in submodule_entries} def is_ignored(self, path: StrPath) -> bool: - path = self.project.relative_from_root(path) + path = relative_from_root(path, self.root) return path in self._all_ignored_files def is_submodule(self, path: StrPath) -> bool: return any( - self.project.relative_from_root(path).resolve() + relative_from_root(path, self.root).resolve() == submodule_path.resolve() for submodule_path in self._submodules ) @@ -189,8 +185,8 @@ class VCSStrategyHg(VCSStrategy): EXE = HG_EXE - def __init__(self, project: Project): - super().__init__(project) + def __init__(self, root: StrPath): + super().__init__(root) if not self.EXE: raise FileNotFoundError("Could not find binary for Mercurial") self._all_ignored_files = self._find_all_ignored_files() @@ -211,12 +207,12 @@ def _find_all_ignored_files(self) -> Set[Path]: "--no-status", "--print0", ] - result = execute_command(command, _LOGGER, cwd=self.project.root) + result = execute_command(command, _LOGGER, cwd=self.root) all_files = result.stdout.decode("utf-8").split("\0") return {Path(file_) for file_ in all_files} def is_ignored(self, path: StrPath) -> bool: - path = self.project.relative_from_root(path) + path = relative_from_root(path, self.root) return path in self._all_ignored_files def is_submodule(self, path: StrPath) -> bool: @@ -256,8 +252,8 @@ class VCSStrategyJujutsu(VCSStrategy): EXE = JUJUTSU_EXE - def __init__(self, project: Project): - super().__init__(project) + def __init__(self, root: StrPath): + super().__init__(root) if not self.EXE: raise FileNotFoundError("Could not find binary for Jujutsu") self._all_tracked_files = self._find_all_tracked_files() @@ -267,12 +263,12 @@ def _find_all_tracked_files(self) -> Set[Path]: Return a set of all files tracked in the current jj revision """ command = [str(self.EXE), "files"] - result = execute_command(command, _LOGGER, cwd=self.project.root) + result = execute_command(command, _LOGGER, cwd=self.root) all_files = result.stdout.decode("utf-8").split("\n") return {Path(file_) for file_ in all_files if file_} def is_ignored(self, path: StrPath) -> bool: - path = self.project.relative_from_root(path) + path = relative_from_root(path, self.root) for tracked in self._all_tracked_files: if tracked.parts[: len(path.parts)] == path.parts: @@ -321,8 +317,8 @@ class VCSStrategyPijul(VCSStrategy): EXE = PIJUL_EXE - def __init__(self, project: Project): - super().__init__(project) + def __init__(self, root: StrPath): + super().__init__(root) if not self.EXE: raise FileNotFoundError("Could not find binary for Pijul") self._all_tracked_files = self._find_all_tracked_files() @@ -330,12 +326,12 @@ def __init__(self, project: Project): def _find_all_tracked_files(self) -> Set[Path]: """Return a set of all files tracked by pijul.""" command = [str(self.EXE), "list"] - result = execute_command(command, _LOGGER, cwd=self.project.root) + result = execute_command(command, _LOGGER, cwd=self.root) all_files = result.stdout.decode("utf-8").splitlines() return {Path(file_) for file_ in all_files} def is_ignored(self, path: StrPath) -> bool: - path = self.project.relative_from_root(path) + path = relative_from_root(path, self.root) return path not in self._all_tracked_files def is_submodule(self, path: StrPath) -> bool: From 90369041b54c9ce56b549c2b2ab762ff4ec29f10 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 15:28:49 +0200 Subject: [PATCH 02/10] Refactor iter_files out of Project This is kind of annoying, but it is necessary for some work I want to do in global_licensing. Fortunately this was surprisingly easy; iter_files is the first function ever written for REUSE, so I expected that it would be more tangled up. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/project.py | 217 +++++++++++++++++++++++++------------------ 1 file changed, 127 insertions(+), 90 deletions(-) diff --git a/src/reuse/project.py b/src/reuse/project.py index f21b86b5f..3de00d302 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -22,6 +22,7 @@ Collection, DefaultDict, Dict, + Generator, Iterator, List, NamedTuple, @@ -170,71 +171,6 @@ def from_directory( return project - def _iter_files( - self, - directory: Optional[StrPath] = None, - subset_files: Optional[Collection[StrPath]] = None, - ) -> Iterator[Path]: - # pylint: disable=too-many-branches - if directory is None: - directory = self.root - directory = Path(directory) - if subset_files is not None: - subset_files = cast( - Set[Path], {Path(file_).resolve() for file_ in subset_files} - ) - - for root_str, dirs, files in os.walk(directory): - root = Path(root_str) - _LOGGER.debug("currently walking in '%s'", root) - - # Don't walk ignored directories - for dir_ in list(dirs): - the_dir = root / dir_ - if subset_files is not None and not any( - is_relative_to(file_, the_dir.resolve()) - for file_ in subset_files - ): - continue - if self._is_path_ignored(the_dir): - _LOGGER.debug("ignoring '%s'", the_dir) - dirs.remove(dir_) - elif the_dir.is_symlink(): - _LOGGER.debug("skipping symlink '%s'", the_dir) - dirs.remove(dir_) - elif ( - not self.include_submodules - and self.vcs_strategy.is_submodule(the_dir) - ): - _LOGGER.info( - "ignoring '%s' because it is a submodule", the_dir - ) - dirs.remove(dir_) - - # Filter files. - for file_ in files: - the_file = root / file_ - if ( - subset_files is not None - and the_file.resolve() not in subset_files - ): - continue - if self._is_path_ignored(the_file): - _LOGGER.debug("ignoring '%s'", the_file) - continue - if the_file.is_symlink(): - _LOGGER.debug("skipping symlink '%s'", the_file) - continue - # Suppressing this error because I simply don't want to deal - # with that here. - with contextlib.suppress(OSError): - if the_file.stat().st_size == 0: - _LOGGER.debug("skipping 0-sized file '%s'", the_file) - continue - - _LOGGER.debug("yielding '%s'", the_file) - yield the_file - def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]: """Yield all files in *directory* and its subdirectories. @@ -255,7 +191,14 @@ def all_files(self, directory: Optional[StrPath] = None) -> Iterator[Path]: Args: directory: The directory in which to search. """ - return self._iter_files(directory=directory) + if directory is None: + directory = self.root + return iter_files( + directory, + include_submodules=self.include_submodules, + include_meson_subprojects=self.include_meson_subprojects, + vcs_strategy=self.vcs_strategy, + ) def subset_files( self, files: Collection[StrPath], directory: Optional[StrPath] = None @@ -269,7 +212,15 @@ def subset_files( yielded. directory: The directory in which to search. """ - return self._iter_files(directory=directory, subset_files=files) + if directory is None: + directory = self.root + return iter_files( + directory=directory, + subset_files=files, + include_submodules=self.include_submodules, + include_meson_subprojects=self.include_meson_subprojects, + vcs_strategy=self.vcs_strategy, + ) def reuse_info_of(self, path: StrPath) -> List[ReuseInfo]: """Return REUSE info of *path*. @@ -413,29 +364,6 @@ def find_global_licensing( candidate = GlobalLicensingFound(root, NestedReuseTOML) return candidate - def _is_path_ignored(self, path: Path) -> bool: - """Is *path* ignored by some mechanism?""" - name = path.name - parent_parts = path.parent.parts - parent_dir = parent_parts[-1] if len(parent_parts) > 0 else "" - if path.is_file(): - for pattern in _IGNORE_FILE_PATTERNS: - if pattern.match(name): - return True - elif path.is_dir(): - for pattern in _IGNORE_DIR_PATTERNS: - if pattern.match(name): - return True - if not self.include_meson_subprojects: - for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS: - if pattern.match(parent_dir): - return True - - if self.vcs_strategy.is_ignored(path): - return True - - return False - def _identifier_of_license(self, path: Path) -> str: """Figure out the SPDX License identifier of a license given its path. The name of the path (minus its extension) should be a valid SPDX @@ -545,3 +473,112 @@ def _detect_vcs_strategy(cls, root: StrPath) -> VCSStrategy: ).format(root) ) return VCSStrategyNone(root) + + +def is_path_ignored( + path: Path, + subset_files: Optional[Collection[StrPath]] = None, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, +) -> bool: + """Is *path* ignored by some mechanism?""" + # pylint: disable=too-many-return-statements,too-many-branches + name = path.name + parent_parts = path.parent.parts + parent_dir = parent_parts[-1] if len(parent_parts) > 0 else "" + + if path.is_symlink(): + _LOGGER.debug("skipping symlink '%s'", path) + return True + + if path.is_file(): + if subset_files is not None and path.resolve() not in subset_files: + continue + for pattern in _IGNORE_FILE_PATTERNS: + if pattern.match(name): + return True + # Suppressing this error because I simply don't want to deal + # with that here. + with contextlib.suppress(OSError): + if path.stat().st_size == 0: + _LOGGER.debug("skipping 0-sized file '%s'", path) + return True + + elif path.is_dir(): + if subset_files is not None and not any( + is_relative_to(file_, path.resolve()) for file_ in subset_files + ): + return True + for pattern in _IGNORE_DIR_PATTERNS: + if pattern.match(name): + return True + if not include_meson_subprojects: + for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS: + if pattern.match(parent_dir): + _LOGGER.info( + "ignoring '%s' because it is a Meson subproject", path + ) + return True + if ( + not include_submodules + and vcs_strategy + and vcs_strategy.is_submodule(path) + ): + _LOGGER.info("ignoring '%s' because it is a submodule", path) + return True + + if vcs_strategy and vcs_strategy.is_ignored(path): + return True + + return False + + +def iter_files( + directory: StrPath, + subset_files: Optional[Collection[StrPath]] = None, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, +) -> Generator[Path, None, None]: + """Yield all Covered Files in *directory* and its subdirectories according + to the REUSE Specification. + """ + directory = Path(directory) + if subset_files is not None: + subset_files = cast( + Set[Path], {Path(file_).resolve() for file_ in subset_files} + ) + + for root_str, dirs, files in os.walk(directory): + root = Path(root_str) + _LOGGER.debug("currently walking in '%s'", root) + + # Don't walk ignored directories + for dir_ in list(dirs): + the_dir = root / dir_ + if is_path_ignored( + the_dir, + subset_files=subset_files, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ): + _LOGGER.debug("ignoring '%s'", the_dir) + dirs.remove(dir_) + + # Filter files. + for file_ in files: + the_file = root / file_ + if is_path_ignored( + the_file, + subset_files=subset_files, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ): + _LOGGER.debug("ignoring '%s'", the_file) + continue + + _LOGGER.debug("yielding '%s'", the_file) + yield the_file From 3b25418d58f0c10e0ef65256c21fe6b15c998e73 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 17:23:35 +0200 Subject: [PATCH 03/10] Factor out iter_files into its own module covered_files Signed-off-by: Carmen Bianca BAKKER --- src/reuse/covered_files.py | 135 +++++++++++++ src/reuse/project.py | 118 +---------- tests/conftest.py | 2 + tests/test_covered_files.py | 350 ++++++++++++++++++++++++++++++++ tests/test_project.py | 385 ++++++------------------------------ 5 files changed, 546 insertions(+), 444 deletions(-) create mode 100644 src/reuse/covered_files.py create mode 100644 tests/test_covered_files.py diff --git a/src/reuse/covered_files.py b/src/reuse/covered_files.py new file mode 100644 index 000000000..01db5eaa4 --- /dev/null +++ b/src/reuse/covered_files.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""The REUSE Specification has a concept called Covered Files; files which must +contain licensing information. Some files in a project are not Covered Files, +and thus needn't contain licensing information. This module contains all that +logic. +""" + +import contextlib +import logging +import os +from pathlib import Path +from typing import Collection, Generator, Optional, Set, cast + +from . import ( + _IGNORE_DIR_PATTERNS, + _IGNORE_FILE_PATTERNS, + _IGNORE_MESON_PARENT_DIR_PATTERNS, +) +from ._util import StrPath, is_relative_to +from .vcs import VCSStrategy + +_LOGGER = logging.getLogger(__name__) + + +def is_path_ignored( + path: Path, + subset_files: Optional[Collection[StrPath]] = None, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, +) -> bool: + """Is *path* ignored by some mechanism?""" + # pylint: disable=too-many-return-statements,too-many-branches + name = path.name + parent_parts = path.parent.parts + parent_dir = parent_parts[-1] if len(parent_parts) > 0 else "" + + if path.is_symlink(): + _LOGGER.debug("skipping symlink '%s'", path) + return True + + if path.is_file(): + if subset_files is not None and path.resolve() not in subset_files: + return True + for pattern in _IGNORE_FILE_PATTERNS: + if pattern.match(name): + return True + # Suppressing this error because I simply don't want to deal + # with that here. + with contextlib.suppress(OSError): + if path.stat().st_size == 0: + _LOGGER.debug("skipping 0-sized file '%s'", path) + return True + + elif path.is_dir(): + if subset_files is not None and not any( + is_relative_to(Path(file_), path.resolve()) + for file_ in subset_files + ): + return True + for pattern in _IGNORE_DIR_PATTERNS: + if pattern.match(name): + return True + if not include_meson_subprojects: + for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS: + if pattern.match(parent_dir): + _LOGGER.info( + "ignoring '%s' because it is a Meson subproject", path + ) + return True + if ( + not include_submodules + and vcs_strategy + and vcs_strategy.is_submodule(path) + ): + _LOGGER.info("ignoring '%s' because it is a submodule", path) + return True + + if vcs_strategy and vcs_strategy.is_ignored(path): + return True + + return False + + +def iter_files( + directory: StrPath, + subset_files: Optional[Collection[StrPath]] = None, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, +) -> Generator[Path, None, None]: + """Yield all Covered Files in *directory* and its subdirectories according + to the REUSE Specification. + """ + directory = Path(directory) + if subset_files is not None: + subset_files = cast( + Set[Path], {Path(file_).resolve() for file_ in subset_files} + ) + + for root_str, dirs, files in os.walk(directory): + root = Path(root_str) + _LOGGER.debug("currently walking in '%s'", root) + + # Don't walk ignored directories + for dir_ in list(dirs): + the_dir = root / dir_ + if is_path_ignored( + the_dir, + subset_files=subset_files, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ): + _LOGGER.debug("ignoring '%s'", the_dir) + dirs.remove(dir_) + + # Filter files. + for file_ in files: + the_file = root / file_ + if is_path_ignored( + the_file, + subset_files=subset_files, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ): + _LOGGER.debug("ignoring '%s'", the_file) + continue + + _LOGGER.debug("yielding '%s'", the_file) + yield the_file diff --git a/src/reuse/project.py b/src/reuse/project.py index 3de00d302..f5a92796d 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -34,13 +34,7 @@ from binaryornot.check import is_binary -from . import ( - _IGNORE_DIR_PATTERNS, - _IGNORE_FILE_PATTERNS, - _IGNORE_MESON_PARENT_DIR_PATTERNS, - IdentifierNotFound, - ReuseInfo, -) +from . import IdentifierNotFound, ReuseInfo from ._licenses import EXCEPTION_MAP, LICENSE_MAP from ._util import ( _LICENSEREF_PATTERN, @@ -50,6 +44,7 @@ relative_from_root, reuse_info_of_file, ) +from .covered_files import iter_files from .global_licensing import ( GlobalLicensing, NestedReuseTOML, @@ -473,112 +468,3 @@ def _detect_vcs_strategy(cls, root: StrPath) -> VCSStrategy: ).format(root) ) return VCSStrategyNone(root) - - -def is_path_ignored( - path: Path, - subset_files: Optional[Collection[StrPath]] = None, - include_submodules: bool = False, - include_meson_subprojects: bool = False, - vcs_strategy: Optional[VCSStrategy] = None, -) -> bool: - """Is *path* ignored by some mechanism?""" - # pylint: disable=too-many-return-statements,too-many-branches - name = path.name - parent_parts = path.parent.parts - parent_dir = parent_parts[-1] if len(parent_parts) > 0 else "" - - if path.is_symlink(): - _LOGGER.debug("skipping symlink '%s'", path) - return True - - if path.is_file(): - if subset_files is not None and path.resolve() not in subset_files: - continue - for pattern in _IGNORE_FILE_PATTERNS: - if pattern.match(name): - return True - # Suppressing this error because I simply don't want to deal - # with that here. - with contextlib.suppress(OSError): - if path.stat().st_size == 0: - _LOGGER.debug("skipping 0-sized file '%s'", path) - return True - - elif path.is_dir(): - if subset_files is not None and not any( - is_relative_to(file_, path.resolve()) for file_ in subset_files - ): - return True - for pattern in _IGNORE_DIR_PATTERNS: - if pattern.match(name): - return True - if not include_meson_subprojects: - for pattern in _IGNORE_MESON_PARENT_DIR_PATTERNS: - if pattern.match(parent_dir): - _LOGGER.info( - "ignoring '%s' because it is a Meson subproject", path - ) - return True - if ( - not include_submodules - and vcs_strategy - and vcs_strategy.is_submodule(path) - ): - _LOGGER.info("ignoring '%s' because it is a submodule", path) - return True - - if vcs_strategy and vcs_strategy.is_ignored(path): - return True - - return False - - -def iter_files( - directory: StrPath, - subset_files: Optional[Collection[StrPath]] = None, - include_submodules: bool = False, - include_meson_subprojects: bool = False, - vcs_strategy: Optional[VCSStrategy] = None, -) -> Generator[Path, None, None]: - """Yield all Covered Files in *directory* and its subdirectories according - to the REUSE Specification. - """ - directory = Path(directory) - if subset_files is not None: - subset_files = cast( - Set[Path], {Path(file_).resolve() for file_ in subset_files} - ) - - for root_str, dirs, files in os.walk(directory): - root = Path(root_str) - _LOGGER.debug("currently walking in '%s'", root) - - # Don't walk ignored directories - for dir_ in list(dirs): - the_dir = root / dir_ - if is_path_ignored( - the_dir, - subset_files=subset_files, - include_submodules=include_submodules, - include_meson_subprojects=include_meson_subprojects, - vcs_strategy=vcs_strategy, - ): - _LOGGER.debug("ignoring '%s'", the_dir) - dirs.remove(dir_) - - # Filter files. - for file_ in files: - the_file = root / file_ - if is_path_ignored( - the_file, - subset_files=subset_files, - include_submodules=include_submodules, - include_meson_subprojects=include_meson_subprojects, - vcs_strategy=vcs_strategy, - ): - _LOGGER.debug("ignoring '%s'", the_file) - continue - - _LOGGER.debug("yielding '%s'", the_file) - yield the_file diff --git a/tests/conftest.py b/tests/conftest.py index 7b95763c8..40433342e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -66,6 +66,8 @@ sys.implementation.name != "cpython", reason="only CPython supported" ) git = pytest.mark.skipif(not GIT_EXE, reason="requires git") +hg = pytest.mark.skipif(not HG_EXE, reason="requires mercurial") +pijul = pytest.mark.skipif(not PIJUL_EXE, reason="requires pijul") no_root = pytest.mark.xfail(is_root, reason="fails when user is root") posix = pytest.mark.skipif(not is_posix, reason="Windows not supported") diff --git a/tests/test_covered_files.py b/tests/test_covered_files.py new file mode 100644 index 000000000..a60c1a2c0 --- /dev/null +++ b/tests/test_covered_files.py @@ -0,0 +1,350 @@ +# SPDX-FileCopyrightText: 2017 Free Software Foundation Europe e.V. +# SPDX-FileCopyrightText: 2022 Florian Snow +# SPDX-FileCopyrightText: 2023 Carmen Bianca BAKKER +# SPDX-FileCopyrightText: © 2020 Liferay, Inc. +# +# SPDX-License-Identifier: GPL-3.0-or-later + +"""Tests for reuse.covered_files.""" + +import os +from pathlib import Path + +from conftest import git, hg, pijul, posix + +from reuse.covered_files import iter_files +from reuse.vcs import VCSStrategyGit, VCSStrategyHg, VCSStrategyPijul + + +class TestIterFiles: + """Test the iter_files function.""" + + def test_simple(self, empty_directory): + """Given a directory with some files, yield all files.""" + (empty_directory / "foo").write_text("foo") + (empty_directory / "bar").write_text("foo") + + assert {file_.name for file_ in iter_files(empty_directory)} == { + "foo", + "bar", + } + + def test_ignore_dot_license(self, empty_directory): + """When file and file.license are present, only yield file.""" + (empty_directory / "foo").write_text("foo") + (empty_directory / "foo.license").write_text("foo") + + assert {file_.name for file_ in iter_files(empty_directory)} == {"foo"} + + def test_ignore_cal_license(self, empty_directory): + """CAL licenses contain SPDX tags referencing themselves. They should be + skipped. + """ + (empty_directory / "CAL-1.0").write_text("foo") + (empty_directory / "CAL-1.0.txt").write_text("foo") + (empty_directory / "CAL-1.0-Combined-Work-Exception").write_text("foo") + (empty_directory / "CAL-1.0-Combined-Work-Exception.txt").write_text( + "foo" + ) + + assert not list(iter_files(empty_directory)) + + def test_ignore_shl_license(self, empty_directory): + """SHL-2.1 contains an SPDX tag referencing itself. It should be + skipped. + """ + (empty_directory / "SHL-2.1").write_text("foo") + (empty_directory / "SHL-2.1.txt").write_text("foo") + + assert not list(iter_files(empty_directory)) + + def test_ignore_git(self, empty_directory): + """When the git directory is present, ignore it.""" + (empty_directory / ".git").mkdir() + (empty_directory / ".git/config").write_text("foo") + + assert not list(iter_files(empty_directory)) + + def test_ignore_hg(self, empty_directory): + """When the hg directory is present, ignore it.""" + (empty_directory / ".hg").mkdir() + (empty_directory / ".hg/config").write_text("foo") + + assert not list(iter_files(empty_directory)) + + def test_ignore_license_copying(self, empty_directory): + """When there are files names LICENSE, LICENSE.ext, COPYING, or + COPYING.ext, ignore them. + """ + (empty_directory / "LICENSE").write_text("foo") + (empty_directory / "LICENSE.txt").write_text("foo") + (empty_directory / "COPYING").write_text("foo") + (empty_directory / "COPYING.txt").write_text("foo") + + assert not list(iter_files(empty_directory)) + + def test_not_ignore_license_copying_no_ext(self, empty_directory): + """Do not ignore files that start with LICENSE or COPYING and are + followed by some non-extension text. + """ + (empty_directory / "LICENSE_README.md").write_text("foo") + (empty_directory / "COPYING2").write_text("foo") + + assert len(list(iter_files(empty_directory))) == 2 + + @posix + def test_ignore_symlinks(self, empty_directory): + """All symlinks must be ignored.""" + (empty_directory / "blob").write_text("foo") + (empty_directory / "symlink").symlink_to("blob") + + assert Path("symlink").absolute() not in iter_files(empty_directory) + + def test_ignore_zero_sized(self, empty_directory): + """Empty files should be skipped.""" + (empty_directory / "foo").touch() + + assert Path("foo").absolute() not in iter_files(empty_directory) + + def test_include_meson_subprojects(self, empty_directory): + """include_meson_subprojects is correctly interpreted.""" + (empty_directory / "foo.py").write_text("foo.py") + subprojects_dir = empty_directory / "subprojects" + subprojects_dir.mkdir() + libfoo_dir = subprojects_dir / "libfoo" + libfoo_dir.mkdir() + (libfoo_dir / "bar.py").write_text("pass") + + assert (libfoo_dir / "bar.py") not in iter_files(empty_directory) + assert (libfoo_dir / "bar.py") in iter_files( + empty_directory, include_meson_subprojects=True + ) + + +class TestIterFilesSubet: + """Tests for subset_files in iter_files.""" + + def test_single(self, fake_repository): + """Only yield the single specified file.""" + result = list( + iter_files( + fake_repository, + subset_files={fake_repository / "src/custom.py"}, + ) + ) + assert result == [fake_repository / "src/custom.py"] + + def test_two(self, fake_repository): + """Yield multiple specified files.""" + result = set( + iter_files( + fake_repository, + subset_files={ + fake_repository / "src/custom.py", + fake_repository / "src/exception.py", + }, + ) + ) + assert result == { + fake_repository / "src/custom.py", + fake_repository / "src/exception.py", + } + + def test_non_existent(self, fake_repository): + """If a file does not exist, don't yield it.""" + result = list( + iter_files( + fake_repository, + subset_files={ + fake_repository / "src/custom.py", + fake_repository / "not_exist.py", + fake_repository / "also/does/not/exist.py", + }, + ) + ) + assert result == [fake_repository / "src/custom.py"] + + def test_outside_cwd(self, fake_repository): + """If a file is outside of the project, don't yield it.""" + result = list( + iter_files( + fake_repository, + subset_files={ + fake_repository / "src/custom.py", + (fake_repository / "../outside.py").resolve(), + }, + ) + ) + assert result == [fake_repository / "src/custom.py"] + + def test_empty(self, fake_repository): + """If no files are provided, yield nothing.""" + result = list(iter_files(fake_repository, subset_files=set())) + assert not result + + def test_list_arg(self, fake_repository): + """Also accepts a list argument.""" + result = list( + iter_files( + fake_repository, + subset_files=[fake_repository / "src/custom.py"], + ) + ) + assert result == [fake_repository / "src/custom.py"] + + def test_relative_path(self, fake_repository): + """Also handles relative paths.""" + result = list( + iter_files(fake_repository, subset_files={"src/custom.py"}) + ) + assert result == [fake_repository / "src/custom.py"] + + +@git +class TestAllFilesGit: + """Test the iter_files function with git.""" + + def test_simple(self, git_repository): + """Given a Git repository where some files are ignored, do not yield + those files. + """ + assert Path("build/hello.py").absolute() not in iter_files( + git_repository, vcs_strategy=VCSStrategyGit(git_repository) + ) + + def test_not_ignored_if_no_strategy(self, git_repository): + """If no strategy is provided, the file is not ignored.""" + assert Path("build/hello.py").absolute() in iter_files(git_repository) + + def test_different_cwd(self, git_repository): + """Given a Git repository where some files are ignored, do not yield + those files. + + Be in a different CWD during the above. + """ + os.chdir(git_repository / "LICENSES") + assert Path("build/hello.py").absolute() not in iter_files( + git_repository, vcs_strategy=VCSStrategyGit(git_repository) + ) + + def test_ignored_contains_space(self, git_repository): + """Files that contain spaces are also ignored.""" + (git_repository / "I contain spaces.pyc").write_text("foo") + assert Path("I contain spaces.pyc").absolute() not in iter_files( + git_repository, vcs_strategy=VCSStrategyGit(git_repository) + ) + + @posix + def test_ignored_contains_newline(self, git_repository): + """Files that contain newlines are also ignored.""" + (git_repository / "hello\nworld.pyc").write_text("foo") + assert Path("hello\nworld.pyc").absolute() not in iter_files( + git_repository, vcs_strategy=VCSStrategyGit(git_repository) + ) + + def test_ignore_submodules(self, submodule_repository): + """Normally ignore submodules.""" + (submodule_repository / "submodule/foo.py").write_text("foo") + assert Path("submodule/foo.py").absolute() not in iter_files( + submodule_repository, + vcs_strategy=VCSStrategyGit(submodule_repository), + ) + + def test_include_submodules(self, submodule_repository): + """If include_submodules is True, include files from the submodule.""" + (submodule_repository / "submodule/foo.py").write_text("foo") + assert Path("submodule/foo.py").absolute() in iter_files( + submodule_repository, + include_submodules=True, + vcs_strategy=VCSStrategyGit(submodule_repository), + ) + + def test_submodule_is_ignored(self, submodule_repository): + """If a submodule is ignored, iter_files should not raise an Exception""" + (submodule_repository / "submodule/foo.py").write_text("foo") + gitignore = submodule_repository / ".gitignore" + contents = gitignore.read_text() + contents += "\nsubmodule/\n" + gitignore.write_text(contents) + assert Path("submodule/foo.py").absolute() not in iter_files( + submodule_repository, + vcs_strategy=VCSStrategyGit(submodule_repository), + ) + + +@hg +class TestAllFilesHg: + """Test the iter_files function with Mercurial.""" + + def test_simple(self, hg_repository): + """Given a mercurial repository where some files are ignored, do not + yield those files. + """ + assert Path("build/hello.py").absolute() not in iter_files( + hg_repository, vcs_strategy=VCSStrategyHg(hg_repository) + ) + + def test_different_cwd(self, hg_repository): + """Given a mercurial repository where some files are ignored, do not + yield those files. + + Be in a different CWD during the above. + """ + os.chdir(hg_repository / "LICENSES") + assert Path("build/hello.py").absolute() not in iter_files( + hg_repository, vcs_strategy=VCSStrategyHg(hg_repository) + ) + + def test_ignored_contains_space(self, hg_repository): + """File names that contain spaces are also ignored.""" + (hg_repository / "I contain spaces.pyc").touch() + assert Path("I contain spaces.pyc").absolute() not in iter_files( + hg_repository, vcs_strategy=VCSStrategyHg(hg_repository) + ) + + @posix + def test_ignored_contains_newline(self, hg_repository): + """File names that contain newlines are also ignored.""" + (hg_repository / "hello\nworld.pyc").touch() + assert Path("hello\nworld.pyc").absolute() not in iter_files( + hg_repository, vcs_strategy=VCSStrategyHg(hg_repository) + ) + + +@pijul +class TestAllFilesPijul: + """Test the iter_files function with Pijul.""" + + def test_simple(self, pijul_repository): + """Given a pijul repository where some files are ignored, do not yield + those files. + """ + assert Path("build/hello.py").absolute() not in iter_files( + pijul_repository, vcs_strategy=VCSStrategyPijul(pijul_repository) + ) + + def test_iter_files_pijul_ignored_different_cwd(self, pijul_repository): + """Given a pijul repository where some files are ignored, do not yield + those files. + + Be in a different CWD during the above. + """ + os.chdir(pijul_repository / "LICENSES") + assert Path("build/hello.py").absolute() not in iter_files( + pijul_repository, vcs_strategy=VCSStrategyPijul(pijul_repository) + ) + + def test_ignored_contains_space(self, pijul_repository): + """File names that contain spaces are also ignored.""" + (pijul_repository / "I contain spaces.pyc").touch() + assert Path("I contain spaces.pyc").absolute() not in iter_files( + pijul_repository, vcs_strategy=VCSStrategyPijul(pijul_repository) + ) + + @posix + def test_ignored_contains_newline(self, pijul_repository): + """File names that contain newlines are also ignored.""" + (pijul_repository / "hello\nworld.pyc").touch() + assert Path("hello\nworld.pyc").absolute() not in iter_files( + pijul_repository, vcs_strategy=VCSStrategyPijul(pijul_repository) + ) diff --git a/tests/test_project.py b/tests/test_project.py index b590aa1bc..6d90c49a1 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -13,13 +13,15 @@ import warnings from inspect import cleandoc from pathlib import Path +from unittest import mock import pytest -from conftest import RESOURCES_DIRECTORY, posix +from conftest import RESOURCES_DIRECTORY from license_expression import LicenseSymbol from reuse import ReuseInfo, SourceType from reuse._util import _LICENSING +from reuse.covered_files import iter_files from reuse.global_licensing import ( GlobalLicensingParseError, NestedReuseTOML, @@ -54,340 +56,67 @@ def test_project_conflicting_global_licensing(empty_directory): Project.from_directory(empty_directory) -def test_all_files(empty_directory): - """Given a directory with some files, yield all files.""" - (empty_directory / "foo").write_text("foo") - (empty_directory / "bar").write_text("foo") +class TestProjectAllFiles: + """Test Project.all_files.""" - project = Project.from_directory(empty_directory) - assert {file_.name for file_ in project.all_files()} == {"foo", "bar"} - - -def test_all_files_ignore_dot_license(empty_directory): - """When file and file.license are present, only yield file.""" - (empty_directory / "foo").write_text("foo") - (empty_directory / "foo.license").write_text("foo") - - project = Project.from_directory(empty_directory) - assert {file_.name for file_ in project.all_files()} == {"foo"} - - -def test_all_files_ignore_cal_license(empty_directory): - """CAL licenses contain SPDX tags referencing themselves. They should be - skipped. - """ - (empty_directory / "CAL-1.0").write_text("foo") - (empty_directory / "CAL-1.0.txt").write_text("foo") - (empty_directory / "CAL-1.0-Combined-Work-Exception").write_text("foo") - (empty_directory / "CAL-1.0-Combined-Work-Exception.txt").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_ignore_shl_license(empty_directory): - """SHL-2.1 contains an SPDX tag referencing itself. It should be skipped.""" - (empty_directory / "SHL-2.1").write_text("foo") - (empty_directory / "SHL-2.1.txt").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_ignore_git(empty_directory): - """When the git directory is present, ignore it.""" - (empty_directory / ".git").mkdir() - (empty_directory / ".git/config").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_ignore_hg(empty_directory): - """When the hg directory is present, ignore it.""" - (empty_directory / ".hg").mkdir() - (empty_directory / ".hg/config").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_ignore_license_copying(empty_directory): - """When there are files named LICENSE, LICENSE.ext, LICENSE-MIT, COPYING, - COPYING.ext, or COPYING-MIT, ignore them. - """ - (empty_directory / "LICENSE").write_text("foo") - (empty_directory / "LICENSE.txt").write_text("foo") - (empty_directory / "LICENSE-MIT").write_text("foo") - (empty_directory / "COPYING").write_text("foo") - (empty_directory / "COPYING.txt").write_text("foo") - (empty_directory / "COPYING-MIT").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_ignore_uk_license(empty_directory): - """Ignore UK spelling of licence.""" - (empty_directory / "LICENCE").write_text("foo") - - project = Project.from_directory(empty_directory) - assert not list(project.all_files()) - - -def test_all_files_not_ignore_license_copying_no_ext(empty_directory): - """Do not ignore files that start with LICENSE or COPYING and are followed - by some non-extension text. - """ - (empty_directory / "LICENSE_README.md").write_text("foo") - (empty_directory / "COPYING2").write_text("foo") - - project = Project.from_directory(empty_directory) - assert len(list(project.all_files())) == 2 - - -@posix -def test_all_files_symlinks(empty_directory): - """All symlinks must be ignored.""" - (empty_directory / "blob").write_text("foo") - (empty_directory / "blob.license").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: Jane Doe - - SPDX-License-Identifier: GPL-3.0-or-later - """ + def test_with_mock(self, monkeypatch, empty_directory): + """Instead of testing the entire behaviour, just test that a mock is + called and its value is returned. + """ + mock_iter_files = mock.create_autospec(iter_files) + mock_iter_files.return_value = "foo" + monkeypatch.setattr("reuse.project.iter_files", mock_iter_files) + + project = Project.from_directory(empty_directory) + result = project.all_files(empty_directory) + assert result == "foo" + + mock_iter_files.assert_called_once_with( + empty_directory, + include_submodules=project.include_submodules, + include_meson_subprojects=project.include_meson_subprojects, + vcs_strategy=project.vcs_strategy, ) - ) - (empty_directory / "symlink").symlink_to("blob") - project = Project.from_directory(empty_directory) - assert Path("symlink").absolute() not in project.all_files() - - -def test_all_files_ignore_zero_sized(empty_directory): - """Empty files should be skipped.""" - (empty_directory / "foo").touch() - - project = Project.from_directory(empty_directory) - assert Path("foo").absolute() not in project.all_files() - - -def test_all_files_git_ignored(git_repository): - """Given a Git repository where some files are ignored, do not yield those - files. - """ - project = Project.from_directory(git_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_git_ignored_different_cwd(git_repository): - """Given a Git repository where some files are ignored, do not yield those - files. - - Be in a different CWD during the above. - """ - os.chdir(git_repository / "LICENSES") - project = Project.from_directory(git_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_git_ignored_contains_space(git_repository): - """Files that contain spaces are also ignored.""" - (git_repository / "I contain spaces.pyc").write_text("foo") - project = Project.from_directory(git_repository) - assert Path("I contain spaces.pyc").absolute() not in project.all_files() - - -@posix -def test_all_files_git_ignored_contains_newline(git_repository): - """Files that contain newlines are also ignored.""" - (git_repository / "hello\nworld.pyc").write_text("foo") - project = Project.from_directory(git_repository) - assert Path("hello\nworld.pyc").absolute() not in project.all_files() - - -def test_all_files_submodule_is_ignored(submodule_repository): - """If a submodule is ignored, all_files should not raise an Exception.""" - (submodule_repository / "submodule/foo.py").write_text("foo") - gitignore = submodule_repository / ".gitignore" - contents = gitignore.read_text() - contents += "\nsubmodule/\n" - gitignore.write_text(contents) - project = Project.from_directory(submodule_repository) - assert Path("submodule/foo.py").absolute() not in project.all_files() - - -def test_all_files_hg_ignored(hg_repository): - """Given a mercurial repository where some files are ignored, do not yield - those files. - """ - project = Project.from_directory(hg_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_hg_ignored_different_cwd(hg_repository): - """Given a mercurial repository where some files are ignored, do not yield - those files. - - Be in a different CWD during the above. - """ - os.chdir(hg_repository / "LICENSES") - project = Project.from_directory(hg_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_hg_ignored_contains_space(hg_repository): - """File names that contain spaces are also ignored.""" - (hg_repository / "I contain spaces.pyc").touch() - project = Project.from_directory(hg_repository) - assert Path("I contain spaces.pyc").absolute() not in project.all_files() - - -@posix -def test_all_files_hg_ignored_contains_newline(hg_repository): - """File names that contain newlines are also ignored.""" - (hg_repository / "hello\nworld.pyc").touch() - project = Project.from_directory(hg_repository) - assert Path("hello\nworld.pyc").absolute() not in project.all_files() - - -def test_all_files_jujutsu_ignored(jujutsu_repository): - """Given a jujutsu repository where some files are ignored, do not yield - those files. - """ - project = Project.from_directory(jujutsu_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_jujutsu_ignored_different_cwd(jujutsu_repository): - """Given a jujutsu repository where some files are ignored, do not yield - those files. - - Be in a different CWD during the above. - """ - os.chdir(jujutsu_repository / "LICENSES") - project = Project.from_directory(jujutsu_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_jujutsu_ignored_contains_space(jujutsu_repository): - """File names that contain spaces are also ignored.""" - (jujutsu_repository / "I contain spaces.pyc").touch() - project = Project.from_directory(jujutsu_repository) - assert Path("I contain spaces.pyc").absolute() not in project.all_files() - - -@posix -def test_all_files_jujutsu_ignored_contains_newline(jujutsu_repository): - """File names that contain newlines are also ignored.""" - (jujutsu_repository / "hello\nworld.pyc").touch() - project = Project.from_directory(jujutsu_repository) - assert Path("hello\nworld.pyc").absolute() not in project.all_files() - - -def test_all_files_pijul_ignored(pijul_repository): - """Given a pijul repository where some files are ignored, do not yield - those files. - """ - project = Project.from_directory(pijul_repository) - assert Path("build/hello.py").absolute() not in project.all_files() + def test_with_mock_implicit_dir(self, monkeypatch, empty_directory): + """If no argument is given to Project.iter_files, project.root is + implied. + """ + mock_iter_files = mock.create_autospec(iter_files) + mock_iter_files.return_value = "foo" + monkeypatch.setattr("reuse.project.iter_files", mock_iter_files) + + project = Project.from_directory(empty_directory) + result = project.all_files() + assert result == "foo" + + mock_iter_files.assert_called_once_with( + empty_directory, + include_submodules=project.include_submodules, + include_meson_subprojects=project.include_meson_subprojects, + vcs_strategy=project.vcs_strategy, + ) -def test_all_files_pijul_ignored_different_cwd(pijul_repository): - """Given a pijul repository where some files are ignored, do not yield - those files. + def test_with_mock_includes(self, monkeypatch, empty_directory): + """include_submodules and include_meson_subprojects are true.""" + mock_iter_files = mock.create_autospec(iter_files) + mock_iter_files.return_value = "foo" + monkeypatch.setattr("reuse.project.iter_files", mock_iter_files) - Be in a different CWD during the above. - """ - os.chdir(pijul_repository / "LICENSES") - project = Project.from_directory(pijul_repository) - assert Path("build/hello.py").absolute() not in project.all_files() - - -def test_all_files_pijul_ignored_contains_space(pijul_repository): - """File names that contain spaces are also ignored.""" - (pijul_repository / "I contain spaces.pyc").touch() - project = Project.from_directory(pijul_repository) - assert Path("I contain spaces.pyc").absolute() not in project.all_files() - - -@posix -def test_all_files_pijul_ignored_contains_newline(pijul_repository): - """File names that contain newlines are also ignored.""" - (pijul_repository / "hello\nworld.pyc").touch() - project = Project.from_directory(pijul_repository) - assert Path("hello\nworld.pyc").absolute() not in project.all_files() - - -class TestSubsetFiles: - """Tests for subset_files.""" - - def test_single(self, fake_repository): - """Only yield the single specified file.""" - project = Project.from_directory(fake_repository) - result = list(project.subset_files({fake_repository / "src/custom.py"})) - assert result == [fake_repository / "src/custom.py"] - - def test_two(self, fake_repository): - """Yield multiple specified files.""" - project = Project.from_directory(fake_repository) - result = set( - project.subset_files( - { - fake_repository / "src/custom.py", - fake_repository / "src/exception.py", - } - ) + project = Project.from_directory( + empty_directory, + include_submodules=True, + include_meson_subprojects=True, ) - assert result == { - fake_repository / "src/custom.py", - fake_repository / "src/exception.py", - } - - def test_non_existent(self, fake_repository): - """If a file does not exist, don't yield it.""" - project = Project.from_directory(fake_repository) - result = list( - project.subset_files( - { - fake_repository / "src/custom.py", - fake_repository / "not_exist.py", - fake_repository / "also/does/not/exist.py", - } - ) - ) - assert result == [fake_repository / "src/custom.py"] - - def test_outside_cwd(self, fake_repository): - """If a file is outside of the project, don't yield it.""" - project = Project.from_directory(fake_repository) - result = list( - project.subset_files( - { - fake_repository / "src/custom.py", - (fake_repository / "../outside.py").resolve(), - } - ) + result = project.all_files() + assert result == "foo" + + mock_iter_files.assert_called_once_with( + empty_directory, + include_submodules=project.include_submodules, + include_meson_subprojects=project.include_meson_subprojects, + vcs_strategy=project.vcs_strategy, ) - assert result == [fake_repository / "src/custom.py"] - - def test_empty(self, fake_repository): - """If no files are provided, yield nothing.""" - project = Project.from_directory(fake_repository) - result = list(project.subset_files(set())) - assert not result - - def test_list_arg(self, fake_repository): - """Also accepts a list argument.""" - project = Project.from_directory(fake_repository) - result = list(project.subset_files([fake_repository / "src/custom.py"])) - assert result == [fake_repository / "src/custom.py"] - - def test_relative_path(self, fake_repository): - """Also handles relative paths.""" - project = Project.from_directory(fake_repository) - result = list(project.subset_files({"src/custom.py"})) - assert result == [fake_repository / "src/custom.py"] def test_reuse_info_of_file_does_not_exist(fake_repository): From a8a1f2868c330d0e09acd0103a46066c5987362c Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 18:15:30 +0200 Subject: [PATCH 04/10] Create fixture subproject_repository Signed-off-by: Carmen Bianca BAKKER --- tests/conftest.py | 29 +++++++++++++++++++++ tests/test_main.py | 65 ++++------------------------------------------ 2 files changed, 34 insertions(+), 60 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 40433342e..abcacfe00 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -401,6 +401,35 @@ def submodule_repository( return git_repository +@pytest.fixture() +def subproject_repository(fake_repository: Path) -> Path: + """Add a Meson subproject to the fake repo.""" + (fake_repository / "meson.build").write_text( + cleandoc( + """ + SPDX-FileCopyrightText: 2022 Jane Doe + SPDX-License-Identifier: CC0-1.0 + """ + ) + ) + subprojects_dir = fake_repository / "subprojects" + subprojects_dir.mkdir() + libfoo_dir = subprojects_dir / "libfoo" + libfoo_dir.mkdir() + # ./subprojects/foo.wrap has license and linter succeeds + (subprojects_dir / "foo.wrap").write_text( + cleandoc( + """ + SPDX-FileCopyrightText: 2022 Jane Doe + SPDX-License-Identifier: CC0-1.0 + """ + ) + ) + # ./subprojects/libfoo/foo.c misses license but is ignored + (libfoo_dir / "foo.c").write_text("foo") + return fake_repository + + @pytest.fixture(scope="session") def reuse_dep5(): """Create a ReuseDep5 object.""" diff --git a/tests/test_main.py b/tests/test_main.py index 94c078f9f..86095ec67 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -150,89 +150,34 @@ def test_lint_submodule_included_fail(submodule_repository, stringio): def test_lint_meson_subprojects(fake_repository, stringio): """Verify that subprojects are ignored.""" - (fake_repository / "meson.build").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: 2022 Jane Doe - SPDX-License-Identifier: CC0-1.0 - """ - ) - ) - subprojects_dir = fake_repository / "subprojects" - subprojects_dir.mkdir() - libfoo_dir = subprojects_dir / "libfoo" - libfoo_dir.mkdir() - # ./subprojects/foo.wrap has license and linter succeeds - (subprojects_dir / "foo.wrap").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: 2022 Jane Doe - SPDX-License-Identifier: CC0-1.0 - """ - ) - ) - # ./subprojects/libfoo/foo.c misses license but is ignored - (libfoo_dir / "foo.c").write_text("foo") result = main(["lint"], out=stringio) assert result == 0 assert ":-)" in stringio.getvalue() -def test_lint_meson_subprojects_fail(fake_repository, stringio): +def test_lint_meson_subprojects_fail(subproject_repository, stringio): """Verify that files in './subprojects' are not ignored.""" - (fake_repository / "meson.build").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: 2022 Jane Doe - SPDX-License-Identifier: CC0-1.0 - """ - ) - ) - subprojects_dir = fake_repository / "subprojects" - subprojects_dir.mkdir() # ./subprojects/foo.wrap misses license and linter fails - (subprojects_dir / "foo.wrap").write_text("foo") + (subproject_repository / "subprojects/foo.wrap").write_text("foo") result = main(["lint"], out=stringio) assert result == 1 assert ":-(" in stringio.getvalue() -def test_lint_meson_subprojects_included_fail(fake_repository, stringio): +def test_lint_meson_subprojects_included_fail(subproject_repository, stringio): """When Meson subprojects are included, fail on errors.""" - (fake_repository / "meson.build").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: 2022 Jane Doe - SPDX-License-Identifier: CC0-1.0 - """ - ) - ) - libfoo_dir = fake_repository / "subprojects/libfoo" - libfoo_dir.mkdir(parents=True) - # ./subprojects/libfoo/foo.c misses license and linter fails - (libfoo_dir / "foo.c").write_text("foo") result = main(["--include-meson-subprojects", "lint"], out=stringio) assert result == 1 assert ":-(" in stringio.getvalue() -def test_lint_meson_subprojects_included(fake_repository, stringio): +def test_lint_meson_subprojects_included(subproject_repository, stringio): """Successfully lint when Meson subprojects are included.""" - (fake_repository / "meson.build").write_text( - cleandoc( - """ - SPDX-FileCopyrightText: 2022 Jane Doe - SPDX-License-Identifier: CC0-1.0 - """ - ) - ) - libfoo_dir = fake_repository / "subprojects/libfoo" - libfoo_dir.mkdir(parents=True) # ./subprojects/libfoo/foo.c has license and linter succeeds - (libfoo_dir / "foo.c").write_text( + (subproject_repository / "subprojects/libfoo/foo.c").write_text( cleandoc( """ SPDX-FileCopyrightText: 2022 Jane Doe From f08708286157829adb926a2482590bb300df72d1 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 18:38:28 +0200 Subject: [PATCH 05/10] Fix performance regression when searching for REUSE.toml Previously, the glob **/REUSE.toml would search _all_ directories, including big directories containing build artefacts that are otherwise ignored by VCS. This commit uses the same logic to find REUSE.toml as any other file is found. It's not super rapid, but it does a heap more filtering. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/global_licensing.py | 41 ++++++++++++++++---- src/reuse/project.py | 36 ++++++++++++++++-- tests/test_global_licensing.py | 69 +++++++++++++++++++++++++++++++++- tests/test_project.py | 2 +- 4 files changed, 134 insertions(+), 14 deletions(-) diff --git a/src/reuse/global_licensing.py b/src/reuse/global_licensing.py index 6a659c9ff..6345e6aae 100644 --- a/src/reuse/global_licensing.py +++ b/src/reuse/global_licensing.py @@ -40,6 +40,8 @@ from . import ReuseInfo, SourceType from ._util import _LICENSING, StrPath, is_relative_to +from .covered_files import iter_files +from .vcs import VCSStrategy _LOGGER = logging.getLogger(__name__) @@ -231,7 +233,7 @@ class GlobalLicensing(ABC): @classmethod @abstractmethod - def from_file(cls, path: StrPath) -> "GlobalLicensing": + def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing": """Parse the file and create a :class:`GlobalLicensing` object from its contents. @@ -261,7 +263,7 @@ class ReuseDep5(GlobalLicensing): dep5_copyright: Copyright @classmethod - def from_file(cls, path: StrPath) -> "ReuseDep5": + def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseDep5": path = Path(path) try: with path.open(encoding="utf-8") as fp: @@ -437,7 +439,7 @@ def from_toml(cls, toml: str, source: str) -> "ReuseTOML": return cls.from_dict(tomldict, source) @classmethod - def from_file(cls, path: StrPath) -> "ReuseTOML": + def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseTOML": with Path(path).open(encoding="utf-8") as fp: return cls.from_toml(fp.read(), str(path)) @@ -483,11 +485,21 @@ class NestedReuseTOML(GlobalLicensing): reuse_tomls: List[ReuseTOML] = attrs.field() @classmethod - def from_file(cls, path: StrPath) -> "GlobalLicensing": + def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing": """TODO: *path* is a directory instead of a file.""" + include_submodules: bool = kwargs.get("include_submodules", False) + include_meson_subprojects: bool = kwargs.get( + "include_meson_subprojects", False + ) + vcs_strategy: Optional[VCSStrategy] = kwargs.get("vcs_strategy") tomls = [ ReuseTOML.from_file(toml_path) - for toml_path in cls.find_reuse_tomls(path) + for toml_path in cls.find_reuse_tomls( + path, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ) ] return cls(reuse_tomls=tomls, source=str(path)) @@ -546,9 +558,24 @@ def reuse_info_of( return dict(result) @classmethod - def find_reuse_tomls(cls, path: StrPath) -> Generator[Path, None, None]: + def find_reuse_tomls( + cls, + path: StrPath, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, + ) -> Generator[Path, None, None]: """Find all REUSE.toml files in *path*.""" - return Path(path).rglob("**/REUSE.toml") + return ( + item + for item in iter_files( + path, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ) + if item.name == "REUSE.toml" + ) def _find_relevant_tomls(self, path: StrPath) -> List[ReuseTOML]: found = [] diff --git a/src/reuse/project.py b/src/reuse/project.py index f5a92796d..11612d6b9 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -67,6 +67,10 @@ class GlobalLicensingFound(NamedTuple): cls: Type[GlobalLicensing] +# TODO: The information (root, include_submodules, include_meson_subprojects, +# vcs_strategy) is passed to SO MANY PLACES. Maybe Project should be simplified +# to contain exclusively those values, or maybe these values should be extracted +# out of Project to simplify passing this information around. class Project: """Simple object that holds the project's root, which is necessary for many interactions. @@ -146,9 +150,19 @@ def from_directory( vcs_strategy = cls._detect_vcs_strategy(root) global_licensing: Optional[GlobalLicensing] = None - found = cls.find_global_licensing(root) + found = cls.find_global_licensing( + root, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ) if found: - global_licensing = found.cls.from_file(found.path) + global_licensing = found.cls.from_file( + found.path, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ) project = cls( root, @@ -321,7 +335,11 @@ def relative_from_root(self, path: StrPath) -> Path: @classmethod def find_global_licensing( - cls, root: Path + cls, + root: Path, + include_submodules: bool = False, + include_meson_subprojects: bool = False, + vcs_strategy: Optional[VCSStrategy] = None, ) -> Optional[GlobalLicensingFound]: """Find the path and corresponding class of a project directory's :class:`GlobalLicensing`. @@ -344,9 +362,18 @@ def find_global_licensing( PendingDeprecationWarning, ) candidate = GlobalLicensingFound(dep5_path, ReuseDep5) + + # TODO: the performance of this isn't great. toml_path = None with contextlib.suppress(StopIteration): - toml_path = next(root.rglob("**/REUSE.toml")) + toml_path = next( + NestedReuseTOML.find_reuse_tomls( + root, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, + ) + ) if toml_path is not None: if candidate is not None: raise GlobalLicensingConflict( @@ -357,6 +384,7 @@ def find_global_licensing( ).format(new_path=toml_path, old_path=dep5_path) ) candidate = GlobalLicensingFound(root, NestedReuseTOML) + return candidate def _identifier_of_license(self, path: Path) -> str: diff --git a/tests/test_global_licensing.py b/tests/test_global_licensing.py index bd2b289d8..26325827e 100644 --- a/tests/test_global_licensing.py +++ b/tests/test_global_licensing.py @@ -25,6 +25,7 @@ ReuseDep5, ReuseTOML, ) +from reuse.vcs import VCSStrategyGit # REUSE-IgnoreStart @@ -699,19 +700,83 @@ def test_one_deep(self, empty_directory): """Find a single REUSE.toml deeper in the directory tree.""" (empty_directory / "src").mkdir() path = empty_directory / "src/REUSE.toml" - path.touch() + path.write_text("version = 1") result = NestedReuseTOML.find_reuse_tomls(empty_directory) assert list(result) == [path] def test_multiple(self, fake_repository_reuse_toml): """Find multiple REUSE.tomls.""" - (fake_repository_reuse_toml / "src/REUSE.toml").touch() + (fake_repository_reuse_toml / "src/REUSE.toml").write_text( + "version = 1" + ) result = NestedReuseTOML.find_reuse_tomls(fake_repository_reuse_toml) assert set(result) == { fake_repository_reuse_toml / "REUSE.toml", fake_repository_reuse_toml / "src/REUSE.toml", } + def test_with_vcs_strategy(self, git_repository): + """Ignore the correct files ignored by the repository.""" + (git_repository / "REUSE.toml").write_text("version = 1") + (git_repository / "build/REUSE.toml").write_text("version =1") + (git_repository / "src/REUSE.toml").write_text("version = 1") + + result = NestedReuseTOML.find_reuse_tomls( + git_repository, vcs_strategy=VCSStrategyGit(git_repository) + ) + assert set(result) == { + git_repository / "REUSE.toml", + git_repository / "src/REUSE.toml", + } + + def test_includes_submodule(self, submodule_repository): + """include_submodules is correctly implemented.""" + (submodule_repository / "REUSE.toml").write_text("version = 1") + (submodule_repository / "submodule/REUSE.toml").write_text( + "version = 1" + ) + + result_without = NestedReuseTOML.find_reuse_tomls( + submodule_repository, + vcs_strategy=VCSStrategyGit(submodule_repository), + ) + assert set(result_without) == {submodule_repository / "REUSE.toml"} + + result_with = NestedReuseTOML.find_reuse_tomls( + submodule_repository, + include_submodules=True, + vcs_strategy=VCSStrategyGit(submodule_repository), + ) + assert set(result_with) == { + submodule_repository / "REUSE.toml", + submodule_repository / "submodule/REUSE.toml", + } + + def test_includes_meson_subprojects(self, subproject_repository): + """include_meson_subprojects is correctly implemented.""" + (subproject_repository / "REUSE.toml").write_text("version = 1") + (subproject_repository / "subprojects/REUSE.toml").write_text( + "version = 1" + ) + (subproject_repository / "subprojects/libfoo/REUSE.toml").write_text( + "version = 1" + ) + + result_without = NestedReuseTOML.find_reuse_tomls(subproject_repository) + assert set(result_without) == { + subproject_repository / "REUSE.toml", + subproject_repository / "subprojects/REUSE.toml", + } + + result_with = NestedReuseTOML.find_reuse_tomls( + subproject_repository, include_meson_subprojects=True + ) + assert set(result_with) == { + subproject_repository / "REUSE.toml", + subproject_repository / "subprojects/REUSE.toml", + subproject_repository / "subprojects/libfoo/REUSE.toml", + } + class TestNestedReuseTOMLReuseInfoOf: """Tests for NestedReuseTOML.reuse_info_of.""" diff --git a/tests/test_project.py b/tests/test_project.py index 6d90c49a1..24991d3bb 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -591,7 +591,7 @@ def test_find_global_licensing_none(empty_directory): def test_find_global_licensing_conflict(fake_repository_dep5): """Expect an error on a conflict""" - (fake_repository_dep5 / "REUSE.toml").touch() + (fake_repository_dep5 / "REUSE.toml").write_text("version = 1") with pytest.raises(GlobalLicensingConflict): Project.find_global_licensing(fake_repository_dep5) From 5651081165ff0530a5e1cbcb5e7175f660d1c4e1 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Mon, 8 Jul 2024 19:32:41 +0200 Subject: [PATCH 06/10] Convert Project into attrs class Signed-off-by: Carmen Bianca BAKKER --- src/reuse/project.py | 56 ++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/src/reuse/project.py b/src/reuse/project.py index 11612d6b9..e0b7c9685 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -32,6 +32,7 @@ cast, ) +import attrs from binaryornot.check import is_binary from . import IdentifierNotFound, ReuseInfo @@ -71,42 +72,35 @@ class GlobalLicensingFound(NamedTuple): # vcs_strategy) is passed to SO MANY PLACES. Maybe Project should be simplified # to contain exclusively those values, or maybe these values should be extracted # out of Project to simplify passing this information around. +@attrs.define class Project: """Simple object that holds the project's root, which is necessary for many interactions. """ - # pylint: disable=too-many-arguments - def __init__( - self, - root: StrPath, - vcs_strategy: Optional[VCSStrategy] = None, - license_map: Optional[Dict[str, Dict]] = None, - licenses: Optional[Dict[str, Path]] = None, - global_licensing: Optional[GlobalLicensing] = None, - include_submodules: bool = False, - include_meson_subprojects: bool = False, - ): - self.root = Path(root) - - if vcs_strategy is None: - vcs_strategy = VCSStrategyNone(root) - self.vcs_strategy = vcs_strategy - - if license_map is None: - license_map = LICENSE_MAP - self.license_map = license_map.copy() - self.license_map.update(EXCEPTION_MAP) - self.licenses_without_extension: Dict[str, Path] = {} - - if licenses is None: - licenses = {} - self.licenses = licenses - - self.global_licensing = global_licensing - - self.include_submodules = include_submodules - self.include_meson_subprojects = include_meson_subprojects + root: Path = attrs.field(converter=Path) + include_submodules: bool = False + include_meson_subprojects: bool = False + vcs_strategy: VCSStrategy = attrs.field() + global_licensing: Optional[GlobalLicensing] = None + + # TODO: I want to get rid of these, or somehow refactor this mess. + license_map: Dict[str, Dict] = attrs.field() + licenses: Dict[str, Path] = attrs.field(factory=dict) + + licenses_without_extension: Dict[str, Path] = attrs.field( + init=False, factory=dict + ) + + @vcs_strategy.default + def _default_vcs_strategy(self) -> VCSStrategy: + return VCSStrategyNone(self.root) + + @license_map.default + def _default_license_map(self) -> Dict[str, Dict]: + license_map = LICENSE_MAP.copy() + license_map.update(EXCEPTION_MAP) + return license_map @classmethod def from_directory( From 82807989df36a7c70e409a042bebab27a64b14b1 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Tue, 9 Jul 2024 18:48:20 +0200 Subject: [PATCH 07/10] Get errant source from exception instead of find_global_licensing In _main, find_global_licensing was called to find the file that contained some parsing error. This may have contained false information in the case of multiple REUSE.tomls. Instead of needlessly calling this function, the errant file is now an attribute on the exception. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/_main.py | 10 ++---- src/reuse/global_licensing.py | 65 +++++++++++++++++++++++----------- tests/test_global_licensing.py | 43 ++++++++++++++++++---- tests/test_main.py | 38 ++++++++++++++++++-- tests/test_project.py | 2 +- 5 files changed, 119 insertions(+), 39 deletions(-) diff --git a/src/reuse/_main.py b/src/reuse/_main.py index 37d133056..b5df4a7f7 100644 --- a/src/reuse/_main.py +++ b/src/reuse/_main.py @@ -31,7 +31,7 @@ from ._format import INDENT, fill_all, fill_paragraph from ._util import PathType, setup_logging from .global_licensing import GlobalLicensingParseError -from .project import GlobalLicensingConflict, GlobalLicensingFound, Project +from .project import GlobalLicensingConflict, Project from .vcs import find_root _LOGGER = logging.getLogger(__name__) @@ -267,18 +267,12 @@ def main(args: Optional[List[str]] = None, out: IO[str] = sys.stdout) -> int: project = Project.from_directory(root) # FileNotFoundError and NotADirectoryError don't need to be caught because # argparse already made sure of these things. - except UnicodeDecodeError: - found = cast(GlobalLicensingFound, Project.find_global_licensing(root)) - main_parser.error( - _("'{path}' could not be decoded as UTF-8.").format(path=found.path) - ) except GlobalLicensingParseError as error: - found = cast(GlobalLicensingFound, Project.find_global_licensing(root)) main_parser.error( _( "'{path}' could not be parsed. We received the following error" " message: {message}" - ).format(path=found.path, message=str(error)) + ).format(path=error.source, message=str(error)) ) except GlobalLicensingConflict as error: main_parser.error(str(error)) diff --git a/src/reuse/global_licensing.py b/src/reuse/global_licensing.py index 6345e6aae..724c2c7b6 100644 --- a/src/reuse/global_licensing.py +++ b/src/reuse/global_licensing.py @@ -38,9 +38,9 @@ from debian.copyright import Error as DebianError from license_expression import ExpressionError -from . import ReuseInfo, SourceType from ._util import _LICENSING, StrPath, is_relative_to from .covered_files import iter_files +from . import ReuseException, ReuseInfo, SourceType from .vcs import VCSStrategy _LOGGER = logging.getLogger(__name__) @@ -67,13 +67,17 @@ class PrecedenceType(Enum): OVERRIDE = "override" -class GlobalLicensingParseError(Exception): +class GlobalLicensingParseError(ReuseException): """An exception representing any kind of error that occurs when trying to parse a :class:`GlobalLicensing` file. """ + def __init__(self, *args: Any, source: Optional[str] = None): + super().__init__(*args) + self.source = source -class GlobalLicensingParseTypeError(TypeError, GlobalLicensingParseError): + +class GlobalLicensingParseTypeError(GlobalLicensingParseError, TypeError): """An exception representing a type error while trying to parse a :class:`GlobalLicensing` file. """ @@ -103,6 +107,7 @@ def __call__( attr_name = instance.TOML_KEYS[attribute.name] else: attr_name = attribute.name + source = getattr(instance, "source", None) if not isinstance(value, self.collection_type): raise GlobalLicensingParseTypeError( @@ -114,7 +119,8 @@ def __call__( type_name=self.collection_type.__name__, value=repr(value), value_class=repr(value.__class__), - ) + ), + source=source, ) for item in value: if not isinstance(item, self.value_type): @@ -127,13 +133,15 @@ def __call__( type_name=self.value_type.__name__, item_value=repr(item), item_class=repr(item.__class__), - ) + ), + source=source, ) if not self.optional and not value: raise GlobalLicensingParseValueError( _("{attr_name} must not be empty.").format( - attr_name=repr(attr_name) - ) + attr_name=repr(attr_name), + ), + source=source, ) @@ -161,7 +169,8 @@ def __call__(self, inst: Any, attr: attrs.Attribute, value: _T) -> None: type=repr(error.args[2].__name__), value=repr(error.args[3]), value_type=repr(error.args[3].__class__), - ) + ), + source=getattr(inst, "source", None), ) from error @@ -174,7 +183,7 @@ def _instance_of( def _str_to_global_precedence(value: Any) -> PrecedenceType: try: return PrecedenceType(value) - except ValueError as err: + except ValueError as error: raise GlobalLicensingParseValueError( _( "The value of 'precedence' must be one of {precedence_vals}" @@ -185,7 +194,7 @@ def _str_to_global_precedence(value: Any) -> PrecedenceType: ), received=repr(value), ) - ) from err + ) from error @overload @@ -239,7 +248,6 @@ def from_file(cls, path: StrPath, **kwargs: Any) -> "GlobalLicensing": Raises: FileNotFoundError: file doesn't exist. - UnicodeDecodeError: could not decode file as UTF-8. OSError: some other error surrounding I/O. GlobalLicensingParseError: file could not be parsed. """ @@ -268,13 +276,17 @@ def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseDep5": try: with path.open(encoding="utf-8") as fp: return cls(str(path), Copyright(fp)) - except UnicodeDecodeError: - raise + except UnicodeDecodeError as error: + raise GlobalLicensingParseError( + str(error), source=str(path) + ) from error # TODO: Remove ValueError once # # is closed except (DebianError, ValueError) as error: - raise GlobalLicensingParseError(str(error)) from error + raise GlobalLicensingParseError( + str(error), source=str(path) + ) from error def reuse_info_of( self, path: StrPath @@ -420,10 +432,14 @@ def from_dict(cls, values: Dict[str, Any], source: str) -> "ReuseTOML": new_dict["source"] = source annotation_dicts = values.get("annotations", []) - annotations = [ - AnnotationsItem.from_dict(annotation) - for annotation in annotation_dicts - ] + try: + annotations = [ + AnnotationsItem.from_dict(annotation) + for annotation in annotation_dicts + ] + except GlobalLicensingParseError as error: + error.source = source + raise error from error new_dict["annotations"] = annotations @@ -435,13 +451,20 @@ def from_toml(cls, toml: str, source: str) -> "ReuseTOML": try: tomldict = tomlkit.loads(toml) except tomlkit.exceptions.TOMLKitError as error: - raise GlobalLicensingParseError(str(error)) from error + raise GlobalLicensingParseError( + str(error), source=source + ) from error return cls.from_dict(tomldict, source) @classmethod def from_file(cls, path: StrPath, **kwargs: Any) -> "ReuseTOML": - with Path(path).open(encoding="utf-8") as fp: - return cls.from_toml(fp.read(), str(path)) + try: + with Path(path).open(encoding="utf-8") as fp: + return cls.from_toml(fp.read(), str(path)) + except UnicodeDecodeError as error: + raise GlobalLicensingParseError( + str(error), source=str(path) + ) from error def find_annotations_item(self, path: StrPath) -> Optional[AnnotationsItem]: """Find a :class:`AnnotationsItem` that matches *path*. The latest match diff --git a/tests/test_global_licensing.py b/tests/test_global_licensing.py index 26325827e..2a415426f 100644 --- a/tests/test_global_licensing.py +++ b/tests/test_global_licensing.py @@ -347,33 +347,37 @@ def test_simple(self, annotations_item): def test_version_not_int(self, annotations_item): """Version must be an int""" - with pytest.raises(GlobalLicensingParseTypeError): + with pytest.raises(GlobalLicensingParseTypeError) as exc_info: ReuseTOML( version=1.2, source="REUSE.toml", annotations=[annotations_item] ) + assert exc_info.value.source == "REUSE.toml" def test_source_not_str(self, annotations_item): """Source must be a str.""" - with pytest.raises(GlobalLicensingParseTypeError): + with pytest.raises(GlobalLicensingParseTypeError) as exc_info: ReuseTOML(version=1, source=123, annotations=[annotations_item]) + assert exc_info.value.source == 123 def test_annotations_must_be_list(self, annotations_item): """Annotations must be in a list, not any other collection.""" # TODO: Technically we could change this to 'any collection that is # ordered', but let's not split hairs. - with pytest.raises(GlobalLicensingParseTypeError): + with pytest.raises(GlobalLicensingParseTypeError) as exc_info: ReuseTOML( version=1, source="REUSE.toml", annotations=iter([annotations_item]), ) + assert exc_info.value.source == "REUSE.toml" def test_annotations_must_be_object(self): """Annotations must be AnnotationsItem objects.""" - with pytest.raises(GlobalLicensingParseTypeError): + with pytest.raises(GlobalLicensingParseTypeError) as exc_info: ReuseTOML( version=1, source="REUSE.toml", annotations=[{"foo": "bar"}] ) + assert exc_info.value.source == "REUSE.toml" class TestReuseTOMLFromDict: @@ -428,6 +432,24 @@ def test_no_version(self): "REUSE.toml", ) + def test_annotations_error(self): + """If there is an error in the annotations, the error get ReuseTOML's + source. + """ + with pytest.raises(GlobalLicensingParseTypeError) as exc_info: + ReuseTOML.from_dict( + { + "version": 1, + "annotations": [ + { + "path": {1}, + } + ], + }, + "REUSE.toml", + ) + assert exc_info.value.source == "REUSE.toml" + class TestReuseTOMLFromToml: """Test the from_toml method of ReuseTOML.""" @@ -1097,14 +1119,19 @@ def test_unicode_decode_error(self, fake_repository_dep5): shutil.copy( RESOURCES_DIRECTORY / "fsfe.png", fake_repository_dep5 / "fsfe.png" ) - with pytest.raises(UnicodeDecodeError): + with pytest.raises(GlobalLicensingParseError) as exc_info: ReuseDep5.from_file(fake_repository_dep5 / "fsfe.png") + error = exc_info.value + assert error.source == str(fake_repository_dep5 / "fsfe.png") + assert "'utf-8' codec can't decode byte" in str(error) def test_parse_error(self, empty_directory): """Raise GlobalLicensingParseError on parse error.""" (empty_directory / "foo").write_text("foo") - with pytest.raises(GlobalLicensingParseError): + with pytest.raises(GlobalLicensingParseError) as exc_info: ReuseDep5.from_file(empty_directory / "foo") + error = exc_info.value + assert error.source == str(empty_directory / "foo") def test_double_copyright_parse_error(self, empty_directory): """Raise GlobalLicensingParseError on double Copyright lines.""" @@ -1123,8 +1150,10 @@ def test_double_copyright_parse_error(self, empty_directory): """ ) ) - with pytest.raises(GlobalLicensingParseError): + with pytest.raises(GlobalLicensingParseError) as exc_info: ReuseDep5.from_file(empty_directory / "foo") + error = exc_info.value + assert error.source == str(empty_directory / "foo") def test_reuse_dep5_reuse_info_of(reuse_dep5): diff --git a/tests/test_main.py b/tests/test_main.py index 86095ec67..424ea7048 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -217,7 +217,10 @@ def test_lint_dep5_decode_error(fake_repository_dep5, capsys): ) with pytest.raises(SystemExit): main(["lint"]) - assert "could not be decoded" in capsys.readouterr().err + error = capsys.readouterr().err + assert str(fake_repository_dep5 / ".reuse/dep5") in error + assert "could not be parsed" in error + assert "'utf-8' codec can't decode byte" in error def test_lint_dep5_parse_error(fake_repository_dep5, capsys): @@ -225,7 +228,38 @@ def test_lint_dep5_parse_error(fake_repository_dep5, capsys): (fake_repository_dep5 / ".reuse/dep5").write_text("foo") with pytest.raises(SystemExit): main(["lint"]) - assert "could not be parsed" in capsys.readouterr().err + error = capsys.readouterr().err + assert str(fake_repository_dep5 / ".reuse/dep5") in error + assert "could not be parsed" in error + + +def test_lint_toml_parse_error_version(fake_repository_reuse_toml, capsys): + """If version has the wrong type, print an error.""" + (fake_repository_reuse_toml / "REUSE.toml").write_text("version = 'a'") + with pytest.raises(SystemExit): + main(["lint"]) + error = capsys.readouterr().err + assert str(fake_repository_reuse_toml / "REUSE.toml") in error + assert "could not be parsed" in error + + +def test_lint_toml_parse_error_annotation(fake_repository_reuse_toml, capsys): + """If there is an error in an annotation, print an error.""" + (fake_repository_reuse_toml / "REUSE.toml").write_text( + cleandoc_nl( + """ + version = 1 + + [[annotations]] + path = 1 + """ + ) + ) + with pytest.raises(SystemExit): + main(["lint"]) + error = capsys.readouterr().err + assert str(fake_repository_reuse_toml / "REUSE.toml") in error + assert "could not be parsed" in error def test_lint_json(fake_repository, stringio): diff --git a/tests/test_project.py b/tests/test_project.py index 24991d3bb..0be3ed3f6 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -51,7 +51,7 @@ def test_project_conflicting_global_licensing(empty_directory): """ (empty_directory / "REUSE.toml").write_text("version = 1") (empty_directory / ".reuse").mkdir() - (empty_directory / ".reuse/dep5").touch() + shutil.copy(RESOURCES_DIRECTORY / "dep5", empty_directory / ".reuse/dep5") with pytest.raises(GlobalLicensingConflict): Project.from_directory(empty_directory) From a8be9db47281097fbaebe61bde12d3a33cc21e27 Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Tue, 9 Jul 2024 19:13:23 +0200 Subject: [PATCH 08/10] Reduce calls to iter_files Previously this function would be called three times on a lint: - once by NestedReuseTOML.find_reuse_tomls in Project.find_global_licensing - once by NestedReuseTOML.find_reuse_tomls in NestedReuseTOML.from_file - once to lint all the files I now no longer use NestedReuseTOML.from_file. It's still not ideal to go over all files twice, but it's better than thrice. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/project.py | 60 +++++++++++++++++++++++++------------------ tests/test_project.py | 25 +++++++++++++----- 2 files changed, 54 insertions(+), 31 deletions(-) diff --git a/src/reuse/project.py b/src/reuse/project.py index e0b7c9685..9a2906290 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -9,7 +9,6 @@ """Module that contains the central Project class.""" -import contextlib import errno import glob import logging @@ -51,6 +50,7 @@ NestedReuseTOML, PrecedenceType, ReuseDep5, + ReuseTOML, ) from .vcs import VCSStrategy, VCSStrategyNone, all_vcs_strategies @@ -151,11 +151,8 @@ def from_directory( vcs_strategy=vcs_strategy, ) if found: - global_licensing = found.cls.from_file( - found.path, - include_submodules=include_submodules, - include_meson_subprojects=include_meson_subprojects, - vcs_strategy=vcs_strategy, + global_licensing = cls._global_licensing_from_found( + found, str(root) ) project = cls( @@ -334,7 +331,7 @@ def find_global_licensing( include_submodules: bool = False, include_meson_subprojects: bool = False, vcs_strategy: Optional[VCSStrategy] = None, - ) -> Optional[GlobalLicensingFound]: + ) -> List[GlobalLicensingFound]: """Find the path and corresponding class of a project directory's :class:`GlobalLicensing`. @@ -342,7 +339,7 @@ def find_global_licensing( GlobalLicensingConflict: if more than one global licensing config file is present. """ - candidate: Optional[GlobalLicensingFound] = None + candidates: List[GlobalLicensingFound] = [] dep5_path = root / ".reuse/dep5" if (dep5_path).exists(): # Sneaky workaround to not print this warning. @@ -355,31 +352,44 @@ def find_global_licensing( ), PendingDeprecationWarning, ) - candidate = GlobalLicensingFound(dep5_path, ReuseDep5) - - # TODO: the performance of this isn't great. - toml_path = None - with contextlib.suppress(StopIteration): - toml_path = next( - NestedReuseTOML.find_reuse_tomls( - root, - include_submodules=include_submodules, - include_meson_subprojects=include_meson_subprojects, - vcs_strategy=vcs_strategy, - ) + candidates = [GlobalLicensingFound(dep5_path, ReuseDep5)] + + reuse_toml_candidates = [ + GlobalLicensingFound(path, ReuseTOML) + for path in NestedReuseTOML.find_reuse_tomls( + root, + include_submodules=include_submodules, + include_meson_subprojects=include_meson_subprojects, + vcs_strategy=vcs_strategy, ) - if toml_path is not None: - if candidate is not None: + ] + if reuse_toml_candidates: + if candidates: raise GlobalLicensingConflict( _( "Found both '{new_path}' and '{old_path}'. You" " cannot keep both files simultaneously; they are" " not intercompatible." - ).format(new_path=toml_path, old_path=dep5_path) + ).format( + new_path=reuse_toml_candidates[0].path, + old_path=dep5_path, + ) ) - candidate = GlobalLicensingFound(root, NestedReuseTOML) + candidates = reuse_toml_candidates + + return candidates - return candidate + @classmethod + def _global_licensing_from_found( + cls, found: List[GlobalLicensingFound], root: StrPath + ) -> GlobalLicensing: + if len(found) == 1 and found[0].cls == ReuseDep5: + return ReuseDep5.from_file(found[0].path) + # This is an impossible scenario at time of writing. + if not all(item.cls == ReuseTOML for item in found): + raise NotImplementedError() + tomls = [ReuseTOML.from_file(item.path) for item in found] + return NestedReuseTOML(reuse_tomls=tomls, source=str(root)) def _identifier_of_license(self, path: Path) -> str: """Figure out the SPDX License identifier of a license given its path. diff --git a/tests/test_project.py b/tests/test_project.py index 0be3ed3f6..59014bf4c 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -24,8 +24,8 @@ from reuse.covered_files import iter_files from reuse.global_licensing import ( GlobalLicensingParseError, - NestedReuseTOML, ReuseDep5, + ReuseTOML, ) from reuse.project import GlobalLicensingConflict, Project @@ -567,8 +567,10 @@ def test_find_global_licensing_dep5(fake_repository_dep5): """Find the dep5 file. Also output a PendingDeprecationWarning.""" with warnings.catch_warnings(record=True) as caught_warnings: result = Project.find_global_licensing(fake_repository_dep5) - assert result.path == fake_repository_dep5 / ".reuse/dep5" - assert result.cls == ReuseDep5 + assert len(result) == 1 + dep5 = result[0] + assert dep5.path == fake_repository_dep5 / ".reuse/dep5" + assert dep5.cls == ReuseDep5 assert len(caught_warnings) == 1 assert issubclass( @@ -579,14 +581,25 @@ def test_find_global_licensing_dep5(fake_repository_dep5): def test_find_global_licensing_reuse_toml(fake_repository_reuse_toml): """Find the REUSE.toml file.""" result = Project.find_global_licensing(fake_repository_reuse_toml) - assert result.path == fake_repository_reuse_toml / "." - assert result.cls == NestedReuseTOML + assert len(result) == 1 + toml = result[0] + assert toml.path == fake_repository_reuse_toml / "REUSE.toml" + assert toml.cls == ReuseTOML + + +def test_find_global_licensing_reuse_toml_multiple(fake_repository_reuse_toml): + """Find multiple REUSE.tomls.""" + (fake_repository_reuse_toml / "src/REUSE.toml").write_text("version = 1") + result = Project.find_global_licensing(fake_repository_reuse_toml) + assert len(result) == 2 + assert result[0].path == fake_repository_reuse_toml / "REUSE.toml" + assert result[1].path == fake_repository_reuse_toml / "src/REUSE.toml" def test_find_global_licensing_none(empty_directory): """Find no file.""" result = Project.find_global_licensing(empty_directory) - assert result is None + assert not result def test_find_global_licensing_conflict(fake_repository_dep5): From 455fad9120dad920dc65255382aa5c3bf0a055af Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Tue, 9 Jul 2024 19:25:02 +0200 Subject: [PATCH 09/10] Add change log entries Signed-off-by: Carmen Bianca BAKKER --- changelog.d/changed/reuse-toml-ignored.md | 2 ++ changelog.d/fixed/performance.md | 3 +++ changelog.d/fixed/source-bug.md | 3 +++ 3 files changed, 8 insertions(+) create mode 100644 changelog.d/changed/reuse-toml-ignored.md create mode 100644 changelog.d/fixed/performance.md create mode 100644 changelog.d/fixed/source-bug.md diff --git a/changelog.d/changed/reuse-toml-ignored.md b/changelog.d/changed/reuse-toml-ignored.md new file mode 100644 index 000000000..c99ce516b --- /dev/null +++ b/changelog.d/changed/reuse-toml-ignored.md @@ -0,0 +1,2 @@ +- If `REUSE.toml` is ignored by VCS, the linter now also ignores this files. + (#1047) diff --git a/changelog.d/fixed/performance.md b/changelog.d/fixed/performance.md new file mode 100644 index 000000000..fafae0995 --- /dev/null +++ b/changelog.d/fixed/performance.md @@ -0,0 +1,3 @@ +- Performance greatly improved for projects with large directories ignored by + VCS. (#1047) +- Performance slightly improved for large projects. (#1047) diff --git a/changelog.d/fixed/source-bug.md b/changelog.d/fixed/source-bug.md new file mode 100644 index 000000000..a4959bc75 --- /dev/null +++ b/changelog.d/fixed/source-bug.md @@ -0,0 +1,3 @@ +- In some scenarios, where a user has multiple `REUSE.toml` files and one of + those files could not be parsed, the wrong `REUSE.toml` was signalled as being + unparseable. This is now fixed. (#1047) From 9a237f406cbb0e1cec254f48e5328353ae11070e Mon Sep 17 00:00:00 2001 From: Carmen Bianca BAKKER Date: Wed, 18 Sep 2024 15:12:39 +0200 Subject: [PATCH 10/10] Small cleanup This cleanup is the result of a rebase. It would be more effort to incorporate these fixes into their original commits. Signed-off-by: Carmen Bianca BAKKER --- src/reuse/global_licensing.py | 2 +- src/reuse/project.py | 4 ---- tests/test_covered_files.py | 9 +++++---- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/reuse/global_licensing.py b/src/reuse/global_licensing.py index 724c2c7b6..1a28c50e1 100644 --- a/src/reuse/global_licensing.py +++ b/src/reuse/global_licensing.py @@ -38,9 +38,9 @@ from debian.copyright import Error as DebianError from license_expression import ExpressionError +from . import ReuseException, ReuseInfo, SourceType from ._util import _LICENSING, StrPath, is_relative_to from .covered_files import iter_files -from . import ReuseException, ReuseInfo, SourceType from .vcs import VCSStrategy _LOGGER = logging.getLogger(__name__) diff --git a/src/reuse/project.py b/src/reuse/project.py index 9a2906290..3f6148931 100644 --- a/src/reuse/project.py +++ b/src/reuse/project.py @@ -21,14 +21,11 @@ Collection, DefaultDict, Dict, - Generator, Iterator, List, NamedTuple, Optional, - Set, Type, - cast, ) import attrs @@ -40,7 +37,6 @@ _LICENSEREF_PATTERN, StrPath, _determine_license_path, - is_relative_to, relative_from_root, reuse_info_of_file, ) diff --git a/tests/test_covered_files.py b/tests/test_covered_files.py index a60c1a2c0..df8f0ad7e 100644 --- a/tests/test_covered_files.py +++ b/tests/test_covered_files.py @@ -113,10 +113,11 @@ def test_include_meson_subprojects(self, empty_directory): subprojects_dir.mkdir() libfoo_dir = subprojects_dir / "libfoo" libfoo_dir.mkdir() - (libfoo_dir / "bar.py").write_text("pass") + bar_file = libfoo_dir / "bar.py" + bar_file.write_text("pass") - assert (libfoo_dir / "bar.py") not in iter_files(empty_directory) - assert (libfoo_dir / "bar.py") in iter_files( + assert bar_file not in iter_files(empty_directory) + assert bar_file in iter_files( empty_directory, include_meson_subprojects=True ) @@ -260,7 +261,7 @@ def test_include_submodules(self, submodule_repository): ) def test_submodule_is_ignored(self, submodule_repository): - """If a submodule is ignored, iter_files should not raise an Exception""" + """If a submodule is ignored, iter_files shouldn't raise an Exception""" (submodule_repository / "submodule/foo.py").write_text("foo") gitignore = submodule_repository / ".gitignore" contents = gitignore.read_text()