Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dvc check-ignore command #4282

Merged
merged 17 commits into from
Aug 3, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dvc/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .command import (
add,
cache,
check_ignore,
checkout,
commit,
completion,
Expand Down Expand Up @@ -79,6 +80,7 @@
git_hook,
plots,
experiments,
check_ignore,
]


Expand Down
70 changes: 70 additions & 0 deletions dvc/command/check_ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import argparse
import logging

from dvc.command import completion
from dvc.command.base import CmdBase, append_doc_link
from dvc.exceptions import DvcException

logger = logging.getLogger(__name__)


class CmdCheckIgnore(CmdBase):
    """Command object behind ``dvc check-ignore``.

    Asks the repo's dvcignore filter which of the given targets are matched
    by an ignore pattern and logs the outcome for each of them.
    """

    def __init__(self, args):
        super().__init__(args)
        # The ignore filter of the repo tree answers all pattern queries.
        self.ignore_filter = self.repo.tree.dvcignore

    def _show_results(self, results):
        """Log one line per result that should be displayed.

        A result is displayed when it matched a pattern, or unconditionally
        when ``--non-matching`` was given. With ``--details`` the last entry
        of ``result.patterns`` is printed before the target, separated by a
        tab; otherwise only the target is printed.
        """
        for result in results:
            if not (result.matches or self.args.non_matching):
                continue
            if self.args.details:
                logger.info(
                    "{}\t{}".format(result.patterns[-1], result.file)
                )
            else:
                logger.info(result.file)

    def run(self):
        """Validate the flag combination, check the targets, report them.

        Returns:
            0 when at least one target matched an ignore pattern, 1 when
            none did.

        Raises:
            DvcException: if ``--non-matching`` is given without
                ``--details``, or ``--quiet`` is combined with ``--details``.
        """
        if self.args.non_matching and not self.args.details:
            raise DvcException("--non-matching is only valid with --details")

        if self.args.quiet and self.args.details:
            # The original message lacked a verb ("cannot both ...").
            raise DvcException("cannot use both --details and --quiet")

        results = self.ignore_filter.check_ignore(self.args.targets)
        self._show_results(results)
        return 0 if any(result.matches for result in results) else 1


def add_parser(subparsers, parent_parser):
ADD_HELP = "Debug DVC ignore/exclude files"
Copy link
Contributor

@jorgeorpinel jorgeorpinel Aug 3, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about something like "Check whether files or directories are excluded due to .dvcignore." ?

Per https://github.com/iterative/dvc.org/pull/1629/files

Copy link
Contributor Author

@karajan1001 karajan1001 Aug 6, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@johntharian
No problem.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong handle 😆


parser = subparsers.add_parser(
"check-ignore",
parents=[parent_parser],
description=append_doc_link(ADD_HELP, "check-ignore"),
help=ADD_HELP,
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"-d",
"--details",
action="store_true",
default=False,
help="Show the exclude pattern together with each target path.",
)
parser.add_argument(
"-n",
"--non-matching",
action="store_true",
default=False,
help="Show the target paths which don’t match any pattern. "
"Only usable when `--details` is also employed",
)
parser.add_argument(
"targets",
nargs="+",
help="Input files/directories to check " "ignore patterns.",
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
).complete = completion.FILE
parser.set_defaults(func=CmdCheckIgnore)
140 changes: 110 additions & 30 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
import logging
import os
import re
from collections import namedtuple
from itertools import groupby, takewhile

from pathspec.patterns import GitWildMatchPattern
from pathspec.util import normalize_file
from pygtrie import StringTrie

from dvc.exceptions import DvcException
from dvc.path_info import PathInfo
from dvc.pathspec_math import merge_patterns
from dvc.pathspec_math import PatternInfo, merge_patterns
from dvc.system import System
from dvc.utils import relpath

logger = logging.getLogger(__name__)


class OutOfWorkingSpaceError(DvcException):
    """Thrown when a path is outside the DVC repo or its ignore work space."""


class DvcIgnore:
DVCIGNORE_FILE = ".dvcignore"

Expand All @@ -24,28 +30,46 @@ def __call__(self, root, dirs, files):

class DvcIgnorePatterns(DvcIgnore):
def __init__(self, pattern_list, dirname):
if pattern_list:
if isinstance(pattern_list[0], str):
pattern_list = [
PatternInfo(pattern, "") for pattern in pattern_list
]

self.pattern_list = pattern_list
self.dirname = dirname
self.prefix = self.dirname + os.sep

regex_pattern_list = map(
GitWildMatchPattern.pattern_to_regex, pattern_list
)
self.regex_pattern_list = [
GitWildMatchPattern.pattern_to_regex(pattern_info.patterns)
for pattern_info in pattern_list
]

self.ignore_spec = [
(ignore, re.compile("|".join(item[0] for item in group)))
for ignore, group in groupby(regex_pattern_list, lambda x: x[1])
for ignore, group in groupby(
self.regex_pattern_list, lambda x: x[1]
)
if ignore is not None
]

@classmethod
def from_files(cls, ignore_file_path, tree):
assert os.path.isabs(ignore_file_path)
dirname = os.path.normpath(os.path.dirname(ignore_file_path))
ignore_file_rel_path = os.path.relpath(
ignore_file_path, tree.tree_root
)
with tree.open(ignore_file_path, encoding="utf-8") as fobj:
path_spec_lines = [
line for line in map(str.strip, fobj.readlines()) if line
PatternInfo(
line,
"{}:{}:{}".format(ignore_file_rel_path, line_no + 1, line),
)
for line_no, line in enumerate(
map(str.strip, fobj.readlines())
)
if line
]

return cls(path_spec_lines, dirname)
Expand All @@ -56,7 +80,7 @@ def __call__(self, root, dirs, files):

return dirs, files

def matches(self, dirname, basename, is_dir=False):
def _get_normalize_path(self, dirname, basename):
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
# NOTE: `relpath` is too slow, so we have to assume that both
# `dirname` and `self.dirname` are relative or absolute together.
if dirname == self.dirname:
Expand All @@ -66,10 +90,16 @@ def matches(self, dirname, basename, is_dir=False):
# NOTE: `os.path.join` is ~x5.5 slower
path = f"{rel}{os.sep}{basename}"
else:
return False
raise OutOfWorkingSpaceError(
f"`{dirname}` is out side of `{self.dirname}`"
)

if not System.is_unix():
path = normalize_file(path)
return path

def matches(self, dirname, basename, is_dir=False):
path = self._get_normalize_path(dirname, basename)
return self.ignore(path, is_dir)

def ignore(self, path, is_dir):
Expand All @@ -85,20 +115,43 @@ def ignore(self, path, is_dir):
result = ignore
return result

def match_details(self, dirname, basename, is_dir=False):
    """Return the match details for ``basename`` located in ``dirname``.

    The two parts are first turned into a path normalized relative to this
    pattern set; that path is then handed to ``_ignore_details`` together
    with the ``is_dir`` flag, and its result is returned unchanged.
    """
    return self._ignore_details(
        self._get_normalize_path(dirname, basename), is_dir
    )

def _ignore_details(self, path, is_dir):
result = []
for ignore, pattern in zip(self.regex_pattern_list, self.pattern_list):
regex = re.compile(ignore[0])
if regex.match(path) or (is_dir and regex.match(f"{path}/")):
if not pattern.file_info:
raise OutOfWorkingSpaceError(
f"`{path}` is not in work space."
)
result.append(pattern.file_info)

return result

def __hash__(self):
return hash(self.dirname + ":" + "\n".join(self.pattern_list))
return hash(self.dirname + ":" + str(self.pattern_list))

def __eq__(self, other):
if not isinstance(other, DvcIgnorePatterns):
return NotImplemented
return (self.dirname == other.dirname) & (
self.pattern_list == other.pattern_list
[pattern.patterns for pattern in self.pattern_list]
== [pattern.patterns for pattern in other.pattern_list]
)

def __bool__(self):
return bool(self.pattern_list)


CheckIgnoreResult = namedtuple(
"CheckIgnoreResult", ["file", "matches", "patterns"]
)


class DvcIgnoreFilterNoop:
def __init__(self, tree, root_dir):
pass
Expand All @@ -112,6 +165,9 @@ def is_ignored_dir(self, _):
def is_ignored_file(self, _):
return False

def check_ignore(self, _):
return []


class DvcIgnoreFilter:
@staticmethod
Expand Down Expand Up @@ -166,26 +222,25 @@ def _update(self, dirname):
def _update_sub_repo(self, root, dirs):
for d in dirs:
if self._is_dvc_repo(root, d):
new_pattern = DvcIgnorePatterns(["/{}/".format(d)], root)
old_pattern = self.ignores_trie_tree.longest_prefix(root).value
if old_pattern:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
["/{}/".format(d)],
root,
new_pattern.pattern_list,
new_pattern.dirname,
)
)
else:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
["/{}/".format(d)], root
)
self.ignores_trie_tree[root] = new_pattern

def __call__(self, root, dirs, files):
ignore_pattern = self._get_trie_pattern(root)
if ignore_pattern:
try:
ignore_pattern = self._get_trie_pattern(root)
return ignore_pattern(root, dirs, files)
else:
except OutOfWorkingSpaceError:
return dirs, files

def _get_trie_pattern(self, dirname):
Expand All @@ -195,8 +250,9 @@ def _get_trie_pattern(self, dirname):

prefix = self.ignores_trie_tree.longest_prefix(dirname).key
if not prefix:
# outside of the repo
return None
raise OutOfWorkingSpaceError(
f"`{dirname}` is out side of `{self.root_dir}`"
)

dirs = list(
takewhile(
Expand All @@ -213,14 +269,13 @@ def _get_trie_pattern(self, dirname):
return self.ignores_trie_tree.get(dirname)

def _is_ignored(self, path, is_dir=False):
if self._outside_repo(path):
return True
dirname, basename = os.path.split(os.path.normpath(path))
ignore_pattern = self._get_trie_pattern(dirname)
if ignore_pattern:
try:
self._is_inside_repo(path)
dirname, basename = os.path.split(os.path.normpath(path))
ignore_pattern = self._get_trie_pattern(dirname)
return ignore_pattern.matches(dirname, basename, is_dir)
else:
return False
except OutOfWorkingSpaceError:
return True

def is_ignored_dir(self, path):
path = os.path.abspath(path)
Expand All @@ -230,9 +285,10 @@ def is_ignored_dir(self, path):
return self._is_ignored(path, True)

def is_ignored_file(self, path):
path = os.path.abspath(path)
return self._is_ignored(path, False)

def _outside_repo(self, path):
def _is_inside_repo(self, path):
path = PathInfo(path)

# paths outside of the repo should be ignored
Expand All @@ -243,5 +299,29 @@ def _outside_repo(self, path):
[os.path.abspath(path), self.root_dir]
)
):
return True
return False
raise OutOfWorkingSpaceError(f"{path} is out of {self.root_dir}")

def check_ignore(self, targets):
    """Check every target against the ignore patterns that apply to it.

    Returns a list with one ``CheckIgnoreResult`` per target, in input
    order. A target with at least one matching pattern yields
    ``(target, True, <match details>)``. Any other target, including one
    for which ``OutOfWorkingSpaceError`` is raised during the lookup,
    yields ``(target, False, ["::"])``.
    """
    outcomes = []
    for target in targets:
        abs_target = os.path.abspath(target)
        try:
            self._is_inside_repo(abs_target)
            parent, name = os.path.split(os.path.normpath(abs_target))
            trie_pattern = self._get_trie_pattern(parent)
            details = trie_pattern.match_details(
                parent, name, os.path.isdir(abs_target)
            )
        except OutOfWorkingSpaceError:
            # Unreachable targets are reported as "no match".
            details = None

        if details:
            outcomes.append(CheckIgnoreResult(target, True, details))
        else:
            outcomes.append(CheckIgnoreResult(target, False, ["::"]))

    return outcomes
2 changes: 1 addition & 1 deletion dvc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def main(argv=None):
# so won't be reused by any other subsequent run anyway.
clean_repos()

if ret != 0:
if ret != 0 and ret != 1:
karajan1001 marked this conversation as resolved.
Show resolved Hide resolved
logger.info(FOOTER)

if analytics.is_enabled():
Expand Down
8 changes: 7 additions & 1 deletion dvc/pathspec_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
# of two path specification patterns with different base
# All the operations follow the documents of `gitignore`
import os
from collections import namedtuple

from pathspec.util import normalize_file

from dvc.utils import relpath

PatternInfo = namedtuple("PatternInfo", ["patterns", "file_info"])


def _not_ignore(rule):
return (True, rule[1:]) if rule.startswith("!") else (False, rule)
Expand Down Expand Up @@ -59,7 +62,10 @@ def _change_dirname(dirname, pattern_list, new_dirname):
if rel.startswith(".."):
raise ValueError("change dirname can only change to parent path")

return [change_rule(rule, rel) for rule in pattern_list]
return [
PatternInfo(change_rule(rule.patterns, rel), rule.file_info)
for rule in pattern_list
]


def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b):
Expand Down
Loading