From 2b2a4d378a91eb15e92ee2d875b5a7f61e368165 Mon Sep 17 00:00:00 2001 From: Tsotne Tabidze Date: Mon, 26 Apr 2021 18:40:47 -0700 Subject: [PATCH] Allow Feast apply to import files recursively (and add .feastignore) (#1482) * feast apply should import files recursively, add .feastignore Signed-off-by: Tsotne Tabidze * Add assertpy to ci dependencies Signed-off-by: Tsotne Tabidze * Simplify reading using Path.read_text() and update the test file after merging with master Signed-off-by: Tsotne Tabidze * Add documentation Signed-off-by: Tsotne Tabidze --- docs/SUMMARY.md | 1 + docs/concepts/feature-repository.md | 39 +++++-- docs/reference/feast-ignore.md | 32 ++++++ sdk/python/feast/repo_operations.py | 61 +++++++++- sdk/python/setup.py | 3 +- sdk/python/tests/test_cli_local.py | 46 ++++---- sdk/python/tests/test_repo_operations.py | 140 +++++++++++++++++++++++ 7 files changed, 287 insertions(+), 35 deletions(-) create mode 100644 docs/reference/feast-ignore.md create mode 100644 sdk/python/tests/test_repo_operations.py diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 9f8362b8c6..1403f3d323 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -24,6 +24,7 @@ ## Reference * [feature\_store.yaml](reference/feature-store-yaml.md) +* [.feastignore](reference/feast-ignore.md) * [Python API reference](http://rtd.feast.dev/) ## Feast on Kubernetes diff --git a/docs/concepts/feature-repository.md b/docs/concepts/feature-repository.md index e65477c6c3..dd95660232 100644 --- a/docs/concepts/feature-repository.md +++ b/docs/concepts/feature-repository.md @@ -10,6 +10,7 @@ A feature repository consists of: * A collection of Python files containing feature declarations. * A `feature_store.yaml` file containing infrastructural configuration. +* A `.feastignore` file containing paths in the feature repository to ignore. {% hint style="info" %} Typically, users store their feature repositories in a Git repository, especially when working in teams. However, using Git is not a requirement. @@ -19,27 +20,28 @@ Typically, users store their feature repositories in a Git repository, especiall The structure of a feature repository is as follows: -* The root of the repository should contain a `feature_store.yaml` file. +* The root of the repository should contain a `feature_store.yaml` file and may contain a `.feastignore` file. * The repository should contain Python files that contain feature definitions. * The repository can contain other files as well, including documentation and potentially data files. An example structure of a feature repository is shown below: ```text -$ tree +$ tree -a . ├── data │ └── driver_stats.parquet ├── driver_features.py -└── feature_store.yaml +├── feature_store.yaml +└── .feastignore -1 directory, 3 files +1 directory, 4 files ``` A couple of things to note about the feature repository: -* Feast does not currently read through subdirectories of the feature repository when commands. All feature definition files must reside at the root of the repository. -* Feast reads _all_ Python files when `feast apply` is ran, even if they don't contain feature definitions. It's recommended to store imperative scripts in a different location than inside the feature registry for this purpose. +* Feast reads _all_ Python files recursively when `feast apply` is ran, including subdirectories, even if they don't contain feature definitions. +* It's recommended to add `.feastignore` and add paths to all imperative scripts if you need to store them inside the feature registry. ## The feature\_store.yaml configuration file @@ -57,6 +59,28 @@ online_store: The `feature_store.yaml` file configures how the feature store should run. See [feature\_store.yaml](../reference/feature-store-yaml.md) for more details. +## The .feastignore file + +This file contains paths that should be ignored when running `feast apply`. An example `.feastignore` is shown below: + +{% code title=".feastignore" %} +``` +# Ignore virtual environment +venv + +# Ignore a specific Python file +scripts/foo.py + +# Ignore all Python files directly under scripts directory +scripts/*.py + +# Ignore all "foo.py" anywhere under scripts directory +scripts/**/foo.py +``` +{% endcode %} + +See [.feastignore](../reference/feast-ignore.md) for more details. + ## Feature definitions A feature repository can also contain one or more Python files that contain feature definitions. An example feature definition file is shown below: @@ -97,5 +121,4 @@ To declare new feature definitions, just add code to the feature repository, eit ### Next steps * See [Create a feature repository](../how-to-guides/create-a-feature-repository.md) to get started with an example feature repository. -* See [feature\_store.yaml](../reference/feature-store-yaml.md) or [Feature Views](feature-views.md) for more information on the configuration files that live in a feature registry. - +* See [feature\_store.yaml](../reference/feature-store-yaml.md), [.feastignore](../reference/feast-ignore.md) or [Feature Views](feature-views.md) for more information on the configuration files that live in a feature registry. diff --git a/docs/reference/feast-ignore.md b/docs/reference/feast-ignore.md new file mode 100644 index 0000000000..c5a0846d7c --- /dev/null +++ b/docs/reference/feast-ignore.md @@ -0,0 +1,32 @@ +# .feastignore + +## Overview + +`.feastignore` is a file that is placed at the root of the [Feature Repository](../concepts/feature-repository.md). This file contains paths that should be ignored when running `feast apply`. An example `.feastignore` is shown below: + +{% code title=".feastignore" %} +``` +# Ignore virtual environment +venv + +# Ignore a specific Python file +scripts/foo.py + +# Ignore all Python files directly under scripts directory +scripts/*.py + +# Ignore all "foo.py" anywhere under scripts directory +scripts/**/foo.py +``` +{% endcode %} + +`.feastignore` file is optional. If the file can not be found, every Python in the feature repo directory will be parsed by `feast apply`. + +## Feast Ignore Patterns + +| Pattern | Example matches | Explanation | +| ----------------- | -------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| venv | venv/foo.py
venv/a/foo.py | You can specify a path to a specific directory. Everything in that directory will be ignored. | +| scripts/foo.py | scripts/foo.py | You can specify a path to a specific file. Only that file will be ignored. | +| scripts/*.py | scripts/foo.py
scripts/bar.py | You can specify asterisk (*) anywhere in the expression. An asterisk matches zero or more characters, except "/". | +| scripts/**/foo.py | scripts/foo.py
scripts/a/foo.py
scripts/a/b/foo.py | You can specify double asterisk (**) anywhere in the expression. A double asterisk matches zero or more directories. | diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index daaf441733..552d3a4bc3 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -5,7 +5,7 @@ from datetime import timedelta from importlib.abc import Loader from pathlib import Path -from typing import List, NamedTuple, Union +from typing import List, NamedTuple, Set, Union import click @@ -31,15 +31,64 @@ class ParsedRepo(NamedTuple): entities: List[Entity] +def read_feastignore(repo_root: Path) -> List[str]: + """Read .feastignore in the repo root directory (if exists) and return the list of user-defined ignore paths""" + feast_ignore = repo_root / ".feastignore" + if not feast_ignore.is_file(): + return [] + lines = feast_ignore.read_text().strip().split("\n") + ignore_paths = [] + for line in lines: + # Remove everything after the first occurance of "#" symbol (comments) + if line.find("#") >= 0: + line = line[: line.find("#")] + # Strip leading or ending whitespaces + line = line.strip() + # Add this processed line to ignore_paths if it's not empty + if len(line) > 0: + ignore_paths.append(line) + return ignore_paths + + +def get_ignore_files(repo_root: Path, ignore_paths: List[str]) -> Set[Path]: + """Get all ignore files that match any of the user-defined ignore paths""" + ignore_files = set() + for ignore_path in ignore_paths: + # ignore_path may contains matchers (* or **). Use glob() to match user-defined path to actual paths + for matched_path in repo_root.glob(ignore_path): + if matched_path.is_file(): + # If the matched path is a file, add that to ignore_files set + ignore_files.add(matched_path.resolve()) + else: + # Otherwise, list all Python files in that directory and add all of them to ignore_files set + ignore_files |= { + sub_path.resolve() + for sub_path in matched_path.glob("**/*.py") + if sub_path.is_file() + } + return ignore_files + + +def get_repo_files(repo_root: Path) -> List[Path]: + """Get the list of all repo files, ignoring undesired files & directories specified in .feastignore""" + # Read ignore paths from .feastignore and create a set of all files that match any of these paths + ignore_paths = read_feastignore(repo_root) + ignore_files = get_ignore_files(repo_root, ignore_paths) + + # List all Python files in the root directory (recursively) + repo_files = {p.resolve() for p in repo_root.glob("**/*.py") if p.is_file()} + # Ignore all files that match any of the ignore paths in .feastignore + repo_files -= ignore_files + + # Sort repo_files to read them in the same order every time + return sorted(repo_files) + + def parse_repo(repo_root: Path) -> ParsedRepo: """ Collect feature table definitions from feature repo """ res = ParsedRepo(feature_tables=[], entities=[], feature_views=[]) - # FIXME: process subdirs but exclude hidden ones - repo_files = [p.resolve() for p in repo_root.glob("*.py")] - - for repo_file in repo_files: - + for repo_file in get_repo_files(repo_root): module_path = py_path_to_module(repo_file, repo_root) module = importlib.import_module(module_path) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 70dbb9b429..93536e9cbf 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -88,7 +88,8 @@ "adlfs==0.5.9", "firebase-admin==4.5.2", "google-cloud-datastore==2.1.0", - "pre-commit" + "pre-commit", + "assertpy==1.1", ] # README file from Feast repo root directory diff --git a/sdk/python/tests/test_cli_local.py b/sdk/python/tests/test_cli_local.py index 115cfaaf07..1ac20c16bb 100644 --- a/sdk/python/tests/test_cli_local.py +++ b/sdk/python/tests/test_cli_local.py @@ -3,6 +3,8 @@ from pathlib import Path from textwrap import dedent +import assertpy + from feast.feature_store import FeatureStore from tests.cli_utils import CliRunner from tests.online_read_write_test import basic_rw_test @@ -39,31 +41,31 @@ def test_workflow() -> None: ) result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) # entity & feature view list commands should succeed result = runner.run(["entities", "list"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) result = runner.run(["feature-views", "list"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) # entity & feature view describe commands should succeed when objects exist result = runner.run(["entities", "describe", "driver"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) result = runner.run( ["feature-views", "describe", "driver_locations"], cwd=repo_path ) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) # entity & feature view describe commands should fail when objects don't exist result = runner.run(["entities", "describe", "foo"], cwd=repo_path) - assert result.returncode == 1 + assertpy.assert_that(result.returncode).is_equal_to(1) result = runner.run(["feature-views", "describe", "foo"], cwd=repo_path) - assert result.returncode == 1 + assertpy.assert_that(result.returncode).is_equal_to(1) # Doing another apply should be a no op, and should not cause errors result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) basic_rw_test( FeatureStore(repo_path=str(repo_path), config=None), @@ -71,7 +73,7 @@ def test_workflow() -> None: ) result = runner.run(["teardown"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) def test_non_local_feature_repo() -> None: @@ -104,13 +106,13 @@ def test_non_local_feature_repo() -> None: ) result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) fs = FeatureStore(repo_path=str(repo_path)) - assert len(fs.list_feature_views()) == 3 + assertpy.assert_that(fs.list_feature_views()).is_length(3) result = runner.run(["teardown"], cwd=repo_path) - assert result.returncode == 0 + assertpy.assert_that(result.returncode).is_equal_to(0) @contextmanager @@ -150,19 +152,23 @@ def test_3rd_party_providers() -> None: # Check with incorrect built-in provider name (no dots) with setup_third_party_provider_repo("feast123") as repo_path: return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assert return_code == 1 - assert b"Provider 'feast123' is not implemented" in output + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains(b"Provider 'feast123' is not implemented") # Check with incorrect third-party provider name (with dots) with setup_third_party_provider_repo("feast_foo.provider") as repo_path: return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assert return_code == 1 - assert b"Could not import provider module 'feast_foo'" in output + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b"Could not import provider module 'feast_foo'" + ) # Check with incorrect third-party provider name (with dots) - with setup_third_party_provider_repo("foo.provider") as repo_path: + with setup_third_party_provider_repo("foo.FooProvider") as repo_path: return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assert return_code == 1 - assert b"Could not import provider 'provider' from module 'foo'" in output + assertpy.assert_that(return_code).is_equal_to(1) + assertpy.assert_that(output).contains( + b"Could not import provider 'FooProvider' from module 'foo'" + ) # Check with correct third-party provider name with setup_third_party_provider_repo("foo.provider.FooProvider") as repo_path: return_code, output = runner.run_with_output(["apply"], cwd=repo_path) - assert return_code == 0 + assertpy.assert_that(return_code).is_equal_to(0) diff --git a/sdk/python/tests/test_repo_operations.py b/sdk/python/tests/test_repo_operations.py new file mode 100644 index 0000000000..70c8b05c2e --- /dev/null +++ b/sdk/python/tests/test_repo_operations.py @@ -0,0 +1,140 @@ +from contextlib import contextmanager +from pathlib import Path +from tempfile import TemporaryDirectory +from textwrap import dedent +from typing import Optional + +import assertpy + +from feast.repo_operations import get_ignore_files, get_repo_files, read_feastignore + + +@contextmanager +def feature_repo(feastignore_contents: Optional[str]): + with TemporaryDirectory() as tmp_dir: + repo_root = Path(tmp_dir) + (repo_root / "foo").mkdir() + (repo_root / "foo1").mkdir() + (repo_root / "foo1/bar").mkdir() + (repo_root / "bar").mkdir() + (repo_root / "bar/subdir1").mkdir() + (repo_root / "bar/subdir1/subdir2").mkdir() + + (repo_root / "a.py").touch() + (repo_root / "foo/b.py").touch() + (repo_root / "foo1/c.py").touch() + (repo_root / "foo1/bar/d.py").touch() + (repo_root / "bar/e.py").touch() + (repo_root / "bar/subdir1/f.py").touch() + (repo_root / "bar/subdir1/subdir2/g.py").touch() + + if feastignore_contents: + with open(repo_root / ".feastignore", "w") as f: + f.write(feastignore_contents) + + yield repo_root + + +def test_feastignore_no_file(): + # Tests feature repo without .feastignore file + with feature_repo(None) as repo_root: + assertpy.assert_that(read_feastignore(repo_root)).is_equal_to([]) + assertpy.assert_that(get_ignore_files(repo_root, [])).is_equal_to(set()) + assertpy.assert_that(get_repo_files(repo_root)).is_equal_to( + [ + (repo_root / "a.py").resolve(), + (repo_root / "bar/e.py").resolve(), + (repo_root / "bar/subdir1/f.py").resolve(), + (repo_root / "bar/subdir1/subdir2/g.py").resolve(), + (repo_root / "foo/b.py").resolve(), + (repo_root / "foo1/bar/d.py").resolve(), + (repo_root / "foo1/c.py").resolve(), + ] + ) + + +def test_feastignore_no_stars(): + # Tests .feastignore that doesn't contain "*" in paths + feastignore_contents = dedent( + """ + # We can put some comments here + + foo # match directory + bar/subdir1/f.py # match specific file + """ + ) + with feature_repo(feastignore_contents) as repo_root: + ignore_paths = ["foo", "bar/subdir1/f.py"] + assertpy.assert_that(read_feastignore(repo_root)).is_equal_to(ignore_paths) + assertpy.assert_that(get_ignore_files(repo_root, ignore_paths)).is_equal_to( + { + (repo_root / "foo/b.py").resolve(), + (repo_root / "bar/subdir1/f.py").resolve(), + } + ) + assertpy.assert_that(get_repo_files(repo_root)).is_equal_to( + [ + (repo_root / "a.py").resolve(), + (repo_root / "bar/e.py").resolve(), + (repo_root / "bar/subdir1/subdir2/g.py").resolve(), + (repo_root / "foo1/bar/d.py").resolve(), + (repo_root / "foo1/c.py").resolve(), + ] + ) + + +def test_feastignore_with_stars(): + # Tests .feastignore that contains "*" and "**" in paths + feastignore_contents = dedent( + """ + foo/*.py # match python files directly under foo/ + bar/** # match everything (recursively) under bar/ + */c.py # match c.py in any directory + */d.py # match d.py in any directory (this shouldn't match anything) + """ + ) + with feature_repo(feastignore_contents) as repo_root: + ignore_paths = ["foo/*.py", "bar/**", "*/c.py", "*/d.py"] + assertpy.assert_that(read_feastignore(repo_root)).is_equal_to(ignore_paths) + assertpy.assert_that(get_ignore_files(repo_root, ignore_paths)).is_equal_to( + { + (repo_root / "foo/b.py").resolve(), + (repo_root / "bar/subdir1/f.py").resolve(), + (repo_root / "bar/e.py").resolve(), + (repo_root / "bar/subdir1/f.py").resolve(), + (repo_root / "bar/subdir1/subdir2/g.py").resolve(), + (repo_root / "foo1/c.py").resolve(), + } + ) + assertpy.assert_that(get_repo_files(repo_root)).is_equal_to( + [(repo_root / "a.py").resolve(), (repo_root / "foo1/bar/d.py").resolve()] + ) + + +def test_feastignore_with_stars2(): + # Another test of .feastignore that contains "**" in paths + feastignore_contents = dedent( + """ + # match everything (recursively) that has "bar" in its path + **/bar/** + """ + ) + with feature_repo(feastignore_contents) as repo_root: + ignore_paths = ["**/bar/**"] + assertpy.assert_that(read_feastignore(repo_root)).is_equal_to(ignore_paths) + assertpy.assert_that(get_ignore_files(repo_root, ignore_paths)).is_equal_to( + { + (repo_root / "bar/subdir1/f.py").resolve(), + (repo_root / "bar/e.py").resolve(), + (repo_root / "bar/subdir1/f.py").resolve(), + (repo_root / "bar/subdir1/subdir2/g.py").resolve(), + (repo_root / "foo1/bar/d.py").resolve(), + } + ) + assertpy.assert_that(get_repo_files(repo_root)).is_equal_to( + [ + (repo_root / "a.py").resolve(), + (repo_root / "foo/b.py").resolve(), + (repo_root / "foo1/c.py").resolve(), + ] + )