From 3c392610df93a18cb573dd70f09f11b7792fcbd6 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Wed, 1 Sep 2021 11:46:18 -0400 Subject: [PATCH 01/19] Adding initial type support related tests for BQ (#1768) * Adding initial type support related tests for BQ Signed-off-by: Danny Chiao --- sdk/python/feast/inference.py | 5 +- sdk/python/tests/data/data_creator.py | 45 ++++- .../integration/e2e/test_universal_e2e.py | 2 +- .../feature_repos/test_repo_configuration.py | 99 +++++++++- .../feature_repos/universal/entities.py | 4 +- .../feature_repos/universal/feature_views.py | 6 +- .../test_offline_online_store_consistency.py | 12 +- .../registration/test_universal_types.py | 173 ++++++++++++++++++ 8 files changed, 324 insertions(+), 22 deletions(-) create mode 100644 sdk/python/tests/integration/registration/test_universal_types.py diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 0417cae257..0ee2a437a6 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -29,9 +29,11 @@ def update_entities_with_inferred_types_from_feature_views( col_names_and_types = view.batch_source.get_table_column_names_and_types(config) for entity_name in view.entities: if entity_name in incomplete_entities: + entity = incomplete_entities[entity_name] + # get entity information from information extracted from the view batch source extracted_entity_name_type_pairs = list( - filter(lambda tup: tup[0] == entity_name, col_names_and_types) + filter(lambda tup: tup[0] == entity.join_key, col_names_and_types,) ) if len(extracted_entity_name_type_pairs) == 0: # Doesn't mention inference error because would also be an error without inferencing @@ -40,7 +42,6 @@ def update_entities_with_inferred_types_from_feature_views( its entity's name.""" ) - entity = incomplete_entities[entity_name] inferred_value_type = view.batch_source.source_datatype_to_feast_value_type()( extracted_entity_name_type_pairs[0][1] ) diff --git a/sdk/python/tests/data/data_creator.py b/sdk/python/tests/data/data_creator.py index a6fc9d423d..b6f377e581 100644 --- a/sdk/python/tests/data/data_creator.py +++ b/sdk/python/tests/data/data_creator.py @@ -1,15 +1,22 @@ from datetime import datetime, timedelta +from typing import List import pandas as pd from pytz import timezone, utc +from feast.value_type import ValueType -def create_dataset() -> pd.DataFrame: - now = datetime.utcnow() + +def create_dataset( + entity_type: ValueType = ValueType.INT32, + feature_dtype: str = None, + feature_is_list: bool = False, +) -> pd.DataFrame: + now = datetime.now().replace(microsecond=0, second=0, minute=0) ts = pd.Timestamp(now).round("ms") data = { - "id": [1, 2, 1, 3, 3], - "value": [0.1, None, 0.3, 4, 5], + "driver_id": get_entities_for_value_type(entity_type), + "value": get_feature_values_for_dtype(feature_dtype, feature_is_list), "ts_1": [ ts - timedelta(hours=4), ts, @@ -25,3 +32,33 @@ def create_dataset() -> pd.DataFrame: "created_ts": [ts, ts, ts, ts, ts], } return pd.DataFrame.from_dict(data) + + +def get_entities_for_value_type(value_type: ValueType) -> List: + value_type_map = { + ValueType.INT32: [1, 2, 1, 3, 3], + ValueType.INT64: [1, 2, 1, 3, 3], + ValueType.FLOAT: [1.0, 2.0, 1.0, 3.0, 3.0], + ValueType.STRING: ["1", "2", "1", "3", "3"], + } + return value_type_map[value_type] + + +def get_feature_values_for_dtype(dtype: str, is_list: bool) -> List: + if dtype is None: + return [0.1, None, 0.3, 4, 5] + # TODO(adchia): for int columns, consider having a better error when dealing with None values (pandas 
int dfs can't + # have na) + dtype_map = { + "int32": [1, 2, 3, 4, 5], + "int64": [1, 2, 3, 4, 5], + "float": [1.0, None, 3.0, 4.0, 5.0], + "string": ["1", None, "3", "4", "5"], + "bool": [True, None, False, True, False], + } + non_list_val = dtype_map[dtype] + # Duplicate the value once if this is a list + if is_list: + return [[n, n] if n is not None else None for n in non_list_val] + else: + return non_list_val diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index 5c89d9b966..d0f000163f 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -78,7 +78,7 @@ def check_offline_and_online_features( def run_offline_online_store_consistency_test( fs: FeatureStore, fv: FeatureView ) -> None: - now = datetime.utcnow() + now = datetime.now() full_feature_names = True check_offline_store: bool = True diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py index 463cfc5c5d..062cf7c184 100644 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py @@ -10,6 +10,7 @@ from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer from feast.data_source import DataSource +from feast.value_type import ValueType from tests.data.data_creator import create_dataset from tests.integration.feature_repos.universal.data_source_creator import ( DataSourceCreator, @@ -70,7 +71,6 @@ def ds_creator_path(cls: str): ), ] - OFFLINE_STORES: List[str] = [] ONLINE_STORES: List[str] = [] PROVIDERS: List[str] = [] @@ -83,6 +83,9 @@ class Environment: feature_store: FeatureStore data_source: DataSource data_source_creator: DataSourceCreator + entity_type: ValueType + feature_dtype: str + feature_is_list: bool end_date = datetime.now().replace(microsecond=0, second=0, minute=0) start_date = end_date - timedelta(days=7) @@ -199,6 +202,9 @@ def construct_test_environment( test_repo_config: TestRepoConfig, create_and_apply: bool = False, materialize: bool = False, + entity_type: ValueType = ValueType.INT32, + feature_dtype: str = None, + feature_is_list: bool = False, ) -> Environment: """ This method should take in the parameters from the test repo config and created a feature repo, apply it, @@ -208,9 +214,14 @@ def construct_test_environment( The user is *not* expected to perform any clean up actions. :param test_repo_config: configuration + :param create_and_apply: whether to create and apply the repo config + :param materialize: whether to materialize features to online store + :param entity_type: the data type for the entity column (i.e. id) + :param feature_dtype: the data type for the feature column (i.e. value) + :param feature_is_list: whether the feature column (i.e. value) should be a list feature :return: A feature store built using the supplied configuration. 
""" - df = create_dataset() + df = create_dataset(entity_type, feature_dtype, feature_is_list) project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}" @@ -221,9 +232,7 @@ def construct_test_environment( offline_creator: DataSourceCreator = importer.get_class_from_type( module_name, config_class_name, "DataSourceCreator" )(project) - ds = offline_creator.create_data_source( - project, df, field_mapping={"ts_1": "ts", "id": "driver_id"} - ) + ds = offline_creator.create_data_source(project, df, field_mapping={"ts_1": "ts"}) offline_store = offline_creator.create_offline_store_config() online_store = test_repo_config.online_store @@ -243,6 +252,9 @@ def construct_test_environment( feature_store=fs, data_source=ds, data_source_creator=offline_creator, + entity_type=entity_type, + feature_dtype=feature_dtype, + feature_is_list=feature_is_list, ) fvs = [] @@ -341,3 +353,80 @@ def inner_test(config): online_test(environment) return inner_test + + +def parametrize_types_no_materialize_test(types_test): + """ + This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and + not materialize said features + """ + return _parametrize_types_test_internal(types_test, create_apply_materialize=False) + + +def parametrize_types_materialize_test(types_test): + """ + This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and + materialize said features + """ + return _parametrize_types_test_internal(types_test, create_apply_materialize=True) + + +def parametrize_types_no_materialize_test_no_list(types_test): + """ + This decorator should be used by tests that want to parametrize by different kinds of entity + feature types, but + not materializing and not allowing for feature list types + """ + return _parametrize_types_test_internal( + types_test, create_apply_materialize=False, vary_feature_is_list=False + ) + + +def _parametrize_types_test_internal( + types_test, create_apply_materialize: bool, vary_feature_is_list: bool = True +): + def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str): + return f"entity_type:{str(entity_type)}-feature_dtype:{feature_dtype}" + + # TODO(adchia): consider adding timestamp / bytes for feature_dtypes + # TODO(adchia): test materializing float entity types and ensure we throw an error before querying BQ + entity_type_feature_dtypes = [ + (ValueType.INT32, "int32"), + (ValueType.INT64, "int64"), + (ValueType.STRING, "float"), + (ValueType.STRING, "bool"), + ] + + # TODO(adchia): fix conversion to allow for lists in materialization + feature_is_list = [True, False] if vary_feature_is_list else [False] + + @pytest.mark.integration + @pytest.mark.parametrize( + "entity_type,feature_dtype", + entity_type_feature_dtypes, + ids=[ + entity_feature_types_ids(entity_type, feature_dtype) + for entity_type, feature_dtype in entity_type_feature_dtypes + ], + ) + @pytest.mark.parametrize( + "feature_is_list", feature_is_list, ids=lambda v: f"feature_is_list:{str(v)}" + ) + def inner_test(entity_type: ValueType, feature_dtype: str, feature_is_list: bool): + # TODO: parametrize config + with construct_test_environment( + TestRepoConfig( + provider="gcp", + offline_store_creator=ds_creator_path( + "bigquery.BigQueryDataSourceCreator" + ), + online_store="datastore", + ), + create_and_apply=create_apply_materialize, + materialize=create_apply_materialize, + entity_type=entity_type, + feature_dtype=feature_dtype, + feature_is_list=feature_is_list, + ) as 
environment: + types_test(environment) + + return inner_test diff --git a/sdk/python/tests/integration/feature_repos/universal/entities.py b/sdk/python/tests/integration/feature_repos/universal/entities.py index 1db362043b..8886e813d5 100644 --- a/sdk/python/tests/integration/feature_repos/universal/entities.py +++ b/sdk/python/tests/integration/feature_repos/universal/entities.py @@ -1,10 +1,10 @@ from feast import Entity, ValueType -def driver(): +def driver(value_type: ValueType = ValueType.INT64): return Entity( name="driver", # The name is derived from this argument, not object name. - value_type=ValueType.INT64, + value_type=value_type, description="driver id", join_key="driver_id", ) diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 0306044ecd..dace1ab502 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -5,12 +5,14 @@ def driver_feature_view( - data_source: DataSource, name="test_correctness" + data_source: DataSource, + name="test_correctness", + value_type: ValueType = ValueType.FLOAT, ) -> FeatureView: return FeatureView( name=name, entities=["driver"], - features=[Feature("value", ValueType.FLOAT)], + features=[Feature("value", value_type)], ttl=timedelta(days=5), input=data_source, ) diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py index 3eac9073db..2c48f0fbe6 100644 --- a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py +++ b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py @@ -59,7 +59,7 @@ def prep_bq_fs_and_fv( event_timestamp_column="ts", created_timestamp_column="created_ts", date_partition_column="", - field_mapping={"ts_1": "ts", "id": "driver_id"}, + field_mapping={"ts_1": "ts"}, ) fv = driver_feature_view(bigquery_source) @@ -122,7 +122,7 @@ def prep_redshift_fs_and_fv( event_timestamp_column="ts", created_timestamp_column="created_ts", date_partition_column="", - field_mapping={"ts_1": "ts", "id": "driver_id"}, + field_mapping={"ts_1": "ts"}, ) fv = driver_feature_view(redshift_source) @@ -171,7 +171,7 @@ def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: event_timestamp_column="ts", created_timestamp_column="created_ts", date_partition_column="", - field_mapping={"ts_1": "ts", "id": "driver_id"}, + field_mapping={"ts_1": "ts"}, ) fv = driver_feature_view(file_source) e = Entity( @@ -212,7 +212,7 @@ def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: event_timestamp_column="ts", created_timestamp_column="created_ts", date_partition_column="", - field_mapping={"ts_1": "ts", "id": "driver_id"}, + field_mapping={"ts_1": "ts"}, ) fv = driver_feature_view(file_source) e = Entity( @@ -254,7 +254,7 @@ def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: event_timestamp_column="ts", created_timestamp_column="created_ts", date_partition_column="", - field_mapping={"ts_1": "ts", "id": "driver_id"}, + field_mapping={"ts_1": "ts"}, ) fv = driver_feature_view(file_source) e = Entity( @@ -332,7 +332,7 @@ def check_offline_and_online_features( def run_offline_online_store_consistency_test( fs: FeatureStore, fv: FeatureView, full_feature_names: bool, ) -> None: - now = datetime.utcnow() 
+ now = datetime.now() # Run materialize() # use both tz-naive & tz-aware timestamps to test that they're both correctly handled start_date = (now - timedelta(hours=5)).replace(tzinfo=utc) diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py new file mode 100644 index 0000000000..99c1e91f23 --- /dev/null +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -0,0 +1,173 @@ +from datetime import datetime, timedelta + +import pandas as pd +from data.data_creator import get_feature_values_for_dtype +from integration.feature_repos.test_repo_configuration import ( + Environment, + parametrize_types_no_materialize_test, + parametrize_types_no_materialize_test_no_list, +) +from integration.feature_repos.universal.entities import driver +from integration.feature_repos.universal.feature_views import driver_feature_view + +from feast.infra.offline_stores.offline_store import RetrievalJob +from feast.type_map import python_type_to_feast_value_type +from feast.value_type import ValueType + + +# TODO: change parametrization to allow for other providers aside from gcp +@parametrize_types_no_materialize_test +def test_entity_inference_types_match(environment: Environment): + feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) + # Don't specify value type in entity to force inference + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) + + entities = fs.list_entities() + entity_type_to_expected_inferred_entity_type = { + ValueType.INT32: ValueType.INT64, + ValueType.INT64: ValueType.INT64, + ValueType.FLOAT: ValueType.DOUBLE, + ValueType.STRING: ValueType.STRING, + } + for entity in entities: + assert ( + entity.value_type + == entity_type_to_expected_inferred_entity_type[environment.entity_type] + ) + + +@parametrize_types_no_materialize_test +def test_feature_get_historical_features_types_match(environment: Environment): + feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) + entity = driver() + fs.apply([fv, entity]) + + features = [f"{fv.name}:value"] + df = pd.DataFrame() + df["driver_id"] = ( + ["1", "3"] if environment.entity_type == ValueType.STRING else [1, 3] + ) + now = datetime.utcnow() + ts = pd.Timestamp(now).round("ms") + df["ts"] = [ + ts - timedelta(hours=4), + ts - timedelta(hours=2), + ] + historical_features = fs.get_historical_features(entity_df=df, features=features,) + + # TODO(adchia): pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs + historical_features_df = historical_features.to_df() + print(historical_features_df) + if feature_is_list: + assert_feature_list_types(feature_dtype, historical_features_df) + else: + assert_expected_historical_feature_types(feature_dtype, historical_features_df) + assert_expected_arrow_types(feature_dtype, feature_is_list, historical_features) + + +@parametrize_types_no_materialize_test_no_list +def test_feature_get_online_features_types_match(environment: Environment): + feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) + if feature_is_list: + pass + + features = [fv.name + ":value"] + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) + fs.materialize(environment.start_date, environment.end_date) + driver_id_value = "1" if environment.entity_type == ValueType.STRING else 1 + online_features = fs.get_online_features( + features=features, entity_rows=[{"driver": driver_id_value}], + ).to_dict() + + feature_list_dtype_to_expected_online_response_value_type = { + "int32": "int", + "int64": "int", + "float": "float", + "string": "str", + "bool": "bool", + } + assert ( + type(online_features["value"][0]).__name__ + == feature_list_dtype_to_expected_online_response_value_type[feature_dtype] + ) + + +def get_test_fixtures(environment: Environment): + feature_dtype = environment.feature_dtype + feature_is_list = environment.feature_is_list + fs, fv = ( + environment.feature_store, + driver_feature_view( + environment.data_source, + value_type=python_type_to_feast_value_type( + feature_dtype, + value=get_feature_values_for_dtype(feature_dtype, feature_is_list)[0], + ), + ), + ) + return feature_dtype, feature_is_list, fs, fv + + +def assert_expected_historical_feature_types( + feature_dtype: str, historical_features_df: pd.DataFrame +): + print("Asserting historical feature types") + feature_dtype_to_expected_historical_feature_dtype = { + "int32": "int64", + "int64": "int64", + "float": "float64", + "string": "object", + "bool": "bool", + } + assert ( + str(historical_features_df.dtypes["value"]) + == feature_dtype_to_expected_historical_feature_dtype[feature_dtype] + ) + + +def assert_feature_list_types(feature_dtype: str, historical_features_df: pd.DataFrame): + print("Asserting historical feature list types") + # Note, these expected values only hold for BQ + feature_list_dtype_to_expected_historical_feature_list_dtype = { + "int32": "int", + "int64": "int", + "float": "float", + "string": "str", + "bool": "bool", + } + assert str(historical_features_df.dtypes["value"]) == "object" + # Note, this struct schema is only true for BQ and not for other stores + assert ( + type(historical_features_df.value[0]["list"][0]["item"]).__name__ + == feature_list_dtype_to_expected_historical_feature_list_dtype[feature_dtype] + ) + + +def assert_expected_arrow_types( + feature_dtype: str, feature_is_list: bool, historical_features: RetrievalJob +): + print("Asserting historical feature arrow types") + historical_features_arrow = historical_features.to_arrow() + print(historical_features_arrow) + feature_list_dtype_to_expected_historical_feature_arrow_type = { + "int32": "int64", + "int64": "int64", + "float": "double", + "string": "string", + "bool": "bool", + } + arrow_type = feature_list_dtype_to_expected_historical_feature_arrow_type[ + feature_dtype + ] + if feature_is_list: + assert ( + str(historical_features_arrow.schema.field_by_name("value").type) + == f"struct> not null>" + ) + else: + assert ( + 
str(historical_features_arrow.schema.field_by_name("value").type) + == arrow_type + ) From ef7200a0d6c3902199999cb6a428bf9bcd07a45a Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Wed, 1 Sep 2021 18:41:42 +0000 Subject: [PATCH 02/19] GitBook: [master] 62 pages modified --- docs/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/README.md b/docs/README.md index 088adb6696..4cf62940a5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -49,8 +49,9 @@ The best way to learn Feast is to use it. Head over to our [Quickstart](getting- Explore the following resources to get started with Feast: * [Quickstart](getting-started/quickstart.md) is the fastest way to get started with Feast -* [Getting started ](how-to-guides/feast-gcp-aws/)provides a step-by-step guide to using Feast. -* [Concepts](getting-started/concepts/data-model-and-concepts/#concepts) describes all important Feast API concepts. +* [Concepts](getting-started/concepts/) describes all important Feast API concepts and Feast's overall architecture. +* [Tutorials](tutorials/tutorials-overview.md) shows full examples of using Feast in machine learning applications. +* [Running Feast with GCP/AWS](how-to-guides/feast-gcp-aws/) provides a more in-depth guide to using Feast. * [Reference](reference/feast-cli-commands.md) contains detailed API and design documents. * [Contributing](project/contributing.md) contains resources for anyone who wants to contribute to Feast. From 66cf6a407264d220c8aabb8a2cb4ed386e8ec4ee Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Wed, 1 Sep 2021 14:51:18 -0700 Subject: [PATCH 03/19] Refactor Environment class and DataSourceCreator API, and use fixtures for datasets and data sources (#1790) * Fix API cruft from DataSourceCreator Signed-off-by: Achal Shah * Remove the need for get_prefixed_table_name Signed-off-by: Achal Shah * major refactor Signed-off-by: Achal Shah * move start time Signed-off-by: Achal Shah * Remove one dimension of variation to be added in later Signed-off-by: Achal Shah * Fix default Signed-off-by: Achal Shah * Fixups Signed-off-by: Achal Shah * Fixups Signed-off-by: Achal Shah * Fix up tests Signed-off-by: Achal Shah * Add retries to execute_redshift_statement_async Signed-off-by: Achal Shah * Add retries to execute_redshift_statement_async Signed-off-by: Achal Shah * refactoooor Signed-off-by: Achal Shah * remove retries Signed-off-by: Achal Shah * Remove provider variation since they don't really play a big role Signed-off-by: Achal Shah * Session scoped cache for test datasets and skipping older tests whose functionality is present in other universal tests Signed-off-by: Achal Shah * make format Signed-off-by: Achal Shah * make format Signed-off-by: Achal Shah * remove import Signed-off-by: Achal Shah * fix merge Signed-off-by: Achal Shah * Use an enum for the stopping procedure instead of the bools Signed-off-by: Achal Shah * Fix refs Signed-off-by: Achal Shah * fix step Signed-off-by: Achal Shah * WIP fixes Signed-off-by: Achal Shah * Fix for feature inferencing Signed-off-by: Achal Shah * C901 '_python_value_to_proto_value' is too complex :( Signed-off-by: Achal Shah * Split out construct_test_repo and construct_universal_test_repo Signed-off-by: Achal Shah * remove import Signed-off-by: Achal Shah * add unsafe_hash Signed-off-by: Achal Shah * Update testrepoconfig Signed-off-by: Achal Shah * Update testrepoconfig Signed-off-by: Achal Shah * Remove kwargs from construct_universal_test_environment Signed-off-by: Achal Shah * Remove unneeded method 
Signed-off-by: Achal Shah * Docs Signed-off-by: Achal Shah * Kill skipped tests Signed-off-by: Achal Shah * reorder Signed-off-by: Achal Shah * add todo Signed-off-by: Achal Shah * Split universal vs non data_source_cache Signed-off-by: Achal Shah * make format Signed-off-by: Achal Shah * WIP fixtures Signed-off-by: Achal Shah * WIP Trying fixtures more effectively Signed-off-by: Achal Shah * fix refs Signed-off-by: Achal Shah * Fix refs Signed-off-by: Achal Shah * Fix refs Signed-off-by: Achal Shah * Fix refs Signed-off-by: Achal Shah * fix historical tests Signed-off-by: Achal Shah * renames Signed-off-by: Achal Shah * CR updates Signed-off-by: Achal Shah * use the actual ref to data source creators Signed-off-by: Achal Shah * format Signed-off-by: Achal Shah * unused imports' Signed-off-by: Achal Shah * Add ids for pytest params Signed-off-by: Achal Shah --- sdk/python/feast/feature_view.py | 24 +- .../feast/infra/offline_stores/bigquery.py | 1 + sdk/python/feast/infra/provider.py | 6 + sdk/python/feast/infra/utils/aws_utils.py | 2 +- sdk/python/feast/type_map.py | 11 +- sdk/python/tests/conftest.py | 51 ++ .../integration/e2e/test_universal_e2e.py | 18 +- .../feature_repos/repo_configuration.py | 192 ++++++ .../feature_repos/test_repo_configuration.py | 432 ------------- .../universal/data_source_creator.py | 23 +- .../universal/data_sources/bigquery.py | 44 +- .../universal/data_sources/file.py | 39 +- .../universal/data_sources/redshift.py | 19 +- .../feature_repos/universal/feature_views.py | 15 +- .../test_offline_online_store_consistency.py | 12 - .../test_historical_retrieval.py | 567 +----------------- .../offline_store/test_s3_custom_endpoint.py | 24 +- .../test_universal_historical_retrieval.py | 40 +- .../online_store/test_universal_online.py | 36 +- .../registration/test_universal_types.py | 268 ++++++--- 20 files changed, 629 insertions(+), 1195 deletions(-) create mode 100644 sdk/python/tests/integration/feature_repos/repo_configuration.py delete mode 100644 sdk/python/tests/integration/feature_repos/test_repo_configuration.py diff --git a/sdk/python/feast/feature_view.py b/sdk/python/feast/feature_view.py index a833c7a5e5..801d578e7d 100644 --- a/sdk/python/feast/feature_view.py +++ b/sdk/python/feast/feature_view.py @@ -325,6 +325,28 @@ def infer_features_from_batch_source(self, config: RepoConfig): self.batch_source.created_timestamp_column, } | set(self.entities) + if ( + self.batch_source.event_timestamp_column + in self.batch_source.field_mapping + ): + columns_to_exclude.add( + self.batch_source.field_mapping[ + self.batch_source.event_timestamp_column + ] + ) + if ( + self.batch_source.created_timestamp_column + in self.batch_source.field_mapping + ): + columns_to_exclude.add( + self.batch_source.field_mapping[ + self.batch_source.created_timestamp_column + ] + ) + for e in self.entities: + if e in self.batch_source.field_mapping: + columns_to_exclude.add(self.batch_source.field_mapping[e]) + for ( col_name, col_datatype, @@ -335,7 +357,7 @@ def infer_features_from_batch_source(self, config: RepoConfig): ): feature_name = ( self.batch_source.field_mapping[col_name] - if col_name in self.batch_source.field_mapping.keys() + if col_name in self.batch_source.field_mapping else col_name ) self.features.append( diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 3463e96898..2bfd863991 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py 
@@ -86,6 +86,7 @@ def pull_latest_from_table_or_query( ) WHERE _feast_row = 1 """ + return BigQueryRetrievalJob(query=query, client=client, config=config) @staticmethod diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 99414bd9d2..e5210b566f 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -242,6 +242,12 @@ def _get_column_names( reverse_field_mapping[col] if col in reverse_field_mapping.keys() else col for col in feature_names ] + + # We need to exclude join keys and timestamp columns from the list of features, after they are mapped to + # their final column names via the `field_mapping` field of the source. + _feature_names = set(feature_names) - set(join_keys) + _feature_names = _feature_names - {event_timestamp_column, created_timestamp_column} + feature_names = list(_feature_names) return ( join_keys, feature_names, diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index aea460cfb8..84fa611c15 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -82,7 +82,7 @@ class RedshiftStatementNotFinishedError(Exception): @retry( - wait=wait_exponential(multiplier=0.1, max=30), + wait=wait_exponential(multiplier=1, max=30), retry=retry_if_exception_type(RedshiftStatementNotFinishedError), ) def wait_for_redshift_statement(redshift_data_client, statement: dict) -> None: diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index af09069407..c65ec6e14c 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -13,11 +13,13 @@ # limitations under the License. import re +from datetime import datetime from typing import Any, Dict, Union import numpy as np import pandas as pd from google.protobuf.json_format import MessageToDict +from google.protobuf.timestamp_pb2 import Timestamp from feast.protos.feast.types.Value_pb2 import ( BoolList, @@ -104,6 +106,8 @@ def python_type_to_feast_value_type( "int8": ValueType.INT32, "bool": ValueType.BOOL, "timedelta": ValueType.UNIX_TIMESTAMP, + "Timestamp": ValueType.UNIX_TIMESTAMP, + "datetime": ValueType.UNIX_TIMESTAMP, "datetime64[ns]": ValueType.UNIX_TIMESTAMP, "datetime64[ns, tz]": ValueType.UNIX_TIMESTAMP, "category": ValueType.STRING, @@ -160,7 +164,8 @@ def _type_err(item, dtype): raise ValueError(f'Value "{item}" is of type {type(item)} not of type {dtype}') -def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: +# TODO(achals): Simplify this method and remove the noqa. 
+def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: # noqa: C901 """ Converts a Python (native, pandas) value to a Feast Proto Value based on a provided value type @@ -281,6 +286,10 @@ def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: elif feast_value_type == ValueType.INT64: return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.UNIX_TIMESTAMP: + if isinstance(value, datetime): + return ProtoValue(int64_val=int(value.timestamp())) + elif isinstance(value, Timestamp): + return ProtoValue(int64_val=int(value.ToSeconds())) return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.FLOAT: return ProtoValue(float_val=float(value)) diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 0c94f4d57a..55bdeb3a7d 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -18,6 +18,16 @@ import pandas as pd import pytest +from tests.data.data_creator import create_dataset +from tests.integration.feature_repos.repo_configuration import ( + FULL_REPO_CONFIGS, + Environment, + construct_test_environment, + construct_universal_data_sources, + construct_universal_datasets, + construct_universal_entities, +) + def pytest_configure(config): if platform in ["darwin", "windows"]: @@ -87,3 +97,44 @@ def simple_dataset_2() -> pd.DataFrame: ], } return pd.DataFrame.from_dict(data) + + +@pytest.fixture( + params=FULL_REPO_CONFIGS, scope="session", ids=[str(c) for c in FULL_REPO_CONFIGS] +) +def environment(request): + with construct_test_environment(request.param) as e: + yield e + + +@pytest.fixture(scope="session") +def universal_data_sources(environment): + entities = construct_universal_entities() + datasets = construct_universal_datasets( + entities, environment.start_date, environment.end_date + ) + datasources = construct_universal_data_sources( + datasets, environment.data_source_creator + ) + + yield entities, datasets, datasources + + environment.data_source_creator.teardown() + + +@pytest.fixture(scope="session") +def e2e_data_sources(environment: Environment): + df = create_dataset() + data_source = environment.data_source_creator.create_data_source( + df, environment.feature_store.project, field_mapping={"ts_1": "ts"}, + ) + + yield df, data_source + + environment.data_source_creator.teardown() + + +@pytest.fixture(params=FULL_REPO_CONFIGS, scope="session") +def type_test_environment(request): + with construct_test_environment(request.param) as e: + yield e diff --git a/sdk/python/tests/integration/e2e/test_universal_e2e.py b/sdk/python/tests/integration/e2e/test_universal_e2e.py index d0f000163f..e985e8ed10 100644 --- a/sdk/python/tests/integration/e2e/test_universal_e2e.py +++ b/sdk/python/tests/integration/e2e/test_universal_e2e.py @@ -3,23 +3,21 @@ from typing import Optional import pandas as pd +import pytest from pytz import utc from feast import FeatureStore, FeatureView -from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_e2e_test, -) from tests.integration.feature_repos.universal.entities import driver from tests.integration.feature_repos.universal.feature_views import driver_feature_view -@parametrize_e2e_test -def test_e2e_consistency(test_environment: Environment): - fs, fv = ( - test_environment.feature_store, - driver_feature_view(test_environment.data_source), - ) +@pytest.mark.integration +@pytest.mark.parametrize("infer_features", [True, False]) +def test_e2e_consistency(environment, e2e_data_sources, 
infer_features): + fs = environment.feature_store + df, data_source = e2e_data_sources + fv = driver_feature_view(data_source=data_source, infer_features=infer_features) + entity = driver() fs.apply([fv, entity]) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py new file mode 100644 index 0000000000..00c439ca89 --- /dev/null +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -0,0 +1,192 @@ +import tempfile +import uuid +from contextlib import contextmanager +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from pathlib import Path +from typing import Any, Dict, List, Optional, Type, Union + +import pandas as pd + +from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data +from feast.data_source import DataSource +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.bigquery import ( + BigQueryDataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.file import ( + FileDataSourceCreator, +) +from tests.integration.feature_repos.universal.data_sources.redshift import ( + RedshiftDataSourceCreator, +) +from tests.integration.feature_repos.universal.feature_views import ( + create_customer_daily_profile_feature_view, + create_driver_hourly_stats_feature_view, +) + + +@dataclass(frozen=True, repr=True) +class IntegrationTestRepoConfig: + """ + This class should hold all possible parameters that may need to be varied by individual tests. + """ + + provider: str = "local" + online_store: Union[str, Dict] = "sqlite" + + offline_store_creator: Type[DataSourceCreator] = FileDataSourceCreator + + full_feature_names: bool = True + infer_event_timestamp_col: bool = True + infer_features: bool = False + + +DYNAMO_CONFIG = {"type": "dynamodb", "region": "us-west-2"} +REDIS_CONFIG = {"type": "redis", "connection_string": "localhost:6379,db=0"} +FULL_REPO_CONFIGS: List[IntegrationTestRepoConfig] = [ + # Local configurations + IntegrationTestRepoConfig(), + IntegrationTestRepoConfig(online_store=REDIS_CONFIG), + # GCP configurations + IntegrationTestRepoConfig( + provider="gcp", + offline_store_creator=BigQueryDataSourceCreator, + online_store="datastore", + ), + IntegrationTestRepoConfig( + provider="gcp", + offline_store_creator=BigQueryDataSourceCreator, + online_store=REDIS_CONFIG, + ), + # AWS configurations + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=RedshiftDataSourceCreator, + online_store=DYNAMO_CONFIG, + ), + IntegrationTestRepoConfig( + provider="aws", + offline_store_creator=RedshiftDataSourceCreator, + online_store=REDIS_CONFIG, + ), +] + + +def construct_universal_entities() -> Dict[str, List[Any]]: + return {"customer": list(range(1001, 1110)), "driver": list(range(5001, 5110))} + + +def construct_universal_datasets( + entities: Dict[str, List[Any]], start_time: datetime, end_time: datetime +) -> Dict[str, pd.DataFrame]: + customer_df = driver_test_data.create_customer_daily_profile_df( + entities["customer"], start_time, end_time + ) + driver_df = driver_test_data.create_driver_hourly_stats_df( + entities["driver"], start_time, end_time + ) + orders_df = driver_test_data.create_orders_df( + customers=entities["customer"], + drivers=entities["driver"], + start_date=end_time - timedelta(days=365), + end_date=end_time + timedelta(days=365), + order_count=1000, + ) + + 
return {"customer": customer_df, "driver": driver_df, "orders": orders_df} + + +def construct_universal_data_sources( + datasets: Dict[str, pd.DataFrame], data_source_creator: DataSourceCreator +) -> Dict[str, DataSource]: + customer_ds = data_source_creator.create_data_source( + datasets["customer"], + destination_name="customer_profile", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", + ) + driver_ds = data_source_creator.create_data_source( + datasets["driver"], + destination_name="driver_hourly", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", + ) + orders_ds = data_source_creator.create_data_source( + datasets["orders"], + destination_name="orders", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", + ) + return {"customer": customer_ds, "driver": driver_ds, "orders": orders_ds} + + +def construct_universal_feature_views( + data_sources: Dict[str, DataSource], +) -> Dict[str, FeatureView]: + return { + "customer": create_customer_daily_profile_feature_view( + data_sources["customer"] + ), + "driver": create_driver_hourly_stats_feature_view(data_sources["driver"]), + } + + +@dataclass +class Environment: + name: str + test_repo_config: IntegrationTestRepoConfig + feature_store: FeatureStore + data_source_creator: DataSourceCreator + + end_date: datetime = field( + default=datetime.now().replace(microsecond=0, second=0, minute=0) + ) + + def __post_init__(self): + self.start_date: datetime = self.end_date - timedelta(days=7) + + +def table_name_from_data_source(ds: DataSource) -> Optional[str]: + if hasattr(ds, "table_ref"): + return ds.table_ref + elif hasattr(ds, "table"): + return ds.table + return None + + +@contextmanager +def construct_test_environment( + test_repo_config: IntegrationTestRepoConfig, + test_suite_name: str = "integration_test", +) -> Environment: + project = f"{test_suite_name}_{str(uuid.uuid4()).replace('-', '')[:8]}" + + offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) + + offline_store_config = offline_creator.create_offline_store_config() + online_store = test_repo_config.online_store + + with tempfile.TemporaryDirectory() as repo_dir_name: + config = RepoConfig( + registry=str(Path(repo_dir_name) / "registry.db"), + project=project, + provider=test_repo_config.provider, + offline_store=offline_store_config, + online_store=online_store, + repo_path=repo_dir_name, + ) + fs = FeatureStore(config=config) + environment = Environment( + name=project, + test_repo_config=test_repo_config, + feature_store=fs, + data_source_creator=offline_creator, + ) + + try: + yield environment + finally: + fs.teardown() diff --git a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py b/sdk/python/tests/integration/feature_repos/test_repo_configuration.py deleted file mode 100644 index 062cf7c184..0000000000 --- a/sdk/python/tests/integration/feature_repos/test_repo_configuration.py +++ /dev/null @@ -1,432 +0,0 @@ -import tempfile -import uuid -from contextlib import contextmanager -from dataclasses import dataclass, replace -from datetime import datetime, timedelta -from pathlib import Path -from typing import Dict, List, Optional, Union - -import pytest - -from feast import FeatureStore, FeatureView, RepoConfig, driver_test_data, importer -from feast.data_source import DataSource -from feast.value_type import ValueType -from tests.data.data_creator import create_dataset -from 
tests.integration.feature_repos.universal.data_source_creator import ( - DataSourceCreator, -) -from tests.integration.feature_repos.universal.entities import customer, driver -from tests.integration.feature_repos.universal.feature_views import ( - create_customer_daily_profile_feature_view, - create_driver_hourly_stats_feature_view, -) - - -@dataclass(frozen=True, repr=True) -class TestRepoConfig: - """ - This class should hold all possible parameters that may need to be varied by individual tests. - """ - - provider: str = "local" - online_store: Union[str, Dict] = "sqlite" - - offline_store_creator: str = "tests.integration.feature_repos.universal.data_sources.file.FileDataSourceCreator" - - full_feature_names: bool = True - infer_event_timestamp_col: bool = True - - -def ds_creator_path(cls: str): - return f"tests.integration.feature_repos.universal.data_sources.{cls}" - - -DYNAMO_CONFIG = {"type": "dynamodb", "region": "us-west-2"} -REDIS_CONFIG = {"type": "redis", "connection_string": "localhost:6379,db=0"} -FULL_REPO_CONFIGS: List[TestRepoConfig] = [ - # Local configurations - TestRepoConfig(), - TestRepoConfig(online_store=REDIS_CONFIG), - # GCP configurations - TestRepoConfig( - provider="gcp", - offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"), - online_store="datastore", - ), - TestRepoConfig( - provider="gcp", - offline_store_creator=ds_creator_path("bigquery.BigQueryDataSourceCreator"), - online_store=REDIS_CONFIG, - ), - # AWS configurations - TestRepoConfig( - provider="aws", - offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"), - online_store=DYNAMO_CONFIG, - ), - TestRepoConfig( - provider="aws", - offline_store_creator=ds_creator_path("redshift.RedshiftDataSourceCreator"), - online_store=REDIS_CONFIG, - ), -] - -OFFLINE_STORES: List[str] = [] -ONLINE_STORES: List[str] = [] -PROVIDERS: List[str] = [] - - -@dataclass -class Environment: - name: str - test_repo_config: TestRepoConfig - feature_store: FeatureStore - data_source: DataSource - data_source_creator: DataSourceCreator - entity_type: ValueType - feature_dtype: str - feature_is_list: bool - - end_date = datetime.now().replace(microsecond=0, second=0, minute=0) - start_date = end_date - timedelta(days=7) - before_start_date = end_date - timedelta(days=365) - after_end_date = end_date + timedelta(days=365) - - customer_entities = list(range(1001, 1110)) - customer_df = driver_test_data.create_customer_daily_profile_df( - customer_entities, start_date, end_date - ) - _customer_feature_view: Optional[FeatureView] = None - - driver_entities = list(range(5001, 5110)) - driver_df = driver_test_data.create_driver_hourly_stats_df( - driver_entities, start_date, end_date - ) - _driver_stats_feature_view: Optional[FeatureView] = None - - orders_df = driver_test_data.create_orders_df( - customers=customer_entities, - drivers=driver_entities, - start_date=before_start_date, - end_date=after_end_date, - order_count=1000, - ) - _orders_table: Optional[str] = None - - def customer_feature_view(self) -> FeatureView: - if self._customer_feature_view is None: - customer_table_id = self.data_source_creator.get_prefixed_table_name( - self.name, "customer_profile" - ) - ds = self.data_source_creator.create_data_source( - customer_table_id, - self.customer_df, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - self._customer_feature_view = create_customer_daily_profile_feature_view(ds) - return self._customer_feature_view - - def 
driver_stats_feature_view(self) -> FeatureView: - if self._driver_stats_feature_view is None: - driver_table_id = self.data_source_creator.get_prefixed_table_name( - self.name, "driver_hourly" - ) - ds = self.data_source_creator.create_data_source( - driver_table_id, - self.driver_df, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - self._driver_stats_feature_view = create_driver_hourly_stats_feature_view( - ds - ) - return self._driver_stats_feature_view - - def orders_table(self) -> Optional[str]: - if self._orders_table is None: - orders_table_id = self.data_source_creator.get_prefixed_table_name( - self.name, "orders" - ) - ds = self.data_source_creator.create_data_source( - orders_table_id, - self.orders_df, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - if hasattr(ds, "table_ref"): - self._orders_table = ds.table_ref - elif hasattr(ds, "table"): - self._orders_table = ds.table - return self._orders_table - - -def vary_full_feature_names(configs: List[TestRepoConfig]) -> List[TestRepoConfig]: - new_configs = [] - for c in configs: - true_c = replace(c, full_feature_names=True) - false_c = replace(c, full_feature_names=False) - new_configs.extend([true_c, false_c]) - return new_configs - - -def vary_infer_event_timestamp_col( - configs: List[TestRepoConfig], -) -> List[TestRepoConfig]: - new_configs = [] - for c in configs: - true_c = replace(c, infer_event_timestamp_col=True) - false_c = replace(c, infer_event_timestamp_col=False) - new_configs.extend([true_c, false_c]) - return new_configs - - -def vary_providers_for_offline_stores( - configs: List[TestRepoConfig], -) -> List[TestRepoConfig]: - new_configs = [] - for c in configs: - if "FileDataSourceCreator" in c.offline_store_creator: - new_configs.append(c) - elif "RedshiftDataSourceCreator" in c.offline_store_creator: - for p in ["local", "aws"]: - new_configs.append(replace(c, provider=p)) - elif "BigQueryDataSourceCreator" in c.offline_store_creator: - for p in ["local", "gcp"]: - new_configs.append(replace(c, provider=p)) - return new_configs - - -@contextmanager -def construct_test_environment( - test_repo_config: TestRepoConfig, - create_and_apply: bool = False, - materialize: bool = False, - entity_type: ValueType = ValueType.INT32, - feature_dtype: str = None, - feature_is_list: bool = False, -) -> Environment: - """ - This method should take in the parameters from the test repo config and created a feature repo, apply it, - and return the constructed feature store object to callers. - - This feature store object can be interacted for the purposes of tests. - The user is *not* expected to perform any clean up actions. - - :param test_repo_config: configuration - :param create_and_apply: whether to create and apply the repo config - :param materialize: whether to materialize features to online store - :param entity_type: the data type for the entity column (i.e. id) - :param feature_dtype: the data type for the feature column (i.e. value) - :param feature_is_list: whether the feature column (i.e. value) should be a list feature - :return: A feature store built using the supplied configuration. 
- """ - df = create_dataset(entity_type, feature_dtype, feature_is_list) - - project = f"test_correctness_{str(uuid.uuid4()).replace('-', '')[:8]}" - - module_name, config_class_name = test_repo_config.offline_store_creator.rsplit( - ".", 1 - ) - - offline_creator: DataSourceCreator = importer.get_class_from_type( - module_name, config_class_name, "DataSourceCreator" - )(project) - ds = offline_creator.create_data_source(project, df, field_mapping={"ts_1": "ts"}) - offline_store = offline_creator.create_offline_store_config() - online_store = test_repo_config.online_store - - with tempfile.TemporaryDirectory() as repo_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=project, - provider=test_repo_config.provider, - offline_store=offline_store, - online_store=online_store, - repo_path=repo_dir_name, - ) - fs = FeatureStore(config=config) - environment = Environment( - name=project, - test_repo_config=test_repo_config, - feature_store=fs, - data_source=ds, - data_source_creator=offline_creator, - entity_type=entity_type, - feature_dtype=feature_dtype, - feature_is_list=feature_is_list, - ) - - fvs = [] - entities = [] - try: - if create_and_apply: - entities.extend([driver(), customer()]) - fvs.extend( - [ - environment.driver_stats_feature_view(), - environment.customer_feature_view(), - ] - ) - fs.apply(fvs + entities) - - if materialize: - fs.materialize(environment.start_date, environment.end_date) - - yield environment - finally: - offline_creator.teardown() - fs.teardown() - - -def parametrize_e2e_test(e2e_test): - """ - This decorator should be used for end-to-end tests. These tests are expected to be parameterized, - and receive an empty feature repo created for all supported configurations. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. - - Decorated tests should create and apply the objects needed by the tests, and perform any operations needed - (such as materialization and looking up feature values). - - The decorator takes care of tearing down the feature store, as well as the sample data. - """ - - @pytest.mark.integration - @pytest.mark.parametrize("config", FULL_REPO_CONFIGS, ids=lambda v: str(v)) - def inner_test(config): - with construct_test_environment(config) as environment: - e2e_test(environment) - - return inner_test - - -def parametrize_offline_retrieval_test(offline_retrieval_test): - """ - This decorator should be used by tests that rely on the offline store. These tests are expected to be parameterized, - and receive an Environment object that contains a reference to a Feature Store with pre-applied - entities and feature views. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. - - Decorated tests should interact with the offline store, via the FeatureStore.get_historical_features method. They - may perform more operations as needed. - - The decorator takes care of tearing down the feature store, as well as the sample data. 
- """ - - configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS) - configs = vary_full_feature_names(configs) - configs = vary_infer_event_timestamp_col(configs) - - @pytest.mark.integration - @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) - def inner_test(config): - with construct_test_environment(config, create_and_apply=True) as environment: - offline_retrieval_test(environment) - - return inner_test - - -def parametrize_online_test(online_test): - """ - This decorator should be used by tests that rely on the offline store. These tests are expected to be parameterized, - and receive an Environment object that contains a reference to a Feature Store with pre-applied - entities and feature views. - - The decorator also ensures that sample data needed for the test is available in the relevant offline store. This - data is also materialized into the online store. - - The decorator takes care of tearing down the feature store, as well as the sample data. - """ - - configs = vary_providers_for_offline_stores(FULL_REPO_CONFIGS) - configs = vary_full_feature_names(configs) - configs = vary_infer_event_timestamp_col(configs) - - @pytest.mark.integration - @pytest.mark.parametrize("config", configs, ids=lambda v: str(v)) - def inner_test(config): - with construct_test_environment( - config, create_and_apply=True, materialize=True - ) as environment: - online_test(environment) - - return inner_test - - -def parametrize_types_no_materialize_test(types_test): - """ - This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and - not materialize said features - """ - return _parametrize_types_test_internal(types_test, create_apply_materialize=False) - - -def parametrize_types_materialize_test(types_test): - """ - This decorator should be used by tests that want to parametrize by different kinds of entity + feature types and - materialize said features - """ - return _parametrize_types_test_internal(types_test, create_apply_materialize=True) - - -def parametrize_types_no_materialize_test_no_list(types_test): - """ - This decorator should be used by tests that want to parametrize by different kinds of entity + feature types, but - not materializing and not allowing for feature list types - """ - return _parametrize_types_test_internal( - types_test, create_apply_materialize=False, vary_feature_is_list=False - ) - - -def _parametrize_types_test_internal( - types_test, create_apply_materialize: bool, vary_feature_is_list: bool = True -): - def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str): - return f"entity_type:{str(entity_type)}-feature_dtype:{feature_dtype}" - - # TODO(adchia): consider adding timestamp / bytes for feature_dtypes - # TODO(adchia): test materializing float entity types and ensure we throw an error before querying BQ - entity_type_feature_dtypes = [ - (ValueType.INT32, "int32"), - (ValueType.INT64, "int64"), - (ValueType.STRING, "float"), - (ValueType.STRING, "bool"), - ] - - # TODO(adchia): fix conversion to allow for lists in materialization - feature_is_list = [True, False] if vary_feature_is_list else [False] - - @pytest.mark.integration - @pytest.mark.parametrize( - "entity_type,feature_dtype", - entity_type_feature_dtypes, - ids=[ - entity_feature_types_ids(entity_type, feature_dtype) - for entity_type, feature_dtype in entity_type_feature_dtypes - ], - ) - @pytest.mark.parametrize( - "feature_is_list", feature_is_list, ids=lambda v: f"feature_is_list:{str(v)}" - ) - def 
inner_test(entity_type: ValueType, feature_dtype: str, feature_is_list: bool): - # TODO: parametrize config - with construct_test_environment( - TestRepoConfig( - provider="gcp", - offline_store_creator=ds_creator_path( - "bigquery.BigQueryDataSourceCreator" - ), - online_store="datastore", - ), - create_and_apply=create_apply_materialize, - materialize=create_apply_materialize, - entity_type=entity_type, - feature_dtype=feature_dtype, - feature_is_list=feature_is_list, - ) as environment: - types_test(environment) - - return inner_test diff --git a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py index 42667a983d..e0d6983bf1 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_source_creator.py @@ -11,12 +11,29 @@ class DataSourceCreator(ABC): @abstractmethod def create_data_source( self, - destination: str, df: pd.DataFrame, + destination_name: str, event_timestamp_column="ts", created_timestamp_column="created_ts", field_mapping: Dict[str, str] = None, ) -> DataSource: + """ + Create a data source based on the dataframe. Implementing this method requires the underlying implementation to + persist the dataframe in offline store, using the destination string as a way to differentiate multiple + dataframes and data sources. + + Args: + df: The dataframe to be used to create the data source. + destination_name: This str is used by the implementing classes to + isolate the multiple dataframes from each other. + event_timestamp_column: Pass through for the underlying data source. + created_timestamp_column: Pass through for the underlying data source. + field_mapping: Pass through for the underlying data source. + + Returns: + A Data source object, pointing to a table or file that is uploaded/persisted for the purpose of the + test. + """ ... @abstractmethod @@ -26,7 +43,3 @@ def create_offline_store_config(self) -> FeastConfigBaseModel: @abstractmethod def teardown(self): ... - - @abstractmethod - def get_prefixed_table_name(self, name: str, suffix: str) -> str: - ... 
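The refactored DataSourceCreator contract documented above is what the universal test fixtures in this commit build on: a test hands a concrete creator a dataframe plus a destination_name, gets back a DataSource pointing at wherever the creator persisted it, and calls teardown() afterwards. A minimal usage sketch follows, assuming the FileDataSourceCreator and feature-view helpers introduced elsewhere in this patch; the project name, entity ids, and date range below are illustrative only, not values used by the tests themselves:

    from datetime import datetime, timedelta

    from feast import driver_test_data
    from tests.integration.feature_repos.universal.data_sources.file import (
        FileDataSourceCreator,
    )
    from tests.integration.feature_repos.universal.feature_views import (
        create_driver_hourly_stats_feature_view,
    )

    end = datetime.now().replace(microsecond=0, second=0, minute=0)
    driver_df = driver_test_data.create_driver_hourly_stats_df(
        list(range(5001, 5010)), end - timedelta(days=7), end
    )

    creator = FileDataSourceCreator("demo_project")  # any concrete DataSourceCreator
    source = creator.create_data_source(
        driver_df,
        destination_name="driver_hourly",  # isolates this dataset from other tests' data
        event_timestamp_column="event_timestamp",
        created_timestamp_column="created",
    )
    fv = create_driver_hourly_stats_feature_view(source)

    # ... apply fv and run assertions against the feature store here ...

    creator.teardown()  # removes whatever create_data_source persisted

Because isolation is now driven by destination_name rather than a fully qualified table reference, each creator maps that name onto its own namespace (a BigQuery table, a temporary parquet file, or a Redshift table), as the concrete creators in the following diffs do.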
diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py index 07e65f9660..9b702ebe6c 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py @@ -1,7 +1,8 @@ -from typing import Dict +from typing import Dict, Optional import pandas as pd from google.cloud import bigquery +from google.cloud.bigquery import Dataset from feast import BigQuerySource from feast.data_source import DataSource @@ -12,21 +13,26 @@ class BigQueryDataSourceCreator(DataSourceCreator): + dataset: Optional[Dataset] = None + def __init__(self, project_name: str): self.client = bigquery.Client() self.project_name = project_name self.gcp_project = self.client.project self.dataset_id = f"{self.gcp_project}.{project_name}" - self.dataset = bigquery.Dataset(self.dataset_id) - print(f"Creating dataset: {self.dataset_id}") - self.client.create_dataset(self.dataset, exists_ok=True) - self.dataset.default_table_expiration_ms = ( - 1000 * 60 * 60 * 24 * 14 - ) # 2 weeks in milliseconds - self.client.update_dataset(self.dataset, ["default_table_expiration_ms"]) self.tables = [] + def create_dataset(self): + if not self.dataset: + self.dataset = bigquery.Dataset(self.dataset_id) + print(f"Creating dataset: {self.dataset_id}") + self.client.create_dataset(self.dataset, exists_ok=True) + self.dataset.default_table_expiration_ms = ( + 1000 * 60 * 60 * 24 * 14 + ) # 2 weeks in milliseconds + self.client.update_dataset(self.dataset, ["default_table_expiration_ms"]) + def teardown(self): for table in self.tables: @@ -42,32 +48,38 @@ def create_offline_store_config(self): def create_data_source( self, - destination: str, df: pd.DataFrame, + destination_name: Optional[str] = None, event_timestamp_column="ts", created_timestamp_column="created_ts", field_mapping: Dict[str, str] = None, **kwargs, ) -> DataSource: + destination_name = self.get_prefixed_table_name(destination_name) + + self.create_dataset() + job_config = bigquery.LoadJobConfig() - if self.gcp_project not in destination: - destination = f"{self.gcp_project}.{self.project_name}.{destination}" + if self.gcp_project not in destination_name: + destination_name = ( + f"{self.gcp_project}.{self.project_name}.{destination_name}" + ) job = self.client.load_table_from_dataframe( - df, destination, job_config=job_config + df, destination_name, job_config=job_config ) job.result() - self.tables.append(destination) + self.tables.append(destination_name) return BigQuerySource( - table_ref=destination, + table_ref=destination_name, event_timestamp_column=event_timestamp_column, created_timestamp_column=created_timestamp_column, date_partition_column="", field_mapping=field_mapping or {"ts_1": "ts"}, ) - def get_prefixed_table_name(self, name: str, suffix: str) -> str: - return f"{self.client.project}.{name}.{suffix}" + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.client.project}.{self.project_name}.{suffix}" diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py index 93048fa3a1..0d402b2314 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/file.py @@ -1,5 +1,5 @@ import tempfile -from typing import Any, Dict +from typing 
import Any, Dict, List, Optional import pandas as pd from minio import Minio @@ -17,38 +17,48 @@ class FileDataSourceCreator(DataSourceCreator): - f: Any + files: List[Any] - def __init__(self, _: str): - pass + def __init__(self, project_name: str): + self.project_name = project_name + self.files = [] def create_data_source( self, - destination: str, df: pd.DataFrame, + destination_name: str, event_timestamp_column="ts", created_timestamp_column="created_ts", field_mapping: Dict[str, str] = None, ) -> DataSource: - self.f = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) - df.to_parquet(self.f.name) + + destination_name = self.get_prefixed_table_name(destination_name) + + f = tempfile.NamedTemporaryFile( + prefix=f"{self.project_name}_{destination_name}", + suffix=".parquet", + delete=False, + ) + df.to_parquet(f.name) + self.files.append(f) return FileSource( file_format=ParquetFormat(), - path=f"file://{self.f.name}", + path=f"{f.name}", event_timestamp_column=event_timestamp_column, created_timestamp_column=created_timestamp_column, date_partition_column="", field_mapping=field_mapping or {"ts_1": "ts"}, ) - def get_prefixed_table_name(self, name: str, suffix: str) -> str: - return f"{name}.{suffix}" + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}.{suffix}" def create_offline_store_config(self) -> FeastConfigBaseModel: return FileOfflineStoreConfig() def teardown(self): - self.f.close() + for f in self.files: + f.close() class S3FileDataSourceCreator(DataSourceCreator): @@ -93,13 +103,14 @@ def _upload_parquet_file(self, df, file_name, minio_endpoint): def create_data_source( self, - destination: str, df: pd.DataFrame, + destination_name: Optional[str] = None, + suffix: Optional[str] = None, event_timestamp_column="ts", created_timestamp_column="created_ts", field_mapping: Dict[str, str] = None, ) -> DataSource: - filename = f"{destination}.parquet" + filename = f"{destination_name}.parquet" port = self.minio.get_exposed_port("9000") host = self.minio.get_container_host_ip() minio_endpoint = f"{host}:{port}" @@ -116,7 +127,7 @@ def create_data_source( s3_endpoint_override=f"http://{host}:{port}", ) - def get_prefixed_table_name(self, name: str, suffix: str) -> str: + def get_prefixed_table_name(self, suffix: str) -> str: return f"{suffix}" def create_offline_store_config(self) -> FeastConfigBaseModel: diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py index 997bedaf26..88780f07a0 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/redshift.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Optional import pandas as pd @@ -33,29 +33,32 @@ def __init__(self, project_name: str): def create_data_source( self, - destination: str, df: pd.DataFrame, + destination_name: str, + suffix: Optional[str] = None, event_timestamp_column="ts", created_timestamp_column="created_ts", field_mapping: Dict[str, str] = None, ) -> DataSource: + destination_name = self.get_prefixed_table_name(destination_name) + aws_utils.upload_df_to_redshift( self.client, self.offline_store_config.cluster_id, self.offline_store_config.database, self.offline_store_config.user, self.s3, - f"{self.offline_store_config.s3_staging_location}/copy/{destination}.parquet", + 
f"{self.offline_store_config.s3_staging_location}/copy/{destination_name}.parquet", self.offline_store_config.iam_role, - destination, + destination_name, df, ) - self.tables.append(destination) + self.tables.append(destination_name) return RedshiftSource( - table=destination, + table=destination_name, event_timestamp_column=event_timestamp_column, created_timestamp_column=created_timestamp_column, date_partition_column="", @@ -65,8 +68,8 @@ def create_data_source( def create_offline_store_config(self) -> FeastConfigBaseModel: return self.offline_store_config - def get_prefixed_table_name(self, name: str, suffix: str) -> str: - return f"{name}_{suffix}" + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" def teardown(self): for table in self.tables: diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index dace1ab502..d03b89f0e0 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -7,22 +7,25 @@ def driver_feature_view( data_source: DataSource, name="test_correctness", + infer_features: bool = False, value_type: ValueType = ValueType.FLOAT, ) -> FeatureView: return FeatureView( name=name, entities=["driver"], - features=[Feature("value", value_type)], + features=None if infer_features else [Feature("value", value_type)], ttl=timedelta(days=5), input=data_source, ) -def create_driver_hourly_stats_feature_view(source): +def create_driver_hourly_stats_feature_view(source, infer_features: bool = True): driver_stats_feature_view = FeatureView( name="driver_stats", entities=["driver"], - features=[ + features=None + if infer_features + else [ Feature(name="conv_rate", dtype=ValueType.FLOAT), Feature(name="acc_rate", dtype=ValueType.FLOAT), Feature(name="avg_daily_trips", dtype=ValueType.INT32), @@ -33,11 +36,13 @@ def create_driver_hourly_stats_feature_view(source): return driver_stats_feature_view -def create_customer_daily_profile_feature_view(source): +def create_customer_daily_profile_feature_view(source, infer_features: bool = False): customer_profile_feature_view = FeatureView( name="customer_profile", entities=["customer_id"], - features=[ + features=None + if infer_features + else [ Feature(name="current_balance", dtype=ValueType.FLOAT), Feature(name="avg_passenger_count", dtype=ValueType.FLOAT), Feature(name="lifetime_trip_count", dtype=ValueType.INT32), diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py index 2c48f0fbe6..2efe343218 100644 --- a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py +++ b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py @@ -382,18 +382,6 @@ def run_offline_online_store_consistency_test( ) -@pytest.mark.integration -@pytest.mark.parametrize( - "bq_source_type", ["query", "table"], -) -@pytest.mark.parametrize("full_feature_names", [True, False]) -def test_bq_offline_online_store_consistency( - bq_source_type: str, full_feature_names: bool -): - with prep_bq_fs_and_fv(bq_source_type) as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) - - @pytest.mark.parametrize("full_feature_names", [True, False]) @pytest.mark.integration def 
test_redis_offline_online_store_consistency(full_feature_names: bool): diff --git a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py index 5d735fcd9d..44f9e595e3 100644 --- a/sdk/python/tests/integration/offline_store/test_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_historical_retrieval.py @@ -5,7 +5,6 @@ from datetime import datetime, timedelta from tempfile import TemporaryDirectory -import assertpy import numpy as np import pandas as pd import pytest @@ -14,15 +13,7 @@ from pytz import utc import feast.driver_test_data as driver_data -from feast import ( - BigQuerySource, - FeatureService, - FileSource, - RedshiftSource, - RepoConfig, - errors, - utils, -) +from feast import BigQuerySource, FeatureService, FileSource, RepoConfig, utils from feast.entity import Entity from feast.errors import FeatureNameCollisionError from feast.feature import Feature @@ -32,10 +23,7 @@ from feast.infra.offline_stores.offline_utils import ( DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, ) -from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig -from feast.infra.online_stores.dynamodb import DynamoDBOnlineStoreConfig from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig -from feast.infra.utils import aws_utils from feast.value_type import ValueType np.random.seed(0) @@ -394,559 +382,6 @@ def test_historical_features_from_parquet_sources( store.teardown() -@pytest.mark.integration -@pytest.mark.parametrize( - "provider_type", ["local", "gcp", "gcp_custom_offline_config"], -) -@pytest.mark.parametrize( - "infer_event_timestamp_col", [False, True], -) -@pytest.mark.parametrize( - "full_feature_names", [False, True], -) -def test_historical_features_from_bigquery_sources( - provider_type, infer_event_timestamp_col, capsys, full_feature_names -): - start_date = datetime.now().replace(microsecond=0, second=0, minute=0) - ( - customer_entities, - driver_entities, - end_date, - orders_df, - start_date, - ) = generate_entities(start_date, infer_event_timestamp_col) - - bigquery_dataset = ( - f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}" - ) - - with BigQueryDataSet(bigquery_dataset), TemporaryDirectory() as temp_dir: - gcp_project = bigquery.Client().project - - # Orders Query - table_id = f"{bigquery_dataset}.orders" - stage_orders_bigquery(orders_df, table_id) - entity_df_query = f"SELECT * FROM {gcp_project}.{table_id}" - - # Driver Feature View - driver_df = driver_data.create_driver_hourly_stats_df( - driver_entities, start_date, end_date - ) - driver_table_id = f"{gcp_project}.{bigquery_dataset}.driver_hourly" - stage_driver_hourly_stats_bigquery_source(driver_df, driver_table_id) - driver_source = BigQuerySource( - table_ref=driver_table_id, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - driver_fv = create_driver_hourly_stats_feature_view(driver_source) - - # Customer Feature View - customer_df = driver_data.create_customer_daily_profile_df( - customer_entities, start_date, end_date - ) - customer_table_id = f"{gcp_project}.{bigquery_dataset}.customer_profile" - - stage_customer_daily_profile_bigquery_source(customer_df, customer_table_id) - customer_source = BigQuerySource( - table_ref=customer_table_id, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - customer_fv = create_customer_daily_profile_feature_view(customer_source) - - driver = 
Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64) - customer = Entity(name="customer_id", value_type=ValueType.INT64) - - if provider_type == "local": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="default", - provider="local", - online_store=SqliteOnlineStoreConfig( - path=os.path.join(temp_dir, "online_store.db"), - ), - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset=bigquery_dataset - ), - ) - ) - elif provider_type == "gcp": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="gcp", - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset=bigquery_dataset - ), - ) - ) - elif provider_type == "gcp_custom_offline_config": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="gcp", - offline_store=BigQueryOfflineStoreConfig( - type="bigquery", dataset="foo" - ), - ) - ) - else: - raise Exception("Invalid provider used as part of test configuration") - - store.apply([driver, customer, driver_fv, customer_fv]) - - try: - event_timestamp = ( - DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns - else "e_ts" - ) - expected_df = get_expected_training_df( - customer_df, - customer_fv, - driver_df, - driver_fv, - orders_df, - event_timestamp, - full_feature_names, - ) - - job_from_sql = store.get_historical_features( - entity_df=entity_df_query, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - start_time = datetime.utcnow() - actual_df_from_sql_entities = job_from_sql.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_sql_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_sql_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_sql_entities = job_from_sql.to_arrow() - assert_frame_equal( - actual_df_from_sql_entities, table_from_sql_entities.to_pandas() - ) - - timestamp_column = ( - "e_ts" - if infer_event_timestamp_col - else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - ) - - entity_df_query_with_invalid_join_key = ( - f"select order_id, driver_id, customer_id as customer, " - f"order_is_success, {timestamp_column}, FROM {gcp_project}.{table_id}" - ) - # Rename the join key; this should now raise an error. 
- assertpy.assert_that(store.get_historical_features).raises( - errors.FeastEntityDFMissingColumnsError - ).when_called_with( - entity_df=entity_df_query_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ) - - job_from_df = store.get_historical_features( - entity_df=orders_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - # Rename the join key; this should now raise an error. - orders_df_with_invalid_join_key = orders_df.rename( - {"customer_id": "customer"}, axis="columns" - ) - assertpy.assert_that(store.get_historical_features).raises( - errors.FeastEntityDFMissingColumnsError - ).when_called_with( - entity_df=orders_df_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ) - - # Make sure that custom dataset name is being used from the offline_store config - if provider_type == "gcp_custom_offline_config": - assertpy.assert_that(job_from_df.query).contains("foo.feast_entity_df") - else: - assertpy.assert_that(job_from_df.query).contains( - f"{bigquery_dataset}.feast_entity_df" - ) - - start_time = datetime.utcnow() - actual_df_from_df_entities = job_from_df.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_df_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_df_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_df_entities = job_from_df.to_arrow() - assert_frame_equal( - actual_df_from_df_entities, table_from_df_entities.to_pandas() - ) - finally: - store.teardown() - - -@pytest.mark.integration -@pytest.mark.parametrize( - "provider_type", ["local", "aws"], -) -@pytest.mark.parametrize( - "infer_event_timestamp_col", [False, True], -) -@pytest.mark.parametrize( - "full_feature_names", [False, True], -) -def test_historical_features_from_redshift_sources( - provider_type, infer_event_timestamp_col, capsys, full_feature_names -): - client = aws_utils.get_redshift_data_client("us-west-2") - s3 = aws_utils.get_s3_resource("us-west-2") - - offline_store = RedshiftOfflineStoreConfig( - cluster_id="feast-integration-tests", - region="us-west-2", - user="admin", - database="feast", - s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion", - iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role", - ) - - start_date = datetime.now().replace(microsecond=0, second=0, minute=0) - ( - customer_entities, - driver_entities, - end_date, - orders_df, - start_date, - ) = generate_entities(start_date, infer_event_timestamp_col) - - redshift_table_prefix = ( - f"test_hist_retrieval_{int(time.time_ns())}_{random.randint(1000, 9999)}" - ) - - # Stage orders_df to Redshift - table_name = 
f"{redshift_table_prefix}_orders" - entity_df_query = f"SELECT * FROM {table_name}" - orders_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{table_name}.parquet", - offline_store.iam_role, - table_name, - orders_df, - ) - - # Stage driver_df to Redshift - driver_df = driver_data.create_driver_hourly_stats_df( - driver_entities, start_date, end_date - ) - driver_table_name = f"{redshift_table_prefix}_driver_hourly" - driver_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{driver_table_name}.parquet", - offline_store.iam_role, - driver_table_name, - driver_df, - ) - - # Stage customer_df to Redshift - customer_df = driver_data.create_customer_daily_profile_df( - customer_entities, start_date, end_date - ) - customer_table_name = f"{redshift_table_prefix}_customer_profile" - customer_context = aws_utils.temporarily_upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{customer_table_name}.parquet", - offline_store.iam_role, - customer_table_name, - customer_df, - ) - - with orders_context, driver_context, customer_context, TemporaryDirectory() as temp_dir: - driver_source = RedshiftSource( - table=driver_table_name, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - driver_fv = create_driver_hourly_stats_feature_view(driver_source) - - customer_source = RedshiftSource( - table=customer_table_name, - event_timestamp_column="event_timestamp", - created_timestamp_column="created", - ) - customer_fv = create_customer_daily_profile_feature_view(customer_source) - - driver = Entity(name="driver", join_key="driver_id", value_type=ValueType.INT64) - customer = Entity(name="customer_id", value_type=ValueType.INT64) - - if provider_type == "local": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="default", - provider="local", - online_store=SqliteOnlineStoreConfig( - path=os.path.join(temp_dir, "online_store.db"), - ), - offline_store=offline_store, - ) - ) - elif provider_type == "aws": - store = FeatureStore( - config=RepoConfig( - registry=os.path.join(temp_dir, "registry.db"), - project="".join( - random.choices(string.ascii_uppercase + string.digits, k=10) - ), - provider="aws", - online_store=DynamoDBOnlineStoreConfig(region="us-west-2"), - offline_store=offline_store, - ) - ) - else: - raise Exception("Invalid provider used as part of test configuration") - - store.apply([driver, customer, driver_fv, customer_fv]) - - try: - event_timestamp = ( - DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - if DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL in orders_df.columns - else "e_ts" - ) - expected_df = get_expected_training_df( - customer_df, - customer_fv, - driver_df, - driver_fv, - orders_df, - event_timestamp, - full_feature_names, - ) - - job_from_sql = store.get_historical_features( - entity_df=entity_df_query, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - start_time = datetime.utcnow() - actual_df_from_sql_entities = 
job_from_sql.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"\nTime to execute job_from_sql.to_df() = '{(end_time - start_time)}'" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_sql_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_sql_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_sql_entities = job_from_sql.to_arrow() - assert_frame_equal( - actual_df_from_sql_entities.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - table_from_sql_entities.to_pandas() - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - ) - - timestamp_column = ( - "e_ts" - if infer_event_timestamp_col - else DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL - ) - - entity_df_query_with_invalid_join_key = ( - f"select order_id, driver_id, customer_id as customer, " - f"order_is_success, {timestamp_column} FROM {table_name}" - ) - # Rename the join key; this should now raise an error. - assertpy.assert_that( - store.get_historical_features( - entity_df=entity_df_query_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ).to_df - ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with() - - job_from_df = store.get_historical_features( - entity_df=orders_df, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - full_feature_names=full_feature_names, - ) - - # Rename the join key; this should now raise an error. 
- orders_df_with_invalid_join_key = orders_df.rename( - {"customer_id": "customer"}, axis="columns" - ) - assertpy.assert_that( - store.get_historical_features( - entity_df=orders_df_with_invalid_join_key, - features=[ - "driver_stats:conv_rate", - "driver_stats:avg_daily_trips", - "customer_profile:current_balance", - "customer_profile:avg_passenger_count", - "customer_profile:lifetime_trip_count", - ], - ).to_df - ).raises(errors.FeastEntityDFMissingColumnsError).when_called_with() - - start_time = datetime.utcnow() - actual_df_from_df_entities = job_from_df.to_df() - end_time = datetime.utcnow() - with capsys.disabled(): - print( - str( - f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n" - ) - ) - - assert sorted(expected_df.columns) == sorted( - actual_df_from_df_entities.columns - ) - assert_frame_equal( - expected_df.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - actual_df_from_df_entities[expected_df.columns] - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - check_dtype=False, - ) - - table_from_df_entities = job_from_df.to_arrow() - assert_frame_equal( - actual_df_from_df_entities.sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ).reset_index(drop=True), - table_from_df_entities.to_pandas() - .sort_values( - by=[event_timestamp, "order_id", "driver_id", "customer_id"] - ) - .reset_index(drop=True), - ) - finally: - store.teardown() - - def test_feature_name_collision_on_historical_retrieval(): # _validate_feature_refs is the function that checks for colliding feature names diff --git a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py index 207ebd9732..3eb3f4da6f 100644 --- a/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py +++ b/sdk/python/tests/integration/offline_store/test_s3_custom_endpoint.py @@ -1,9 +1,11 @@ import pytest -from tests.integration.feature_repos.test_repo_configuration import ( - TestRepoConfig, +from tests.integration.feature_repos.repo_configuration import ( + IntegrationTestRepoConfig, construct_test_environment, + construct_universal_feature_views, ) +from tests.integration.feature_repos.universal.entities import customer, driver # TODO: Allow integration tests to run using different credentials. @@ -12,8 +14,8 @@ @pytest.mark.skip( reason="No way to run this test today. 
Credentials conflict with real AWS credentials in CI" ) -def test_registration_and_retrieval_from_custom_s3_endpoint(): - config = TestRepoConfig( +def test_registration_and_retrieval_from_custom_s3_endpoint(universal_data_sources): + config = IntegrationTestRepoConfig( offline_store_creator="tests.integration.feature_repos.universal.data_sources.file.S3FileDataSourceCreator" ) import os @@ -27,10 +29,18 @@ def test_registration_and_retrieval_from_custom_s3_endpoint(): os.environ["AWS_ACCESS_KEY_ID"] = "AKIAIOSFODNN7EXAMPLE" os.environ["AWS_SECRET_ACCESS_KEY"] = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" - with construct_test_environment( - config, create_and_apply=True, materialize=True - ) as environment: + with construct_test_environment(config) as environment: fs = environment.feature_store + + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer()]) + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + out = fs.get_online_features( features=["driver_stats:conv_rate"], entity_rows=[{"driver": 5001}] ).to_dict() diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 7379c27a62..75cd5bbf70 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -3,6 +3,7 @@ import numpy as np import pandas as pd +import pytest from pandas.testing import assert_frame_equal from pytz import utc @@ -11,10 +12,11 @@ from feast.infra.offline_stores.offline_utils import ( DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL, ) -from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_offline_retrieval_test, +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, + table_name_from_data_source, ) +from tests.integration.feature_repos.universal.entities import customer, driver np.random.seed(0) @@ -135,24 +137,32 @@ def get_expected_training_df( return expected_df -@parametrize_offline_retrieval_test -def test_historical_features(environment: Environment): +@pytest.mark.integration +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_historical_features(environment, universal_data_sources, full_feature_names): store = environment.feature_store - customer_df, customer_fv = ( - environment.customer_df, - environment.customer_feature_view(), + (entities, datasets, data_sources) = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + customer_df, driver_df, orders_df = ( + datasets["customer"], + datasets["driver"], + datasets["orders"], ) - driver_df, driver_fv = ( - environment.driver_df, - environment.driver_stats_feature_view(), + customer_fv, driver_fv = ( + feature_views["customer"], + feature_views["driver"], ) - orders_df = environment.orders_df - full_feature_names = environment.test_repo_config.full_feature_names + + feast_objects = [] + feast_objects.extend([customer_fv, driver_fv, driver(), customer()]) + store.apply(feast_objects) entity_df_query = None - if environment.orders_table(): - entity_df_query = f"SELECT * FROM {environment.orders_table()}" + orders_table = 
table_name_from_data_source(data_sources["orders"]) + if orders_table: + entity_df_query = f"SELECT * FROM {orders_table}" event_timestamp = ( DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index a4337a305c..b3bcf688bd 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -2,26 +2,36 @@ import unittest import pandas as pd +import pytest -from tests.integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_online_test, +from tests.integration.feature_repos.repo_configuration import ( + construct_universal_feature_views, ) +from tests.integration.feature_repos.universal.entities import customer, driver -@parametrize_online_test -def test_online_retrieval(environment: Environment): - fs = environment.feature_store - full_feature_names = environment.test_repo_config.full_feature_names +@pytest.mark.integration +@pytest.mark.parametrize("full_feature_names", [True, False], ids=lambda v: str(v)) +def test_online_retrieval(environment, universal_data_sources, full_feature_names): - sample_drivers = random.sample(environment.driver_entities, 10) - drivers_df = environment.driver_df[ - environment.driver_df["driver_id"].isin(sample_drivers) + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + feature_views = construct_universal_feature_views(data_sources) + + feast_objects = [] + feast_objects.extend(feature_views.values()) + feast_objects.extend([driver(), customer()]) + fs.apply(feast_objects) + fs.materialize(environment.start_date, environment.end_date) + + sample_drivers = random.sample(entities["driver"], 10) + drivers_df = datasets["driver"][ + datasets["driver"]["driver_id"].isin(sample_drivers) ] - sample_customers = random.sample(environment.customer_entities, 10) - customers_df = environment.customer_df[ - environment.customer_df["customer_id"].isin(sample_customers) + sample_customers = random.sample(entities["customer"], 10) + customers_df = datasets["customer"][ + datasets["customer"]["customer_id"].isin(sample_customers) ] entity_rows = [ diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 99c1e91f23..5390b72cc0 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -1,113 +1,203 @@ from datetime import datetime, timedelta import pandas as pd -from data.data_creator import get_feature_values_for_dtype -from integration.feature_repos.test_repo_configuration import ( - Environment, - parametrize_types_no_materialize_test, - parametrize_types_no_materialize_test_no_list, -) -from integration.feature_repos.universal.entities import driver -from integration.feature_repos.universal.feature_views import driver_feature_view +import pytest from feast.infra.offline_stores.offline_store import RetrievalJob from feast.type_map import python_type_to_feast_value_type from feast.value_type import ValueType +from tests.data.data_creator import create_dataset, get_feature_values_for_dtype +from tests.integration.feature_repos.repo_configuration import ( + IntegrationTestRepoConfig, + construct_test_environment, +) +from tests.integration.feature_repos.universal.data_sources.bigquery import ( + 
BigQueryDataSourceCreator, +) +from tests.integration.feature_repos.universal.entities import driver +from tests.integration.feature_repos.universal.feature_views import driver_feature_view + + +def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str): + return f"entity_type:{str(entity_type)}-feature_dtype:{feature_dtype}" + + +entity_type_feature_dtypes = [ + (ValueType.INT32, "int32"), + (ValueType.INT64, "int64"), + (ValueType.STRING, "float"), + (ValueType.STRING, "bool"), +] +GCP_CONFIG = IntegrationTestRepoConfig( + provider="gcp", + offline_store_creator=BigQueryDataSourceCreator, + online_store="datastore", +) # TODO: change parametrization to allow for other providers aside from gcp -@parametrize_types_no_materialize_test -def test_entity_inference_types_match(environment: Environment): - feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) - # Don't specify value type in entity to force inference - entity = driver(value_type=ValueType.UNKNOWN) - fs.apply([fv, entity]) - - entities = fs.list_entities() - entity_type_to_expected_inferred_entity_type = { - ValueType.INT32: ValueType.INT64, - ValueType.INT64: ValueType.INT64, - ValueType.FLOAT: ValueType.DOUBLE, - ValueType.STRING: ValueType.STRING, - } - for entity in entities: - assert ( - entity.value_type - == entity_type_to_expected_inferred_entity_type[environment.entity_type] +@pytest.mark.integration +@pytest.mark.parametrize( + "entity_type,feature_dtype", + entity_type_feature_dtypes, + ids=[ + entity_feature_types_ids(entity_type, feature_dtype) + for entity_type, feature_dtype in entity_type_feature_dtypes + ], +) +@pytest.mark.parametrize( + "feature_is_list", [False], ids=lambda v: f"feature_is_list:{str(v)}" +) +def test_entity_inference_types_match(entity_type, feature_dtype, feature_is_list): + with construct_test_environment(GCP_CONFIG) as environment: + df = create_dataset(entity_type, feature_dtype, feature_is_list) + data_source = environment.data_source_creator.create_data_source( + df, + destination_name=environment.feature_store.project, + field_mapping={"ts_1": "ts"}, ) + fv = create_feature_view(feature_dtype, feature_is_list, data_source) + fs = environment.feature_store + try: + # Don't specify value type in entity to force inference + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) -@parametrize_types_no_materialize_test -def test_feature_get_historical_features_types_match(environment: Environment): - feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) - entity = driver() - fs.apply([fv, entity]) + entities = fs.list_entities() + entity_type_to_expected_inferred_entity_type = { + ValueType.INT32: ValueType.INT64, + ValueType.INT64: ValueType.INT64, + ValueType.FLOAT: ValueType.DOUBLE, + ValueType.STRING: ValueType.STRING, + } + for entity in entities: + assert ( + entity.value_type + == entity_type_to_expected_inferred_entity_type[entity_type] + ) + finally: + environment.data_source_creator.teardown() - features = [f"{fv.name}:value"] - df = pd.DataFrame() - df["driver_id"] = ( - ["1", "3"] if environment.entity_type == ValueType.STRING else [1, 3] - ) - now = datetime.utcnow() - ts = pd.Timestamp(now).round("ms") - df["ts"] = [ - ts - timedelta(hours=4), - ts - timedelta(hours=2), - ] - historical_features = fs.get_historical_features(entity_df=df, features=features,) - # TODO(adchia): pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs - historical_features_df = historical_features.to_df() - print(historical_features_df) - if feature_is_list: - assert_feature_list_types(feature_dtype, historical_features_df) - else: - assert_expected_historical_feature_types(feature_dtype, historical_features_df) - assert_expected_arrow_types(feature_dtype, feature_is_list, historical_features) +@pytest.mark.integration +@pytest.mark.parametrize( + "entity_type,feature_dtype", + entity_type_feature_dtypes, + ids=[ + entity_feature_types_ids(entity_type, feature_dtype) + for entity_type, feature_dtype in entity_type_feature_dtypes + ], +) +@pytest.mark.parametrize( + "feature_is_list", [True, False], ids=lambda v: f"feature_is_list:{str(v)}" +) +def test_feature_get_historical_features_types_match( + entity_type, feature_dtype, feature_is_list +): + with construct_test_environment(GCP_CONFIG) as environment: + df = create_dataset(entity_type, feature_dtype, feature_is_list) + data_source = environment.data_source_creator.create_data_source( + df, + destination_name=environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + fv = create_feature_view(feature_dtype, feature_is_list, data_source) + fs = environment.feature_store + entity = driver() + try: + fs.apply([fv, entity]) + features = [f"{fv.name}:value"] + df = pd.DataFrame() + df["driver_id"] = ["1", "3"] if entity_type == ValueType.STRING else [1, 3] + now = datetime.utcnow() + ts = pd.Timestamp(now).round("ms") + df["ts"] = [ + ts - timedelta(hours=4), + ts - timedelta(hours=2), + ] + historical_features = fs.get_historical_features( + entity_df=df, features=features, + ) -@parametrize_types_no_materialize_test_no_list -def test_feature_get_online_features_types_match(environment: Environment): - feature_dtype, feature_is_list, fs, fv = get_test_fixtures(environment) - if feature_is_list: - pass - - features = [fv.name + ":value"] - entity = driver(value_type=ValueType.UNKNOWN) - fs.apply([fv, entity]) - fs.materialize(environment.start_date, environment.end_date) - driver_id_value = "1" if environment.entity_type == ValueType.STRING else 1 - online_features = fs.get_online_features( - features=features, entity_rows=[{"driver": driver_id_value}], - ).to_dict() - - feature_list_dtype_to_expected_online_response_value_type = { - "int32": "int", - "int64": "int", - "float": "float", - "string": "str", - "bool": "bool", - } - assert ( - type(online_features["value"][0]).__name__ - == feature_list_dtype_to_expected_online_response_value_type[feature_dtype] - ) + # TODO(adchia): pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs + historical_features_df = historical_features.to_df() + print(historical_features_df) + if feature_is_list: + assert_feature_list_types(feature_dtype, historical_features_df) + else: + assert_expected_historical_feature_types( + feature_dtype, historical_features_df + ) + assert_expected_arrow_types( + feature_dtype, feature_is_list, historical_features + ) + finally: + environment.data_source_creator.teardown() + + +@pytest.mark.integration +@pytest.mark.parametrize( + "entity_type,feature_dtype", + entity_type_feature_dtypes, + ids=[ + entity_feature_types_ids(entity_type, feature_dtype) + for entity_type, feature_dtype in entity_type_feature_dtypes + ], +) +@pytest.mark.parametrize( + "feature_is_list", [False], ids=lambda v: f"feature_is_list:{str(v)}" +) +def test_feature_get_online_features_types_match( + entity_type, feature_dtype, feature_is_list +): + with construct_test_environment(GCP_CONFIG) as environment: + df = create_dataset(entity_type, feature_dtype, feature_is_list) + data_source = environment.data_source_creator.create_data_source( + df, + destination_name=environment.feature_store.project, + field_mapping={"ts_1": "ts"}, + ) + fv = create_feature_view(feature_dtype, feature_is_list, data_source) + fs = environment.feature_store + + features = [fv.name + ":value"] + entity = driver(value_type=ValueType.UNKNOWN) + + try: + fs.apply([fv, entity]) + fs.materialize(environment.start_date, environment.end_date) + driver_id_value = "1" if entity_type == ValueType.STRING else 1 + online_features = fs.get_online_features( + features=features, entity_rows=[{"driver": driver_id_value}], + ).to_dict() + + feature_list_dtype_to_expected_online_response_value_type = { + "int32": "int", + "int64": "int", + "float": "float", + "string": "str", + "bool": "bool", + } + assert ( + type(online_features["value"][0]).__name__ + == feature_list_dtype_to_expected_online_response_value_type[ + feature_dtype + ] + ) + finally: + environment.data_source_creator.teardown() -def get_test_fixtures(environment: Environment): - feature_dtype = environment.feature_dtype - feature_is_list = environment.feature_is_list - fs, fv = ( - environment.feature_store, - driver_feature_view( - environment.data_source, - value_type=python_type_to_feast_value_type( - feature_dtype, - value=get_feature_values_for_dtype(feature_dtype, feature_is_list)[0], - ), +def create_feature_view(feature_dtype, feature_is_list, data_source): + return driver_feature_view( + data_source, + value_type=python_type_to_feast_value_type( + feature_dtype, + value=get_feature_values_for_dtype(feature_dtype, feature_is_list)[0], ), ) - return feature_dtype, feature_is_list, fs, fv def assert_expected_historical_feature_types( From 95d1d5b0b50a5ee5739606ff88712c1b1a69d4d1 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Wed, 1 Sep 2021 15:22:17 -0700 Subject: [PATCH 04/19] Refactor the datastore online_read method to be slightly more efficient (#1819) * Refactor the datastore online_read method to be slightly more efficient Signed-off-by: Achal Shah * Remove double import Signed-off-by: Achal Shah --- .../feast/infra/online_stores/datastore.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/datastore.py b/sdk/python/feast/infra/online_stores/datastore.py index 767b934fd1..37011c7d51 100644 --- a/sdk/python/feast/infra/online_stores/datastore.py +++ b/sdk/python/feast/infra/online_stores/datastore.py @@ -243,22 +243,21 
@@ def online_read( ) keys.append(key) + # NOTE: get_multi doesn't return values in the same order as the keys in the request. + # Also, len(values) can be less than len(keys) in the case of missing values. values = client.get_multi(keys) - - if values is not None: - keys_missing_from_response = set(keys) - set([v.key for v in values]) - values = sorted(values, key=lambda v: keys.index(v.key)) - for value in values: + values_dict = {v.key: v for v in values} if values is not None else {} + for key in keys: + if key in values_dict: + value = values_dict[key] res = {} for feature_name, value_bin in value["values"].items(): val = ValueProto() val.ParseFromString(value_bin) res[feature_name] = val result.append((value["event_ts"], res)) - for missing_key_idx in sorted( - [keys.index(k) for k in keys_missing_from_response] - ): - result.insert(missing_key_idx, (None, None)) + else: + result.append((None, None)) return result From 021daf0dac2f86c3cded4696183c2603a3365e10 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Wed, 1 Sep 2021 20:26:17 -0400 Subject: [PATCH 05/19] Reducing size of universal repo, decreasing test time from 6 minutes to 5:15 (#1826) Signed-off-by: Danny Chiao --- .../integration/feature_repos/repo_configuration.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 00c439ca89..6ff91eb1b1 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -76,7 +76,7 @@ class IntegrationTestRepoConfig: def construct_universal_entities() -> Dict[str, List[Any]]: - return {"customer": list(range(1001, 1110)), "driver": list(range(5001, 5110))} + return {"customer": list(range(1001, 1020)), "driver": list(range(5001, 5020))} def construct_universal_datasets( @@ -91,9 +91,9 @@ def construct_universal_datasets( orders_df = driver_test_data.create_orders_df( customers=entities["customer"], drivers=entities["driver"], - start_date=end_time - timedelta(days=365), - end_date=end_time + timedelta(days=365), - order_count=1000, + start_date=end_time - timedelta(days=3), + end_date=end_time + timedelta(days=3), + order_count=20, ) return {"customer": customer_df, "driver": driver_df, "orders": orders_df} @@ -146,7 +146,7 @@ class Environment: ) def __post_init__(self): - self.start_date: datetime = self.end_date - timedelta(days=7) + self.start_date: datetime = self.end_date - timedelta(days=3) def table_name_from_data_source(ds: DataSource) -> Optional[str]: From 9dc9e60aa6a5d6a85f012307f1910f5233a251c6 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Wed, 1 Sep 2021 21:01:18 -0400 Subject: [PATCH 06/19] Initial scaffolding for on demand feature view (#1803) * Initial scaffolding for on demand feature view, with initial support for transforms on online fetches Signed-off-by: Danny Chiao * Fixing comments Signed-off-by: Danny Chiao * Comments Signed-off-by: Danny Chiao * Added basic test Signed-off-by: Danny Chiao * Simplifying function serialization Signed-off-by: Danny Chiao * Refactor logic into odfv Signed-off-by: Danny Chiao --- protos/feast/core/FeatureView.proto | 1 + protos/feast/core/OnDemandFeatureView.proto | 57 +++++++ protos/feast/core/Registry.proto | 2 + sdk/python/feast/__init__.py | 2 + sdk/python/feast/errors.py | 7 + sdk/python/feast/feature_store.py | 61 ++++++- sdk/python/feast/on_demand_feature_view.py | 150 
++++++++++++ sdk/python/feast/registry.py | 87 +++++++++- sdk/python/feast/repo_operations.py | 38 ++++- sdk/python/setup.py | 1 + .../feature_repos/universal/feature_views.py | 22 ++- .../online_store/test_universal_online.py | 17 +- 12 files changed, 434 insertions(+), 11 deletions(-) create mode 100644 protos/feast/core/OnDemandFeatureView.proto create mode 100644 sdk/python/feast/on_demand_feature_view.py diff --git a/protos/feast/core/FeatureView.proto b/protos/feast/core/FeatureView.proto index f39fcf5e73..6edba9f7fe 100644 --- a/protos/feast/core/FeatureView.proto +++ b/protos/feast/core/FeatureView.proto @@ -35,6 +35,7 @@ message FeatureView { FeatureViewMeta meta = 2; } +// TODO(adchia): refactor common fields from this and ODFV into separate metadata proto message FeatureViewSpec { // Name of the feature view. Must be unique. Not updated. string name = 1; diff --git a/protos/feast/core/OnDemandFeatureView.proto b/protos/feast/core/OnDemandFeatureView.proto new file mode 100644 index 0000000000..6aa938e8ee --- /dev/null +++ b/protos/feast/core/OnDemandFeatureView.proto @@ -0,0 +1,57 @@ +// +// Copyright 2020 The Feast Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +syntax = "proto3"; +package feast.core; + +option go_package = "github.com/feast-dev/feast/sdk/go/protos/feast/core"; +option java_outer_classname = "OnDemandFeatureViewProto"; +option java_package = "feast.proto.core"; + +import "feast/core/FeatureView.proto"; +import "feast/core/Feature.proto"; + +message OnDemandFeatureView { + // User-specified specifications of this feature view. + OnDemandFeatureViewSpec spec = 1; +} + +message OnDemandFeatureViewSpec { + // Name of the feature view. Must be unique. Not updated. + string name = 1; + + // Name of Feast project that this feature view belongs to. + string project = 2; + + // List of feature specifications for each feature defined with this feature view. + repeated FeatureSpecV2 features = 3; + + // Map of input feature views, keyed by feature view name. + // TODO(adchia): add support for request data + map<string, FeatureView> inputs = 4; + + UserDefinedFunction user_defined_function = 5; +} + +// Serialized representation of a Python function. 
+message UserDefinedFunction { + // The function name + string name = 1; + + // The python-syntax function body (serialized by dill) + bytes body = 2; +} diff --git a/protos/feast/core/Registry.proto b/protos/feast/core/Registry.proto index 6900a5b1e7..b8570301e9 100644 --- a/protos/feast/core/Registry.proto +++ b/protos/feast/core/Registry.proto @@ -25,12 +25,14 @@ import "feast/core/Entity.proto"; import "feast/core/FeatureService.proto"; import "feast/core/FeatureTable.proto"; import "feast/core/FeatureView.proto"; +import "feast/core/OnDemandFeatureView.proto"; import "google/protobuf/timestamp.proto"; message Registry { repeated Entity entities = 1; repeated FeatureTable feature_tables = 2; repeated FeatureView feature_views = 6; + repeated OnDemandFeatureView on_demand_feature_views = 8; repeated FeatureService feature_services = 7; string registry_schema_version = 3; // to support migrations; incremented when schema is changed diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 83d9286132..cd4730efa3 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -13,6 +13,7 @@ from .feature_store import FeatureStore from .feature_table import FeatureTable from .feature_view import FeatureView +from .on_demand_feature_view import OnDemandFeatureView from .repo_config import RepoConfig from .value_type import ValueType @@ -37,6 +38,7 @@ "FeatureStore", "FeatureTable", "FeatureView", + "OnDemandFeatureView", "RepoConfig", "SourceType", "ValueType", diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index 1202d4df49..fa4a779a31 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -200,3 +200,10 @@ def __init__(self, entity_type: type): f"The entity dataframe you have provided must be a Pandas DataFrame or a SQL query, " f"but we found: {entity_type} " ) + + +class ConflictingFeatureViewNames(Exception): + def __init__(self, feature_view_name: str): + super().__init__( + f"The feature view name: {feature_view_name} refers to both an on-demand feature view and a feature view" + ) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 77650b5cf1..c4f1987572 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -37,6 +37,7 @@ update_entities_with_inferred_types_from_feature_views, ) from feast.infra.provider import Provider, RetrievalJob, get_provider +from feast.on_demand_feature_view import OnDemandFeatureView from feast.online_response import OnlineResponse, _infer_online_entity_rows from feast.protos.feast.serving.ServingService_pb2 import ( GetOnlineFeaturesRequestV2, @@ -45,6 +46,7 @@ from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.registry import Registry from feast.repo_config import RepoConfig, load_repo_config +from feast.type_map import python_value_to_proto_value from feast.usage import log_exceptions, log_exceptions_and_usage from feast.version import get_version @@ -267,8 +269,9 @@ def apply( objects: Union[ Entity, FeatureView, + OnDemandFeatureView, FeatureService, - List[Union[FeatureView, Entity, FeatureService]], + List[Union[FeatureView, OnDemandFeatureView, Entity, FeatureService]], ], commit: bool = True, ): @@ -314,6 +317,7 @@ def apply( assert isinstance(objects, list) views_to_update = [ob for ob in objects if isinstance(ob, FeatureView)] + odfvs_to_update = [ob for ob in objects if isinstance(ob, OnDemandFeatureView)] _validate_feature_views(views_to_update) 
entities_to_update = [ob for ob in objects if isinstance(ob, Entity)] services_to_update = [ob for ob in objects if isinstance(ob, FeatureService)] @@ -332,11 +336,15 @@ def apply( if len(views_to_update) + len(entities_to_update) + len( services_to_update - ) != len(objects): + ) + len(odfvs_to_update) != len(objects): raise ValueError("Unknown object type provided as part of apply() call") for view in views_to_update: self._registry.apply_feature_view(view, project=self.project, commit=False) + for odfv in odfvs_to_update: + self._registry.apply_on_demand_feature_view( + odfv, project=self.project, commit=False + ) for ent in entities_to_update: self._registry.apply_entity(ent, project=self.project, commit=False) for feature_service in services_to_update: @@ -717,7 +725,6 @@ def get_online_features( all_feature_views = self._registry.list_feature_views( project=self.project, allow_cache=True ) - _validate_feature_refs(_feature_refs, full_feature_names) grouped_refs = _group_feature_refs(_feature_refs, all_feature_views) for table, requested_features in grouped_refs: @@ -759,6 +766,47 @@ def get_online_features( feature_ref ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT + initial_response = OnlineResponse( + GetOnlineFeaturesResponse(field_values=result_rows) + ) + return self._augment_response_with_on_demand_transforms( + _feature_refs, full_feature_names, initial_response, result_rows + ) + + def _augment_response_with_on_demand_transforms( + self, + feature_refs: List[str], + full_feature_names: bool, + initial_response: OnlineResponse, + result_rows: List[GetOnlineFeaturesResponse.FieldValues], + ) -> OnlineResponse: + all_on_demand_feature_views = self._registry.list_on_demand_feature_views( + project=self.project, allow_cache=True + ) + if len(all_on_demand_feature_views) == 0: + return initial_response + initial_response_df = initial_response.to_df() + # Apply on demand transformations + for odfv in all_on_demand_feature_views: + feature_ref = odfv.name + if feature_ref in feature_refs: + transformed_features_df = odfv.get_transformed_features_df( + full_feature_names, initial_response_df + ) + for row_idx in range(len(result_rows)): + result_row = result_rows[row_idx] + # TODO(adchia): support multiple output features in an ODFV, which requires different naming + # conventions + result_row.fields[odfv.name].CopyFrom( + python_value_to_proto_value( + transformed_features_df[odfv.features[0].name].values[ + row_idx + ] + ) + ) + result_row.statuses[ + feature_ref + ] = GetOnlineFeaturesResponse.FieldStatus.PRESENT return OnlineResponse(GetOnlineFeaturesResponse(field_values=result_rows)) @log_exceptions_and_usage @@ -791,7 +839,9 @@ def _validate_feature_refs(feature_refs: List[str], full_feature_names: bool = F ref for ref, occurrences in Counter(feature_refs).items() if occurrences > 1 ] else: - feature_names = [ref.split(":")[1] for ref in feature_refs] + feature_names = [ + ref.split(":")[1] if ":" in ref else ref for ref in feature_refs + ] collided_feature_names = [ ref for ref, occurrences in Counter(feature_names).items() @@ -820,6 +870,9 @@ def _group_feature_refs( if isinstance(features, list) and isinstance(features[0], str): for ref in features: + if ":" not in ref: + # This is an on demand feature view ref + continue view_name, feat_name = ref.split(":") if view_name not in view_index: raise FeatureViewNotFoundException(view_name) diff --git a/sdk/python/feast/on_demand_feature_view.py b/sdk/python/feast/on_demand_feature_view.py new file mode 100644 index 
0000000000..b5b71c164c --- /dev/null +++ b/sdk/python/feast/on_demand_feature_view.py @@ -0,0 +1,150 @@ +import functools +from types import MethodType +from typing import Dict, List + +import dill +import pandas as pd + +from feast.feature import Feature +from feast.feature_view import FeatureView +from feast.protos.feast.core.OnDemandFeatureView_pb2 import ( + OnDemandFeatureView as OnDemandFeatureViewProto, +) +from feast.protos.feast.core.OnDemandFeatureView_pb2 import OnDemandFeatureViewSpec +from feast.protos.feast.core.OnDemandFeatureView_pb2 import ( + UserDefinedFunction as UserDefinedFunctionProto, +) +from feast.usage import log_exceptions +from feast.value_type import ValueType + + +class OnDemandFeatureView: + """ + An OnDemandFeatureView defines on demand transformations on existing feature view values and request data. + + Args: + name: Name of the group of features. + features: Output schema of transformation with feature names + inputs: The input feature views passed into the transform. + udf: User defined transformation function that takes as input pandas dataframes + """ + + name: str + features: List[Feature] + inputs: Dict[str, FeatureView] + udf: MethodType + + @log_exceptions + def __init__( + self, + name: str, + features: List[Feature], + inputs: Dict[str, FeatureView], + udf: MethodType, + ): + """ + Creates an OnDemandFeatureView object. + """ + + self.name = name + self.features = features + self.inputs = inputs + self.udf = udf + + def to_proto(self) -> OnDemandFeatureViewProto: + """ + Converts an on demand feature view object to its protobuf representation. + + Returns: + A OnDemandFeatureViewProto protobuf. + """ + spec = OnDemandFeatureViewSpec( + name=self.name, + features=[feature.to_proto() for feature in self.features], + inputs={k: fv.to_proto() for k, fv in self.inputs.items()}, + user_defined_function=UserDefinedFunctionProto( + name=self.udf.__name__, body=dill.dumps(self.udf, recurse=True), + ), + ) + + return OnDemandFeatureViewProto(spec=spec) + + @classmethod + def from_proto(cls, on_demand_feature_view_proto: OnDemandFeatureViewProto): + """ + Creates an on demand feature view from a protobuf representation. + + Args: + on_demand_feature_view_proto: A protobuf representation of an on-demand feature view. + + Returns: + A OnDemandFeatureView object based on the on-demand feature view protobuf. + """ + on_demand_feature_view_obj = cls( + name=on_demand_feature_view_proto.spec.name, + features=[ + Feature( + name=feature.name, + dtype=ValueType(feature.value_type), + labels=dict(feature.labels), + ) + for feature in on_demand_feature_view_proto.spec.features + ], + inputs={ + feature_view_name: FeatureView.from_proto(feature_view_proto) + for feature_view_name, feature_view_proto in on_demand_feature_view_proto.spec.inputs.items() + }, + udf=dill.loads( + on_demand_feature_view_proto.spec.user_defined_function.body + ), + ) + + return on_demand_feature_view_obj + + def get_transformed_features_df( + self, full_feature_names: bool, df_with_features: pd.DataFrame + ) -> pd.DataFrame: + # Apply on demand transformations + # TODO(adchia): Include only the feature values from the specified input FVs in the ODFV. 
+ # Copy over un-prefixed features even if not requested since transform may need it + columns_to_cleanup = [] + if full_feature_names: + for input_fv in self.inputs.values(): + for feature in input_fv.features: + full_feature_ref = f"{input_fv.name}__{feature.name}" + if full_feature_ref in df_with_features.keys(): + df_with_features[feature.name] = df_with_features[ + full_feature_ref + ] + columns_to_cleanup.append(feature.name) + + # Compute transformed values and apply to each result row + df_with_transformed_features = self.udf.__call__(df_with_features) + + # Cleanup extra columns used for transformation + df_with_features.drop(columns=columns_to_cleanup, inplace=True) + return df_with_transformed_features + + +def on_demand_feature_view(features: List[Feature], inputs: Dict[str, FeatureView]): + """ + Declare an on-demand feature view + + :param features: Output schema with feature names + :param inputs: The inputs passed into the transform. + :return: An On Demand Feature View. + """ + + def decorator(user_function): + on_demand_feature_view_obj = OnDemandFeatureView( + name=user_function.__name__, + inputs=inputs, + features=features, + udf=user_function, + ) + functools.update_wrapper( + wrapper=on_demand_feature_view_obj, wrapped=user_function + ) + return on_demand_feature_view_obj + + return decorator diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index af04de5f3f..8a1994bbc5 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -18,11 +18,12 @@ from datetime import datetime, timedelta from pathlib import Path from tempfile import TemporaryFile -from typing import List, Optional +from typing import List, Optional, Set from urllib.parse import urlparse from feast.entity import Entity from feast.errors import ( + ConflictingFeatureViewNames, EntityNotFoundException, FeatureServiceNotFoundException, FeatureTableNotFoundException, @@ -33,6 +34,7 @@ from feast.feature_service import FeatureService from feast.feature_table import FeatureTable from feast.feature_view import FeatureView +from feast.on_demand_feature_view import OnDemandFeatureView from feast.protos.feast.core.Registry_pb2 import Registry as RegistryProto REGISTRY_SCHEMA_VERSION = "1" @@ -272,6 +274,9 @@ def apply_feature_view( self._prepare_registry_for_changes() assert self.cached_registry_proto + if feature_view.name in self._get_existing_on_demand_feature_view_names(): + raise ConflictingFeatureViewNames(feature_view.name) + for idx, existing_feature_view_proto in enumerate( self.cached_registry_proto.feature_views ): @@ -289,6 +294,51 @@ def apply_feature_view( if commit: self.commit() + def apply_on_demand_feature_view( + self, + on_demand_feature_view: OnDemandFeatureView, + project: str, + commit: bool = True, + ): + """ + Registers a single on demand feature view with Feast + + Args: + on_demand_feature_view: Feature view that will be registered + project: Feast project that this feature view belongs to + commit: Whether the change should be persisted immediately + """ + on_demand_feature_view_proto = on_demand_feature_view.to_proto() + on_demand_feature_view_proto.spec.project = project + self._prepare_registry_for_changes() + assert self.cached_registry_proto + + if on_demand_feature_view.name in self._get_existing_feature_view_names(): + raise ConflictingFeatureViewNames(on_demand_feature_view.name) + + for idx, existing_feature_view_proto in enumerate( + self.cached_registry_proto.on_demand_feature_views + ): + if ( + 
existing_feature_view_proto.spec.name + == on_demand_feature_view_proto.spec.name + and existing_feature_view_proto.spec.project == project + ): + if ( + OnDemandFeatureView.from_proto(existing_feature_view_proto) + == on_demand_feature_view + ): + return + else: + del self.cached_registry_proto.on_demand_feature_views[idx] + break + + self.cached_registry_proto.on_demand_feature_views.append( + on_demand_feature_view_proto + ) + if commit: + self.commit() + def apply_materialization( self, feature_view: FeatureView, @@ -370,6 +420,28 @@ def list_feature_views( feature_views.append(FeatureView.from_proto(feature_view_proto)) return feature_views + def list_on_demand_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[OnDemandFeatureView]: + """ + Retrieve a list of on demand feature views from the registry + + Args: + allow_cache: Allow returning feature views from the cached registry + project: Filter feature tables based on project name + + Returns: + List of on demand feature views + """ + registry_proto = self._get_registry_proto(allow_cache=allow_cache) + on_demand_feature_views = [] + for on_demand_feature_view_proto in registry_proto.on_demand_feature_views: + if on_demand_feature_view_proto.spec.project == project: + on_demand_feature_views.append( + OnDemandFeatureView.from_proto(on_demand_feature_view_proto) + ) + return on_demand_feature_views + def get_feature_table(self, name: str, project: str) -> FeatureTable: """ Retrieves a feature table. @@ -546,6 +618,19 @@ def _get_registry_proto(self, allow_cache: bool = False) -> RegistryProto: self.cache_being_updated = False return registry_proto + def _get_existing_feature_view_names(self) -> Set[str]: + assert self.cached_registry_proto + return set([fv.spec.name for fv in self.cached_registry_proto.feature_views]) + + def _get_existing_on_demand_feature_view_names(self) -> Set[str]: + assert self.cached_registry_proto + return set( + [ + odfv.spec.name + for odfv in self.cached_registry_proto.on_demand_feature_views + ] + ) + class RegistryStore(ABC): """ diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index ea0e79931b..1170701e15 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -17,6 +17,7 @@ from feast.feature_view import FeatureView from feast.infra.provider import get_provider from feast.names import adjectives, animals +from feast.on_demand_feature_view import OnDemandFeatureView from feast.registry import Registry from feast.repo_config import RepoConfig from feast.usage import log_exceptions_and_usage @@ -33,6 +34,7 @@ def py_path_to_module(path: Path, repo_root: Path) -> str: class ParsedRepo(NamedTuple): feature_tables: List[FeatureTable] feature_views: List[FeatureView] + on_demand_feature_views: List[OnDemandFeatureView] entities: List[Entity] feature_services: List[FeatureService] @@ -93,7 +95,11 @@ def get_repo_files(repo_root: Path) -> List[Path]: def parse_repo(repo_root: Path) -> ParsedRepo: """ Collect feature table definitions from feature repo """ res = ParsedRepo( - feature_tables=[], entities=[], feature_views=[], feature_services=[] + feature_tables=[], + entities=[], + feature_views=[], + feature_services=[], + on_demand_feature_views=[], ) for repo_file in get_repo_files(repo_root): @@ -109,6 +115,8 @@ def parse_repo(repo_root: Path) -> ParsedRepo: res.entities.append(obj) elif isinstance(obj, FeatureService): res.feature_services.append(obj) + elif isinstance(obj, OnDemandFeatureView): + 
res.on_demand_feature_views.append(obj) return res @@ -143,6 +151,10 @@ def apply_total(repo_config: RepoConfig, repo_path: Path, skip_source_validation views_to_keep, views_to_delete = _tag_registry_views_for_keep_delete( project, registry, repo ) + ( + odfvs_to_keep, + odfvs_to_delete, + ) = _tag_registry_on_demand_feature_views_for_keep_delete(project, registry, repo) tables_to_keep, tables_to_delete = _tag_registry_tables_for_keep_delete( project, registry, repo ) @@ -181,10 +193,14 @@ def apply_total(repo_config: RepoConfig, repo_path: Path, skip_source_validation # TODO: delete entities from the registry too # Add / update views + entities + services - all_to_apply: List[Union[Entity, FeatureView, FeatureService]] = [] + all_to_apply: List[ + Union[Entity, FeatureView, FeatureService, OnDemandFeatureView] + ] = [] all_to_apply.extend(entities_to_keep) all_to_apply.extend(views_to_keep) all_to_apply.extend(services_to_keep) + all_to_apply.extend(odfvs_to_keep) + # TODO: delete odfvs store.apply(all_to_apply, commit=False) for entity in entities_to_keep: click.echo( @@ -194,6 +210,10 @@ def apply_total(repo_config: RepoConfig, repo_path: Path, skip_source_validation click.echo( f"Registered feature view {Style.BRIGHT + Fore.GREEN}{view.name}{Style.RESET_ALL}" ) + for odfv in odfvs_to_keep: + click.echo( + f"Registered on demand feature view {Style.BRIGHT + Fore.GREEN}{odfv.name}{Style.RESET_ALL}" + ) for feature_service in services_to_keep: click.echo( f"Registered feature service {Style.BRIGHT + Fore.GREEN}{feature_service.name}{Style.RESET_ALL}" @@ -263,6 +283,20 @@ def _tag_registry_views_for_keep_delete( return views_to_keep, views_to_delete +def _tag_registry_on_demand_feature_views_for_keep_delete( + project: str, registry: Registry, repo: ParsedRepo +) -> Tuple[List[OnDemandFeatureView], List[OnDemandFeatureView]]: + odfvs_to_keep: List[OnDemandFeatureView] = repo.on_demand_feature_views + odfvs_to_delete: List[OnDemandFeatureView] = [] + repo_on_demand_feature_view_names = set( + t.name for t in repo.on_demand_feature_views + ) + for registry_odfv in registry.list_on_demand_feature_views(project=project): + if registry_odfv.name not in repo_on_demand_feature_view_names: + odfvs_to_delete.append(registry_odfv) + return odfvs_to_keep, odfvs_to_delete + + def _tag_registry_tables_for_keep_delete( project: str, registry: Registry, repo: ParsedRepo ) -> Tuple[List[FeatureTable], List[FeatureTable]]: diff --git a/sdk/python/setup.py b/sdk/python/setup.py index af82301212..b776369ca6 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -113,6 +113,7 @@ "google-cloud-core==1.4.*", "redis-py-cluster==2.1.2", "boto3==1.17.*", + "dill==0.3.0" ] diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index d03b89f0e0..c8029474aa 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -1,6 +1,9 @@ from datetime import timedelta +from typing import Dict -from feast import Feature, FeatureView, ValueType +import pandas as pd + +from feast import Feature, FeatureView, OnDemandFeatureView, ValueType from feast.data_source import DataSource @@ -19,6 +22,23 @@ def driver_feature_view( ) +def conv_rate_plus_100(driver_hourly_stats: pd.DataFrame) -> pd.DataFrame: + df = pd.DataFrame() + df["conv_rate_plus_100"] = driver_hourly_stats["conv_rate"] + 100 + return df + + +def 
conv_rate_plus_100_feature_view( + inputs: Dict[str, FeatureView] +) -> OnDemandFeatureView: + return OnDemandFeatureView( + name=conv_rate_plus_100.__name__, + inputs=inputs, + features=[Feature("conv_rate_plus_100", ValueType.FLOAT)], + udf=conv_rate_plus_100, + ) + + def create_driver_hourly_stats_feature_view(source, infer_features: bool = True): driver_stats_feature_view = FeatureView( name="driver_stats", diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index b3bcf688bd..d86af4521c 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -3,6 +3,9 @@ import pandas as pd import pytest +from integration.feature_repos.universal.feature_views import ( + conv_rate_plus_100_feature_view, +) from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, @@ -17,10 +20,10 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name fs = environment.feature_store entities, datasets, data_sources = universal_data_sources feature_views = construct_universal_feature_views(data_sources) - + odfv = conv_rate_plus_100_feature_view(inputs={"driver": feature_views["driver"]}) feast_objects = [] feast_objects.extend(feature_views.values()) - feast_objects.extend([driver(), customer()]) + feast_objects.extend([odfv, driver(), customer()]) fs.apply(feast_objects) fs.materialize(environment.start_date, environment.end_date) @@ -45,8 +48,9 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name "customer_profile:current_balance", "customer_profile:avg_passenger_count", "customer_profile:lifetime_trip_count", + "conv_rate_plus_100", ] - unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs] + unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] online_features = fs.get_online_features( features=feature_refs, @@ -60,6 +64,9 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name len(keys) == len(feature_refs) + 2 ) # Add two for the driver id and the customer id entity keys. 
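(Illustrative sketch of the retrieval exercised by this test, not part of the patch; fs is the feature store from the fixture above, and the driver entity key and feature reference names are assumed from the surrounding test setup.)

online_features_dict = fs.get_online_features(
    features=[
        "driver_stats:conv_rate",
        "conv_rate_plus_100",  # on demand feature, referenced by bare name (no view prefix)
    ],
    entity_rows=[{"driver": 5001}],
).to_dict()
# The on demand column is computed at read time from conv_rate, so the two values differ by exactly 100.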
for feature in feature_refs: + if ":" in feature: + # This is the ODFV + continue if full_feature_names: assert feature.replace(":", "__") in keys else: @@ -75,6 +82,10 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name assert df_features["customer_id"] == online_features_dict["customer_id"][i] assert df_features["driver_id"] == online_features_dict["driver_id"][i] + assert ( + online_features_dict["conv_rate_plus_100"][i] + == df_features["conv_rate"] + 100 + ) for unprefixed_feature_ref in unprefixed_feature_refs: tc.assertEqual( df_features[unprefixed_feature_ref], From fc448dc2f0f74b75e4f292f10c1d2331f557d87d Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Thu, 2 Sep 2021 19:27:05 -0400 Subject: [PATCH 07/19] Implementing initial on demand transforms for historical retrieval to_df (#1824) * On demand transforms for historical retrieval Signed-off-by: Danny Chiao * Merge error Signed-off-by: Danny Chiao --- .../feast/infra/offline_stores/bigquery.py | 49 ++++++++++++++++--- sdk/python/feast/infra/offline_stores/file.py | 36 ++++++++++++-- .../infra/offline_stores/offline_store.py | 24 +++++++++ .../infra/offline_stores/offline_utils.py | 2 +- .../feast/infra/offline_stores/redshift.py | 32 ++++++++++-- sdk/python/feast/infra/provider.py | 3 ++ 6 files changed, 129 insertions(+), 17 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 2bfd863991..edd6957b8c 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Union import numpy as np -import pandas +import pandas as pd import pyarrow from pydantic import StrictStr from pydantic.typing import Literal @@ -19,6 +19,7 @@ from feast.feature_view import FeatureView from feast.infra.offline_stores import offline_utils from feast.infra.offline_stores.offline_store import OfflineStore, RetrievalJob +from feast.on_demand_feature_view import OnDemandFeatureView from feast.registry import Registry from feast.repo_config import FeastConfigBaseModel, RepoConfig @@ -87,14 +88,21 @@ def pull_latest_from_table_or_query( WHERE _feast_row = 1 """ - return BigQueryRetrievalJob(query=query, client=client, config=config) + # When materializing a single feature view, we don't need full feature names. 
On demand transforms aren't materialized + return BigQueryRetrievalJob( + query=query, + client=client, + config=config, + full_feature_names=False, + on_demand_feature_views=None, + ) @staticmethod def get_historical_features( config: RepoConfig, feature_views: List[FeatureView], feature_refs: List[str], - entity_df: Union[pandas.DataFrame, str], + entity_df: Union[pd.DataFrame, str], registry: Registry, project: str, full_feature_names: bool = False, @@ -140,16 +148,41 @@ def get_historical_features( full_feature_names=full_feature_names, ) - return BigQueryRetrievalJob(query=query, client=client, config=config) + return BigQueryRetrievalJob( + query=query, + client=client, + config=config, + full_feature_names=full_feature_names, + on_demand_feature_views=registry.list_on_demand_feature_views( + project, allow_cache=True + ), + ) class BigQueryRetrievalJob(RetrievalJob): - def __init__(self, query, client, config): + def __init__( + self, + query: str, + client: bigquery.Client, + config: RepoConfig, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]], + ): self.query = query self.client = client self.config = config + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views - def to_df(self): + def to_df_internal(self) -> pd.DataFrame: # TODO: Ideally only start this job when the user runs "get_historical_features", not when they run to_df() df = self.client.query(self.query).to_dataframe(create_bqstorage_client=True) return df @@ -266,7 +299,7 @@ def _get_table_reference_for_new_entity( def _upload_entity_df_and_get_entity_schema( - client: Client, table_name: str, entity_df: Union[pandas.DataFrame, str], + client: Client, table_name: str, entity_df: Union[pd.DataFrame, str], ) -> Dict[str, np.dtype]: """Uploads a Pandas entity dataframe into a BigQuery table and returns the resulting table""" @@ -278,7 +311,7 @@ def _upload_entity_df_and_get_entity_schema( client.query(f"SELECT * FROM {table_name} LIMIT 1").result().to_dataframe() ) entity_schema = dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)) - elif isinstance(entity_df, pandas.DataFrame): + elif isinstance(entity_df, pd.DataFrame): # Drop the index so that we dont have unnecessary columns entity_df.reset_index(drop=True, inplace=True) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 5c6f96df57..1afa5df08d 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -6,7 +6,7 @@ import pytz from pydantic.typing import Literal -from feast import FileSource +from feast import FileSource, OnDemandFeatureView from feast.data_source import DataSource from feast.errors import FeastJoinKeysDuringMaterialization from feast.feature_view import FeatureView @@ -30,13 +30,28 @@ class FileOfflineStoreConfig(FeastConfigBaseModel): class FileRetrievalJob(RetrievalJob): - def __init__(self, evaluation_function: Callable): + def __init__( + self, + evaluation_function: Callable, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]], + ): """Initialize a lazy historical retrieval job""" # The evaluation function executes a stored procedure to compute a historical retrieval. 
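(Hedged sketch of what this patch is working toward, not part of the diff; store is an assumed FeatureStore handle and the entity dataframe columns are assumed to match the example driver feature view.)

from datetime import datetime

import pandas as pd

entity_df = pd.DataFrame(
    {"driver_id": [1001], "event_timestamp": [datetime(2021, 9, 1, 12)]}
)
training_df = store.get_historical_features(
    entity_df=entity_df,
    features=["driver_stats:conv_rate", "conv_rate_plus_100"],
).to_df()
# to_df() materializes the backend query via to_df_internal() and then joins in the
# columns produced by each registered on demand feature view's udf.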
self.evaluation_function = evaluation_function + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views - def to_df(self): + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views + + def to_df_internal(self) -> pd.DataFrame: # Only execute the evaluation function to build the final historical retrieval dataframe at the last moment. df = self.evaluation_function() return df @@ -224,7 +239,13 @@ def evaluate_historical_retrieval(): return entity_df_with_features - job = FileRetrievalJob(evaluation_function=evaluate_historical_retrieval) + job = FileRetrievalJob( + evaluation_function=evaluate_historical_retrieval, + full_feature_names=full_feature_names, + on_demand_feature_views=registry.list_on_demand_feature_views( + project, allow_cache=True + ), + ) return job @staticmethod @@ -284,4 +305,9 @@ def evaluate_offline_job(): ) return last_values_df[columns_to_extract] - return FileRetrievalJob(evaluation_function=evaluate_offline_job) + # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized + return FileRetrievalJob( + evaluation_function=evaluate_offline_job, + full_feature_names=False, + on_demand_feature_views=None, + ) diff --git a/sdk/python/feast/infra/offline_stores/offline_store.py b/sdk/python/feast/infra/offline_stores/offline_store.py index e8d32cd384..16fc3b1fa0 100644 --- a/sdk/python/feast/infra/offline_stores/offline_store.py +++ b/sdk/python/feast/infra/offline_stores/offline_store.py @@ -20,6 +20,7 @@ from feast.data_source import DataSource from feast.feature_view import FeatureView +from feast.on_demand_feature_view import OnDemandFeatureView from feast.registry import Registry from feast.repo_config import RepoConfig @@ -27,11 +28,34 @@ class RetrievalJob(ABC): """RetrievalJob is used to manage the execution of a historical feature retrieval""" + @property @abstractmethod + def full_feature_names(self) -> bool: + pass + + @property + @abstractmethod + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + pass + def to_df(self) -> pd.DataFrame: + """Return dataset as Pandas DataFrame synchronously including on demand transforms""" + features_df = self.to_df_internal() + if self.on_demand_feature_views is None: + return features_df + + for odfv in self.on_demand_feature_views: + features_df = features_df.join( + odfv.get_transformed_features_df(self.full_feature_names, features_df) + ) + return features_df + + @abstractmethod + def to_df_internal(self) -> pd.DataFrame: """Return dataset as Pandas DataFrame synchronously""" pass + # TODO(adchia): implement ODFV for to_arrow method @abstractmethod def to_arrow(self) -> pyarrow.Table: """Return dataset as pyarrow Table synchronously""" diff --git a/sdk/python/feast/infra/offline_stores/offline_utils.py b/sdk/python/feast/infra/offline_stores/offline_utils.py index 304bdc8e91..f9125ab156 100644 --- a/sdk/python/feast/infra/offline_stores/offline_utils.py +++ b/sdk/python/feast/infra/offline_stores/offline_utils.py @@ -148,7 +148,7 @@ def build_point_in_time_query( entity_df_event_timestamp_col: str, query_template: str, full_feature_names: bool = False, -): +) -> str: """Build point-in-time query between each feature view table and the entity dataframe for Bigquery and Redshift""" template = 
Environment(loader=BaseLoader()).from_string(source=query_template) diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index 94d6a2877b..4a71d89752 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -9,7 +9,7 @@ from pydantic import StrictStr from pydantic.typing import Literal -from feast import RedshiftSource +from feast import OnDemandFeatureView, RedshiftSource from feast.data_source import DataSource from feast.errors import InvalidEntityType from feast.feature_view import FeatureView @@ -90,11 +90,14 @@ def pull_latest_from_table_or_query( ) WHERE _feast_row = 1 """ + # When materializing a single feature view, we don't need full feature names. On demand transforms aren't materialized return RedshiftRetrievalJob( query=query, redshift_client=redshift_client, s3_resource=s3_resource, config=config, + full_feature_names=False, + on_demand_feature_views=None, ) @staticmethod @@ -164,6 +167,10 @@ def query_generator() -> Iterator[str]: redshift_client=redshift_client, s3_resource=s3_resource, config=config, + full_feature_names=full_feature_names, + on_demand_feature_views=registry.list_on_demand_feature_views( + project=project, allow_cache=True + ), drop_columns=["entity_timestamp"] + [ f"{feature_view.name}__entity_row_unique_id" @@ -179,6 +186,8 @@ def __init__( redshift_client, s3_resource, config: RepoConfig, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]], drop_columns: Optional[List[str]] = None, ): """Initialize RedshiftRetrievalJob object. @@ -188,6 +197,8 @@ def __init__( redshift_client: boto3 redshift-data client s3_resource: boto3 s3 resource object config: Feast repo config + full_feature_names: Whether to add the feature view prefixes to the feature names + on_demand_feature_views: A list of on demand transforms to apply at retrieval time drop_columns: Optionally a list of columns to drop before unloading to S3. This is a convenient field, since "SELECT ... EXCEPT col" isn't supported in Redshift. 
""" @@ -209,9 +220,19 @@ def query_generator() -> Iterator[str]: + "/unload/" + str(uuid.uuid4()) ) + self._full_feature_names = full_feature_names + self._on_demand_feature_views = on_demand_feature_views self._drop_columns = drop_columns - def to_df(self) -> pd.DataFrame: + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views + + def to_df_internal(self) -> pd.DataFrame: with self._query_generator() as query: return aws_utils.unload_redshift_query_to_df( self._redshift_client, @@ -304,7 +325,12 @@ def _upload_entity_df_and_get_entity_schema( f"CREATE TABLE {table_name} AS ({entity_df})", ) limited_entity_df = RedshiftRetrievalJob( - f"SELECT * FROM {table_name} LIMIT 1", redshift_client, s3_resource, config + f"SELECT * FROM {table_name} LIMIT 1", + redshift_client, + s3_resource, + config, + full_feature_names=False, + on_demand_feature_views=None, ).to_df() return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)) else: diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index e5210b566f..4c78a5d109 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -177,6 +177,9 @@ def _get_requested_feature_views_to_features_dict( feature_views_to_feature_map: Dict[FeatureView, List[str]] = {} for ref in feature_refs: + if ":" not in ref: + # ODFV + continue ref_parts = ref.split(":") feature_view_from_ref = ref_parts[0] feature_from_ref = ref_parts[1] From 0d775c274da3640dd201b33e554795363daa3835 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 2 Sep 2021 17:58:57 -0700 Subject: [PATCH 08/19] Add Registry operations for on demand feature views (#1828) Signed-off-by: Achal Shah --- sdk/python/feast/cli.py | 52 +++++++++++++++++++++++ sdk/python/feast/errors.py | 10 +++++ sdk/python/feast/feature_store.py | 26 ++++++++++++ sdk/python/feast/registry.py | 70 +++++++++++++++++++++---------- 4 files changed, 136 insertions(+), 22 deletions(-) diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index fea1fac6ca..6eff06e28b 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -236,6 +236,58 @@ def feature_view_list(ctx: click.Context): print(tabulate(table, headers=["NAME", "ENTITIES"], tablefmt="plain")) +@cli.group(name="on-demand-feature-views") +def on_demand_feature_views_cmd(): + """ + Access feature views + """ + pass + + +@on_demand_feature_views_cmd.command("describe") +@click.argument("name", type=click.STRING) +@click.pass_context +def on_demand_feature_view_describe(ctx: click.Context, name: str): + """ + Describe an on demand feature view + """ + repo = ctx.obj["CHDIR"] + cli_check_repo(repo) + store = FeatureStore(repo_path=str(repo)) + + try: + on_demand_feature_view = store.get_on_demand_feature_view(name) + except FeastObjectNotFoundException as e: + print(e) + exit(1) + + print( + yaml.dump( + yaml.safe_load(str(on_demand_feature_view)), + default_flow_style=False, + sort_keys=False, + ) + ) + + +@on_demand_feature_views_cmd.command(name="list") +@click.pass_context +def on_demand_feature_view_list(ctx: click.Context): + """ + List all on demand feature views + """ + repo = ctx.obj["CHDIR"] + cli_check_repo(repo) + store = FeatureStore(repo_path=str(repo)) + table = [] + for on_demand_feature_view in store.list_on_demand_feature_views(): + table.append([on_demand_feature_view.name]) + + from tabulate import tabulate 
+ + print(tabulate(table, headers=["NAME"], tablefmt="plain")) + + @cli.command("apply", cls=NoOptionDefaultFormat) @click.option( "--skip-source-validation", diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index fa4a779a31..59736ab001 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -40,6 +40,16 @@ def __init__(self, name, project=None): super().__init__(f"Feature view {name} does not exist") +class OnDemandFeatureViewNotFoundException(FeastObjectNotFoundException): + def __init__(self, name, project=None): + if project: + super().__init__( + f"On demand feature view {name} does not exist in project {project}" + ) + else: + super().__init__(f"On demand feature view {name} does not exist") + + class FeatureTableNotFoundException(FeastObjectNotFoundException): def __init__(self, name, project=None): if project: diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index c4f1987572..861d42d3ae 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -170,6 +170,16 @@ def list_feature_views(self) -> List[FeatureView]: """ return self._registry.list_feature_views(self.project) + @log_exceptions_and_usage + def list_on_demand_feature_views(self) -> List[OnDemandFeatureView]: + """ + Retrieves the list of on demand feature views from the registry. + + Returns: + A list of on demand feature views. + """ + return self._registry.list_on_demand_feature_views(self.project) + @log_exceptions_and_usage def get_entity(self, name: str) -> Entity: """ @@ -218,6 +228,22 @@ def get_feature_view(self, name: str) -> FeatureView: """ return self._registry.get_feature_view(name, self.project) + @log_exceptions_and_usage + def get_on_demand_feature_view(self, name: str) -> OnDemandFeatureView: + """ + Retrieves a feature view. + + Args: + name: Name of feature view. + + Returns: + The specified feature view. + + Raises: + FeatureViewNotFoundException: The feature view could not be found. + """ + return self._registry.get_on_demand_feature_view(name, self.project) + @log_exceptions_and_usage def delete_feature_view(self, name: str): """ diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index 8a1994bbc5..864a560385 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -28,6 +28,7 @@ FeatureServiceNotFoundException, FeatureTableNotFoundException, FeatureViewNotFoundException, + OnDemandFeatureViewNotFoundException, S3RegistryBucketForbiddenAccess, S3RegistryBucketNotExist, ) @@ -339,6 +340,53 @@ def apply_on_demand_feature_view( if commit: self.commit() + def list_on_demand_feature_views( + self, project: str, allow_cache: bool = False + ) -> List[OnDemandFeatureView]: + """ + Retrieve a list of on demand feature views from the registry + + Args: + allow_cache: Whether to allow returning on demand feature views from a cached registry + project: Filter on demand feature views based on project name + + Returns: + List of on demand feature views + """ + + registry = self._get_registry_proto(allow_cache=allow_cache) + on_demand_feature_views = [] + for on_demand_feature_view in registry.on_demand_feature_views: + if on_demand_feature_view.spec.project == project: + on_demand_feature_views.append( + OnDemandFeatureView.from_proto(on_demand_feature_view) + ) + return on_demand_feature_views + + def get_on_demand_feature_view( + self, name: str, project: str, allow_cache: bool = False + ) -> OnDemandFeatureView: + """ + Retrieves an on demand feature view. 
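(Small usage sketch for the accessors added above, not part of the diff; it assumes a feature repo where conv_rate_plus_100 has already been applied. The equivalent CLI calls are "feast on-demand-feature-views list" and "feast on-demand-feature-views describe conv_rate_plus_100".)

from feast import FeatureStore

store = FeatureStore(repo_path=".")
for odfv in store.list_on_demand_feature_views():
    print(odfv.name)

odfv = store.get_on_demand_feature_view("conv_rate_plus_100")
print([feature.name for feature in odfv.features])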
+ + Args: + name: Name of on demand feature view + project: Feast project that this on demand feature belongs to + + Returns: + Returns either the specified on demand feature view, or raises an exception if + none is found + """ + registry = self._get_registry_proto(allow_cache=allow_cache) + + for on_demand_feature_view in registry.on_demand_feature_views: + if ( + on_demand_feature_view.spec.project == project + and on_demand_feature_view.spec.name == name + ): + return OnDemandFeatureView.from_proto(on_demand_feature_view) + raise OnDemandFeatureViewNotFoundException(name, project=project) + def apply_materialization( self, feature_view: FeatureView, @@ -420,28 +468,6 @@ def list_feature_views( feature_views.append(FeatureView.from_proto(feature_view_proto)) return feature_views - def list_on_demand_feature_views( - self, project: str, allow_cache: bool = False - ) -> List[OnDemandFeatureView]: - """ - Retrieve a list of on demand feature views from the registry - - Args: - allow_cache: Allow returning feature views from the cached registry - project: Filter feature tables based on project name - - Returns: - List of on demand feature views - """ - registry_proto = self._get_registry_proto(allow_cache=allow_cache) - on_demand_feature_views = [] - for on_demand_feature_view_proto in registry_proto.on_demand_feature_views: - if on_demand_feature_view_proto.spec.project == project: - on_demand_feature_views.append( - OnDemandFeatureView.from_proto(on_demand_feature_view_proto) - ) - return on_demand_feature_views - def get_feature_table(self, name: str, project: str) -> FeatureTable: """ Retrieves a feature table. From a565ab97189012ffa3fda9c8cf93e90f1ef25fb3 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 09:00:00 -0700 Subject: [PATCH 09/19] Remove older offline/online consistency tests (#1831) Signed-off-by: Achal Shah --- .../test_offline_online_store_consistency.py | 414 ------------------ 1 file changed, 414 deletions(-) delete mode 100644 sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py diff --git a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py b/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py deleted file mode 100644 index 2efe343218..0000000000 --- a/sdk/python/tests/integration/materialization/test_offline_online_store_consistency.py +++ /dev/null @@ -1,414 +0,0 @@ -import contextlib -import math -import random -import tempfile -import time -import uuid -from datetime import datetime, timedelta -from pathlib import Path -from typing import Iterator, Optional, Tuple - -import pandas as pd -import pytest -from google.cloud import bigquery -from pytz import utc - -from feast import BigQuerySource, FileSource, RedshiftSource -from feast.data_format import ParquetFormat -from feast.entity import Entity -from feast.feature_store import FeatureStore -from feast.feature_view import FeatureView -from feast.infra.offline_stores.file import FileOfflineStoreConfig -from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig -from feast.infra.online_stores.datastore import DatastoreOnlineStoreConfig -from feast.infra.online_stores.dynamodb import DynamoDBOnlineStoreConfig -from feast.infra.online_stores.redis import RedisOnlineStoreConfig, RedisType -from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig -from feast.infra.utils import aws_utils -from feast.repo_config import RepoConfig -from feast.value_type import ValueType -from 
tests.data.data_creator import create_dataset -from tests.integration.feature_repos.universal.feature_views import driver_feature_view - - -@contextlib.contextmanager -def prep_bq_fs_and_fv( - bq_source_type: str, -) -> Iterator[Tuple[FeatureStore, FeatureView]]: - client = bigquery.Client() - gcp_project = client.project - bigquery_dataset = f"test_ingestion{time.time_ns()}" - dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}") - client.create_dataset(dataset, exists_ok=True) - dataset.default_table_expiration_ms = ( - 1000 * 60 * 60 * 24 * 14 - ) # 2 weeks in milliseconds - client.update_dataset(dataset, ["default_table_expiration_ms"]) - - df = create_dataset() - - job_config = bigquery.LoadJobConfig() - table_ref = f"{gcp_project}.{bigquery_dataset}.{bq_source_type}_correctness_{int(time.time_ns())}" - query = f"SELECT * FROM `{table_ref}`" - job = client.load_table_from_dataframe(df, table_ref, job_config=job_config) - job.result() - - bigquery_source = BigQuerySource( - table_ref=table_ref if bq_source_type == "table" else None, - query=query if bq_source_type == "query" else None, - event_timestamp_column="ts", - created_timestamp_column="created_ts", - date_partition_column="", - field_mapping={"ts_1": "ts"}, - ) - - fv = driver_feature_view(bigquery_source) - e = Entity( - name="driver", - description="id for driver", - join_key="driver_id", - value_type=ValueType.INT32, - ) - with tempfile.TemporaryDirectory() as repo_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}", - provider="gcp", - online_store=DatastoreOnlineStoreConfig(namespace="integration_test"), - ) - fs = FeatureStore(config=config) - fs.apply([fv, e]) - - yield fs, fv - - fs.teardown() - - -@contextlib.contextmanager -def prep_redshift_fs_and_fv( - source_type: str, -) -> Iterator[Tuple[FeatureStore, FeatureView]]: - client = aws_utils.get_redshift_data_client("us-west-2") - s3 = aws_utils.get_s3_resource("us-west-2") - - df = create_dataset() - - table_name = f"test_ingestion_{source_type}_correctness_{int(time.time_ns())}_{random.randint(1000, 9999)}" - - offline_store = RedshiftOfflineStoreConfig( - cluster_id="feast-integration-tests", - region="us-west-2", - user="admin", - database="feast", - s3_staging_location="s3://feast-integration-tests/redshift/tests/ingestion", - iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role", - ) - - aws_utils.upload_df_to_redshift( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - s3, - f"{offline_store.s3_staging_location}/copy/{table_name}.parquet", - offline_store.iam_role, - table_name, - df, - ) - - redshift_source = RedshiftSource( - table=table_name if source_type == "table" else None, - query=f"SELECT * FROM {table_name}" if source_type == "query" else None, - event_timestamp_column="ts", - created_timestamp_column="created_ts", - date_partition_column="", - field_mapping={"ts_1": "ts"}, - ) - - fv = driver_feature_view(redshift_source) - e = Entity( - name="driver", - description="id for driver", - join_key="driver_id", - value_type=ValueType.INT32, - ) - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=f"test_bq_correctness_{str(uuid.uuid4()).replace('-', '')}", - provider="local", - online_store=SqliteOnlineStoreConfig( - path=str(Path(data_dir_name) / 
"online_store.db") - ), - offline_store=offline_store, - ) - fs = FeatureStore(config=config) - fs.apply([fv, e]) - - yield fs, fv - - fs.teardown() - - # Clean up the uploaded Redshift table - aws_utils.execute_redshift_statement( - client, - offline_store.cluster_id, - offline_store.database, - offline_store.user, - f"DROP TABLE {table_name}", - ) - - -@contextlib.contextmanager -def prep_local_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: - with tempfile.NamedTemporaryFile(suffix=".parquet") as f: - df = create_dataset() - f.close() - df.to_parquet(f.name) - file_source = FileSource( - file_format=ParquetFormat(), - path=f"file://{f.name}", - event_timestamp_column="ts", - created_timestamp_column="created_ts", - date_partition_column="", - field_mapping={"ts_1": "ts"}, - ) - fv = driver_feature_view(file_source) - e = Entity( - name="driver", - description="id for driver", - join_key="driver_id", - value_type=ValueType.INT32, - ) - project = f"test_local_correctness_{str(uuid.uuid4()).replace('-', '')}" - print(f"Using project: {project}") - - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=project, - provider="local", - online_store=SqliteOnlineStoreConfig( - path=str(Path(data_dir_name) / "online_store.db") - ), - ) - fs = FeatureStore(config=config) - fs.apply([fv, e]) - - yield fs, fv - - fs.teardown() - - -@contextlib.contextmanager -def prep_redis_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: - with tempfile.NamedTemporaryFile(suffix=".parquet") as f: - df = create_dataset() - f.close() - df.to_parquet(f.name) - file_source = FileSource( - file_format=ParquetFormat(), - path=f"file://{f.name}", - event_timestamp_column="ts", - created_timestamp_column="created_ts", - date_partition_column="", - field_mapping={"ts_1": "ts"}, - ) - fv = driver_feature_view(file_source) - e = Entity( - name="driver", - description="id for driver", - join_key="driver_id", - value_type=ValueType.INT32, - ) - project = f"test_redis_correctness_{str(uuid.uuid4()).replace('-', '')}" - print(f"Using project: {project}") - with tempfile.TemporaryDirectory() as repo_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=project, - provider="local", - online_store=RedisOnlineStoreConfig( - type="redis", - redis_type=RedisType.redis, - connection_string="localhost:6379,db=0", - ), - ) - fs = FeatureStore(config=config) - fs.apply([fv, e]) - - yield fs, fv - - fs.teardown() - - -@contextlib.contextmanager -def prep_dynamodb_fs_and_fv() -> Iterator[Tuple[FeatureStore, FeatureView]]: - with tempfile.NamedTemporaryFile(suffix=".parquet") as f: - df = create_dataset() - f.close() - df.to_parquet(f.name) - file_source = FileSource( - file_format=ParquetFormat(), - path=f"file://{f.name}", - event_timestamp_column="ts", - created_timestamp_column="created_ts", - date_partition_column="", - field_mapping={"ts_1": "ts"}, - ) - fv = driver_feature_view(file_source) - e = Entity( - name="driver", - description="id for driver", - join_key="driver_id", - value_type=ValueType.INT32, - ) - project = f"test_dynamo_correctness_{str(uuid.uuid4()).replace('-', '')}" - print(f"Using project {project}") - with tempfile.TemporaryDirectory() as repo_dir_name: - config = RepoConfig( - registry=str(Path(repo_dir_name) / "registry.db"), - project=project, - provider="aws", - 
online_store=DynamoDBOnlineStoreConfig(region="us-west-2"), - offline_store=FileOfflineStoreConfig(), - ) - fs = FeatureStore(config=config) - fs.apply([fv, e]) - - yield fs, fv - - fs.teardown() - - -# Checks that both offline & online store values are as expected -def check_offline_and_online_features( - fs: FeatureStore, - fv: FeatureView, - driver_id: int, - event_timestamp: datetime, - expected_value: Optional[float], - full_feature_names: bool, -) -> None: - # Check online store - response_dict = fs.get_online_features( - [f"{fv.name}:value"], - [{"driver": driver_id}], - full_feature_names=full_feature_names, - ).to_dict() - - if full_feature_names: - if expected_value: - assert abs(response_dict[f"{fv.name}__value"][0] - expected_value) < 1e-6 - else: - assert response_dict[f"{fv.name}__value"][0] is None - else: - if expected_value: - assert abs(response_dict["value"][0] - expected_value) < 1e-6 - else: - assert response_dict["value"][0] is None - - # Check offline store - df = fs.get_historical_features( - entity_df=pd.DataFrame.from_dict( - {"driver_id": [driver_id], "event_timestamp": [event_timestamp]} - ), - features=[f"{fv.name}:value"], - full_feature_names=full_feature_names, - ).to_df() - - if full_feature_names: - if expected_value: - assert abs(df.to_dict()[f"{fv.name}__value"][0] - expected_value) < 1e-6 - else: - assert math.isnan(df.to_dict()[f"{fv.name}__value"][0]) - else: - if expected_value: - assert abs(df.to_dict()["value"][0] - expected_value) < 1e-6 - else: - assert math.isnan(df.to_dict()["value"][0]) - - -def run_offline_online_store_consistency_test( - fs: FeatureStore, fv: FeatureView, full_feature_names: bool, -) -> None: - now = datetime.now() - # Run materialize() - # use both tz-naive & tz-aware timestamps to test that they're both correctly handled - start_date = (now - timedelta(hours=5)).replace(tzinfo=utc) - end_date = now - timedelta(hours=2) - fs.materialize(feature_views=[fv.name], start_date=start_date, end_date=end_date) - - # check result of materialize() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=1, - event_timestamp=end_date, - expected_value=0.3, - full_feature_names=full_feature_names, - ) - - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=2, - event_timestamp=end_date, - expected_value=None, - full_feature_names=full_feature_names, - ) - - # check prior value for materialize_incremental() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=3, - event_timestamp=end_date, - expected_value=4, - full_feature_names=full_feature_names, - ) - - # run materialize_incremental() - fs.materialize_incremental(feature_views=[fv.name], end_date=now) - - # check result of materialize_incremental() - check_offline_and_online_features( - fs=fs, - fv=fv, - driver_id=3, - event_timestamp=now, - expected_value=5, - full_feature_names=full_feature_names, - ) - - -@pytest.mark.parametrize("full_feature_names", [True, False]) -@pytest.mark.integration -def test_redis_offline_online_store_consistency(full_feature_names: bool): - with prep_redis_fs_and_fv() as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) - - -@pytest.mark.parametrize("full_feature_names", [True, False]) -@pytest.mark.integration -def test_dynamodb_offline_online_store_consistency(full_feature_names: bool): - with prep_dynamodb_fs_and_fv() as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) - - -@pytest.mark.integration -@pytest.mark.parametrize( - "source_type", 
["query", "table"], -) -@pytest.mark.parametrize("full_feature_names", [True, False]) -def test_redshift_offline_online_store_consistency( - source_type: str, full_feature_names: bool -): - with prep_redshift_fs_and_fv(source_type) as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) - - -@pytest.mark.parametrize("full_feature_names", [True, False]) -def test_local_offline_online_store_consistency(full_feature_names: bool): - with prep_local_fs_and_fv() as (fs, fv): - run_offline_online_store_consistency_test(fs, fv, full_feature_names) From ee8be70c5ed51ca2ac62818dd344bf218ae8ef84 Mon Sep 17 00:00:00 2001 From: Benn Ma Date: Sat, 4 Sep 2021 00:10:01 +0800 Subject: [PATCH 10/19] Fix wrong links in README (#1832) Signed-off-by: Benn Ma --- README.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2e3c7b9086..78f657bf40 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Please see our [documentation](https://docs.feast.dev/) for more information abo ## Architecture -The above architecture is the minimal Feast deployment. Want to run the full Feast on Kubernetes? Click [here](https://docs.feast.dev/feast-on-kubernetes/getting-started). +The above architecture is the minimal Feast deployment. Want to run the full Feast on GCP/AWS? Click [here](https://docs.feast.dev/how-to-guides/feast-gcp-aws). ## Getting Started @@ -126,16 +126,17 @@ pprint(feature_vector) ## Important resources Please refer to the official documentation at [Documentation](https://docs.feast.dev/) - * [Quickstart](https://docs.feast.dev/quickstart) + * [Quickstart](https://docs.feast.dev/getting-started/quickstart) * [Roadmap](https://docs.feast.dev/roadmap) - * [Feast on Kubernetes](https://docs.feast.dev/feast-on-kubernetes/getting-started) + * [Tutorials](https://docs.feast.dev/tutorials/tutorials-overview) + * [Running Feast with GCP/AWS](https://docs.feast.dev/how-to-guides/feast-gcp-aws) * [Change Log](https://github.com/feast-dev/feast/blob/master/CHANGELOG.md) * [Slack (#Feast)](https://slack.feast.dev/) ## Contributing Feast is a community project and is still under active development. 
Please have a look at our contributing and development guides if you want to contribute to the project: -- [Contribution Process for Feast](https://docs.feast.dev/contributing/contributing) -- [Development Guide for Feast](https://docs.feast.dev/contributing/development-guide) +- [Contribution Process for Feast](https://docs.feast.dev/project/contributing) +- [Development Guide for Feast](https://docs.feast.dev/project/development-guide) - [Development Guide for the Main Feast Repository](./CONTRIBUTING.md) ## Contributors ✨ From 4464bbf8fe14f49a406f7a7a2fbec7c3fc129480 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 09:14:00 -0700 Subject: [PATCH 11/19] Init registry during create_test_environment (#1829) Signed-off-by: Achal Shah --- .../tests/integration/feature_repos/repo_configuration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 6ff91eb1b1..47f16c2b75 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -179,6 +179,9 @@ def construct_test_environment( repo_path=repo_dir_name, ) fs = FeatureStore(config=config) + # We need to initialize the registry, because if nothing is applied in the test before tearing down + # the feature store, that will cause the teardown method to blow up. + fs.registry._initialize_registry() environment = Environment( name=project, test_repo_config=test_repo_config, From 027247df9d1fd77ce9f0077248ceb80ae700cc9c Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Fri, 3 Sep 2021 12:18:58 -0400 Subject: [PATCH 12/19] Update reviewers/approvers to include Danny/Felix (#1833) Signed-off-by: Danny Chiao --- OWNERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/OWNERS b/OWNERS index 7d42ec60f8..3ed906d2e9 100644 --- a/OWNERS +++ b/OWNERS @@ -3,9 +3,13 @@ approvers: - jklegar - tsotnet - achals + - adchia + - felixwang9817 reviewers: - woop - jklegar - tsotnet - achals - tedhtchang + - adchia + - felixwang9817 From 09e0955f709fca74457f1cfe1d049b1b6b8a7fcb Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Fri, 3 Sep 2021 12:23:57 -0400 Subject: [PATCH 13/19] Reducing wait interval for BQ integration tests (#1827) * For tests, reducing BQ's wait time. 
Some quick logging indicated that the uploading of the entity_df in integration tests was very slow Signed-off-by: Danny Chiao * Moving environment variables into yml and MakeFile Signed-off-by: Danny Chiao * Reducing prod retry cadence Signed-off-by: Danny Chiao --- .github/workflows/integration_tests.yml | 2 +- .github/workflows/pr_integration_tests.yml | 2 +- Makefile | 2 +- sdk/python/feast/infra/offline_stores/bigquery.py | 8 +++++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/integration_tests.yml b/.github/workflows/integration_tests.yml index 322ee0b6e5..accd95efbb 100644 --- a/.github/workflows/integration_tests.yml +++ b/.github/workflows/integration_tests.yml @@ -52,7 +52,7 @@ jobs: - name: Install dependencies run: make install-python-ci-dependencies - name: Test python - run: FEAST_USAGE=False pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration + run: FEAST_USAGE=False IS_TEST=True pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/.github/workflows/pr_integration_tests.yml b/.github/workflows/pr_integration_tests.yml index 26ac821ac2..dda17496b2 100644 --- a/.github/workflows/pr_integration_tests.yml +++ b/.github/workflows/pr_integration_tests.yml @@ -63,7 +63,7 @@ jobs: - name: Install dependencies run: make install-python-ci-dependencies - name: Test python - run: FEAST_USAGE=False pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration + run: FEAST_USAGE=False IS_TEST=True pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 with: diff --git a/Makefile b/Makefile index b18c5bca2e..881ace110e 100644 --- a/Makefile +++ b/Makefile @@ -56,7 +56,7 @@ test-python: FEAST_USAGE=False pytest -n 8 sdk/python/tests test-python-integration: - FEAST_USAGE=False pytest -n 8 --integration sdk/python/tests + FEAST_USAGE=False IS_TEST=True pytest -n 8 --integration sdk/python/tests format-python: # Sort diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index edd6957b8c..91feb1312f 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -1,3 +1,4 @@ +import os import uuid from datetime import date, datetime, timedelta from typing import Dict, List, Optional, Union @@ -239,7 +240,7 @@ def block_until_done( client: Client, bq_job: Union[bigquery.job.query.QueryJob, bigquery.job.load.LoadJob], timeout: int = 1800, - retry_cadence: int = 10, + retry_cadence: float = 1, ): """ Waits for bq_job to finish running, up to a maximum amount of time specified by the timeout parameter (defaulting to 30 minutes). @@ -255,6 +256,11 @@ def block_until_done( BigQueryJobCancelled exception to signify when that the job has been cancelled (i.e. from timeout or KeyboardInterrupt). 
""" + # For test environments, retry more aggressively + is_test = os.getenv("IS_TEST", default="False") == "True" + if is_test: + retry_cadence = 0.1 + def _wait_until_done(job_id): if client.get_job(job_id).state in ["PENDING", "RUNNING"]: raise BigQueryJobStillRunning(job_id=job_id) From 1044745cfbbdf47086362704ac280cd5d440cf14 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 10:14:57 -0700 Subject: [PATCH 14/19] Replace individual cli tests with parametrized tests (#1830) * Replace individual cli tests with parametrized tests Signed-off-by: Achal Shah * port over some tests from test_cli Signed-off-by: Achal Shah --- .../{test_cli_local.py => test_cli.py} | 109 ++++++++---------- .../integration/registration/test_cli_aws.py | 64 ---------- .../integration/registration/test_cli_gcp.py | 86 -------------- .../registration/test_cli_redis.py | 101 ---------------- 4 files changed, 45 insertions(+), 315 deletions(-) rename sdk/python/tests/integration/registration/{test_cli_local.py => test_cli.py} (75%) delete mode 100644 sdk/python/tests/integration/registration/test_cli_aws.py delete mode 100644 sdk/python/tests/integration/registration/test_cli_gcp.py delete mode 100644 sdk/python/tests/integration/registration/test_cli_redis.py diff --git a/sdk/python/tests/integration/registration/test_cli_local.py b/sdk/python/tests/integration/registration/test_cli.py similarity index 75% rename from sdk/python/tests/integration/registration/test_cli_local.py rename to sdk/python/tests/integration/registration/test_cli.py index d59d2ebf51..695c2ceaa4 100644 --- a/sdk/python/tests/integration/registration/test_cli_local.py +++ b/sdk/python/tests/integration/registration/test_cli.py @@ -1,47 +1,41 @@ import tempfile +import uuid from contextlib import contextmanager -from pathlib import Path +from pathlib import Path, PosixPath from textwrap import dedent -import assertpy import pytest - -from feast.feature_store import FeatureStore +import yaml +from assertpy import assertpy + +from feast import FeatureStore, RepoConfig +from tests.integration.feature_repos.repo_configuration import FULL_REPO_CONFIGS +from tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) from tests.utils.cli_utils import CliRunner, get_example_repo from tests.utils.online_read_write_test import basic_rw_test @pytest.mark.integration -def test_workflow() -> None: - """ - Test running apply on a sample repo, and make sure the infra gets created. 
- """ +@pytest.mark.parametrize("test_repo_config", FULL_REPO_CONFIGS) +def test_universal_cli(test_repo_config) -> None: + project = f"test_universal_cli_{str(uuid.uuid4()).replace('-', '')[:8]}" + runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - # Construct an example repo in a temporary dir + with tempfile.TemporaryDirectory() as repo_dir_name: + feature_store_yaml = make_feature_store_yaml( + project, test_repo_config, repo_dir_name + ) repo_path = Path(repo_dir_name) - data_path = Path(data_dir_name) repo_config = repo_path / "feature_store.yaml" - repo_config.write_text( - dedent( - f""" - project: foo - registry: {data_path / "registry.db"} - provider: local - online_store: - path: {data_path / "online_store.db"} - offline_store: - type: bigquery - """ - ) - ) + repo_config.write_text(dedent(feature_store_yaml)) repo_example = repo_path / "example.py" repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - result = runner.run(["apply"], cwd=repo_path) assertpy.assert_that(result.returncode).is_equal_to(0) @@ -65,6 +59,9 @@ def test_workflow() -> None: ) assertpy.assert_that(result.returncode).is_equal_to(0) + fs = FeatureStore(repo_path=str(repo_path)) + assertpy.assert_that(fs.list_feature_views()).is_length(3) + # entity & feature view describe commands should fail when objects don't exist result = runner.run(["entities", "describe", "foo"], cwd=repo_path) assertpy.assert_that(result.returncode).is_equal_to(1) @@ -76,7 +73,6 @@ def test_workflow() -> None: # Doing another apply should be a no op, and should not cause errors result = runner.run(["apply"], cwd=repo_path) assertpy.assert_that(result.returncode).is_equal_to(0) - basic_rw_test( FeatureStore(repo_path=str(repo_path), config=None), view_name="driver_locations", @@ -86,44 +82,29 @@ def test_workflow() -> None: assertpy.assert_that(result.returncode).is_equal_to(0) -@pytest.mark.integration -def test_non_local_feature_repo() -> None: - """ - Test running apply on a sample repo, and make sure the infra gets created. 
- """ - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name: - - # Construct an example repo in a temporary dir - repo_path = Path(repo_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - """ - project: foo - registry: data/registry.db - provider: local - online_store: - path: data/online_store.db - offline_store: - type: bigquery - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - - result = runner.run(["apply"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) - - fs = FeatureStore(repo_path=str(repo_path)) - assertpy.assert_that(fs.list_feature_views()).is_length(3) - - result = runner.run(["teardown"], cwd=repo_path) - assertpy.assert_that(result.returncode).is_equal_to(0) +def make_feature_store_yaml(project, test_repo_config, repo_dir_name: PosixPath): + offline_creator: DataSourceCreator = test_repo_config.offline_store_creator(project) + + offline_store_config = offline_creator.create_offline_store_config() + online_store = test_repo_config.online_store + + config = RepoConfig( + registry=str(Path(repo_dir_name) / "registry.db"), + project=project, + provider=test_repo_config.provider, + offline_store=offline_store_config, + online_store=online_store, + repo_path=str(Path(repo_dir_name)), + ) + config_dict = config.dict() + if ( + isinstance(config_dict["online_store"], dict) + and "redis_type" in config_dict["online_store"] + ): + del config_dict["online_store"]["redis_type"] + config_dict["repo_path"] = str(config_dict["repo_path"]) + + return yaml.safe_dump(config_dict) @contextmanager diff --git a/sdk/python/tests/integration/registration/test_cli_aws.py b/sdk/python/tests/integration/registration/test_cli_aws.py deleted file mode 100644 index df70b17eba..0000000000 --- a/sdk/python/tests/integration/registration/test_cli_aws.py +++ /dev/null @@ -1,64 +0,0 @@ -import random -import string -import tempfile -from pathlib import Path -from textwrap import dedent - -import pytest - -from feast.feature_store import FeatureStore -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - - -@pytest.mark.integration -def test_basic() -> None: - project_id = "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(10) - ) - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - - repo_path = Path(repo_dir_name) - data_path = Path(data_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: {project_id} - registry: {data_path / "registry.db"} - provider: aws - online_store: - type: dynamodb - region: us-west-2 - offline_store: - type: redshift - cluster_id: feast-integration-tests - region: us-west-2 - user: admin - database: feast - s3_staging_location: s3://feast-integration-tests/redshift - iam_role: arn:aws:iam::402087665549:role/redshift_s3_access_role - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - # Doing another apply should be a no op, and should not cause errors - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - basic_rw_test( - FeatureStore(repo_path=str(repo_path), config=None), - 
view_name="driver_locations", - ) - - result = runner.run(["teardown"], cwd=repo_path) - assert result.returncode == 0 diff --git a/sdk/python/tests/integration/registration/test_cli_gcp.py b/sdk/python/tests/integration/registration/test_cli_gcp.py deleted file mode 100644 index b4be581088..0000000000 --- a/sdk/python/tests/integration/registration/test_cli_gcp.py +++ /dev/null @@ -1,86 +0,0 @@ -import random -import string -import tempfile -from pathlib import Path -from textwrap import dedent - -import pytest - -from feast.feature_store import FeatureStore -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - - -@pytest.mark.integration -def test_basic() -> None: - project_id = "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(10) - ) - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - - repo_path = Path(repo_dir_name) - data_path = Path(data_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: {project_id} - registry: {data_path / "registry.db"} - provider: gcp - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - # Doing another apply should be a no op, and should not cause errors - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - basic_rw_test( - FeatureStore(repo_path=str(repo_path), config=None), - view_name="driver_locations", - ) - - result = runner.run(["teardown"], cwd=repo_path) - assert result.returncode == 0 - - -@pytest.mark.integration -def test_missing_bq_source_fail() -> None: - project_id = "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(10) - ) - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - - repo_path = Path(repo_dir_name) - data_path = Path(data_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: {project_id} - registry: {data_path / "registry.db"} - provider: gcp - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text( - get_example_repo("example_feature_repo_with_missing_bq_source.py") - ) - - returncode, output = runner.run_with_output(["apply"], cwd=repo_path) - assert returncode == 1 - assert b"DataSourceNotFoundException" in output diff --git a/sdk/python/tests/integration/registration/test_cli_redis.py b/sdk/python/tests/integration/registration/test_cli_redis.py deleted file mode 100644 index a4b146a29c..0000000000 --- a/sdk/python/tests/integration/registration/test_cli_redis.py +++ /dev/null @@ -1,101 +0,0 @@ -import random -import string -import tempfile -from pathlib import Path -from textwrap import dedent - -import pytest -import redis - -from feast.feature_store import FeatureStore -from tests.utils.cli_utils import CliRunner, get_example_repo -from tests.utils.online_read_write_test import basic_rw_test - - -@pytest.mark.integration -def test_basic() -> None: - project_id = "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(10) - ) - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - - repo_path = Path(repo_dir_name) 
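The Redshift YAML in the deleted AWS test carries the same information the universal framework now produces through each `DataSourceCreator.create_offline_store_config()`. Expressed as a config object it would look roughly like this, assuming `RedshiftOfflineStoreConfig` mirrors the YAML keys one-to-one:

```python
# Programmatic form of the Redshift block from the deleted AWS CLI test; a
# sketch that assumes the config class exposes these fields directly.
from feast.infra.offline_stores.redshift import RedshiftOfflineStoreConfig

offline_store_config = RedshiftOfflineStoreConfig(
    cluster_id="feast-integration-tests",
    region="us-west-2",
    user="admin",
    database="feast",
    s3_staging_location="s3://feast-integration-tests/redshift",
    iam_role="arn:aws:iam::402087665549:role/redshift_s3_access_role",
)
```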
- data_path = Path(data_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: {project_id} - registry: {data_path / "registry.db"} - provider: local - offline_store: - type: bigquery - online_store: - type: redis - connection_string: localhost:6379,db=0 - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_1.py")) - - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - # Doing another apply should be a no op, and should not cause errors - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - basic_rw_test( - FeatureStore(repo_path=str(repo_path), config=None), - view_name="driver_locations", - ) - - result = runner.run(["teardown"], cwd=repo_path) - assert result.returncode == 0 - - -@pytest.mark.integration -def test_connection_error() -> None: - project_id = "".join( - random.choice(string.ascii_lowercase + string.digits) for _ in range(10) - ) - runner = CliRunner() - with tempfile.TemporaryDirectory() as repo_dir_name, tempfile.TemporaryDirectory() as data_dir_name: - - repo_path = Path(repo_dir_name) - data_path = Path(data_dir_name) - - repo_config = repo_path / "feature_store.yaml" - - repo_config.write_text( - dedent( - f""" - project: {project_id} - registry: {data_path / "registry.db"} - provider: local - offline_store: - type: file - online_store: - type: redis - connection_string: localhost:6379,db=0= - """ - ) - ) - - repo_example = repo_path / "example.py" - repo_example.write_text(get_example_repo("example_feature_repo_2.py")) - - result = runner.run(["apply"], cwd=repo_path) - assert result.returncode == 0 - - # Redis does not support names for its databases. 
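The Redis tests configure the online store with a `connection_string`, and the `redis_type` field that `make_feature_store_yaml` strips out above comes from the same config model. A rough object-form sketch, assuming the field names of `RedisOnlineStoreConfig`:

```python
# Sketch of the Redis online store config used by these deleted tests; the
# RedisType enum values and field names are assumptions about the config model.
from feast.infra.online_stores.redis import RedisOnlineStoreConfig, RedisType

online_store_config = RedisOnlineStoreConfig(
    redis_type=RedisType.redis,  # "redis_cluster" being the other mode
    connection_string="localhost:6379,db=0",
)
```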
- with pytest.raises(redis.exceptions.ResponseError): - basic_rw_test( - FeatureStore(repo_path=str(repo_path), config=None), - view_name="driver_hourly_stats", - ) From fda5b55e91c000ae7274b2bcecae6cd073d1692e Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 13:03:10 -0700 Subject: [PATCH 15/19] Fix flaky connection to redshift data API (#1834) * Fix flaky connection to redshift data API Signed-off-by: Achal Shah * Lower max Signed-off-by: Achal Shah --- sdk/python/feast/infra/utils/aws_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 84fa611c15..aaeff96c6f 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -15,7 +15,7 @@ try: import boto3 from botocore.config import Config - from botocore.exceptions import ClientError + from botocore.exceptions import ClientError, ConnectionClosedError except ImportError as e: from feast.errors import FeastExtrasDependencyImportError @@ -50,6 +50,10 @@ def get_bucket_and_key(s3_path: str) -> Tuple[str, str]: return bucket, key +@retry( + wait=wait_exponential(multiplier=1, max=4), + retry=retry_if_exception_type(ConnectionClosedError), +) def execute_redshift_statement_async( redshift_data_client, cluster_id: str, database: str, user: str, query: str ) -> dict: From b0f38adc07497e089b8f019558761b85397fda62 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 14:12:09 -0700 Subject: [PATCH 16/19] Update historical retrieval integration test for on demand feature views (#1836) Signed-off-by: Achal Shah --- .../feature_repos/repo_configuration.py | 7 +++- .../test_universal_historical_retrieval.py | 38 +++++++++++++++---- .../online_store/test_universal_online.py | 6 +-- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index 47f16c2b75..c1d63c2f43 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -23,6 +23,7 @@ RedshiftDataSourceCreator, ) from tests.integration.feature_repos.universal.feature_views import ( + conv_rate_plus_100_feature_view, create_customer_daily_profile_feature_view, create_driver_hourly_stats_feature_view, ) @@ -126,11 +127,15 @@ def construct_universal_data_sources( def construct_universal_feature_views( data_sources: Dict[str, DataSource], ) -> Dict[str, FeatureView]: + driver_hourly_stats = create_driver_hourly_stats_feature_view( + data_sources["driver"] + ) return { "customer": create_customer_daily_profile_feature_view( data_sources["customer"] ), - "driver": create_driver_hourly_stats_feature_view(data_sources["driver"]), + "driver": driver_hourly_stats, + "driver_odfv": conv_rate_plus_100_feature_view({"driver": driver_hourly_stats}), } diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 75cd5bbf70..1d3ae7edb5 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -134,6 +134,9 @@ def get_expected_training_df( for col, typ in expected_column_types.items(): expected_df[col] = expected_df[col].astype(typ) + conv_feature_name = 
"driver_stats__conv_rate" if full_feature_names else "conv_rate" + expected_df["conv_rate_plus_100"] = expected_df[conv_feature_name] + 100 + return expected_df @@ -150,13 +153,14 @@ def test_historical_features(environment, universal_data_sources, full_feature_n datasets["driver"], datasets["orders"], ) - customer_fv, driver_fv = ( + customer_fv, driver_fv, driver_odfv = ( feature_views["customer"], feature_views["driver"], + feature_views["driver_odfv"], ) feast_objects = [] - feast_objects.extend([customer_fv, driver_fv, driver(), customer()]) + feast_objects.extend([customer_fv, driver_fv, driver_odfv, driver(), customer()]) store.apply(feast_objects) entity_df_query = None @@ -188,6 +192,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n "customer_profile:current_balance", "customer_profile:avg_passenger_count", "customer_profile:lifetime_trip_count", + "conv_rate_plus_100", ], full_feature_names=full_feature_names, ) @@ -221,14 +226,21 @@ def test_historical_features(environment, universal_data_sources, full_feature_n actual_df_from_sql_entities, expected_df, check_dtype=False, ) + expected_df_from_arrow = expected_df.drop(columns=["conv_rate_plus_100"]) table_from_sql_entities = job_from_sql.to_arrow() df_from_sql_entities = ( - table_from_sql_entities.to_pandas()[expected_df.columns] + table_from_sql_entities.to_pandas()[expected_df_from_arrow.columns] .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"]) .drop_duplicates() .reset_index(drop=True) ) - assert_frame_equal(actual_df_from_sql_entities, df_from_sql_entities) + + for col in df_from_sql_entities.columns: + expected_df_from_arrow[col] = expected_df_from_arrow[col].astype( + df_from_sql_entities[col].dtype + ) + + assert_frame_equal(expected_df_from_arrow, df_from_sql_entities) job_from_df = store.get_historical_features( entity_df=orders_df, @@ -238,6 +250,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n "customer_profile:current_balance", "customer_profile:avg_passenger_count", "customer_profile:lifetime_trip_count", + "conv_rate_plus_100", ], full_feature_names=full_feature_names, ) @@ -250,7 +263,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n print(str(f"Time to execute job_from_df.to_df() = '{(end_time - start_time)}'\n")) assert sorted(expected_df.columns) == sorted(actual_df_from_df_entities.columns) - expected_df = ( + expected_df: pd.DataFrame = ( expected_df.sort_values( by=[event_timestamp, "order_id", "driver_id", "customer_id"] ) @@ -268,11 +281,20 @@ def test_historical_features(environment, universal_data_sources, full_feature_n expected_df, actual_df_from_df_entities, check_dtype=False, ) - table_from_df_entities = job_from_df.to_arrow().to_pandas() + # on demand features is only plumbed through to to_df for now. 
+ table_from_df_entities: pd.DataFrame = job_from_df.to_arrow().to_pandas() + actual_df_from_df_entities_for_table = actual_df_from_df_entities.drop( + columns=["conv_rate_plus_100"] + ) + assert "conv_rate_plus_100" not in table_from_df_entities.columns + + columns_expected_in_table = expected_df.columns.tolist() + columns_expected_in_table.remove("conv_rate_plus_100") + table_from_df_entities = ( - table_from_df_entities[expected_df.columns] + table_from_df_entities[columns_expected_in_table] .sort_values(by=[event_timestamp, "order_id", "driver_id", "customer_id"]) .drop_duplicates() .reset_index(drop=True) ) - assert_frame_equal(actual_df_from_df_entities, table_from_df_entities) + assert_frame_equal(actual_df_from_df_entities_for_table, table_from_df_entities) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index d86af4521c..fc1404298b 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -3,9 +3,6 @@ import pandas as pd import pytest -from integration.feature_repos.universal.feature_views import ( - conv_rate_plus_100_feature_view, -) from tests.integration.feature_repos.repo_configuration import ( construct_universal_feature_views, @@ -20,10 +17,9 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name fs = environment.feature_store entities, datasets, data_sources = universal_data_sources feature_views = construct_universal_feature_views(data_sources) - odfv = conv_rate_plus_100_feature_view(inputs={"driver": feature_views["driver"]}) feast_objects = [] feast_objects.extend(feature_views.values()) - feast_objects.extend([odfv, driver(), customer()]) + feast_objects.extend([driver(), customer()]) fs.apply(feast_objects) fs.materialize(environment.start_date, environment.end_date) From f829232d8cfb247e023e6ad3a91a53ba8106eef3 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Fri, 3 Sep 2021 16:54:09 -0700 Subject: [PATCH 17/19] Simplify _python_value_to_proto_value by looking up values in a dict (#1837) Signed-off-by: Achal Shah --- sdk/python/feast/type_map.py | 170 ++++++++++++----------------------- 1 file changed, 58 insertions(+), 112 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index c65ec6e14c..3e4567c607 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -14,7 +14,7 @@ import re from datetime import datetime -from typing import Any, Dict, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import numpy as np import pandas as pd @@ -164,8 +164,43 @@ def _type_err(item, dtype): raise ValueError(f'Value "{item}" is of type {type(item)} not of type {dtype}') -# TODO(achals): Simplify this method and remove the noqa. 
-def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: # noqa: C901 +PYTHON_LIST_VALUE_TYPE_TO_PROTO_VALUE: Dict[Any, Tuple[Any, str, List[Any]]] = { + ValueType.FLOAT_LIST: ( + FloatList, + "float_list_val", + [np.float32, np.float64, float], + ), + ValueType.DOUBLE_LIST: ( + DoubleList, + "double_list_val", + [np.float64, np.float32, float], + ), + ValueType.INT32_LIST: (Int32List, "int32_list_val", [np.int32, int]), + ValueType.INT64_LIST: (Int64List, "int64_list_val", [np.int64, np.int32, int]), + ValueType.UNIX_TIMESTAMP_LIST: ( + Int64List, + "int64_list_val", + [np.int64, np.int32, int], + ), + ValueType.STRING_LIST: (StringList, "string_list_val", [np.str_, str]), + ValueType.BOOL_LIST: (BoolList, "bool_list_val", [np.bool_, bool]), + ValueType.BYTES_LIST: (BytesList, "bytes_list_val", [np.bytes_, bytes]), +} + +PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: Dict[ + Any, Tuple[str, Any, Optional[Set[Any]]] +] = { + ValueType.INT32: ("int32_val", lambda x: int(x), None), + ValueType.INT64: ("int64_val", lambda x: int(x), None), + ValueType.FLOAT: ("float_val", lambda x: float(x), None), + ValueType.DOUBLE: ("double_val", lambda x: x, {float, np.float64}), + ValueType.STRING: ("string_val", lambda x: str(x), None), + ValueType.BYTES: ("bytes_val", lambda x: x, {bytes}), + ValueType.BOOL: ("bool_val", lambda x: x, {bool}), +} + + +def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: """ Converts a Python (native, pandas) value to a Feast Proto Value based on a provided value type @@ -180,130 +215,41 @@ def _python_value_to_proto_value(feast_value_type, value) -> ProtoValue: # noqa # Detect list type and handle separately if "list" in feast_value_type.name.lower(): - - if feast_value_type == ValueType.FLOAT_LIST: - return ProtoValue( - float_list_val=FloatList( - val=[ - item - if type(item) in [np.float32, np.float64, float] - else _type_err(item, np.float32) - for item in value - ] - ) - ) - - if feast_value_type == ValueType.DOUBLE_LIST: - return ProtoValue( - double_list_val=DoubleList( + if feast_value_type in PYTHON_LIST_VALUE_TYPE_TO_PROTO_VALUE: + proto_type, field_name, valid_types = PYTHON_LIST_VALUE_TYPE_TO_PROTO_VALUE[ + feast_value_type + ] + f = { + field_name: proto_type( val=[ item - if type(item) in [np.float64, np.float32, float] - else _type_err(item, np.float64) + if type(item) in valid_types + else _type_err(item, valid_types[0]) for item in value ] ) - ) - - if feast_value_type == ValueType.INT32_LIST: - return ProtoValue( - int32_list_val=Int32List( - val=[ - item - if type(item) in [np.int32, int] - else _type_err(item, np.int32) - for item in value - ] - ) - ) - - if feast_value_type == ValueType.INT64_LIST: - return ProtoValue( - int64_list_val=Int64List( - val=[ - item - if type(item) in [np.int64, np.int32, int] - else _type_err(item, np.int64) - for item in value - ] - ) - ) - - if feast_value_type == ValueType.UNIX_TIMESTAMP_LIST: - return ProtoValue( - int64_list_val=Int64List( - val=[ - item - if type(item) in [np.int64, np.int32, int] - else _type_err(item, np.int64) - for item in value - ] - ) - ) - - if feast_value_type == ValueType.STRING_LIST: - return ProtoValue( - string_list_val=StringList( - val=[ - item - if type(item) in [np.str_, str] - else _type_err(item, np.str_) - for item in value - ] - ) - ) - - if feast_value_type == ValueType.BOOL_LIST: - return ProtoValue( - bool_list_val=BoolList( - val=[ - item - if type(item) in [np.bool_, bool] - else _type_err(item, np.bool_) - for item in value - ] - ) - ) - - 
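The two lookup tables above collapse the long if/elif chain into a single generic code path, so behavior should be unchanged. A small behavioral sketch (calling the private helper directly is for illustration only, and exact import paths may vary between Feast versions):

```python
# Hedged check that the table-driven conversion matches the old behavior.
from feast.type_map import _python_value_to_proto_value
from feast.value_type import ValueType

proto = _python_value_to_proto_value(ValueType.INT64_LIST, [1, 2, 3])
assert list(proto.int64_list_val.val) == [1, 2, 3]

proto = _python_value_to_proto_value(ValueType.STRING, "driver_1001")
assert proto.string_val == "driver_1001"

# A mistyped element inside a list should still fail fast via _type_err.
try:
    _python_value_to_proto_value(ValueType.INT64_LIST, [1, "2", 3])
except ValueError:
    pass  # expected
```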
if feast_value_type == ValueType.BYTES_LIST: - return ProtoValue( - bytes_list_val=BytesList( - val=[ - item - if type(item) in [np.bytes_, bytes] - else _type_err(item, np.bytes_) - for item in value - ] - ) - ) - + } + return ProtoValue(**f) # Handle scalar types below else: if pd.isnull(value): return ProtoValue() - elif feast_value_type == ValueType.INT32: - return ProtoValue(int32_val=int(value)) - elif feast_value_type == ValueType.INT64: - return ProtoValue(int64_val=int(value)) elif feast_value_type == ValueType.UNIX_TIMESTAMP: if isinstance(value, datetime): return ProtoValue(int64_val=int(value.timestamp())) elif isinstance(value, Timestamp): return ProtoValue(int64_val=int(value.ToSeconds())) return ProtoValue(int64_val=int(value)) - elif feast_value_type == ValueType.FLOAT: - return ProtoValue(float_val=float(value)) - elif feast_value_type == ValueType.DOUBLE: - assert type(value) in [float, np.float64] - return ProtoValue(double_val=value) - elif feast_value_type == ValueType.STRING: - return ProtoValue(string_val=str(value)) - elif feast_value_type == ValueType.BYTES: - assert type(value) is bytes - return ProtoValue(bytes_val=value) - elif feast_value_type == ValueType.BOOL: - assert type(value) is bool - return ProtoValue(bool_val=value) + elif feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: + ( + field_name, + func, + valid_scalar_types, + ) = PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE[feast_value_type] + if valid_scalar_types: + assert type(value) in valid_scalar_types + kwargs = {field_name: func(value)} + return ProtoValue(**kwargs) raise Exception(f"Unsupported data type: ${str(type(value))}") From fd255cae7153cd44432f172b1e5c4738e4a7d583 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 5 Sep 2021 00:46:23 -0400 Subject: [PATCH 18/19] Enable the types test to run on all compatible environments (#1840) * Adding fixtures for type test Signed-off-by: Danny Chiao * Refactor to use fixtures Signed-off-by: Danny Chiao * Add issue on github Signed-off-by: Danny Chiao --- .../feast/infra/online_stores/sqlite.py | 5 +- sdk/python/tests/conftest.py | 6 - .../universal/data_sources/bigquery.py | 1 + .../registration/test_universal_types.py | 381 ++++++++++-------- 4 files changed, 216 insertions(+), 177 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/sqlite.py b/sdk/python/feast/infra/online_stores/sqlite.py index dbd837c5df..4be90257dd 100644 --- a/sdk/python/feast/infra/online_stores/sqlite.py +++ b/sdk/python/feast/infra/online_stores/sqlite.py @@ -192,7 +192,10 @@ def teardown( tables: Sequence[Union[FeatureTable, FeatureView]], entities: Sequence[Entity], ): - os.unlink(self._get_db_path(config)) + try: + os.unlink(self._get_db_path(config)) + except FileNotFoundError: + pass def _table_id(project: str, table: Union[FeatureTable, FeatureView]) -> str: diff --git a/sdk/python/tests/conftest.py b/sdk/python/tests/conftest.py index 55bdeb3a7d..bf8f4d83a5 100644 --- a/sdk/python/tests/conftest.py +++ b/sdk/python/tests/conftest.py @@ -132,9 +132,3 @@ def e2e_data_sources(environment: Environment): yield df, data_source environment.data_source_creator.teardown() - - -@pytest.fixture(params=FULL_REPO_CONFIGS, scope="session") -def type_test_environment(request): - with construct_test_environment(request.param) as e: - yield e diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py index 9b702ebe6c..228e9959d5 100644 --- 
a/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/bigquery.py @@ -42,6 +42,7 @@ def teardown(self): self.dataset_id, delete_contents=True, not_found_ok=True ) print(f"Deleted dataset '{self.dataset_id}'") + self.dataset = None def create_offline_store_config(self): return BigQueryOfflineStoreConfig() diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 5390b72cc0..f6fd942ef9 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -1,4 +1,6 @@ +from dataclasses import dataclass from datetime import datetime, timedelta +from typing import List import pandas as pd import pytest @@ -8,186 +10,210 @@ from feast.value_type import ValueType from tests.data.data_creator import create_dataset, get_feature_values_for_dtype from tests.integration.feature_repos.repo_configuration import ( + FULL_REPO_CONFIGS, + REDIS_CONFIG, IntegrationTestRepoConfig, construct_test_environment, ) -from tests.integration.feature_repos.universal.data_sources.bigquery import ( - BigQueryDataSourceCreator, -) from tests.integration.feature_repos.universal.entities import driver from tests.integration.feature_repos.universal.feature_views import driver_feature_view -def entity_feature_types_ids(entity_type: ValueType, feature_dtype: str): - return f"entity_type:{str(entity_type)}-feature_dtype:{feature_dtype}" +def populate_test_configs(offline: bool): + entity_type_feature_dtypes = [ + (ValueType.INT32, "int32"), + (ValueType.INT64, "int64"), + (ValueType.STRING, "float"), + (ValueType.STRING, "bool"), + ] + configs: List[TypeTestConfig] = [] + for test_repo_config in FULL_REPO_CONFIGS: + for entity_type, feature_dtype in entity_type_feature_dtypes: + for feature_is_list in [True, False]: + # Redshift doesn't support list features + if test_repo_config.provider == "aws" and feature_is_list is True: + continue + # For offline tests, don't need to vary for online store + if offline and test_repo_config.online_store == REDIS_CONFIG: + continue + # TODO(https://github.com/feast-dev/feast/issues/1839): Fix BQ materialization of list features + if ( + not offline + and test_repo_config.provider == "gcp" + and feature_is_list is True + ): + continue + configs.append( + TypeTestConfig( + entity_type=entity_type, + feature_dtype=feature_dtype, + feature_is_list=feature_is_list, + test_repo_config=test_repo_config, + ) + ) + return configs -entity_type_feature_dtypes = [ - (ValueType.INT32, "int32"), - (ValueType.INT64, "int64"), - (ValueType.STRING, "float"), - (ValueType.STRING, "bool"), -] -GCP_CONFIG = IntegrationTestRepoConfig( - provider="gcp", - offline_store_creator=BigQueryDataSourceCreator, - online_store="datastore", -) +@dataclass(frozen=True, repr=True) +class TypeTestConfig: + entity_type: ValueType + feature_dtype: str + feature_is_list: bool + test_repo_config: IntegrationTestRepoConfig -# TODO: change parametrization to allow for other providers aside from gcp -@pytest.mark.integration -@pytest.mark.parametrize( - "entity_type,feature_dtype", - entity_type_feature_dtypes, - ids=[ - entity_feature_types_ids(entity_type, feature_dtype) - for entity_type, feature_dtype in entity_type_feature_dtypes - ], +OFFLINE_TYPE_TEST_CONFIGS: List[TypeTestConfig] = populate_test_configs(offline=True) +ONLINE_TYPE_TEST_CONFIGS: 
List[TypeTestConfig] = populate_test_configs(offline=False) + + +@pytest.fixture( + params=OFFLINE_TYPE_TEST_CONFIGS, + scope="session", + ids=[str(c) for c in OFFLINE_TYPE_TEST_CONFIGS], ) -@pytest.mark.parametrize( - "feature_is_list", [False], ids=lambda v: f"feature_is_list:{str(v)}" +def offline_types_test_fixtures(request): + yield from get_fixtures(request) + + +@pytest.fixture( + params=ONLINE_TYPE_TEST_CONFIGS, + scope="session", + ids=[str(c) for c in ONLINE_TYPE_TEST_CONFIGS], ) -def test_entity_inference_types_match(entity_type, feature_dtype, feature_is_list): - with construct_test_environment(GCP_CONFIG) as environment: - df = create_dataset(entity_type, feature_dtype, feature_is_list) - data_source = environment.data_source_creator.create_data_source( +def online_types_test_fixtures(request): + yield from get_fixtures(request) + + +def get_fixtures(request): + config: TypeTestConfig = request.param + # Lower case needed because Redshift lower-cases all table names + test_project_id = f"{config.entity_type}{config.feature_dtype}{config.feature_is_list}".replace( + ".", "" + ).lower() + with construct_test_environment( + test_repo_config=config.test_repo_config, + test_suite_name=f"test_{test_project_id}", + ) as type_test_environment: + config = request.param + df = create_dataset( + config.entity_type, config.feature_dtype, config.feature_is_list + ) + data_source = type_test_environment.data_source_creator.create_data_source( df, - destination_name=environment.feature_store.project, + destination_name=type_test_environment.feature_store.project, field_mapping={"ts_1": "ts"}, ) - fv = create_feature_view(feature_dtype, feature_is_list, data_source) - fs = environment.feature_store - - try: - # Don't specify value type in entity to force inference - entity = driver(value_type=ValueType.UNKNOWN) - fs.apply([fv, entity]) - - entities = fs.list_entities() - entity_type_to_expected_inferred_entity_type = { - ValueType.INT32: ValueType.INT64, - ValueType.INT64: ValueType.INT64, - ValueType.FLOAT: ValueType.DOUBLE, - ValueType.STRING: ValueType.STRING, - } - for entity in entities: - assert ( - entity.value_type - == entity_type_to_expected_inferred_entity_type[entity_type] - ) - finally: - environment.data_source_creator.teardown() + fv = create_feature_view( + config.feature_dtype, config.feature_is_list, data_source + ) + yield type_test_environment, config, data_source, fv + type_test_environment.data_source_creator.teardown() @pytest.mark.integration -@pytest.mark.parametrize( - "entity_type,feature_dtype", - entity_type_feature_dtypes, - ids=[ - entity_feature_types_ids(entity_type, feature_dtype) - for entity_type, feature_dtype in entity_type_feature_dtypes - ], -) -@pytest.mark.parametrize( - "feature_is_list", [True, False], ids=lambda v: f"feature_is_list:{str(v)}" -) -def test_feature_get_historical_features_types_match( - entity_type, feature_dtype, feature_is_list -): - with construct_test_environment(GCP_CONFIG) as environment: - df = create_dataset(entity_type, feature_dtype, feature_is_list) - data_source = environment.data_source_creator.create_data_source( - df, - destination_name=environment.feature_store.project, - field_mapping={"ts_1": "ts"}, +def test_entity_inference_types_match(offline_types_test_fixtures): + environment, config, data_source, fv = offline_types_test_fixtures + fs = environment.feature_store + + # Don't specify value type in entity to force inference + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) + + entities = 
fs.list_entities() + entity_type_to_expected_inferred_entity_type = { + ValueType.INT32: ValueType.INT64, + ValueType.INT64: ValueType.INT64, + ValueType.FLOAT: ValueType.DOUBLE, + ValueType.STRING: ValueType.STRING, + } + for entity in entities: + assert ( + entity.value_type + == entity_type_to_expected_inferred_entity_type[config.entity_type] ) - fv = create_feature_view(feature_dtype, feature_is_list, data_source) - fs = environment.feature_store - entity = driver() - try: - fs.apply([fv, entity]) - - features = [f"{fv.name}:value"] - df = pd.DataFrame() - df["driver_id"] = ["1", "3"] if entity_type == ValueType.STRING else [1, 3] - now = datetime.utcnow() - ts = pd.Timestamp(now).round("ms") - df["ts"] = [ - ts - timedelta(hours=4), - ts - timedelta(hours=2), - ] - historical_features = fs.get_historical_features( - entity_df=df, features=features, - ) - # TODO(adchia): pandas doesn't play well with nan values in ints. BQ will also coerce to floats if there are NaNs - historical_features_df = historical_features.to_df() - print(historical_features_df) - if feature_is_list: - assert_feature_list_types(feature_dtype, historical_features_df) - else: - assert_expected_historical_feature_types( - feature_dtype, historical_features_df - ) - assert_expected_arrow_types( - feature_dtype, feature_is_list, historical_features - ) - finally: - environment.data_source_creator.teardown() + +@pytest.mark.integration +def test_feature_get_historical_features_types_match(offline_types_test_fixtures): + environment, config, data_source, fv = offline_types_test_fixtures + fs = environment.feature_store + fv = create_feature_view(config.feature_dtype, config.feature_is_list, data_source) + entity = driver() + fs.apply([fv, entity]) + + features = [f"{fv.name}:value"] + entity_df = pd.DataFrame() + entity_df["driver_id"] = ( + ["1", "3"] if config.entity_type == ValueType.STRING else [1, 3] + ) + now = datetime.utcnow() + ts = pd.Timestamp(now).round("ms") + entity_df["ts"] = [ + ts - timedelta(hours=4), + ts - timedelta(hours=2), + ] + historical_features = fs.get_historical_features( + entity_df=entity_df, features=features, + ) + # Note: Pandas doesn't play well with nan values in ints. 
BQ will also coerce to floats if there are NaNs + historical_features_df = historical_features.to_df() + print(historical_features_df) + + if config.feature_is_list: + assert_feature_list_types( + environment.test_repo_config.provider, + config.feature_dtype, + historical_features_df, + ) + else: + assert_expected_historical_feature_types( + config.feature_dtype, historical_features_df + ) + assert_expected_arrow_types( + environment.test_repo_config.provider, + config.feature_dtype, + config.feature_is_list, + historical_features, + ) @pytest.mark.integration -@pytest.mark.parametrize( - "entity_type,feature_dtype", - entity_type_feature_dtypes, - ids=[ - entity_feature_types_ids(entity_type, feature_dtype) - for entity_type, feature_dtype in entity_type_feature_dtypes - ], -) -@pytest.mark.parametrize( - "feature_is_list", [False], ids=lambda v: f"feature_is_list:{str(v)}" -) -def test_feature_get_online_features_types_match( - entity_type, feature_dtype, feature_is_list -): - with construct_test_environment(GCP_CONFIG) as environment: - df = create_dataset(entity_type, feature_dtype, feature_is_list) - data_source = environment.data_source_creator.create_data_source( - df, - destination_name=environment.feature_store.project, - field_mapping={"ts_1": "ts"}, +def test_feature_get_online_features_types_match(online_types_test_fixtures): + environment, config, data_source, fv = online_types_test_fixtures + fv = create_feature_view(config.feature_dtype, config.feature_is_list, data_source) + fs = environment.feature_store + features = [fv.name + ":value"] + entity = driver(value_type=ValueType.UNKNOWN) + fs.apply([fv, entity]) + fs.materialize(environment.start_date, environment.end_date) + + driver_id_value = "1" if config.entity_type == ValueType.STRING else 1 + online_features = fs.get_online_features( + features=features, entity_rows=[{"driver": driver_id_value}], + ).to_dict() + + feature_list_dtype_to_expected_online_response_value_type = { + "int32": "int", + "int64": "int", + "float": "float", + "string": "str", + "bool": "bool", + } + if config.feature_is_list: + assert type(online_features["value"][0]).__name__ == "list" + assert ( + type(online_features["value"][0][0]).__name__ + == feature_list_dtype_to_expected_online_response_value_type[ + config.feature_dtype + ] + ) + else: + assert ( + type(online_features["value"][0]).__name__ + == feature_list_dtype_to_expected_online_response_value_type[ + config.feature_dtype + ] ) - fv = create_feature_view(feature_dtype, feature_is_list, data_source) - fs = environment.feature_store - - features = [fv.name + ":value"] - entity = driver(value_type=ValueType.UNKNOWN) - - try: - fs.apply([fv, entity]) - fs.materialize(environment.start_date, environment.end_date) - driver_id_value = "1" if entity_type == ValueType.STRING else 1 - online_features = fs.get_online_features( - features=features, entity_rows=[{"driver": driver_id_value}], - ).to_dict() - - feature_list_dtype_to_expected_online_response_value_type = { - "int32": "int", - "int64": "int", - "float": "float", - "string": "str", - "bool": "bool", - } - assert ( - type(online_features["value"][0]).__name__ - == feature_list_dtype_to_expected_online_response_value_type[ - feature_dtype - ] - ) - finally: - environment.data_source_creator.teardown() def create_feature_view(feature_dtype, feature_is_list, data_source): @@ -208,18 +234,19 @@ def assert_expected_historical_feature_types( "int32": "int64", "int64": "int64", "float": "float64", - "string": "object", - "bool": "bool", + 
"string": {"string", "object"}, + "bool": {"bool", "object"}, } assert ( str(historical_features_df.dtypes["value"]) - == feature_dtype_to_expected_historical_feature_dtype[feature_dtype] + in feature_dtype_to_expected_historical_feature_dtype[feature_dtype] ) -def assert_feature_list_types(feature_dtype: str, historical_features_df: pd.DataFrame): +def assert_feature_list_types( + provider: str, feature_dtype: str, historical_features_df: pd.DataFrame +): print("Asserting historical feature list types") - # Note, these expected values only hold for BQ feature_list_dtype_to_expected_historical_feature_list_dtype = { "int32": "int", "int64": "int", @@ -228,15 +255,23 @@ def assert_feature_list_types(feature_dtype: str, historical_features_df: pd.Dat "bool": "bool", } assert str(historical_features_df.dtypes["value"]) == "object" - # Note, this struct schema is only true for BQ and not for other stores - assert ( - type(historical_features_df.value[0]["list"][0]["item"]).__name__ - == feature_list_dtype_to_expected_historical_feature_list_dtype[feature_dtype] - ) + if provider == "gcp": + assert ( + feature_list_dtype_to_expected_historical_feature_list_dtype[feature_dtype] + in type(historical_features_df.value[0]["list"][0]["item"]).__name__ + ) + else: + assert ( + feature_list_dtype_to_expected_historical_feature_list_dtype[feature_dtype] + in type(historical_features_df.value[0][0]).__name__ + ) def assert_expected_arrow_types( - feature_dtype: str, feature_is_list: bool, historical_features: RetrievalJob + provider: str, + feature_dtype: str, + feature_is_list: bool, + historical_features: RetrievalJob, ): print("Asserting historical feature arrow types") historical_features_arrow = historical_features.to_arrow() @@ -252,10 +287,16 @@ def assert_expected_arrow_types( feature_dtype ] if feature_is_list: - assert ( - str(historical_features_arrow.schema.field_by_name("value").type) - == f"struct> not null>" - ) + if provider == "gcp": + assert ( + str(historical_features_arrow.schema.field_by_name("value").type) + == f"struct> not null>" + ) + else: + assert ( + str(historical_features_arrow.schema.field_by_name("value").type) + == f"list" + ) else: assert ( str(historical_features_arrow.schema.field_by_name("value").type) From 17d18748a5fedf8579663286622911bd2e14790f Mon Sep 17 00:00:00 2001 From: Willem Pienaar Date: Tue, 7 Sep 2021 14:15:34 +0000 Subject: [PATCH 19/19] GitBook: [master] 62 pages modified --- docs/reference/data-sources/file.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/reference/data-sources/file.md b/docs/reference/data-sources/file.md index 5dd5cb80b1..80e262aa9f 100644 --- a/docs/reference/data-sources/file.md +++ b/docs/reference/data-sources/file.md @@ -4,6 +4,10 @@ File data sources allow for the retrieval of historical feature values from files on disk for building training datasets, as well as for materializing features into an online store. +{% hint style="warning" %} +FileSource is meant for development purposes only and is not optimized for production use. +{% endhint %} + ## Example ```python