From 2ff317f9f6fba6c1a780c74ab7cd92496d4a5a75 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Tue, 14 Jun 2022 18:52:12 -0700 Subject: [PATCH 1/7] Enable materialization for stream feature views Signed-off-by: Felix Wang --- sdk/python/feast/feature_store.py | 104 +++++++++++++++++++----------- 1 file changed, 67 insertions(+), 37 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 7a5a8299eb..c6877ea074 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -265,6 +265,19 @@ def _list_feature_views( feature_views.append(fv) return feature_views + def _list_stream_feature_views( + self, allow_cache: bool = False, hide_dummy_entity: bool = True, + ) -> List[StreamFeatureView]: + stream_feature_views = [] + for sfv in self._registry.list_stream_feature_views( + self.project, allow_cache=allow_cache ): + if hide_dummy_entity and sfv.entities[0] == DUMMY_ENTITY_NAME: + sfv.entities = [] + sfv.entity_columns = [] + stream_feature_views.append(sfv) + return stream_feature_views + @log_exceptions_and_usage def list_on_demand_feature_views( self, allow_cache: bool = False @@ -289,9 +302,7 @@ def list_stream_feature_views( Returns: A list of stream feature views. """ - return self._registry.list_stream_feature_views( - self.project, allow_cache=allow_cache - ) + return self._list_stream_feature_views(allow_cache) @log_exceptions_and_usage def list_data_sources(self, allow_cache: bool = False) -> List[DataSource]: @@ -574,6 +585,53 @@ def _make_inferences( for feature_service in feature_services_to_update: feature_service.infer_features(fvs_to_update=fvs_to_update_map) + def _get_feature_views_to_materialize( + self, feature_views: Optional[List[str]], + ) -> List[FeatureView]: + """ + Returns the list of feature views that should be materialized. + + If no feature views are specified, all feature views and stream feature views that are configured for online serving will be returned. + + Args: + feature_views: List of names of feature views to materialize. + + Raises: + FeatureViewNotFoundException: One of the specified feature views could not be found. + ValueError: One of the specified feature views is not configured for materialization. + """ + feature_views_to_materialize: List[FeatureView] = [] + + if feature_views is None: + feature_views_to_materialize = self._list_feature_views( + hide_dummy_entity=False + ) + feature_views_to_materialize = [ + fv for fv in feature_views_to_materialize if fv.online + ] + stream_feature_views_to_materialize = self._list_stream_feature_views( + hide_dummy_entity=False + ) + feature_views_to_materialize += [ + sfv for sfv in stream_feature_views_to_materialize if sfv.online + ] + else: + for name in feature_views: + try: + feature_view = self._get_feature_view(name, hide_dummy_entity=False) + except FeatureViewNotFoundException: + feature_view = self._get_stream_feature_view( + name, hide_dummy_entity=False + ) + + if not feature_view.online: + raise ValueError( + f"FeatureView {feature_view.name} is not configured to be served online." + ) + feature_views_to_materialize.append(feature_view) + + return feature_views_to_materialize + @log_exceptions_and_usage def _plan( self, desired_repo_contents: RepoContents @@ -1151,23 +1209,9 @@ def materialize_incremental( ... 
""" - feature_views_to_materialize: List[FeatureView] = [] - if feature_views is None: - feature_views_to_materialize = self._list_feature_views( - hide_dummy_entity=False - ) - feature_views_to_materialize = [ - fv for fv in feature_views_to_materialize if fv.online - ] - else: - for name in feature_views: - feature_view = self._get_feature_view(name, hide_dummy_entity=False) - if not feature_view.online: - raise ValueError( - f"FeatureView {feature_view.name} is not configured to be served online." - ) - feature_views_to_materialize.append(feature_view) - + feature_views_to_materialize = self._get_feature_views_to_materialize( + feature_views + ) _print_materialization_log( None, end_date, @@ -1258,23 +1302,9 @@ def materialize( f"The given start_date {start_date} is greater than the given end_date {end_date}." ) - feature_views_to_materialize: List[FeatureView] = [] - if feature_views is None: - feature_views_to_materialize = self._list_feature_views( - hide_dummy_entity=False - ) - feature_views_to_materialize = [ - fv for fv in feature_views_to_materialize if fv.online - ] - else: - for name in feature_views: - feature_view = self._get_feature_view(name, hide_dummy_entity=False) - if not feature_view.online: - raise ValueError( - f"FeatureView {feature_view.name} is not configured to be served online." - ) - feature_views_to_materialize.append(feature_view) - + feature_views_to_materialize = self._get_feature_views_to_materialize( + feature_views + ) _print_materialization_log( start_date, end_date, From b7416eb3420b324e68c6b3302ef7ac05f43b2184 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Wed, 15 Jun 2022 11:29:35 -0700 Subject: [PATCH 2/7] Fix bugs with stream feature view materialization Signed-off-by: Felix Wang --- sdk/python/feast/feature_store.py | 8 ++- sdk/python/feast/inference.py | 4 ++ sdk/python/feast/registry.py | 24 +++++++ sdk/python/feast/stream_feature_view.py | 68 ++++++++++++------- .../feature_repos/universal/feature_views.py | 3 +- sdk/python/tests/unit/test_feature_view.py | 68 ------------------- sdk/python/tests/unit/test_feature_views.py | 52 ++++++++++++++ 7 files changed, 132 insertions(+), 95 deletions(-) delete mode 100644 sdk/python/tests/unit/test_feature_view.py diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index c6877ea074..31fcbf7d42 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -569,6 +569,9 @@ def _make_inferences( update_feature_views_with_inferred_features_and_entities( views_to_update, entities + entities_to_update, self.config ) + update_feature_views_with_inferred_features_and_entities( + sfvs_to_update, entities + entities_to_update, self.config + ) # TODO(kevjumba): Update schema inferrence for sfv in sfvs_to_update: if not sfv.schema: @@ -931,8 +934,8 @@ def apply( self._get_provider().update_infra( project=self.project, - tables_to_delete=views_to_delete if not partial else [], - tables_to_keep=views_to_update, + tables_to_delete=views_to_delete + sfvs_to_delete if not partial else [], + tables_to_keep=views_to_update + sfvs_to_update, entities_to_delete=entities_to_delete if not partial else [], entities_to_keep=entities_to_update, partial=partial, @@ -1357,6 +1360,7 @@ def push( from feast.data_source import PushSource all_fvs = self.list_feature_views(allow_cache=allow_registry_cache) + all_fvs += self.list_stream_feature_views(allow_cache=allow_registry_cache) fvs_with_push_sources = { fv diff --git a/sdk/python/feast/inference.py 
b/sdk/python/feast/inference.py index bf9af26b82..c09ac9efc0 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -99,6 +99,10 @@ def update_feature_views_with_inferred_features_and_entities( other columns except designated timestamp columns are considered to be feature columns. If the feature view already has features, feature inference is skipped. + Note that this inference logic currently does not take transformations into account. For + example, even if a stream feature view has a transformation, this method assumes that the + batch source contains transformed data with the correct final schema. + Args: fvs: The feature views to be updated. entities: A list containing entities associated with the feature views. diff --git a/sdk/python/feast/registry.py b/sdk/python/feast/registry.py index c8b00befc6..c721bd648a 100644 --- a/sdk/python/feast/registry.py +++ b/sdk/python/feast/registry.py @@ -1267,6 +1267,30 @@ def apply_materialization( self.commit() return + for idx, existing_stream_feature_view_proto in enumerate( + self.cached_registry_proto.stream_feature_views + ): + if ( + existing_stream_feature_view_proto.spec.name == feature_view.name + and existing_stream_feature_view_proto.spec.project == project + ): + existing_stream_feature_view = StreamFeatureView.from_proto( + existing_stream_feature_view_proto + ) + existing_stream_feature_view.materialization_intervals.append( + (start_date, end_date) + ) + existing_stream_feature_view.last_updated_timestamp = datetime.utcnow() + stream_feature_view_proto = existing_stream_feature_view.to_proto() + stream_feature_view_proto.spec.project = project + del self.cached_registry_proto.stream_feature_views[idx] + self.cached_registry_proto.stream_feature_views.append( + stream_feature_view_proto + ) + if commit: + self.commit() + return + raise FeatureViewNotFoundException(feature_view.name, project) def list_feature_views( diff --git a/sdk/python/feast/stream_feature_view.py b/sdk/python/feast/stream_feature_view.py index 3bd525596b..76d63a9ce0 100644 --- a/sdk/python/feast/stream_feature_view.py +++ b/sdk/python/feast/stream_feature_view.py @@ -1,9 +1,9 @@ import copy import functools import warnings -from datetime import timedelta +from datetime import datetime, timedelta from types import MethodType -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional, Tuple, Union import dill from google.protobuf.duration_pb2 import Duration @@ -42,26 +42,42 @@ class StreamFeatureView(FeatureView): schemas with Feast. Attributes: - name: str. The unique name of the stream feature view. - entities: Union[List[Entity], List[str]]. List of entities or entity join keys. - ttl: timedelta. The amount of time this group of features lives. A ttl of 0 indicates that + name: The unique name of the stream feature view. + entities: List of entities or entity join keys. + ttl: The amount of time this group of features lives. A ttl of 0 indicates that this group of features lives forever. Note that large ttl's or a ttl of 0 can result in extremely computationally intensive queries. - tags: Dict[str, str]. A dictionary of key-value pairs to store arbitrary metadata. - online: bool. Defines whether this stream feature view is used in online feature retrieval. - description: str. A human-readable description. + schema: The schema of the feature view, including feature, timestamp, and entity + columns. If not specified, can be inferred from the underlying data source. + source: DataSource. 
The stream source of data where this group of features is stored. + aggregations: List of aggregations registered with the stream feature view. + mode: The mode of execution. + timestamp_field: Must be specified if aggregations are specified. Defines the timestamp column on which to aggregate windows. + online: Defines whether this stream feature view is used in online feature retrieval. + description: A human-readable description. + tags: A dictionary of key-value pairs to store arbitrary metadata. owner: The owner of the stream feature view, typically the email of the primary maintainer. - schema: List[Field] The schema of the feature view, including feature, timestamp, and entity - columns. If not specified, can be inferred from the underlying data source. - source: DataSource. The stream source of data where this group of features - is stored. - aggregations (optional): List[Aggregation]. List of aggregations registered with the stream feature view. - mode(optional): str. The mode of execution. - timestamp_field (optional): Must be specified if aggregations are specified. Defines the timestamp column on which to aggregate windows. - udf (optional): MethodType The user defined transformation function. This transformation function should have all of the corresponding imports imported within the function. + udf: The user defined transformation function. This transformation function should import all of its dependencies within the function body. """ + name: str + entities: List[str] + ttl: Optional[timedelta] + source: DataSource + schema: List[Field] + entity_columns: List[Field] + features: List[Field] + online: bool + description: str + tags: Dict[str, str] + owner: str + aggregations: List[Aggregation] + mode: str + timestamp_field: str + materialization_intervals: List[Tuple[datetime, datetime]] + udf: Optional[MethodType] + def __init__( self, *, @@ -222,7 +238,7 @@ def from_proto(cls, sfv_proto): if sfv_proto.spec.HasField("user_defined_function") else None ) - sfv_feature_view = cls( + stream_feature_view = cls( name=sfv_proto.spec.name, description=sfv_proto.spec.description, tags=dict(sfv_proto.spec.tags), @@ -247,23 +263,27 @@ def from_proto(cls, sfv_proto): ) if batch_source: - sfv_feature_view.batch_source = batch_source + stream_feature_view.batch_source = batch_source if stream_source: - sfv_feature_view.stream_source = stream_source + stream_feature_view.stream_source = stream_source - sfv_feature_view.entities = list(sfv_proto.spec.entities) + stream_feature_view.entities = list(sfv_proto.spec.entities) - sfv_feature_view.features = [ + stream_feature_view.features = [ Field.from_proto(field_proto) for field_proto in sfv_proto.spec.features ] + stream_feature_view.entity_columns = [ + Field.from_proto(field_proto) + for field_proto in sfv_proto.spec.entity_columns + ] if sfv_proto.meta.HasField("created_timestamp"): - sfv_feature_view.created_timestamp = ( + stream_feature_view.created_timestamp = ( sfv_proto.meta.created_timestamp.ToDatetime() ) if sfv_proto.meta.HasField("last_updated_timestamp"): - sfv_feature_view.last_updated_timestamp = ( + stream_feature_view.last_updated_timestamp = ( sfv_proto.meta.last_updated_timestamp.ToDatetime() ) @@ -275,7 +295,7 @@ def from_proto(cls, sfv_proto): ) ) - return sfv_feature_view + return stream_feature_view def __copy__(self): fv = StreamFeatureView( diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index 
b93ad987fa..3fee0b7001 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -11,6 +11,7 @@ Field, OnDemandFeatureView, PushSource, + StreamFeatureView, ValueType, ) from feast.data_source import DataSource, RequestSource @@ -297,7 +298,7 @@ def create_pushable_feature_view(batch_source: DataSource): push_source = PushSource( name="location_stats_push_source", batch_source=batch_source, ) - return FeatureView( + return StreamFeatureView( name="pushable_location_stats", entities=[location()], schema=[ diff --git a/sdk/python/tests/unit/test_feature_view.py b/sdk/python/tests/unit/test_feature_view.py deleted file mode 100644 index 1ef36081ec..0000000000 --- a/sdk/python/tests/unit/test_feature_view.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright 2022 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from feast.feature_view import FeatureView -from feast.field import Field -from feast.infra.offline_stores.file_source import FileSource -from feast.types import Float32 - - -def test_hash(): - file_source = FileSource(name="my-file-source", path="test.parquet") - feature_view_1 = FeatureView( - name="my-feature-view", - entities=[], - schema=[ - Field(name="feature1", dtype=Float32), - Field(name="feature2", dtype=Float32), - ], - source=file_source, - ) - feature_view_2 = FeatureView( - name="my-feature-view", - entities=[], - schema=[ - Field(name="feature1", dtype=Float32), - Field(name="feature2", dtype=Float32), - ], - source=file_source, - ) - feature_view_3 = FeatureView( - name="my-feature-view", - entities=[], - schema=[Field(name="feature1", dtype=Float32)], - source=file_source, - ) - feature_view_4 = FeatureView( - name="my-feature-view", - entities=[], - schema=[Field(name="feature1", dtype=Float32)], - source=file_source, - description="test", - ) - - s1 = {feature_view_1, feature_view_2} - assert len(s1) == 1 - - s2 = {feature_view_1, feature_view_3} - assert len(s2) == 2 - - s3 = {feature_view_3, feature_view_4} - assert len(s3) == 2 - - s4 = {feature_view_1, feature_view_2, feature_view_3, feature_view_4} - assert len(s4) == 3 - - -# TODO(felixwang9817): Add tests for proto conversion. -# TODO(felixwang9817): Add tests for field mapping logic. 
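Taken together, patches 1 and 2 make stream feature views first-class citizens of materialization: they are listed from the registry, run through feature inference, included in update_infra, accepted by push, and swept up by materialize() and materialize_incremental() alongside batch feature views. A minimal sketch of the flow these patches enable, using only APIs that appear in this series (the repo path, parquet path, and field names are illustrative assumptions, not taken from the diffs):

from datetime import datetime, timedelta

from feast import Entity, FeatureStore, Field, FileSource
from feast.data_format import AvroFormat
from feast.data_source import KafkaSource
from feast.stream_feature_view import stream_feature_view
from feast.types import Float32

driver = Entity(name="driver_entity", join_keys=["test_key"])

stream_source = KafkaSource(
    name="kafka",
    timestamp_field="event_timestamp",
    bootstrap_servers="",
    message_format=AvroFormat(""),
    topic="topic",
    # Materialization reads from the batch source, not from the stream itself.
    batch_source=FileSource(path="data.parquet", timestamp_field="event_timestamp"),
    watermark=timedelta(days=1),
)

@stream_feature_view(
    entities=[driver],
    ttl=timedelta(days=30),
    online=True,  # otherwise _get_feature_views_to_materialize raises ValueError
    schema=[Field(name="dummy_field", dtype=Float32)],
    timestamp_field="event_timestamp",
    mode="spark",
    source=stream_source,
)
def simple_sfv(df):
    return df

store = FeatureStore(repo_path=".")  # assumed local feature repo
store.apply([driver, simple_sfv])

# With patch 1, this call now picks up stream feature views as well.
store.materialize_incremental(end_date=datetime.utcnow())
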
diff --git a/sdk/python/tests/unit/test_feature_views.py b/sdk/python/tests/unit/test_feature_views.py index 64b23edd2c..a1d134a2f0 100644 --- a/sdk/python/tests/unit/test_feature_views.py +++ b/sdk/python/tests/unit/test_feature_views.py @@ -7,6 +7,7 @@ from feast.data_format import AvroFormat from feast.data_source import KafkaSource, PushSource from feast.entity import Entity +from feast.feature_view import FeatureView from feast.field import Field from feast.infra.offline_stores.file_source import FileSource from feast.stream_feature_view import StreamFeatureView, stream_feature_view @@ -201,3 +202,54 @@ def test_stream_feature_view_initialization_with_optional_fields_omitted(): new_sfv = StreamFeatureView.from_proto(sfv_proto=sfv_proto) assert new_sfv == sfv + + +def test_hash(): + file_source = FileSource(name="my-file-source", path="test.parquet") + feature_view_1 = FeatureView( + name="my-feature-view", + entities=[], + schema=[ + Field(name="feature1", dtype=Float32), + Field(name="feature2", dtype=Float32), + ], + source=file_source, + ) + feature_view_2 = FeatureView( + name="my-feature-view", + entities=[], + schema=[ + Field(name="feature1", dtype=Float32), + Field(name="feature2", dtype=Float32), + ], + source=file_source, + ) + feature_view_3 = FeatureView( + name="my-feature-view", + entities=[], + schema=[Field(name="feature1", dtype=Float32)], + source=file_source, + ) + feature_view_4 = FeatureView( + name="my-feature-view", + entities=[], + schema=[Field(name="feature1", dtype=Float32)], + source=file_source, + description="test", + ) + + s1 = {feature_view_1, feature_view_2} + assert len(s1) == 1 + + s2 = {feature_view_1, feature_view_3} + assert len(s2) == 2 + + s3 = {feature_view_3, feature_view_4} + assert len(s3) == 2 + + s4 = {feature_view_1, feature_view_2, feature_view_3, feature_view_4} + assert len(s4) == 3 + + +# TODO(felixwang9817): Add tests for proto conversion. +# TODO(felixwang9817): Add tests for field mapping logic. From 39e41480d340f8372cf002545bc7396f1c910ebc Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Wed, 15 Jun 2022 11:42:56 -0700 Subject: [PATCH 3/7] Fix SFV tests to use actual data to allow for inference Signed-off-by: Felix Wang --- .../test_stream_feature_view_apply.py | 236 +++++++++--------- 1 file changed, 122 insertions(+), 114 deletions(-) diff --git a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py index 29cd2f1c26..adeb15317e 100644 --- a/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py +++ b/sdk/python/tests/integration/registration/test_stream_feature_view_apply.py @@ -2,139 +2,147 @@ import pytest -from feast import Entity, Field, FileSource from feast.aggregation import Aggregation from feast.data_format import AvroFormat from feast.data_source import KafkaSource +from feast.entity import Entity +from feast.field import Field from feast.stream_feature_view import stream_feature_view from feast.types import Float32 +from tests.utils.cli_utils import CliRunner, get_example_repo +from tests.utils.data_source_utils import prep_file_source @pytest.mark.integration -def test_apply_stream_feature_view(environment) -> None: +def test_apply_stream_feature_view(simple_dataset_1) -> None: """ Test apply of StreamFeatureView. 
""" - fs = environment.feature_store - - # Create Feature Views - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=FileSource(path="test_path", timestamp_field="event_timestamp"), - watermark=timedelta(days=1), - ) - - @stream_feature_view( - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", function="count", time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - tags={}, - ) - def simple_sfv(df): - return df - - fs.apply([entity, simple_sfv]) - stream_feature_views = fs.list_stream_feature_views() - assert len(stream_feature_views) == 1 - assert stream_feature_views[0] == simple_sfv - - entities = fs.list_entities() - assert len(entities) == 1 - assert entities[0] == entity - - features = fs.get_online_features( - features=["simple_sfv:dummy_field"], entity_rows=[{"test_key": 1001}], - ).to_dict(include_event_timestamps=True) - - assert "test_key" in features - assert features["test_key"] == [1001] - assert "dummy_field" in features - assert features["dummy_field"] == [None] + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_1.py"), "bigquery" + ) as fs, prep_file_source( + df=simple_dataset_1, timestamp_field="ts_1" + ) as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark=timedelta(days=1), + ) + + @stream_feature_view( + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", function="max", time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + tags={}, + ) + def simple_sfv(df): + return df + + fs.apply([entity, simple_sfv]) + + stream_feature_views = fs.list_stream_feature_views() + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == simple_sfv + + features = fs.get_online_features( + features=["simple_sfv:dummy_field"], entity_rows=[{"test_key": 1001}], + ).to_dict(include_event_timestamps=True) + + assert "test_key" in features + assert features["test_key"] == [1001] + assert "dummy_field" in features + assert features["dummy_field"] == [None] @pytest.mark.integration -def test_stream_feature_view_udf(environment) -> None: +def test_stream_feature_view_udf(simple_dataset_1) -> None: """ Test apply of StreamFeatureView udfs are serialized correctly and usable. 
""" - fs = environment.feature_store - - # Create Feature Views - entity = Entity(name="driver_entity", join_keys=["test_key"]) - - stream_source = KafkaSource( - name="kafka", - timestamp_field="event_timestamp", - bootstrap_servers="", - message_format=AvroFormat(""), - topic="topic", - batch_source=FileSource(path="test_path", timestamp_field="event_timestamp"), - watermark=timedelta(days=1), - ) - - @stream_feature_view( - entities=[entity], - ttl=timedelta(days=30), - owner="test@example.com", - online=True, - schema=[Field(name="dummy_field", dtype=Float32)], - description="desc", - aggregations=[ - Aggregation( - column="dummy_field", function="max", time_window=timedelta(days=1), - ), - Aggregation( - column="dummy_field2", function="count", time_window=timedelta(days=24), - ), - ], - timestamp_field="event_timestamp", - mode="spark", - source=stream_source, - tags={}, - ) - def pandas_view(pandas_df): - import pandas as pd - - assert type(pandas_df) == pd.DataFrame - df = pandas_df.transform(lambda x: x + 10, axis=1) - df.insert(2, "C", [20.2, 230.0, 34.0], True) - return df + runner = CliRunner() + with runner.local_repo( + get_example_repo("example_feature_repo_1.py"), "bigquery" + ) as fs, prep_file_source( + df=simple_dataset_1, timestamp_field="ts_1" + ) as file_source: + entity = Entity(name="driver_entity", join_keys=["test_key"]) + + stream_source = KafkaSource( + name="kafka", + timestamp_field="event_timestamp", + bootstrap_servers="", + message_format=AvroFormat(""), + topic="topic", + batch_source=file_source, + watermark=timedelta(days=1), + ) + + @stream_feature_view( + entities=[entity], + ttl=timedelta(days=30), + owner="test@example.com", + online=True, + schema=[Field(name="dummy_field", dtype=Float32)], + description="desc", + aggregations=[ + Aggregation( + column="dummy_field", function="max", time_window=timedelta(days=1), + ), + Aggregation( + column="dummy_field2", + function="count", + time_window=timedelta(days=24), + ), + ], + timestamp_field="event_timestamp", + mode="spark", + source=stream_source, + tags={}, + ) + def pandas_view(pandas_df): + import pandas as pd + + assert type(pandas_df) == pd.DataFrame + df = pandas_df.transform(lambda x: x + 10, axis=1) + df.insert(2, "C", [20.2, 230.0, 34.0], True) + return df - import pandas as pd - - df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + import pandas as pd - fs.apply([entity, pandas_view]) - stream_feature_views = fs.list_stream_feature_views() - assert len(stream_feature_views) == 1 - assert stream_feature_views[0].name == "pandas_view" - assert stream_feature_views[0] == pandas_view + fs.apply([entity, pandas_view]) - sfv = stream_feature_views[0] + stream_feature_views = fs.list_stream_feature_views() + assert len(stream_feature_views) == 1 + assert stream_feature_views[0] == pandas_view - new_df = sfv.udf(df) + sfv = stream_feature_views[0] - expected_df = pd.DataFrame( - {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} - ) - assert new_df.equals(expected_df) + df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 20, 30]}) + new_df = sfv.udf(df) + expected_df = pd.DataFrame( + {"A": [11, 12, 13], "B": [20, 30, 40], "C": [20.2, 230.0, 34.0]} + ) + assert new_df.equals(expected_df) From 1a61567147388701dfe5973c73ce01f9ffbb1478 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Wed, 15 Jun 2022 16:10:54 -0700 Subject: [PATCH 4/7] Expand online retrieval test to also retrieve pushable_location_stats SFV Signed-off-by: Felix Wang --- .../online_store/test_universal_online.py | 34 
+++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index d05045e295..e775e46360 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -490,7 +490,7 @@ def test_online_retrieval( ) entity_sample = datasets.orders_df.sample(10)[ - ["customer_id", "driver_id", "order_id", "event_timestamp"] + ["customer_id", "driver_id", "order_id", "origin_id", "event_timestamp"] ] orders_df = datasets.orders_df[ ( @@ -509,6 +509,8 @@ def test_online_retrieval( datasets.customer_df["customer_id"].isin(sample_customers) ] + sample_origins = entity_sample["origin_id"] + location_pairs = np.array(list(itertools.permutations(entities.location_vals, 2))) sample_location_pairs = location_pairs[ np.random.choice(len(location_pairs), 10) @@ -521,10 +523,12 @@ def test_online_retrieval( ] global_df = datasets.global_df + location_df = datasets.location_df + # Rename origin -> location for the purposes of selecting entity_rows = [ - {"driver_id": d, "customer_id": c, "val_to_add": 50} - for (d, c) in zip(sample_drivers, sample_customers) + {"driver_id": d, "customer_id": c, "location_id": o, "val_to_add": 50} + for (d, c, o) in zip(sample_drivers, sample_customers, sample_origins) ] feature_refs = [ @@ -538,6 +542,7 @@ def test_online_retrieval( "order:order_is_success", "global_stats:num_rides", "global_stats:avg_ride_length", + "pushable_location_stats:temperature", ] unprefixed_feature_refs = [f.rsplit(":", 1)[-1] for f in feature_refs if ":" in f] # Remove the on demand feature view output features, since they're not present in the source dataframe @@ -568,7 +573,7 @@ def test_online_retrieval( expected_keys = set( f.replace(":", "__") if full_feature_names else f.split(":")[-1] for f in feature_refs - ) | {"customer_id", "driver_id"} + ) | {"customer_id", "driver_id", "location_id"} assert ( keys == expected_keys ), f"Response keys are different from expected: {keys - expected_keys} (extra) and {expected_keys - keys} (missing)" @@ -581,6 +586,7 @@ def test_online_retrieval( orders_df=orders_df, global_df=global_df, entity_row=entity_row, + location_df=location_df, ) assert df_features["customer_id"] == online_features_dict["customer_id"][i] @@ -619,7 +625,9 @@ def test_online_retrieval( environment=environment, endpoint=feature_server_endpoint, features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0, "val_to_add": 100}], + entity_rows=[ + {"driver_id": 0, "customer_id": 0, "location_id": 0, "val_to_add": 100} + ], full_feature_names=full_feature_names, ) assert missing_responses_dict is not None @@ -639,7 +647,7 @@ def test_online_retrieval( environment=environment, endpoint=feature_server_endpoint, features=feature_refs, - entity_rows=[{"driver_id": 0, "customer_id": 0}], + entity_rows=[{"driver_id": 0, "customer_id": 0, "location_id": 0}], full_feature_names=full_feature_names, ) @@ -653,6 +661,7 @@ def test_online_retrieval( customers_df, orders_df, global_df, + location_df, ) entity_rows = [ @@ -780,14 +789,19 @@ def get_latest_feature_values_from_dataframes( customer_df, orders_df, entity_row, + location_df, global_df=None, origin_df=None, destination_df=None, ): + # TODO: retrieve temperature feature latest_driver_row = get_latest_row(entity_row, driver_df, "driver_id", "driver_id") latest_customer_row = get_latest_row( 
entity_row, customer_df, "customer_id", "customer_id" ) + latest_location_row = get_latest_row( + entity_row, location_df, "location_id", "location_id" + ) # Since the event timestamp columns may contain timestamps of different timezones, # we must first convert the timestamps to UTC before we can compare them. @@ -807,7 +821,7 @@ def get_latest_feature_values_from_dataframes( global_df["event_timestamp"].idxmax() ].to_dict() if origin_df is not None: - latest_location_row = get_latest_feature_values_for_location_df( + latest_location_aliased_row = get_latest_feature_values_for_location_df( entity_row, origin_df, destination_df ) @@ -820,6 +834,7 @@ def get_latest_feature_values_from_dataframes( **latest_driver_row, **latest_orders_row, **latest_global_row, + **latest_location_row, **request_data_features, } if origin_df is not None: @@ -830,12 +845,14 @@ def get_latest_feature_values_from_dataframes( **latest_driver_row, **latest_orders_row, **latest_location_row, + **latest_location_aliased_row, **request_data_features, } return { **latest_customer_row, **latest_driver_row, **latest_orders_row, + **latest_location_row, **request_data_features, } @@ -869,6 +886,7 @@ def assert_feature_service_correctness( customers_df, orders_df, global_df, + location_df, ): feature_service_online_features_dict = get_online_features_dict( environment=environment, @@ -888,6 +906,7 @@ def assert_feature_service_correctness( assert set(feature_service_keys) == set(expected_feature_refs) | { "customer_id", "driver_id", + "location_id", } tc = unittest.TestCase() @@ -898,6 +917,7 @@ def assert_feature_service_correctness( orders_df=orders_df, global_df=global_df, entity_row=entity_row, + location_df=location_df, ) tc.assertAlmostEqual( feature_service_online_features_dict[ From 5ded8108d1f4ff63886805f75357365eaeda0233 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Wed, 15 Jun 2022 16:19:33 -0700 Subject: [PATCH 5/7] Remove stray comment Signed-off-by: Felix Wang --- .../tests/integration/online_store/test_universal_online.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index e775e46360..37b61def3d 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -525,7 +525,6 @@ def test_online_retrieval( global_df = datasets.global_df location_df = datasets.location_df - # Rename origin -> location for the purposes of selecting entity_rows = [ {"driver_id": d, "customer_id": c, "location_id": o, "val_to_add": 50} for (d, c, o) in zip(sample_drivers, sample_customers, sample_origins) From 0453a6ca768b5023329085df5f2ca16416187be7 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Thu, 16 Jun 2022 10:35:59 -0700 Subject: [PATCH 6/7] Add SFVs to Go serving path Signed-off-by: Felix Wang --- go/internal/feast/featurestore.go | 16 +++++++ go/internal/feast/model/featureview.go | 19 +++++++- go/internal/feast/registry/registry.go | 46 ++++++++++++++++++- sdk/python/feast/stream_feature_view.py | 2 +- .../online_store/test_universal_online.py | 1 + 5 files changed, 81 insertions(+), 3 deletions(-) diff --git a/go/internal/feast/featurestore.go b/go/internal/feast/featurestore.go index b0fc987fb4..ad1f94a4ba 100644 --- a/go/internal/feast/featurestore.go +++ b/go/internal/feast/featurestore.go @@ -224,6 +224,14 @@ func (fs *FeatureStore) listAllViews() (map[string]*model.FeatureView, map[strin 
fvs[featureView.Base.Name] = featureView } + streamFeatureViews, err := fs.ListStreamFeatureViews() + if err != nil { + return nil, nil, err + } + for _, streamFeatureView := range streamFeatureViews { + fvs[streamFeatureView.Base.Name] = streamFeatureView + } + onDemandFeatureViews, err := fs.registry.ListOnDemandFeatureViews(fs.config.Project) if err != nil { return nil, nil, err @@ -242,6 +250,14 @@ func (fs *FeatureStore) ListFeatureViews() ([]*model.FeatureView, error) { return featureViews, nil } +func (fs *FeatureStore) ListStreamFeatureViews() ([]*model.FeatureView, error) { + streamFeatureViews, err := fs.registry.ListStreamFeatureViews(fs.config.Project) + if err != nil { + return streamFeatureViews, err + } + return streamFeatureViews, nil +} + func (fs *FeatureStore) ListEntities(hideDummyEntity bool) ([]*model.Entity, error) { allEntities, err := fs.registry.ListEntities(fs.config.Project) diff --git a/go/internal/feast/model/featureview.go b/go/internal/feast/model/featureview.go index ceb3736f99..b6fde78658 100644 --- a/go/internal/feast/model/featureview.go +++ b/go/internal/feast/model/featureview.go @@ -24,7 +24,24 @@ type FeatureView struct { func NewFeatureViewFromProto(proto *core.FeatureView) *FeatureView { featureView := &FeatureView{Base: NewBaseFeatureView(proto.Spec.Name, proto.Spec.Features), - Ttl: &(*proto.Spec.Ttl), + Ttl: proto.Spec.Ttl, + } + if len(proto.Spec.Entities) == 0 { + featureView.EntityNames = []string{DUMMY_ENTITY_NAME} + } else { + featureView.EntityNames = proto.Spec.Entities + } + entityColumns := make([]*Field, len(proto.Spec.EntityColumns)) + for i, entityColumn := range proto.Spec.EntityColumns { + entityColumns[i] = NewFieldFromProto(entityColumn) + } + featureView.EntityColumns = entityColumns + return featureView +} + +func NewFeatureViewFromStreamFeatureViewProto(proto *core.StreamFeatureView) *FeatureView { + featureView := &FeatureView{Base: NewBaseFeatureView(proto.Spec.Name, proto.Spec.Features), + Ttl: proto.Spec.Ttl, } if len(proto.Spec.Entities) == 0 { featureView.EntityNames = []string{DUMMY_ENTITY_NAME} diff --git a/go/internal/feast/registry/registry.go b/go/internal/feast/registry/registry.go index 38cf167a9f..c67a50a5a6 100644 --- a/go/internal/feast/registry/registry.go +++ b/go/internal/feast/registry/registry.go @@ -30,6 +30,7 @@ type Registry struct { cachedFeatureServices map[string]map[string]*core.FeatureService cachedEntities map[string]map[string]*core.Entity cachedFeatureViews map[string]map[string]*core.FeatureView + cachedStreamFeatureViews map[string]map[string]*core.StreamFeatureView cachedOnDemandFeatureViews map[string]map[string]*core.OnDemandFeatureView cachedRegistry *core.Registry cachedRegistryProtoLastUpdated time.Time @@ -106,10 +107,12 @@ func (r *Registry) load(registry *core.Registry) { r.cachedFeatureServices = make(map[string]map[string]*core.FeatureService) r.cachedEntities = make(map[string]map[string]*core.Entity) r.cachedFeatureViews = make(map[string]map[string]*core.FeatureView) + r.cachedStreamFeatureViews = make(map[string]map[string]*core.StreamFeatureView) r.cachedOnDemandFeatureViews = make(map[string]map[string]*core.OnDemandFeatureView) r.loadEntities(registry) r.loadFeatureServices(registry) r.loadFeatureViews(registry) + r.loadStreamFeatureViews(registry) r.loadOnDemandFeatureViews(registry) r.cachedRegistryProtoLastUpdated = time.Now() } @@ -144,6 +147,16 @@ func (r *Registry) loadFeatureViews(registry *core.Registry) { } } +func (r *Registry) loadStreamFeatureViews(registry 
*core.Registry) { + streamFeatureViews := registry.StreamFeatureViews + for _, streamFeatureView := range streamFeatureViews { + if _, ok := r.cachedStreamFeatureViews[streamFeatureView.Spec.Project]; !ok { + r.cachedStreamFeatureViews[streamFeatureView.Spec.Project] = make(map[string]*core.StreamFeatureView) + } + r.cachedStreamFeatureViews[streamFeatureView.Spec.Project][streamFeatureView.Spec.Name] = streamFeatureView + } +} + func (r *Registry) loadOnDemandFeatureViews(registry *core.Registry) { onDemandFeatureViews := registry.OnDemandFeatureViews for _, onDemandFeatureView := range onDemandFeatureViews { @@ -193,7 +206,26 @@ func (r *Registry) ListFeatureViews(project string) ([]*model.FeatureView, error } /* - Look up Feature Views inside project + Look up Stream Feature Views inside project + Returns empty list if project not found +*/ + +func (r *Registry) ListStreamFeatureViews(project string) ([]*model.FeatureView, error) { + if cachedStreamFeatureViews, ok := r.cachedStreamFeatureViews[project]; !ok { + return []*model.FeatureView{}, nil + } else { + streamFeatureViews := make([]*model.FeatureView, len(cachedStreamFeatureViews)) + index := 0 + for _, streamFeatureViewProto := range cachedStreamFeatureViews { + streamFeatureViews[index] = model.NewFeatureViewFromStreamFeatureViewProto(streamFeatureViewProto) + index += 1 + } + return streamFeatureViews, nil + } +} + +/* + Look up Feature Services inside project Returns empty list if project not found */ @@ -254,6 +286,18 @@ func (r *Registry) GetFeatureView(project, featureViewName string) (*model.Featu } } +func (r *Registry) GetStreamFeatureView(project, streamFeatureViewName string) (*model.FeatureView, error) { + if cachedStreamFeatureViews, ok := r.cachedStreamFeatureViews[project]; !ok { + return nil, fmt.Errorf("no cached stream feature views found for project %s", project) + } else { + if streamFeatureViewProto, ok := cachedStreamFeatureViews[streamFeatureViewName]; !ok { + return nil, fmt.Errorf("no cached stream feature view %s found for project %s", streamFeatureViewName, project) + } else { + return model.NewFeatureViewFromStreamFeatureViewProto(streamFeatureViewProto), nil + } + } +} + func (r *Registry) GetFeatureService(project, featureServiceName string) (*model.FeatureService, error) { if cachedFeatureServices, ok := r.cachedFeatureServices[project]; !ok { return nil, fmt.Errorf("no cached feature services found for project %s", project) diff --git a/sdk/python/feast/stream_feature_view.py b/sdk/python/feast/stream_feature_view.py index 76d63a9ce0..2122ff1d55 100644 --- a/sdk/python/feast/stream_feature_view.py +++ b/sdk/python/feast/stream_feature_view.py @@ -310,7 +310,7 @@ def __copy__(self): aggregations=self.aggregations, mode=self.mode, timestamp_field=self.timestamp_field, - sources=self.sources, + source=self.source, udf=self.udf, ) fv.projection = copy.copy(self.projection) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 37b61def3d..f83047d225 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -458,6 +458,7 @@ def test_online_retrieval( feature_views.driver[["conv_rate"]], feature_views.driver_odfv, feature_views.customer[["current_balance"]], + feature_views.pushed_locations, ], ) feature_service_entity_mapping = FeatureService( From 2362991382705d70da74a22885e1c3e7cf3d35d1 Mon Sep 17 00:00:00 
2001 From: Felix Wang Date: Thu, 16 Jun 2022 11:33:14 -0700 Subject: [PATCH 7/7] Clarify comment Signed-off-by: Felix Wang --- sdk/python/feast/inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index c09ac9efc0..011a3b99b2 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -99,9 +99,9 @@ def update_feature_views_with_inferred_features_and_entities( other columns except designated timestamp columns are considered to be feature columns. If the feature view already has features, feature inference is skipped. - Note that this inference logic currently does not take transformations into account. For - example, even if a stream feature view has a transformation, this method assumes that the - batch source contains transformed data with the correct final schema. + Note that this inference logic currently does not take any transformations (either a UDF or + aggregations) into account. For example, even if a stream feature view has a transformation, + this method assumes that the batch source contains transformed data with the correct final schema. Args: fvs: The feature views to be updated.
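

The practical consequence of this clarified comment: schema inference trusts the batch source, not the transformation, which is also why patch 3 switches the integration tests to prep_file_source over real data, so that inference has an actual schema to read at apply() time. A short sketch of that behavior, reusing the driver entity, stream_source, and store from the earlier example (the derived column is a hypothetical illustration):

# Suppose stream_source's batch FileSource holds exactly these columns:
#   test_key, event_timestamp, dummy_field

@stream_feature_view(
    entities=[driver],
    ttl=timedelta(days=30),
    online=True,
    # No schema given: features are inferred from the batch source on apply().
    timestamp_field="event_timestamp",
    mode="spark",
    source=stream_source,
)
def derived_sfv(df):
    # Derived column produced only by the UDF, absent from the batch source.
    df["dummy_field_plus_ten"] = df["dummy_field"] + 10
    return df

store.apply([driver, derived_sfv])

# Inference saw only the batch source columns, so the registered features are:
#   [dummy_field]
# "dummy_field_plus_ten" is not inferred; to serve it online, the batch source
# must already contain transformed data with the correct final schema.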