From b8e1cc0aa11dcb97d4be175b0a57cf2744dfac02 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Wed, 13 Apr 2022 14:25:28 -0700 Subject: [PATCH 01/11] Add docs for Go feature server Signed-off-by: Felix Wang --- .../feature-servers/go-feature-server.md | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/reference/feature-servers/go-feature-server.md diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md new file mode 100644 index 0000000000..a1e1636d72 --- /dev/null +++ b/docs/reference/feature-servers/go-feature-server.md @@ -0,0 +1,23 @@ +# Go feature server + +## Overview + +The Go feature server is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](local-feature-server.md). + +The Go feature server currently only supports Redis as an online store; support for other online stores will be added soon. We also plan on adding support for the Java feature server (e.g. the capability to call into the Go feature server and execute Java UDFs). Initial benchmarks indicate that the Go feature server is significantly faster than the Python feature server. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). + +## Usage + +To enable the Go feature server, set `go_feature_server: True` in your `feature_store.yaml`. + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +online_store: + type: redis + connection_string: "localhost:6379" +go_feature_server: True +``` +{% endcode %} From bfe75e1e74609388088bada16dacb5092eea7ef7 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Wed, 13 Apr 2022 16:51:41 -0700 Subject: [PATCH 02/11] Update go feature server docs Signed-off-by: Kevin Zhang --- docs/reference/feature-servers/go-feature-server.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md index a1e1636d72..75cbfc640e 100644 --- a/docs/reference/feature-servers/go-feature-server.md +++ b/docs/reference/feature-servers/go-feature-server.md @@ -4,7 +4,9 @@ The Go feature server is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](local-feature-server.md). -The Go feature server currently only supports Redis as an online store; support for other online stores will be added soon. We also plan on adding support for the Java feature server (e.g. the capability to call into the Go feature server and execute Java UDFs). Initial benchmarks indicate that the Go feature server is significantly faster than the Python feature server. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). +Currently, the Go feature server only supports online serving and does not have an offline component including APIs to create feast feature repositories or apply configuration to the registry to facilitate online materialization. It also does not expose its own dedicated cli to perform feast actions. Currently, the Go feature server is only meant to expose an online serving API that can be called through the python SDK to facilitate faster online feature retrieval. + +The Go feature server currently only supports Redis and Sqlite as online stores; support for other online stores will be added soon. Initial benchmarks indicate that the Go feature server is significantly faster than the Python feature server. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). ## Usage @@ -21,3 +23,10 @@ online_store: go_feature_server: True ``` {% endcode %} + +## Future/Current Work + +The Go feature server online feature logging for Data Quality Monitoring is currently in development. More information can be found [here](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit#heading=h.9gaqqtox9jg6). + +We also plan on adding support for the Java feature server (e.g. the capability to call into the Go feature server and execute Java UDFs). + From 0c04a3a958f7e3b97da6680ab0e16ee8da1b8625 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 10:49:34 -0700 Subject: [PATCH 03/11] Address review components Signed-off-by: Kevin Zhang --- .../feature-servers/go-feature-server.md | 20 +++++++++++-------- sdk/python/setup.py | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md index 75cbfc640e..0fbc6f5166 100644 --- a/docs/reference/feature-servers/go-feature-server.md +++ b/docs/reference/feature-servers/go-feature-server.md @@ -1,16 +1,20 @@ -# Go feature server +# Go-based Feature Retrieval ## Overview -The Go feature server is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](local-feature-server.md). +The Go Feature Retrieval component is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](local-feature-server.md). -Currently, the Go feature server only supports online serving and does not have an offline component including APIs to create feast feature repositories or apply configuration to the registry to facilitate online materialization. It also does not expose its own dedicated cli to perform feast actions. Currently, the Go feature server is only meant to expose an online serving API that can be called through the python SDK to facilitate faster online feature retrieval. +Currently, this component only supports online serving and does not have an offline component including APIs to create feast feature repositories or apply configuration to the registry to facilitate online materialization. It also does not expose its own dedicated cli to perform feast actions. Furthermore, this component is only meant to expose an online serving API that can be called through the python SDK to facilitate faster online feature retrieval. -The Go feature server currently only supports Redis and Sqlite as online stores; support for other online stores will be added soon. Initial benchmarks indicate that the Go feature server is significantly faster than the Python feature server. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). +The Go Feature Retrieval component currently only supports Redis and Sqlite as online stores; support for other online stores will be added soon. Initial benchmarks indicate that it is significantly faster than the Python feature server for online feature retrieval. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). + +## Installation + +To install, run `export COMPILE_GO=true` before you run `pip install feast`. ## Usage -To enable the Go feature server, set `go_feature_server: True` in your `feature_store.yaml`. +To enable the Go online feature retrieval component, set `go_feature_retrieval: True` in your `feature_store.yaml`. This will direct all online feature retrieval to Go instead of Python. This flag will be enabled by default in the future. {% code title="feature_store.yaml" %} ```yaml @@ -20,13 +24,13 @@ provider: local online_store: type: redis connection_string: "localhost:6379" -go_feature_server: True +go_feature_retrieval: True ``` {% endcode %} ## Future/Current Work -The Go feature server online feature logging for Data Quality Monitoring is currently in development. More information can be found [here](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit#heading=h.9gaqqtox9jg6). +The Go feature retrieval online feature logging for Data Quality Monitoring is currently in development. More information can be found [here](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit#heading=h.9gaqqtox9jg6). -We also plan on adding support for the Java feature server (e.g. the capability to call into the Go feature server and execute Java UDFs). +We also plan on adding support for the Java feature server (e.g. the capability to call into the Go component and execute Java UDFs). diff --git a/sdk/python/setup.py b/sdk/python/setup.py index dbecd26e2b..d68797b5ce 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -108,7 +108,7 @@ CI_REQUIRED = ( [ - "cryptography==3.3.2", + "cryptography==3.4.8", "flake8", "black==19.10b0", "isort>=5", From 7fd61b2658cbcfe869f533a5ba23cc9b7bada118 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 10:50:09 -0700 Subject: [PATCH 04/11] Fix Signed-off-by: Kevin Zhang --- .../feature-servers/go-feature-server.md | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100644 docs/reference/feature-servers/go-feature-server.md diff --git a/docs/reference/feature-servers/go-feature-server.md b/docs/reference/feature-servers/go-feature-server.md deleted file mode 100644 index 0fbc6f5166..0000000000 --- a/docs/reference/feature-servers/go-feature-server.md +++ /dev/null @@ -1,36 +0,0 @@ -# Go-based Feature Retrieval - -## Overview - -The Go Feature Retrieval component is a Go implementation of the core feature serving logic, embedded in the Python SDK. It supports retrieval of feature references, feature services, and on demand feature views, and can be used either through the Python SDK or the [Python feature server](local-feature-server.md). - -Currently, this component only supports online serving and does not have an offline component including APIs to create feast feature repositories or apply configuration to the registry to facilitate online materialization. It also does not expose its own dedicated cli to perform feast actions. Furthermore, this component is only meant to expose an online serving API that can be called through the python SDK to facilitate faster online feature retrieval. - -The Go Feature Retrieval component currently only supports Redis and Sqlite as online stores; support for other online stores will be added soon. Initial benchmarks indicate that it is significantly faster than the Python feature server for online feature retrieval. We plan to release a more comprehensive set of benchmarks. For more details, see the [RFC](https://docs.google.com/document/d/1Lgqv6eWYFJgQ7LA_jNeTh8NzOPhqI9kGTeyESRpNHnE). - -## Installation - -To install, run `export COMPILE_GO=true` before you run `pip install feast`. - -## Usage - -To enable the Go online feature retrieval component, set `go_feature_retrieval: True` in your `feature_store.yaml`. This will direct all online feature retrieval to Go instead of Python. This flag will be enabled by default in the future. - -{% code title="feature_store.yaml" %} -```yaml -project: my_feature_repo -registry: data/registry.db -provider: local -online_store: - type: redis - connection_string: "localhost:6379" -go_feature_retrieval: True -``` -{% endcode %} - -## Future/Current Work - -The Go feature retrieval online feature logging for Data Quality Monitoring is currently in development. More information can be found [here](https://docs.google.com/document/d/110F72d4NTv80p35wDSONxhhPBqWRwbZXG4f9mNEMd98/edit#heading=h.9gaqqtox9jg6). - -We also plan on adding support for the Java feature server (e.g. the capability to call into the Go component and execute Java UDFs). - From 6411d1e0f526d1e56b87e322cf5377ed0892f709 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 11:09:59 -0700 Subject: [PATCH 05/11] Revert Signed-off-by: Kevin Zhang --- sdk/python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index d68797b5ce..dbecd26e2b 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -108,7 +108,7 @@ CI_REQUIRED = ( [ - "cryptography==3.4.8", + "cryptography==3.3.2", "flake8", "black==19.10b0", "isort>=5", From 4c91cb6a4f585e5dbf64264a5147d1f7fea44de4 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 11:30:57 -0700 Subject: [PATCH 06/11] Fix Signed-off-by: Kevin Zhang --- docs/reference/feature-servers/go-feature-retrieval.md | 2 +- sdk/python/setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/reference/feature-servers/go-feature-retrieval.md b/docs/reference/feature-servers/go-feature-retrieval.md index 999a142c07..415817dd85 100644 --- a/docs/reference/feature-servers/go-feature-retrieval.md +++ b/docs/reference/feature-servers/go-feature-retrieval.md @@ -10,7 +10,7 @@ The Go Feature Retrieval component currently only supports Redis and Sqlite as o ## Installation -As long as you are running macOS or linux x86 with python version 3.7-3.10, the go component comes pre-compiled when you run install feast. +As long as you are running macOS or linux, on x86, with python version 3.7-3.10, the go component comes pre-compiled when you install feast. For developers, if you want to build from source, run `make compile-go-lib` to build and compile the go server. diff --git a/sdk/python/setup.py b/sdk/python/setup.py index dbecd26e2b..d68797b5ce 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -108,7 +108,7 @@ CI_REQUIRED = ( [ - "cryptography==3.3.2", + "cryptography==3.4.8", "flake8", "black==19.10b0", "isort>=5", From 85770de6740114879b0b7cbeb74d68c2504816ae Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 17:26:38 -0700 Subject: [PATCH 07/11] Update comment Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index cb6e874f8a..f5a9020c58 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -1,3 +1,5 @@ +import os + from datetime import datetime from typing import Callable, List, Optional, Tuple, Union @@ -6,6 +8,7 @@ import pyarrow import pytz from pydantic.typing import Literal +from regex import P from feast import FileSource, OnDemandFeatureView from feast.data_source import DataSource @@ -299,17 +302,24 @@ def evaluate_offline_job(): if created_timestamp_column else [event_timestamp_column] ) + # try-catch block is added to deal with this issue https://github.com/dask/dask/issues/8939. + # will remove once a fix is added. + try: + if created_timestamp_column: + source_df = source_df.sort_values(by=created_timestamp_column,) + + source_df = source_df.sort_values(by=event_timestamp_column) - if created_timestamp_column: - source_df = source_df.sort_values(by=created_timestamp_column) + except ZeroDivisionError: + if created_timestamp_column: + source_df = source_df.sort_values(by=created_timestamp_column, npartitions=1) - source_df = source_df.sort_values(by=event_timestamp_column) + source_df = source_df.sort_values(by=event_timestamp_column, npartitions=1) source_df = source_df[ (source_df[event_timestamp_column] >= start_date) & (source_df[event_timestamp_column] < end_date) ] - source_df = source_df.persist() columns_to_extract = set( @@ -323,6 +333,7 @@ def evaluate_offline_job(): source_df[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL columns_to_extract.add(DUMMY_ENTITY_ID) + print(source_df.head()) source_df = source_df.persist() return source_df[list(columns_to_extract)].persist() @@ -386,8 +397,12 @@ def _read_datasource(data_source) -> dd.DataFrame: if data_source.file_options.s3_endpoint_override else None ) + print(os.path.abspath(data_source.path)) - return dd.read_parquet(data_source.path, storage_options=storage_options,) + df = dd.read_parquet(data_source.path, storage_options=storage_options,) + print("Asdfasdf") + print(df.npartitions) + return df def _field_mapping( From b05d46c9d6c4ea919ff3268d9c42bfd97982801a Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 17:30:47 -0700 Subject: [PATCH 08/11] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index f5a9020c58..5632f6b6cd 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -1,5 +1,3 @@ -import os - from datetime import datetime from typing import Callable, List, Optional, Tuple, Union @@ -8,7 +6,6 @@ import pyarrow import pytz from pydantic.typing import Literal -from regex import P from feast import FileSource, OnDemandFeatureView from feast.data_source import DataSource @@ -312,9 +309,13 @@ def evaluate_offline_job(): except ZeroDivisionError: if created_timestamp_column: - source_df = source_df.sort_values(by=created_timestamp_column, npartitions=1) + source_df = source_df.sort_values( + by=created_timestamp_column, npartitions=1 + ) - source_df = source_df.sort_values(by=event_timestamp_column, npartitions=1) + source_df = source_df.sort_values( + by=event_timestamp_column, npartitions=1 + ) source_df = source_df[ (source_df[event_timestamp_column] >= start_date) @@ -397,11 +398,8 @@ def _read_datasource(data_source) -> dd.DataFrame: if data_source.file_options.s3_endpoint_override else None ) - print(os.path.abspath(data_source.path)) df = dd.read_parquet(data_source.path, storage_options=storage_options,) - print("Asdfasdf") - print(df.npartitions) return df From dbf3333ef8e62fef1949c7a53738d5dabc7bfa96 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 17:33:55 -0700 Subject: [PATCH 09/11] Fix Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 6 +++--- sdk/python/setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 5632f6b6cd..30ccec387b 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -308,6 +308,8 @@ def evaluate_offline_job(): source_df = source_df.sort_values(by=event_timestamp_column) except ZeroDivisionError: + # Use 1 partition to get around case where everything in timestamp column is the same so the partition algorithm doesn't + # try to divide by zero. if created_timestamp_column: source_df = source_df.sort_values( by=created_timestamp_column, npartitions=1 @@ -334,7 +336,6 @@ def evaluate_offline_job(): source_df[DUMMY_ENTITY_ID] = DUMMY_ENTITY_VAL columns_to_extract.add(DUMMY_ENTITY_ID) - print(source_df.head()) source_df = source_df.persist() return source_df[list(columns_to_extract)].persist() @@ -399,8 +400,7 @@ def _read_datasource(data_source) -> dd.DataFrame: else None ) - df = dd.read_parquet(data_source.path, storage_options=storage_options,) - return df + return dd.read_parquet(data_source.path, storage_options=storage_options,) def _field_mapping( diff --git a/sdk/python/setup.py b/sdk/python/setup.py index d68797b5ce..dbecd26e2b 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -108,7 +108,7 @@ CI_REQUIRED = ( [ - "cryptography==3.4.8", + "cryptography==3.3.2", "flake8", "black==19.10b0", "isort>=5", From dc279c3835bd18b6977bf3b380faaae1b7df9af2 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 17:38:52 -0700 Subject: [PATCH 10/11] Revert indent Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 30ccec387b..4a16be0d13 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -323,6 +323,7 @@ def evaluate_offline_job(): (source_df[event_timestamp_column] >= start_date) & (source_df[event_timestamp_column] < end_date) ] + source_df = source_df.persist() columns_to_extract = set( From 6a750225370134db605fc7ac21770725a40b4232 Mon Sep 17 00:00:00 2001 From: Kevin Zhang Date: Thu, 14 Apr 2022 17:39:38 -0700 Subject: [PATCH 11/11] fix comment Signed-off-by: Kevin Zhang --- sdk/python/feast/infra/offline_stores/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 4a16be0d13..a7d8b25abf 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -300,7 +300,7 @@ def evaluate_offline_job(): else [event_timestamp_column] ) # try-catch block is added to deal with this issue https://github.com/dask/dask/issues/8939. - # will remove once a fix is added. + # TODO(kevjumba): remove try catch when fix is merged upstream in Dask. try: if created_timestamp_column: source_df = source_df.sort_values(by=created_timestamp_column,)