From d8bd8cf4ca851e00818ee554d32cf87471f95913 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sat, 6 Nov 2021 14:24:05 +0000 Subject: [PATCH] GitBook: [#332] Updating roadmap and adding stream push API docs --- docs/SUMMARY.md | 1 + docs/how-to-guides/adding-or-reusing-tests.md | 60 +++++++++---------- docs/reference/alpha-stream-ingestion.md | 45 ++++++++++++++ docs/roadmap.md | 7 ++- 4 files changed, 80 insertions(+), 33 deletions(-) create mode 100644 docs/reference/alpha-stream-ingestion.md diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 9924f7a268..987a432ac9 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -72,6 +72,7 @@ * [feature\_store.yaml](reference/feature-repository/feature-store-yaml.md) * [.feastignore](reference/feature-repository/feast-ignore.md) * [\[Alpha\] On demand feature view](reference/alpha-on-demand-feature-view.md) +* [\[Alpha\] Stream ingestion](reference/alpha-stream-ingestion.md) * [\[Alpha\] Local feature server](reference/feature-server.md) * [\[Alpha\] AWS Lambda feature server](reference/alpha-aws-lambda-feature-server.md) * [Feast CLI reference](reference/feast-cli-commands.md) diff --git a/docs/how-to-guides/adding-or-reusing-tests.md b/docs/how-to-guides/adding-or-reusing-tests.md index 12f8e9b19c..1730abe209 100644 --- a/docs/how-to-guides/adding-or-reusing-tests.md +++ b/docs/how-to-guides/adding-or-reusing-tests.md @@ -17,35 +17,35 @@ $ tree . 
├── e2e -│   └── test_universal_e2e.py +│ └── test_universal_e2e.py ├── feature_repos -│   ├── repo_configuration.py -│   └── universal -│   ├── data_source_creator.py -│   ├── data_sources -│   │   ├── bigquery.py -│   │   ├── file.py -│   │   └── redshift.py -│   ├── entities.py -│   └── feature_views.py +│ ├── repo_configuration.py +│ └── universal +│ ├── data_source_creator.py +│ ├── data_sources +│ │ ├── bigquery.py +│ │ ├── file.py +│ │ └── redshift.py +│ ├── entities.py +│ └── feature_views.py ├── offline_store -│   ├── test_s3_custom_endpoint.py -│   └── test_universal_historical_retrieval.py +│ ├── test_s3_custom_endpoint.py +│ └── test_universal_historical_retrieval.py ├── online_store -│   ├── test_e2e_local.py -│   ├── test_feature_service_read.py -│   ├── test_online_retrieval.py -│   └── test_universal_online.py +│ ├── test_e2e_local.py +│ ├── test_feature_service_read.py +│ ├── test_online_retrieval.py +│ └── test_universal_online.py ├── registration -│   ├── test_cli.py -│   ├── test_cli_apply_duplicated_featureview_names.py -│   ├── test_cli_chdir.py -│   ├── test_feature_service_apply.py -│   ├── test_feature_store.py -│   ├── test_inference.py -│   ├── test_registry.py -│   ├── test_universal_odfv_feature_inference.py -│   └── test_universal_types.py +│ ├── test_cli.py +│ ├── test_cli_apply_duplicated_featureview_names.py +│ ├── test_cli_chdir.py +│ ├── test_feature_service_apply.py +│ ├── test_feature_store.py +│ ├── test_inference.py +│ ├── test_registry.py +│ ├── test_universal_odfv_feature_inference.py +│ └── test_universal_types.py └── scaffolding ├── test_init.py ├── test_partial_apply.py @@ -148,30 +148,30 @@ The key fixtures are the `environment` and `universal_data_sources` fixtures, wh ## Writing a new test or reusing existing tests -To add a new test to an existing test file: +### To add a new test to an existing test file * Use the same function signatures as an existing test (e.g. 
use `environment` as an argument) to include the relevant test fixtures. * If possible, expand an individual test instead of writing a new test, due to the cost of standing up offline / online stores. -To test a new offline / online store from a plugin repo: +### To test a new offline / online store from a plugin repo * Install Feast in editable mode with `pip install -e`. * The core tests for offline / online store behavior are parametrized by the `FULL_REPO_CONFIGS` variable defined in `feature_repos/repo_configuration.py`. To overwrite this variable without modifying the Feast repo, create your own file that contains a `FULL_REPO_CONFIGS` (which will require adding a new `IntegrationTestRepoConfig` or two) and set the environment variable `FULL_REPO_CONFIGS_MODULE` to point to that file. Then the core offline / online store tests can be run with `make test-python-universal`. * See the [custom offline store demo](https://github.com/feast-dev/feast-custom-offline-store-demo) and the [custom online store demo](https://github.com/feast-dev/feast-custom-online-store-demo) for examples. -To include a new offline / online store in the main Feast repo: +### To include a new offline / online store in the main Feast repo * Extend `data_source_creator.py` for your offline store. * In `repo_configuration.py` add a new`IntegrationTestRepoConfig` or two (depending on how many online stores you want to test). * Run the full test suite with `make test-python-integration.` -To include a new online store: +### To include a new online store * In `repo_configuration.py` add a new config that maps to a serialized version of configuration you need in `feature_store.yaml` to setup the online store. * In `repo_configuration.py`, add new`IntegrationTestRepoConfig` for offline stores you want to test. 
* Run the full test suite with `make test-python-integration` -To use custom data in a new test: +### To use custom data in a new test * Check `test_universal_types.py` for an example of how to do this. diff --git a/docs/reference/alpha-stream-ingestion.md b/docs/reference/alpha-stream-ingestion.md new file mode 100644 index 0000000000..ae7f81d080 --- /dev/null +++ b/docs/reference/alpha-stream-ingestion.md @@ -0,0 +1,45 @@ +# \[Alpha] Stream ingestion + +**Warning**: This is an _experimental_ feature. It's intended for early testing and feedback, and could change without warning in future releases. + +{% hint style="info" %} +To enable this feature, run **`feast alpha enable direct_ingest_to_online_store`** +{% endhint %} + +## Overview + +Streaming data sources are important sources of feature values. A typical setup with streaming data looks like: + +1. Raw events come in (stream 1) +2. Streaming transformations applied (e.g. `last_N_purchased_categories`) (stream 2) +3. Write stream 2 values to an offline store as a historical log for training +4. Write stream 2 values to an online store for low-latency feature serving +5. Periodically materialize feature values from the offline store into the online store for improved correctness + +Feast now allows users to push features previously registered in a feature view to the online store. This most commonly would be done from a stream processing job (e.g. a Beam or Spark Streaming job). Future versions of Feast will allow writing features directly to the offline store as well. + +## Example + +See [https://github.com/feast-dev/feast-demo](https://github.com/feast-dev/on-demand-feature-views-demo) for an example of how to ingest stream data into Feast. + +We register a feature view as normal, and during stream processing (e.g. 
Kafka consumers), now we push a dataframe matching the feature view schema: + +```python +event_df = pd.DataFrame.from_dict( + { + "driver_id": [1001], + "event_timestamp": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "created": [ + datetime(2021, 5, 13, 10, 59, 42), + ], + "conv_rate": [1.0], + "acc_rate": [1.0], + "avg_daily_trips": [1000], + } +) +store.write_to_online_store("driver_hourly_stats", event_df) +``` + +Feast will coordinate between pushed stream data and regular materialization jobs to ensure only the latest feature values are written to the online store. This ensures correctness in served features for model inference. diff --git a/docs/roadmap.md b/docs/roadmap.md index e74188e763..c56cae9849 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -13,7 +13,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Synapse source (community plugin)](https://github.com/Azure/feast-azure) * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (community plugin)](https://github.com/nossrannug/feast-postgres) - * [ ] Kafka source (Planned for Q4 2021) + * [x] Kafka source (with [push support into the online store](reference/alpha-stream-ingestion.md)) * [ ] Snowflake source (Planned for Q4 2021) * [ ] HTTP source * **Offline Stores** @@ -38,7 +38,8 @@ The list below contains the functionality that contributors are planning to deve * [ ] Cassandra * **Streaming** * [x] [Custom streaming ingestion job support](https://docs.feast.dev/how-to-guides/creating-a-custom-provider) - * [ ] Streaming ingestion on AWS (Planned for Q4 2021) + * [x] [Push based streaming data ingestion](reference/alpha-stream-ingestion.md) + * [ ] Streaming ingestion on AWS * [ ] Streaming ingestion on GCP * **Feature Engineering** * [x] On-demand Transformations (Alpha release. 
See [RFC](https://docs.google.com/document/d/1lgfIw0Drc65LpaxbUu49RCeJgMew547meSJttnUqz7c/edit#)) @@ -53,9 +54,9 @@ The list below contains the functionality that contributors are planning to deve * [x] Python Client * [x] REST Feature Server (Python) (Alpha release. See [RFC](https://docs.google.com/document/d/1iXvFhAsJ5jgAhPOpTdB3j-Wj1S9x3Ev\_Wr6ZpnLzER4/edit)) * [x] gRPC Feature Server (Java) (See [#1497](https://github.com/feast-dev/feast/issues/1497)) + * [x] Push API * [ ] Java Client * [ ] Go Client - * [ ] Push API * [ ] Delete API * [ ] Feature Logging (for training) * **Data Quality Management**