From 90c9786e0598f3ed037be2003aa1e24b3c15d043 Mon Sep 17 00:00:00 2001 From: Willem Pienaar <6728866+woop@users.noreply.github.com> Date: Sun, 2 Feb 2020 15:39:48 +0800 Subject: [PATCH] Deduplicate example notebooks (#456) * Deduplicate example notebooks * Merge docker-compose.yml for both batch and online serving. --- examples/basic/basic.ipynb | 256 ++++++-- infra/docker-compose/.env.sample | 24 +- infra/docker-compose/docker-compose.batch.yml | 25 - infra/docker-compose/docker-compose.yml | 38 +- .../jupyter/features/cust_trans_fs.yaml | 11 - .../features/cust_trans_fs_updated.yaml | 13 - .../notebooks/feast-batch-serving.ipynb | 504 ---------------- .../jupyter/notebooks/feast-quickstart.ipynb | 569 ------------------ infra/docker/jupyter/Dockerfile | 3 - infra/docker/jupyter/Dockerfile.dev | 8 - sdk/python/setup.py | 9 +- 11 files changed, 242 insertions(+), 1218 deletions(-) delete mode 100644 infra/docker-compose/docker-compose.batch.yml delete mode 100644 infra/docker-compose/jupyter/features/cust_trans_fs.yaml delete mode 100644 infra/docker-compose/jupyter/features/cust_trans_fs_updated.yaml delete mode 100644 infra/docker-compose/jupyter/notebooks/feast-batch-serving.ipynb delete mode 100644 infra/docker-compose/jupyter/notebooks/feast-quickstart.ipynb delete mode 100644 infra/docker/jupyter/Dockerfile delete mode 100644 infra/docker/jupyter/Dockerfile.dev diff --git a/examples/basic/basic.ipynb b/examples/basic/basic.ipynb index 49658b4235..94fc82f2ce 100644 --- a/examples/basic/basic.ipynb +++ b/examples/basic/basic.ipynb @@ -15,15 +15,15 @@ "1. Create a synthetic customer feature dataset\n", "2. Register a feature set to represent these features in Feast\n", "3. Ingest these features into Feast\n", - "4. Create a feature query and retrieve historical feature data\n", - "5. Create a feature query and retrieve online feature data" + "4. Create a feature query and retrieve online feature data\n", + "5. 
Create a feature query and retrieve historical feature data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 1. Clone Feast and install all dependencies" + "### 0. Configuration" ] }, { @@ -32,9 +32,79 @@ "metadata": {}, "outputs": [], "source": [ - "!git clone https://github.com/gojek/feast.git \\\n", - "&& cd feast/sdk/python/ && pip install --upgrade --quiet -e . \\\n", - "&& pip install --quiet --upgrade pandas numpy protobuf" + "import os\n", + "\n", + "# Feast Core acts as the central feature registry\n", + "FEAST_CORE_URL = os.getenv('FEAST_CORE_URL', 'core:6565')\n", + "\n", + "# Feast Online Serving allows for the retrieval of real-time feature data\n", + "FEAST_ONLINE_SERVING_URL = os.getenv('FEAST_ONLINE_SERVING_URL', 'online-serving:6566')\n", + "\n", + "# Feast Batch Serving allows for the retrieval of historical feature data\n", + "FEAST_BATCH_SERVING_URL = os.getenv('FEAST_BATCH_SERVING_URL', 'batch-serving:6567')\n", + "\n", + "# PYTHON_REPOSITORY_PATH is the path to the Python SDK inside the Feast Git Repo\n", + "PYTHON_REPOSITORY_PATH = os.getenv('PYTHON_REPOSITORY_PATH', '../../')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Install Feast SDK" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install from PyPI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --ignore-installed --upgrade feast" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Alternative) Install from local repository" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "os.environ['PYTHON_SDK_PATH'] = os.path.join(PYTHON_REPOSITORY_PATH, 'sdk/python')\n", + "sys.path.append(os.environ['PYTHON_SDK_PATH'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!echo $PYTHON_SDK_PATH" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python -m pip install --ignore-installed --upgrade -e ${PYTHON_SDK_PATH}" ] }, { @@ -66,7 +136,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Configure Feast services and connect the Feast client" + "### 3. 
Configure Feast services and connect the Feast client\n", + "\n", + "Connect to Feast Core and Feast Online Serving" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client = Client(core_url=FEAST_CORE_URL, serving_url=FEAST_ONLINE_SERVING_URL)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a project workspace" ] }, { @@ -75,9 +163,14 @@ "metadata": {}, "outputs": [], "source": [ - "CORE_URL = 'localhost:6565'\n", - "ONLINE_SERVING_URL = 'localhost:6566'\n", - "BATCH_SERVING_URL = 'localhost:6567'" + "client.create_project('customer_project')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the active project" ] }, { @@ -86,8 +179,6 @@ "metadata": {}, "outputs": [], "source": [ - "client = Client(core_url=CORE_URL, serving_url=BATCH_SERVING_URL) # Connect to Feast Core\n", - "client.create_project('customer_project')\n", "client.set_project('customer_project')" ] }, @@ -95,7 +186,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 4. Create synthetic customer features" + "### 4. Create customer features" ] }, { @@ -132,7 +223,7 @@ " }\n", ")\n", "\n", - "print(customer_features.head(10))" + "print(customer_features.head(500))" ] }, { @@ -147,9 +238,7 @@ "metadata": {}, "source": [ "Now we will create a feature set for these features. Feature sets are essentially a schema that represent\n", - "feature values. Feature sets allow Feast to both identify feature values and their structure. \n", - "\n", - "In this case we need to define any entity columns as well as the maximum age. The entity column in this case is \"customer_id\". Max age is set to 1 day (defined in seconds). This means that for each feature query during retrieval, the serving API will only retrieve features up to a maximum of 1 day per provided timestamp and entity combination. " + "feature values. 
Feature sets allow Feast to both identify feature values and their structure. The following feature set contains no features yet." ] }, { @@ -160,8 +249,8 @@ "source": [ "customer_fs = FeatureSet(\n", " \"customer_transactions\",\n", - " max_age=Duration(seconds=86400),\n", - " entities=[Entity(name='customer_id', dtype=ValueType.INT64)]\n", + " entities=[Entity(name='customer_id', dtype=ValueType.INT64)],\n", + " max_age=Duration(seconds=432000) \n", ")" ] }, @@ -169,7 +258,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we are automatically inferring the schema from the provided dataset" + "Here we are automatically inferring the schema from the provided dataset. The two features from the dataset will be added to the feature set" ] }, { @@ -241,16 +330,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 8. Create a batch retrieval query" + "### 8. Retrieve online features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In order to retrieve historical feature data, the user must provide an entity_rows dataframe. This dataframe contains a combination of timestamps and entities. In this case, the user must provide both customer_ids and timestamps. \n", - "\n", - "We will randomly generate timestamps over the last 30 days, and assign customer_ids to them. When these entity rows are sent to the Feast Serving API to retrieve feature values, along with a list of feature ids, Feast is then able to attach the correct feature values to each entity row. The one exception is if the feature values fall outside of the maximum age window." + "The process of retrieving features from the online API is very similar to that of the batch API. 
The only major difference is that users do not have to provide timestamps (only the latest features are returned, as long as they are within the maximum age window)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below retrieves online features for a single customer: \"1001\". It is possible to retrieve any features from feast, even outside of the current project." ] }, { @@ -259,23 +353,51 @@ "metadata": {}, "outputs": [], "source": [ - "event_timestamps = [datetime.utcnow().replace(tzinfo=utc) - timedelta(days=randrange(15), hours=randrange(24), minutes=randrange(60)) for day in range(30)]\n", - "\n", - "entity_rows = pd.DataFrame(\n", - " {\n", - " \"datetime\": event_timestamps,\n", - " \"customer_id\": [customers[idx % len(customers)] for idx in range(len(event_timestamps))],\n", - " }\n", + "online_features = client.get_online_features(\n", + " feature_refs=[\n", + " f\"daily_transactions\",\n", + " f\"total_transactions\",\n", + " ],\n", + " entity_rows=[\n", + " GetOnlineFeaturesRequest.EntityRow(\n", + " fields={\n", + " \"customer_id\": Value(\n", + " int64_val=1001)\n", + " }\n", + " )\n", + " ],\n", ")\n", - "\n", - "print(entity_rows.head(10))" + "print(online_features)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 9. Retrieve historical/batch features" + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The following section requires Google Cloud Platform (Google Cloud Storage and BigQuery)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 9. Create a batch retrieval query" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In order to retrieve historical feature data, the user must provide an entity_rows dataframe. This dataframe contains a combination of timestamps and entities. In this case, the user must provide both customer_ids and timestamps. 
\n", + "\n", + "We will randomly generate 30 timestamps spread over the last 15 days, and assign customer_ids to them. When these entity rows are sent to the Feast Serving API to retrieve feature values, along with a list of feature ids, Feast is then able to attach the correct feature values to each entity row. " ] }, { @@ -284,29 +406,30 @@ "metadata": {}, "outputs": [], "source": [ - "job = client.get_batch_features(\n", - " feature_refs=[\n", - " f\"daily_transactions\", \n", - " f\"total_transactions\", \n", - " ],\n", - " entity_rows=entity_rows\n", - " )\n", - "df = job.to_dataframe()\n", - "print(df.head(10))" + "event_timestamps = [datetime.utcnow().replace(tzinfo=utc) - timedelta(days=randrange(15), hours=randrange(24), minutes=randrange(60)) for day in range(30)]\n", + "\n", + "entity_rows = pd.DataFrame(\n", + " {\n", + " \"datetime\": event_timestamps,\n", + " \"customer_id\": [customers[idx % len(customers)] for idx in range(len(event_timestamps))],\n", + " }\n", + ")\n", + "\n", + "print(entity_rows.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 10. Retrieve online features" + "### 10. Retrieve historical/batch features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The process of retrieving features from the online API is very similar to that of the batch API. The only major difference is that users do not have to provide timestamps (only the latest features are returned, as long as they are within the maximum age window)" + "Next we will create a new client object, but this time we will configure it to connect to the Batch Serving Service. This service will allow us to retrieve historical feature data." 
] }, { @@ -315,37 +438,39 @@ "metadata": {}, "outputs": [], "source": [ - "online_client = Client(core_url=CORE_URL, serving_url=ONLINE_SERVING_URL)\n", - "online_client.set_project(\"customer_project\")" + "batch_client = Client(core_url=FEAST_CORE_URL, serving_url=FEAST_BATCH_SERVING_URL)\n", + "batch_client.set_project(\"customer_project\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The example below retrieves online features for a single customer: \"1001\"" + "By calling the `get_batch_features` method we are able to retrieve a `job` object for the exporting of feature data. For every entity and timestamp combination in `entity_rows` we will be receiving a row with feature values joined to it." ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ - "online_features = online_client.get_online_features(\n", - " feature_refs=[\n", - " f\"daily_transactions\",\n", - " f\"total_transactions\",\n", - " ],\n", - " entity_rows=[\n", - " GetOnlineFeaturesRequest.EntityRow(\n", - " fields={\n", - " \"customer_id\": Value(\n", - " int64_val=1001)\n", - " }\n", - " )\n", - " ],\n", - ")" + "job = batch_client.get_batch_features(\n", + " feature_refs=[\n", + " f\"customer_project/daily_transactions\", \n", + " f\"customer_project/total_transactions\", \n", + " ],\n", + " entity_rows=entity_rows\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the job is complete, it is possible to retrieve the exported data (from Google Cloud Storage) and load it into memory as a Pandas Dataframe." 
] }, { @@ -354,7 +479,8 @@ "metadata": {}, "outputs": [], "source": [ - "print(online_features)" + "df = job.to_dataframe()\n", + "print(df.head(10))" ] } ], @@ -374,7 +500,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.4" }, "pycharm": { "stem_cell": { diff --git a/infra/docker-compose/.env.sample b/infra/docker-compose/.env.sample index e14bde2772..c8652e8fe0 100644 --- a/infra/docker-compose/.env.sample +++ b/infra/docker-compose/.env.sample @@ -1,19 +1,21 @@ +# General COMPOSE_PROJECT_NAME=feast - FEAST_VERSION=latest +# Feast Core FEAST_CORE_IMAGE=gcr.io/kf-feast/feast-core -FEAST_CORE_CONFIG=direct-runner -FEAST_CORE_GCP_SERVICE_ACCOUNT_KEY=placeholder +FEAST_CORE_CONFIG=direct-runner.yml +FEAST_CORE_GCP_SERVICE_ACCOUNT_KEY=placeholder.json +# Feast Serving FEAST_SERVING_IMAGE=gcr.io/kf-feast/feast-serving -FEAST_ONLINE_SERVING_CONFIG=online-serving -FEAST_ONLINE_STORE_CONFIG=redis-store -FEAST_BATCH_SERVING_CONFIG=batch-serving -FEAST_BATCH_STORE_CONFIG=bq-store -FEAST_BATCH_SERVING_GCP_SERVICE_ACCOUNT_KEY=placeholder -FEAST_JOB_STAGING_LOCATION=gs://your-gcp-project/bucket +FEAST_ONLINE_SERVING_CONFIG=online-serving.yml +FEAST_ONLINE_STORE_CONFIG=redis-store.yml +FEAST_BATCH_SERVING_CONFIG=batch-serving.yml +FEAST_BATCH_STORE_CONFIG=bq-store.yml +FEAST_BATCH_SERVING_GCP_SERVICE_ACCOUNT_KEY=placeholder.json +FEAST_JOB_STAGING_LOCATION=gs://your-gcs-bucket/staging -FEAST_JUPYTER_IMAGE=gcr.io/kf-feast/feast-jupyter -FEAST_JUPYTER_GCP_SERVICE_ACCOUNT_KEY=placeholder +# Jupyter +FEAST_JUPYTER_GCP_SERVICE_ACCOUNT_KEY=placeholder.json diff --git a/infra/docker-compose/docker-compose.batch.yml b/infra/docker-compose/docker-compose.batch.yml deleted file mode 100644 index c00ac9475b..0000000000 --- a/infra/docker-compose/docker-compose.batch.yml +++ /dev/null @@ -1,25 +0,0 @@ -version: "3.7" - -services: - batch-serving: - image: ${FEAST_SERVING_IMAGE}:${FEAST_VERSION} - volumes: - - 
./serving/${FEAST_BATCH_SERVING_CONFIG}.yml:/etc/feast/application.yml - - ./serving/${FEAST_BATCH_STORE_CONFIG}.yml:/etc/feast/store.yml - - ./gcp-service-accounts/${FEAST_BATCH_SERVING_GCP_SERVICE_ACCOUNT_KEY}.json:/etc/gcloud/service-accounts/key.json - depends_on: - - core - - redis - ports: - - 6567:6567 - restart: on-failure - environment: - GOOGLE_APPLICATION_CREDENTIALS: /etc/gcloud/service-accounts/key.json - FEAST_JOB_STAGING_LOCATION: ${FEAST_JOB_STAGING_LOCATION} - command: - - "java" - - "-Xms1024m" - - "-Xmx1024m" - - "-jar" - - "/opt/feast/feast-serving.jar" - - "--spring.config.location=classpath:/application.yml,file:/etc/feast/application.yml" \ No newline at end of file diff --git a/infra/docker-compose/docker-compose.yml b/infra/docker-compose/docker-compose.yml index 44750650ce..27d82efc3c 100644 --- a/infra/docker-compose/docker-compose.yml +++ b/infra/docker-compose/docker-compose.yml @@ -4,8 +4,8 @@ services: core: image: ${FEAST_CORE_IMAGE}:${FEAST_VERSION} volumes: - - ./core/${FEAST_CORE_CONFIG}.yml:/etc/feast/application.yml - - ./gcp-service-accounts/${FEAST_CORE_GCP_SERVICE_ACCOUNT_KEY}.json:/etc/gcloud/service-accounts/key.json + - ./core/${FEAST_CORE_CONFIG}:/etc/feast/application.yml + - ./gcp-service-accounts/${FEAST_CORE_GCP_SERVICE_ACCOUNT_KEY}:/etc/gcloud/service-accounts/key.json environment: DB_HOST: db GOOGLE_APPLICATION_CREDENTIALS: /etc/gcloud/service-accounts/key.json @@ -24,8 +24,8 @@ services: online-serving: image: ${FEAST_SERVING_IMAGE}:${FEAST_VERSION} volumes: - - ./serving/${FEAST_ONLINE_SERVING_CONFIG}.yml:/etc/feast/application.yml - - ./serving/${FEAST_ONLINE_STORE_CONFIG}.yml:/etc/feast/store.yml + - ./serving/${FEAST_ONLINE_SERVING_CONFIG}:/etc/feast/application.yml + - ./serving/${FEAST_ONLINE_STORE_CONFIG}:/etc/feast/store.yml depends_on: - core - redis @@ -38,12 +38,34 @@ services: - /opt/feast/feast-serving.jar - --spring.config.location=classpath:/application.yml,file:/etc/feast/application.yml + 
batch-serving: + image: ${FEAST_SERVING_IMAGE}:${FEAST_VERSION} + volumes: + - ./serving/${FEAST_BATCH_SERVING_CONFIG}:/etc/feast/application.yml + - ./serving/${FEAST_BATCH_STORE_CONFIG}:/etc/feast/store.yml + - ./gcp-service-accounts/${FEAST_BATCH_SERVING_GCP_SERVICE_ACCOUNT_KEY}:/etc/gcloud/service-accounts/key.json + depends_on: + - core + - redis + ports: + - 6567:6567 + restart: on-failure + environment: + GOOGLE_APPLICATION_CREDENTIALS: /etc/gcloud/service-accounts/key.json + FEAST_JOB_STAGING_LOCATION: ${FEAST_JOB_STAGING_LOCATION} + command: + - "java" + - "-Xms1024m" + - "-Xmx1024m" + - "-jar" + - "/opt/feast/feast-serving.jar" + - "--spring.config.location=classpath:/application.yml,file:/etc/feast/application.yml" + jupyter: - image: ${FEAST_JUPYTER_IMAGE}:${FEAST_VERSION} + image: jupyter/datascience-notebook:latest volumes: - - ./jupyter/notebooks:/home/jovyan/feast-notebooks - - ./jupyter/features:/home/jovyan/features - - ./gcp-service-accounts/${FEAST_JUPYTER_GCP_SERVICE_ACCOUNT_KEY}.json:/etc/gcloud/service-accounts/key.json + - ../../:/home/jovyan/feast + - ./gcp-service-accounts/${FEAST_JUPYTER_GCP_SERVICE_ACCOUNT_KEY}:/etc/gcloud/service-accounts/key.json depends_on: - core - online-serving diff --git a/infra/docker-compose/jupyter/features/cust_trans_fs.yaml b/infra/docker-compose/jupyter/features/cust_trans_fs.yaml deleted file mode 100644 index eb21ce9b35..0000000000 --- a/infra/docker-compose/jupyter/features/cust_trans_fs.yaml +++ /dev/null @@ -1,11 +0,0 @@ -name: customer_transactions -kind: feature_set -entities: -- name: customer_id - valueType: INT64 -features: -- name: daily_transactions - valueType: FLOAT -- name: total_transactions - valueType: FLOAT -maxAge: 3600s \ No newline at end of file diff --git a/infra/docker-compose/jupyter/features/cust_trans_fs_updated.yaml b/infra/docker-compose/jupyter/features/cust_trans_fs_updated.yaml deleted file mode 100644 index 8293d04b88..0000000000 --- 
a/infra/docker-compose/jupyter/features/cust_trans_fs_updated.yaml +++ /dev/null @@ -1,13 +0,0 @@ -name: customer_transactions -kind: feature_set -entities: -- name: customer_id - valueType: INT64 -features: -- name: daily_transactions - valueType: FLOAT -- name: total_transactions - valueType: FLOAT -- name: discounts - valueType: FLOAT -maxAge: 3600s \ No newline at end of file diff --git a/infra/docker-compose/jupyter/notebooks/feast-batch-serving.ipynb b/infra/docker-compose/jupyter/notebooks/feast-batch-serving.ipynb deleted file mode 100644 index c288093f07..0000000000 --- a/infra/docker-compose/jupyter/notebooks/feast-batch-serving.ipynb +++ /dev/null @@ -1,504 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feast Batch Serving\n", - "This is an extension to `feast-quickstart` notebook to demonstrate the batch serving capability of Feast.\n", - "\n", - "## Prerequisite\n", - "- A running Feast Serving service with store configuration that supports batch retrieval. (eg. 
BigQuery store)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import feast\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime, timedelta\n", - "from feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest\n", - "from feast.types.Value_pb2 import Value as Value\n", - "from feast.client import Client\n", - "from feast.feature_set import FeatureSet" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "client = feast.Client(core_url=\"core:6565\", serving_url=\"batch-serving:6567\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "cust_trans_fs = FeatureSet.from_yaml(\"../features/cust_trans_fs.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature set updated/created: \"customer_transactions:1\".\n" - ] - } - ], - "source": [ - "client.apply(cust_trans_fs)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datetimecustomer_iddaily_transactionstotal_transactions
02019-12-06 02:17:46.899904100002.797627175.978266
12019-12-06 02:17:46.899915100014.931632153.871975
22019-12-06 02:17:46.899922100020.206628108.558844
32019-12-06 02:17:46.899929100032.354937119.549455
42019-12-06 02:17:46.899937100047.171423115.345183
\n", - "
" - ], - "text/plain": [ - " datetime customer_id daily_transactions \\\n", - "0 2019-12-06 02:17:46.899904 10000 2.797627 \n", - "1 2019-12-06 02:17:46.899915 10001 4.931632 \n", - "2 2019-12-06 02:17:46.899922 10002 0.206628 \n", - "3 2019-12-06 02:17:46.899929 10003 2.354937 \n", - "4 2019-12-06 02:17:46.899937 10004 7.171423 \n", - "\n", - " total_transactions \n", - "0 175.978266 \n", - "1 153.871975 \n", - "2 108.558844 \n", - "3 119.549455 \n", - "4 115.345183 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "offset = 10000\n", - "nr_of_customers = 5\n", - "customer_df = pd.DataFrame(\n", - " {\n", - " \"datetime\": [datetime.utcnow() for _ in range(nr_of_customers)],\n", - " \"customer_id\": [offset + inc for inc in range(nr_of_customers)],\n", - " \"daily_transactions\": [np.random.uniform(0, 10) for _ in range(nr_of_customers)],\n", - " \"total_transactions\": [np.random.uniform(100, 200) for _ in range(nr_of_customers)],\n", - " }\n", - ")\n", - "customer_df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 5/5 [00:00<00:00, 7.24rows/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Ingested 5 rows into customer_transactions:1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "client.ingest(cust_trans_fs, dataframe=customer_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Batch Retrieval\n", - "Batch retrieval takes a dataframe containing the entities column and event timestamp as an input. The result would be the outer join of the input and the features. The input dataframe needs to have a column named `datetime` as event timestamp. 
No results will be returned if the difference between the feature ingestion timestamp and the `event_timestamp` is greater than the `maxAge` parameter specified in the feature set." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_transactions_v1_feature_timestampcustomer_idevent_timestampcustomer_transactions_v1_daily_transactionscustomer_transactions_v1_total_transactions
02019-12-06 02:17:46+00:00100012019-12-06 02:17:55.612449+00:004.931632153.871980
12019-12-06 02:17:46+00:00100042019-12-06 02:17:55.612449+00:007.171423115.345184
22019-12-06 02:17:46+00:00100002019-12-06 02:17:55.612449+00:002.797627175.978270
32019-12-06 02:17:46+00:00100022019-12-06 02:17:55.612449+00:000.206628108.558846
42019-12-06 02:17:46+00:00100032019-12-06 02:17:55.612449+00:002.354937119.549450
\n", - "
" - ], - "text/plain": [ - " customer_transactions_v1_feature_timestamp customer_id \\\n", - "0 2019-12-06 02:17:46+00:00 10001 \n", - "1 2019-12-06 02:17:46+00:00 10004 \n", - "2 2019-12-06 02:17:46+00:00 10000 \n", - "3 2019-12-06 02:17:46+00:00 10002 \n", - "4 2019-12-06 02:17:46+00:00 10003 \n", - "\n", - " event_timestamp \\\n", - "0 2019-12-06 02:17:55.612449+00:00 \n", - "1 2019-12-06 02:17:55.612449+00:00 \n", - "2 2019-12-06 02:17:55.612449+00:00 \n", - "3 2019-12-06 02:17:55.612449+00:00 \n", - "4 2019-12-06 02:17:55.612449+00:00 \n", - "\n", - " customer_transactions_v1_daily_transactions \\\n", - "0 4.931632 \n", - "1 7.171423 \n", - "2 2.797627 \n", - "3 0.206628 \n", - "4 2.354937 \n", - "\n", - " customer_transactions_v1_total_transactions \n", - "0 153.871980 \n", - "1 115.345184 \n", - "2 175.978270 \n", - "3 108.558846 \n", - "4 119.549450 " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "entity_df = customer_df[[\"customer_id\"]].assign(datetime=datetime.utcnow())\n", - "feature_ids=[\n", - " \"customer_transactions:1:daily_transactions\",\n", - " \"customer_transactions:1:total_transactions\",\n", - "]\n", - "batch_job = client.get_batch_features(feature_ids, entity_df)\n", - "batch_job.to_dataframe()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
customer_transactions_v1_feature_timestampcustomer_idevent_timestampcustomer_transactions_v1_daily_transactionscustomer_transactions_v1_total_transactions
0None100002020-01-05 02:18:43.900732+00:00NoneNone
1None100012020-01-05 02:18:43.900732+00:00NoneNone
2None100022020-01-05 02:18:43.900732+00:00NoneNone
3None100032020-01-05 02:18:43.900732+00:00NoneNone
4None100042020-01-05 02:18:43.900732+00:00NoneNone
\n", - "
" - ], - "text/plain": [ - " customer_transactions_v1_feature_timestamp customer_id \\\n", - "0 None 10000 \n", - "1 None 10001 \n", - "2 None 10002 \n", - "3 None 10003 \n", - "4 None 10004 \n", - "\n", - " event_timestamp \\\n", - "0 2020-01-05 02:18:43.900732+00:00 \n", - "1 2020-01-05 02:18:43.900732+00:00 \n", - "2 2020-01-05 02:18:43.900732+00:00 \n", - "3 2020-01-05 02:18:43.900732+00:00 \n", - "4 2020-01-05 02:18:43.900732+00:00 \n", - "\n", - " customer_transactions_v1_daily_transactions \\\n", - "0 None \n", - "1 None \n", - "2 None \n", - "3 None \n", - "4 None \n", - "\n", - " customer_transactions_v1_total_transactions \n", - "0 None \n", - "1 None \n", - "2 None \n", - "3 None \n", - "4 None " - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "stale_entity_df = customer_df[[\"customer_id\"]].assign(datetime=datetime.utcnow() + timedelta(days=30))\n", - "feature_ids=[\n", - " \"customer_transactions:1:daily_transactions\",\n", - " \"customer_transactions:1:total_transactions\",\n", - "]\n", - "batch_job = client.get_batch_features(feature_ids, stale_entity_df)\n", - "batch_job.to_dataframe()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "metadata": { - "collapsed": false - }, - "source": [] - } - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/infra/docker-compose/jupyter/notebooks/feast-quickstart.ipynb b/infra/docker-compose/jupyter/notebooks/feast-quickstart.ipynb deleted file mode 100644 index b89e59b1e4..0000000000 --- a/infra/docker-compose/jupyter/notebooks/feast-quickstart.ipynb +++ 
/dev/null @@ -1,569 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feast Quick Start\n", - "This is a quick example to demonstrate:\n", - "- Register a feature set on Feast\n", - "- Ingest features into Feast\n", - "- Retrieve the ingested features from Feast\n", - "- Update a feature" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import feast\n", - "import numpy as np\n", - "import pandas as pd\n", - "from datetime import datetime\n", - "from feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest\n", - "from feast.types.Value_pb2 import Value as Value\n", - "from feast.client import Client\n", - "from feast.feature_set import FeatureSet" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, instantiate the client.\n", - "Feast endpoints can be set via the following environmental variables: `FEAST_CORE_URL`, `FEAST_SERVING_URL`.\n", - "Alternatively, they can also be passed in explicitly as follows:\n", - " \n", - "`client = feast.Client(core_url=core:6565, serving_url=online-serving:6566)`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "client = feast.Client()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Register a feature set\n", - "\n", - "Let's create and register our first feature set. 
Below is an example of a basic customer transactions feature set that has been exported to YAML:\n", - "```\n", - "name: customer_transactions\n", - "kind: feature_set\n", - "entities:\n", - "- name: customer_id\n", - " valueType: INT64\n", - "features:\n", - "- name: daily_transactions\n", - " valueType: FLOAT\n", - "- name: total_transactions\n", - " valueType: FLOAT\n", - "maxAge: 3600s \n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "cust_trans_fs = FeatureSet.from_yaml(\"../features/cust_trans_fs.yaml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Feature set updated/created: \"customer_transactions:1\".\n" - ] - } - ], - "source": [ - "client.apply(cust_trans_fs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ingest features into Feast\n", - "The dataframe below contains the features and entities of the above feature set." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datetimecustomer_iddaily_transactionstotal_transactions
02019-11-26 12:03:47.320634100005.178112110.670651
12019-11-26 12:03:47.320644100010.268114195.393913
22019-11-26 12:03:47.320651100021.486614136.929052
32019-11-26 12:03:47.320658100039.676433166.022999
42019-11-26 12:03:47.320665100045.928573165.687951
\n", - "
" - ], - "text/plain": [ - " datetime customer_id daily_transactions \\\n", - "0 2019-11-26 12:03:47.320634 10000 5.178112 \n", - "1 2019-11-26 12:03:47.320644 10001 0.268114 \n", - "2 2019-11-26 12:03:47.320651 10002 1.486614 \n", - "3 2019-11-26 12:03:47.320658 10003 9.676433 \n", - "4 2019-11-26 12:03:47.320665 10004 5.928573 \n", - "\n", - " total_transactions \n", - "0 110.670651 \n", - "1 195.393913 \n", - "2 136.929052 \n", - "3 166.022999 \n", - "4 165.687951 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "offset = 10000\n", - "nr_of_customers = 5\n", - "customer_df = pd.DataFrame(\n", - " {\n", - " \"datetime\": [datetime.utcnow() for _ in range(nr_of_customers)],\n", - " \"customer_id\": [offset + inc for inc in range(nr_of_customers)],\n", - " \"daily_transactions\": [np.random.uniform(0, 10) for _ in range(nr_of_customers)],\n", - " \"total_transactions\": [np.random.uniform(100, 200) for _ in range(nr_of_customers)],\n", - " }\n", - ")\n", - "customer_df" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/5 [00:00\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
datetimecustomer_iddaily_transactionstotal_transactionsdiscounts
02019-11-26 12:03:47.320634100005.178112110.6706518.389938
12019-11-26 12:03:47.320644100010.268114195.3939130.430047
22019-11-26 12:03:47.320651100021.486614136.9290527.408917
32019-11-26 12:03:47.320658100039.676433166.0229991.192721
42019-11-26 12:03:47.320665100045.928573165.6879512.051037
\n", - "" - ], - "text/plain": [ - " datetime customer_id daily_transactions \\\n", - "0 2019-11-26 12:03:47.320634 10000 5.178112 \n", - "1 2019-11-26 12:03:47.320644 10001 0.268114 \n", - "2 2019-11-26 12:03:47.320651 10002 1.486614 \n", - "3 2019-11-26 12:03:47.320658 10003 9.676433 \n", - "4 2019-11-26 12:03:47.320665 10004 5.928573 \n", - "\n", - " total_transactions discounts \n", - "0 110.670651 8.389938 \n", - "1 195.393913 0.430047 \n", - "2 136.929052 7.408917 \n", - "3 166.022999 1.192721 \n", - "4 165.687951 2.051037 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "discounts = [np.random.uniform(0, 10) for _ in range(nr_of_customers)]\n", - "customer_df_updated = customer_df.assign(discounts=discounts)\n", - "customer_df_updated" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/5 [00:00