Skip to content

Commit

Permalink
Data Science test scenarios (feast-dev#18)
Browse files Browse the repository at this point in the history
Performs ingestion of 20k items for 2 data science scenarios:

Product Computer Vision features (20K/day - generated once a day)
Product Text Attributes (20K/day - generated once a day)

Fixes KE-647
  • Loading branch information
algattik authored Jun 5, 2020
1 parent 88e43a8 commit 8c07d40
Show file tree
Hide file tree
Showing 4 changed files with 273 additions and 0 deletions.
6 changes: 6 additions & 0 deletions infra/scripts/test-docker-compose.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,11 @@ docker logs feast_jupyter_1
# Wait for Jupyter Notebook Container to come online.
# Expansions are quoted so a path or address containing whitespace or glob
# characters is passed through as a single argument.
"${PROJECT_ROOT_DIR}/infra/scripts/wait-for-it.sh" "${JUPYTER_DOCKER_CONTAINER_IP_ADDRESS}:8888" --timeout=300

# Wait for Feast Core to come online
docker exec feast_core_1 grpc-health-probe -addr :6565 -connect-timeout 300s

# Run e2e tests for Redis
docker exec feast_jupyter_1 bash -c 'cd feast/tests/e2e/ && pytest -s basic-ingest-redis-serving.py --core_url core:6565 --serving_url=online-serving:6566'

# Run ingestion tests for FF Data Science scenarios
docker exec feast_jupyter_1 bash -c 'cd feast/tests/ds_scenarios/ && pytest -s test-ingest.py --core_url core:6565 --serving_url=online-serving:6566'
7 changes: 7 additions & 0 deletions tests/ds_scenarios/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import pytest


def pytest_addoption(parser):
    """Register the command-line options of the data science scenario tests.

    Every option is stored as a plain string, so ``--allow_dirty`` defaults to
    the string ``"False"`` rather than to a boolean.
    """
    option_defaults = (
        ("--core_url", "localhost:6565"),
        ("--serving_url", "localhost:6566"),
        ("--allow_dirty", "False"),
    )
    for flag, fallback in option_defaults:
        parser.addoption(flag, action="store", default=fallback)
204 changes: 204 additions & 0 deletions tests/ds_scenarios/ds_example_feature_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import datetime
import numpy as np
import pandas as pd
from feast import Feature, FeatureSet, Entity, ValueType
from pytz import utc

"""
Examples of anticipated daily feature ingestion load:
Product Computer Vision features (20K/day - generated once a day), see create_product_image_features_df() below:
- CV1: 4 x 64 float
- CV2: 2 x 64 float
- CV3: 256 x float
- CV4: 8192 x float
Product Text Attributes (20K/day - generated once a day), see create_product_text_attributes_df() below:
- TX1-n: string or list of string
Fraud features: customer counts for different windows of time (15M throughout day):
- FR1-7: int
"""

# Entity keyed by an INT64 'product_id'. Not referenced by any other visible
# code: the product feature sets in this module build their own Entity.
product = Entity('product_id', ValueType.INT64)


# Feature set for the product computer-vision features: four DOUBLE_LIST
# features (cv1..cv4) keyed by an INT64 product_id.
PRODUCT_IMAGE_FEATURE_SET = FeatureSet(
    'product_image_features',
    entities=[Entity('product_id', ValueType.INT64)],
    features=[
        Feature(cv_name, ValueType.DOUBLE_LIST)
        for cv_name in ('cv1', 'cv2', 'cv3', 'cv4')
    ],
)


# Feature set for the product text attributes, keyed by an INT64 product_id.
# 'colours' and 'materials' hold lists of strings; every other attribute is a
# single string.
PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET = FeatureSet(
    'product_text_attributes',
    entities=[Entity('product_id', ValueType.INT64)],
    features=[
        Feature(attribute, value_type)
        for attribute, value_type in (
            ('brand', ValueType.STRING),
            ('brand-range', ValueType.STRING),
            ('colours', ValueType.STRING_LIST),
            ('footware', ValueType.STRING),
            ('heel-height', ValueType.STRING),
            ('heel-type', ValueType.STRING),
            ('materials', ValueType.STRING_LIST),
            ('sole-height', ValueType.STRING),
        )
    ],
)


# Feature set for the fraud counters: seven INT64 features named
# window_count1 .. window_count7, keyed by an INT64 customer_id.
FRAUD_COUNTS_FEATURE_SET = FeatureSet(
    'fraud_count_features',
    entities=[Entity('customer_id', ValueType.INT64)],
    features=[
        Feature('window_count{}'.format(window), ValueType.INT64)
        for window in range(1, 8)
    ],
)


# Pools of example values, one pool per product text attribute. Entries of
# 'colours' and 'materials' are themselves lists of strings; entries of every
# other pool are single strings.
PRODUCT_ATTRIBUTES = {
    'brand': [
        'Ash', 'PRADA', 'Birkenstock', 'Valentino Garavani', 'LEMAIRE',
        'R . M . Williams', 'Eres', 'Moschino', 'GIUSEPPE JUNIOR',
        'Montelpare Tradition', 'Moa Master Of Arts', "L ' Autre Chose",
        'See by Chloé', 'Y - 3', 'Nike',
    ],
    'brand-range': [
        'Gizeh', 'Gain', 'Stingray', 'Majolica', 'Glory', 'Revekka',
        'Diana Strass', 'Blackout', 'Air Jordan 1 High', 'Authentic',
        'Galore', 'Montecarlo Mondial',
    ],
    'colours': [
        ['black', 'white'], ['silver'], ['red'], ['light green'],
        ['bright orange'], ['tan brown'], ['yellow', 'green'],
        ['bright blue'], ['orange'], ['cinnamon brown'], ['hot - pink'],
        ['silver grey'], ['navy'],
    ],
    'footware': [
        'mules', 'sandals', 'flip - flops', 'sliders', 'ballerina shoes',
        'mule', 'slingbacks', 'derby shoes', 'school shoes', 'wedge shoes',
        'runner',
    ],
    # 0mm to 100mm inclusive, in 5mm steps.
    'heel-height': ['%dmm' % height for height in range(0, 101, 5)],
    'heel-type': [
        'sculpted', 'chunky', 'screw', 'stacked', 'discrete', 'slender',
        'collapsible', 'platform',
    ],
    'materials': [
        ['raffia'], ['rubber', 'polyester'], ['nylon'], ['silk'],
        ['patent sheepskin'], ['mesh'],
    ],
    # Same value range as 'heel-height', built as a separate list object.
    'sole-height': ['%dmm' % height for height in range(0, 101, 5)],
}


def create_cv1():
    """Generate a random 4 x 64 matrix in which only 25 columns are non-zero.

    The first 25 columns are filled with uniform samples from [0, 1) and the
    remaining columns are zeroed. The columns are then shuffled, so the
    populated ones end up at random positions shared by all rows.
    """
    matrix = np.random.random(size=(4, 64))
    matrix[:, 25:] = 0
    # Shuffling the transposed view in place permutes the columns of `matrix`.
    columns = matrix.T
    np.random.shuffle(columns)
    return matrix


def create_cv2():
    """Generate a random 2 x 64 matrix in which only 15 columns are non-zero.

    The first 15 columns are filled with uniform samples from [0, 1) and the
    remaining columns are zeroed. The columns are then shuffled, so the
    populated ones end up at random positions shared by both rows.
    """
    matrix = np.random.random(size=(2, 64))
    matrix[:, 15:] = 0
    # Shuffling the transposed view in place permutes the columns of `matrix`.
    columns = matrix.T
    np.random.shuffle(columns)
    return matrix


def create_cv3():
    """Generate a random unit-length vector of 256 float32 values.

    Components are drawn from a standard normal distribution, cast to float32
    and divided by the vector's Euclidean norm.
    """
    sample = np.random.randn(256).astype(np.float32)
    length = np.linalg.norm(sample)
    return sample / length


def create_cv4():
    """Generate a random unit-length vector of 8192 float64 values.

    Components are drawn from a standard normal distribution and divided by
    the vector's Euclidean norm.
    """
    sample = np.random.randn(8192)
    length = np.linalg.norm(sample)
    return sample / length


def create_product_image_features_df(initial_product_id=1, n=20000):
    """Build a DataFrame of random computer-vision features for ``n`` products.

    Args:
        initial_product_id: Id of the first product; ids are consecutive.
        n: Number of rows (products) to generate.

    Returns:
        A DataFrame with the columns ``datetime`` (the current UTC time, the
        same value in every row), ``product_id`` and ``cv1`` .. ``cv4``. The
        cv1 and cv2 matrices are flattened to one dimension.
    """
    now_utc = datetime.datetime.now(datetime.timezone.utc)
    product_ids = list(range(initial_product_id, initial_product_id + n))

    # Columns are generated one after another (all cv1 values first, then all
    # cv2 values, ...), the same order in which the random generator was
    # consumed before.
    cv1_column = [create_cv1().flatten() for _ in range(n)]
    cv2_column = [create_cv2().flatten() for _ in range(n)]
    cv3_column = [create_cv3() for _ in range(n)]
    cv4_column = [create_cv4() for _ in range(n)]

    columns = {
        'datetime': now_utc,
        'product_id': product_ids,
        'cv1': cv1_column,
        'cv2': cv2_column,
        'cv3': cv3_column,
        'cv4': cv4_column,
    }
    return pd.DataFrame(columns)


def create_product_text_attributes_df(initial_product_id=1, n=20000):
    """Build a DataFrame of random text attributes for ``n`` products.

    Each attribute value is drawn uniformly at random from the matching pool
    in ``PRODUCT_ATTRIBUTES``.

    Args:
        initial_product_id: Id of the first product; ids are consecutive.
        n: Number of rows (products) to generate.

    Returns:
        A DataFrame with the columns ``datetime`` (the current UTC time, the
        same value in every row), ``product_id`` and one column per text
        attribute. ``colours`` and ``materials`` cells hold lists of strings;
        all other attribute cells hold a single string.
    """
    def pick(options):
        # Index into the pool instead of calling np.random.choice(options):
        # choice() first converts the pool to an array, which fails for the
        # list-valued pools ('colours', 'materials'). Ragged nested lists are
        # rejected by newer NumPy, and equal-length ones would become a 2-D
        # array that choice() refuses.
        return options[np.random.randint(len(options))]

    dt = datetime.datetime.now(datetime.timezone.utc)
    columns = {
        'datetime': dt,
        'product_id': list(range(initial_product_id, initial_product_id + n)),
    }
    attribute_names = (
        'brand',
        'brand-range',
        'colours',
        'footware',
        'heel-height',
        'heel-type',
        'materials',
        'sole-height',
    )
    for attribute in attribute_names:
        options = PRODUCT_ATTRIBUTES[attribute]
        columns[attribute] = [pick(options) for _ in range(n)]
    return pd.DataFrame(columns)

56 changes: 56 additions & 0 deletions tests/ds_scenarios/test-ingest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import pytest
from feast.client import Client

import uuid

from ds_example_feature_data import (
PRODUCT_IMAGE_FEATURE_SET, create_product_image_features_df,
PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET, create_product_text_attributes_df,
)

# Feast project name for this test module: 'ds_' followed by six random
# upper-case hexadecimal digits, regenerated on every import.
PROJECT_NAME = 'ds_{}'.format(uuid.uuid4().hex[:6].upper())


@pytest.fixture(scope='module')
def core_url(pytestconfig):
    """Provide the value of the ``core_url`` command-line option."""
    address = pytestconfig.getoption("core_url")
    return address


@pytest.fixture(scope='module')
def serving_url(pytestconfig):
    """Provide the value of the ``serving_url`` command-line option."""
    address = pytestconfig.getoption("serving_url")
    return address


@pytest.fixture(scope='module')
def allow_dirty(pytestconfig):
    """Provide the ``allow_dirty`` command-line option as a boolean.

    The option arrives as a string; only a case-insensitive ``"true"`` yields
    ``True``, anything else yields ``False``.
    """
    raw_flag = pytestconfig.getoption("allow_dirty")
    return raw_flag.lower() == "true"


@pytest.fixture(scope='module')
def client(core_url, serving_url, allow_dirty):
    """Provide a Feast client with the test project created.

    Unless ``allow_dirty`` is set, the fixture fails when the project already
    has feature sets registered.

    Raises:
        Exception: If feature sets exist and ``allow_dirty`` is false.
    """
    # Get client for core and serving
    feast_client = Client(core_url=core_url, serving_url=serving_url)
    feast_client.create_project(PROJECT_NAME)

    if allow_dirty:
        return feast_client

    # Ensure Feast core is active, but empty
    registered = feast_client.list_feature_sets(project=PROJECT_NAME)
    if len(registered) > 0:
        raise Exception(
            "Feast cannot have existing feature sets registered. Exiting tests."
        )

    return feast_client


@pytest.mark.timeout(600)
@pytest.mark.parametrize(
    "data_frame_generator,feature_set",
    [
        (create_product_image_features_df, PRODUCT_IMAGE_FEATURE_SET),
        (create_product_text_attributes_df, PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET),
    ],
)
def test_ingestion(client, data_frame_generator, feature_set):
    """Apply a feature set, then ingest a freshly generated DataFrame into it.

    The test makes no assertions of its own; it passes when neither call
    raises within the timeout.
    """
    # Register the feature set before generating the data to ingest.
    client.apply(feature_set)
    client.ingest(feature_set, data_frame_generator())

0 comments on commit 8c07d40

Please sign in to comment.