Data Science test scenarios (feast-dev#18)

Performs ingestion of 20k items for each of 2 data science scenarios: (1) Product Computer Vision features (20K/day, generated once a day) and (2) Product Text Attributes (20K/day, generated once a day). Fixes KE-647.
Yanson · Jun 5, 2020 · 8c07d40 · 8c07d40
1 parent 88e43a8
commit 8c07d40
Show file tree

Hide file tree

Showing 4 changed files with 273 additions and 0 deletions.
diff --git a/infra/scripts/test-docker-compose.sh b/infra/scripts/test-docker-compose.sh
@@ -41,5 +41,11 @@ docker logs feast_jupyter_1
 # Wait for Jupyter Notebook Container to come online
 ${PROJECT_ROOT_DIR}/infra/scripts/wait-for-it.sh ${JUPYTER_DOCKER_CONTAINER_IP_ADDRESS}:8888 --timeout=300
 
+# Wait for Feast Core to come online
+docker exec feast_core_1 grpc-health-probe -addr :6565 -connect-timeout 300s
+
 # Run e2e tests for Redis
 docker exec feast_jupyter_1 bash -c 'cd feast/tests/e2e/ && pytest -s basic-ingest-redis-serving.py --core_url core:6565 --serving_url=online-serving:6566'
+
+# Run ingestion tests for FF Data Science scenarios
+docker exec feast_jupyter_1 bash -c 'cd feast/tests/ds_scenarios/ && pytest -s test-ingest.py --core_url core:6565 --serving_url=online-serving:6566'
diff --git a/tests/ds_scenarios/conftest.py b/tests/ds_scenarios/conftest.py
@@ -0,0 +1,7 @@
+import pytest
+
+
+def pytest_addoption(parser):
+    """Register the command-line options used by the data science scenario tests.
+
+    Each option is stored as a plain string. Note that ``--allow_dirty``
+    defaults to the string "False", not the boolean.
+    """
+    option_defaults = (
+        ("--core_url", "localhost:6565"),
+        ("--serving_url", "localhost:6566"),
+        ("--allow_dirty", "False"),
+    )
+    for flag, default_value in option_defaults:
+        parser.addoption(flag, action="store", default=default_value)
diff --git a/tests/ds_scenarios/ds_example_feature_data.py b/tests/ds_scenarios/ds_example_feature_data.py
@@ -0,0 +1,204 @@
+import datetime 
+import numpy as np
+import pandas as pd
+from feast import Feature, FeatureSet, Entity, ValueType
+from pytz import utc
+
+"""
+Examples of anticipated daily feature ingestion load:
+
+Product Computer Vision features (20K/day - generated once a day), see create_product_image_features_df() below:
+- CV1: 4 x 64 float
+- CV2: 2 x 64 float
+- CV3: 256 x float
+- CV4: 8192 x float
+
+Product Text Attributes (20K/day - generated once a day), see create_product_text_attributes_df() below: 
+- TX1-n: string or list of string
+
+Fraud features: customer counts for different windows of time (15M throughout day):
+- FR1-7: int
+"""
+
+# Stand-alone product entity keyed on an INT64 'product_id'. The feature
+# sets in this module construct their own Entity instances rather than
+# reusing this one.
+product = Entity('product_id', ValueType.INT64)
+
+
+# Feature set holding the four product computer-vision vectors (cv1-cv4),
+# each declared as a list of doubles and keyed by product_id.
+PRODUCT_IMAGE_FEATURE_SET = FeatureSet(
+    'product_image_features',
+    entities=[Entity('product_id', ValueType.INT64)],
+    features=[
+        Feature(feature_name, ValueType.DOUBLE_LIST)
+        for feature_name in ('cv1', 'cv2', 'cv3', 'cv4')
+    ],
+)
+
+
+# Feature set for the product text attributes, keyed by product_id.
+# 'colours' and 'materials' are string lists; every other attribute is a
+# single string.
+PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET = FeatureSet(
+    'product_text_attributes',
+    entities=[Entity('product_id', ValueType.INT64)],
+    features=[
+        Feature(attribute_name, attribute_type)
+        for attribute_name, attribute_type in (
+            ('brand', ValueType.STRING),
+            ('brand-range', ValueType.STRING),
+            ('colours', ValueType.STRING_LIST),
+            ('footware', ValueType.STRING),
+            ('heel-height', ValueType.STRING),
+            ('heel-type', ValueType.STRING),
+            ('materials', ValueType.STRING_LIST),
+            ('sole-height', ValueType.STRING),
+        )
+    ],
+)
+
+
+# Feature set for the fraud scenario: seven INT64 counters named
+# window_count1 .. window_count7, keyed by customer_id.
+FRAUD_COUNTS_FEATURE_SET = FeatureSet(
+    'fraud_count_features',
+    entities=[Entity('customer_id', ValueType.INT64)],
+    features=[
+        Feature('window_count{}'.format(window), ValueType.INT64)
+        for window in range(1, 8)
+    ],
+)
+
+
+# Pools of example values for each product text attribute.
+# create_product_text_attributes_df() draws one entry per product from each
+# pool. The keys match the feature names declared in
+# PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET (including the 'footware' spelling), so
+# they must not be renamed independently of that feature set.
+PRODUCT_ATTRIBUTES = {
+    # Single-string attribute.
+    'brand': [
+        'Ash',
+        'PRADA',
+        'Birkenstock',
+        'Valentino Garavani',
+        'LEMAIRE',
+        'R . M . Williams',
+        'Eres',
+        'Moschino',
+        'GIUSEPPE JUNIOR',
+        'Montelpare Tradition',
+        'Moa Master Of Arts',
+        "L ' Autre Chose",
+        'See by Chloé',
+        'Y - 3',
+        'Nike',
+    ],
+    # Single-string attribute.
+    'brand-range': [
+        'Gizeh',
+        'Gain',
+        'Stingray',
+        'Majolica',
+        'Glory',
+        'Revekka',
+        'Diana Strass',
+        'Blackout',
+        'Air Jordan 1 High',
+        'Authentic',
+        'Galore',
+        'Montecarlo Mondial',
+    ],
+    # Each entry is itself a list of strings (declared as STRING_LIST); the
+    # entries have differing lengths.
+    'colours': [
+        ['black', 'white'],
+        ['silver'],
+        ['red'],
+        ['light green'],
+        ['bright orange'],
+        ['tan brown'],
+        ['yellow', 'green'],
+        ['bright blue'],
+        ['orange'],
+        ['cinnamon brown'],
+        ['hot - pink'],
+        ['silver grey'],
+        ['navy'],
+    ],
+    # Single-string attribute.
+    'footware': [    	
+        'mules',
+        'sandals',
+        'flip - flops',
+        'sliders',
+        'ballerina shoes',
+        'mule',
+        'slingbacks',
+        'derby shoes',
+        'school shoes',
+        'wedge shoes',
+        'runner',
+    ],
+    # '0mm', '5mm', ..., '100mm' (5 mm steps).
+    'heel-height': ['{}mm'.format(x) for x in range(0, 101, 5)],
+    # Single-string attribute.
+    'heel-type': [
+        'sculpted',
+        'chunky',
+        'screw',
+        'stacked',
+        'discrete',
+        'slender',
+        'collapsible',
+        'platform',
+    ],
+    # Each entry is itself a list of strings (declared as STRING_LIST); the
+    # entries have differing lengths.
+    'materials': [
+        ['raffia'],
+        ['rubber', 'polyester'],
+        ['nylon'],
+        ['silk'],
+        ['patent sheepskin'],
+        ['mesh'],
+    ],
+    # '0mm', '5mm', ..., '100mm' (5 mm steps).
+    'sole-height': ['{}mm'.format(x) for x in range(0, 101, 5)],
+}
+
+
+def create_cv1():
+    """Return a random 4 x 64 float array in which only 25 columns are non-zero.
+
+    The array is filled with uniform samples, every column from index 25
+    onwards is zeroed, and the columns are then put into a random order.
+    """
+    features = np.random.random(size=(4, 64))
+    features[:, 25:] = 0
+    # shuffle() permutes along the first axis, so it is applied to the
+    # transposed view to reorder the columns of `features` in place.
+    transposed_view = features.T
+    np.random.shuffle(transposed_view)
+    return features
+
+
+def create_cv2():
+    """Return a random 2 x 64 float array in which only 15 columns are non-zero.
+
+    The array is filled with uniform samples, every column from index 15
+    onwards is zeroed, and the columns are then put into a random order.
+    """
+    features = np.random.random(size=(2, 64))
+    features[:, 15:] = 0
+    # shuffle() permutes along the first axis, so it is applied to the
+    # transposed view to reorder the columns of `features` in place.
+    transposed_view = features.T
+    np.random.shuffle(transposed_view)
+    return features
+
+
+def create_cv3():
+    """Return a random float32 vector of length 256 scaled to unit L2 norm."""
+    vector = np.random.randn(256).astype(np.float32)
+    length = np.linalg.norm(vector)
+    return vector / length
+
+
+def create_cv4():
+    """Return a random vector of length 8192 scaled to unit L2 norm."""
+    vector = np.random.randn(8192)
+    length = np.linalg.norm(vector)
+    return vector / length
+
+
+def create_product_image_features_df(initial_product_id=1, n=20000):
+    """Build a DataFrame of randomly generated product image features.
+
+    Args:
+        initial_product_id: First product id; ids run consecutively from here.
+        n: Number of products (rows) to generate.
+
+    Returns:
+        A pandas DataFrame with columns 'datetime' (one shared UTC timestamp
+        taken at call time), 'product_id', and 'cv1'..'cv4'. The cv1 and cv2
+        arrays are flattened to one dimension; cv3 and cv4 are stored as
+        returned by their generators.
+    """
+    ingestion_time = datetime.datetime.now(datetime.timezone.utc)
+    product_ids = list(range(initial_product_id, initial_product_id + n))
+
+    columns = {
+        'datetime': ingestion_time,
+        'product_id': product_ids,
+    }
+    # Each column is generated in full before the next one, cv1 first.
+    columns['cv1'] = [create_cv1().flatten() for _ in product_ids]
+    columns['cv2'] = [create_cv2().flatten() for _ in product_ids]
+    columns['cv3'] = [create_cv3() for _ in product_ids]
+    columns['cv4'] = [create_cv4() for _ in product_ids]
+    return pd.DataFrame(columns)
+
+
+def _sample_attribute_values(attribute, n):
+    """Return ``n`` values drawn uniformly, with replacement, from one attribute pool.
+
+    Random indices are sampled and used to index the Python list directly.
+    np.random.choice() is avoided because it first converts the pool to an
+    ndarray: the 'colours' and 'materials' pools are lists of lists with
+    differing lengths, which recent NumPy versions refuse to convert
+    (ValueError), and a pool of equal-length lists would become 2-D and be
+    rejected as not 1-dimensional.
+    """
+    pool = PRODUCT_ATTRIBUTES[attribute]
+    picks = np.random.randint(len(pool), size=n)
+    return [pool[index] for index in picks]
+
+
+def create_product_text_attributes_df(initial_product_id=1, n=20000):
+    """Build a DataFrame of randomly chosen product text attributes.
+
+    Args:
+        initial_product_id: First product id; ids run consecutively from here.
+        n: Number of products (rows) to generate.
+
+    Returns:
+        A pandas DataFrame with columns 'datetime' (one shared UTC timestamp
+        taken at call time), 'product_id', and one column per text attribute
+        whose values are drawn from PRODUCT_ATTRIBUTES. String attributes are
+        plain ``str`` values; 'colours' and 'materials' are lists of strings.
+    """
+    dt = datetime.datetime.now(datetime.timezone.utc)
+    columns = {
+        'datetime': dt,
+        'product_id': list(range(initial_product_id, initial_product_id + n)),
+    }
+    attribute_names = (
+        'brand',
+        'brand-range',
+        'colours',
+        'footware',
+        'heel-height',
+        'heel-type',
+        'materials',
+        'sole-height',
+    )
+    for attribute in attribute_names:
+        columns[attribute] = _sample_attribute_values(attribute, n)
+    return pd.DataFrame(columns)
+
diff --git a/tests/ds_scenarios/test-ingest.py b/tests/ds_scenarios/test-ingest.py
@@ -0,0 +1,56 @@
+import pytest
+from feast.client import Client
+
+import uuid
+
+from ds_example_feature_data import (
+    PRODUCT_IMAGE_FEATURE_SET, create_product_image_features_df,
+    PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET, create_product_text_attributes_df,
+)
+
+# Per-run project name: 'ds_' followed by the first six hex digits of a
+# random UUID, upper-cased.
+PROJECT_NAME = 'ds_{}'.format(uuid.uuid4().hex[:6].upper())
+
+
+@pytest.fixture(scope='module')
+def core_url(pytestconfig):
+    """Return the value of the ``core_url`` command-line option."""
+    url = pytestconfig.getoption("core_url")
+    return url
+
+
+@pytest.fixture(scope='module')
+def serving_url(pytestconfig):
+    """Return the value of the ``serving_url`` command-line option."""
+    url = pytestconfig.getoption("serving_url")
+    return url
+
+
+@pytest.fixture(scope='module')
+def allow_dirty(pytestconfig):
+    """Return True when the ``allow_dirty`` option equals "true" (any case), else False."""
+    raw_value = pytestconfig.getoption("allow_dirty")
+    return raw_value.lower() == "true"
+
+
+@pytest.fixture(scope='module')
+def client(core_url, serving_url, allow_dirty):
+    """Return a Feast client connected to core and serving, with PROJECT_NAME created.
+
+    Unless ``allow_dirty`` is set, the project must have no feature sets
+    registered.
+
+    Raises:
+        Exception: if ``allow_dirty`` is false and the project already lists
+            at least one feature set.
+    """
+    feast_client = Client(core_url=core_url, serving_url=serving_url)
+    feast_client.create_project(PROJECT_NAME)
+
+    # Dirty runs skip the emptiness check entirely.
+    if allow_dirty:
+        return feast_client
+
+    registered = feast_client.list_feature_sets(project=PROJECT_NAME)
+    if len(registered) == 0:
+        return feast_client
+
+    raise Exception(
+        "Feast cannot have existing feature sets registered. Exiting tests."
+    )
+
+
+@pytest.mark.timeout(600)
+@pytest.mark.parametrize(
+    "data_frame_generator,feature_set",
+    [
+        (create_product_image_features_df, PRODUCT_IMAGE_FEATURE_SET),
+        (create_product_text_attributes_df, PRODUCT_TEXT_ATTRIBUTE_FEATURE_SET),
+    ],
+)
+def test_ingestion(client, data_frame_generator, feature_set):
+    """Apply a feature set, then ingest a freshly generated DataFrame into it.
+
+    Runs once per (generator, feature set) pair. The test makes no
+    assertions of its own; it passes when apply and ingest complete without
+    raising. The timeout marker carries a value of 600 (presumably seconds,
+    enforced by the pytest-timeout plugin -- confirm it is installed).
+    """
+    # The feature set is registered before the data is generated.
+    client.apply(feature_set)
+    client.ingest(feature_set, data_frame_generator())
+