From aad0a478b0dfc851944f7c4a714071ab30bcdbfb Mon Sep 17 00:00:00 2001 From: Terence Date: Fri, 2 Oct 2020 23:11:35 +0800 Subject: [PATCH 01/34] Cleanup cli Signed-off-by: Terence --- sdk/python/feast/cli.py | 327 +++------------------------------------- 1 file changed, 18 insertions(+), 309 deletions(-) diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index 1c774ea89f..f041b4c12f 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -15,7 +15,7 @@ import json import logging import sys -from typing import Dict, List +from typing import Dict import click import pkg_resources @@ -23,10 +23,7 @@ from feast.client import Client from feast.config import Config -from feast.contrib.job_controller.client import Client as JCClient -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.entity import EntityV2 -from feast.feature_set import FeatureSet, FeatureSetRef +from feast.entity import Entity from feast.feature_table import FeatureTable from feast.loaders.yaml import yaml_loader @@ -143,9 +140,7 @@ def entity_create(filename, project): Create or update an entity """ - entities = [ - EntityV2.from_dict(entity_dict) for entity_dict in yaml_loader(filename) - ] + entities = [Entity.from_dict(entity_dict) for entity_dict in yaml_loader(filename)] feast_client = Client() # type: Client feast_client.apply_entity(entities, project) @@ -217,6 +212,21 @@ def feature_table(): pass +def _get_labels_dict(label_str: str) -> Dict[str, str]: + """ + Converts CLI input labels string to dictionary format if provided string is valid. + """ + labels_dict: Dict[str, str] = {} + labels_kv = label_str.split(",") + if label_str == "": + return labels_dict + if len(labels_kv) % 2 == 1: + raise ValueError("Uneven key-value label pairs were entered") + for k, v in zip(labels_kv[0::2], labels_kv[1::2]): + labels_dict[k] = v + return labels_dict + + @feature_table.command("apply") @click.option( "--filename", @@ -291,170 +301,6 @@ def feature_table_list(project: str, labels: str): print(tabulate(table, headers=["NAME", "ENTITIES"], tablefmt="plain")) -@cli.group(name="features") -def feature(): - """ - Manage feature - """ - pass - - -def _convert_entity_string_to_list(entities_str: str) -> List[str]: - """ - Converts CLI input entities string to list format if provided string is valid. 
- """ - if entities_str == "": - return [] - return entities_str.split(",") - - -@feature.command(name="list") -@click.option( - "--project", - "-p", - help="Project that feature belongs to", - type=click.STRING, - default="*", -) -@click.option( - "--entities", - "-n", - help="Entities to filter for features", - type=click.STRING, - default="", -) -@click.option( - "--labels", - "-l", - help="Labels to filter for features", - type=click.STRING, - default="", -) -def feature_list(project: str, entities: str, labels: str): - """ - List all features - """ - feast_client = Client() # type: Client - - entities_list = _convert_entity_string_to_list(entities) - labels_dict: Dict[str, str] = _get_labels_dict(labels) - - table = [] - for feature_ref, feature in feast_client.list_features_by_ref( - project=project, entities=entities_list, labels=labels_dict - ).items(): - table.append([feature.name, feature.dtype, repr(feature_ref)]) - - from tabulate import tabulate - - print(tabulate(table, headers=["NAME", "DTYPE", "REFERENCE"], tablefmt="plain")) - - -@cli.group(name="feature-sets") -def feature_set(): - """ - Create and manage feature sets - """ - pass - - -def _get_labels_dict(label_str: str) -> Dict[str, str]: - """ - Converts CLI input labels string to dictionary format if provided string is valid. - """ - labels_dict: Dict[str, str] = {} - labels_kv = label_str.split(",") - if label_str == "": - return labels_dict - if len(labels_kv) % 2 == 1: - raise ValueError("Uneven key-value label pairs were entered") - for k, v in zip(labels_kv[0::2], labels_kv[1::2]): - labels_dict[k] = v - return labels_dict - - -@feature_set.command(name="list") -@click.option( - "--project", - "-p", - help="Project that feature set belongs to", - type=click.STRING, - default="*", -) -@click.option( - "--name", - "-n", - help="Filters feature sets by name. Wildcards (*) may be included to match multiple feature sets", - type=click.STRING, - default="*", -) -@click.option( - "--labels", - "-l", - help="Labels to filter for feature sets", - type=click.STRING, - default="", -) -def feature_set_list(project: str, name: str, labels: str): - """ - List all feature sets - """ - feast_client = Client() # type: Client - - labels_dict = _get_labels_dict(labels) - - table = [] - for fs in feast_client.list_feature_sets( - project=project, name=name, labels=labels_dict - ): - table.append([fs.name, repr(fs)]) - - from tabulate import tabulate - - print(tabulate(table, headers=["NAME", "REFERENCE"], tablefmt="plain")) - - -@feature_set.command("apply") -# TODO: add project option to overwrite project setting. 
-@click.option( - "--filename", - "-f", - help="Path to a feature set configuration file that will be applied", - type=click.Path(exists=True), -) -def feature_set_create(filename): - """ - Create or update a feature set - """ - - feature_sets = [FeatureSet.from_dict(fs_dict) for fs_dict in yaml_loader(filename)] - feast_client = Client() # type: Client - feast_client.apply(feature_sets) - - -@feature_set.command("describe") -@click.argument("name", type=click.STRING) -@click.option( - "--project", - "-p", - help="Project that feature set belongs to", - type=click.STRING, - default="default", -) -def feature_set_describe(name: str, project: str): - """ - Describe a feature set - """ - feast_client = Client() # type: Client - fs = feast_client.get_feature_set(name=name, project=project) - - if not fs: - print(f'Feature set with name "{name}" could not be found') - return - - print(yaml.dump(yaml.safe_load(str(fs)), default_flow_style=False, sort_keys=False)) - - @cli.group(name="projects") def project(): """ @@ -499,142 +345,5 @@ def project_list(): print(tabulate(table, headers=["NAME"], tablefmt="plain")) -@cli.group(name="ingest-jobs") -def ingest_job(): - """ - Manage ingestion jobs - """ - pass - - -@ingest_job.command("list") -@click.option("--job-id", "-i", help="Show only ingestion jobs with the given job id") -@click.option( - "--feature-set-ref", - "-f", - help="Show only ingestion job targeting the feature set with the given reference", -) -@click.option( - "--store-name", - "-s", - help="List only ingestion job that ingest into feast store with given name", -) -# TODO: types -def ingest_job_list(job_id, feature_set_ref, store_name): - """ - List ingestion jobs - """ - # parse feature set reference - if feature_set_ref is not None: - feature_set_ref = FeatureSetRef.from_str(feature_set_ref) - - # pull & render ingestion jobs as a table - feast_client = JCClient() - table = [] - for ingest_job in feast_client.list_ingest_jobs( - job_id=job_id, feature_set_ref=feature_set_ref, store_name=store_name - ): - table.append([ingest_job.id, IngestionJobStatus.Name(ingest_job.status)]) - - from tabulate import tabulate - - print(tabulate(table, headers=["ID", "STATUS"], tablefmt="plain")) - - -@ingest_job.command("describe") -@click.argument("job_id") -def ingest_job_describe(job_id: str): - """ - Describe the ingestion job with the given id. - """ - # find ingestion job for id - feast_client = JCClient() - jobs = feast_client.list_ingest_jobs(job_id=job_id) - if len(jobs) < 1: - print(f"Ingestion Job with id {job_id} could not be found") - sys.exit(1) - job = jobs[0] - - # pretty render ingestion job as yaml - print( - yaml.dump(yaml.safe_load(str(job)), default_flow_style=False, sort_keys=False) - ) - - -@ingest_job.command("stop") -@click.option( - "--wait", "-w", is_flag=True, help="Wait for the ingestion job to fully stop." -) -@click.option( - "--timeout", - "-t", - default=600, - help="Timeout in seconds to wait for the job to stop.", -) -@click.argument("job_id") -def ingest_job_stop(wait: bool, timeout: int, job_id: str): - """ - Stop ingestion job for id. 
- """ - # find ingestion job for id - feast_client = JCClient() - jobs = feast_client.list_ingest_jobs(job_id=job_id) - if len(jobs) < 1: - print(f"Ingestion Job with id {job_id} could not be found") - sys.exit(1) - job = jobs[0] - - feast_client.stop_ingest_job(job) - - # wait for ingestion job to stop - if wait: - job.wait(IngestionJobStatus.ABORTED, timeout=timeout) - - -@ingest_job.command("restart") -@click.argument("job_id") -def ingest_job_restart(job_id: str): - """ - Restart job for id. - Waits for the job to fully restart. - """ - # find ingestion job for id - feast_client = JCClient() - jobs = feast_client.list_ingest_jobs(job_id=job_id) - if len(jobs) < 1: - print(f"Ingestion Job with id {job_id} could not be found") - sys.exit(1) - job = jobs[0] - - feast_client.restart_ingest_job(job) - - -@cli.command() -@click.option( - "--name", "-n", help="Feature set name to ingest data into", required=True -) -@click.option( - "--filename", - "-f", - help="Path to file to be ingested", - type=click.Path(exists=True), - required=True, -) -@click.option( - "--file-type", - "-t", - type=click.Choice(["CSV"], case_sensitive=False), - help="Type of file to ingest. Defaults to CSV.", -) -def ingest(name, filename, file_type): - """ - Ingest feature data into a feature set - """ - - feast_client = Client() # type: Client - feature_set = feast_client.get_feature_set(name=name) - feature_set.ingest_file(file_path=filename) - - if __name__ == "__main__": cli() From 53d40cf0dafdca8c9476b68e48124b4e89f740e0 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 12:06:28 +0800 Subject: [PATCH 02/34] Cleanup Python SDK Signed-off-by: Terence --- sdk/python/feast/__init__.py | 21 +- sdk/python/feast/client.py | 737 ++--------- sdk/python/feast/contrib/__init__.py | 0 .../feast/contrib/job_controller/__init__.py | 0 .../feast/contrib/job_controller/client.py | 145 --- .../feast/contrib/job_controller/job.py | 122 -- sdk/python/feast/entity.py | 37 +- sdk/python/feast/feature.py | 171 +-- sdk/python/feast/feature_set.py | 1078 ----------------- sdk/python/feast/feature_table.py | 8 +- sdk/python/feast/feature_v2.py | 94 -- sdk/python/feast/job.py | 210 ---- sdk/python/feast/loaders/ingest.py | 170 +-- sdk/python/feast/source.py | 121 -- sdk/python/feast/type_map.py | 84 -- sdk/python/feast/value_type.py | 2 +- 16 files changed, 212 insertions(+), 2788 deletions(-) delete mode 100644 sdk/python/feast/contrib/__init__.py delete mode 100644 sdk/python/feast/contrib/job_controller/__init__.py delete mode 100644 sdk/python/feast/contrib/job_controller/client.py delete mode 100644 sdk/python/feast/contrib/job_controller/job.py delete mode 100644 sdk/python/feast/feature_set.py delete mode 100644 sdk/python/feast/feature_v2.py delete mode 100644 sdk/python/feast/job.py delete mode 100644 sdk/python/feast/source.py diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 8342de4c9b..298b8ac975 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -1,10 +1,17 @@ from pkg_resources import DistributionNotFound, get_distribution from .client import Client +from .data_source import ( + BigQueryOptions, + DataSource, + FileOptions, + KafkaOptions, + KinesisOptions, + SourceType, +) from .entity import Entity from .feature import Feature -from .feature_set import FeatureSet -from .source import KafkaSource, Source +from .feature_table import FeatureTable from .value_type import ValueType try: @@ -16,9 +23,13 @@ __all__ = [ "Client", "Entity", + "DataSource", + 
"BigQueryOptions", + "FileOptions", + "KafkaOptions", + "KinesisOptions", "Feature", - "FeatureSet", - "Source", - "KafkaSource", + "FeatureTable", + "SourceType", "ValueType", ] diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 713776f1f5..bc21a6f2e4 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -18,14 +18,12 @@ import shutil import tempfile import time -import uuid from math import ceil -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Dict, List, Optional, Tuple, Union import grpc import pandas as pd import pyarrow as pa -from google.protobuf.timestamp_pb2 import Timestamp from pyarrow import parquet as pq from feast.config import Config @@ -44,8 +42,6 @@ from feast.core.CoreService_pb2 import ( ApplyEntityRequest, ApplyEntityResponse, - ApplyFeatureSetRequest, - ApplyFeatureSetResponse, ApplyFeatureTableRequest, ApplyFeatureTableResponse, ArchiveProjectRequest, @@ -55,49 +51,28 @@ GetEntityRequest, GetEntityResponse, GetFeastCoreVersionRequest, - GetFeatureSetRequest, - GetFeatureSetResponse, - GetFeatureStatisticsRequest, GetFeatureTableRequest, GetFeatureTableResponse, ListEntitiesRequest, ListEntitiesResponse, - ListFeatureSetsRequest, - ListFeatureSetsResponse, - ListFeaturesRequest, - ListFeaturesResponse, ListFeatureTablesRequest, ListFeatureTablesResponse, ListProjectsRequest, ListProjectsResponse, ) from feast.core.CoreService_pb2_grpc import CoreServiceStub -from feast.core.FeatureSet_pb2 import FeatureSetStatus -from feast.entity import EntityV2 -from feast.feature import Feature, FeatureRef -from feast.feature_set import FeatureSet +from feast.data_source import SourceType +from feast.entity import Entity from feast.feature_table import FeatureTable from feast.grpc import auth as feast_auth from feast.grpc.grpc import create_grpc_channel -from feast.job import RetrievalJob -from feast.loaders.abstract_producer import get_producer -from feast.loaders.file import export_source_to_staging_location -from feast.loaders.ingest import KAFKA_CHUNK_PRODUCTION_TIMEOUT, get_feature_row_chunks -from feast.online_response import OnlineResponse -from feast.serving.ServingService_pb2 import ( - DataFormat, - DatasetSource, - FeastServingType, - FeatureReference, - GetBatchFeaturesRequest, - GetFeastServingInfoRequest, - GetFeastServingInfoResponse, - GetOnlineFeaturesRequest, +from feast.loaders.ingest import ( + BATCH_INGESTION_PRODUCTION_TIMEOUT, + check_field_mappings, ) +from feast.serving.ServingService_pb2 import GetFeastServingInfoRequest from feast.serving.ServingService_pb2_grpc import ServingServiceStub -from feast.type_map import _python_value_to_proto_value, python_type_to_feast_value_type -from feast.types.Value_pb2 import Value as Value -from tensorflow_metadata.proto.v0 import statistics_pb2 +from feast.staging.storage_client import get_staging_client _logger = logging.getLogger(__name__) @@ -368,9 +343,7 @@ def archive_project(self, project): if self._project == project: self._project = FEAST_DEFAULT_OPTIONS[CONFIG_PROJECT_KEY] - def apply_entity( - self, entities: Union[List[EntityV2], EntityV2], project: str = None - ): + def apply_entity(self, entities: Union[List[Entity], Entity], project: str = None): """ Idempotently registers entities with Feast Core. Either a single entity or a list can be provided. 
@@ -380,11 +353,11 @@ def apply_entity( Examples: >>> from feast import Client - >>> from feast.entity import EntityV2 + >>> from feast.entity import Entity >>> from feast.value_type import ValueType >>> >>> feast_client = Client(core_url="localhost:6565") - >>> entity = EntityV2( + >>> entity = Entity( >>> name="driver_entity", >>> description="Driver entity for car rides", >>> value_type=ValueType.STRING, @@ -401,12 +374,12 @@ def apply_entity( if not isinstance(entities, list): entities = [entities] for entity in entities: - if isinstance(entity, EntityV2): + if isinstance(entity, Entity): self._apply_entity(project, entity) # type: ignore continue raise ValueError(f"Could not determine entity type to apply {entity}") - def _apply_entity(self, project: str, entity: EntityV2): + def _apply_entity(self, project: str, entity: Entity): """ Registers a single entity with Feast @@ -428,14 +401,14 @@ def _apply_entity(self, project: str, entity: EntityV2): raise grpc.RpcError(e.details()) # Extract the returned entity - applied_entity = EntityV2.from_proto(apply_entity_response.entity) + applied_entity = Entity.from_proto(apply_entity_response.entity) # Deep copy from the returned entity to the local entity entity._update_from_entity(applied_entity) def list_entities( self, project: str = None, labels: Dict[str, str] = dict() - ) -> List[EntityV2]: + ) -> List[Entity]: """ Retrieve a list of entities from Feast Core @@ -460,12 +433,12 @@ def list_entities( # Extract entities and return entities = [] for entity_proto in entity_protos.entities: - entity = EntityV2.from_proto(entity_proto) + entity = Entity.from_proto(entity_proto) entity._client = self entities.append(entity) return entities - def get_entity(self, name: str, project: str = None) -> Union[EntityV2, None]: + def get_entity(self, name: str, project: str = None) -> Union[Entity, None]: """ Retrieves an entity. @@ -488,7 +461,7 @@ def get_entity(self, name: str, project: str = None) -> Union[EntityV2, None]: ) # type: GetEntityResponse except grpc.RpcError as e: raise grpc.RpcError(e.details()) - entity = EntityV2.from_proto(get_entity_response.entity) + entity = Entity.from_proto(get_entity_response.entity) return entity @@ -605,370 +578,21 @@ def get_feature_table( raise grpc.RpcError(e.details()) return FeatureTable.from_proto(get_feature_table_response.table) - def apply(self, feature_sets: Union[List[FeatureSet], FeatureSet]): - """ - Idempotently registers feature set(s) with Feast Core. Either a single - feature set or a list can be provided. 
- - Args: - feature_sets: List of feature sets that will be registered - """ - if not isinstance(feature_sets, list): - feature_sets = [feature_sets] - for feature_set in feature_sets: - if isinstance(feature_set, FeatureSet): - self._apply_feature_set(feature_set) - continue - raise ValueError( - f"Could not determine feature set type to apply {feature_set}" - ) - - def _apply_feature_set(self, feature_set: FeatureSet): - """ - Registers a single feature set with Feast - - Args: - feature_set: Feature set that will be registered - """ - - feature_set.is_valid() - feature_set_proto = feature_set.to_proto() - if len(feature_set_proto.spec.project) == 0: - if self.project is not None: - feature_set_proto.spec.project = self.project - - # Convert the feature set to a request and send to Feast Core - try: - apply_fs_response = self._core_service.ApplyFeatureSet( - ApplyFeatureSetRequest(feature_set=feature_set_proto), - timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), - metadata=self._get_grpc_metadata(), - ) # type: ApplyFeatureSetResponse - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - - # Extract the returned feature set - applied_fs = FeatureSet.from_proto(apply_fs_response.feature_set) - - # If the feature set has changed, update the local copy - if apply_fs_response.status == ApplyFeatureSetResponse.Status.CREATED: - print(f'Feature set created: "{applied_fs.name}"') - - if apply_fs_response.status == ApplyFeatureSetResponse.Status.UPDATED: - print(f'Feature set updated: "{applied_fs.name}"') - - # If no change has been applied, do nothing - if apply_fs_response.status == ApplyFeatureSetResponse.Status.NO_CHANGE: - print(f"No change detected or applied: {feature_set.name}") - - # Deep copy from the returned feature set to the local feature set - feature_set._update_from_feature_set(applied_fs) - - def list_feature_sets( - self, project: str = None, name: str = None, labels: Dict[str, str] = dict() - ) -> List[FeatureSet]: - """ - Retrieve a list of feature sets from Feast Core - - Args: - project: Filter feature sets based on project name - name: Filter feature sets based on feature set name - - Returns: - List of feature sets - """ - - if project is None: - if self.project is not None: - project = self.project - else: - project = "*" - - if name is None: - name = "*" - - filter = ListFeatureSetsRequest.Filter( - project=project, feature_set_name=name, labels=labels - ) - - # Get latest feature sets from Feast Core - feature_set_protos = self._core_service.ListFeatureSets( - ListFeatureSetsRequest(filter=filter), metadata=self._get_grpc_metadata(), - ) # type: ListFeatureSetsResponse - - # Extract feature sets and return - feature_sets = [] - for feature_set_proto in feature_set_protos.feature_sets: - feature_set = FeatureSet.from_proto(feature_set_proto) - feature_set._client = self - feature_sets.append(feature_set) - return feature_sets - - def get_feature_set( - self, name: str, project: str = None - ) -> Union[FeatureSet, None]: - """ - Retrieves a feature set. 
- - Args: - project: Feast project that this feature set belongs to - name: Name of feature set - - Returns: - Returns either the specified feature set, or raises an exception if - none is found - """ - - if project is None: - if self.project is not None: - project = self.project - else: - raise ValueError("No project has been configured.") - - try: - get_feature_set_response = self._core_service.GetFeatureSet( - GetFeatureSetRequest(project=project, name=name.strip()), - metadata=self._get_grpc_metadata(), - ) # type: GetFeatureSetResponse - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - return FeatureSet.from_proto(get_feature_set_response.feature_set) - - def list_features_by_ref( - self, - project: str = None, - entities: List[str] = list(), - labels: Dict[str, str] = dict(), - ) -> Dict[FeatureRef, Feature]: - """ - Returns a list of features based on filters provided. - - Args: - project: Feast project that these features belongs to - entities: Feast entity that these features are associated with - labels: Feast labels that these features are associated with - - Returns: - Dictionary of - - Examples: - >>> from feast import Client - >>> - >>> feast_client = Client(core_url="localhost:6565") - >>> features = list_features_by_ref(project="test_project", entities=["driver_id"], labels={"key1":"val1","key2":"val2"}) - >>> print(features) - """ - if project is None: - if self.project is not None: - project = self.project - else: - project = "default" - - filter = ListFeaturesRequest.Filter( - project=project, entities=entities, labels=labels - ) - - feature_protos = self._core_service.ListFeatures( - ListFeaturesRequest(filter=filter), metadata=self._get_grpc_metadata(), - ) # type: ListFeaturesResponse - - features_dict = {} - for ref_str, feature_proto in feature_protos.features.items(): - feature_ref = FeatureRef.from_str(ref_str, ignore_project=True) - feature = Feature.from_proto(feature_proto) - features_dict[feature_ref] = feature - - return features_dict - - def get_historical_features( - self, - feature_refs: List[str], - entity_rows: Union[pd.DataFrame, str], - compute_statistics: bool = False, - project: str = None, - ) -> RetrievalJob: - """ - Retrieves historical features from a Feast Serving deployment. - - Args: - feature_refs: List of feature references that will be returned for each entity. - Each feature reference should have the following format: - "feature_set:feature" where "feature_set" & "feature" refer to - the feature and feature set names respectively. - Only the feature name is required. - entity_rows (Union[pd.DataFrame, str]): - Pandas dataframe containing entities and a 'datetime' column. - Each entity in a feature set must be present as a column in this - dataframe. The datetime column must contain timestamps in - datetime64 format. - compute_statistics (bool): - Indicates whether Feast should compute statistics over the retrieved dataset. - project: Specifies the project which contain the FeatureSets - which the requested features belong to. - - Returns: - feast.job.RetrievalJob: - Returns a retrival job object that can be used to monitor retrieval - progress asynchronously, and can be used to materialize the - results. 
- - Examples: - >>> from feast import Client - >>> from datetime import datetime - >>> - >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") - >>> feature_refs = ["my_project/bookings_7d", "booking_14d"] - >>> entity_rows = pd.DataFrame( - >>> { - >>> "datetime": [pd.datetime.now() for _ in range(3)], - >>> "customer": [1001, 1002, 1003], - >>> } - >>> ) - >>> feature_retrieval_job = feast_client.get_historical_features( - >>> feature_refs, entity_rows, project="my_project") - >>> df = feature_retrieval_job.to_dataframe() - >>> print(df) - """ - - # Retrieve serving information to determine store type and - # staging location - serving_info = self._serving_service.GetFeastServingInfo( - GetFeastServingInfoRequest(), - timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), - metadata=self._get_grpc_metadata(), - ) # type: GetFeastServingInfoResponse - - if serving_info.type != FeastServingType.FEAST_SERVING_TYPE_BATCH: - raise Exception( - f'You are connected to a store "{self.serving_url}" which ' - f"does not support batch retrieval " - ) - - if isinstance(entity_rows, pd.DataFrame): - # Pandas DataFrame detected - - # Remove timezone from datetime column - if isinstance( - entity_rows["datetime"].dtype, pd.core.dtypes.dtypes.DatetimeTZDtype - ): - entity_rows["datetime"] = pd.DatetimeIndex( - entity_rows["datetime"] - ).tz_localize(None) - elif isinstance(entity_rows, str): - # String based source - if not entity_rows.endswith((".avro", "*")): - raise Exception( - "Only .avro and wildcard paths are accepted as entity_rows" - ) - else: - raise Exception( - f"Only pandas.DataFrame and str types are allowed" - f" as entity_rows, but got {type(entity_rows)}." - ) - - # Export and upload entity row DataFrame to staging location - # provided by Feast - staged_files = export_source_to_staging_location( - entity_rows, serving_info.job_staging_location - ) # type: List[str] - request = GetBatchFeaturesRequest( - features=_build_feature_references( - feature_ref_strs=feature_refs, - project=project if project is not None else self.project, - ), - dataset_source=DatasetSource( - file_source=DatasetSource.FileSource( - file_uris=staged_files, data_format=DataFormat.DATA_FORMAT_AVRO - ) - ), - compute_statistics=compute_statistics, - ) - - # Retrieve Feast Job object to manage life cycle of retrieval - try: - response = self._serving_service.GetBatchFeatures( - request, metadata=self._get_grpc_metadata() - ) - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - - return RetrievalJob( - response.job, - self._serving_service, - auth_metadata_plugin=self._auth_metadata, - ) - - def get_online_features( - self, - feature_refs: List[str], - entity_rows: List[Dict[str, Any]], - project: Optional[str] = None, - omit_entities: bool = False, - ) -> OnlineResponse: - """ - Retrieves the latest online feature data from Feast Serving - - Args: - feature_refs: List of feature references that will be returned for each entity. - Each feature reference should have the following format: - "feature_set:feature" where "feature_set" & "feature" refer to - the feature and feature set names respectively. - Only the feature name is required. - entity_rows: A list of dictionaries where each key is an entity and each value is - feast.types.Value or Python native form. - project: Optionally specify the the project override. If specified, uses given project for retrieval. - Overrides the projects specified in Feature References if also are specified. 
- omit_entities: If true will omit entity values in the returned feature data. - Returns: - GetOnlineFeaturesResponse containing the feature data in records. - Each EntityRow provided will yield one record, which contains - data fields with data value and field status metadata (if included). - - Examples: - >>> from feast import Client - >>> - >>> feast_client = Client(core_url="localhost:6565", serving_url="localhost:6566") - >>> feature_refs = ["daily_transactions"] - >>> entity_rows = [{"customer_id": 0},{"customer_id": 1}] - >>> - >>> online_response = feast_client.get_online_features( - >>> feature_refs, entity_rows, project="my_project") - >>> online_response_dict = online_response.to_dict() - >>> print(online_response_dict) - {'daily_transactions': [1.1,1.2], 'customer_id': [0,1]} - """ - - try: - response = self._serving_service.GetOnlineFeatures( - GetOnlineFeaturesRequest( - omit_entities_in_response=omit_entities, - features=_build_feature_references(feature_ref_strs=feature_refs), - entity_rows=_infer_online_entity_rows(entity_rows), - project=project if project is not None else self.project, - ), - metadata=self._get_grpc_metadata(), - ) - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - - response = OnlineResponse(response) - return response - def ingest( self, - feature_set: Union[str, FeatureSet], + feature_table: Union[str, FeatureTable], source: Union[pd.DataFrame, str], + project: str = None, chunk_size: int = 10000, max_workers: int = max(CPU_COUNT - 1, 1), - disable_progress_bar: bool = False, - timeout: int = KAFKA_CHUNK_PRODUCTION_TIMEOUT, - ) -> str: + timeout: int = BATCH_INGESTION_PRODUCTION_TIMEOUT, + ) -> None: """ - Loads feature data into Feast for a specific feature set. + Batch load feature data into batch source of a specific feature table. Args: - feature_set (typing.Union[str, feast.feature_set.FeatureSet]): - Feature set object or the string name of the feature set + feature_table (typing.Union[str, feast.feature_table.FeatureTable]): + Feature table object or the string name of the feature table source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast @@ -977,27 +601,22 @@ def ingest( * csv * json + project: Feast project to locate FeatureTable + chunk_size (int): Amount of rows to load and ingest at a time. max_workers (int): Number of worker processes to use to encode values. - disable_progress_bar (bool): - Disable printing of progress statistics. - timeout (int): Timeout in seconds to wait for completion. 
- Returns: - str: - ingestion id for this dataset - Examples: >>> from feast import Client >>> >>> client = Client(core_url="localhost:6565") - >>> fs_df = pd.DataFrame( + >>> ft_df = pd.DataFrame( >>> { >>> "datetime": [pd.datetime.now()], >>> "driver": [1001], @@ -1005,169 +624,105 @@ def ingest( >>> } >>> ) >>> client.set_project("project1") - >>> client.ingest("driver", fs_df) >>> - >>> driver_fs = client.get_feature_set(name="driver", project="project1") - >>> client.ingest(driver_fs, fs_df) + >>> driver_ft = client.get_feature_table(name="driver") + >>> client.ingest(driver_ft, ft_df) """ - if isinstance(feature_set, FeatureSet): - name = feature_set.name - project = feature_set.project - elif isinstance(feature_set, str): - if self.project is not None: - project = self.project - else: - project = "default" - name = feature_set - else: - raise Exception("Feature set name must be provided") + if project is None: + project = self.project + if isinstance(feature_table, FeatureTable): + name = feature_table.name # Read table and get row count - dir_path, dest_path = _read_table_from_source(source, chunk_size, max_workers) - - pq_file = pq.ParquetFile(dest_path) - - row_count = pq_file.metadata.num_rows + dir_path, dest_path, column_names = _read_table_from_source( + source, chunk_size, max_workers + ) current_time = time.time() - - print("Waiting for feature set to be ready for ingestion...") + print("Waiting for feature table to be ready for ingestion...") while True: if timeout is not None and time.time() - current_time >= timeout: - raise TimeoutError("Timed out waiting for feature set to be ready") - fetched_feature_set: Optional[FeatureSet] = self.get_feature_set( + raise TimeoutError("Timed out waiting for feature table to be ready") + fetched_feature_table: Optional[FeatureTable] = self.get_feature_table( name, project ) - if ( - fetched_feature_set is not None - and fetched_feature_set.status == FeatureSetStatus.STATUS_READY - ): - feature_set = fetched_feature_set + if fetched_feature_table is not None: + feature_table = fetched_feature_table break time.sleep(3) if timeout is not None: timeout = timeout - int(time.time() - current_time) - try: - # Kafka configs - brokers = feature_set.get_kafka_source_brokers() - topic = feature_set.get_kafka_source_topic() - producer = get_producer(brokers, row_count, disable_progress_bar) - - # Loop optimization declarations - produce = producer.produce - flush = producer.flush - ingestion_id = _generate_ingestion_id(feature_set) - - # Transform and push data to Kafka - if feature_set.source.source_type == "Kafka": - for chunk in get_feature_row_chunks( - file=dest_path, - row_groups=list(range(pq_file.num_row_groups)), - fs=feature_set, - ingestion_id=ingestion_id, - max_workers=max_workers, - ): - - # Push FeatureRow one chunk at a time to kafka - for serialized_row in chunk: - produce(topic=topic, value=serialized_row) - - # Force a flush after each chunk - flush(timeout=timeout) - - # Remove chunk from memory - del chunk - - else: - raise Exception( - f"Could not determine source type for feature set " - f'"{feature_set.name}" with source type ' - f'"{feature_set.source.source_type}"' - ) - - # Print ingestion statistics - producer.print_results() - finally: - # Remove parquet file(s) that were created earlier - print("Removing temporary file(s)...") - shutil.rmtree(dir_path) - - return ingestion_id + # Check 1) Only parquet file format for FeatureTable batch source is supported + if ( + feature_table.batch_source + and 
SourceType(feature_table.batch_source.type).name == "BATCH_FILE" + and "".join( + feature_table.batch_source.file_options.file_format.split() + ).lower() + != "parquet" + ): + raise Exception( + f"No suitable batch source found for FeatureTable, {name}." + f"Only BATCH_FILE source with parquet format is supported for batch ingestion." + ) - def get_statistics( - self, - feature_set_id: str, - store: str, - features: List[str] = [], - ingestion_ids: Optional[List[str]] = None, - start_date: Optional[datetime.datetime] = None, - end_date: Optional[datetime.datetime] = None, - force_refresh: bool = False, - project: Optional[str] = None, - ) -> statistics_pb2.DatasetFeatureStatisticsList: - """ - Retrieves the feature featureStatistics computed over the data in the batch - stores. + # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table + check_field_mappings( + column_names, name, feature_table.batch_source.field_mapping + ) - Args: - feature_set_id: Feature set id to retrieve batch featureStatistics for. If project - is not provided, the default ("default") will be used. - store: Name of the store to retrieve feature featureStatistics over. This - store must be a historical store. - features: Optional list of feature names to filter from the results. - ingestion_ids: Optional list of dataset Ids by which to filter data - before retrieving featureStatistics. Cannot be used with start_date - and end_date. - If multiple dataset ids are provided, unaggregatable featureStatistics - will be dropped. - start_date: Optional start date over which to filter statistical data. - Data from this date will be included. - Cannot be used with dataset_ids. If the provided period spans - multiple days, unaggregatable featureStatistics will be dropped. - end_date: Optional end date over which to filter statistical data. - Data from this data will not be included. - Cannot be used with dataset_ids. If the provided period spans - multiple days, unaggregatable featureStatistics will be dropped. - force_refresh: Setting this flag to true will force a recalculation - of featureStatistics and overwrite results currently in the cache, if any. - project: Manual override for default project. + batch_source_type = SourceType(feature_table.batch_source.type).name - Returns: - Returns a tensorflow DatasetFeatureStatisticsList containing TFDV featureStatistics. - """ + try: + if batch_source_type == "BATCH_FILE": + from urllib.parse import urlparse - if ingestion_ids is not None and ( - start_date is not None or end_date is not None - ): - raise ValueError( - "Only one of dataset_id or [start_date, end_date] can be provided." 
- ) + file_url = feature_table.batch_source.file_options.file_url[:-1] + uri = urlparse(file_url) + staging_client = get_staging_client(uri.scheme) - if project != "" and "/" not in feature_set_id: - feature_set_id = f"{project}/{feature_set_id}" + file_name = dest_path.split("/")[-1] + date_today = datetime.datetime.today().strftime("%Y-%m-%d") - request = GetFeatureStatisticsRequest( - feature_set_id=feature_set_id, - features=features, - store=store, - force_refresh=force_refresh, - ) - if ingestion_ids is not None: - request.ingestion_ids.extend(ingestion_ids) - else: - if start_date is not None: - request.start_date.CopyFrom( - Timestamp(seconds=int(start_date.timestamp())) + staging_client.upload_file( + dest_path, + uri.hostname, + str(uri.path).strip("/") + "/" + f"date={date_today}/" + file_name, ) - if end_date is not None: - request.end_date.CopyFrom(Timestamp(seconds=int(end_date.timestamp()))) + if batch_source_type == "BATCH_BIGQUERY": + from google.cloud import bigquery + + bq_table_ref = feature_table.batch_source.bigquery_options.table_ref + gcp_project, dataset_table = bq_table_ref.split(":") + dataset, table = dataset_table.split(".") + + client = bigquery.Client(project=gcp_project) + + table_ref = client.dataset(dataset).table(table) + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.PARQUET + + # Check for date partitioning column in FeatureTable spec + if feature_table.batch_source.date_partition_column: + time_partitioning_obj = bigquery.table.TimePartitioning( + field=feature_table.batch_source.date_partition_column + ) + job_config.time_partitioning = time_partitioning_obj + with open(dest_path, "rb") as source_file: + client.load_table_from_file( + source_file, table_ref, job_config=job_config + ) + finally: + # Remove parquet file(s) that were created earlier + print("Removing temporary file(s)...") + shutil.rmtree(dir_path) - return self._core_service.GetFeatureStatistics( - request - ).dataset_feature_statistics_list + print( + f"Data has been successfully ingested into FeatureTable {batch_source_type} batch source." + ) def _get_grpc_metadata(self): """ @@ -1181,85 +736,9 @@ def _get_grpc_metadata(self): return () -def _infer_online_entity_rows( - entity_rows: List[Dict[str, Any]], -) -> List[GetOnlineFeaturesRequest.EntityRow]: - """ - Builds a list of EntityRow protos from Python native type format passed by user. - - Args: - entity_rows: A list of dictionaries where each key is an entity and each value is - feast.types.Value or Python native form. - - Returns: - A list of EntityRow protos parsed from args. - """ - entity_rows_dicts = cast(List[Dict[str, Any]], entity_rows) - entity_row_list = [] - entity_type_map = dict() - - for entity in entity_rows_dicts: - fields = {} - for key, value in entity.items(): - # Allow for feast.types.Value - if isinstance(value, Value): - proto_value = value - else: - # Infer the specific type for this row - current_dtype = python_type_to_feast_value_type(name=key, value=value) - - if key not in entity_type_map: - entity_type_map[key] = current_dtype - else: - if current_dtype != entity_type_map[key]: - raise TypeError( - f"Input entity {key} has mixed types, {current_dtype} and {entity_type_map[key]}. That is not allowed. 
" - ) - proto_value = _python_value_to_proto_value(current_dtype, value) - fields[key] = proto_value - entity_row_list.append(GetOnlineFeaturesRequest.EntityRow(fields=fields)) - return entity_row_list - - -def _build_feature_references( - feature_ref_strs: List[str], project: Optional[str] = None -) -> List[FeatureReference]: - """ - Builds a list of FeatureReference protos from string feature set references - - Args: - feature_ref_strs: List of string feature references - project: Optionally specifies the project in the parsed feature references. - - Returns: - A list of FeatureReference protos parsed from args. - """ - feature_refs = [FeatureRef.from_str(ref_str) for ref_str in feature_ref_strs] - feature_ref_protos = [ref.to_proto() for ref in feature_refs] - # apply project if specified - if project is not None: - for feature_ref_proto in feature_ref_protos: - feature_ref_proto.project = project - return feature_ref_protos - - -def _generate_ingestion_id(feature_set: FeatureSet) -> str: - """ - Generates a UUID from the feature set name, version, and the current time. - - Args: - feature_set: Feature set of the dataset to be ingested. - - Returns: - UUID unique to current time and the feature set provided. - """ - uuid_str = f"{feature_set.name}_{int(time.time())}" - return str(uuid.uuid3(uuid.NAMESPACE_DNS, uuid_str)) - - def _read_table_from_source( source: Union[pd.DataFrame, str], chunk_size: int, max_workers: int -) -> Tuple[str, str]: +) -> Tuple[str, str, List[str]]: """ Infers a data source type (path or Pandas DataFrame) and reads it in as a PyArrow Table. @@ -1283,9 +762,9 @@ def _read_table_from_source( Amount of rows to load and ingest at a time. Returns: - Tuple[str, str]: - Tuple containing parent directory path and destination path to - parquet file. + Tuple[str, str, List[str]]: + Tuple containing parent directory path, destination path to + parquet file and column names of pyarrow table. 
""" # Pandas DataFrame detected @@ -1320,7 +799,9 @@ def _read_table_from_source( row_group_size = min(ceil(table.num_rows / max_workers), chunk_size) pq.write_table(table=table, where=dest_path, row_group_size=row_group_size) + column_names = table.column_names + # Remove table from memory del table - return dir_path, dest_path + return dir_path, dest_path, column_names diff --git a/sdk/python/feast/contrib/__init__.py b/sdk/python/feast/contrib/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sdk/python/feast/contrib/job_controller/__init__.py b/sdk/python/feast/contrib/job_controller/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/sdk/python/feast/contrib/job_controller/client.py b/sdk/python/feast/contrib/job_controller/client.py deleted file mode 100644 index 9a9ffbcc84..0000000000 --- a/sdk/python/feast/contrib/job_controller/client.py +++ /dev/null @@ -1,145 +0,0 @@ -from typing import Optional - -import grpc - -from feast.config import Config -from feast.constants import ( - CONFIG_CORE_ENABLE_SSL_KEY, - CONFIG_CORE_SERVER_SSL_CERT_KEY, - CONFIG_ENABLE_AUTH_KEY, - CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY, - CONFIG_JOB_CONTROLLER_SERVER_KEY, -) -from feast.contrib.job_controller.job import IngestJob -from feast.core.CoreService_pb2 import ( - ListIngestionJobsRequest, - RestartIngestionJobRequest, - StopIngestionJobRequest, -) -from feast.core.CoreService_pb2_grpc import JobControllerServiceStub -from feast.feature_set import FeatureSetRef -from feast.grpc import auth as feast_auth -from feast.grpc.grpc import create_grpc_channel - - -class Client: - """ - JobController Client: used internally to manage Ingestion Jobs - """ - - def __init__(self, options=None, **kwargs): - """ - JobControllerClient should be initialized with - jobcontroller_url: Feast JobController address - - :param options: Configuration options to initialize client with - :param kwargs: options in kwargs style - """ - if options is None: - options = dict() - self._config = Config(options={**options, **kwargs}) - - self._jobcontroller_service_stub: Optional[JobControllerServiceStub] = None - self._auth_metadata: Optional[grpc.AuthMetadataPlugin] = None - - # Configure Auth Metadata Plugin if auth is enabled - if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY): - self._auth_metadata = feast_auth.get_auth_metadata_plugin(self._config) - - @property - def _jobcontroller_service(self): - if not self._jobcontroller_service_stub: - channel = create_grpc_channel( - url=self._config.get(CONFIG_JOB_CONTROLLER_SERVER_KEY), - enable_ssl=self._config.getboolean(CONFIG_CORE_ENABLE_SSL_KEY), - enable_auth=self._config.getboolean(CONFIG_ENABLE_AUTH_KEY), - ssl_server_cert_path=self._config.get(CONFIG_CORE_SERVER_SSL_CERT_KEY), - auth_metadata_plugin=self._auth_metadata, - timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY), - ) - self._jobcontroller_service_stub = JobControllerServiceStub(channel) - - return self._jobcontroller_service_stub - - def list_ingest_jobs( - self, - job_id: str = None, - feature_set_ref: FeatureSetRef = None, - store_name: str = None, - ): - """ - List the ingestion jobs currently registered in Feast, with optional filters. - Provides detailed metadata about each ingestion job. 
- - Args: - job_id: Select specific ingestion job with the given job_id - feature_set_ref: Filter ingestion jobs by target feature set (via reference) - store_name: Filter ingestion jobs by target feast store's name - - Returns: - List of IngestJobs matching the given filters - """ - # construct list request - feature_set_ref_proto = None - if feature_set_ref: - feature_set_ref_proto = feature_set_ref.to_proto() - list_filter = ListIngestionJobsRequest.Filter( - id=job_id, - feature_set_reference=feature_set_ref_proto, - store_name=store_name, - ) - request = ListIngestionJobsRequest(filter=list_filter) - # make list request & unpack response - response = self._jobcontroller_service.ListIngestionJobs(request, metadata=self._get_grpc_metadata(),) # type: ignore - ingest_jobs = [ - IngestJob(proto, self._jobcontroller_service, auth_metadata_plugin=self._auth_metadata) for proto in response.jobs # type: ignore - ] - - return ingest_jobs - - def restart_ingest_job(self, job: IngestJob): - """ - Restart ingestion job currently registered in Feast. - NOTE: Data might be lost during the restart for some job runners. - Does not support stopping a job in a transitional (ie pending, suspending, aborting), - terminal state (ie suspended or aborted) or unknown status - - Args: - job: IngestJob to restart - """ - request = RestartIngestionJobRequest(id=job.id) - try: - self._jobcontroller_service.RestartIngestionJob( - request, metadata=self._get_grpc_metadata(), - ) # type: ignore - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - - def stop_ingest_job(self, job: IngestJob): - """ - Stop ingestion job currently resgistered in Feast - Does nothing if the target job if already in a terminal state (ie suspended or aborted). - Does not support stopping a job in a transitional (ie pending, suspending, aborting) - or in a unknown status - - Args: - job: IngestJob to restart - """ - request = StopIngestionJobRequest(id=job.id) - try: - self._jobcontroller_service.StopIngestionJob( - request, metadata=self._get_grpc_metadata(), - ) # type: ignore - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - - def _get_grpc_metadata(self): - """ - Returns a metadata tuple to attach to gRPC requests. This is primarily - used when authentication is enabled but SSL/TLS is disabled. - - Returns: Tuple of metadata to attach to each gRPC call - """ - if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY) and self._auth_metadata: - return self._auth_metadata.get_signed_meta() - return () diff --git a/sdk/python/feast/contrib/job_controller/job.py b/sdk/python/feast/contrib/job_controller/job.py deleted file mode 100644 index 8f2800cba6..0000000000 --- a/sdk/python/feast/contrib/job_controller/job.py +++ /dev/null @@ -1,122 +0,0 @@ -from typing import List - -import grpc -from google.protobuf.json_format import MessageToJson - -from feast import Source -from feast.core.CoreService_pb2 import ListIngestionJobsRequest -from feast.core.CoreService_pb2_grpc import JobControllerServiceStub -from feast.core.IngestionJob_pb2 import IngestionJob as IngestJobProto -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.core.Store_pb2 import Store -from feast.feature_set import FeatureSetRef -from feast.wait import wait_retry_backoff - - -class IngestJob: - """ - Defines a job for feature ingestion in feast. 
- """ - - def __init__( - self, - job_proto: IngestJobProto, - core_stub: JobControllerServiceStub, - auth_metadata_plugin: grpc.AuthMetadataPlugin = None, - ): - """ - Construct a native ingest job from its protobuf version. - - Args: - job_proto: Job proto object to construct from. - core_stub: stub for Feast CoreService - auth_metadata_plugin: plugin to fetch auth metadata - """ - self.proto = job_proto - self.core_svc = core_stub - self.auth_metadata = auth_metadata_plugin - - def reload(self): - """ - Update this IngestJob with the latest info from Feast - """ - # pull latest proto from feast core - response = self.core_svc.ListIngestionJobs( - ListIngestionJobsRequest( - filter=ListIngestionJobsRequest.Filter(id=self.id) - ), - metadata=self.auth_metadata.get_signed_meta() if self.auth_metadata else (), - ) - self.proto = response.jobs[0] - - @property - def id(self) -> str: - """ - Getter for IngestJob's job id. - """ - return self.proto.id - - @property - def external_id(self) -> str: - """ - Getter for IngestJob's external job id. - """ - self.reload() - return self.proto.external_id - - @property - def status(self) -> IngestionJobStatus: # type: ignore - """ - Getter for IngestJob's status - """ - self.reload() - return self.proto.status - - @property - def feature_sets(self) -> List[FeatureSetRef]: - """ - Getter for the IngestJob's feature sets - """ - # convert featureset protos to native objects - return [ - FeatureSetRef.from_proto(fs) for fs in self.proto.feature_set_references - ] - - @property - def source(self) -> Source: - """ - Getter for the IngestJob's data source. - """ - return Source.from_proto(self.proto.source) - - @property - def stores(self) -> List[Store]: - """ - Getter for the IngestJob's target feast store. - """ - return list(self.proto.stores) - - def wait(self, status: IngestionJobStatus, timeout_secs: int = 300): # type: ignore - """ - Wait for this IngestJob to transtion to the given status. - Raises TimeoutError if the wait operation times out. - - Args: - status: The IngestionJobStatus to wait for. - timeout_secs: Maximum seconds to wait before timing out. 
- """ - # poll & wait for job status to transition - wait_retry_backoff( - retry_fn=(lambda: (None, self.status == status)), # type: ignore - timeout_secs=timeout_secs, - timeout_msg="Wait for IngestJob's status to transition timed out", - ) - - def __str__(self): - # render the contents of ingest job as human readable string - self.reload() - return str(MessageToJson(self.proto)) - - def __repr__(self): - # render the ingest job as human readable string - return f"IngestJob<{self.id}>" diff --git a/sdk/python/feast/entity.py b/sdk/python/feast/entity.py index caa8b22f78..a6e79437af 100644 --- a/sdk/python/feast/entity.py +++ b/sdk/python/feast/entity.py @@ -22,42 +22,11 @@ from feast.core.Entity_pb2 import Entity as EntityV2Proto from feast.core.Entity_pb2 import EntityMeta as EntityMetaProto from feast.core.Entity_pb2 import EntitySpecV2 as EntitySpecProto -from feast.core.FeatureSet_pb2 import EntitySpec as EntityProto -from feast.field import Field from feast.loaders import yaml as feast_yaml -from feast.types import Value_pb2 as ValueTypeProto from feast.value_type import ValueType -class Entity(Field): - """Entity field type""" - - def to_proto(self) -> EntityProto: - """ - Converts Entity to its Protocol Buffer representation - - Returns: - Returns EntitySpec object - """ - value_type = ValueTypeProto.ValueType.Enum.Value(self.dtype.name) - return EntityProto(name=self.name, value_type=value_type,) - - @classmethod - def from_proto(cls, entity_proto: EntityProto): - """ - Creates a Feast Entity object from its Protocol Buffer representation - - Args: - entity_proto: EntitySpec protobuf object - - Returns: - Entity object - """ - entity = cls(name=entity_proto.name, dtype=ValueType(entity_proto.value_type)) - return entity - - -class EntityV2: +class Entity: """ Represents a collection of entities and associated metadata. """ @@ -81,8 +50,8 @@ def __init__( self._last_updated_timestamp: Optional[Timestamp] = None def __eq__(self, other): - if not isinstance(other, EntityV2): - raise TypeError("Comparisons should only involve EntityV2 class objects.") + if not isinstance(other, Entity): + raise TypeError("Comparisons should only involve Entity class objects.") if isinstance(self.value_type, int): self.value_type = ValueType(self.value_type).name diff --git a/sdk/python/feast/feature.py b/sdk/python/feast/feature.py index 054bf5ecc5..4627598d12 100644 --- a/sdk/python/feast/feature.py +++ b/sdk/python/feast/feature.py @@ -1,4 +1,4 @@ -# Copyright 2019 The Feast Authors +# Copyright 2020 The Feast Authors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,140 +12,83 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from feast.core.FeatureSet_pb2 import FeatureSpec as FeatureProto -from feast.field import Field -from feast.serving.ServingService_pb2 import FeatureReference as FeatureRefProto +from typing import MutableMapping, Optional + +from feast.core.Feature_pb2 import FeatureSpecV2 as FeatureSpecProto from feast.types import Value_pb2 as ValueTypeProto from feast.value_type import ValueType -class Feature(Field): +class Feature: """Feature field type""" - def to_proto(self) -> FeatureProto: - """Converts Feature object to its Protocol Buffer representation""" - value_type = ValueTypeProto.ValueType.Enum.Value(self.dtype.name) - return FeatureProto( - name=self.name, - value_type=value_type, - labels=self.labels, - presence=self.presence, - group_presence=self.group_presence, - shape=self.shape, - value_count=self.value_count, - domain=self.domain, - int_domain=self.int_domain, - float_domain=self.float_domain, - string_domain=self.string_domain, - bool_domain=self.bool_domain, - struct_domain=self.struct_domain, - natural_language_domain=self.natural_language_domain, - image_domain=self.image_domain, - mid_domain=self.mid_domain, - url_domain=self.url_domain, - time_domain=self.time_domain, - time_of_day_domain=self.time_of_day_domain, - ) + def __init__( + self, + name: str, + dtype: ValueType, + labels: Optional[MutableMapping[str, str]] = None, + ): + self._name = name + if not isinstance(dtype, ValueType): + raise ValueError("dtype is not a valid ValueType") + self._dtype = dtype + if labels is None: + self._labels = dict() # type: MutableMapping + else: + self._labels = labels - @classmethod - def from_proto(cls, feature_proto: FeatureProto): + def __eq__(self, other): + if ( + self.name != other.name + or self.dtype != other.dtype + or self.labels != other.labels + ): + return False + return True + + @property + def name(self): """ - - Args: - feature_proto: FeatureSpec protobuf object - - Returns: - Feature object + Getter for name of this field """ - feature = cls( - name=feature_proto.name, - dtype=ValueType(feature_proto.value_type), - labels=feature_proto.labels, - ) - feature.update_presence_constraints(feature_proto) - feature.update_shape_type(feature_proto) - feature.update_domain_info(feature_proto) - return feature + return self._name - -class FeatureRef: - """ Feature Reference represents a reference to a specific feature. """ - - def __init__(self, name: str, feature_set: str = None): - self.proto = FeatureRefProto(name=name, feature_set=feature_set) - - @classmethod - def from_proto(cls, proto: FeatureRefProto): + @property + def dtype(self) -> ValueType: """ - Construct a feature reference from the given FeatureReference proto - - Arg: - proto: Protobuf FeatureReference to construct from - - Returns: - FeatureRef that refers to the given feature + Getter for data type of this field """ - return cls(name=proto.name, feature_set=proto.feature_set) + return self._dtype - @classmethod - def from_str(cls, feature_ref_str: str, ignore_project: bool = False): + @property + def labels(self) -> MutableMapping[str, str]: """ - Parse the given string feature reference into FeatureRef model - String feature reference should be in the format feature_set:feature. - Where "feature_set" and "name" are the feature_set name and feature name - respectively. 
- - Args: - feature_ref_str: String representation of the feature reference - ignore_project: Ignore projects in given string feature reference - instead throwing an error - - Returns: - FeatureRef that refers to the given feature + Getter for labels of this field """ - proto = FeatureRefProto() - if "/" in feature_ref_str: - if ignore_project: - _, feature_ref_str = feature_ref_str.split("/") - else: - raise ValueError(f"Unsupported feature reference: {feature_ref_str}") + return self._labels - # parse feature set name if specified - if ":" in feature_ref_str: - proto.feature_set, feature_ref_str = feature_ref_str.split(":") + def to_proto(self) -> FeatureSpecProto: + """Converts Feature object to its Protocol Buffer representation""" + value_type = ValueTypeProto.ValueType.Enum.Value(self.dtype.name) - proto.name = feature_ref_str - return cls.from_proto(proto) + return FeatureSpecProto( + name=self.name, value_type=value_type, labels=self.labels, + ) - def to_proto(self) -> FeatureRefProto: + @classmethod + def from_proto(cls, feature_proto: FeatureSpecProto): """ - Convert and return this feature set reference to protobuf. + Args: + feature_proto: FeatureSpecV2 protobuf object Returns: - Protobuf respresentation of this feature set reference. + Feature object """ - return self.proto - - def __repr__(self): - # return string representation of the reference - # [project/][feature_set:]name - # in protov3 unset string and int fields default to "" and 0 - ref_str = "" - if len(self.proto.project) > 0: - ref_str += self.proto.project + "/" - if len(self.proto.feature_set) > 0: - ref_str += self.proto.feature_set + ":" - ref_str += self.proto.name - return ref_str - def __str__(self): - # human readable string of the reference - return f"FeatureRef<{self.__repr__()}>" - - def __eq__(self, other): - # compare with other feature set - return hash(self) == hash(other) + feature = cls( + name=feature_proto.name, + dtype=ValueType(feature_proto.value_type), + labels=feature_proto.labels, + ) - def __hash__(self): - # hash this reference - return hash(repr(self)) + return feature diff --git a/sdk/python/feast/feature_set.py b/sdk/python/feast/feature_set.py deleted file mode 100644 index fd2e17a2eb..0000000000 --- a/sdk/python/feast/feature_set.py +++ /dev/null @@ -1,1078 +0,0 @@ -# Copyright 2019 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import warnings -from collections import OrderedDict -from typing import Dict, List, MutableMapping, Optional - -import pandas as pd -import pyarrow as pa -import yaml -from google.protobuf import json_format -from google.protobuf.duration_pb2 import Duration -from google.protobuf.json_format import MessageToDict, MessageToJson -from google.protobuf.message import Message -from google.protobuf.timestamp_pb2 import Timestamp -from pandas.api.types import is_datetime64_ns_dtype -from pyarrow.lib import TimestampType - -from feast.core.FeatureSet_pb2 import FeatureSet as FeatureSetProto -from feast.core.FeatureSet_pb2 import FeatureSetMeta as FeatureSetMetaProto -from feast.core.FeatureSet_pb2 import FeatureSetSpec as FeatureSetSpecProto -from feast.core.FeatureSetReference_pb2 import ( - FeatureSetReference as FeatureSetReferenceProto, -) -from feast.entity import Entity -from feast.feature import Feature, Field -from feast.loaders import yaml as feast_yaml -from feast.source import Source -from feast.type_map import ( - DATETIME_COLUMN, - pa_to_feast_value_type, - python_type_to_feast_value_type, -) -from tensorflow_metadata.proto.v0 import schema_pb2 - - -class FeatureSet: - """ - Represents a collection of features and associated metadata. - """ - - def __init__( - self, - name: str, - project: str = None, - features: List[Feature] = None, - entities: List[Entity] = None, - source: Source = None, - max_age: Optional[Duration] = None, - labels: Optional[MutableMapping[str, str]] = None, - ): - self._name = name - self._project = project - self._fields = OrderedDict() # type: Dict[str, Field] - if features is not None: - self.features: Optional[List[Feature]] = features - if entities is not None: - self.entities = entities - if source is None: - self._source = None - else: - self._source = source - if labels is None: - self._labels = OrderedDict() # type: MutableMapping[str, str] - else: - self._labels = labels - self._max_age = max_age - self._status = None - self._created_timestamp: Optional[Timestamp] = None - - def __eq__(self, other): - if not isinstance(other, FeatureSet): - return NotImplemented - - for key in self.fields.keys(): - if key not in other.fields.keys() or self.fields[key] != other.fields[key]: - return False - - if self.fields[key] != other.fields[key]: - return False - - if ( - self.labels != other.labels - or self.name != other.name - or self.project != other.project - or self.max_age != other.max_age - ): - return False - - if self.source != other.source: - return False - return True - - def __str__(self): - return str(MessageToJson(self.to_proto())) - - def __repr__(self): - return FeatureSetRef.from_feature_set(self).__repr__() - - @property - def fields(self) -> Dict[str, Field]: - """ - Returns a dict of fields from this feature set - """ - return self._fields - - @property - def features(self) -> List[Feature]: - """ - Returns a list of features from this feature set - """ - return [field for field in self._fields.values() if isinstance(field, Feature)] - - @features.setter - def features(self, features: List[Feature]): - """ - Sets the active features within this feature set - - Args: - features: List of feature objects - """ - for feature in features: - if not isinstance(feature, Feature): - raise Exception("object type is not a Feature: " + str(type(feature))) - - for key in list(self._fields.keys()): - if isinstance(self._fields[key], Feature): - del self._fields[key] - - if features is not None: - self._add_fields(features) - - @property - def 
entities(self) -> List[Entity]: - """ - Returns list of entities from this feature set - """ - return [field for field in self._fields.values() if isinstance(field, Entity)] - - @entities.setter - def entities(self, entities: List[Entity]): - """ - Sets the active entities within this feature set - - Args: - entities: List of entities objects - """ - for entity in entities: - if not isinstance(entity, Entity): - raise Exception("object type is not na Entity: " + str(type(entity))) - - for key in list(self._fields.keys()): - if isinstance(self._fields[key], Entity): - del self._fields[key] - - if entities is not None: - self._add_fields(entities) - - @property - def name(self): - """ - Returns the name of this feature set - """ - return self._name - - @name.setter - def name(self, name): - """ - Sets the name of this feature set - """ - self._name = name - - @property - def project(self): - """ - Returns the project that this feature set belongs to - """ - return self._project - - @project.setter - def project(self, project): - """ - Sets the project that this feature set belongs to - """ - self._project = project - - @property - def source(self): - """ - Returns the source of this feature set - """ - return self._source - - @source.setter - def source(self, source: Source): - """ - Sets the source of this feature set - """ - self._source = source - - @property - def max_age(self): - """ - Returns the maximum age of this feature set. This is the total maximum - amount of staleness that will be allowed during feature retrieval for - each specific feature row that is looked up. - """ - return self._max_age - - @max_age.setter - def max_age(self, max_age): - """ - Set the maximum age for this feature set - """ - self._max_age = max_age - - @property - def labels(self): - """ - Returns the labels of this feature set. This is the user defined metadata - defined as a dictionary. - """ - return self._labels - - @labels.setter - def labels(self, labels: MutableMapping[str, str]): - """ - Set the labels for this feature set - """ - self._labels = labels - - @property - def status(self): - """ - Returns the status of this feature set - """ - return self._status - - @status.setter - def status(self, status): - """ - Sets the status of this feature set - """ - self._status = status - - @property - def created_timestamp(self): - """ - Returns the created_timestamp of this feature set - """ - return self._created_timestamp - - @created_timestamp.setter - def created_timestamp(self, created_timestamp): - """ - Sets the status of this feature set - """ - self._created_timestamp = created_timestamp - - def set_label(self, key: str, value: str): - """ - Sets the label value for a given key - """ - self.labels[key] = value - - def remove_label(self, key: str): - """ - Removes a label based on key - """ - del self.labels[key] - - def add(self, resource): - """ - Adds a resource (Feature, Entity) to this Feature Set. 
- Does not register the updated Feature Set with Feast Core - - Args: - resource: A resource can be either a Feature or an Entity object - """ - if resource.name in self._fields.keys(): - raise ValueError( - 'could not add field "' - + resource.name - + '" since it already exists in feature set "' - + self._name - + '"' - ) - - if issubclass(type(resource), Field): - return self._set_field(resource) - - raise ValueError("Could not identify the resource being added") - - def _set_field(self, field: Field): - self._fields[field.name] = field - return - - def drop(self, name: str): - """ - Removes a Feature or Entity from a Feature Set. This does not apply - any changes to Feast Core until the apply() method is called. - - Args: - name: Name of Feature or Entity to be removed - """ - del self._fields[name] - - def _add_fields(self, fields): - """ - Adds multiple Fields to a Feature Set - - Args: - fields: List of Field (Feature or Entity) Objects - """ - for field in fields: - self.add(field) - - def infer_fields_from_df( - self, - df: pd.DataFrame, - entities: Optional[List[Entity]] = None, - features: Optional[List[Feature]] = None, - replace_existing_features: bool = False, - replace_existing_entities: bool = False, - discard_unused_fields: bool = False, - rows_to_sample: int = 100, - ): - """ - Adds fields (Features or Entities) to a feature set based on the schema - of a Datatframe. Only Pandas dataframes are supported. All columns are - detected as features, so setting at least one entity manually is - advised. - - Args: - df: Pandas dataframe to read schema from - entities: List of entities that will be set manually and not - inferred. These will take precedence over any existing entities - or entities found in the dataframe. - features: List of features that will be set manually and not - inferred. These will take precedence over any existing feature - or features found in the dataframe. - replace_existing_features: If true, will replace - existing features in this feature set with features found in - dataframe. If false, will skip conflicting features. - replace_existing_entities: If true, will replace existing entities - in this feature set with features found in dataframe. If false, - will skip conflicting entities. - discard_unused_fields: Boolean flag. Setting this to True will - discard any existing fields that are not found in the dataset or - provided by the user - rows_to_sample: Number of rows to sample to infer types. 
All rows - must have consistent types, even values within list types must - be homogeneous - """ - - if entities is None: - entities = list() - if features is None: - features = list() - - # Validate whether the datetime column exists with the right name - if DATETIME_COLUMN not in df: - raise Exception("No column 'datetime'") - - # Validate the data type for the datetime column - if not is_datetime64_ns_dtype(df.dtypes[DATETIME_COLUMN]): - raise Exception( - "Column 'datetime' does not have the correct type: datetime64[ns]" - ) - - # Create dictionary of fields that will not be inferred (manually set) - provided_fields = OrderedDict() - fields = _create_field_list(entities, features) - - for field in fields: - if not isinstance(field, Field): - raise Exception(f"Invalid field object type provided {type(field)}") - if field.name not in provided_fields: - provided_fields[field.name] = field - else: - raise Exception(f"Duplicate field name detected {field.name}.") - - new_fields = self._fields.copy() - output_log = "" - - # Add in provided fields - for name, field in provided_fields.items(): - if name in new_fields.keys(): - upsert_message = "created" - else: - upsert_message = "updated (replacing an existing field)" - - output_log += ( - f"{type(field).__name__} {field.name}" - f"({field.dtype}) manually {upsert_message}.\n" - ) - new_fields[name] = field - - # Iterate over all of the columns and create features - for column in df.columns: - column = column.strip() - - # Skip datetime column - if DATETIME_COLUMN in column: - continue - - # Skip user provided fields - if column in provided_fields.keys(): - continue - - # Only overwrite conflicting fields if replacement is allowed - if column in new_fields: - if ( - isinstance(self._fields[column], Feature) - and not replace_existing_features - ): - continue - - if ( - isinstance(self._fields[column], Entity) - and not replace_existing_entities - ): - continue - - # Store this field as a feature - new_fields[column] = Feature( - name=column, - dtype=_infer_pd_column_type(column, df[column], rows_to_sample), - ) - - output_log += f"{type(new_fields[column]).__name__} {new_fields[column].name} ({new_fields[column].dtype}) added from dataframe.\n" - - # Discard unused fields from feature set - if discard_unused_fields: - keys_to_remove = [] - for key in new_fields.keys(): - if not (key in df.columns or key in provided_fields.keys()): - output_log += f"{type(new_fields[key]).__name__} {new_fields[key].name} ({new_fields[key].dtype}) removed because it is unused.\n" - keys_to_remove.append(key) - for key in keys_to_remove: - del new_fields[key] - - # Update feature set - self._fields = new_fields - print(output_log) - - def infer_fields_from_pa( - self, - table: pa.lib.Table, - entities: Optional[List[Entity]] = None, - features: Optional[List[Feature]] = None, - replace_existing_features: bool = False, - replace_existing_entities: bool = False, - discard_unused_fields: bool = False, - ) -> None: - """ - Adds fields (Features or Entities) to a feature set based on the schema - of a PyArrow table. Only PyArrow tables are supported. All columns are - detected as features, so setting at least one entity manually is - advised. - - - Args: - table (pyarrow.lib.Table): - PyArrow table to read schema from. - - entities (Optional[List[Entity]]): - List of entities that will be set manually and not inferred. - These will take precedence over any existing entities or - entities found in the PyArrow table. 
- - features (Optional[List[Feature]]): - List of features that will be set manually and not inferred. - These will take precedence over any existing feature or features - found in the PyArrow table. - - replace_existing_features (bool): - Boolean flag. If true, will replace existing features in this - feature set with features found in dataframe. If false, will - skip conflicting features. - - replace_existing_entities (bool): - Boolean flag. If true, will replace existing entities in this - feature set with features found in dataframe. If false, will - skip conflicting entities. - - discard_unused_fields (bool): - Boolean flag. Setting this to True will discard any existing - fields that are not found in the dataset or provided by the - user. - - Returns: - None: - None - """ - if entities is None: - entities = list() - if features is None: - features = list() - - # Validate whether the datetime column exists with the right name - if DATETIME_COLUMN not in table.column_names: - raise Exception("No column 'datetime'") - - # Validate the date type for the datetime column - if not isinstance(table.column(DATETIME_COLUMN).type, TimestampType): - raise Exception( - "Column 'datetime' does not have the correct type: datetime64[ms]" - ) - - # Create dictionary of fields that will not be inferred (manually set) - provided_fields = OrderedDict() - fields = _create_field_list(entities, features) - - for field in fields: - if not isinstance(field, Field): - raise Exception(f"Invalid field object type provided {type(field)}") - if field.name not in provided_fields: - provided_fields[field.name] = field - else: - raise Exception(f"Duplicate field name detected {field.name}.") - - new_fields = self._fields.copy() - output_log = "" - - # Add in provided fields - for name, field in provided_fields.items(): - if name in new_fields.keys(): - upsert_message = "created" - else: - upsert_message = "updated (replacing an existing field)" - - output_log += ( - f"{type(field).__name__} {field.name}" - f"({field.dtype}) manually {upsert_message}.\n" - ) - new_fields[name] = field - - # Iterate over all of the column names and create features - for column in table.column_names: - column = column.strip() - - # Skip datetime column - if DATETIME_COLUMN in column: - continue - - # Skip user provided fields - if column in provided_fields.keys(): - continue - - # Only overwrite conflicting fields if replacement is allowed - if column in new_fields: - if ( - isinstance(self._fields[column], Feature) - and not replace_existing_features - ): - continue - - if ( - isinstance(self._fields[column], Entity) - and not replace_existing_entities - ): - continue - - # Store this fields as a feature - # TODO: (Minor) Change the parameter name from dtype to patype - new_fields[column] = Feature( - name=column, dtype=self._infer_pa_column_type(table.column(column)) - ) - - output_log += f"{type(new_fields[column]).__name__} {new_fields[column].name} ({new_fields[column].dtype}) added from PyArrow Table.\n" - - # Discard unused fields from feature set - if discard_unused_fields: - keys_to_remove = [] - for key in new_fields.keys(): - if not (key in table.column_names or key in provided_fields.keys()): - output_log += f"{type(new_fields[key]).__name__} {new_fields[key].name} ({new_fields[key].dtype}) removed because it is unused.\n" - keys_to_remove.append(key) - for key in keys_to_remove: - del new_fields[key] - - # Update feature set - self._fields = new_fields - print(output_log) - - def _infer_pd_column_type(self, column, series, 
rows_to_sample): - dtype = None - sample_count = 0 - - # Loop over all rows for this column to infer types - for key, value in series.iteritems(): - sample_count += 1 - # Stop sampling at the row limit - if sample_count > rows_to_sample: - continue - - # Infer the specific type for this row - current_dtype = python_type_to_feast_value_type(name=column, value=value) - - # Make sure the type is consistent for column - if dtype: - if dtype != current_dtype: - raise ValueError( - f"Type mismatch detected in column {column}. Both " - f"the types {current_dtype} and {dtype} " - f"have been found." - ) - else: - # Store dtype in field to type map if it isnt already - dtype = current_dtype - - return dtype - - def _infer_pa_column_type(self, column: pa.lib.ChunkedArray): - """ - Infers the PyArrow column type. - - :param column: Column from a PyArrow table - :type column: pa.lib.ChunkedArray - :return: - :rtype: - """ - # Validates the column to ensure that value types are consistent - column.validate() - return pa_to_feast_value_type(column) - - def _update_from_feature_set(self, feature_set): - """ - Deep replaces one feature set with another - - Args: - feature_set: Feature set to use as a source of configuration - """ - - self.name = feature_set.name - self.project = feature_set.project - self.source = feature_set.source - self.max_age = feature_set.max_age - self.features = feature_set.features - self.entities = feature_set.entities - self.source = feature_set.source - self.status = feature_set.status - self.created_timestamp = feature_set.created_timestamp - - def get_kafka_source_brokers(self) -> str: - """ - Get the broker list for the source in this feature set - """ - if self.source and self.source.source_type == "Kafka": - return self.source.brokers - raise Exception("Source type could not be identified") - - def get_kafka_source_topic(self) -> str: - """ - Get the topic that this feature set has been configured to use as source - """ - if self.source and self.source.source_type == "Kafka": - return self.source.topic - raise Exception("Source type could not be identified") - - def is_valid(self): - """ - Validates the state of a feature set locally. Raises an exception - if feature set is invalid. - """ - - if not self.name: - raise ValueError("No name found in feature set.") - - if len(self.entities) == 0: - raise ValueError("No entities found in feature set {self.name}") - - def import_tfx_schema(self, schema: schema_pb2.Schema): - """ - Updates presence_constraints, shape_type and domain_info for all fields - (features and entities) in the FeatureSet from schema in the Tensorflow metadata. - - Args: - schema: Schema from Tensorflow metadata - - Returns: - None - - """ - _make_tfx_schema_domain_info_inline(schema) - for feature_from_tfx_schema in schema.feature: - if feature_from_tfx_schema.name in self._fields.keys(): - field = self._fields[feature_from_tfx_schema.name] - field.update_presence_constraints(feature_from_tfx_schema) - field.update_shape_type(feature_from_tfx_schema) - field.update_domain_info(feature_from_tfx_schema) - else: - warnings.warn( - f"The provided schema contains feature name '{feature_from_tfx_schema.name}' " - f"that does not exist in the FeatureSet '{self.name}' in Feast" - ) - - def export_tfx_schema(self) -> schema_pb2.Schema: - """ - Create a Tensorflow metadata schema from a FeatureSet. - - Returns: - Tensorflow metadata schema. 
- - """ - schema = schema_pb2.Schema() - - # List of attributes to copy from fields in the FeatureSet to feature in - # Tensorflow metadata schema where the attribute name is the same. - attributes_to_copy_from_field_to_feature = [ - "name", - "presence", - "group_presence", - "shape", - "value_count", - "domain", - "int_domain", - "float_domain", - "string_domain", - "bool_domain", - "struct_domain", - "_natural_language_domain", - "image_domain", - "mid_domain", - "url_domain", - "time_domain", - "time_of_day_domain", - ] - - for _, field in self._fields.items(): - if isinstance(field, Entity): - continue - feature = schema_pb2.Feature() - for attr in attributes_to_copy_from_field_to_feature: - if getattr(field, attr) is None: - # This corresponds to an unset member in the proto Oneof field. - continue - if issubclass(type(getattr(feature, attr)), Message): - # Proto message field to copy is an "embedded" field, so MergeFrom() - # method must be used. - getattr(feature, attr).MergeFrom(getattr(field, attr)) - elif issubclass(type(getattr(feature, attr)), (int, str, bool)): - # Proto message field is a simple Python type, so setattr() - # can be used. - setattr(feature, attr, getattr(field, attr)) - else: - warnings.warn( - f"Attribute '{attr}' cannot be copied from Field " - f"'{field.name}' in FeatureSet '{self.name}' to a " - f"Feature in the Tensorflow metadata schema, because" - f"the type is neither a Protobuf message or Python " - f"int, str and bool" - ) - # "type" attr is handled separately because the attribute name is different - # ("dtype" in field and "type" in Feature) and "type" in Feature is only - # a subset of "dtype". - feature.type = field.dtype.to_tfx_schema_feature_type() - schema.feature.append(feature) - - return schema - - @classmethod - def from_yaml(cls, yml: str): - """ - Creates a feature set from a YAML string body or a file path - - Args: - yml: Either a file path containing a yaml file or a YAML string - - Returns: - Returns a FeatureSet object based on the YAML file - """ - - return cls.from_dict(feast_yaml.yaml_loader(yml, load_single=True)) - - @classmethod - def from_dict(cls, fs_dict): - """ - Creates a feature set from a dict - - Args: - fs_dict: A dict representation of a feature set - - Returns: - Returns a FeatureSet object based on the feature set dict - """ - - feature_set_proto = json_format.ParseDict( - fs_dict, FeatureSetProto(), ignore_unknown_fields=True - ) - return cls.from_proto(feature_set_proto) - - @classmethod - def from_proto(cls, feature_set_proto: FeatureSetProto): - """ - Creates a feature set from a protobuf representation of a feature set - - Args: - feature_set_proto: A protobuf representation of a feature set - - Returns: - Returns a FeatureSet object based on the feature set protobuf - """ - - feature_set = cls( - name=feature_set_proto.spec.name, - features=[ - Feature.from_proto(feature) - for feature in feature_set_proto.spec.features - ], - entities=[ - Entity.from_proto(entity) for entity in feature_set_proto.spec.entities - ], - max_age=( - None - if feature_set_proto.spec.max_age.seconds == 0 - and feature_set_proto.spec.max_age.nanos == 0 - else feature_set_proto.spec.max_age - ), - labels=feature_set_proto.spec.labels, - source=( - None - if feature_set_proto.spec.source.type == 0 - else Source.from_proto(feature_set_proto.spec.source) - ), - project=None - if len(feature_set_proto.spec.project) == 0 - else feature_set_proto.spec.project, - ) - feature_set._status = feature_set_proto.meta.status # type: ignore - 
feature_set._created_timestamp = feature_set_proto.meta.created_timestamp - return feature_set - - def to_proto(self) -> FeatureSetProto: - """ - Converts a feature set object to its protobuf representation - - Returns: - FeatureSetProto protobuf - """ - - meta = FeatureSetMetaProto( - created_timestamp=self.created_timestamp, status=self.status - ) - - spec = FeatureSetSpecProto( - name=self.name, - project=self.project, - max_age=self.max_age, - labels=self.labels, - source=self.source.to_proto() if self.source is not None else None, - features=[ - field.to_proto() - for field in self._fields.values() - if type(field) == Feature - ], - entities=[ - field.to_proto() - for field in self._fields.values() - if type(field) == Entity - ], - ) - - return FeatureSetProto(spec=spec, meta=meta) - - def to_dict(self) -> Dict: - """ - Converts feature set to dict - - :return: Dictionary object representation of feature set - """ - feature_set_dict = MessageToDict(self.to_proto()) - - # Remove meta when empty for more readable exports - if feature_set_dict["meta"] == {}: - del feature_set_dict["meta"] - - return feature_set_dict - - def to_yaml(self): - """ - Converts a feature set to a YAML string. - - :return: Feature set string returned in YAML format - """ - feature_set_dict = self.to_dict() - return yaml.dump(feature_set_dict, allow_unicode=True, sort_keys=False) - - -class FeatureSetRef: - """ - Represents a reference to a featureset - """ - - def __init__(self, project: str = None, name: str = None): - self.proto = FeatureSetReferenceProto(project=project, name=name) - - @property - def project(self) -> str: - """ - Get the project of feature set referenced by this reference - """ - return self.proto.project - - @property - def name(self) -> str: - """ - Get the name of feature set referenced by this reference - """ - return self.proto.name - - @classmethod - def from_proto(cls, feature_set_ref_proto: FeatureSetReferenceProto): - return cls( - project=feature_set_ref_proto.project, name=feature_set_ref_proto.name, - ) - - @classmethod - def from_feature_set(cls, feature_set: FeatureSet): - """ - Construct a feature set reference that refers to the given feature set. - - Args: - feature_set: Feature set to create reference from. - - Returns: - FeatureSetRef that refers to the given feature set - """ - return cls(feature_set.project, feature_set.name) - - @classmethod - def from_str(cls, ref_str: str): - """ - Parse a feature reference from string representation. - (as defined by __repr__()) - - Args: - ref_str: string representation of the reference. - - Returns: - FeatureSetRef constructed from the string - """ - project = "" - if "/" in ref_str: - project, ref_str = ref_str.split("/") - - return cls(project, ref_str) - - def to_proto(self) -> FeatureSetReferenceProto: - """ - Convert and return this feature set reference to protobuf. - - Returns: - Protobuf version of this feature set reference. 
- """ - return self.proto - - def __str__(self): - # human readable string of the reference - return f"FeatureSetRef<{self.__repr__()}>" - - def __repr__(self): - # return string representation of the reference - # [project/]name - # in protov3 unset string and int fields default to "" and 0 - ref_str = "" - if len(self.proto.project) > 0: - ref_str += self.proto.project + "/" - ref_str += self.proto.name - return ref_str - - def __eq__(self, other): - # compare with other feature set - return hash(self) == hash(other) - - def __hash__(self): - # hash this reference - return hash(repr(self)) - - -def _make_tfx_schema_domain_info_inline(schema: schema_pb2.Schema) -> None: - """ - Copy top level domain info defined at schema level into inline definition. - One use case is when importing domain info from Tensorflow metadata schema - into Feast features. Feast features do not have access to schema level information - so the domain info needs to be inline. - - Args: - schema: Tensorflow metadata schema - - Returns: None - """ - # Reference to domains defined at schema level - domain_ref_to_string_domain = {d.name: d for d in schema.string_domain} - domain_ref_to_float_domain = {d.name: d for d in schema.float_domain} - domain_ref_to_int_domain = {d.name: d for d in schema.int_domain} - - # With the reference, it is safe to remove the domains defined at schema level - del schema.string_domain[:] - del schema.float_domain[:] - del schema.int_domain[:] - - for feature in schema.feature: - domain_info_case = feature.WhichOneof("domain_info") - if domain_info_case == "domain": - domain_ref = feature.domain - if domain_ref in domain_ref_to_string_domain: - feature.string_domain.MergeFrom(domain_ref_to_string_domain[domain_ref]) - elif domain_ref in domain_ref_to_float_domain: - feature.float_domain.MergeFrom(domain_ref_to_float_domain[domain_ref]) - elif domain_ref in domain_ref_to_int_domain: - feature.int_domain.MergeFrom(domain_ref_to_int_domain[domain_ref]) - - -def _infer_pd_column_type(column, series, rows_to_sample): - dtype = None - sample_count = 0 - - # Loop over all rows for this column to infer types - for key, value in series.iteritems(): - sample_count += 1 - # Stop sampling at the row limit - if sample_count > rows_to_sample: - continue - - # Infer the specific type for this row - current_dtype = python_type_to_feast_value_type(name=column, value=value) - - # Make sure the type is consistent for column - if dtype: - if dtype != current_dtype: - raise ValueError( - f"Type mismatch detected in column {column}. Both " - f"the types {current_dtype} and {dtype} " - f"have been found." 
- ) - else: - # Store dtype in field to type map if it isnt already - dtype = current_dtype - - return dtype - - -def _create_field_list(entities: List[Entity], features: List[Feature]) -> List[Field]: - """ - Convert entities and features List to Field List - - Args: - entities: List of Entity Objects - features: List of Features Objects - - - Returns: - List[Field]: - List of field from entities and features combined - """ - fields: List[Field] = [] - - for entity in entities: - if isinstance(entity, Field): - fields.append(entity) - - for feature in features: - if isinstance(feature, Field): - fields.append(feature) - - return fields diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index 6e73df78c3..ebe69e7fad 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -31,7 +31,7 @@ KinesisOptions, SourceType, ) -from feast.feature_v2 import FeatureV2 +from feast.feature import Feature from feast.loaders import yaml as feast_yaml @@ -44,7 +44,7 @@ def __init__( self, name: str, entities: Union[str, List[str]], - features: Union[FeatureV2, List[FeatureV2]], + features: Union[Feature, List[Feature]], batch_source: Optional[DataSource] = None, stream_source: Optional[DataSource] = None, max_age: Optional[Duration] = None, @@ -320,7 +320,7 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): name=feature_table_proto.spec.name, entities=[entity for entity in feature_table_proto.spec.entities], features=[ - FeatureV2.from_proto(feature).to_proto() + Feature.from_proto(feature).to_proto() for feature in feature_table_proto.spec.features ], labels=feature_table_proto.spec.labels, @@ -420,7 +420,7 @@ def _update_from_feature_table(self, feature_table): Deep replaces one feature table with another Args: - feature_table: Feature set to use as a source of configuration + feature_table: Feature table to use as a source of configuration """ self.name = feature_table.name diff --git a/sdk/python/feast/feature_v2.py b/sdk/python/feast/feature_v2.py deleted file mode 100644 index f3aecf3a4f..0000000000 --- a/sdk/python/feast/feature_v2.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright 2020 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
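For context on the sdk/python/feast/feature_table.py change above, a sketch of how a FeatureTable would now be declared against the renamed Feature class (illustrative names; assumes no batch or stream source is attached at construction time):

from feast.feature import Feature
from feast.feature_table import FeatureTable
from feast.value_type import ValueType

# Illustrative table: the entity and feature names are made up for this sketch.
driver_trips = FeatureTable(
    name="driver_trips",
    entities=["driver_id"],
    features=[Feature(name="trips_today", dtype=ValueType.INT32)],
)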
- -from typing import MutableMapping, Optional - -from feast.core.Feature_pb2 import FeatureSpecV2 as FeatureSpecProto -from feast.types import Value_pb2 as ValueTypeProto -from feast.value_type import ValueType - - -class FeatureV2: - """FeatureV2 field type""" - - def __init__( - self, - name: str, - dtype: ValueType, - labels: Optional[MutableMapping[str, str]] = None, - ): - self._name = name - if not isinstance(dtype, ValueType): - raise ValueError("dtype is not a valid ValueType") - self._dtype = dtype - if labels is None: - self._labels = dict() # type: MutableMapping - else: - self._labels = labels - - def __eq__(self, other): - if ( - self.name != other.name - or self.dtype != other.dtype - or self.labels != other.labels - ): - return False - return True - - @property - def name(self): - """ - Getter for name of this field - """ - return self._name - - @property - def dtype(self) -> ValueType: - """ - Getter for data type of this field - """ - return self._dtype - - @property - def labels(self) -> MutableMapping[str, str]: - """ - Getter for labels of this field - """ - return self._labels - - def to_proto(self) -> FeatureSpecProto: - """Converts FeatureV2 object to its Protocol Buffer representation""" - value_type = ValueTypeProto.ValueType.Enum.Value(self.dtype.name) - - return FeatureSpecProto( - name=self.name, value_type=value_type, labels=self.labels, - ) - - @classmethod - def from_proto(cls, feature_proto: FeatureSpecProto): - """ - Args: - feature_proto: FeatureSpecV2 protobuf object - - Returns: - FeatureV2 object - """ - - feature = cls( - name=feature_proto.name, - dtype=ValueType(feature_proto.value_type), - labels=feature_proto.labels, - ) - - return feature diff --git a/sdk/python/feast/job.py b/sdk/python/feast/job.py deleted file mode 100644 index ff684d9cbe..0000000000 --- a/sdk/python/feast/job.py +++ /dev/null @@ -1,210 +0,0 @@ -from typing import List -from urllib.parse import urlparse - -import fastavro -import grpc -import pandas as pd - -from feast.constants import CONFIG_TIMEOUT_KEY -from feast.constants import FEAST_DEFAULT_OPTIONS as defaults -from feast.serving.ServingService_pb2 import ( - DATA_FORMAT_AVRO, - JOB_STATUS_DONE, - GetJobRequest, -) -from feast.serving.ServingService_pb2 import Job as JobProto -from feast.serving.ServingService_pb2_grpc import ServingServiceStub -from feast.staging.storage_client import get_staging_client -from feast.wait import wait_retry_backoff -from tensorflow_metadata.proto.v0 import statistics_pb2 - -# Maximum no of seconds to wait until the retrieval jobs status is DONE in Feast -# Currently set to the maximum query execution time limit in BigQuery -DEFAULT_TIMEOUT_SEC: int = 21600 - -# Maximum no of seconds to wait before reloading the job status in Feast -MAX_WAIT_INTERVAL_SEC: int = 60 - - -class RetrievalJob: - """ - A class representing a job for feature retrieval in Feast. 
- """ - - def __init__( - self, - job_proto: JobProto, - serving_stub: ServingServiceStub, - auth_metadata_plugin: grpc.AuthMetadataPlugin = None, - ): - """ - Args: - job_proto: Job proto object (wrapped by this job object) - serving_stub: Stub for Feast serving service - auth_metadata_plugin: plugin to fetch auth metadata - """ - self.job_proto = job_proto - self.serving_stub = serving_stub - self.auth_metadata = auth_metadata_plugin - - @property - def id(self): - """ - Getter for the Job Id - """ - return self.job_proto.id - - @property - def status(self): - """ - Getter for the Job status from Feast Core - """ - return self.job_proto.status - - def reload(self): - """ - Reload the latest job status - Returns: None - """ - self.job_proto = self.serving_stub.GetJob( - GetJobRequest(job=self.job_proto), - metadata=self.auth_metadata.get_signed_meta() if self.auth_metadata else (), - ).job - - def get_avro_files(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])): - """ - Wait until job is done to get the file uri to Avro result files on - Google Cloud Storage. - - Args: - timeout_sec (int): - Max no of seconds to wait until job is done. If "timeout_sec" - is exceeded, an exception will be raised. - - Returns: - str: Google Cloud Storage file uris of the returned Avro files. - """ - - def try_retrieve(): - self.reload() - return None, self.status == JOB_STATUS_DONE - - wait_retry_backoff( - retry_fn=try_retrieve, - timeout_secs=timeout_sec, - timeout_msg="Timeout exceeded while waiting for result. Please retry " - "this method or use a longer timeout value.", - ) - - if self.job_proto.error: - raise Exception(self.job_proto.error) - - if self.job_proto.data_format != DATA_FORMAT_AVRO: - raise Exception( - "Feast only supports Avro data format for now. Please check " - "your Feast Serving deployment." - ) - - return [urlparse(uri) for uri in self.job_proto.file_uris] - - def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])): - """ - Wait until job is done to get an iterable rows of result. The row can - only represent an Avro row in Feast 0.3. - - Args: - timeout_sec (int): - Max no of seconds to wait until job is done. If "timeout_sec" - is exceeded, an exception will be raised. - - Returns: - Iterable of Avro rows. - """ - uris = self.get_avro_files(timeout_sec) - for file_uri in uris: - file_obj = get_staging_client(file_uri.scheme).download_file(file_uri) - file_obj.seek(0) - avro_reader = fastavro.reader(file_obj) - - for record in avro_reader: - yield record - - def to_dataframe( - self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY]) - ) -> pd.DataFrame: - """ - Wait until a job is done to get an iterable rows of result. This method - will return the response as a DataFrame. - - Args: - timeout_sec (int): - Max no of seconds to wait until job is done. If "timeout_sec" - is exceeded, an exception will be raised. - - Returns: - pd.DataFrame: - Pandas DataFrame of the feature values. - """ - records = [r for r in self.result(timeout_sec=timeout_sec)] - return pd.DataFrame.from_records(records) - - def to_chunked_dataframe( - self, - max_chunk_size: int = -1, - timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY]), - ) -> pd.DataFrame: - """ - Wait until a job is done to get an iterable rows of result. This method - will split the response into chunked DataFrame of a specified size to - to be yielded to the instance calling it. - - Args: - max_chunk_size (int): - Maximum number of rows that the DataFrame should contain. 
- - timeout_sec (int): - Max no of seconds to wait until job is done. If "timeout_sec" - is exceeded, an exception will be raised. - - Returns: - pd.DataFrame: - Pandas DataFrame of the feature values. - """ - - # Object is Avro row type object, refer to self.result function for this type - records: List[dict] = [] - - # Max chunk size defined by user - for result in self.result(timeout_sec=timeout_sec): - records.append(result) - if len(records) == max_chunk_size: - df = pd.DataFrame.from_records(records) - records.clear() # Empty records array - yield df - - # Handle for last chunk that is < max_chunk_size - if records: - yield pd.DataFrame.from_records(records) - - def __iter__(self): - return iter(self.result()) - - def statistics( - self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY]) - ) -> statistics_pb2.DatasetFeatureStatisticsList: - """ - Get statistics computed over the retrieved data set. Statistics will only be computed for - columns that are part of Feast, and not the columns that were provided. - - Args: - timeout_sec (int): - Max no of seconds to wait until job is done. If "timeout_sec" - is exceeded, an exception will be raised. - - Returns: - DatasetFeatureStatisticsList containing statistics of Feast features over the retrieved dataset. - """ - self.get_avro_files(timeout_sec) # wait for job completion - if self.job_proto.error: - raise Exception(self.job_proto.error) - return self.job_proto.dataset_feature_statistics_list diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 1a56d04819..0d1c3e5e31 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -1,165 +1,39 @@ -import logging -from concurrent.futures import ProcessPoolExecutor -from functools import partial -from typing import Iterable, List - -import pandas as pd -from pyarrow import parquet as pq - -from feast.constants import DATETIME_COLUMN -from feast.feature_set import FeatureSet -from feast.type_map import ( - pa_column_to_proto_column, - pa_column_to_timestamp_proto_column, -) -from feast.types import Field_pb2 as FieldProto -from feast.types.FeatureRow_pb2 import FeatureRow - -_logger = logging.getLogger(__name__) +from typing import Dict, List GRPC_CONNECTION_TIMEOUT_DEFAULT = 3 # type: int GRPC_CONNECTION_TIMEOUT_APPLY = 300 # type: int FEAST_SERVING_URL_ENV_KEY = "FEAST_SERVING_URL" # type: str FEAST_CORE_URL_ENV_KEY = "FEAST_CORE_URL" # type: str BATCH_FEATURE_REQUEST_WAIT_TIME_SECONDS = 300 -KAFKA_CHUNK_PRODUCTION_TIMEOUT = 120 # type: int +BATCH_INGESTION_PRODUCTION_TIMEOUT = 120 # type: int -def _encode_pa_tables( - file: str, feature_set: str, fields: dict, ingestion_id: str, row_group_idx: int -) -> List[bytes]: +def check_field_mappings( + column_names: List[str], + feature_table_name: str, + feature_table_field_mappings: Dict[str, str], +) -> None: """ - Helper function to encode a PyArrow table(s) read from parquet file(s) into - FeatureRows. - - This function accepts a list of file directory pointing to many parquet - files. All parquet files must have the same schema. - - Each parquet file will be read into as a table and encoded into FeatureRows - using a pool of max_workers workers. - - Args: - file (str): - File directory of all the parquet file to encode. - Parquet file must have more than one row group. - - feature_set (str): - Feature set reference in the format f"{project}/{name}". - - fields (dict[str, enum.Enum.ValueType]): - A mapping of field names to their value types. 
- - ingestion_id (str): - UUID unique to this ingestion job. + Checks that all specified field mappings in FeatureTable can be found in + column names of specified ingestion source. - row_group_idx(int): - Row group index to read and encode into byte like FeatureRow - protobuf objects. - - Returns: - List[bytes]: - List of byte encoded FeatureRows from the parquet file. + Args: + column_names: Column names in provided ingestion source + feature_table_name: Name of FeatureTable + feature_table_field_mappings: Field mappings of FeatureTable """ - pq_file = pq.ParquetFile(file) - # Read parquet file as a PyArrow table - table = pq_file.read_row_group(row_group_idx) - - # Add datetime column - datetime_col = pa_column_to_timestamp_proto_column(table.column(DATETIME_COLUMN)) - - # Preprocess the columns by converting all its values to Proto values - proto_columns = { - field_name: pa_column_to_proto_column(dtype, table.column(field_name)) - for field_name, dtype in fields.items() - } - - # List to store result - feature_rows: List[bytes] = [] - # Loop optimization declaration(s) - field = FieldProto.Field - proto_items = proto_columns.items() - append = feature_rows.append - - # Iterate through the rows - for row_idx in range(table.num_rows): - feature_row = FeatureRow( - event_timestamp=datetime_col[row_idx], - feature_set=feature_set, - ingestion_id=ingestion_id, + if "datetime" not in column_names: + raise ValueError( + f'Provided data source does not contain entity "datetime" in columns {column_names}' ) - # Loop optimization declaration - ext = feature_row.fields.extend - - # Insert field from each column - for k, v in proto_items: - ext([field(name=k, value=v[row_idx])]) - - # Append FeatureRow in byte string form - append(feature_row.SerializeToString()) - return feature_rows + specified_field_mappings = [v for k, v in feature_table_field_mappings.items()] + is_valid = all(col_name in column_names for col_name in specified_field_mappings) -def get_feature_row_chunks( - file: str, - row_groups: List[int], - fs: FeatureSet, - ingestion_id: str, - max_workers: int, -) -> Iterable[List[bytes]]: - """ - Iterator function to encode a PyArrow table read from a parquet file to - FeatureRow(s). - - Args: - file (str): - File directory of the parquet file. The parquet file must have more - than one row group. - - row_groups (List[int]): - Specific row group indexes to be read and transformed in the parquet - file. - - fs (feast.feature_set.FeatureSet): - FeatureSet describing parquet files. - - ingestion_id (str): - UUID unique to this ingestion job. - - max_workers (int): - Maximum number of workers to spawn. - - Returns: - Iterable[List[bytes]]: - Iterable list of byte encoded FeatureRow(s). - """ - - feature_set = f"{fs.project}/{fs.name}" - - field_map = {field.name: field.dtype for field in fs.fields.values()} - func = partial(_encode_pa_tables, file, feature_set, field_map, ingestion_id) - - with ProcessPoolExecutor(max_workers) as pool: - for chunk in pool.map(func, row_groups): - yield chunk - return - - -def validate_dataframe(dataframe: pd.DataFrame, feature_set: FeatureSet): - if "datetime" not in dataframe.columns: - raise ValueError( - f'Dataframe does not contain entity "datetime" in columns {dataframe.columns}' + if not is_valid: + raise Exception( + f"Provided data source does not contain all field mappings previously " + f"defined for FeatureTable, {feature_table_name}." 
) - - for entity in feature_set.entities: - if entity.name not in dataframe.columns: - raise ValueError( - f"Dataframe does not contain entity {entity.name} in columns {dataframe.columns}" - ) - - for feature in feature_set.features: - if feature.name not in dataframe.columns: - raise ValueError( - f"Dataframe does not contain feature {feature.name} in columns {dataframe.columns}" - ) diff --git a/sdk/python/feast/source.py b/sdk/python/feast/source.py deleted file mode 100644 index 8e388376b3..0000000000 --- a/sdk/python/feast/source.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright 2019 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from feast.core.Source_pb2 import KafkaSourceConfig as KafkaSourceConfigProto -from feast.core.Source_pb2 import Source as SourceProto -from feast.core.Source_pb2 import SourceType as SourceTypeProto - - -class Source: - """ - Source is the top level class that represents a data source for finding - feature data. Source must be extended with specific implementations to - be useful - """ - - def __eq__(self, other): - return True - - @property - def source_type(self) -> str: - """ - The type of source. If not implemented, this will return "None" - """ - return "None" - - def to_proto(self): - """ - Converts this source object to its protobuf representation. - """ - return None - - @classmethod - def from_proto(cls, source_proto: SourceProto): - """ - Creates a source from a protobuf representation. This will instantiate - and return a specific source type, depending on the protobuf that is - passed in. - - Args: - source_proto: SourceProto python object - - Returns: - Source object - """ - if source_proto.type == SourceTypeProto.KAFKA: - return KafkaSource( - brokers=source_proto.kafka_source_config.bootstrap_servers, - topic=source_proto.kafka_source_config.topic, - ) - - return cls() - - -class KafkaSource(Source): - """ - Kafka feature set source type. - """ - - def __init__(self, brokers: str = "", topic: str = ""): - """ - - Args: - brokers: Comma separated list of Kafka brokers/bootstrap server - addresses, for example: my-host:9092,other-host:9092 - topic: Kafka topic to find feature rows for this feature set - """ - self._source_type = "Kafka" - self._brokers = brokers - self._topic = topic - - def __eq__(self, other): - if ( - self.brokers != other.brokers - or self.topic != other.topic - or self.source_type != other.source_type - ): - return False - return True - - @property - def brokers(self) -> str: - """ - Returns the list of broker addresses for this Kafka source - """ - return self._brokers - - @property - def topic(self) -> str: - """ - Returns the topic for this feature set - """ - return self._topic - - @property - def source_type(self) -> str: - """ - Returns the type of source. 
For a Kafka source this will always return - "kafka" - """ - return self._source_type - - def to_proto(self) -> SourceProto: - """ - Converts this Source into its protobuf representation - """ - return SourceProto( - type=SourceTypeProto.KAFKA, - kafka_source_config=KafkaSourceConfigProto( - bootstrap_servers=self.brokers, topic=self.topic - ), - ) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 82ac90bbd1..611e50dfb2 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -22,9 +22,6 @@ from google.protobuf.timestamp_pb2 import Timestamp from pyarrow.lib import TimestampType -from feast.constants import DATETIME_COLUMN -from feast.types import FeatureRow_pb2 as FeatureRowProto -from feast.types import Field_pb2 as FieldProto from feast.types.Value_pb2 import ( BoolList, BytesList, @@ -163,87 +160,6 @@ def python_type_to_feast_value_type( return type_map[value.dtype.__str__()] -def convert_df_to_feature_rows(dataframe: pd.DataFrame, feature_set): - """ - Returns a function that converts a Pandas Series to a Feast FeatureRow - for a given Feature Set and Pandas Dataframe - - Args: - dataframe: Dataframe that will be converted - feature_set: Feature set used as schema for conversion - - Returns: - Function that will do conversion - """ - - def convert_series_to_proto_values(row: pd.Series): - """ - Converts a Pandas Series to a Feast FeatureRow - - Args: - row: pd.Series The row that should be converted - - Returns: - Feast FeatureRow - """ - - feature_row = FeatureRowProto.FeatureRow( - event_timestamp=_pd_datetime_to_timestamp_proto( - dataframe[DATETIME_COLUMN].dtype, row[DATETIME_COLUMN] - ), - feature_set=feature_set.project + "/" + feature_set.name, - ) - - for field_name, field in feature_set.fields.items(): - feature_row.fields.extend( - [ - FieldProto.Field( - name=field.name, - value=_python_value_to_proto_value( - field.dtype, row[field.name] - ), - ) - ] - ) - return feature_row - - return convert_series_to_proto_values - - -def convert_dict_to_proto_values( - row: dict, df_datetime_dtype: pd.DataFrame.dtypes, feature_set -) -> FeatureRowProto.FeatureRow: - """ - Encode a dictionary describing a feature row into a FeatureRows object. - - Args: - row: Dictionary describing a feature row. - df_datetime_dtype: Pandas dtype of datetime column. - feature_set: Feature set describing feature row. - - Returns: - FeatureRow - """ - - feature_row = FeatureRowProto.FeatureRow( - event_timestamp=_pd_datetime_to_timestamp_proto( - df_datetime_dtype, row[DATETIME_COLUMN] - ), - feature_set=f"{feature_set.project}/{feature_set.name}", - ) - - for field_name, field in feature_set.fields.items(): - feature_row.fields.extend( - [ - FieldProto.Field( - name=field.name, - value=_python_value_to_proto_value(field.dtype, row[field.name]), - ) - ] - ) - return feature_row - - def _pd_datetime_to_timestamp_proto(dtype, value) -> Timestamp: """ Converts a Pandas datetime to a Timestamp Proto diff --git a/sdk/python/feast/value_type.py b/sdk/python/feast/value_type.py index aaf3de1822..eba16015d3 100644 --- a/sdk/python/feast/value_type.py +++ b/sdk/python/feast/value_type.py @@ -19,7 +19,7 @@ class ValueType(enum.Enum): """ - Feature value type. Used to define data types in Feature Sets. + Feature value type. Used to define data types in Feature Tables. 
""" UNKNOWN = 0 From 6ed731ce9ef4cd5bd04a3cc8cc67fa8a876d81bc Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 12:07:23 +0800 Subject: [PATCH 03/34] Cleanup python unit tests Signed-off-by: Terence --- sdk/python/tests/feast_core_server.py | 104 +-- sdk/python/tests/feast_serving_server.py | 26 +- sdk/python/tests/test_client.py | 865 +++++++---------------- sdk/python/tests/test_entity.py | 10 +- sdk/python/tests/test_feature.py | 23 - sdk/python/tests/test_feature_set.py | 394 ----------- sdk/python/tests/test_feature_table.py | 6 +- sdk/python/tests/test_job.py | 143 ---- 8 files changed, 322 insertions(+), 1249 deletions(-) delete mode 100644 sdk/python/tests/test_feature.py delete mode 100644 sdk/python/tests/test_feature_set.py delete mode 100644 sdk/python/tests/test_job.py diff --git a/sdk/python/tests/feast_core_server.py b/sdk/python/tests/feast_core_server.py index 677ecb84ec..f66830d7a4 100644 --- a/sdk/python/tests/feast_core_server.py +++ b/sdk/python/tests/feast_core_server.py @@ -7,16 +7,20 @@ from feast.core import CoreService_pb2_grpc as Core from feast.core.CoreService_pb2 import ( - ApplyFeatureSetRequest, - ApplyFeatureSetResponse, + ApplyEntityRequest, + ApplyEntityResponse, + ApplyFeatureTableRequest, + ApplyFeatureTableResponse, GetFeastCoreVersionResponse, - ListFeatureSetsRequest, - ListFeatureSetsResponse, + ListEntitiesRequest, + ListEntitiesResponse, + ListFeatureTablesRequest, + ListFeatureTablesResponse, ) -from feast.core.FeatureSet_pb2 import FeatureSet as FeatureSetProto -from feast.core.FeatureSet_pb2 import FeatureSetMeta, FeatureSetStatus -from feast.core.Source_pb2 import KafkaSourceConfig as KafkaSourceConfigProto -from feast.core.Source_pb2 import SourceType as SourceTypeProto +from feast.core.Entity_pb2 import Entity as EntityProto +from feast.core.Entity_pb2 import EntityMeta +from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto +from feast.core.FeatureTable_pb2 import FeatureTableMeta _logger = logging.getLogger(__name__) @@ -56,58 +60,62 @@ def intercept_service(self, continuation, handler_call_details): class CoreServicer(Core.CoreServiceServicer): def __init__(self): - self._feature_sets = dict() + self._feature_tables = dict() + self._entities = dict() def GetFeastCoreVersion(self, request, context): - return GetFeastCoreVersionResponse(version="0.3.2") - - def ListFeatureSets(self, request: ListFeatureSetsRequest, context): - - filtered_feature_set_response = [ - fs - for fs in list(self._feature_sets.values()) - if ( - not request.filter.feature_set_name - or request.filter.feature_set_name == "*" - or fs.spec.name == request.filter.feature_set_name - ) - ] - - return ListFeatureSetsResponse(feature_sets=filtered_feature_set_response) - - def ApplyFeatureSet(self, request: ApplyFeatureSetRequest, context): - feature_set = request.feature_set - - if feature_set.spec.source.type == SourceTypeProto.INVALID: - feature_set.spec.source.kafka_source_config.CopyFrom( - KafkaSourceConfigProto(bootstrap_servers="server.com", topic="topic1") - ) - feature_set.spec.source.type = SourceTypeProto.KAFKA - - feature_set_meta = FeatureSetMeta( - status=FeatureSetStatus.STATUS_READY, - created_timestamp=Timestamp(seconds=10), - ) - applied_feature_set = FeatureSetProto( - spec=feature_set.spec, meta=feature_set_meta + return GetFeastCoreVersionResponse(version="0.10.0") + + def ListFeatureTables(self, request: ListFeatureTablesRequest, context): + + filtered_feature_table_response = list(self._feature_tables.values()) + + 
return ListFeatureTablesResponse(tables=filtered_feature_table_response) + + def ApplyFeatureTable(self, request: ApplyFeatureTableRequest, context): + feature_table_spec = request.table_spec + + feature_table_meta = FeatureTableMeta(created_timestamp=Timestamp(seconds=10),) + applied_feature_table = FeatureTableProto( + spec=feature_table_spec, meta=feature_table_meta ) - self._feature_sets[feature_set.spec.name] = applied_feature_set + self._feature_tables[feature_table_spec.name] = applied_feature_table _logger.info( - "registered feature set " - + feature_set.spec.name + "registered feature table " + + feature_table_spec.name + " with " - + str(len(feature_set.spec.entities)) + + str(len(feature_table_spec.entities)) + " entities and " - + str(len(feature_set.spec.features)) + + str(len(feature_table_spec.features)) + " features" ) - return ApplyFeatureSetResponse( - feature_set=applied_feature_set, - status=ApplyFeatureSetResponse.Status.CREATED, + return ApplyFeatureTableResponse(table=applied_feature_table,) + + def ListEntities(self, request: ListEntitiesRequest, context): + + filtered_entities_response = list(self._entities.values()) + + return ListEntitiesResponse(entities=filtered_entities_response) + + def ApplyEntity(self, request: ApplyEntityRequest, context): + entity_spec = request.spec + + entity_meta = EntityMeta(created_timestamp=Timestamp(seconds=10),) + applied_entity = EntityProto(spec=entity_spec, meta=entity_meta) + self._entities[entity_spec.name] = applied_entity + + _logger.info( + "registered entity " + + entity_spec.name + + " with " + + str(entity_spec.value_type) + + " value" ) + return ApplyEntityResponse(entity=applied_entity,) + def serve(): server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) diff --git a/sdk/python/tests/feast_serving_server.py b/sdk/python/tests/feast_serving_server.py index aba6713275..50ce551405 100644 --- a/sdk/python/tests/feast_serving_server.py +++ b/sdk/python/tests/feast_serving_server.py @@ -5,8 +5,8 @@ import grpc -from feast.core import FeatureSet_pb2 as FeatureSetProto -from feast.core.CoreService_pb2 import ListFeatureSetsResponse +from feast.core import FeatureTable_pb2 as FeatureTableProto +from feast.core.CoreService_pb2 import ListFeatureTablesResponse from feast.core.CoreService_pb2_grpc import CoreServiceStub from feast.serving import ServingService_pb2_grpc as Serving from feast.serving.ServingService_pb2 import GetFeastServingInfoResponse @@ -19,9 +19,9 @@ def __init__(self, core_url: str = None): if core_url: self.__core_channel = None self.__connect_core(core_url) - self._feature_sets = ( + self._feature_tables = ( dict() - ) # type: Dict[str, FeatureSetProto.FeatureSetSpec] + ) # type: Dict[str, FeatureTableProto.FeatureTable] def __connect_core(self, core_url: str): if not core_url: @@ -40,18 +40,18 @@ def __connect_core(self, core_url: str): else: self._core_service_stub = CoreServiceStub(self.__core_channel) - def __get_feature_sets_from_core(self): - # Get updated list of feature sets - feature_sets = ( - self._core_service_stub.ListFeatureSets - ) # type: ListFeatureSetsResponse + def __get_feature_tables_from_core(self): + # Get updated list of feature tables + feature_tables = ( + self._core_service_stub.ListFeatureTables + ) # type: ListFeatureTablesResponse - # Store each feature set locally - for feature_set in list(feature_sets.feature_sets): - self._feature_sets[feature_set.name] = feature_set + # Store each feature table locally + for feature_table in list(feature_tables.tables): + 
self._feature_tables[feature_table.name] = feature_table def GetFeastServingVersion(self, request, context): - return GetFeastServingInfoResponse(version="0.3.2") + return GetFeastServingInfoResponse(version="0.10.0") def serve(): diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index be8bc78679..c152d6d400 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -13,58 +13,38 @@ # limitations under the License. import pkgutil import socket -import tempfile from concurrent import futures -from datetime import datetime from unittest import mock -import dataframes import grpc -import pandas as pd -import pandavro import pytest from google.protobuf.duration_pb2 import Duration from mock import MagicMock, patch from pytest_lazyfixture import lazy_fixture -from pytz import timezone from feast.client import Client -from feast.contrib.job_controller.client import Client as JCClient -from feast.contrib.job_controller.job import IngestJob from feast.core import CoreService_pb2_grpc as Core from feast.core.CoreService_pb2 import ( + GetEntityResponse, GetFeastCoreVersionResponse, - GetFeatureSetResponse, - ListFeatureSetsResponse, - ListFeaturesResponse, - ListIngestionJobsResponse, + GetFeatureTableResponse, + ListEntitiesResponse, + ListFeatureTablesResponse, ) -from feast.core.FeatureSet_pb2 import EntitySpec as EntitySpecProto -from feast.core.FeatureSet_pb2 import FeatureSet as FeatureSetProto -from feast.core.FeatureSet_pb2 import FeatureSetMeta as FeatureSetMetaProto -from feast.core.FeatureSet_pb2 import FeatureSetSpec as FeatureSetSpecProto -from feast.core.FeatureSet_pb2 import FeatureSetStatus as FeatureSetStatusProto -from feast.core.FeatureSet_pb2 import FeatureSpec as FeatureSpecProto -from feast.core.IngestionJob_pb2 import IngestionJob as IngestJobProto -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.core.Source_pb2 import KafkaSourceConfig, Source, SourceType -from feast.core.Store_pb2 import Store +from feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.core.Entity_pb2 import Entity as EntityProto +from feast.core.Entity_pb2 import EntityMeta as EntityMetaProto +from feast.core.Entity_pb2 import EntitySpecV2 as EntitySpecProto +from feast.core.Feature_pb2 import FeatureSpecV2 as FeatureSpecProto +from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto +from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto +from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto +from feast.data_source import DataSource, FileOptions, KafkaOptions, SourceType from feast.entity import Entity from feast.feature import Feature -from feast.feature_set import FeatureSet, FeatureSetRef +from feast.feature_table import FeatureTable from feast.serving import ServingService_pb2_grpc as Serving -from feast.serving.ServingService_pb2 import DataFormat, FeastServingType -from feast.serving.ServingService_pb2 import FeatureReference as FeatureRefProto -from feast.serving.ServingService_pb2 import ( - GetBatchFeaturesResponse, - GetFeastServingInfoResponse, - GetJobResponse, - GetOnlineFeaturesRequest, - GetOnlineFeaturesResponse, -) -from feast.serving.ServingService_pb2 import Job as BatchRetrievalJob -from feast.serving.ServingService_pb2 import JobStatus, JobType -from feast.source import KafkaSource +from feast.serving.ServingService_pb2 import GetFeastServingInfoResponse from feast.types import Value_pb2 as ValueProto from 
feast.value_type import ValueType from feast_core_server import ( @@ -76,7 +56,6 @@ CORE_URL = "core.feast.example.com" SERVING_URL = "serving.example.com" -jobcontroller_URL = "jobcontroller.feast.example.com" _PRIVATE_KEY_RESOURCE_PATH = "data/localhost.key" _CERTIFICATE_CHAIN_RESOURCE_PATH = "data/localhost.pem" _ROOT_CERTIFICATE_RESOURCE_PATH = "data/localhost.crt" @@ -114,11 +93,6 @@ def mock_client(self): client._serving_url = SERVING_URL return client - @pytest.fixture - def mock_jobcontroller_client(self): - client = JCClient(jobcontroller_url=jobcontroller_URL) - return client - @pytest.fixture def mock_client_with_auth(self): client = Client( @@ -324,83 +298,138 @@ def test_version(self, mocked_client, mocker): ], ) def test_get_online_features(self, mocked_client, auth_metadata, mocker): - ROW_COUNT = 300 + assert 1 == 1 - mocked_client._serving_service_stub = Serving.ServingServiceStub( + @pytest.mark.parametrize( + "mocked_client", + [ + lazy_fixture("mock_client"), + lazy_fixture("mock_client_with_auth"), + lazy_fixture("secure_mock_client"), + lazy_fixture("secure_mock_client_with_auth"), + ], + ) + def test_get_historical_features(self, mocked_client, mocker): + assert 1 == 1 + + @pytest.mark.parametrize( + "mocked_client", + [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], + ) + def test_get_entity(self, mocked_client, mocker): + mocked_client._core_service_stub = Core.CoreServiceStub( grpc.insecure_channel("") ) - def int_val(x): - return ValueProto.Value(int64_val=x) + entity_proto = EntityProto( + spec=EntitySpecProto( + name="driver_car_id", + description="Car driver id", + value_type=ValueProto.ValueType.STRING, + labels={"key1": "val1", "key2": "val2"}, + ), + meta=EntityMetaProto(), + ) - request = GetOnlineFeaturesRequest(project="driver_project") - request.features.extend( - [ - FeatureRefProto(feature_set="driver", name="age"), - FeatureRefProto(name="rating"), - FeatureRefProto(name="null_value"), - ] + mocker.patch.object( + mocked_client._core_service_stub, + "GetEntity", + return_value=GetEntityResponse(entity=entity_proto), ) - recieve_response = GetOnlineFeaturesResponse() - entity_rows = [] - for row_number in range(1, ROW_COUNT + 1): - request.entity_rows.append( - GetOnlineFeaturesRequest.EntityRow( - fields={"driver_id": int_val(row_number)} - ) + mocked_client.set_project("my_project") + entity = mocked_client.get_entity("my_entity") + + assert ( + entity.name == "driver_car_id" + and entity.description == "Car driver id" + and entity.value_type == ValueType(ValueProto.ValueType.STRING).name + and "key1" in entity.labels + and entity.labels["key1"] == "val1" + ) + + @pytest.mark.parametrize( + "mocked_client", + [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], + ) + def test_list_entities(self, mocked_client, mocker): + mocker.patch.object( + mocked_client, + "_core_service_stub", + return_value=Core.CoreServiceStub(grpc.insecure_channel("")), + ) + + entity_1_proto = EntityProto( + spec=EntitySpecProto( + name="driver_car_id", + description="Car driver id", + value_type=ValueProto.ValueType.INT64, + labels={"key1": "val1", "key2": "val2"}, ) - entity_rows.append({"driver_id": int_val(row_number)}) - field_values = GetOnlineFeaturesResponse.FieldValues( - fields={ - "driver_id": int_val(row_number), - "driver:age": int_val(1), - "rating": int_val(9), - "null_value": ValueProto.Value(), - }, - statuses={ - "driver_id": GetOnlineFeaturesResponse.FieldStatus.PRESENT, - "driver:age": 
GetOnlineFeaturesResponse.FieldStatus.PRESENT, - "rating": GetOnlineFeaturesResponse.FieldStatus.PRESENT, - "null_value": GetOnlineFeaturesResponse.FieldStatus.NULL_VALUE, - }, + ) + entity_2_proto = EntityProto( + spec=EntitySpecProto( + name="driver_ride_id", + description="Ride driver id", + value_type=ValueProto.ValueType.STRING, + labels={"key3": "val3", "key4": "val4"}, ) - recieve_response.field_values.append(field_values) + ) mocker.patch.object( - mocked_client._serving_service_stub, - "GetOnlineFeatures", - return_value=recieve_response, + mocked_client._core_service_stub, + "ListEntities", + return_value=ListEntitiesResponse( + entities=[entity_1_proto, entity_2_proto] + ), ) - got_response = mocked_client.get_online_features( - entity_rows=entity_rows, - feature_refs=["driver:age", "rating", "null_value"], - project="driver_project", - ) # type: GetOnlineFeaturesResponse - mocked_client._serving_service_stub.GetOnlineFeatures.assert_called_with( - request, metadata=auth_metadata + + entities = mocked_client.list_entities(labels={"key1": "val1"}) + assert len(entities) == 2 + + entity = entities[1] + assert ( + entity.name == "driver_ride_id" + and entity.description == "Ride driver id" + and entity.value_type == ValueType(ValueProto.ValueType.STRING).name + and "key3" in entity.labels + and entity.labels["key3"] == "val3" + and "key4" in entity.labels + and entity.labels["key4"] == "val4" ) - got_fields = got_response.field_values[0].fields - got_statuses = got_response.field_values[0].statuses + @pytest.mark.parametrize( + "test_client", [lazy_fixture("client"), lazy_fixture("secure_client")], + ) + def test_apply_entity_success(self, test_client): + + test_client.set_project("project1") + entity = Entity( + name="driver_car_id", + description="Car driver id", + value_type=ValueType.STRING, + labels={"team": "matchmaking"}, + ) + + # Register Entity with Core + test_client.apply_entity(entity) + + entities = test_client.list_entities() + + entity = entities[0] assert ( - got_fields["driver_id"] == int_val(1) - and got_statuses["driver_id"] - == GetOnlineFeaturesResponse.FieldStatus.PRESENT - and got_fields["driver:age"] == int_val(1) - and got_statuses["driver:age"] - == GetOnlineFeaturesResponse.FieldStatus.PRESENT - and got_fields["rating"] == int_val(9) - and got_statuses["rating"] == GetOnlineFeaturesResponse.FieldStatus.PRESENT - and got_fields["null_value"] == ValueProto.Value() - and got_statuses["null_value"] - == GetOnlineFeaturesResponse.FieldStatus.NULL_VALUE + len(entities) == 1 + and entity.name == "driver_car_id" + and entity.value_type == ValueType(ValueProto.ValueType.STRING).name + and entity.description == "Car driver id" + and "team" in entity.labels + and entity.labels["team"] == "matchmaking" ) @pytest.mark.parametrize( "mocked_client", [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], ) - def test_get_feature_set(self, mocked_client, mocker): + def test_get_feature_table(self, mocked_client, mocker): mocked_client._core_service_stub = Core.CoreServiceStub( grpc.insecure_channel("") ) @@ -409,11 +438,11 @@ def test_get_feature_set(self, mocked_client, mocker): mocker.patch.object( mocked_client._core_service_stub, - "GetFeatureSet", - return_value=GetFeatureSetResponse( - feature_set=FeatureSetProto( - spec=FeatureSetSpecProto( - name="my_feature_set", + "GetFeatureTable", + return_value=GetFeatureTableResponse( + table=FeatureTableProto( + spec=FeatureTableSpecProto( + name="my_feature_table", max_age=Duration(seconds=3600), 
labels={"key1": "val1", "key2": "val2"}, features=[ @@ -426,54 +455,63 @@ def test_get_feature_set(self, mocked_client, mocker): value_type=ValueProto.ValueType.FLOAT, ), ], - entities=[ - EntitySpecProto( - name="my_entity_1", - value_type=ValueProto.ValueType.INT64, - ) - ], - source=Source( - type=SourceType.KAFKA, - kafka_source_config=KafkaSourceConfig( - bootstrap_servers="localhost:9092", topic="topic" + entities=["my_entity_1"], + batch_source=DataSourceProto( + type=SourceType(1).name, + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + file_options=DataSourceProto.FileOptions( + file_format="avro", file_url="data/test.avro" ), + timestamp_column="ts_col", + date_partition_column="date_partition_col", ), ), - meta=FeatureSetMetaProto(), + meta=FeatureTableMetaProto(), ) ), ) mocked_client.set_project("my_project") - feature_set = mocked_client.get_feature_set("my_feature_set") + feature_table = mocked_client.get_feature_table("my_feature_table") assert ( - feature_set.name == "my_feature_set" - and "key1" in feature_set.labels - and feature_set.labels["key1"] == "val1" - and "key2" in feature_set.labels - and feature_set.labels["key2"] == "val2" - and feature_set.fields["my_feature_1"].name == "my_feature_1" - and feature_set.fields["my_feature_1"].dtype == ValueType.FLOAT - and feature_set.fields["my_entity_1"].name == "my_entity_1" - and feature_set.fields["my_entity_1"].dtype == ValueType.INT64 - and len(feature_set.features) == 2 - and len(feature_set.entities) == 1 + feature_table.name == "my_feature_table" + and "key1" in feature_table.labels + and feature_table.labels["key1"] == "val1" + and "key2" in feature_table.labels + and feature_table.labels["key2"] == "val2" + and len(feature_table.features) == 2 + and len(feature_table.entities) == 1 ) @pytest.mark.parametrize( "mocked_client", [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], ) - def test_list_feature_sets(self, mocked_client, mocker): + def test_list_feature_tables(self, mocked_client, mocker): mocker.patch.object( mocked_client, "_core_service_stub", return_value=Core.CoreServiceStub(grpc.insecure_channel("")), ) - feature_set_1_proto = FeatureSetProto( - spec=FeatureSetSpecProto( - project="test", + batch_source = DataSourceProto( + type=SourceType(1).name, + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + file_options=DataSourceProto.FileOptions( + file_format="avro", file_url="data/test.avro" + ), + timestamp_column="ts_col", + date_partition_column="date_partition_col", + ) + + feature_table_1_proto = FeatureTableProto( + spec=FeatureTableSpecProto( name="driver_car", max_age=Duration(seconds=3600), labels={"key1": "val1", "key2": "val2"}, @@ -482,11 +520,12 @@ def test_list_feature_sets(self, mocked_client, mocker): name="feature_1", value_type=ValueProto.ValueType.FLOAT ) ], + entities=["driver_car_id"], + batch_source=batch_source, ) ) - feature_set_2_proto = FeatureSetProto( - spec=FeatureSetSpecProto( - project="test", + feature_table_2_proto = FeatureTableProto( + spec=FeatureTableSpecProto( name="driver_ride", max_age=Duration(seconds=3600), labels={"key1": "val1"}, @@ -495,516 +534,102 @@ def test_list_feature_sets(self, mocked_client, mocker): name="feature_1", value_type=ValueProto.ValueType.FLOAT ) ], + entities=["driver_ride_id"], + batch_source=batch_source, ) ) mocker.patch.object( mocked_client._core_service_stub, - "ListFeatureSets", - return_value=ListFeatureSetsResponse( - 
feature_sets=[feature_set_1_proto, feature_set_2_proto] - ), - ) - - feature_sets = mocked_client.list_feature_sets(labels={"key1": "val1"}) - assert len(feature_sets) == 2 - - feature_set = feature_sets[0] - assert ( - feature_set.name == "driver_car" - and "key1" in feature_set.labels - and feature_set.labels["key1"] == "val1" - and "key2" in feature_set.labels - and feature_set.labels["key2"] == "val2" - and feature_set.fields["feature_1"].name == "feature_1" - and feature_set.fields["feature_1"].dtype == ValueType.FLOAT - and len(feature_set.features) == 1 - ) - - @pytest.mark.parametrize( - "mocked_client", - [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], - ) - def test_list_features(self, mocked_client, mocker): - mocker.patch.object( - mocked_client, - "_core_service_stub", - return_value=Core.CoreServiceStub(grpc.insecure_channel("")), - ) - - feature1_proto = FeatureSpecProto( - name="feature_1", value_type=ValueProto.ValueType.FLOAT - ) - feature2_proto = FeatureSpecProto( - name="feature_2", value_type=ValueProto.ValueType.STRING - ) - - mocker.patch.object( - mocked_client._core_service_stub, - "ListFeatures", - return_value=ListFeaturesResponse( - features={ - "driver_car:feature_1": feature1_proto, - "driver_car:feature_2": feature2_proto, - } - ), - ) - - features = mocked_client.list_features_by_ref(project="test") - assert len(features) == 2 - - ref_str_list = [] - feature_name_list = [] - feature_dtype_list = [] - for ref_str, feature_proto in features.items(): - ref_str_list.append(ref_str) - feature_name_list.append(feature_proto.name) - feature_dtype_list.append(feature_proto.dtype) - - assert ( - set(ref_str_list) == set(["driver_car:feature_1", "driver_car:feature_2"]) - and set(feature_name_list) == set(["feature_1", "feature_2"]) - and set(feature_dtype_list) == set([ValueType.FLOAT, ValueType.STRING]) - ) - - def test_list_ingest_jobs(self, mock_jobcontroller_client, mocker): - mocker.patch.object( - mock_jobcontroller_client, - "_jobcontroller_service_stub", - return_value=Core.JobControllerServiceStub(grpc.insecure_channel("")), - ) - - feature_set_ref = FeatureSetRef(project="test", name="driver",) - - mocker.patch.object( - mock_jobcontroller_client._jobcontroller_service_stub, - "ListIngestionJobs", - return_value=ListIngestionJobsResponse( - jobs=[ - IngestJobProto( - id="kafka-to-redis", - external_id="job-2222", - status=IngestionJobStatus.RUNNING, - feature_set_references=[feature_set_ref.to_proto()], - source=Source( - type=SourceType.KAFKA, - kafka_source_config=KafkaSourceConfig( - bootstrap_servers="localhost:9092", topic="topic" - ), - ), - stores=[Store(name="redis")], - ) - ] - ), - ) - - # list ingestion jobs by target feature set reference - ingest_jobs = mock_jobcontroller_client.list_ingest_jobs( - feature_set_ref=feature_set_ref - ) - assert len(ingest_jobs) >= 1 - - ingest_job = ingest_jobs[0] - assert ( - ingest_job.status == IngestionJobStatus.RUNNING - and ingest_job.id == "kafka-to-redis" - and ingest_job.external_id == "job-2222" - and ingest_job.feature_sets[0].name == "driver" - and ingest_job.source.source_type == "Kafka" - ) - - def test_restart_ingest_job(self, mock_jobcontroller_client, mocker): - mocker.patch.object( - mock_jobcontroller_client, - "_jobcontroller_service_stub", - return_value=Core.JobControllerServiceStub(grpc.insecure_channel("")), - ) - - ingest_job = IngestJob( - job_proto=IngestJobProto( - id="kafka-to-redis", - external_id="job#2222", - status=IngestionJobStatus.ERROR, + 
"ListFeatureTables", + return_value=ListFeatureTablesResponse( + tables=[feature_table_1_proto, feature_table_2_proto] ), - core_stub=mock_jobcontroller_client._jobcontroller_service_stub, ) - mock_jobcontroller_client.restart_ingest_job(ingest_job) - assert ( - mock_jobcontroller_client._jobcontroller_service_stub.RestartIngestionJob.called - ) - - def test_stop_ingest_job(self, mock_jobcontroller_client, mocker): - mocker.patch.object( - mock_jobcontroller_client, - "_jobcontroller_service_stub", - return_value=Core.JobControllerServiceStub(grpc.insecure_channel("")), - ) - - ingest_job = IngestJob( - job_proto=IngestJobProto( - id="kafka-to-redis", - external_id="job#2222", - status=IngestionJobStatus.RUNNING, - ), - core_stub=mock_jobcontroller_client._jobcontroller_service_stub, - ) + feature_tables = mocked_client.list_feature_tables(labels={"key1": "val1"}) + assert len(feature_tables) == 2 - mock_jobcontroller_client.stop_ingest_job(ingest_job) + feature_table = feature_tables[0] assert ( - mock_jobcontroller_client._jobcontroller_service_stub.StopIngestionJob.called + feature_table.name == "driver_car" + and "key1" in feature_table.labels + and feature_table.labels["key1"] == "val1" + and "key2" in feature_table.labels + and feature_table.labels["key2"] == "val2" + and len(feature_table.features) == 1 ) - @pytest.mark.parametrize( - "mocked_client", - [ - lazy_fixture("mock_client"), - lazy_fixture("mock_client_with_auth"), - lazy_fixture("secure_mock_client"), - lazy_fixture("secure_mock_client_with_auth"), - ], - ) - def test_get_historical_features(self, mocked_client, mocker): - - mocked_client._serving_service_stub = Serving.ServingServiceStub( - grpc.insecure_channel("") - ) - mocked_client._core_service_stub = Core.CoreServiceStub( - grpc.insecure_channel("") - ) - - mocker.patch.object( - mocked_client._core_service_stub, - "GetFeatureSet", - return_value=GetFeatureSetResponse( - feature_set=FeatureSetProto( - spec=FeatureSetSpecProto( - name="driver", - project="driver_project", - entities=[ - EntitySpecProto( - name="driver", value_type=ValueProto.ValueType.INT64 - ), - EntitySpecProto( - name="transaction", - value_type=ValueProto.ValueType.INT64, - ), - ], - features=[ - FeatureSpecProto( - name="driver_id", value_type=ValueProto.ValueType.FLOAT, - ), - FeatureSpecProto( - name="driver_name", - value_type=ValueProto.ValueType.STRING, - ), - ], - ), - meta=FeatureSetMetaProto(status=FeatureSetStatusProto.STATUS_READY), - ) - ), - ) - - expected_dataframe = pd.DataFrame( - { - "datetime": [datetime.utcnow() for _ in range(3)], - "driver": [1001, 1002, 1003], - "transaction": [1001, 1002, 1003], - "driver_id": [1001, 1002, 1003], - } - ) - - final_results = tempfile.mktemp() - pandavro.to_avro(file_path_or_buffer=final_results, df=expected_dataframe) - - mocker.patch.object( - mocked_client._serving_service_stub, - "GetBatchFeatures", - return_value=GetBatchFeaturesResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - file_uris=[f"file://{final_results}"], - data_format=DataFormat.DATA_FORMAT_AVRO, - ) - ), - ) - - mocker.patch.object( - mocked_client._serving_service_stub, - "GetJob", - return_value=GetJobResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - file_uris=[f"file://{final_results}"], - data_format=DataFormat.DATA_FORMAT_AVRO, - ) - ), - ) - - mocker.patch.object( - mocked_client._serving_service_stub, - "GetFeastServingInfo", - 
return_value=GetFeastServingInfoResponse( - job_staging_location=f"file://{tempfile.mkdtemp()}/", - type=FeastServingType.FEAST_SERVING_TYPE_BATCH, - ), - ) - - mocked_client.set_project("project1") - # TODO: Abstract away GCS client and GCP dependency - # NOTE: Feast Serving does not allow for feature references - # that specify the same feature in the same request. - with patch("google.cloud.storage.Client"): - response = mocked_client.get_historical_features( - entity_rows=pd.DataFrame( - { - "datetime": [ - pd.datetime.now(tz=timezone("Asia/Singapore")) - for _ in range(3) - ], - "driver": [1001, 1002, 1003], - "transaction": [1001, 1002, 1003], - } - ), - feature_refs=["driver:driver_id", "driver_id"], - project="driver_project", - ) # Type: GetBatchFeaturesResponse - - assert response.id == "123" and response.status == JobStatus.JOB_STATUS_DONE - - actual_dataframe = response.to_dataframe() - - assert actual_dataframe[["driver_id"]].equals(expected_dataframe[["driver_id"]]) - @pytest.mark.parametrize( "test_client", [lazy_fixture("client"), lazy_fixture("secure_client")], ) - def test_apply_feature_set_success(self, test_client): + def test_apply_feature_table_success(self, test_client): test_client.set_project("project1") - # Create Feature Sets - fs1 = FeatureSet("my-feature-set-1") - fs1.add(Feature(name="fs1-my-feature-1", dtype=ValueType.INT64)) - fs1.add(Feature(name="fs1-my-feature-2", dtype=ValueType.STRING)) - fs1.add(Entity(name="fs1-my-entity-1", dtype=ValueType.INT64)) - - fs2 = FeatureSet("my-feature-set-2") - fs2.add(Feature(name="fs2-my-feature-1", dtype=ValueType.STRING_LIST)) - fs2.add(Feature(name="fs2-my-feature-2", dtype=ValueType.BYTES_LIST)) - fs2.add(Entity(name="fs2-my-entity-1", dtype=ValueType.INT64)) - - # Register Feature Set with Core - test_client.apply(fs1) - test_client.apply(fs2) - - feature_sets = test_client.list_feature_sets() - - # List Feature Sets - assert ( - len(feature_sets) == 2 - and feature_sets[0].name == "my-feature-set-1" - and feature_sets[0].features[0].name == "fs1-my-feature-1" - and feature_sets[0].features[0].dtype == ValueType.INT64 - and feature_sets[0].features[1].name == "fs1-my-feature-2" - and feature_sets[0].features[1].dtype == ValueType.STRING - and feature_sets[0].entities[0].name == "fs1-my-entity-1" - and feature_sets[0].entities[0].dtype == ValueType.INT64 - and feature_sets[1].features[0].name == "fs2-my-feature-1" - and feature_sets[1].features[0].dtype == ValueType.STRING_LIST - and feature_sets[1].features[1].name == "fs2-my-feature-2" - and feature_sets[1].features[1].dtype == ValueType.BYTES_LIST - and feature_sets[1].entities[0].name == "fs2-my-entity-1" - and feature_sets[1].entities[0].dtype == ValueType.INT64 - ) - - @pytest.mark.parametrize( - "dataframe,test_client", - [ - (dataframes.GOOD, lazy_fixture("client")), - (dataframes.GOOD, lazy_fixture("secure_client")), - ], - ) - def test_feature_set_ingest_success(self, dataframe, test_client, mocker): - test_client.set_project("project1") - driver_fs = FeatureSet( - "driver-feature-set", source=KafkaSource(brokers="kafka:9092", topic="test") - ) - driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT)) - driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING)) - driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64)) - driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64)) - - # Register with Feast core - test_client.apply(driver_fs) - driver_fs = driver_fs.to_proto() - driver_fs.meta.status = 
FeatureSetStatusProto.STATUS_READY - - mocker.patch.object( - test_client._core_service_stub, - "GetFeatureSet", - return_value=GetFeatureSetResponse(feature_set=driver_fs), - ) - - # Need to create a mock producer - with patch("feast.client.get_producer"): - # Ingest data into Feast - test_client.ingest("driver-feature-set", dataframe) - - @pytest.mark.parametrize( - "dataframe,test_client,exception", - [(dataframes.GOOD, lazy_fixture("client"), Exception)], - ) - def test_feature_set_ingest_throws_exception_if_kafka_down( - self, dataframe, test_client, exception, mocker - ): - - test_client.set_project("project1") - driver_fs = FeatureSet( - "driver-feature-set", - source=KafkaSource(brokers="localhost:4412", topic="test"), - ) - driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT)) - driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING)) - driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64)) - driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64)) - - # Register with Feast core - test_client.apply(driver_fs) - driver_fs = driver_fs.to_proto() - driver_fs.meta.status = FeatureSetStatusProto.STATUS_READY - - mocker.patch.object( - test_client._core_service_stub, - "GetFeatureSet", - return_value=GetFeatureSetResponse(feature_set=driver_fs), - ) - - with pytest.raises(exception): - test_client.ingest("driver-feature-set", dataframe, timeout=1) - - @pytest.mark.parametrize( - "dataframe,exception,test_client", - [ - (dataframes.GOOD, TimeoutError, lazy_fixture("client")), - (dataframes.GOOD, TimeoutError, lazy_fixture("secure_client")), - ], - ) - def test_feature_set_ingest_fail_if_pending( - self, dataframe, exception, test_client, mocker - ): - with pytest.raises(exception): - test_client.set_project("project1") - driver_fs = FeatureSet( - "driver-feature-set", - source=KafkaSource(brokers="kafka:9092", topic="test"), - ) - driver_fs.add(Feature(name="feature_1", dtype=ValueType.FLOAT)) - driver_fs.add(Feature(name="feature_2", dtype=ValueType.STRING)) - driver_fs.add(Feature(name="feature_3", dtype=ValueType.INT64)) - driver_fs.add(Entity(name="entity_id", dtype=ValueType.INT64)) - - # Register with Feast core - test_client.apply(driver_fs) - driver_fs = driver_fs.to_proto() - driver_fs.meta.status = FeatureSetStatusProto.STATUS_PENDING - - mocker.patch.object( - test_client._core_service_stub, - "GetFeatureSet", - return_value=GetFeatureSetResponse(feature_set=driver_fs), - ) - - # Need to create a mock producer - with patch("feast.client.get_producer"): - # Ingest data into Feast - test_client.ingest("driver-feature-set", dataframe, timeout=1) - - @pytest.mark.parametrize( - "dataframe,exception,test_client", - [ - (dataframes.BAD_NO_DATETIME, Exception, lazy_fixture("client")), - ( - dataframes.BAD_INCORRECT_DATETIME_TYPE, - Exception, - lazy_fixture("client"), + # Create Feature Tables + batch_source = DataSource( + type=SourceType(1).name, + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + options=FileOptions(file_format="avro", file_url="data/test.avro"), + timestamp_column="ts_col", + date_partition_column="date_partition_col", + ) + + stream_source = DataSource( + type=SourceType(3).name, + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + options=KafkaOptions( + bootstrap_servers="localhost:9094", + class_path="random/path/to/class", + topic="test_topic", ), - (dataframes.BAD_NO_ENTITY, Exception, lazy_fixture("client")), - (dataframes.NO_FEATURES, 
Exception, lazy_fixture("client")), - (dataframes.BAD_NO_DATETIME, Exception, lazy_fixture("secure_client"),), - ( - dataframes.BAD_INCORRECT_DATETIME_TYPE, - Exception, - lazy_fixture("secure_client"), - ), - (dataframes.BAD_NO_ENTITY, Exception, lazy_fixture("secure_client")), - (dataframes.NO_FEATURES, Exception, lazy_fixture("secure_client")), - ], - ) - def test_feature_set_ingest_failure(self, test_client, dataframe, exception): - with pytest.raises(exception): - # Create feature set - driver_fs = FeatureSet("driver-feature-set") - - # Update based on dataset - driver_fs.infer_fields_from_df(dataframe) - - # Register with Feast core - test_client.apply(driver_fs) - - # Ingest data into Feast - test_client.ingest(driver_fs, dataframe=dataframe) - - @pytest.mark.parametrize( - "dataframe,test_client", - [ - (dataframes.ALL_TYPES, lazy_fixture("client")), - (dataframes.ALL_TYPES, lazy_fixture("secure_client")), - ], - ) - def test_feature_set_types_success(self, test_client, dataframe, mocker): - - test_client.set_project("project1") + timestamp_column="ts_col", + ) - all_types_fs = FeatureSet( - name="all_types", - entities=[Entity(name="user_id", dtype=ValueType.INT64)], + ft1 = FeatureTable( + name="my-feature-table-1", features=[ - Feature(name="float_feature", dtype=ValueType.FLOAT), - Feature(name="int64_feature", dtype=ValueType.INT64), - Feature(name="int32_feature", dtype=ValueType.INT32), - Feature(name="string_feature", dtype=ValueType.STRING), - Feature(name="bytes_feature", dtype=ValueType.BYTES), - Feature(name="bool_feature", dtype=ValueType.BOOL), - Feature(name="double_feature", dtype=ValueType.DOUBLE), - Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST), - Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST), - Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST), - Feature(name="string_list_feature", dtype=ValueType.STRING_LIST), - Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST), - Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST), - Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST), + Feature(name="fs1-my-feature-1", dtype=ValueType.INT64).to_proto(), + Feature(name="fs1-my-feature-2", dtype=ValueType.STRING).to_proto(), + Feature( + name="fs1-my-feature-3", dtype=ValueType.STRING_LIST + ).to_proto(), + Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST).to_proto(), ], - max_age=Duration(seconds=3600), + entities=["fs1-my-entity-1"], + labels={"team": "matchmaking"}, + batch_source=batch_source.to_proto(), + stream_source=stream_source.to_proto(), ) - # Register with Feast core - test_client.apply(all_types_fs) + # Register Feature Table with Core + test_client.apply_feature_table(ft1) - mocker.patch.object( - test_client._core_service, - "GetFeatureSet", - return_value=GetFeatureSetResponse(feature_set=all_types_fs.to_proto()), - ) + feature_tables = test_client.list_feature_tables() - # Need to create a mock producer - with patch("feast.client.get_producer"): - # Ingest data into Feast - test_client.ingest(all_types_fs, dataframe) + # List Feature Tables + assert ( + len(feature_tables) == 1 + and feature_tables[0].name == "my-feature-table-1" + and feature_tables[0].features[0].name == "fs1-my-feature-1" + and feature_tables[0].features[0].value_type == ValueProto.ValueType.INT64 + and feature_tables[0].features[1].name == "fs1-my-feature-2" + and feature_tables[0].features[1].value_type == ValueProto.ValueType.STRING + and feature_tables[0].features[2].name == "fs1-my-feature-3" + 
and feature_tables[0].features[2].value_type + == ValueProto.ValueType.STRING_LIST + and feature_tables[0].features[3].name == "fs1-my-feature-4" + and feature_tables[0].features[3].value_type + == ValueProto.ValueType.BYTES_LIST + and feature_tables[0].entities[0] == "fs1-my-entity-1" + ) @patch("grpc.channel_ready_future") def test_secure_channel_creation_with_secure_client( @@ -1058,7 +683,7 @@ def test_secure_channel_creation_with_secure_core_url( def test_auth_success_with_secure_channel_on_core_url( self, secure_core_client_with_auth ): - secure_core_client_with_auth.list_feature_sets() + secure_core_client_with_auth.list_feature_tables() def test_auth_success_with_insecure_channel_on_core_url( self, insecure_core_server_with_auth @@ -1068,10 +693,10 @@ def test_auth_success_with_insecure_channel_on_core_url( enable_auth=True, auth_token=_FAKE_JWT_TOKEN, ) - client.list_feature_sets() + client.list_feature_tables() def test_no_auth_sent_when_auth_disabled( self, insecure_core_server_that_blocks_auth ): client = Client(core_url=f"localhost:{insecure_core_server_that_blocks_auth}") - client.list_feature_sets() + client.list_feature_tables() diff --git a/sdk/python/tests/test_entity.py b/sdk/python/tests/test_entity.py index 4d146da729..d05412c3bb 100644 --- a/sdk/python/tests/test_entity.py +++ b/sdk/python/tests/test_entity.py @@ -21,7 +21,7 @@ from feast.client import Client from feast.core import CoreService_pb2_grpc as Core -from feast.entity import EntityV2 +from feast.entity import Entity from feast.value_type import ValueType from feast_core_server import CoreServicer @@ -52,7 +52,7 @@ def client(self, server): def test_entity_import_export_yaml(self): - test_entity = EntityV2( + test_entity = Entity( name="car_driver_entity", description="Driver entity for car rides", value_type=ValueType.STRING, @@ -63,14 +63,14 @@ def test_entity_import_export_yaml(self): string_yaml = test_entity.to_yaml() # Create a new entity object from the YAML string - actual_entity_from_string = EntityV2.from_yaml(string_yaml) + actual_entity_from_string = Entity.from_yaml(string_yaml) # Ensure equality is upheld to original entity assert test_entity == actual_entity_from_string def test_entity_class_contains_labels(): - entity = EntityV2( + entity = Entity( "my-entity", description="My entity", value_type=ValueType.STRING, @@ -81,6 +81,6 @@ def test_entity_class_contains_labels(): def test_entity_without_labels_empty_dict(): - entity = EntityV2("my-entity", description="My entity", value_type=ValueType.STRING) + entity = Entity("my-entity", description="My entity", value_type=ValueType.STRING) assert entity.labels == dict() assert len(entity.labels) == 0 diff --git a/sdk/python/tests/test_feature.py b/sdk/python/tests/test_feature.py deleted file mode 100644 index bc83683e0f..0000000000 --- a/sdk/python/tests/test_feature.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2019 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from feast.feature import FeatureRef - - -class TestFeatureRef: - def test_str_ref(self): - original_ref = FeatureRef(feature_set="test", name="test") - ref_str = repr(original_ref) - parsed_ref = FeatureRef.from_str(ref_str) - assert original_ref == parsed_ref diff --git a/sdk/python/tests/test_feature_set.py b/sdk/python/tests/test_feature_set.py deleted file mode 100644 index cf78cf048b..0000000000 --- a/sdk/python/tests/test_feature_set.py +++ /dev/null @@ -1,394 +0,0 @@ -# Copyright 2019 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pathlib -from collections import OrderedDict -from concurrent import futures -from datetime import datetime - -import dataframes -import grpc -import pandas as pd -import pytest -import pytz -from google.protobuf import json_format - -from feast.client import Client -from feast.core import CoreService_pb2_grpc as Core -from feast.entity import Entity -from feast.feature_set import ( - Feature, - FeatureSet, - FeatureSetRef, - _make_tfx_schema_domain_info_inline, -) -from feast.value_type import ValueType -from feast_core_server import CoreServicer -from tensorflow_metadata.proto.v0 import schema_pb2 - -CORE_URL = "core.feast.local" -SERVING_URL = "serving.feast.local" - - -class TestFeatureSet: - @pytest.fixture(scope="function") - def server(self): - server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - Core.add_CoreServiceServicer_to_server(CoreServicer(), server) - server.add_insecure_port("[::]:50051") - server.start() - yield server - server.stop(0) - - @pytest.fixture - def client(self, server): - return Client(core_url="localhost:50051") - - def test_add_remove_features_success(self): - fs = FeatureSet("my-feature-set") - fs.add(Feature(name="my-feature-1", dtype=ValueType.INT64)) - fs.add(Feature(name="my-feature-2", dtype=ValueType.INT64)) - fs.drop(name="my-feature-1") - assert len(fs.features) == 1 and fs.features[0].name == "my-feature-2" - - def test_remove_feature_failure(self): - with pytest.raises(KeyError): - fs = FeatureSet("my-feature-set") - fs.drop(name="my-feature-1") - - def test_update_from_source_failure(self): - with pytest.raises(Exception): - df = pd.DataFrame() - fs = FeatureSet("driver-feature-set") - fs.infer_fields_from_df(df) - - @pytest.mark.parametrize( - "dataframe,feature_count,entity_count,discard_unused_fields,features,entities", - [ - ( - dataframes.GOOD, - 3, - 1, - True, - [], - [Entity(name="entity_id", dtype=ValueType.INT64)], - ), - ( - dataframes.GOOD_FIVE_FEATURES, - 5, - 1, - True, - [], - [Entity(name="entity_id", dtype=ValueType.INT64)], - ), - ( - dataframes.GOOD_FIVE_FEATURES, - 6, - 1, - True, - [Feature(name="feature_6", dtype=ValueType.INT64)], - [Entity(name="entity_id", dtype=ValueType.INT64)], - ), - ( - dataframes.GOOD_FIVE_FEATURES_TWO_ENTITIES, - 5, - 2, - True, - [], - [ - Entity(name="entity_1_id", dtype=ValueType.INT64), - Entity(name="entity_2_id", dtype=ValueType.INT64), - ], - ), - ( - dataframes.GOOD_FIVE_FEATURES_TWO_ENTITIES, - 6, - 3, - False, - 
[], - [ - Entity(name="entity_1_id", dtype=ValueType.INT64), - Entity(name="entity_2_id", dtype=ValueType.INT64), - ], - ), - ( - dataframes.NO_FEATURES, - 0, - 1, - True, - [], - [Entity(name="entity_id", dtype=ValueType.INT64)], - ), - ( - pd.DataFrame( - { - "datetime": [ - datetime.utcnow().replace(tzinfo=pytz.utc) for _ in range(3) - ] - } - ), - 0, - 0, - True, - [], - [], - ), - ], - ids=[ - "Test small dataframe update with hardcoded entity", - "Test larger dataframe update with hardcoded entity", - "Test larger dataframe update with hardcoded entity and feature", - "Test larger dataframe update with two hardcoded entities and discarding of existing fields", - "Test larger dataframe update with two hardcoded entities and retention of existing fields", - "Test dataframe with no featuresdataframe", - "Test empty dataframe", - ], - ) - def test_add_features_from_df_success( - self, - dataframe, - feature_count, - entity_count, - discard_unused_fields, - features, - entities, - ): - my_feature_set = FeatureSet( - name="my_feature_set", - features=[Feature(name="dummy_f1", dtype=ValueType.INT64)], - entities=[Entity(name="dummy_entity_1", dtype=ValueType.INT64)], - ) - my_feature_set.infer_fields_from_df( - dataframe, - discard_unused_fields=discard_unused_fields, - features=features, - entities=entities, - ) - assert len(my_feature_set.features) == feature_count - assert len(my_feature_set.entities) == entity_count - - def test_import_tfx_schema(self): - tests_folder = pathlib.Path(__file__).parent - test_input_schema_json = open( - tests_folder / "data" / "tensorflow_metadata" / "bikeshare_schema.json" - ).read() - test_input_schema = schema_pb2.Schema() - json_format.Parse(test_input_schema_json, test_input_schema) - - feature_set = FeatureSet( - name="bikeshare", - entities=[Entity(name="station_id", dtype=ValueType.INT64)], - features=[ - Feature(name="name", dtype=ValueType.STRING), - Feature(name="status", dtype=ValueType.STRING), - Feature(name="latitude", dtype=ValueType.FLOAT), - Feature(name="longitude", dtype=ValueType.FLOAT), - Feature(name="location", dtype=ValueType.STRING), - ], - ) - - # Before update - for entity in feature_set.entities: - assert entity.presence is None - assert entity.shape is None - for feature in feature_set.features: - assert feature.presence is None - assert feature.shape is None - assert feature.string_domain is None - assert feature.float_domain is None - assert feature.int_domain is None - - feature_set.import_tfx_schema(test_input_schema) - - # After update - for feature in feature_set.features: - assert feature.presence is not None - assert feature.shape is not None - if feature.name in ["location", "name", "status"]: - assert feature.string_domain is not None - elif feature.name in ["latitude", "longitude"]: - assert feature.float_domain is not None - elif feature.name in ["station_id"]: - assert feature.int_domain is not None - - def test_export_tfx_schema(self): - tests_folder = pathlib.Path(__file__).parent - test_input_feature_set = FeatureSet.from_yaml( - str( - tests_folder - / "data" - / "tensorflow_metadata" - / "bikeshare_feature_set.yaml" - ) - ) - - expected_schema_json = open( - tests_folder / "data" / "tensorflow_metadata" / "bikeshare_schema.json" - ).read() - expected_schema = schema_pb2.Schema() - json_format.Parse(expected_schema_json, expected_schema) - _make_tfx_schema_domain_info_inline(expected_schema) - - actual_schema = test_input_feature_set.export_tfx_schema() - - assert len(actual_schema.feature) == 
len(expected_schema.feature) - for actual, expected in zip(actual_schema.feature, expected_schema.feature): - assert actual.SerializeToString() == expected.SerializeToString() - - def test_feature_set_import_export_yaml(self): - - test_feature_set = FeatureSet( - name="bikeshare", - entities=[Entity(name="station_id", dtype=ValueType.INT64)], - features=[ - Feature(name="name", dtype=ValueType.STRING), - Feature(name="longitude", dtype=ValueType.FLOAT), - Feature(name="location", dtype=ValueType.STRING), - ], - ) - - # Create a string YAML representation of the feature set - string_yaml = test_feature_set.to_yaml() - - # Create a new feature set object from the YAML string - actual_feature_set_from_string = FeatureSet.from_yaml(string_yaml) - - # Ensure equality is upheld to original feature set - assert test_feature_set == actual_feature_set_from_string - - -def make_tfx_schema_domain_info_inline(schema): - # Copy top-level domain info defined in the schema to inline definition. - # One use case is in FeatureSet which does not have access to the top-level domain - # info. - domain_ref_to_string_domain = {d.name: d for d in schema.string_domain} - domain_ref_to_float_domain = {d.name: d for d in schema.float_domain} - domain_ref_to_int_domain = {d.name: d for d in schema.int_domain} - - for feature in schema.feature: - domain_info_case = feature.WhichOneof("domain_info") - if domain_info_case == "domain": - domain_ref = feature.domain - if domain_ref in domain_ref_to_string_domain: - feature.string_domain.MergeFrom(domain_ref_to_string_domain[domain_ref]) - elif domain_ref in domain_ref_to_float_domain: - feature.float_domain.MergeFrom(domain_ref_to_float_domain[domain_ref]) - elif domain_ref in domain_ref_to_int_domain: - feature.int_domain.MergeFrom(domain_ref_to_int_domain[domain_ref]) - - -def test_feature_set_class_contains_labels(): - fs = FeatureSet("my-feature-set", labels={"key1": "val1", "key2": "val2"}) - assert "key1" in fs.labels.keys() and fs.labels["key1"] == "val1" - assert "key2" in fs.labels.keys() and fs.labels["key2"] == "val2" - - -def test_feature_class_contains_labels(): - fs = FeatureSet("my-feature-set", labels={"key1": "val1", "key2": "val2"}) - fs.add( - Feature( - name="my-feature-1", - dtype=ValueType.INT64, - labels={"feature_key1": "feature_val1"}, - ) - ) - assert "feature_key1" in fs.features[0].labels.keys() - assert fs.features[0].labels["feature_key1"] == "feature_val1" - - -def test_feature_set_without_labels_empty_dict(): - fs = FeatureSet("my-feature-set") - assert fs.labels == OrderedDict() - assert len(fs.labels) == 0 - - -def test_feature_without_labels_empty_dict(): - f = Feature("my feature", dtype=ValueType.INT64) - assert f.labels == OrderedDict() - assert len(f.labels) == 0 - - -def test_set_label_feature_set(): - fs = FeatureSet("my-feature-set") - fs.set_label("k1", "v1") - assert fs.labels["k1"] == "v1" - - -def test_set_labels_overwrites_existing(): - fs = FeatureSet("my-feature-set") - fs.set_label("k1", "v1") - fs.set_label("k1", "v2") - assert fs.labels["k1"] == "v2" - - -def test_remove_labels_empty_failure(): - fs = FeatureSet("my-feature-set") - with pytest.raises(KeyError): - fs.remove_label("key1") - - -def test_remove_labels_invalid_key_failure(): - fs = FeatureSet("my-feature-set") - fs.set_label("k1", "v1") - with pytest.raises(KeyError): - fs.remove_label("key1") - - -def test_unequal_feature_based_on_labels(): - f1 = Feature(name="feature-1", dtype=ValueType.INT64, labels={"k1": "v1"}) - f2 = Feature(name="feature-1", 
dtype=ValueType.INT64, labels={"k1": "v1"}) - assert f1 == f2 - f3 = Feature(name="feature-1", dtype=ValueType.INT64) - assert f1 != f3 - f4 = Feature(name="feature-1", dtype=ValueType.INT64, labels={"k1": "notv1"}) - assert f1 != f4 - - -def test_unequal_feature_set_based_on_labels(): - fs1 = FeatureSet("my-feature-set") - fs2 = FeatureSet("my-feature-set") - assert fs1 == fs2 - fs1.set_label("k1", "v1") - fs2.set_label("k1", "v1") - assert fs1 == fs2 - fs2.set_label("k1", "unequal") - assert not fs1 == fs2 - - -def test_unequal_feature_set_other_has_no_labels(): - fs1 = FeatureSet("my-feature-set") - fs2 = FeatureSet("my-feature-set") - assert fs1 == fs2 - fs1.set_label("k1", "v1") - assert not fs1 == fs2 - - -def test_unequal_feature_other_has_no_labels(): - f1 = Feature(name="feature-1", dtype=ValueType.INT64, labels={"k1": "v1"}) - f2 = Feature(name="feature-1", dtype=ValueType.INT64) - assert f1 != f2 - - -class TestFeatureSetRef: - def test_from_feature_set(self): - feature_set = FeatureSet("test", "test") - ref = FeatureSetRef.from_feature_set(feature_set) - - assert ref.name == "test" - assert ref.project == "test" - - def test_str_ref(self): - original_ref = FeatureSetRef(project="test", name="test") - ref_str = repr(original_ref) - parsed_ref = FeatureSetRef.from_str(ref_str) - assert original_ref == parsed_ref diff --git a/sdk/python/tests/test_feature_table.py b/sdk/python/tests/test_feature_table.py index a7a8849c76..8a1059bcb6 100644 --- a/sdk/python/tests/test_feature_table.py +++ b/sdk/python/tests/test_feature_table.py @@ -22,8 +22,8 @@ from feast.client import Client from feast.core import CoreService_pb2_grpc as Core from feast.data_source import DataSource, FileOptions, KafkaOptions, SourceType +from feast.feature import Feature from feast.feature_table import FeatureTable -from feast.feature_v2 import FeatureV2 from feast.value_type import ValueType from feast_core_server import CoreServicer @@ -82,8 +82,8 @@ def test_feature_table_import_export_yaml(self): test_feature_table = FeatureTable( name="car_driver", features=[ - FeatureV2(name="ride_distance", dtype=ValueType.FLOAT).to_proto(), - FeatureV2(name="ride_duration", dtype=ValueType.STRING).to_proto(), + Feature(name="ride_distance", dtype=ValueType.FLOAT).to_proto(), + Feature(name="ride_duration", dtype=ValueType.STRING).to_proto(), ], entities=["car_driver_entity"], labels={"team": "matchmaking"}, diff --git a/sdk/python/tests/test_job.py b/sdk/python/tests/test_job.py deleted file mode 100644 index 092130401e..0000000000 --- a/sdk/python/tests/test_job.py +++ /dev/null @@ -1,143 +0,0 @@ -# -# Copyright 2020 The Feast Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -import tempfile - -import boto3 -import grpc -import pandas as pd -import pandavro -import pytest -from moto import mock_s3 -from pandas.testing import assert_frame_equal -from pytest import fixture, raises - -from feast.job import JobProto, RetrievalJob -from feast.serving import ServingService_pb2_grpc as Serving -from feast.serving.ServingService_pb2 import DataFormat, GetJobResponse -from feast.serving.ServingService_pb2 import Job as BatchRetrievalJob -from feast.serving.ServingService_pb2 import JobStatus, JobType - -BUCKET = "test_bucket" - -TEST_DATA_FRAME = pd.DataFrame( - { - "driver": [1001, 1002, 1003], - "transaction": [1001, 1002, 1003], - "driver_id": [1001, 1002, 1003], - } -) - - -class TestRetrievalJob: - @fixture - def retrieve_job(self): - - serving_service_stub = Serving.ServingServiceStub(grpc.insecure_channel("")) - job_proto = JobProto( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_RUNNING, - ) - return RetrievalJob(job_proto, serving_service_stub) - - @fixture - def avro_data_path(self): - final_results = tempfile.mktemp() - pandavro.to_avro(file_path_or_buffer=final_results, df=TEST_DATA_FRAME) - return final_results - - def test_to_dataframe_local_file_staging_should_pass( - self, retrieve_job, avro_data_path, mocker - ): - mocker.patch.object( - retrieve_job.serving_stub, - "GetJob", - return_value=GetJobResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - file_uris=[f"file://{avro_data_path}"], - data_format=DataFormat.DATA_FORMAT_AVRO, - ) - ), - ) - retrived_df = retrieve_job.to_dataframe() - assert_frame_equal(TEST_DATA_FRAME, retrived_df, check_like=True) - - @mock_s3 - def test_to_dataframe_s3_file_staging_should_pass( - self, retrieve_job, avro_data_path, mocker - ): - s3_client = boto3.client("s3") - target = "test_proj/test_features.avro" - s3_client.create_bucket(Bucket=BUCKET) - with open(avro_data_path, "rb") as data: - s3_client.upload_fileobj(data, BUCKET, target) - - mocker.patch.object( - retrieve_job.serving_stub, - "GetJob", - return_value=GetJobResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - file_uris=[f"s3://{BUCKET}/{target}"], - data_format=DataFormat.DATA_FORMAT_AVRO, - ) - ), - ) - retrived_df = retrieve_job.to_dataframe() - assert_frame_equal(TEST_DATA_FRAME, retrived_df, check_like=True) - - @pytest.mark.parametrize( - "job_proto,exception", - [ - ( - GetJobResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - data_format=DataFormat.DATA_FORMAT_AVRO, - error="Testing job failure", - ) - ), - Exception, - ), - ( - GetJobResponse( - job=BatchRetrievalJob( - id="123", - type=JobType.JOB_TYPE_DOWNLOAD, - status=JobStatus.JOB_STATUS_DONE, - data_format=DataFormat.DATA_FORMAT_INVALID, - ) - ), - Exception, - ), - ], - ids=["when_retrieve_job_fails", "when_data_format_is_not_avro"], - ) - def test_to_dataframe_s3_file_staging_should_raise( - self, retrieve_job, mocker, job_proto, exception - ): - mocker.patch.object( - retrieve_job.serving_stub, "GetJob", return_value=job_proto, - ) - with raises(exception): - retrieve_job.to_dataframe() From 306ec08bb67397ba1e7d77bed0f44eb9592ff075 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 12:11:57 +0800 Subject: [PATCH 04/34] Refactor and parallelize e2e redis tests Signed-off-by: Terence --- infra/scripts/test-docker-compose.sh | 2 +- 
.../scripts/test-end-to-end-redis-cluster.sh | 5 +- infra/scripts/test-end-to-end.sh | 3 +- tests/e2e/conftest.py | 18 +- tests/e2e/pytest.ini | 5 +- .../all_types_parquet/all_types_parquet.yaml | 34 - tests/e2e/redis/basic-ingest-redis-serving.py | 1539 ----------------- tests/e2e/redis/basic/cust_trans_fs.yaml | 14 - tests/e2e/redis/basic/data.csv | 3 - tests/e2e/redis/basic/driver_fs.yaml | 12 - .../large_volume/cust_trans_large_fs.yaml | 12 - .../redis/parallel-ingest-redis-serving.py | 192 ++ tests/e2e/redis/specifications/dev_ft.yaml | 38 + tests/e2e/requirements.txt | 1 + 14 files changed, 258 insertions(+), 1620 deletions(-) delete mode 100644 tests/e2e/redis/all_types_parquet/all_types_parquet.yaml delete mode 100644 tests/e2e/redis/basic-ingest-redis-serving.py delete mode 100644 tests/e2e/redis/basic/cust_trans_fs.yaml delete mode 100644 tests/e2e/redis/basic/data.csv delete mode 100644 tests/e2e/redis/basic/driver_fs.yaml delete mode 100644 tests/e2e/redis/large_volume/cust_trans_large_fs.yaml create mode 100644 tests/e2e/redis/parallel-ingest-redis-serving.py create mode 100644 tests/e2e/redis/specifications/dev_ft.yaml diff --git a/infra/scripts/test-docker-compose.sh b/infra/scripts/test-docker-compose.sh index 35e1593dd6..45105d4839 100755 --- a/infra/scripts/test-docker-compose.sh +++ b/infra/scripts/test-docker-compose.sh @@ -63,4 +63,4 @@ export FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS=$(docker inspect -f '{{range .N ${PROJECT_ROOT_DIR}/infra/scripts/wait-for-it.sh ${FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS}:6566 --timeout=120 # Run e2e tests for Redis -docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e/redis && pytest --verbose -rs basic-ingest-redis-serving.py --core_url core:6565 --serving_url=online_serving:6566 --jobcontroller_url=jobcontroller:6570 --kafka_brokers=kafka:9092' +docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e/redis && pytest --verbose -rs parallel-ingest-redis-serving.py --core_url core:6565 --serving_url=online_serving:6566 --kafka_brokers=kafka:9092' diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index ba29961de6..544c1f4d3d 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -73,7 +73,7 @@ feast: # Connection string specifies the IP and ports of Redis instances in Redis cluster connection_string: "localhost:7000,localhost:7001,localhost:7002,localhost:7003,localhost:7004,localhost:7005" flush_frequency_seconds: 1 - # Subscriptions indicate which feature sets needs to be retrieved and used to populate this store + # Subscriptions indicate which feature tables needs to be retrieved and used to populate this store subscriptions: # Wildcards match all options. No filtering is done. - name: "*" @@ -102,7 +102,8 @@ ORIGINAL_DIR=$(pwd) cd tests/e2e set +e -pytest redis/* --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +CORE_NO=$(nproc --all) +pytest redis/parallel-ingest-redis-serving.py -n CORE_NO --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? 
if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/infra/scripts/test-end-to-end.sh b/infra/scripts/test-end-to-end.sh index 8f05efa9df..e65c72b0ba 100755 --- a/infra/scripts/test-end-to-end.sh +++ b/infra/scripts/test-end-to-end.sh @@ -119,7 +119,8 @@ cd tests/e2e set +e export GOOGLE_APPLICATION_CREDENTIALS=/etc/gcloud/service-account.json -pytest redis/* --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +CORE_NO=$(nproc --all) +pytest redis/parallel-ingest-redis-serving.py -n CORE_NO --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index ea2b809f4f..73d141145b 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -1,10 +1,26 @@ +import pytest + + def pytest_addoption(parser): parser.addoption("--core_url", action="store", default="localhost:6565") parser.addoption("--serving_url", action="store", default="localhost:6566") - parser.addoption("--jobcontroller_url", action="store", default="localhost:6570") parser.addoption("--allow_dirty", action="store", default="False") parser.addoption( "--gcs_path", action="store", default="gs://feast-templocation-kf-feast/" ) parser.addoption("--enable_auth", action="store", default="False") parser.addoption("--kafka_brokers", action="store", default="localhost:9092") + + +def pytest_runtest_makereport(item, call): + if "incremental" in item.keywords: + if call.excinfo is not None: + parent = item.parent + parent._previousfailed = item + + +def pytest_runtest_setup(item): + if "incremental" in item.keywords: + previousfailed = getattr(item.parent, "_previousfailed", None) + if previousfailed is not None: + pytest.xfail("previous test failed (%s)" % previousfailed.name) diff --git a/tests/e2e/pytest.ini b/tests/e2e/pytest.ini index b0e5a945f5..0e44395b67 100644 --- a/tests/e2e/pytest.ini +++ b/tests/e2e/pytest.ini @@ -1,3 +1,6 @@ [pytest] filterwarnings = - ignore::DeprecationWarning \ No newline at end of file + ignore::DeprecationWarning + +markers = + incremental: Skip subsequent tests if the previous test failed. 
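Note on the hooks above: the conftest.py changes plus the "incremental" marker registered in pytest.ini follow pytest's standard incremental-testing recipe. When a test in a marked class fails, pytest_runtest_makereport records the failed item on the parent, and pytest_runtest_setup then reports every later test in that class as xfail rather than producing a second, redundant failure. A minimal usage sketch, not part of this patch (the class and test names are hypothetical, for illustration only):

import pytest


@pytest.mark.incremental
class TestIngestThenServe:
    def test_apply_feature_table(self):
        # If this setup step fails, pytest_runtest_makereport records the
        # failed item on the parent class.
        assert True

    def test_get_online_features(self):
        # pytest_runtest_setup then xfails this dependent step instead of
        # reporting an unrelated-looking second failure.
        assert True

Run under pytest-xdist with --dist=loadscope, as the updated e2e scripts do, a whole test class is scheduled onto a single worker, so the in-class ordering that the incremental marker relies on is preserved even when tests run in parallel.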
diff --git a/tests/e2e/redis/all_types_parquet/all_types_parquet.yaml b/tests/e2e/redis/all_types_parquet/all_types_parquet.yaml deleted file mode 100644 index b054913c65..0000000000 --- a/tests/e2e/redis/all_types_parquet/all_types_parquet.yaml +++ /dev/null @@ -1,34 +0,0 @@ -kind: feature_set -spec: - name: all_types_parquet - entities: - - name: customer_id - valueType: INT64 - features: - - name: int32_feature_parquet - valueType: INT32 - - name: int64_feature_parquet - valueType: INT64 - - name: float_feature_parquet - valueType: DOUBLE - - name: double_feature_parquet - valueType: DOUBLE - - name: string_feature_parquet - valueType: STRING - - name: bytes_feature_parquet - valueType: BYTES - - name: int32_list_feature_parquet - valueType: INT64_LIST - - name: int64_list_feature_parquet - valueType: INT64_LIST - - name: float_list_feature_parquet - valueType: DOUBLE_LIST - - name: double_list_feature_parquet - valueType: DOUBLE_LIST - - name: string_list_feature_parquet - valueType: STRING_LIST - - name: bytes_list_feature_parquet - valueType: BYTES_LIST - - name: bool_list_feature_parquet - valueType: BOOL_LIST - maxAge: 0s diff --git a/tests/e2e/redis/basic-ingest-redis-serving.py b/tests/e2e/redis/basic-ingest-redis-serving.py deleted file mode 100644 index 853da9f529..0000000000 --- a/tests/e2e/redis/basic-ingest-redis-serving.py +++ /dev/null @@ -1,1539 +0,0 @@ -import math -import os -import random -import tempfile -import time -import uuid -from copy import copy -from datetime import datetime, timedelta - -import grpc -import numpy as np -import pandas as pd -import pytest -import pytz -from google.protobuf.duration_pb2 import Duration - -from feast.client import Client -from feast.config import Config -from feast.constants import CONFIG_AUTH_PROVIDER -from feast.contrib.job_controller.client import Client as JCClient -from feast.core import CoreService_pb2 -from feast.core.CoreService_pb2 import ApplyFeatureSetResponse, GetFeatureSetResponse -from feast.core.CoreService_pb2_grpc import CoreServiceStub -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_set import FeatureSet, FeatureSetRef -from feast.grpc.auth import get_auth_metadata_plugin -from feast.serving.ServingService_pb2 import GetOnlineFeaturesResponse -from feast.source import KafkaSource -from feast.type_map import ValueType -from feast.types.Value_pb2 import Int64List -from feast.types.Value_pb2 import Value as Value -from feast.wait import wait_retry_backoff - -FLOAT_TOLERANCE = 0.00001 -PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] -DIR_PATH = os.path.dirname(os.path.realpath(__file__)) -AUTH_PROVIDER = "google" - - -def basic_dataframe(entities, features, ingest_time, n_size, null_features=[]): - """ - Generate a basic feast-ingestable dataframe for testing. - Entity value incrementally increase from 1 to n_size - Features values are randomlly generated floats. - entities - names of entities - features - names of the features - ingest_time - ingestion timestamp - n_size - no. of rows in the generated dataframe. 
- null_features - names of features that contain null values - Returns the generated dataframe - """ - df_dict = { - "datetime": [ingest_time.replace(tzinfo=pytz.utc) for _ in range(n_size)], - } - for entity_name in entities: - df_dict[entity_name] = list(range(1, n_size + 1)) - for feature_name in features: - df_dict[feature_name] = [np.random.rand() for _ in range(n_size)] - for null_feature_name in null_features: - df_dict[null_feature_name] = [None for _ in range(n_size)] - return pd.DataFrame(df_dict) - - -def check_online_response(feature_ref, ingest_df, response): - """ - Check the feature value and status in the given online serving response. - feature_refs - string feature ref used to access feature in response - ingest_df - dataframe of ingested values - response - response to extract retrieved feature value and metadata - Returns True if given response has expected feature value and metadata, otherwise False. - """ - feature_ref_splits = feature_ref.split(":") - if len(feature_ref_splits) == 1: - feature_name = feature_ref - else: - _, feature_name = feature_ref_splits - - returned_status = response.field_values[0].statuses[feature_ref] - if ingest_df.loc[0, feature_name] is None: - return returned_status == GetOnlineFeaturesResponse.FieldStatus.NULL_VALUE - else: - sent_value = float(ingest_df.iloc[0][feature_name]) - returned_value = float(response.field_values[0].fields[feature_ref].float_val) - return ( - math.isclose(sent_value, returned_value, abs_tol=FLOAT_TOLERANCE) - and returned_status == GetOnlineFeaturesResponse.FieldStatus.PRESENT - ) - - -@pytest.fixture(scope="module") -def core_url(pytestconfig): - return pytestconfig.getoption("core_url") - - -@pytest.fixture(scope="module") -def serving_url(pytestconfig): - return pytestconfig.getoption("serving_url") - - -@pytest.fixture(scope="module") -def jobcontroller_url(pytestconfig): - return pytestconfig.getoption("jobcontroller_url") - - -@pytest.fixture(scope="module") -def allow_dirty(pytestconfig): - return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False - - -@pytest.fixture(scope="module") -def enable_auth(pytestconfig): - return True if pytestconfig.getoption("enable_auth").lower() == "true" else False - - -@pytest.fixture(scope="module") -def kafka_brokers(pytestconfig): - return pytestconfig.getoption("kafka_brokers") - - -@pytest.fixture(scope="module") -def client(core_url, serving_url, allow_dirty, enable_auth): - # Get client for core and serving - # if enable_auth is True, Google Id token will be - # passed in the metadata for authentication. - client = Client( - core_url=core_url, - serving_url=serving_url, - enable_auth=enable_auth, - auth_provider=AUTH_PROVIDER, - ) - client.create_project(PROJECT_NAME) - - # Ensure Feast core is active, but empty - if not allow_dirty: - feature_sets = client.list_feature_sets() - if len(feature_sets) > 0: - raise Exception( - "Feast cannot have existing feature sets registered. Exiting tests." 
- ) - - return client - - -@pytest.fixture(scope="module") -def jobcontroller_client(jobcontroller_url): - client = JCClient(jobcontroller_url=jobcontroller_url) - return client - - -@pytest.fixture(scope="module") -def ingest_time(): - return datetime.utcnow() - - -@pytest.fixture(scope="module") -def cust_trans_df(ingest_time): - return basic_dataframe( - entities=["customer_id"], - features=["daily_transactions", "total_transactions"], - null_features=["null_values"], - ingest_time=ingest_time, - n_size=5, - ) - - -@pytest.fixture(scope="module") -def driver_df(ingest_time): - return basic_dataframe( - entities=["driver_id"], - features=["rating", "cost"], - ingest_time=ingest_time, - n_size=5, - ) - - -def test_version_returns_results(client): - version_info = client.version() - assert not version_info["core"] == "not configured" - assert not version_info["serving"] == "not configured" - - -def test_list_feature_sets_when_auth_enabled_should_raise(enable_auth): - if enable_auth: - client = Client(core_url=core_url, serving_url=serving_url, enable_auth=False) - with pytest.raises(ConnectionError): - client.list_feature_sets() - - -@pytest.mark.timeout(45) -@pytest.mark.run(order=10) -def test_basic_register_feature_set_success(client): - # Register feature set without project - cust_trans_fs_expected = FeatureSet.from_yaml( - f"{DIR_PATH}/basic/cust_trans_fs.yaml" - ) - driver_fs_expected = FeatureSet.from_yaml(f"{DIR_PATH}/basic/driver_fs.yaml") - client.apply(cust_trans_fs_expected) - client.apply(driver_fs_expected) - cust_trans_fs_actual = client.get_feature_set("customer_transactions") - assert cust_trans_fs_actual == cust_trans_fs_expected - driver_fs_actual = client.get_feature_set("driver") - assert driver_fs_actual == driver_fs_expected - - # Register feature set with project - cust_trans_fs_expected = FeatureSet.from_yaml( - f"{DIR_PATH}/basic/cust_trans_fs.yaml" - ) - client.set_project(PROJECT_NAME) - client.apply(cust_trans_fs_expected) - cust_trans_fs_actual = client.get_feature_set( - "customer_transactions", project=PROJECT_NAME - ) - assert cust_trans_fs_actual == cust_trans_fs_expected - - # Register feature set with labels - driver_unlabelled_fs = FeatureSet( - "driver_unlabelled", - features=[Feature("rating", ValueType.FLOAT), Feature("cost", ValueType.FLOAT)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - driver_labeled_fs_expected = FeatureSet( - "driver_labeled", - features=[Feature("rating", ValueType.FLOAT), Feature("cost", ValueType.FLOAT)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - labels={"key1": "val1"}, - ) - client.set_project(PROJECT_NAME) - client.apply(driver_unlabelled_fs) - client.apply(driver_labeled_fs_expected) - driver_fs_actual = client.list_feature_sets( - project=PROJECT_NAME, labels={"key1": "val1"} - )[0] - assert driver_fs_actual == driver_labeled_fs_expected - - # reset client's project for other tests - client.set_project() - - -@pytest.mark.timeout(300) -@pytest.mark.run(order=11) -def test_basic_ingest_success(client, cust_trans_df, driver_df): - cust_trans_fs = client.get_feature_set(name="customer_transactions") - driver_fs = client.get_feature_set(name="driver") - - # Ingest customer transaction data - client.ingest(cust_trans_fs, cust_trans_df) - client.ingest(driver_fs, driver_df) - time.sleep(5) - - -@pytest.mark.timeout(90) -@pytest.mark.run(order=12) -def test_basic_retrieve_online_success(client, cust_trans_df): - feature_refs = 
["daily_transactions", "total_transactions", "null_values"] - - # Poll serving for feature values until the correct values are returned - def try_get_features(): - response = client.get_online_features( - entity_rows=[ - {"customer_id": Value(int64_val=cust_trans_df.iloc[0]["customer_id"])} - ], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - is_ok = all( - [ - check_online_response(ref, cust_trans_df, response) - for ref in feature_refs - ] - ) - return response, is_ok - - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - -@pytest.mark.timeout(90) -@pytest.mark.run(order=13) -def test_basic_retrieve_online_multiple_featureset(client, cust_trans_df, driver_df): - # Test retrieve with different variations of the string feature refs - # ie feature set inference for feature refs without specified feature set - feature_ref_df_mapping = [ - ("customer_transactions:daily_transactions", cust_trans_df), - ("driver:rating", driver_df), - ("total_transactions", cust_trans_df), - ] - - # Poll serving for feature values until the correct values are returned - def try_get_features(): - feature_refs = [mapping[0] for mapping in feature_ref_df_mapping] - response = client.get_online_features( - entity_rows=[ - { - "customer_id": Value( - int64_val=cust_trans_df.iloc[0]["customer_id"] - ), - "driver_id": Value(int64_val=driver_df.iloc[0]["driver_id"]), - } - ], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - is_ok = all( - [ - check_online_response(ref, df, response) - for ref, df in feature_ref_df_mapping - ] - ) - return response, is_ok - - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - -@pytest.fixture(scope="module") -def nonlist_entity_dataframe(): - # Dataframe setup for feature retrieval with entity provided not in list format - N_ROWS = 2 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - customer_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "customer_id2": [i for i in range(N_ROWS)], - "customer2_rating": [i for i in range(N_ROWS)], - "customer2_cost": [float(i) + 0.5 for i in range(N_ROWS)], - "customer2_past_transactions_int": [[i, i + 2] for i in range(N_ROWS)], - "customer2_past_transactions_double": [ - [float(i) + 0.5, float(i) + 2] for i in range(N_ROWS) - ], - "customer2_past_transactions_float": [ - [float(i) + 0.5, float(i) + 2] for i in range(N_ROWS) - ], - "customer2_past_transactions_string": [ - ["first_" + str(i), "second_" + str(i)] for i in range(N_ROWS) - ], - "customer2_past_transactions_bool": [[True, False] for _ in range(N_ROWS)], - } - ) - return customer_df - - -@pytest.fixture(scope="module") -def list_entity_dataframe(): - # Dataframe setup for feature retrieval with entity provided in list format - N_ROWS = 2 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - customer_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "district_ids": [ - [np.int64(i), np.int64(i + 1), np.int64(i + 2)] for i in range(N_ROWS) - ], - "district_rating": [i for i in range(N_ROWS)], - "district_cost": [float(i) + 0.5 for i in range(N_ROWS)], - "district_past_transactions_int": [[i, i + 2] for i in range(N_ROWS)], - "district_past_transactions_double": [ - [float(i) + 0.5, float(i) + 2] for i in range(N_ROWS) - ], - "district_past_transactions_float": [ - [float(i) + 0.5, float(i) + 2] for i in range(N_ROWS) - ], - 
"district_past_transactions_string": [ - ["first_" + str(i), "second_" + str(i)] for i in range(N_ROWS) - ], - "district_past_transactions_bool": [[True, False] for _ in range(N_ROWS)], - } - ) - return customer_df - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=14) -def test_basic_retrieve_online_entity_nonlistform( - client, nonlist_entity_dataframe, list_entity_dataframe -): - # Case 1: Feature retrieval with multiple entities retrieval check - customer_fs = FeatureSet( - name="customer2", - features=[ - Feature(name="customer2_rating", dtype=ValueType.INT64), - Feature(name="customer2_cost", dtype=ValueType.FLOAT), - Feature(name="customer2_past_transactions_int", dtype=ValueType.INT64_LIST), - Feature( - name="customer2_past_transactions_double", dtype=ValueType.DOUBLE_LIST - ), - Feature( - name="customer2_past_transactions_float", dtype=ValueType.FLOAT_LIST - ), - Feature( - name="customer2_past_transactions_string", dtype=ValueType.STRING_LIST - ), - Feature(name="customer2_past_transactions_bool", dtype=ValueType.BOOL_LIST), - ], - entities=[Entity("customer_id2", ValueType.INT64)], - max_age=Duration(seconds=3600), - ) - - client.set_project(PROJECT_NAME) - client.apply(customer_fs) - - customer_fs = client.get_feature_set(name="customer2") - client.ingest(customer_fs, nonlist_entity_dataframe, timeout=600) - time.sleep(15) - - online_request_entity = [{"customer_id2": 0}, {"customer_id2": 1}] - online_request_features = [ - "customer2_rating", - "customer2_cost", - "customer2_past_transactions_int", - "customer2_past_transactions_double", - "customer2_past_transactions_float", - "customer2_past_transactions_string", - "customer2_past_transactions_bool", - ] - online_request_entity2 = [ - {"customer_id2": Value(int64_val=0)}, - {"customer_id2": Value(int64_val=1)}, - ] - - def try_get_features1(): - response = client.get_online_features( - entity_rows=online_request_entity, feature_refs=online_request_features - ) - is_ok = check_online_response( - "customer2_rating", nonlist_entity_dataframe, response - ) - return response, is_ok - - def try_get_features2(): - response = client.get_online_features( - entity_rows=online_request_entity2, feature_refs=online_request_features - ) - is_ok = check_online_response( - "customer2_rating", nonlist_entity_dataframe, response - ) - return response, is_ok - - online_features_actual1 = wait_retry_backoff( - retry_fn=try_get_features1, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_actual2 = wait_retry_backoff( - retry_fn=try_get_features2, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_expected = { - "customer_id2": [0, 1], - "customer2_rating": [0, 1], - "customer2_cost": [0.5, 1.5], - "customer2_past_transactions_int": [[0, 2], [1, 3]], - "customer2_past_transactions_double": [[0.5, 2.0], [1.5, 3.0]], - "customer2_past_transactions_float": [[0.5, 2.0], [1.5, 3.0]], - "customer2_past_transactions_string": [ - ["first_0", "second_0"], - ["first_1", "second_1"], - ], - "customer2_past_transactions_bool": [[True, False], [True, False]], - } - - assert online_features_actual1.to_dict() == online_features_expected - assert online_features_actual2.to_dict() == online_features_expected - - # Case 2: Feature retrieval with multiple entities retrieval check with mixed types - with pytest.raises(TypeError) as excinfo: - online_request_entity2 = [{"customer_id": 0}, {"customer_id": "error_pls"}] - online_features_actual2 = 
client.get_online_features( - entity_rows=online_request_entity2, feature_refs=online_request_features - ) - - assert ( - "Input entity customer_id has mixed types, ValueType.STRING and ValueType.INT64. That is not allowed." - in str(excinfo.value) - ) - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=15) -def test_basic_retrieve_online_entity_listform(client, list_entity_dataframe): - # Case 1: Features retrieval with entity in list format check - district_fs = FeatureSet( - name="district", - features=[ - Feature(name="district_rating", dtype=ValueType.INT64), - Feature(name="district_cost", dtype=ValueType.FLOAT), - Feature(name="district_past_transactions_int", dtype=ValueType.INT64_LIST), - Feature( - name="district_past_transactions_double", dtype=ValueType.DOUBLE_LIST - ), - Feature( - name="district_past_transactions_float", dtype=ValueType.FLOAT_LIST - ), - Feature( - name="district_past_transactions_string", dtype=ValueType.STRING_LIST - ), - Feature(name="district_past_transactions_bool", dtype=ValueType.BOOL_LIST), - ], - entities=[Entity("district_ids", dtype=ValueType.INT64_LIST)], - max_age=Duration(seconds=3600), - ) - - client.set_project(PROJECT_NAME) - client.apply(district_fs) - - district_fs = client.get_feature_set(name="district") - client.ingest(district_fs, list_entity_dataframe, timeout=600) - time.sleep(15) - - online_request_entity = [{"district_ids": [np.int64(1), np.int64(2), np.int64(3)]}] - online_request_features = [ - "district_rating", - "district_cost", - "district_past_transactions_int", - "district_past_transactions_double", - "district_past_transactions_float", - "district_past_transactions_string", - "district_past_transactions_bool", - ] - online_request_entity2 = [ - {"district_ids": Value(int64_list_val=Int64List(val=[1, 2, 3]))} - ] - - def try_get_features1(): - response = client.get_online_features( - entity_rows=online_request_entity, feature_refs=online_request_features - ) - is_ok = check_online_response( - "district_rating", list_entity_dataframe, response - ) - return response, is_ok - - def try_get_features2(): - response = client.get_online_features( - entity_rows=online_request_entity2, feature_refs=online_request_features - ) - is_ok = check_online_response( - "district_rating", list_entity_dataframe, response - ) - return response, is_ok - - online_features_actual = wait_retry_backoff( - retry_fn=try_get_features1, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_actual2 = wait_retry_backoff( - retry_fn=try_get_features2, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_expected = { - "district_ids": [[np.int64(1), np.int64(2), np.int64(3)]], - "district_rating": [1], - "district_cost": [1.5], - "district_past_transactions_int": [[1, 3]], - "district_past_transactions_double": [[1.5, 3.0]], - "district_past_transactions_float": [[1.5, 3.0]], - "district_past_transactions_string": [["first_1", "second_1"]], - "district_past_transactions_bool": [[True, False]], - } - - assert online_features_actual.to_dict() == online_features_expected - assert online_features_actual2.to_dict() == online_features_expected - - # Case 2: Features retrieval with entity in list format check with mixed types - with pytest.raises(ValueError) as excinfo: - online_request_entity2 = [{"district_ids": [np.int64(1), np.int64(2), True]}] - online_features_actual2 = client.get_online_features( - entity_rows=online_request_entity2, 
feature_refs=online_request_features - ) - - assert ( - "List value type for field district_ids is inconsistent. ValueType.INT64 different from ValueType.BOOL." - in str(excinfo.value) - ) - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=16) -def test_basic_ingest_retrieval_fs(client): - # Set to another project to test ingestion based on current project context - client.set_project(PROJECT_NAME + "_NS1") - driver_fs = FeatureSet( - name="driver_fs", - features=[ - Feature(name="driver_fs_rating", dtype=ValueType.FLOAT), - Feature(name="driver_fs_cost", dtype=ValueType.FLOAT), - ], - entities=[Entity("driver_fs_id", ValueType.INT64)], - max_age=Duration(seconds=3600), - ) - client.apply(driver_fs) - - N_ROWS = 2 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - driver_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "driver_fs_id": [i for i in range(N_ROWS)], - "driver_fs_rating": [float(i) for i in range(N_ROWS)], - "driver_fs_cost": [float(i) + 0.5 for i in range(N_ROWS)], - } - ) - client.ingest(driver_fs, driver_df, timeout=600) - time.sleep(15) - - online_request_entity = [{"driver_fs_id": 0}, {"driver_fs_id": 1}] - online_request_features = ["driver_fs_rating", "driver_fs_cost"] - - def try_get_features(): - response = client.get_online_features( - entity_rows=online_request_entity, feature_refs=online_request_features - ) - is_ok = check_online_response("driver_fs_rating", driver_df, response) - return response, is_ok - - online_features_actual = wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_expected = { - "driver_fs_id": [0, 1], - "driver_fs_rating": [0.0, 1.0], - "driver_fs_cost": [0.5, 1.5], - } - - assert online_features_actual.to_dict() == online_features_expected - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=17) -def test_basic_ingest_retrieval_str(client): - # Set to another project to test ingestion based on current project context - client.set_project(PROJECT_NAME + "_NS1") - customer_fs = FeatureSet( - name="cust_fs", - features=[ - Feature(name="cust_rating", dtype=ValueType.INT64), - Feature(name="cust_cost", dtype=ValueType.FLOAT), - ], - entities=[Entity("cust_id", ValueType.INT64)], - max_age=Duration(seconds=3600), - ) - client.apply(customer_fs) - - N_ROWS = 2 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - cust_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "cust_id": [i for i in range(N_ROWS)], - "cust_rating": [i for i in range(N_ROWS)], - "cust_cost": [float(i) + 0.5 for i in range(N_ROWS)], - } - ) - client.ingest("cust_fs", cust_df, timeout=600) - time.sleep(15) - - online_request_entity = [{"cust_id": 0}, {"cust_id": 1}] - online_request_features = ["cust_rating", "cust_cost"] - - def try_get_features(): - response = client.get_online_features( - entity_rows=online_request_entity, feature_refs=online_request_features - ) - is_ok = check_online_response("cust_rating", cust_df, response) - return response, is_ok - - online_features_actual = wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_expected = { - "cust_id": [0, 1], - "cust_rating": [0, 1], - "cust_cost": [0.5, 1.5], - } - - assert online_features_actual.to_dict() == online_features_expected - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=18) -def test_basic_ingest_retrieval_multi_entities(client): - # Set to another 
project to test ingestion based on current project context - client.set_project(PROJECT_NAME + "_NS1") - merchant_fs = FeatureSet( - name="merchant_fs", - features=[Feature(name="merchant_sales", dtype=ValueType.FLOAT)], - entities=[ - Entity("driver_id", ValueType.INT64), - Entity("merchant_id", ValueType.INT64), - ], - max_age=Duration(seconds=3600), - ) - client.apply(merchant_fs) - - N_ROWS = 2 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - merchant_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "driver_id": [i for i in range(N_ROWS)], - "merchant_id": [i for i in range(N_ROWS)], - "merchant_sales": [float(i) + 0.5 for i in range(N_ROWS)], - } - ) - client.ingest("merchant_fs", merchant_df, timeout=600) - time.sleep(15) - - online_request_entity = [ - {"driver_id": 0, "merchant_id": 0}, - {"driver_id": 1, "merchant_id": 1}, - ] - online_request_features = ["merchant_sales"] - - def try_get_features(): - response = client.get_online_features( - entity_rows=online_request_entity, feature_refs=online_request_features - ) - is_ok = check_online_response("merchant_sales", merchant_df, response) - return response, is_ok - - online_features_actual = wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - online_features_expected = { - "driver_id": [0, 1], - "merchant_id": [0, 1], - "merchant_sales": [0.5, 1.5], - } - - assert online_features_actual.to_dict() == online_features_expected - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=19) -def test_basic_retrieve_feature_row_missing_fields(client, cust_trans_df): - feature_refs = ["daily_transactions", "total_transactions", "null_values"] - - # apply cust_trans_fs and ingest dataframe - client.set_project(PROJECT_NAME + "_basic_retrieve_missing_fields") - old_cust_trans_fs = FeatureSet.from_yaml(f"{DIR_PATH}/basic/cust_trans_fs.yaml") - client.apply(old_cust_trans_fs) - client.ingest(old_cust_trans_fs, cust_trans_df) - - # update cust_trans_fs with one additional feature. - # feature rows ingested before the feature set update will be missing a field. - new_cust_trans_fs = client.get_feature_set(name="customer_transactions") - new_cust_trans_fs.add(Feature("n_trips", ValueType.INT64)) - client.apply(new_cust_trans_fs) - # sleep to ensure feature set update is propagated - time.sleep(15) - - # attempt to retrieve features from feature rows with missing fields - def try_get_features(): - response = client.get_online_features( - entity_rows=[ - {"customer_id": np.int64(cust_trans_df.iloc[0]["customer_id"])} - ], - feature_refs=feature_refs + ["n_trips"], - ) # type: GetOnlineFeaturesResponse - # check if the ingested fields can be correctly retrieved. 
- is_ok = all( - [ - check_online_response(ref, cust_trans_df, response) - for ref in feature_refs - ] - ) - # should return null_value status for missing field n_trips - is_missing_ok = ( - response.field_values[0].statuses["n_trips"] - == GetOnlineFeaturesResponse.FieldStatus.NULL_VALUE - ) - return response, is_ok and is_missing_ok - - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=20) -def test_basic_retrieve_feature_row_extra_fields(client, cust_trans_df): - feature_refs = ["daily_transactions", "total_transactions"] - # apply cust_trans_fs and ingest dataframe - client.set_project(PROJECT_NAME + "_basic_retrieve_missing_fields") - old_cust_trans_fs = FeatureSet.from_yaml(f"{DIR_PATH}/basic/cust_trans_fs.yaml") - client.apply(old_cust_trans_fs) - client.ingest(old_cust_trans_fs, cust_trans_df) - - # update cust_trans_fs with the null_values feature dropped. - # feature rows ingested before the feature set update will have an extra field. - new_cust_trans_fs = client.get_feature_set(name="customer_transactions") - new_cust_trans_fs.drop("null_values") - client.apply(new_cust_trans_fs) - # sleep to ensure feature set update is propagated - time.sleep(15) - - # attempt to retrieve features from feature rows with extra fields - def try_get_features(): - response = client.get_online_features( - entity_rows=[ - {"customer_id": np.int64(cust_trans_df.iloc[0]["customer_id"])} - ], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - # check if the non dropped fields can be correctly retrieved. - is_ok = all( - [ - check_online_response(ref, cust_trans_df, response) - for ref in feature_refs - ] - ) - return response, is_ok - - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - -@pytest.fixture(scope="module") -def all_types_dataframe(): - return pd.DataFrame( - { - "datetime": [datetime.utcnow().replace(tzinfo=pytz.utc) for _ in range(3)], - "user_id": [1001, 1002, 1003], - "int32_feature": [np.int32(1), np.int32(2), np.int32(3)], - "int64_feature": [np.int64(1), np.int64(2), np.int64(3)], - "float_feature": [np.float(0.1), np.float(0.2), np.float(0.3)], - "double_feature": [np.float64(0.1), np.float64(0.2), np.float64(0.3)], - "string_feature": ["one", "two", "three"], - "bytes_feature": [b"one", b"two", b"three"], - "bool_feature": [True, False, False], - "int32_list_feature": [ - np.array([1, 2, 3, 4], dtype=np.int32), - np.array([1, 2, 3, 4], dtype=np.int32), - np.array([1, 2, 3, 4], dtype=np.int32), - ], - "int64_list_feature": [ - np.array([1, 2, 3, 4], dtype=np.int64), - np.array([1, 2, 3, 4], dtype=np.int64), - np.array([1, 2, 3, 4], dtype=np.int64), - ], - "float_list_feature": [ - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float32), - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float32), - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float32), - ], - "double_list_feature": [ - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float64), - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float64), - np.array([1.1, 1.2, 1.3, 1.4], dtype=np.float64), - ], - "string_list_feature": [ - np.array(["one", "two", "three"]), - np.array(["one", "two", "three"]), - np.array(["one", "two", "three"]), - ], - "bytes_list_feature": [ - np.array([b"one", b"two", b"three"]), - np.array([b"one", b"two", b"three"]), - np.array([b"one", b"two", b"three"]), - ], - "bool_list_feature": [ - 
[True, False, True], - [True, False, True], - [True, False, True], - ], - } - ) - - -@pytest.mark.timeout(45) -@pytest.mark.run(order=21) -def test_all_types_register_feature_set_success(client): - client.set_project(PROJECT_NAME) - - all_types_fs_expected = FeatureSet( - name="all_types", - entities=[Entity(name="user_id", dtype=ValueType.INT64)], - features=[ - Feature(name="float_feature", dtype=ValueType.FLOAT), - Feature(name="int64_feature", dtype=ValueType.INT64), - Feature(name="int32_feature", dtype=ValueType.INT32), - Feature(name="string_feature", dtype=ValueType.STRING), - Feature(name="bytes_feature", dtype=ValueType.BYTES), - Feature(name="bool_feature", dtype=ValueType.BOOL), - Feature(name="double_feature", dtype=ValueType.DOUBLE), - Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST), - Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST), - Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST), - Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST), - Feature(name="string_list_feature", dtype=ValueType.STRING_LIST), - Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST), - Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST), - ], - max_age=Duration(seconds=3600), - ) - - # Register feature set - client.apply(all_types_fs_expected) - - # Feast Core needs some time to fully commit the FeatureSet applied - # when there is no existing job yet for the Featureset - time.sleep(15) - - all_types_fs_actual = client.get_feature_set(name="all_types") - - assert all_types_fs_actual == all_types_fs_expected - - if all_types_fs_actual is None: - raise Exception( - "Client cannot retrieve 'all_types_fs' FeatureSet " - "after registration. Either Feast Core does not save the " - "FeatureSet correctly or the client needs to wait longer for FeatureSet " - "to be committed." 
- ) - - -@pytest.mark.timeout(300) -@pytest.mark.run(order=22) -def test_all_types_ingest_success(client, all_types_dataframe): - # Get all_types feature set - all_types_fs = client.get_feature_set(name="all_types") - - # Ingest user embedding data - client.ingest(all_types_fs, all_types_dataframe) - - -@pytest.mark.timeout(90) -@pytest.mark.run(order=23) -def test_all_types_retrieve_online_success(client, all_types_dataframe): - # Poll serving for feature values until the correct values are returned_float_list - feature_refs = [ - "float_feature", - "int64_feature", - "int32_feature", - "double_feature", - "string_feature", - "bool_feature", - "bytes_feature", - "float_list_feature", - "int64_list_feature", - "int32_list_feature", - "string_list_feature", - "bytes_list_feature", - "double_list_feature", - "bool_list_feature", - ] - - def try_get_features(): - response = client.get_online_features( - entity_rows=[ - {"user_id": Value(int64_val=all_types_dataframe.iloc[0]["user_id"])} - ], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - is_ok = check_online_response("float_feature", all_types_dataframe, response) - return response, is_ok - - response = wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - # check returned values - returned_float_list = ( - response.field_values[0].fields["float_list_feature"].float_list_val.val - ) - sent_float_list = all_types_dataframe.iloc[0]["float_list_feature"] - assert math.isclose( - returned_float_list[0], sent_float_list[0], abs_tol=FLOAT_TOLERANCE - ) - # check returned metadata - assert ( - response.field_values[0].statuses["float_list_feature"] - == GetOnlineFeaturesResponse.FieldStatus.PRESENT - ) - - -@pytest.mark.timeout(300) -@pytest.mark.run(order=35) -def test_all_types_ingest_jobs(jobcontroller_client, client, all_types_dataframe): - # list ingestion jobs given featureset - client.set_project(PROJECT_NAME) - - all_types_fs = client.get_feature_set(name="all_types") - ingest_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef.from_feature_set(all_types_fs) - ) - # filter ingestion jobs to only those that are running - ingest_jobs = [ - job for job in ingest_jobs if job.status == IngestionJobStatus.RUNNING - ] - assert len(ingest_jobs) >= 1 - - ingest_job = ingest_jobs[0] - # restart ingestion ingest_job - # restart means stop current job - # (replacement will be automatically spawned) - jobcontroller_client.restart_ingest_job(ingest_job) - # wait for replacement to be created - time.sleep(15) # should be more than polling_interval - - # id without timestamp part - # that remains the same between jobs - shared_id = "-".join(ingest_job.id.split("-")[:-1]) - ingest_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef.from_feature_set(all_types_fs) - ) - replacement_jobs = [ - job - for job in ingest_jobs - if job.status == IngestionJobStatus.RUNNING - and job.id.startswith(shared_id) - and job.id != ingest_job.id - ] - - assert len(replacement_jobs) >= 1 - replacement_job = replacement_jobs[0] - - replacement_job.wait(IngestionJobStatus.RUNNING) - assert replacement_job.status == IngestionJobStatus.RUNNING - - # stop ingestion ingest_job - jobcontroller_client.stop_ingest_job(replacement_job) - replacement_job.wait(IngestionJobStatus.ABORTED) - assert replacement_job.status == IngestionJobStatus.ABORTED - - -@pytest.fixture(scope="module") -def large_volume_dataframe(): - ROW_COUNT = 
100000 - offset = random.randint(1000000, 10000000) # ensure a unique key space - customer_data = pd.DataFrame( - { - "datetime": [ - datetime.utcnow().replace(tzinfo=pytz.utc) for _ in range(ROW_COUNT) - ], - "customer_id": [offset + inc for inc in range(ROW_COUNT)], - "daily_transactions_large": [np.random.rand() for _ in range(ROW_COUNT)], - "total_transactions_large": [256 for _ in range(ROW_COUNT)], - } - ) - return customer_data - - -@pytest.mark.timeout(45) -@pytest.mark.run(order=40) -def test_large_volume_register_feature_set_success(client): - cust_trans_fs_expected = FeatureSet.from_yaml( - f"{DIR_PATH}/large_volume/cust_trans_large_fs.yaml" - ) - - # Register feature set - client.apply(cust_trans_fs_expected) - - # Feast Core needs some time to fully commit the FeatureSet applied - # when there is no existing job yet for the Featureset - time.sleep(10) - cust_trans_fs_actual = client.get_feature_set(name="customer_transactions_large") - - assert cust_trans_fs_actual == cust_trans_fs_expected - - if cust_trans_fs_actual is None: - raise Exception( - "Client cannot retrieve 'customer_transactions' FeatureSet " - "after registration. Either Feast Core does not save the " - "FeatureSet correctly or the client needs to wait longer for FeatureSet " - "to be committed." - ) - - -@pytest.mark.timeout(300) -@pytest.mark.run(order=41) -def test_large_volume_ingest_success(client, large_volume_dataframe): - # Get large volume feature set - cust_trans_fs = client.get_feature_set(name="customer_transactions_large") - - # Ingest customer transaction data - client.ingest(cust_trans_fs, large_volume_dataframe) - - -@pytest.mark.timeout(90) -@pytest.mark.run(order=42) -def test_large_volume_retrieve_online_success(client, large_volume_dataframe): - # Poll serving for feature values until the correct values are returned - feature_refs = [ - "daily_transactions_large", - "total_transactions_large", - ] - while True: - response = client.get_online_features( - entity_rows=[ - { - "customer_id": Value( - int64_val=large_volume_dataframe.iloc[0]["customer_id"] - ) - } - ], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - is_ok = all( - [ - check_online_response(ref, large_volume_dataframe, response) - for ref in feature_refs - ] - ) - return None, is_ok - - -@pytest.fixture(scope="module") -def all_types_parquet_file(): - COUNT = 20000 - - df = pd.DataFrame( - { - "datetime": [datetime.utcnow() for _ in range(COUNT)], - "customer_id": [np.int32(random.randint(0, 10000)) for _ in range(COUNT)], - "int32_feature_parquet": [ - np.int32(random.randint(0, 10000)) for _ in range(COUNT) - ], - "int64_feature_parquet": [ - np.int64(random.randint(0, 10000)) for _ in range(COUNT) - ], - "float_feature_parquet": [np.float(random.random()) for _ in range(COUNT)], - "double_feature_parquet": [ - np.float64(random.random()) for _ in range(COUNT) - ], - "string_feature_parquet": [ - "one" + str(random.random()) for _ in range(COUNT) - ], - "bytes_feature_parquet": [b"one" for _ in range(COUNT)], - "int32_list_feature_parquet": [ - np.array([1, 2, 3, random.randint(0, 10000)], dtype=np.int32) - for _ in range(COUNT) - ], - "int64_list_feature_parquet": [ - np.array([1, random.randint(0, 10000), 3, 4], dtype=np.int64) - for _ in range(COUNT) - ], - "float_list_feature_parquet": [ - np.array([1.1, 1.2, 1.3, random.random()], dtype=np.float32) - for _ in range(COUNT) - ], - "double_list_feature_parquet": [ - np.array([1.1, 1.2, 1.3, random.random()], dtype=np.float64) - for _ in range(COUNT) - ], - 
"string_list_feature_parquet": [ - np.array(["one", "two" + str(random.random()), "three"]) - for _ in range(COUNT) - ], - "bytes_list_feature_parquet": [ - np.array([b"one", b"two", b"three"]) for _ in range(COUNT) - ], - "bool_list_feature_parquet": [[True, False, True] for _ in range(COUNT)], - } - ) - - file_path = os.path.join(tempfile.mkdtemp(), "all_types.parquet") - df.to_parquet(file_path, allow_truncated_timestamps=True) - return file_path - - -@pytest.mark.timeout(300) -@pytest.mark.run(order=50) -def test_all_types_parquet_register_feature_set_success(client): - # Load feature set from file - all_types_parquet_expected = FeatureSet.from_yaml( - f"{DIR_PATH}/all_types_parquet/all_types_parquet.yaml" - ) - - # Register feature set - client.apply(all_types_parquet_expected) - - # Feast Core needs some time to fully commit the FeatureSet applied - # when there is no existing job yet for the Featureset - time.sleep(30) - - all_types_parquet_actual = client.get_feature_set(name="all_types_parquet") - - assert all_types_parquet_actual == all_types_parquet_expected - - if all_types_parquet_actual is None: - raise Exception( - "Client cannot retrieve 'customer_transactions' FeatureSet " - "after registration. Either Feast Core does not save the " - "FeatureSet correctly or the client needs to wait longer for FeatureSet " - "to be committed." - ) - - -@pytest.mark.timeout(600) -@pytest.mark.run(order=51) -def test_all_types_infer_register_ingest_file_success(client, all_types_parquet_file): - # Get feature set - all_types_fs = client.get_feature_set(name="all_types_parquet") - - # Ingest user embedding data - client.ingest(feature_set=all_types_fs, source=all_types_parquet_file) - - -@pytest.mark.timeout(200) -@pytest.mark.run(order=60) -def test_list_entities_and_features(client): - customer_entity = Entity("customer_id", ValueType.INT64) - driver_entity = Entity("driver_id", ValueType.INT64) - - customer_feature_rating = Feature( - name="rating", dtype=ValueType.FLOAT, labels={"key1": "val1"} - ) - customer_feature_cost = Feature(name="cost", dtype=ValueType.FLOAT) - driver_feature_rating = Feature(name="rating", dtype=ValueType.FLOAT) - driver_feature_cost = Feature( - name="cost", dtype=ValueType.FLOAT, labels={"key1": "val1"} - ) - - filter_by_project_entity_labels_expected = dict( - [("customer:rating", customer_feature_rating)] - ) - - filter_by_project_entity_expected = dict( - [("driver:cost", driver_feature_cost), ("driver:rating", driver_feature_rating)] - ) - - filter_by_project_labels_expected = dict( - [ - ("customer:rating", customer_feature_rating), - ("driver:cost", driver_feature_cost), - ] - ) - - customer_fs = FeatureSet( - "customer", - features=[customer_feature_rating, customer_feature_cost], - entities=[customer_entity], - max_age=Duration(seconds=100), - ) - - driver_fs = FeatureSet( - "driver", - features=[driver_feature_rating, driver_feature_cost], - entities=[driver_entity], - max_age=Duration(seconds=100), - ) - - client.set_project(PROJECT_NAME) - client.apply(customer_fs) - client.apply(driver_fs) - - # Test for listing of features - # Case 1: Filter by: project, entities and labels - filter_by_project_entity_labels_actual = client.list_features_by_ref( - project=PROJECT_NAME, entities=["customer_id"], labels={"key1": "val1"} - ) - - # Case 2: Filter by: project, entities - filter_by_project_entity_actual = client.list_features_by_ref( - project=PROJECT_NAME, entities=["driver_id"] - ) - - # Case 3: Filter by: project, labels - 
filter_by_project_labels_actual = client.list_features_by_ref( - project=PROJECT_NAME, labels={"key1": "val1"} - ) - - assert set(filter_by_project_entity_labels_expected) == set( - filter_by_project_entity_labels_actual - ) - assert set(filter_by_project_entity_expected) == set( - filter_by_project_entity_actual - ) - assert set(filter_by_project_labels_expected) == set( - filter_by_project_labels_actual - ) - - -@pytest.mark.timeout(500) -@pytest.mark.run(order=70) -def test_sources_deduplicate_ingest_jobs(client, jobcontroller_client, kafka_brokers): - shared_source = KafkaSource(kafka_brokers, "dup_shared") - dup_source_fs_1 = FeatureSet( - name="duplicate_source_fs_1", - features=[Feature("fs1", ValueType.FLOAT), Feature("fs2", ValueType.FLOAT)], - entities=[Entity("e2", ValueType.INT64)], - source=shared_source, - ) - dup_source_fs_2 = copy(dup_source_fs_1) - dup_source_fs_2.name = "duplicate_source_fs_2" - - def is_same_jobs(): - fs_1_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef( - name=dup_source_fs_1.name, project=dup_source_fs_1.project - ) - ) - fs_2_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef( - name=dup_source_fs_2.name, project=dup_source_fs_2.project - ) - ) - same = True - if not (len(fs_1_jobs) > 0 and len(fs_1_jobs) == len(fs_2_jobs)): - same = False - for fs_1_job in fs_1_jobs: - for fs_2_job in fs_2_jobs: - if ( - not fs_1_job.source.to_proto() == fs_2_job.source.to_proto() - and fs_1_job.source.to_proto() == shared_source.to_proto() - ): - same = False - if fs_1_job.id != fs_2_job.id: - same = False - return same - - def is_different_jobs(): - fs_1_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef( - name=dup_source_fs_1.name, project=dup_source_fs_1.project - ) - ) - fs_2_jobs = jobcontroller_client.list_ingest_jobs( - feature_set_ref=FeatureSetRef( - name=dup_source_fs_2.name, project=dup_source_fs_2.project - ) - ) - different = True - if not (len(fs_1_jobs) > 0 and len(fs_2_jobs) > 0): - different = False - for fs_1_job in fs_1_jobs: - if fs_1_job.source.to_proto() == alt_source.to_proto(): - different = False - for fs_2_job in fs_2_jobs: - if fs_2_job.source.to_proto() == shared_source.to_proto(): - different = False - for fs_1_job in fs_1_jobs: - for fs_2_job in fs_2_jobs: - if fs_1_job.id == fs_2_job.id: - different = False - return different - - # register multiple feature sets with the same source - # only one ingest job should spawned due to test ingest job deduplication - client.apply(dup_source_fs_1) - client.apply(dup_source_fs_2) - - while not is_same_jobs(): - time.sleep(1) - - # update feature sets with different sources, should have different jobs - alt_source = KafkaSource(kafka_brokers, "alt_source") - dup_source_fs_2.source = alt_source - client.apply(dup_source_fs_2) - - while not is_different_jobs(): - time.sleep(1) - - # update feature sets with same source again, should have the same job - dup_source_fs_2.source = shared_source - client.apply(dup_source_fs_2) - - while not is_same_jobs(): - time.sleep(1) - - -@pytest.mark.run(order=30) -def test_sink_writes_only_recent_rows(client): - client.set_project("default") - - feature_refs = ["driver:rating", "driver:cost"] - - later_df = basic_dataframe( - entities=["driver_id"], - features=["rating", "cost"], - ingest_time=datetime.utcnow(), - n_size=5, - ) - - earlier_df = basic_dataframe( - entities=["driver_id"], - features=["rating", "cost"], - ingest_time=datetime.utcnow() - timedelta(minutes=5), - 
n_size=5, - ) - - def try_get_features(): - response = client.get_online_features( - entity_rows=[{"driver_id": Value(int64_val=later_df.iloc[0]["driver_id"])}], - feature_refs=feature_refs, - ) # type: GetOnlineFeaturesResponse - is_ok = all( - [check_online_response(ref, later_df, response) for ref in feature_refs] - ) - return response, is_ok - - # test compaction within batch - client.ingest("driver", pd.concat([earlier_df, later_df])) - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - # test read before write - client.ingest("driver", earlier_df) - time.sleep(10) - wait_retry_backoff( - retry_fn=try_get_features, - timeout_secs=90, - timeout_msg="Timed out trying to get online feature values", - ) - - -# TODO: rewrite these using python SDK once the labels are implemented there -class TestsBasedOnGrpc: - GRPC_CONNECTION_TIMEOUT = 3 - LABEL_KEY = "my" - LABEL_VALUE = "label" - - @pytest.fixture(scope="module") - def core_service_stub(self, core_url): - if core_url.endswith(":443"): - core_channel = grpc.secure_channel(core_url, grpc.ssl_channel_credentials()) - else: - core_channel = grpc.insecure_channel(core_url) - - try: - grpc.channel_ready_future(core_channel).result( - timeout=self.GRPC_CONNECTION_TIMEOUT - ) - except grpc.FutureTimeoutError: - raise ConnectionError( - f"Connection timed out while attempting to connect to Feast " - f"Core gRPC server {core_url} " - ) - core_service_stub = CoreServiceStub(core_channel) - return core_service_stub - - @pytest.fixture(scope="module") - def auth_meta_data(self, enable_auth): - if not enable_auth: - return None - else: - metadata = {CONFIG_AUTH_PROVIDER: AUTH_PROVIDER} - metadata_plugin = get_auth_metadata_plugin(config=Config(metadata)) - return metadata_plugin.get_signed_meta() - - def apply_feature_set(self, core_service_stub, feature_set_proto, auth_meta_data): - try: - apply_fs_response = core_service_stub.ApplyFeatureSet( - CoreService_pb2.ApplyFeatureSetRequest(feature_set=feature_set_proto), - timeout=self.GRPC_CONNECTION_TIMEOUT, - metadata=auth_meta_data, - ) # type: ApplyFeatureSetResponse - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - return apply_fs_response.feature_set - - def get_feature_set(self, core_service_stub, name, project, auth_meta_data): - try: - get_feature_set_response = core_service_stub.GetFeatureSet( - CoreService_pb2.GetFeatureSetRequest( - project=project, name=name.strip(), - ), - metadata=auth_meta_data, - ) # type: GetFeatureSetResponse - except grpc.RpcError as e: - raise grpc.RpcError(e.details()) - return get_feature_set_response.feature_set - - @pytest.mark.timeout(45) - @pytest.mark.run(order=51) - def test_register_feature_set_with_labels(self, core_service_stub, auth_meta_data): - feature_set_name = "test_feature_set_labels" - feature_set_proto = FeatureSet( - name=feature_set_name, - project=PROJECT_NAME, - labels={self.LABEL_KEY: self.LABEL_VALUE}, - ).to_proto() - self.apply_feature_set(core_service_stub, feature_set_proto, auth_meta_data) - - retrieved_feature_set = self.get_feature_set( - core_service_stub, feature_set_name, PROJECT_NAME, auth_meta_data - ) - - assert self.LABEL_KEY in retrieved_feature_set.spec.labels - assert retrieved_feature_set.spec.labels[self.LABEL_KEY] == self.LABEL_VALUE - - @pytest.mark.timeout(45) - @pytest.mark.run(order=52) - def test_register_feature_with_labels(self, core_service_stub, auth_meta_data): - feature_set_name = "test_feature_labels" - 
feature_set_proto = FeatureSet( - name=feature_set_name, - project=PROJECT_NAME, - features=[ - Feature( - name="rating", - dtype=ValueType.INT64, - labels={self.LABEL_KEY: self.LABEL_VALUE}, - ) - ], - ).to_proto() - self.apply_feature_set(core_service_stub, feature_set_proto, auth_meta_data) - - retrieved_feature_set = self.get_feature_set( - core_service_stub, feature_set_name, PROJECT_NAME, auth_meta_data - ) - retrieved_feature = retrieved_feature_set.spec.features[0] - - assert self.LABEL_KEY in retrieved_feature.labels - assert retrieved_feature.labels[self.LABEL_KEY] == self.LABEL_VALUE diff --git a/tests/e2e/redis/basic/cust_trans_fs.yaml b/tests/e2e/redis/basic/cust_trans_fs.yaml deleted file mode 100644 index 941037670d..0000000000 --- a/tests/e2e/redis/basic/cust_trans_fs.yaml +++ /dev/null @@ -1,14 +0,0 @@ -kind: feature_set -spec: - name: customer_transactions - entities: - - name: customer_id - valueType: INT64 - features: - - name: daily_transactions - valueType: FLOAT - - name: total_transactions - valueType: FLOAT - - name: null_values - valueType: FLOAT - maxAge: 3600s diff --git a/tests/e2e/redis/basic/data.csv b/tests/e2e/redis/basic/data.csv deleted file mode 100644 index d2994d253a..0000000000 --- a/tests/e2e/redis/basic/data.csv +++ /dev/null @@ -1,3 +0,0 @@ -datetime,customer_id,daily_transactions,total_transactions -1570366527,1001,1.3,500 -1570366536,1002,1.4,600 \ No newline at end of file diff --git a/tests/e2e/redis/basic/driver_fs.yaml b/tests/e2e/redis/basic/driver_fs.yaml deleted file mode 100644 index f25ca95678..0000000000 --- a/tests/e2e/redis/basic/driver_fs.yaml +++ /dev/null @@ -1,12 +0,0 @@ -kind: feature_set -spec: - name: driver - entities: - - name: driver_id - valueType: INT64 - features: - - name: rating - valueType: FLOAT - - name: cost - valueType: FLOAT - maxAge: 3600s diff --git a/tests/e2e/redis/large_volume/cust_trans_large_fs.yaml b/tests/e2e/redis/large_volume/cust_trans_large_fs.yaml deleted file mode 100644 index 7f36151392..0000000000 --- a/tests/e2e/redis/large_volume/cust_trans_large_fs.yaml +++ /dev/null @@ -1,12 +0,0 @@ -kind: feature_set -spec: - name: customer_transactions_large - entities: - - name: customer_id - valueType: INT64 - features: - - name: daily_transactions_large - valueType: FLOAT - - name: total_transactions_large - valueType: FLOAT - maxAge: 3600s diff --git a/tests/e2e/redis/parallel-ingest-redis-serving.py b/tests/e2e/redis/parallel-ingest-redis-serving.py new file mode 100644 index 0000000000..fb30746a0f --- /dev/null +++ b/tests/e2e/redis/parallel-ingest-redis-serving.py @@ -0,0 +1,192 @@ +import os +import uuid +from datetime import datetime + +import pytest +from google.protobuf.duration_pb2 import Duration + +from feast.client import Client +from feast.data_source import DataSource, FileOptions, SourceType +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_table import FeatureTable +from feast.value_type import ValueType + +DIR_PATH = os.path.dirname(os.path.realpath(__file__)) +PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] + + +@pytest.fixture(scope="module") +def client(pytestconfig): + core_url = pytestconfig.getoption("core_url") + serving_url = pytestconfig.getoption("serving_url") + + client = Client(core_url=core_url, serving_url=serving_url,) + + client.set_project(PROJECT_NAME) + + return client + + +@pytest.mark.incremental +class TestBasicIngestionRetrieval: + def setup_class(cls): + prefix = "basic_ingestion" + suffix = 
str(int(datetime.now().timestamp())) + cls.customer_ft_name = f"{prefix}_customer_{suffix}" + cls.driver_ft_name = f"{prefix}_driver_{suffix}" + + cls.customer_entity = Entity( + name="customer_id", + description="Customer entity for rides", + value_type=ValueType.STRING, + labels={"team": "customer_service", "common_key": "common_val"}, + ) + + cls.driver_entity = Entity( + name="driver_id", + description="Driver entity for car rides", + value_type=ValueType.STRING, + labels={"team": "matchmaking", "common_key": "common_val"}, + ) + + cls.basic_ft_spec = FeatureTable.from_yaml( + f"{DIR_PATH}/specifications/dev_ft.yaml" + ) + + def test_discovery(self, client): + + # ApplyEntity + client.apply_entity(self.customer_entity) + client.apply_entity(self.driver_entity) + + # GetEntity Check + assert client.get_entity(name="customer_id") == self.customer_entity + assert client.get_entity(name="driver_id") == self.driver_entity + + # ListEntities Check + common_filtering_labels = {"common_key": "common_val"} + matchmaking_filtering_labels = {"team": "matchmaking"} + + actual_common_entities = client.list_entities(labels=common_filtering_labels) + actual_matchmaking_entities = client.list_entities( + labels=matchmaking_filtering_labels + ) + + assert len(actual_common_entities) == 2 + assert len(actual_matchmaking_entities) == 1 + + # ApplyFeatureTable + client.apply_feature_table(self.basic_ft_spec, PROJECT_NAME) + + # GetFeatureTable Check + actual_get_feature_table = client.get_feature_table(name="dev_featuretable") + assert actual_get_feature_table.name == self.basic_ft_spec.name + assert actual_get_feature_table.entities == self.basic_ft_spec.entities + assert actual_get_feature_table.features == self.basic_ft_spec.features + + # ListFeatureTables Check + actual_list_feature_table = client.list_feature_tables()[0] + assert actual_list_feature_table.name == self.basic_ft_spec.name + assert actual_list_feature_table.entities == self.basic_ft_spec.entities + assert actual_list_feature_table.features == self.basic_ft_spec.features + + def test_basic_retrieval(self, client): + # TODO: Add ingest and retrieval check + pass + + +@pytest.mark.incremental +class TestAllTypesIngestionRetrieval: + def setup_class(cls): + prefix = "alltypes_ingestion" + suffix = str(int(datetime.now().timestamp())) + batch_source = DataSource( + type=SourceType(1).name, + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + options=FileOptions(file_format="parquet", file_url="file://feast/*"), + timestamp_column="ts_col", + date_partition_column="date_partition_col", + ) + + cls.alltypes_entity = Entity( + name="alltypes_id", + description="Driver entity for car rides", + value_type=ValueType.STRING, + labels={"cat": "alltypes"}, + ) + + cls.alltypes_ft_name = f"{prefix}_alltypes_{suffix}" + cls.alltypes_ft_spec = FeatureTable( + name="alltypes", + entities=["alltypes_id"], + features=[ + Feature(name="float_feature", dtype=ValueType.FLOAT).to_proto(), + Feature(name="int64_feature", dtype=ValueType.INT64).to_proto(), + Feature(name="int32_feature", dtype=ValueType.INT32).to_proto(), + Feature(name="string_feature", dtype=ValueType.STRING).to_proto(), + Feature(name="bytes_feature", dtype=ValueType.BYTES).to_proto(), + Feature(name="bool_feature", dtype=ValueType.BOOL).to_proto(), + Feature(name="double_feature", dtype=ValueType.DOUBLE).to_proto(), + Feature( + name="double_list_feature", dtype=ValueType.DOUBLE_LIST + ).to_proto(), + Feature( + name="float_list_feature", 
dtype=ValueType.FLOAT_LIST + ).to_proto(), + Feature( + name="int64_list_feature", dtype=ValueType.INT64_LIST + ).to_proto(), + Feature( + name="int32_list_feature", dtype=ValueType.INT32_LIST + ).to_proto(), + Feature( + name="string_list_feature", dtype=ValueType.STRING_LIST + ).to_proto(), + Feature( + name="bytes_list_feature", dtype=ValueType.BYTES_LIST + ).to_proto(), + Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST).to_proto(), + ], + max_age=Duration(seconds=3600), + batch_source=batch_source.to_proto(), + labels={"cat": "alltypes"}, + ) + + def test_discovery(self, client): + # ApplyEntity + client.apply_entity(self.alltypes_entity) + + # GetEntity Check + assert client.get_entity(name="alltypes_id") == self.alltypes_entity + + # ListEntities Check + alltypes_filtering_labels = {"cat": "alltypes"} + actual_alltypes_entities = client.list_entities( + labels=alltypes_filtering_labels + ) + + assert len(client.list_entities()) == 1 + assert len(actual_alltypes_entities) == 1 + + # ApplyFeatureTable + client.apply_feature_table(self.alltypes_ft_spec, PROJECT_NAME) + + # GetFeatureTable Check + actual_get_feature_table = client.get_feature_table(name="alltypes") + assert actual_get_feature_table.name == self.alltypes_ft_spec.name + assert actual_get_feature_table.entities == self.alltypes_ft_spec.entities + assert actual_get_feature_table.features == self.alltypes_ft_spec.features + + # ListFeatureTables Check + actual_list_feature_table = client.list_feature_tables()[0] + assert actual_list_feature_table.name == self.alltypes_ft_spec.name + assert actual_list_feature_table.entities == self.alltypes_ft_spec.entities + assert actual_list_feature_table.features == self.alltypes_ft_spec.features + + def test_alltypes_retrieval(self, client): + # TODO: Add ingest and retrieval check + pass diff --git a/tests/e2e/redis/specifications/dev_ft.yaml b/tests/e2e/redis/specifications/dev_ft.yaml new file mode 100644 index 0000000000..59072b73b9 --- /dev/null +++ b/tests/e2e/redis/specifications/dev_ft.yaml @@ -0,0 +1,38 @@ +spec: + name: dev_featuretable + entities: + - driver_id + - customer_id + features: + - name: dev_feature_float + valueType: FLOAT + - name: dev_feature_string + valueType: STRING + labels: + feature_key1: feature_val1 + batchSource: + type: BATCH_FILE + fieldMapping: + dev_entity: dev_entity_field + dev_feature_float: dev_feature_float_field + dev_feature_string: dev_feature_string_field + timestampColumn: datetime_col + datePartitionColumn: datetime + file_options: + file_format: PARQUET + file_url: gs://example/feast/* + streamSource: + type: STREAM_KAFKA + field_mapping: + dev_entity: dev_entity_field + dev_feature_float: dev_feature_float_field + dev_feature_string: dev_feature_string_field + timestampColumn: datetime_col + kafka_options: + bootstrap_servers: "localhost:9094" + topic: test_topic + class_path: random/path/to/test + maxAge: 14400s + labels: + key1: val1 + key2: val2 \ No newline at end of file diff --git a/tests/e2e/requirements.txt b/tests/e2e/requirements.txt index 94c63ca120..9c6dd06ac1 100644 --- a/tests/e2e/requirements.txt +++ b/tests/e2e/requirements.txt @@ -7,6 +7,7 @@ pytest-benchmark==3.2.2 pytest-mock==1.10.4 pytest-timeout==1.3.3 pytest-ordering==0.6.* +pytest-xdist==2.1.0 tensorflow-data-validation==0.21.2 deepdiff==4.3.2 tensorflow==2.1.0 From 86f27e3961fa4864b8e58c10960e1819417dae5e Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 12:12:27 +0800 Subject: [PATCH 05/34] Remove e2e bq tests Signed-off-by: Terence --- 
.../scripts/test-end-to-end-batch-dataflow.sh | 307 ------- infra/scripts/test-end-to-end-batch.sh | 153 ---- tests/e2e/bq/bq-batch-retrieval.py | 819 ------------------ tests/e2e/bq/feature-stats.py | 256 ------ tests/e2e/bq/testutils.py | 55 -- 5 files changed, 1590 deletions(-) delete mode 100755 infra/scripts/test-end-to-end-batch-dataflow.sh delete mode 100755 infra/scripts/test-end-to-end-batch.sh delete mode 100644 tests/e2e/bq/bq-batch-retrieval.py delete mode 100644 tests/e2e/bq/feature-stats.py delete mode 100644 tests/e2e/bq/testutils.py diff --git a/infra/scripts/test-end-to-end-batch-dataflow.sh b/infra/scripts/test-end-to-end-batch-dataflow.sh deleted file mode 100755 index 363ba7dc47..0000000000 --- a/infra/scripts/test-end-to-end-batch-dataflow.sh +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env bash -echo "Preparing environment variables..." - -set -e -set -o pipefail - -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account-df/service-account-df.json" -test -z ${GCLOUD_PROJECT} && GCLOUD_PROJECT="kf-feast" -test -z ${GCLOUD_REGION} && GCLOUD_REGION="us-central1" -test -z ${GCLOUD_NETWORK} && GCLOUD_NETWORK="default" -test -z ${GCLOUD_SUBNET} && GCLOUD_SUBNET="default" -test -z ${TEMP_BUCKET} && TEMP_BUCKET="kf-feast-dataflow-temp" -test -z ${K8_CLUSTER_NAME} && K8_CLUSTER_NAME="feast-e2e-dataflow" -test -z ${HELM_RELEASE_NAME} && HELM_RELEASE_NAME="pr-$PULL_NUMBER" -test -z ${HELM_COMMON_NAME} && HELM_COMMON_NAME="deps" -test -z ${DATASET_NAME} && DATASET_NAME=feast_e2e_$(date +%s) -test -z ${SPECS_TOPIC} && SPECS_TOPIC=feast-specs-$(date +%s) -test -z ${FEATURES_TOPIC} && FEATURES_TOPIC=feast-$(date +%s) - - -feast_kafka_1_ip_name="feast-kafka-1" -feast_kafka_2_ip_name="feast-kafka-2" -feast_kafka_3_ip_name="feast-kafka-3" -feast_redis_ip_name="feast-redis" -feast_statsd_ip_name="feast-statsd" - -echo " -This script will run end-to-end tests for Feast Core and Batch Serving using Dataflow Runner. - -1. Setup K8s cluster (optional, if it was not created before) -2. Reuse existing IP addresses or generate new ones for stateful services -3. Install stateful services (kafka, redis, postgres, etc) (optional) -4. Build core & serving docker images (optional) -5. Create temporary BQ table for Feast Serving. -6. Rollout target images to cluster via helm in dedicated namespace (pr-{number}) -7. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from - tests/e2e via pytest. -8. Tear down feast services, keep stateful services. 
-" - -ORIGINAL_DIR=$(pwd) -echo $ORIGINAL_DIR - -echo "Environment:" -printenv - -export GOOGLE_APPLICATION_CREDENTIALS -gcloud auth activate-service-account --key-file ${GOOGLE_APPLICATION_CREDENTIALS} -gcloud -q auth configure-docker - -gcloud config set project ${GCLOUD_PROJECT} -gcloud config set compute/region ${GCLOUD_REGION} -gcloud config list - -apt-get -qq update -apt-get -y install wget build-essential gettext-base curl - -curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 -chmod 700 $ORIGINAL_DIR/get_helm.sh -$ORIGINAL_DIR/get_helm.sh - - -function getPublicAddresses() { - existing_addresses=$(gcloud compute addresses list --filter="region:($GCLOUD_REGION) name:kafka" --format "list(name)") - if [[ -z "$existing_addresses" ]]; then - echo " -============================================================ -Reserving IP addresses for Feast dependencies -============================================================ -" - - gcloud compute addresses create \ - $feast_kafka_1_ip_name $feast_kafka_2_ip_name $feast_kafka_3_ip_name $feast_redis_ip_name $feast_statsd_ip_name \ - --region ${GCLOUD_REGION} --subnet ${GCLOUD_SUBNET} - fi - - - export feast_kafka_1_ip=$(gcloud compute addresses describe $feast_kafka_1_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_kafka_2_ip=$(gcloud compute addresses describe $feast_kafka_2_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_kafka_3_ip=$(gcloud compute addresses describe $feast_kafka_3_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_redis_ip=$(gcloud compute addresses describe $feast_redis_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_statsd_ip=$(gcloud compute addresses describe $feast_statsd_ip_name --region=${GCLOUD_REGION} --format "value(address)") -} - -function createKubeCluster() { - echo " -============================================================ -Creating GKE nodepool for Feast e2e test with DataflowRunner -============================================================ -" - gcloud container clusters create ${K8_CLUSTER_NAME} --region ${GCLOUD_REGION} \ - --enable-cloud-logging \ - --enable-cloud-monitoring \ - --network ${GCLOUD_NETWORK} \ - --subnetwork ${GCLOUD_SUBNET} \ - --scopes https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,\ -https://www.googleapis.com/auth/monitoring,https://www.googleapis.com/auth/service.management.readonly,\ -https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/trace.append,\ -https://www.googleapis.com/auth/bigquery \ - --machine-type n1-standard-2 - - echo " -============================================================ -Create feast-postgres-database Secret in GKE nodepool -============================================================ -" - kubectl create secret generic feast-postgresql --from-literal=postgresql-password=password - - echo " -============================================================ -Create feast-gcp-service-account Secret in GKE nodepool -============================================================ -" - cd $ORIGINAL_DIR/infra/scripts - kubectl create secret generic feast-gcp-service-account --from-file=credentials.json=${GOOGLE_APPLICATION_CREDENTIALS} -} - -function installDependencies() { - echo " -============================================================ -Helm install common parts (kafka, redis, etc) 
-============================================================ -" - cd $ORIGINAL_DIR/infra/charts/feast - - helm install --replace --wait --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ - --set "feast-core.enabled=false" \ - --set "feast-online-serving.enabled=false" \ - --set "feast-batch-serving.enabled=false" \ - --set "postgresql.enabled=false" - "$HELM_COMMON_NAME" . - -} - -function buildAndPushImage() -{ - echo docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR - docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR - docker push $1:$2 -} - -function buildTarget() { - buildAndPushImage "gcr.io/kf-feast/feast-core" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/core/Dockerfile" - buildAndPushImage "gcr.io/kf-feast/feast-serving" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/serving/Dockerfile" -} - -function installTarget() { - echo " -============================================================ -Helm install feast -============================================================ -" - cd $ORIGINAL_DIR/infra/charts/feast - - helm install --wait --timeout 300s --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ - --set "kafka.enabled=false" \ - --set "redis.enabled=false" \ - --set "prometheus-statsd-exporter.enabled=false" \ - --set "prometheus.enabled=false" \ - "$HELM_RELEASE_NAME" . - -} - -function clean() { - echo " - ============================================================ - Cleaning up - ============================================================ - " - cd $ORIGINAL_DIR/tests/e2e - - # Remove BQ Dataset - bq rm -r -f ${GCLOUD_PROJECT}:${DATASET_NAME} - - # Uninstall helm release before clearing PVCs - helm uninstall ${HELM_RELEASE_NAME} - - kubectl delete pvc data-${HELM_RELEASE_NAME}-postgresql-0 - - # Stop Dataflow jobs from retrieved Dataflow job ids in ingesting_jobs.txt - if [ -f ingesting_jobs.txt ]; then - while read line - do - echo $line - gcloud dataflow jobs cancel $line --region=${GCLOUD_REGION} - done < ingesting_jobs.txt - fi -} - -# 1. -existing_cluster=$(gcloud container clusters list --format "list(name)" --filter "name:$K8_CLUSTER_NAME") -if [[ -z $existing_cluster ]]; then - createKubeCluster "$@" -else - gcloud container clusters get-credentials $K8_CLUSTER_NAME --region $GCLOUD_REGION --project $GCLOUD_PROJECT -fi - -# 2. 
-getPublicAddresses "$@" - -echo " -============================================================ -Export required environment variables -============================================================ -" - -export TEMP_BUCKET=$TEMP_BUCKET/$HELM_RELEASE_NAME/$(date +%s) -export DATASET_NAME=$DATASET_NAME -export GCLOUD_PROJECT=$GCLOUD_PROJECT -export GCLOUD_NETWORK=$GCLOUD_NETWORK -export GCLOUD_SUBNET=$GCLOUD_SUBNET -export GCLOUD_REGION=$GCLOUD_REGION -export HELM_COMMON_NAME=$HELM_COMMON_NAME -export IMAGE_TAG=$PULL_PULL_SHA -export SPECS_TOPIC=$SPECS_TOPIC -export FEATURES_TOPIC=$FEATURES_TOPIC - -export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) -export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts -source ${SCRIPTS_DIR}/setup-common-functions.sh - -wait_for_docker_image gcr.io/kf-feast/feast-core:"${IMAGE_TAG}" -wait_for_docker_image gcr.io/kf-feast/feast-serving:"${IMAGE_TAG}" - -envsubst $'$TEMP_BUCKET $DATASET_NAME $GCLOUD_PROJECT $GCLOUD_NETWORK $SPECS_TOPIC $FEATURES_TOPIC \ - $GCLOUD_SUBNET $GCLOUD_REGION $IMAGE_TAG $HELM_COMMON_NAME $feast_kafka_1_ip - $feast_kafka_2_ip $feast_kafka_3_ip $feast_redis_ip $feast_statsd_ip' < $ORIGINAL_DIR/infra/scripts/test-templates/values-end-to-end-batch-dataflow.yaml > $ORIGINAL_DIR/infra/charts/feast/values-end-to-end-batch-dataflow-updated.yaml - - -# 3. -existing_deps=$(helm list --filter deps -q) -if [[ -z $existing_deps ]]; then - installDependencies "$@" -fi - -# 4. -# buildTarget "$@" - -# 5. -echo " -============================================================ -Creating temp BQ table for Feast Serving -============================================================ -" - -bq --location=US --project_id=${GCLOUD_PROJECT} mk \ - --dataset \ - --default_table_expiration 86400 \ - ${GCLOUD_PROJECT}:${DATASET_NAME} - - -# 6. - -set +e -installTarget "$@" - -# 7. -echo " -============================================================ -Installing Python 3.7 with Miniconda and Feast SDK -============================================================ -" -cd $ORIGINAL_DIR -# Install Python 3.7 with Miniconda -wget -q https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh \ - -O /tmp/miniconda.sh -bash /tmp/miniconda.sh -b -p /root/miniconda -f -/root/miniconda/bin/conda init -source ~/.bashrc - -# Install Feast Python SDK and test requirements -cd $ORIGINAL_DIR -make compile-protos-python -pip install -qe sdk/python -pip install -qr tests/e2e/requirements.txt - -echo " -============================================================ -Running end-to-end tests with pytest at 'tests/e2e' -============================================================ -" -# Default artifact location setting in Prow jobs -LOGS_ARTIFACT_PATH=/logs/artifacts - -cd $ORIGINAL_DIR/tests/e2e - -core_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-core) -serving_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-batch-serving) -jobcontroller_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-jobcontroller) - -set +e -pytest -s -v bq/bq-batch-retrieval.py -m dataflow_runner --core_url "$core_ip:6565" --serving_url "$serving_ip:6566" \ - --jobcontroller_url "$jobcontroller_ip:6570" --gcs_path "gs://${TEMP_BUCKET}" --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml -TEST_EXIT_CODE=$? 
- -if [[ ${TEST_EXIT_CODE} != 0 ]]; then - echo "[DEBUG] Printing logs" - ls -ltrh /var/log/feast* - cat /var/log/feast-serving-warehouse.log /var/log/feast-core.log - - echo "[DEBUG] Printing Python packages list" - pip list -else - clean "$@" -fi - -exit ${TEST_EXIT_CODE} diff --git a/infra/scripts/test-end-to-end-batch.sh b/infra/scripts/test-end-to-end-batch.sh deleted file mode 100755 index c741fe7168..0000000000 --- a/infra/scripts/test-end-to-end-batch.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env bash - -set -e -set -o pipefail - -PYTEST_MARK='direct_runner' #default - -print_usage() { - printf "Usage: ./test-end-to-end-batch -m pytest_mark" -} - -while getopts 'm:' flag; do - case "${flag}" in - m) PYTEST_MARK="${OPTARG}" ;; - *) print_usage - exit 1 ;; - esac -done - -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account/service-account.json" -test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" -test -z ${GOOGLE_CLOUD_PROJECT} && GOOGLE_CLOUD_PROJECT="kf-feast" -test -z ${TEMP_BUCKET} && TEMP_BUCKET="feast-templocation-kf-feast" -test -z ${JOBS_STAGING_LOCATION} && JOBS_STAGING_LOCATION="gs://${TEMP_BUCKET}/staging-location/$(date +%s)" - -# Get the current build version using maven (and pom.xml) -export FEAST_BUILD_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) -echo Building version: $FEAST_BUILD_VERSION - -# Get Feast project repository root and scripts directory -export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) -export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts - -echo " -This script will run end-to-end tests for Feast Core and Batch Serving. - -1. Install gcloud SDK -2. Install Redis as the job store for Feast Batch Serving. -4. Install Postgres for persisting Feast metadata. -5. Install Kafka and Zookeeper as the Source in Feast. -6. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from - tests/e2e via pytest. -" - -source ${SCRIPTS_DIR}/setup-common-functions.sh - -install_test_tools -install_gcloud_sdk -install_and_start_local_redis -install_and_start_local_postgres -install_and_start_local_zookeeper_and_kafka - -if [[ ${SKIP_BUILD_JARS} != "true" ]]; then - build_feast_core_and_serving -else - echo "[DEBUG] Skipping building jars" -fi - -DATASET_NAME=feast_$(date +%s) -bq --location=US --project_id=${GOOGLE_CLOUD_PROJECT} mk \ - --dataset \ - --default_table_expiration 86400 \ - ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} - -# Start Feast Core in background -cat < /tmp/jc.warehouse.application.yml -feast: - core-host: localhost - core-port: 6565 - jobs: - polling_interval_milliseconds: 10000 - active_runner: direct - consolidate-jobs-per-source: true - runners: - - name: direct - type: DirectRunner - options: - tempLocation: gs://${TEMP_BUCKET}/tempLocation - -EOF - -cat < /tmp/serving.warehouse.application.yml -feast: - # GRPC service address for Feast Core - # Feast Serving requires connection to Feast Core to retrieve and reload Feast metadata (e.g. FeatureSpecs, Store information) - core-host: localhost - core-grpc-port: 6565 - - # Indicates the active store. Only a single store in the last can be active at one time. 
In the future this key - # will be deprecated in order to allow multiple stores to be served from a single serving instance - active_store: historical - - # List of store configurations - stores: - - name: historical - type: BIGQUERY - config: - project_id: ${GOOGLE_CLOUD_PROJECT} - dataset_id: ${DATASET_NAME} - staging_location: ${JOBS_STAGING_LOCATION} - initial_retry_delay_seconds: 1 - total_timeout_seconds: 21600 - write_triggering_frequency_seconds: 1 - subscriptions: - - name: "*" - project: "*" - version: "*" - - job_store: - redis_host: localhost - redis_port: 6379 - - tracing: - enabled: false - -server: - port: 8081 - -EOF - -cat /tmp/jc.warehouse.application.yml /tmp/serving.warehouse.application.yml - -start_feast_core -start_feast_jobcontroller /tmp/jc.warehouse.application.yml -start_feast_serving /tmp/serving.warehouse.application.yml - -install_python_with_miniconda_and_feast_sdk - -print_banner "Running end-to-end tests with pytest at 'tests/e2e'" -# Default artifact location setting in Prow jobs -LOGS_ARTIFACT_PATH=/logs/artifacts - -ORIGINAL_DIR=$(pwd) -cd tests/e2e - -set +e -pytest bq/* -v -m ${PYTEST_MARK} --gcs_path ${JOBS_STAGING_LOCATION} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml -TEST_EXIT_CODE=$? - -if [[ ${TEST_EXIT_CODE} != 0 ]]; then - echo "[DEBUG] Printing logs" - ls -ltrh /var/log/feast* - cat /var/log/feast-serving-online.log /var/log/feast-core.log /var/log/feast-jobcontroller.log - - echo "[DEBUG] Printing Python packages list" - pip list -else - print_banner "Cleaning up" - - bq rm -r -f ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} -fi - -exit ${TEST_EXIT_CODE} diff --git a/tests/e2e/bq/bq-batch-retrieval.py b/tests/e2e/bq/bq-batch-retrieval.py deleted file mode 100644 index 2d94d2e6cf..0000000000 --- a/tests/e2e/bq/bq-batch-retrieval.py +++ /dev/null @@ -1,819 +0,0 @@ -import math -import os -import random -import time -import uuid -from datetime import datetime, timedelta -from urllib.parse import urlparse - -import numpy as np -import pandas as pd -import pytest -import pytz -import tensorflow_data_validation as tfdv -from google.cloud import bigquery, storage -from google.cloud.storage import Blob -from google.protobuf.duration_pb2 import Duration -from pandavro import to_avro - -from bq.testutils import assert_stats_equal, clear_unsupported_fields -from feast.client import Client -from feast.contrib.job_controller.client import Client as JCClient -from feast.core.CoreService_pb2 import ListStoresRequest -from feast.core.FeatureSet_pb2 import FeatureSetStatus -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_set import FeatureSet -from feast.type_map import ValueType -from feast.wait import wait_retry_backoff - -pd.set_option("display.max_columns", None) - -PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] - - -@pytest.fixture(scope="module") -def core_url(pytestconfig): - return pytestconfig.getoption("core_url") - - -@pytest.fixture(scope="module") -def serving_url(pytestconfig): - return pytestconfig.getoption("serving_url") - - -@pytest.fixture(scope="module") -def jobcontroller_url(pytestconfig): - return pytestconfig.getoption("jobcontroller_url") - - -@pytest.fixture(scope="module") -def allow_dirty(pytestconfig): - return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False - - -@pytest.fixture(scope="module") -def gcs_path(pytestconfig): - return pytestconfig.getoption("gcs_path") - - 
-@pytest.fixture(scope="module") -def client(core_url, serving_url, allow_dirty): - # Get client for core and serving - client = Client(core_url=core_url, serving_url=serving_url) - client.create_project(PROJECT_NAME) - client.set_project(PROJECT_NAME) - - # Ensure Feast core is active, but empty - if not allow_dirty: - feature_sets = client.list_feature_sets() - if len(feature_sets) > 0: - raise Exception( - "Feast cannot have existing feature sets registered. Exiting tests." - ) - - return client - - -def wait_for(fn, timeout: timedelta, sleep=5): - until = datetime.now() + timeout - last_exc = BaseException() - - while datetime.now() <= until: - try: - fn() - except Exception as exc: - last_exc = exc - else: - return - time.sleep(sleep) - - raise last_exc - - -@pytest.mark.first -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=1) -def test_batch_apply_all_featuresets(client): - client.set_project(PROJECT_NAME) - - file_fs1 = FeatureSet( - "file_feature_set", - features=[Feature("feature_value1", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(file_fs1) - - gcs_fs1 = FeatureSet( - "gcs_feature_set", - features=[Feature("feature_value2", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(gcs_fs1) - - proc_time_fs = FeatureSet( - "processing_time", - features=[Feature("feature_value3", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(proc_time_fs) - - add_cols_fs = FeatureSet( - "additional_columns", - features=[Feature("feature_value4", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(add_cols_fs) - - historical_fs = FeatureSet( - "historical", - features=[Feature("feature_value5", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(historical_fs) - - fs1 = FeatureSet( - "feature_set_1", - features=[Feature("feature_value6", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - - fs2 = FeatureSet( - "feature_set_2", - features=[Feature("other_feature_value7", ValueType.INT64)], - entities=[Entity("other_entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(fs1) - client.apply(fs2) - - no_max_age_fs = FeatureSet( - "no_max_age", - features=[Feature("feature_value8", ValueType.INT64)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=0), - ) - client.apply(no_max_age_fs) - - -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=10) -def test_batch_get_historical_features_with_file(client): - file_fs1 = client.get_feature_set(name="file_feature_set") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value1": [f"{i}" for i in range(N_ROWS)], - } - ) - - # feature set may be ready (direct runner set ready right after job submitted), - # but kafka consumer is not configured - # give some time to warm up ingestion job - wait_retry_backoff( - retry_fn=( - lambda: ( - None, - client.get_feature_set(name="file_feature_set").status - == FeatureSetStatus.STATUS_READY, - ) - ), - timeout_secs=480, - timeout_msg="Wait for 
FeatureSet to be READY", - ) - time.sleep(20) - - client.ingest(file_fs1, features_1_df, timeout=480) - - # Rename column (datetime -> event_timestamp) - features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) - - to_avro( - df=features_1_df[["event_timestamp", "entity_id"]], - file_path_or_buffer="file_feature_set.avro", - ) - - time.sleep(10) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows="file://file_feature_set.avro", - feature_refs=["feature_value1"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value1"].to_list() - ] - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=10)) - - -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=11) -def test_batch_get_historical_features_with_gs_path(client, gcs_path): - gcs_fs1 = client.get_feature_set(name="gcs_feature_set") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value2": [f"{i}" for i in range(N_ROWS)], - } - ) - client.ingest(gcs_fs1, features_1_df, timeout=360) - - # Rename column (datetime -> event_timestamp) - features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) - - # Output file to local - file_name = "gcs_feature_set.avro" - to_avro( - df=features_1_df[["event_timestamp", "entity_id"]], - file_path_or_buffer=file_name, - ) - - uri = urlparse(gcs_path) - bucket = uri.hostname - ts = int(time.time()) - remote_path = str(uri.path).strip("/") + f"/{ts}/{file_name}" - - # Upload file to gcs - storage_client = storage.Client(project=None) - bucket = storage_client.get_bucket(bucket) - blob = bucket.blob(remote_path) - blob.upload_from_filename(file_name) - - time.sleep(10) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=f"{gcs_path}/{ts}/*", - feature_refs=["feature_value2"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value2"].to_list() - ] - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - blob.delete() - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=12) -def test_batch_order_by_creation_time(client): - proc_time_fs = client.get_feature_set(name="processing_time") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_ROWS = 10 - incorrect_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value3": ["WRONG"] * N_ROWS, - } - ) - correct_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value3": ["CORRECT"] * N_ROWS, - } - ) - client.ingest(proc_time_fs, incorrect_df) - time.sleep(15) - client.ingest(proc_time_fs, correct_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=incorrect_df[["datetime", "entity_id"]], - feature_refs=["feature_value3"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["feature_value3"].to_list() == ["CORRECT"] * N_ROWS - - 
clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=13) -def test_batch_additional_columns_in_entity_table(client): - add_cols_fs = client.get_feature_set(name="additional_columns") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value4": ["abc"] * N_ROWS, - } - ) - client.ingest(add_cols_fs, features_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "additional_string_col": ["hello im extra"] * N_ROWS, - "additional_float_col": [random.random() for i in range(N_ROWS)], - } - ) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value4"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head(10)) - - assert np.allclose( - output["additional_float_col"], entity_df["additional_float_col"] - ) - assert ( - output["additional_string_col"].to_list() - == entity_df["additional_string_col"].to_list() - ) - assert ( - output["feature_value4"].to_list() - == features_df["feature_value4"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=14) -def test_batch_point_in_time_correctness_join(client): - historical_fs = client.get_feature_set(name="historical") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_EXAMPLES = 10 - historical_df = pd.DataFrame( - { - "datetime": [ - time_offset - timedelta(seconds=50), - time_offset - timedelta(seconds=30), - time_offset - timedelta(seconds=10), - ] - * N_EXAMPLES, - "entity_id": [i for i in range(N_EXAMPLES) for _ in range(3)], - "feature_value5": ["WRONG", "WRONG", "CORRECT"] * N_EXAMPLES, - } - ) - entity_df = pd.DataFrame( - { - "datetime": [time_offset - timedelta(seconds=10)] * N_EXAMPLES, - "entity_id": [i for i in range(N_EXAMPLES)], - } - ) - - client.ingest(historical_fs, historical_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value5"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["feature_value5"].to_list() == ["CORRECT"] * N_EXAMPLES - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=15) -def test_batch_multiple_featureset_joins(client): - fs1 = client.get_feature_set(name="feature_set_1") - fs2 = client.get_feature_set(name="feature_set_2") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value6": [f"{i}" for i in range(N_ROWS)], - } - ) - client.ingest(fs1, features_1_df) - - features_2_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "other_entity_id": [i for i in range(N_ROWS)], - "other_feature_value7": [i for i in range(N_ROWS)], - } - ) - client.ingest(fs2, features_2_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in 
range(N_ROWS)], - "other_entity_id": [N_ROWS - 1 - i for i in range(N_ROWS)], - } - ) - - # Test retrieve with different variations of the string feature refs - # ie feature set inference for feature refs without specified feature set - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value6"].to_list() - ] - assert ( - output["other_entity_id"].to_list() - == output["feature_set_2__other_feature_value7"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=16) -def test_batch_no_max_age(client): - no_max_age_fs = client.get_feature_set(name="no_max_age") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_ROWS = 10 - features_8_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value8": [i for i in range(N_ROWS)], - } - ) - client.ingest(no_max_age_fs, features_8_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=features_8_df[["datetime", "entity_id"]], - feature_refs=["feature_value8"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == output["feature_value8"].to_list() - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.fixture(scope="module", autouse=True) -def infra_teardown(pytestconfig, jobcontroller_url): - client = JCClient(jobcontroller_url=jobcontroller_url) - - marker = pytestconfig.getoption("-m") - yield marker - if marker == "dataflow_runner": - ingest_jobs = client.list_ingest_jobs() - ingest_jobs = [ - client.list_ingest_jobs(job.id)[0].external_id - for job in ingest_jobs - if job.status == IngestionJobStatus.RUNNING - ] - - cwd = os.getcwd() - with open(f"{cwd}/ingesting_jobs.txt", "w+") as output: - for job in ingest_jobs: - output.write("%s\n" % job) - else: - print("Cleaning up not required") - - -""" -This suite of tests tests the apply feature set - update feature set - retrieve -event sequence. It ensures that when a feature set is updated, tombstoned features -are no longer retrieved, and added features are null for previously ingested -rows. - -It is marked separately because of the length of time required -to perform this test, due to bigquery schema caching for streaming writes. 
-""" - - -@pytest.fixture(scope="module") -def update_featureset_dataframe(): - n_rows = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - return pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [i for i in range(n_rows)], - "update_feature1": ["a" for i in range(n_rows)], - "update_feature2": [i + 2 for i in range(n_rows)], - "update_feature3": [i for i in range(n_rows)], - "update_feature4": ["b" for i in range(n_rows)], - } - ) - - -@pytest.mark.fs_update -@pytest.mark.run(order=20) -def test_update_featureset_apply_featureset_and_ingest_first_subset( - client, update_featureset_dataframe -): - subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"] - subset_df = update_featureset_dataframe.iloc[:5][subset_columns] - update_fs = FeatureSet( - "update_fs", - entities=[Entity(name="entity_id", dtype=ValueType.INT64)], - max_age=Duration(seconds=432000), - ) - update_fs.infer_fields_from_df(subset_df) - client.apply(update_fs) - - client.ingest(feature_set=update_fs, source=subset_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5], - feature_refs=["update_feature1", "update_feature2"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head()) - - assert ( - output["update_feature1"].to_list() - == subset_df["update_feature1"].to_list() - ) - assert ( - output["update_feature2"].to_list() - == subset_df["update_feature2"].to_list() - ) - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.fs_update -@pytest.mark.timeout(600) -@pytest.mark.run(order=21) -def test_update_featureset_update_featureset_and_ingest_second_subset( - client, update_featureset_dataframe -): - subset_columns = [ - "datetime", - "entity_id", - "update_feature1", - "update_feature3", - "update_feature4", - ] - subset_df = update_featureset_dataframe.iloc[5:][subset_columns] - update_fs = FeatureSet( - "update_fs", - entities=[Entity(name="entity_id", dtype=ValueType.INT64)], - max_age=Duration(seconds=432000), - ) - update_fs.infer_fields_from_df(subset_df) - client.apply(update_fs) - - # We keep retrying this ingestion until all values make it into the buffer. - # This is a necessary step because bigquery streaming caches table schemas - # and as a result, rows may be lost. - while True: - ingestion_id = client.ingest(feature_set=update_fs, source=subset_df) - time.sleep(15) # wait for rows to get written to bq - rows_ingested = get_rows_ingested(client, update_fs, ingestion_id) - if rows_ingested == len(subset_df): - print(f"Number of rows successfully ingested: {rows_ingested}. Continuing.") - break - print( - f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion." 
- ) - time.sleep(30) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[5:], - feature_refs=["update_feature1", "update_feature3", "update_feature4"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head()) - - assert ( - output["update_feature1"].to_list() - == subset_df["update_feature1"].to_list() - ) - assert ( - output["update_feature3"].to_list() - == subset_df["update_feature3"].to_list() - ) - assert ( - output["update_feature4"].to_list() - == subset_df["update_feature4"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.fs_update -@pytest.mark.run(order=22) -def test_update_featureset_retrieve_all_fields(client, update_featureset_dataframe): - with pytest.raises(Exception): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], - feature_refs=[ - "update_feature1", - "update_feature2", - "update_feature3", - "update_feature4", - ], - project=PROJECT_NAME, - ) - feature_retrieval_job.result() - - -@pytest.mark.fs_update -@pytest.mark.run(order=23) -def test_update_featureset_retrieve_valid_fields(client, update_featureset_dataframe): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], - feature_refs=["update_feature1", "update_feature3", "update_feature4"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - print(output.head(10)) - assert ( - output["update_feature1"].to_list() - == update_featureset_dataframe["update_feature1"].to_list() - ) - # we have to convert to float because the column contains np.NaN - assert [math.isnan(i) for i in output["update_feature3"].to_list()[:5]] == [ - True - ] * 5 - assert output["update_feature3"].to_list()[5:] == [ - float(i) for i in update_featureset_dataframe["update_feature3"].to_list()[5:] - ] - assert ( - output["update_feature4"].to_list() - == [None] * 5 + update_featureset_dataframe["update_feature4"].to_list()[5:] - ) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=31) -@pytest.mark.timeout(600) -def test_batch_dataset_statistics(client): - fs1 = client.get_feature_set(name="feature_set_1") - fs2 = client.get_feature_set(name="feature_set_2") - id_offset = 20 - - n_rows = 21 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [id_offset + i for i in range(n_rows)], - "feature_value6": ["a" for i in range(n_rows)], - } - ) - ingestion_id1 = client.ingest(fs1, features_1_df) - - features_2_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "other_entity_id": [id_offset + i for i in range(n_rows)], - "other_feature_value7": [int(i) % 10 for i in range(0, n_rows)], - } - ) - ingestion_id2 = client.ingest(fs2, features_2_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [id_offset + i for i in range(n_rows)], - "other_entity_id": [id_offset + i for i in range(n_rows)], - } - ) - - time.sleep(15) # wait for rows to get written to bq - while True: - rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1) - 
rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2) - if rows_ingested1 == len(features_1_df) and rows_ingested2 == len( - features_2_df - ): - print( - f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing." - ) - break - time.sleep(30) - - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], - project=PROJECT_NAME, - compute_statistics=True, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head(10)) - stats = feature_retrieval_job.statistics(timeout_sec=180) - clear_unsupported_fields(stats) - - expected_stats = tfdv.generate_statistics_from_dataframe( - output[["feature_value6", "feature_set_2__other_feature_value7"]] - ) - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = output[name].std() - feature.num_stats.std_dev = std - - assert_stats_equal(expected_stats, stats) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - -def get_rows_ingested( - client: Client, feature_set: FeatureSet, ingestion_id: str -) -> int: - response = client._core_service.ListStores( - ListStoresRequest(filter=ListStoresRequest.Filter(name="historical")) - ) - bq_config = response.store[0].bigquery_config - project = bq_config.project_id - dataset = bq_config.dataset_id - table = f"{PROJECT_NAME}_{feature_set.name}" - - bq_client = bigquery.Client(project=project) - rows = bq_client.query( - f'SELECT COUNT(*) as count FROM `{project}.{dataset}.{table}` WHERE ingestion_id = "{ingestion_id}"' - ).result() - - return list(rows)[0]["count"] - - -def clean_up_remote_files(files): - storage_client = storage.Client() - for file_uri in files: - if file_uri.scheme == "gs": - blob = Blob.from_string(file_uri.geturl(), client=storage_client) - blob.delete() diff --git a/tests/e2e/bq/feature-stats.py b/tests/e2e/bq/feature-stats.py deleted file mode 100644 index 226dc358f1..0000000000 --- a/tests/e2e/bq/feature-stats.py +++ /dev/null @@ -1,256 +0,0 @@ -import os -import time -import uuid -from datetime import datetime, timedelta - -import pandas as pd -import pytest -import pytz -import tensorflow_data_validation as tfdv -from google.protobuf.duration_pb2 import Duration - -from bq.testutils import ( - assert_stats_equal, - clear_unsupported_agg_fields, - clear_unsupported_fields, -) -from feast.client import Client -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_set import FeatureSet -from feast.type_map import ValueType - -pd.set_option("display.max_columns", None) - -PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] -STORE_NAME = "historical" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -@pytest.fixture(scope="module") -def core_url(pytestconfig): - return pytestconfig.getoption("core_url") - - -@pytest.fixture(scope="module") -def serving_url(pytestconfig): - return pytestconfig.getoption("serving_url") - - -@pytest.fixture(scope="module") -def allow_dirty(pytestconfig): - return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False - - -@pytest.fixture(scope="module") -def gcs_path(pytestconfig): - return pytestconfig.getoption("gcs_path") - - -@pytest.fixture(scope="module") -def client(core_url, allow_dirty): - # Get client for core and serving - client = Client(core_url=core_url) - 
client.create_project(PROJECT_NAME) - client.set_project(PROJECT_NAME) - - # Ensure Feast core is active, but empty - if not allow_dirty: - feature_sets = client.list_feature_sets() - if len(feature_sets) > 0: - raise Exception( - "Feast cannot have existing feature sets registered. Exiting tests." - ) - - return client - - -@pytest.fixture(scope="module") -def feature_stats_feature_set(client): - fv_fs = FeatureSet( - "feature_stats", - features=[ - Feature("strings", ValueType.STRING), - Feature("ints", ValueType.INT64), - Feature("floats", ValueType.FLOAT), - ], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(fv_fs) - return fv_fs - - -@pytest.fixture(scope="module") -def feature_stats_dataset_basic(client, feature_stats_feature_set): - - n_rows = 20 - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [i for i in range(n_rows)], - "strings": ["a", "b"] * int(n_rows / 2), - "ints": [int(i) for i in range(n_rows)], - "floats": [10.5 - i for i in range(n_rows)], - } - ) - - expected_stats = tfdv.generate_statistics_from_dataframe( - df[["strings", "ints", "floats"]] - ) - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = df[name].std() - feature.num_stats.std_dev = std - - ingestion_id = client.ingest(feature_stats_feature_set, df) - time.sleep(10) - return { - "df": df, - "id": ingestion_id, - "date": datetime(time_offset.year, time_offset.month, time_offset.day).replace( - tzinfo=pytz.utc - ), - "stats": expected_stats, - } - - -@pytest.fixture(scope="module") -def feature_stats_dataset_agg(client, feature_stats_feature_set): - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - start_date = time_offset - timedelta(days=10) - end_date = time_offset - timedelta(days=7) - df1 = pd.DataFrame( - { - "datetime": [start_date] * 5, - "entity_id": [i for i in range(5)], - "strings": ["a", "b", "b", "b", "a"], - "ints": [4, 3, 2, 6, 3], - "floats": [2.1, 5.2, 4.3, 0.6, 0.1], - } - ) - ingestion_id_1 = client.ingest(feature_stats_feature_set, df1) - df2 = pd.DataFrame( - { - "datetime": [start_date + timedelta(days=1)] * 3, - "entity_id": [i for i in range(3)], - "strings": ["a", "b", "c"], - "ints": [2, 6, 7], - "floats": [1.6, 2.4, 2], - } - ) - ingestion_id_2 = client.ingest(feature_stats_feature_set, df2) - - combined_df = pd.concat([df1, df2])[["strings", "ints", "floats"]] - expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) - clear_unsupported_agg_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = combined_df[name].std() - feature.num_stats.std_dev = std - - time.sleep(10) - - return { - "ids": [ingestion_id_1, ingestion_id_2], - "start_date": datetime( - start_date.year, start_date.month, start_date.day - ).replace(tzinfo=pytz.utc), - "end_date": datetime(end_date.year, end_date.month, end_date.day).replace( - tzinfo=pytz.utc - ), - "stats": expected_stats, - } - - -def test_feature_stats_retrieval_by_single_dataset(client, feature_stats_dataset_basic): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - ingestion_ids=[feature_stats_dataset_basic["id"]], - ) 
- - assert_stats_equal(feature_stats_dataset_basic["stats"], stats) - - -def test_feature_stats_by_date(client, feature_stats_dataset_basic): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - start_date=feature_stats_dataset_basic["date"], - end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), - ) - assert_stats_equal(feature_stats_dataset_basic["stats"], stats) - - -def test_feature_stats_agg_over_datasets(client, feature_stats_dataset_agg): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - ingestion_ids=feature_stats_dataset_agg["ids"], - ) - assert_stats_equal(feature_stats_dataset_agg["stats"], stats) - - -def test_feature_stats_agg_over_dates(client, feature_stats_dataset_agg): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - start_date=feature_stats_dataset_agg["start_date"], - end_date=feature_stats_dataset_agg["end_date"], - ) - assert_stats_equal(feature_stats_dataset_agg["stats"], stats) - - -def test_feature_stats_force_refresh( - client, feature_stats_dataset_basic, feature_stats_feature_set -): - df = feature_stats_dataset_basic["df"] - - df2 = pd.DataFrame( - { - "datetime": [df.iloc[0].datetime], - "entity_id": [10], - "strings": ["c"], - "ints": [2], - "floats": [1.3], - } - ) - client.ingest(feature_stats_feature_set, df2) - time.sleep(10) - - actual_stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store="historical", - start_date=feature_stats_dataset_basic["date"], - end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), - force_refresh=True, - ) - - combined_df = pd.concat([df, df2]) - expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) - - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = combined_df[name].std() - feature.num_stats.std_dev = std - - assert_stats_equal(expected_stats, actual_stats) diff --git a/tests/e2e/bq/testutils.py b/tests/e2e/bq/testutils.py deleted file mode 100644 index 9ac678bc59..0000000000 --- a/tests/e2e/bq/testutils.py +++ /dev/null @@ -1,55 +0,0 @@ -from deepdiff import DeepDiff -from google.protobuf.json_format import MessageToDict - - -def clear_unsupported_fields(datasets): - dataset = datasets.datasets[0] - for feature in dataset.features: - if feature.HasField("num_stats"): - feature.num_stats.common_stats.ClearField("num_values_histogram") - # Since difference in how BQ and TFDV compute histogram values make them - # approximate but uncomparable - feature.num_stats.ClearField("histograms") - elif feature.HasField("string_stats"): - feature.string_stats.common_stats.ClearField("num_values_histogram") - for bucket in feature.string_stats.rank_histogram.buckets: - bucket.ClearField("low_rank") - bucket.ClearField("high_rank") - elif feature.HasField("struct_stats"): - feature.string_stats.struct_stats.ClearField("num_values_histogram") - elif feature.HasField("bytes_stats"): - feature.string_stats.bytes_stats.ClearField("num_values_histogram") - - -def clear_unsupported_agg_fields(datasets): - dataset = datasets.datasets[0] - for feature in dataset.features: - if feature.HasField("num_stats"): - feature.num_stats.common_stats.ClearField("num_values_histogram") - 
feature.num_stats.ClearField("histograms") - feature.num_stats.ClearField("median") - elif feature.HasField("string_stats"): - feature.string_stats.common_stats.ClearField("num_values_histogram") - feature.string_stats.ClearField("rank_histogram") - feature.string_stats.ClearField("top_values") - feature.string_stats.ClearField("unique") - elif feature.HasField("struct_stats"): - feature.struct_stats.ClearField("num_values_histogram") - elif feature.HasField("bytes_stats"): - feature.bytes_stats.ClearField("num_values_histogram") - feature.bytes_stats.ClearField("unique") - - -def assert_stats_equal(left, right): - left_stats = MessageToDict(left)["datasets"][0] - right_stats = MessageToDict(right)["datasets"][0] - assert ( - left_stats["numExamples"] == right_stats["numExamples"] - ), f"Number of examples do not match. Expected {left_stats['numExamples']}, got {right_stats['numExamples']}" - - left_features = sorted(left_stats["features"], key=lambda k: k["path"]["step"][0]) - right_features = sorted(right_stats["features"], key=lambda k: k["path"]["step"][0]) - diff = DeepDiff(left_features, right_features, significant_digits=3) - assert ( - len(diff) == 0 - ), f"Feature statistics do not match: \nwanted: {left_features}\n got: {right_features}" From 0205ea8b2fe88f156e64ad6c012cc99fc062b238 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 12:36:04 +0800 Subject: [PATCH 06/34] Fix env variable Signed-off-by: Terence --- infra/scripts/test-end-to-end-redis-cluster.sh | 2 +- infra/scripts/test-end-to-end.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index 544c1f4d3d..083079a32b 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -103,7 +103,7 @@ cd tests/e2e set +e CORE_NO=$(nproc --all) -pytest redis/parallel-ingest-redis-serving.py -n CORE_NO --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest redis/parallel-ingest-redis-serving.py -n ${CORE_NO} --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/infra/scripts/test-end-to-end.sh b/infra/scripts/test-end-to-end.sh index e65c72b0ba..a7dadd5a1f 100755 --- a/infra/scripts/test-end-to-end.sh +++ b/infra/scripts/test-end-to-end.sh @@ -120,7 +120,7 @@ cd tests/e2e set +e export GOOGLE_APPLICATION_CREDENTIALS=/etc/gcloud/service-account.json CORE_NO=$(nproc --all) -pytest redis/parallel-ingest-redis-serving.py -n CORE_NO --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest redis/parallel-ingest-redis-serving.py -n ${CORE_NO} --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? 
if [[ ${TEST_EXIT_CODE} != 0 ]]; then From 894ee48819a159a3b35640d6e19f887a3ee10631 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 14:05:56 +0800 Subject: [PATCH 07/34] Fix pytest redis Signed-off-by: Terence --- tests/e2e/redis/parallel-ingest-redis-serving.py | 3 --- tests/e2e/requirements.txt | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/e2e/redis/parallel-ingest-redis-serving.py b/tests/e2e/redis/parallel-ingest-redis-serving.py index fb30746a0f..7a213dc8af 100644 --- a/tests/e2e/redis/parallel-ingest-redis-serving.py +++ b/tests/e2e/redis/parallel-ingest-redis-serving.py @@ -72,7 +72,6 @@ def test_discovery(self, client): actual_matchmaking_entities = client.list_entities( labels=matchmaking_filtering_labels ) - assert len(actual_common_entities) == 2 assert len(actual_matchmaking_entities) == 1 @@ -168,8 +167,6 @@ def test_discovery(self, client): actual_alltypes_entities = client.list_entities( labels=alltypes_filtering_labels ) - - assert len(client.list_entities()) == 1 assert len(actual_alltypes_entities) == 1 # ApplyFeatureTable diff --git a/tests/e2e/requirements.txt b/tests/e2e/requirements.txt index 9c6dd06ac1..68595ee1b5 100644 --- a/tests/e2e/requirements.txt +++ b/tests/e2e/requirements.txt @@ -2,7 +2,7 @@ mock==2.0.0 numpy==1.16.4 pandas~=1.0.0 pandavro==1.5.* -pytest==5.2.1 +pytest==6.0.0 pytest-benchmark==3.2.2 pytest-mock==1.10.4 pytest-timeout==1.3.3 From f9a9a3236969357bef39a09c20217c17eb5f9f37 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 14:06:58 +0800 Subject: [PATCH 08/34] Revert "Remove e2e bq tests" This reverts commit fa0bcab17b244142dbdc44f6c48a4108e5a2b522. Signed-off-by: Terence --- .prow/config.yaml | 91 ++ Makefile | 2 +- .../scripts/test-end-to-end-batch-dataflow.sh | 307 +++++++ infra/scripts/test-end-to-end-batch.sh | 153 ++++ tests/e2e/bq/bq-batch-retrieval.py | 819 ++++++++++++++++++ tests/e2e/bq/feature-stats.py | 256 ++++++ tests/e2e/bq/testutils.py | 55 ++ 7 files changed, 1682 insertions(+), 1 deletion(-) create mode 100755 infra/scripts/test-end-to-end-batch-dataflow.sh create mode 100755 infra/scripts/test-end-to-end-batch.sh create mode 100644 tests/e2e/bq/bq-batch-retrieval.py create mode 100644 tests/e2e/bq/feature-stats.py create mode 100644 tests/e2e/bq/testutils.py diff --git a/.prow/config.yaml b/.prow/config.yaml index 39c275603d..d2269fcc6d 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -254,6 +254,97 @@ presubmits: branches: - ^v0\.(3|4)-branch$ + - name: test-end-to-end-batch + decorate: true + always_run: true + spec: + volumes: + - name: service-account + secret: + secretName: feast-service-account + containers: + - image: maven:3.6-jdk-11 + command: ["infra/scripts/test-end-to-end-batch.sh"] + resources: + requests: + cpu: "6" + memory: "6144Mi" + volumeMounts: + - name: service-account + mountPath: "/etc/service-account" + skip_branches: + - ^v0\.(3|4)-branch$ + + - name: test-end-to-end-batch-fs-update + decorate: true + always_run: false + spec: + volumes: + - name: service-account + secret: + secretName: feast-service-account + containers: + - image: maven:3.6-jdk-11 + command: ["infra/scripts/test-end-to-end-batch.sh", "-m", "fs_update"] + resources: + requests: + cpu: "6" + memory: "6144Mi" + volumeMounts: + - name: service-account + mountPath: "/etc/service-account" + skip_branches: + - ^v0\.(3|4)-branch$ + + - name: test-end-to-end-batch-java-8 + decorate: true + always_run: true + spec: + volumes: + - name: service-account + secret: + secretName: 
feast-service-account + containers: + - image: maven:3.6-jdk-8 + command: ["infra/scripts/test-end-to-end-batch.sh"] + resources: + requests: + cpu: "6" + memory: "6144Mi" + volumeMounts: + - name: service-account + mountPath: "/etc/service-account" + branches: + - ^v0\.(3|4)-branch$ + + - name: test-end-to-end-batch-dataflow + decorate: true + always_run: true + spec: + volumes: + - name: service-account-df + secret: + secretName: feast-e2e-service-account + - name: docker-socket + hostPath: + path: /var/run/docker.sock + containers: + - image: google/cloud-sdk:302.0.0 + command: ["infra/scripts/test-end-to-end-batch-dataflow.sh"] + resources: + requests: + cpu: "6" + memory: "6144Mi" + volumeMounts: + - name: service-account-df + mountPath: "/etc/service-account-df" + - name: docker-socket + mountPath: /var/run/docker.sock + securityContext: + privileged: true + skip_branches: + - ^v0\.(3|4)-branch$ + postsubmits: feast-dev/feast: - name: publish-python-sdk diff --git a/Makefile b/Makefile index f159ad624d..8fffe20816 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ lint-python: cd ${ROOT_DIR}/sdk/python; flake8 feast/ tests/ cd ${ROOT_DIR}/sdk/python; black --check feast tests - cd ${ROOT_DIR}/tests/e2e; mypy redis/ + cd ${ROOT_DIR}/tests/e2e; mypy bq/ redis/ cd ${ROOT_DIR}/tests/e2e; isort . --check-only cd ${ROOT_DIR}/tests/e2e; flake8 . cd ${ROOT_DIR}/tests/e2e; black --check . diff --git a/infra/scripts/test-end-to-end-batch-dataflow.sh b/infra/scripts/test-end-to-end-batch-dataflow.sh new file mode 100755 index 0000000000..363ba7dc47 --- /dev/null +++ b/infra/scripts/test-end-to-end-batch-dataflow.sh @@ -0,0 +1,307 @@ +#!/usr/bin/env bash +echo "Preparing environment variables..." + +set -e +set -o pipefail + +test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account-df/service-account-df.json" +test -z ${GCLOUD_PROJECT} && GCLOUD_PROJECT="kf-feast" +test -z ${GCLOUD_REGION} && GCLOUD_REGION="us-central1" +test -z ${GCLOUD_NETWORK} && GCLOUD_NETWORK="default" +test -z ${GCLOUD_SUBNET} && GCLOUD_SUBNET="default" +test -z ${TEMP_BUCKET} && TEMP_BUCKET="kf-feast-dataflow-temp" +test -z ${K8_CLUSTER_NAME} && K8_CLUSTER_NAME="feast-e2e-dataflow" +test -z ${HELM_RELEASE_NAME} && HELM_RELEASE_NAME="pr-$PULL_NUMBER" +test -z ${HELM_COMMON_NAME} && HELM_COMMON_NAME="deps" +test -z ${DATASET_NAME} && DATASET_NAME=feast_e2e_$(date +%s) +test -z ${SPECS_TOPIC} && SPECS_TOPIC=feast-specs-$(date +%s) +test -z ${FEATURES_TOPIC} && FEATURES_TOPIC=feast-$(date +%s) + + +feast_kafka_1_ip_name="feast-kafka-1" +feast_kafka_2_ip_name="feast-kafka-2" +feast_kafka_3_ip_name="feast-kafka-3" +feast_redis_ip_name="feast-redis" +feast_statsd_ip_name="feast-statsd" + +echo " +This script will run end-to-end tests for Feast Core and Batch Serving using Dataflow Runner. + +1. Setup K8s cluster (optional, if it was not created before) +2. Reuse existing IP addresses or generate new ones for stateful services +3. Install stateful services (kafka, redis, postgres, etc) (optional) +4. Build core & serving docker images (optional) +5. Create temporary BQ table for Feast Serving. +6. Rollout target images to cluster via helm in dedicated namespace (pr-{number}) +7. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from + tests/e2e via pytest. +8. Tear down feast services, keep stateful services. 
+" + +ORIGINAL_DIR=$(pwd) +echo $ORIGINAL_DIR + +echo "Environment:" +printenv + +export GOOGLE_APPLICATION_CREDENTIALS +gcloud auth activate-service-account --key-file ${GOOGLE_APPLICATION_CREDENTIALS} +gcloud -q auth configure-docker + +gcloud config set project ${GCLOUD_PROJECT} +gcloud config set compute/region ${GCLOUD_REGION} +gcloud config list + +apt-get -qq update +apt-get -y install wget build-essential gettext-base curl + +curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 +chmod 700 $ORIGINAL_DIR/get_helm.sh +$ORIGINAL_DIR/get_helm.sh + + +function getPublicAddresses() { + existing_addresses=$(gcloud compute addresses list --filter="region:($GCLOUD_REGION) name:kafka" --format "list(name)") + if [[ -z "$existing_addresses" ]]; then + echo " +============================================================ +Reserving IP addresses for Feast dependencies +============================================================ +" + + gcloud compute addresses create \ + $feast_kafka_1_ip_name $feast_kafka_2_ip_name $feast_kafka_3_ip_name $feast_redis_ip_name $feast_statsd_ip_name \ + --region ${GCLOUD_REGION} --subnet ${GCLOUD_SUBNET} + fi + + + export feast_kafka_1_ip=$(gcloud compute addresses describe $feast_kafka_1_ip_name --region=${GCLOUD_REGION} --format "value(address)") + export feast_kafka_2_ip=$(gcloud compute addresses describe $feast_kafka_2_ip_name --region=${GCLOUD_REGION} --format "value(address)") + export feast_kafka_3_ip=$(gcloud compute addresses describe $feast_kafka_3_ip_name --region=${GCLOUD_REGION} --format "value(address)") + export feast_redis_ip=$(gcloud compute addresses describe $feast_redis_ip_name --region=${GCLOUD_REGION} --format "value(address)") + export feast_statsd_ip=$(gcloud compute addresses describe $feast_statsd_ip_name --region=${GCLOUD_REGION} --format "value(address)") +} + +function createKubeCluster() { + echo " +============================================================ +Creating GKE nodepool for Feast e2e test with DataflowRunner +============================================================ +" + gcloud container clusters create ${K8_CLUSTER_NAME} --region ${GCLOUD_REGION} \ + --enable-cloud-logging \ + --enable-cloud-monitoring \ + --network ${GCLOUD_NETWORK} \ + --subnetwork ${GCLOUD_SUBNET} \ + --scopes https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,\ +https://www.googleapis.com/auth/monitoring,https://www.googleapis.com/auth/service.management.readonly,\ +https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/trace.append,\ +https://www.googleapis.com/auth/bigquery \ + --machine-type n1-standard-2 + + echo " +============================================================ +Create feast-postgres-database Secret in GKE nodepool +============================================================ +" + kubectl create secret generic feast-postgresql --from-literal=postgresql-password=password + + echo " +============================================================ +Create feast-gcp-service-account Secret in GKE nodepool +============================================================ +" + cd $ORIGINAL_DIR/infra/scripts + kubectl create secret generic feast-gcp-service-account --from-file=credentials.json=${GOOGLE_APPLICATION_CREDENTIALS} +} + +function installDependencies() { + echo " +============================================================ +Helm install common parts (kafka, redis, etc) 
+============================================================ +" + cd $ORIGINAL_DIR/infra/charts/feast + + helm install --replace --wait --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ + --set "feast-core.enabled=false" \ + --set "feast-online-serving.enabled=false" \ + --set "feast-batch-serving.enabled=false" \ + --set "postgresql.enabled=false" + "$HELM_COMMON_NAME" . + +} + +function buildAndPushImage() +{ + echo docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR + docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR + docker push $1:$2 +} + +function buildTarget() { + buildAndPushImage "gcr.io/kf-feast/feast-core" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/core/Dockerfile" + buildAndPushImage "gcr.io/kf-feast/feast-serving" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/serving/Dockerfile" +} + +function installTarget() { + echo " +============================================================ +Helm install feast +============================================================ +" + cd $ORIGINAL_DIR/infra/charts/feast + + helm install --wait --timeout 300s --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ + --set "kafka.enabled=false" \ + --set "redis.enabled=false" \ + --set "prometheus-statsd-exporter.enabled=false" \ + --set "prometheus.enabled=false" \ + "$HELM_RELEASE_NAME" . + +} + +function clean() { + echo " + ============================================================ + Cleaning up + ============================================================ + " + cd $ORIGINAL_DIR/tests/e2e + + # Remove BQ Dataset + bq rm -r -f ${GCLOUD_PROJECT}:${DATASET_NAME} + + # Uninstall helm release before clearing PVCs + helm uninstall ${HELM_RELEASE_NAME} + + kubectl delete pvc data-${HELM_RELEASE_NAME}-postgresql-0 + + # Stop Dataflow jobs from retrieved Dataflow job ids in ingesting_jobs.txt + if [ -f ingesting_jobs.txt ]; then + while read line + do + echo $line + gcloud dataflow jobs cancel $line --region=${GCLOUD_REGION} + done < ingesting_jobs.txt + fi +} + +# 1. +existing_cluster=$(gcloud container clusters list --format "list(name)" --filter "name:$K8_CLUSTER_NAME") +if [[ -z $existing_cluster ]]; then + createKubeCluster "$@" +else + gcloud container clusters get-credentials $K8_CLUSTER_NAME --region $GCLOUD_REGION --project $GCLOUD_PROJECT +fi + +# 2. 
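# Step 2 reuses regional static IP addresses reserved by a previous run so the
# Kafka/Redis/StatsD LoadBalancers keep stable addresses between e2e jobs;
# getPublicAddresses only creates the addresses when its lookup comes back empty.
# A single reserved address can be inspected manually with, for example:
#   gcloud compute addresses describe feast-kafka-1 --region us-central1 --format "value(address)"
# (name and region above are the defaults set at the top of this script).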
+getPublicAddresses "$@" + +echo " +============================================================ +Export required environment variables +============================================================ +" + +export TEMP_BUCKET=$TEMP_BUCKET/$HELM_RELEASE_NAME/$(date +%s) +export DATASET_NAME=$DATASET_NAME +export GCLOUD_PROJECT=$GCLOUD_PROJECT +export GCLOUD_NETWORK=$GCLOUD_NETWORK +export GCLOUD_SUBNET=$GCLOUD_SUBNET +export GCLOUD_REGION=$GCLOUD_REGION +export HELM_COMMON_NAME=$HELM_COMMON_NAME +export IMAGE_TAG=$PULL_PULL_SHA +export SPECS_TOPIC=$SPECS_TOPIC +export FEATURES_TOPIC=$FEATURES_TOPIC + +export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) +export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts +source ${SCRIPTS_DIR}/setup-common-functions.sh + +wait_for_docker_image gcr.io/kf-feast/feast-core:"${IMAGE_TAG}" +wait_for_docker_image gcr.io/kf-feast/feast-serving:"${IMAGE_TAG}" + +envsubst $'$TEMP_BUCKET $DATASET_NAME $GCLOUD_PROJECT $GCLOUD_NETWORK $SPECS_TOPIC $FEATURES_TOPIC \ + $GCLOUD_SUBNET $GCLOUD_REGION $IMAGE_TAG $HELM_COMMON_NAME $feast_kafka_1_ip + $feast_kafka_2_ip $feast_kafka_3_ip $feast_redis_ip $feast_statsd_ip' < $ORIGINAL_DIR/infra/scripts/test-templates/values-end-to-end-batch-dataflow.yaml > $ORIGINAL_DIR/infra/charts/feast/values-end-to-end-batch-dataflow-updated.yaml + + +# 3. +existing_deps=$(helm list --filter deps -q) +if [[ -z $existing_deps ]]; then + installDependencies "$@" +fi + +# 4. +# buildTarget "$@" + +# 5. +echo " +============================================================ +Creating temp BQ table for Feast Serving +============================================================ +" + +bq --location=US --project_id=${GCLOUD_PROJECT} mk \ + --dataset \ + --default_table_expiration 86400 \ + ${GCLOUD_PROJECT}:${DATASET_NAME} + + +# 6. + +set +e +installTarget "$@" + +# 7. +echo " +============================================================ +Installing Python 3.7 with Miniconda and Feast SDK +============================================================ +" +cd $ORIGINAL_DIR +# Install Python 3.7 with Miniconda +wget -q https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh \ + -O /tmp/miniconda.sh +bash /tmp/miniconda.sh -b -p /root/miniconda -f +/root/miniconda/bin/conda init +source ~/.bashrc + +# Install Feast Python SDK and test requirements +cd $ORIGINAL_DIR +make compile-protos-python +pip install -qe sdk/python +pip install -qr tests/e2e/requirements.txt + +echo " +============================================================ +Running end-to-end tests with pytest at 'tests/e2e' +============================================================ +" +# Default artifact location setting in Prow jobs +LOGS_ARTIFACT_PATH=/logs/artifacts + +cd $ORIGINAL_DIR/tests/e2e + +core_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-core) +serving_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-batch-serving) +jobcontroller_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-jobcontroller) + +set +e +pytest -s -v bq/bq-batch-retrieval.py -m dataflow_runner --core_url "$core_ip:6565" --serving_url "$serving_ip:6566" \ + --jobcontroller_url "$jobcontroller_ip:6570" --gcs_path "gs://${TEMP_BUCKET}" --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +TEST_EXIT_CODE=$? 
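# Capture pytest's exit status right away (set +e above keeps the script alive
# when tests fail) so the branch below can dump Feast serving/core logs on
# failure, clean up only on success, and still report the real test result via
# the final exit ${TEST_EXIT_CODE}.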
+ +if [[ ${TEST_EXIT_CODE} != 0 ]]; then + echo "[DEBUG] Printing logs" + ls -ltrh /var/log/feast* + cat /var/log/feast-serving-warehouse.log /var/log/feast-core.log + + echo "[DEBUG] Printing Python packages list" + pip list +else + clean "$@" +fi + +exit ${TEST_EXIT_CODE} diff --git a/infra/scripts/test-end-to-end-batch.sh b/infra/scripts/test-end-to-end-batch.sh new file mode 100755 index 0000000000..c741fe7168 --- /dev/null +++ b/infra/scripts/test-end-to-end-batch.sh @@ -0,0 +1,153 @@ +#!/usr/bin/env bash + +set -e +set -o pipefail + +PYTEST_MARK='direct_runner' #default + +print_usage() { + printf "Usage: ./test-end-to-end-batch -m pytest_mark" +} + +while getopts 'm:' flag; do + case "${flag}" in + m) PYTEST_MARK="${OPTARG}" ;; + *) print_usage + exit 1 ;; + esac +done + +test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account/service-account.json" +test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" +test -z ${GOOGLE_CLOUD_PROJECT} && GOOGLE_CLOUD_PROJECT="kf-feast" +test -z ${TEMP_BUCKET} && TEMP_BUCKET="feast-templocation-kf-feast" +test -z ${JOBS_STAGING_LOCATION} && JOBS_STAGING_LOCATION="gs://${TEMP_BUCKET}/staging-location/$(date +%s)" + +# Get the current build version using maven (and pom.xml) +export FEAST_BUILD_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) +echo Building version: $FEAST_BUILD_VERSION + +# Get Feast project repository root and scripts directory +export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) +export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts + +echo " +This script will run end-to-end tests for Feast Core and Batch Serving. + +1. Install gcloud SDK +2. Install Redis as the job store for Feast Batch Serving. +4. Install Postgres for persisting Feast metadata. +5. Install Kafka and Zookeeper as the Source in Feast. +6. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from + tests/e2e via pytest. +" + +source ${SCRIPTS_DIR}/setup-common-functions.sh + +install_test_tools +install_gcloud_sdk +install_and_start_local_redis +install_and_start_local_postgres +install_and_start_local_zookeeper_and_kafka + +if [[ ${SKIP_BUILD_JARS} != "true" ]]; then + build_feast_core_and_serving +else + echo "[DEBUG] Skipping building jars" +fi + +DATASET_NAME=feast_$(date +%s) +bq --location=US --project_id=${GOOGLE_CLOUD_PROJECT} mk \ + --dataset \ + --default_table_expiration 86400 \ + ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} + +# Start Feast Core in background +cat < /tmp/jc.warehouse.application.yml +feast: + core-host: localhost + core-port: 6565 + jobs: + polling_interval_milliseconds: 10000 + active_runner: direct + consolidate-jobs-per-source: true + runners: + - name: direct + type: DirectRunner + options: + tempLocation: gs://${TEMP_BUCKET}/tempLocation + +EOF + +cat < /tmp/serving.warehouse.application.yml +feast: + # GRPC service address for Feast Core + # Feast Serving requires connection to Feast Core to retrieve and reload Feast metadata (e.g. FeatureSpecs, Store information) + core-host: localhost + core-grpc-port: 6565 + + # Indicates the active store. Only a single store in the last can be active at one time. 
In the future this key + # will be deprecated in order to allow multiple stores to be served from a single serving instance + active_store: historical + + # List of store configurations + stores: + - name: historical + type: BIGQUERY + config: + project_id: ${GOOGLE_CLOUD_PROJECT} + dataset_id: ${DATASET_NAME} + staging_location: ${JOBS_STAGING_LOCATION} + initial_retry_delay_seconds: 1 + total_timeout_seconds: 21600 + write_triggering_frequency_seconds: 1 + subscriptions: + - name: "*" + project: "*" + version: "*" + + job_store: + redis_host: localhost + redis_port: 6379 + + tracing: + enabled: false + +server: + port: 8081 + +EOF + +cat /tmp/jc.warehouse.application.yml /tmp/serving.warehouse.application.yml + +start_feast_core +start_feast_jobcontroller /tmp/jc.warehouse.application.yml +start_feast_serving /tmp/serving.warehouse.application.yml + +install_python_with_miniconda_and_feast_sdk + +print_banner "Running end-to-end tests with pytest at 'tests/e2e'" +# Default artifact location setting in Prow jobs +LOGS_ARTIFACT_PATH=/logs/artifacts + +ORIGINAL_DIR=$(pwd) +cd tests/e2e + +set +e +pytest bq/* -v -m ${PYTEST_MARK} --gcs_path ${JOBS_STAGING_LOCATION} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +TEST_EXIT_CODE=$? + +if [[ ${TEST_EXIT_CODE} != 0 ]]; then + echo "[DEBUG] Printing logs" + ls -ltrh /var/log/feast* + cat /var/log/feast-serving-online.log /var/log/feast-core.log /var/log/feast-jobcontroller.log + + echo "[DEBUG] Printing Python packages list" + pip list +else + print_banner "Cleaning up" + + bq rm -r -f ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} +fi + +exit ${TEST_EXIT_CODE} diff --git a/tests/e2e/bq/bq-batch-retrieval.py b/tests/e2e/bq/bq-batch-retrieval.py new file mode 100644 index 0000000000..2d94d2e6cf --- /dev/null +++ b/tests/e2e/bq/bq-batch-retrieval.py @@ -0,0 +1,819 @@ +import math +import os +import random +import time +import uuid +from datetime import datetime, timedelta +from urllib.parse import urlparse + +import numpy as np +import pandas as pd +import pytest +import pytz +import tensorflow_data_validation as tfdv +from google.cloud import bigquery, storage +from google.cloud.storage import Blob +from google.protobuf.duration_pb2 import Duration +from pandavro import to_avro + +from bq.testutils import assert_stats_equal, clear_unsupported_fields +from feast.client import Client +from feast.contrib.job_controller.client import Client as JCClient +from feast.core.CoreService_pb2 import ListStoresRequest +from feast.core.FeatureSet_pb2 import FeatureSetStatus +from feast.core.IngestionJob_pb2 import IngestionJobStatus +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_set import FeatureSet +from feast.type_map import ValueType +from feast.wait import wait_retry_backoff + +pd.set_option("display.max_columns", None) + +PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] + + +@pytest.fixture(scope="module") +def core_url(pytestconfig): + return pytestconfig.getoption("core_url") + + +@pytest.fixture(scope="module") +def serving_url(pytestconfig): + return pytestconfig.getoption("serving_url") + + +@pytest.fixture(scope="module") +def jobcontroller_url(pytestconfig): + return pytestconfig.getoption("jobcontroller_url") + + +@pytest.fixture(scope="module") +def allow_dirty(pytestconfig): + return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False + + +@pytest.fixture(scope="module") +def gcs_path(pytestconfig): + return pytestconfig.getoption("gcs_path") + + 
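# The CLI options consumed by the fixtures above (core_url, serving_url,
# jobcontroller_url, allow_dirty, gcs_path) are assumed to be registered in the
# e2e conftest.py through pytest's pytest_addoption hook. A minimal sketch,
# with made-up defaults rather than whatever the real conftest ships:
#
#   def pytest_addoption(parser):
#       parser.addoption("--core_url", action="store", default="localhost:6565")
#       parser.addoption("--serving_url", action="store", default="localhost:6566")
#       parser.addoption("--jobcontroller_url", action="store", default="localhost:6570")
#       parser.addoption("--allow_dirty", action="store", default="false")
#       parser.addoption("--gcs_path", action="store", default="gs://some-temp-bucket/")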
+@pytest.fixture(scope="module") +def client(core_url, serving_url, allow_dirty): + # Get client for core and serving + client = Client(core_url=core_url, serving_url=serving_url) + client.create_project(PROJECT_NAME) + client.set_project(PROJECT_NAME) + + # Ensure Feast core is active, but empty + if not allow_dirty: + feature_sets = client.list_feature_sets() + if len(feature_sets) > 0: + raise Exception( + "Feast cannot have existing feature sets registered. Exiting tests." + ) + + return client + + +def wait_for(fn, timeout: timedelta, sleep=5): + until = datetime.now() + timeout + last_exc = BaseException() + + while datetime.now() <= until: + try: + fn() + except Exception as exc: + last_exc = exc + else: + return + time.sleep(sleep) + + raise last_exc + + +@pytest.mark.first +@pytest.mark.direct_runner +@pytest.mark.dataflow_runner +@pytest.mark.run(order=1) +def test_batch_apply_all_featuresets(client): + client.set_project(PROJECT_NAME) + + file_fs1 = FeatureSet( + "file_feature_set", + features=[Feature("feature_value1", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(file_fs1) + + gcs_fs1 = FeatureSet( + "gcs_feature_set", + features=[Feature("feature_value2", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(gcs_fs1) + + proc_time_fs = FeatureSet( + "processing_time", + features=[Feature("feature_value3", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(proc_time_fs) + + add_cols_fs = FeatureSet( + "additional_columns", + features=[Feature("feature_value4", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(add_cols_fs) + + historical_fs = FeatureSet( + "historical", + features=[Feature("feature_value5", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(historical_fs) + + fs1 = FeatureSet( + "feature_set_1", + features=[Feature("feature_value6", ValueType.STRING)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + + fs2 = FeatureSet( + "feature_set_2", + features=[Feature("other_feature_value7", ValueType.INT64)], + entities=[Entity("other_entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(fs1) + client.apply(fs2) + + no_max_age_fs = FeatureSet( + "no_max_age", + features=[Feature("feature_value8", ValueType.INT64)], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=0), + ) + client.apply(no_max_age_fs) + + +@pytest.mark.direct_runner +@pytest.mark.dataflow_runner +@pytest.mark.run(order=10) +def test_batch_get_historical_features_with_file(client): + file_fs1 = client.get_feature_set(name="file_feature_set") + + N_ROWS = 10 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + features_1_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value1": [f"{i}" for i in range(N_ROWS)], + } + ) + + # feature set may be ready (direct runner set ready right after job submitted), + # but kafka consumer is not configured + # give some time to warm up ingestion job + wait_retry_backoff( + retry_fn=( + lambda: ( + None, + client.get_feature_set(name="file_feature_set").status + == FeatureSetStatus.STATUS_READY, + ) + ), + timeout_secs=480, + timeout_msg="Wait for 
FeatureSet to be READY", + ) + time.sleep(20) + + client.ingest(file_fs1, features_1_df, timeout=480) + + # Rename column (datetime -> event_timestamp) + features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) + + to_avro( + df=features_1_df[["event_timestamp", "entity_id"]], + file_path_or_buffer="file_feature_set.avro", + ) + + time.sleep(10) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows="file://file_feature_set.avro", + feature_refs=["feature_value1"], + project=PROJECT_NAME, + ) + + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + + assert output["entity_id"].to_list() == [ + int(i) for i in output["feature_value1"].to_list() + ] + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=10)) + + +@pytest.mark.direct_runner +@pytest.mark.dataflow_runner +@pytest.mark.run(order=11) +def test_batch_get_historical_features_with_gs_path(client, gcs_path): + gcs_fs1 = client.get_feature_set(name="gcs_feature_set") + + N_ROWS = 10 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + features_1_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value2": [f"{i}" for i in range(N_ROWS)], + } + ) + client.ingest(gcs_fs1, features_1_df, timeout=360) + + # Rename column (datetime -> event_timestamp) + features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) + + # Output file to local + file_name = "gcs_feature_set.avro" + to_avro( + df=features_1_df[["event_timestamp", "entity_id"]], + file_path_or_buffer=file_name, + ) + + uri = urlparse(gcs_path) + bucket = uri.hostname + ts = int(time.time()) + remote_path = str(uri.path).strip("/") + f"/{ts}/{file_name}" + + # Upload file to gcs + storage_client = storage.Client(project=None) + bucket = storage_client.get_bucket(bucket) + blob = bucket.blob(remote_path) + blob.upload_from_filename(file_name) + + time.sleep(10) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=f"{gcs_path}/{ts}/*", + feature_refs=["feature_value2"], + project=PROJECT_NAME, + ) + + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + assert output["entity_id"].to_list() == [ + int(i) for i in output["feature_value2"].to_list() + ] + + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + blob.delete() + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=12) +def test_batch_order_by_creation_time(client): + proc_time_fs = client.get_feature_set(name="processing_time") + + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + N_ROWS = 10 + incorrect_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value3": ["WRONG"] * N_ROWS, + } + ) + correct_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value3": ["CORRECT"] * N_ROWS, + } + ) + client.ingest(proc_time_fs, incorrect_df) + time.sleep(15) + client.ingest(proc_time_fs, correct_df) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=incorrect_df[["datetime", "entity_id"]], + feature_refs=["feature_value3"], + project=PROJECT_NAME, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + + assert output["feature_value3"].to_list() == ["CORRECT"] * N_ROWS + + 
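        # Each retrieval job exports Avro result files to remote storage; delete them
        # once the assertion passes so retried check() calls do not pile up staging files.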
clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=13) +def test_batch_additional_columns_in_entity_table(client): + add_cols_fs = client.get_feature_set(name="additional_columns") + + N_ROWS = 10 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + features_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value4": ["abc"] * N_ROWS, + } + ) + client.ingest(add_cols_fs, features_df) + + entity_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "additional_string_col": ["hello im extra"] * N_ROWS, + "additional_float_col": [random.random() for i in range(N_ROWS)], + } + ) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=entity_df, + feature_refs=["feature_value4"], + project=PROJECT_NAME, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( + by=["entity_id"] + ) + print(output.head(10)) + + assert np.allclose( + output["additional_float_col"], entity_df["additional_float_col"] + ) + assert ( + output["additional_string_col"].to_list() + == entity_df["additional_string_col"].to_list() + ) + assert ( + output["feature_value4"].to_list() + == features_df["feature_value4"].to_list() + ) + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=14) +def test_batch_point_in_time_correctness_join(client): + historical_fs = client.get_feature_set(name="historical") + + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + N_EXAMPLES = 10 + historical_df = pd.DataFrame( + { + "datetime": [ + time_offset - timedelta(seconds=50), + time_offset - timedelta(seconds=30), + time_offset - timedelta(seconds=10), + ] + * N_EXAMPLES, + "entity_id": [i for i in range(N_EXAMPLES) for _ in range(3)], + "feature_value5": ["WRONG", "WRONG", "CORRECT"] * N_EXAMPLES, + } + ) + entity_df = pd.DataFrame( + { + "datetime": [time_offset - timedelta(seconds=10)] * N_EXAMPLES, + "entity_id": [i for i in range(N_EXAMPLES)], + } + ) + + client.ingest(historical_fs, historical_df) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=entity_df, + feature_refs=["feature_value5"], + project=PROJECT_NAME, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + + assert output["feature_value5"].to_list() == ["CORRECT"] * N_EXAMPLES + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=15) +def test_batch_multiple_featureset_joins(client): + fs1 = client.get_feature_set(name="feature_set_1") + fs2 = client.get_feature_set(name="feature_set_2") + + N_ROWS = 10 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + features_1_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value6": [f"{i}" for i in range(N_ROWS)], + } + ) + client.ingest(fs1, features_1_df) + + features_2_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "other_entity_id": [i for i in range(N_ROWS)], + "other_feature_value7": [i for i in range(N_ROWS)], + } + ) + client.ingest(fs2, features_2_df) + + entity_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in 
range(N_ROWS)], + "other_entity_id": [N_ROWS - 1 - i for i in range(N_ROWS)], + } + ) + + # Test retrieve with different variations of the string feature refs + # ie feature set inference for feature refs without specified feature set + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=entity_df, + feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], + project=PROJECT_NAME, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + + assert output["entity_id"].to_list() == [ + int(i) for i in output["feature_value6"].to_list() + ] + assert ( + output["other_entity_id"].to_list() + == output["feature_set_2__other_feature_value7"].to_list() + ) + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=16) +def test_batch_no_max_age(client): + no_max_age_fs = client.get_feature_set(name="no_max_age") + + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + N_ROWS = 10 + features_8_df = pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "entity_id": [i for i in range(N_ROWS)], + "feature_value8": [i for i in range(N_ROWS)], + } + ) + client.ingest(no_max_age_fs, features_8_df) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=features_8_df[["datetime", "entity_id"]], + feature_refs=["feature_value8"], + project=PROJECT_NAME, + ) + + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head()) + + assert output["entity_id"].to_list() == output["feature_value8"].to_list() + + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.fixture(scope="module", autouse=True) +def infra_teardown(pytestconfig, jobcontroller_url): + client = JCClient(jobcontroller_url=jobcontroller_url) + + marker = pytestconfig.getoption("-m") + yield marker + if marker == "dataflow_runner": + ingest_jobs = client.list_ingest_jobs() + ingest_jobs = [ + client.list_ingest_jobs(job.id)[0].external_id + for job in ingest_jobs + if job.status == IngestionJobStatus.RUNNING + ] + + cwd = os.getcwd() + with open(f"{cwd}/ingesting_jobs.txt", "w+") as output: + for job in ingest_jobs: + output.write("%s\n" % job) + else: + print("Cleaning up not required") + + +""" +This suite of tests tests the apply feature set - update feature set - retrieve +event sequence. It ensures that when a feature set is updated, tombstoned features +are no longer retrieved, and added features are null for previously ingested +rows. + +It is marked separately because of the length of time required +to perform this test, due to bigquery schema caching for streaming writes. 
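The tests below therefore ingest two overlapping subsets of the same feature set,
retry the second ingestion until its rows are actually visible in BigQuery, and then
verify that retrieving a removed feature fails while retained and newly added
features come back with NaN/None padding for rows ingested before the update.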
+""" + + +@pytest.fixture(scope="module") +def update_featureset_dataframe(): + n_rows = 10 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + return pd.DataFrame( + { + "datetime": [time_offset] * n_rows, + "entity_id": [i for i in range(n_rows)], + "update_feature1": ["a" for i in range(n_rows)], + "update_feature2": [i + 2 for i in range(n_rows)], + "update_feature3": [i for i in range(n_rows)], + "update_feature4": ["b" for i in range(n_rows)], + } + ) + + +@pytest.mark.fs_update +@pytest.mark.run(order=20) +def test_update_featureset_apply_featureset_and_ingest_first_subset( + client, update_featureset_dataframe +): + subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"] + subset_df = update_featureset_dataframe.iloc[:5][subset_columns] + update_fs = FeatureSet( + "update_fs", + entities=[Entity(name="entity_id", dtype=ValueType.INT64)], + max_age=Duration(seconds=432000), + ) + update_fs.infer_fields_from_df(subset_df) + client.apply(update_fs) + + client.ingest(feature_set=update_fs, source=subset_df) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5], + feature_refs=["update_feature1", "update_feature2"], + project=PROJECT_NAME, + ) + + output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( + by=["entity_id"] + ) + print(output.head()) + + assert ( + output["update_feature1"].to_list() + == subset_df["update_feature1"].to_list() + ) + assert ( + output["update_feature2"].to_list() + == subset_df["update_feature2"].to_list() + ) + + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.fs_update +@pytest.mark.timeout(600) +@pytest.mark.run(order=21) +def test_update_featureset_update_featureset_and_ingest_second_subset( + client, update_featureset_dataframe +): + subset_columns = [ + "datetime", + "entity_id", + "update_feature1", + "update_feature3", + "update_feature4", + ] + subset_df = update_featureset_dataframe.iloc[5:][subset_columns] + update_fs = FeatureSet( + "update_fs", + entities=[Entity(name="entity_id", dtype=ValueType.INT64)], + max_age=Duration(seconds=432000), + ) + update_fs.infer_fields_from_df(subset_df) + client.apply(update_fs) + + # We keep retrying this ingestion until all values make it into the buffer. + # This is a necessary step because bigquery streaming caches table schemas + # and as a result, rows may be lost. + while True: + ingestion_id = client.ingest(feature_set=update_fs, source=subset_df) + time.sleep(15) # wait for rows to get written to bq + rows_ingested = get_rows_ingested(client, update_fs, ingestion_id) + if rows_ingested == len(subset_df): + print(f"Number of rows successfully ingested: {rows_ingested}. Continuing.") + break + print( + f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion." 
+ ) + time.sleep(30) + + def check(): + feature_retrieval_job = client.get_historical_features( + entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[5:], + feature_refs=["update_feature1", "update_feature3", "update_feature4"], + project=PROJECT_NAME, + ) + + output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( + by=["entity_id"] + ) + print(output.head()) + + assert ( + output["update_feature1"].to_list() + == subset_df["update_feature1"].to_list() + ) + assert ( + output["update_feature3"].to_list() + == subset_df["update_feature3"].to_list() + ) + assert ( + output["update_feature4"].to_list() + == subset_df["update_feature4"].to_list() + ) + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + wait_for(check, timedelta(minutes=5)) + + +@pytest.mark.fs_update +@pytest.mark.run(order=22) +def test_update_featureset_retrieve_all_fields(client, update_featureset_dataframe): + with pytest.raises(Exception): + feature_retrieval_job = client.get_historical_features( + entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], + feature_refs=[ + "update_feature1", + "update_feature2", + "update_feature3", + "update_feature4", + ], + project=PROJECT_NAME, + ) + feature_retrieval_job.result() + + +@pytest.mark.fs_update +@pytest.mark.run(order=23) +def test_update_featureset_retrieve_valid_fields(client, update_featureset_dataframe): + feature_retrieval_job = client.get_historical_features( + entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], + feature_refs=["update_feature1", "update_feature3", "update_feature4"], + project=PROJECT_NAME, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( + by=["entity_id"] + ) + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + print(output.head(10)) + assert ( + output["update_feature1"].to_list() + == update_featureset_dataframe["update_feature1"].to_list() + ) + # we have to convert to float because the column contains np.NaN + assert [math.isnan(i) for i in output["update_feature3"].to_list()[:5]] == [ + True + ] * 5 + assert output["update_feature3"].to_list()[5:] == [ + float(i) for i in update_featureset_dataframe["update_feature3"].to_list()[5:] + ] + assert ( + output["update_feature4"].to_list() + == [None] * 5 + update_featureset_dataframe["update_feature4"].to_list()[5:] + ) + + +@pytest.mark.direct_runner +@pytest.mark.run(order=31) +@pytest.mark.timeout(600) +def test_batch_dataset_statistics(client): + fs1 = client.get_feature_set(name="feature_set_1") + fs2 = client.get_feature_set(name="feature_set_2") + id_offset = 20 + + n_rows = 21 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + features_1_df = pd.DataFrame( + { + "datetime": [time_offset] * n_rows, + "entity_id": [id_offset + i for i in range(n_rows)], + "feature_value6": ["a" for i in range(n_rows)], + } + ) + ingestion_id1 = client.ingest(fs1, features_1_df) + + features_2_df = pd.DataFrame( + { + "datetime": [time_offset] * n_rows, + "other_entity_id": [id_offset + i for i in range(n_rows)], + "other_feature_value7": [int(i) % 10 for i in range(0, n_rows)], + } + ) + ingestion_id2 = client.ingest(fs2, features_2_df) + + entity_df = pd.DataFrame( + { + "datetime": [time_offset] * n_rows, + "entity_id": [id_offset + i for i in range(n_rows)], + "other_entity_id": [id_offset + i for i in range(n_rows)], + } + ) + + time.sleep(15) # wait for rows to get written to bq + while True: + rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1) + 
rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2) + if rows_ingested1 == len(features_1_df) and rows_ingested2 == len( + features_2_df + ): + print( + f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing." + ) + break + time.sleep(30) + + feature_retrieval_job = client.get_historical_features( + entity_rows=entity_df, + feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], + project=PROJECT_NAME, + compute_statistics=True, + ) + output = feature_retrieval_job.to_dataframe(timeout_sec=180) + print(output.head(10)) + stats = feature_retrieval_job.statistics(timeout_sec=180) + clear_unsupported_fields(stats) + + expected_stats = tfdv.generate_statistics_from_dataframe( + output[["feature_value6", "feature_set_2__other_feature_value7"]] + ) + clear_unsupported_fields(expected_stats) + + # Since TFDV computes population std dev + for feature in expected_stats.datasets[0].features: + if feature.HasField("num_stats"): + name = feature.path.step[0] + std = output[name].std() + feature.num_stats.std_dev = std + + assert_stats_equal(expected_stats, stats) + clean_up_remote_files(feature_retrieval_job.get_avro_files()) + + +def get_rows_ingested( + client: Client, feature_set: FeatureSet, ingestion_id: str +) -> int: + response = client._core_service.ListStores( + ListStoresRequest(filter=ListStoresRequest.Filter(name="historical")) + ) + bq_config = response.store[0].bigquery_config + project = bq_config.project_id + dataset = bq_config.dataset_id + table = f"{PROJECT_NAME}_{feature_set.name}" + + bq_client = bigquery.Client(project=project) + rows = bq_client.query( + f'SELECT COUNT(*) as count FROM `{project}.{dataset}.{table}` WHERE ingestion_id = "{ingestion_id}"' + ).result() + + return list(rows)[0]["count"] + + +def clean_up_remote_files(files): + storage_client = storage.Client() + for file_uri in files: + if file_uri.scheme == "gs": + blob = Blob.from_string(file_uri.geturl(), client=storage_client) + blob.delete() diff --git a/tests/e2e/bq/feature-stats.py b/tests/e2e/bq/feature-stats.py new file mode 100644 index 0000000000..226dc358f1 --- /dev/null +++ b/tests/e2e/bq/feature-stats.py @@ -0,0 +1,256 @@ +import os +import time +import uuid +from datetime import datetime, timedelta + +import pandas as pd +import pytest +import pytz +import tensorflow_data_validation as tfdv +from google.protobuf.duration_pb2 import Duration + +from bq.testutils import ( + assert_stats_equal, + clear_unsupported_agg_fields, + clear_unsupported_fields, +) +from feast.client import Client +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_set import FeatureSet +from feast.type_map import ValueType + +pd.set_option("display.max_columns", None) + +PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] +STORE_NAME = "historical" +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + + +@pytest.fixture(scope="module") +def core_url(pytestconfig): + return pytestconfig.getoption("core_url") + + +@pytest.fixture(scope="module") +def serving_url(pytestconfig): + return pytestconfig.getoption("serving_url") + + +@pytest.fixture(scope="module") +def allow_dirty(pytestconfig): + return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False + + +@pytest.fixture(scope="module") +def gcs_path(pytestconfig): + return pytestconfig.getoption("gcs_path") + + +@pytest.fixture(scope="module") +def client(core_url, allow_dirty): + # Get client for core and serving + client = Client(core_url=core_url) + 
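    # Register a fresh, randomly suffixed project (PROJECT_NAME above) for this
    # module so statistics left over from earlier runs cannot leak into the assertions.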
client.create_project(PROJECT_NAME) + client.set_project(PROJECT_NAME) + + # Ensure Feast core is active, but empty + if not allow_dirty: + feature_sets = client.list_feature_sets() + if len(feature_sets) > 0: + raise Exception( + "Feast cannot have existing feature sets registered. Exiting tests." + ) + + return client + + +@pytest.fixture(scope="module") +def feature_stats_feature_set(client): + fv_fs = FeatureSet( + "feature_stats", + features=[ + Feature("strings", ValueType.STRING), + Feature("ints", ValueType.INT64), + Feature("floats", ValueType.FLOAT), + ], + entities=[Entity("entity_id", ValueType.INT64)], + max_age=Duration(seconds=100), + ) + client.apply(fv_fs) + return fv_fs + + +@pytest.fixture(scope="module") +def feature_stats_dataset_basic(client, feature_stats_feature_set): + + n_rows = 20 + + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + df = pd.DataFrame( + { + "datetime": [time_offset] * n_rows, + "entity_id": [i for i in range(n_rows)], + "strings": ["a", "b"] * int(n_rows / 2), + "ints": [int(i) for i in range(n_rows)], + "floats": [10.5 - i for i in range(n_rows)], + } + ) + + expected_stats = tfdv.generate_statistics_from_dataframe( + df[["strings", "ints", "floats"]] + ) + clear_unsupported_fields(expected_stats) + + # Since TFDV computes population std dev + for feature in expected_stats.datasets[0].features: + if feature.HasField("num_stats"): + name = feature.path.step[0] + std = df[name].std() + feature.num_stats.std_dev = std + + ingestion_id = client.ingest(feature_stats_feature_set, df) + time.sleep(10) + return { + "df": df, + "id": ingestion_id, + "date": datetime(time_offset.year, time_offset.month, time_offset.day).replace( + tzinfo=pytz.utc + ), + "stats": expected_stats, + } + + +@pytest.fixture(scope="module") +def feature_stats_dataset_agg(client, feature_stats_feature_set): + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + start_date = time_offset - timedelta(days=10) + end_date = time_offset - timedelta(days=7) + df1 = pd.DataFrame( + { + "datetime": [start_date] * 5, + "entity_id": [i for i in range(5)], + "strings": ["a", "b", "b", "b", "a"], + "ints": [4, 3, 2, 6, 3], + "floats": [2.1, 5.2, 4.3, 0.6, 0.1], + } + ) + ingestion_id_1 = client.ingest(feature_stats_feature_set, df1) + df2 = pd.DataFrame( + { + "datetime": [start_date + timedelta(days=1)] * 3, + "entity_id": [i for i in range(3)], + "strings": ["a", "b", "c"], + "ints": [2, 6, 7], + "floats": [1.6, 2.4, 2], + } + ) + ingestion_id_2 = client.ingest(feature_stats_feature_set, df2) + + combined_df = pd.concat([df1, df2])[["strings", "ints", "floats"]] + expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) + clear_unsupported_agg_fields(expected_stats) + + # Since TFDV computes population std dev + for feature in expected_stats.datasets[0].features: + if feature.HasField("num_stats"): + name = feature.path.step[0] + std = combined_df[name].std() + feature.num_stats.std_dev = std + + time.sleep(10) + + return { + "ids": [ingestion_id_1, ingestion_id_2], + "start_date": datetime( + start_date.year, start_date.month, start_date.day + ).replace(tzinfo=pytz.utc), + "end_date": datetime(end_date.year, end_date.month, end_date.day).replace( + tzinfo=pytz.utc + ), + "stats": expected_stats, + } + + +def test_feature_stats_retrieval_by_single_dataset(client, feature_stats_dataset_basic): + stats = client.get_statistics( + "feature_stats", + features=["strings", "ints", "floats"], + store=STORE_NAME, + ingestion_ids=[feature_stats_dataset_basic["id"]], + ) 
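    # get_statistics should return TFDV-style dataset statistics computed by the
    # historical store for the given ingestion id; assert_stats_equal (bq/testutils.py)
    # then diffs them against the locally generated TFDV baseline from the fixture.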
+ + assert_stats_equal(feature_stats_dataset_basic["stats"], stats) + + +def test_feature_stats_by_date(client, feature_stats_dataset_basic): + stats = client.get_statistics( + "feature_stats", + features=["strings", "ints", "floats"], + store=STORE_NAME, + start_date=feature_stats_dataset_basic["date"], + end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), + ) + assert_stats_equal(feature_stats_dataset_basic["stats"], stats) + + +def test_feature_stats_agg_over_datasets(client, feature_stats_dataset_agg): + stats = client.get_statistics( + "feature_stats", + features=["strings", "ints", "floats"], + store=STORE_NAME, + ingestion_ids=feature_stats_dataset_agg["ids"], + ) + assert_stats_equal(feature_stats_dataset_agg["stats"], stats) + + +def test_feature_stats_agg_over_dates(client, feature_stats_dataset_agg): + stats = client.get_statistics( + "feature_stats", + features=["strings", "ints", "floats"], + store=STORE_NAME, + start_date=feature_stats_dataset_agg["start_date"], + end_date=feature_stats_dataset_agg["end_date"], + ) + assert_stats_equal(feature_stats_dataset_agg["stats"], stats) + + +def test_feature_stats_force_refresh( + client, feature_stats_dataset_basic, feature_stats_feature_set +): + df = feature_stats_dataset_basic["df"] + + df2 = pd.DataFrame( + { + "datetime": [df.iloc[0].datetime], + "entity_id": [10], + "strings": ["c"], + "ints": [2], + "floats": [1.3], + } + ) + client.ingest(feature_stats_feature_set, df2) + time.sleep(10) + + actual_stats = client.get_statistics( + "feature_stats", + features=["strings", "ints", "floats"], + store="historical", + start_date=feature_stats_dataset_basic["date"], + end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), + force_refresh=True, + ) + + combined_df = pd.concat([df, df2]) + expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) + + clear_unsupported_fields(expected_stats) + + # Since TFDV computes population std dev + for feature in expected_stats.datasets[0].features: + if feature.HasField("num_stats"): + name = feature.path.step[0] + std = combined_df[name].std() + feature.num_stats.std_dev = std + + assert_stats_equal(expected_stats, actual_stats) diff --git a/tests/e2e/bq/testutils.py b/tests/e2e/bq/testutils.py new file mode 100644 index 0000000000..9ac678bc59 --- /dev/null +++ b/tests/e2e/bq/testutils.py @@ -0,0 +1,55 @@ +from deepdiff import DeepDiff +from google.protobuf.json_format import MessageToDict + + +def clear_unsupported_fields(datasets): + dataset = datasets.datasets[0] + for feature in dataset.features: + if feature.HasField("num_stats"): + feature.num_stats.common_stats.ClearField("num_values_histogram") + # Since difference in how BQ and TFDV compute histogram values make them + # approximate but uncomparable + feature.num_stats.ClearField("histograms") + elif feature.HasField("string_stats"): + feature.string_stats.common_stats.ClearField("num_values_histogram") + for bucket in feature.string_stats.rank_histogram.buckets: + bucket.ClearField("low_rank") + bucket.ClearField("high_rank") + elif feature.HasField("struct_stats"): + feature.string_stats.struct_stats.ClearField("num_values_histogram") + elif feature.HasField("bytes_stats"): + feature.string_stats.bytes_stats.ClearField("num_values_histogram") + + +def clear_unsupported_agg_fields(datasets): + dataset = datasets.datasets[0] + for feature in dataset.features: + if feature.HasField("num_stats"): + feature.num_stats.common_stats.ClearField("num_values_histogram") + 
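            # Histograms and medians are approximated differently by BigQuery and
            # TFDV, so drop them as well before comparing aggregated statistics.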
feature.num_stats.ClearField("histograms") + feature.num_stats.ClearField("median") + elif feature.HasField("string_stats"): + feature.string_stats.common_stats.ClearField("num_values_histogram") + feature.string_stats.ClearField("rank_histogram") + feature.string_stats.ClearField("top_values") + feature.string_stats.ClearField("unique") + elif feature.HasField("struct_stats"): + feature.struct_stats.ClearField("num_values_histogram") + elif feature.HasField("bytes_stats"): + feature.bytes_stats.ClearField("num_values_histogram") + feature.bytes_stats.ClearField("unique") + + +def assert_stats_equal(left, right): + left_stats = MessageToDict(left)["datasets"][0] + right_stats = MessageToDict(right)["datasets"][0] + assert ( + left_stats["numExamples"] == right_stats["numExamples"] + ), f"Number of examples do not match. Expected {left_stats['numExamples']}, got {right_stats['numExamples']}" + + left_features = sorted(left_stats["features"], key=lambda k: k["path"]["step"][0]) + right_features = sorted(right_stats["features"], key=lambda k: k["path"]["step"][0]) + diff = DeepDiff(left_features, right_features, significant_digits=3) + assert ( + len(diff) == 0 + ), f"Feature statistics do not match: \nwanted: {left_features}\n got: {right_features}" From 7e9ad2bac86fdc517804fdce3e889bcf62310234 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 14:46:03 +0800 Subject: [PATCH 09/34] Fix pytest redis Signed-off-by: Terence --- sdk/python/feast/feature_table.py | 4 +++- tests/e2e/redis/parallel-ingest-redis-serving.py | 16 ++++------------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index ebe69e7fad..eafa4260ef 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -80,7 +80,9 @@ def __eq__(self, other): ): return False - if self.entities != other.entities: + if sorted(self.entities) != sorted(other.entities): + return False + if self.features != other.features: return False if self.batch_source != other.batch_source: return False diff --git a/tests/e2e/redis/parallel-ingest-redis-serving.py b/tests/e2e/redis/parallel-ingest-redis-serving.py index 7a213dc8af..dd98f66063 100644 --- a/tests/e2e/redis/parallel-ingest-redis-serving.py +++ b/tests/e2e/redis/parallel-ingest-redis-serving.py @@ -80,15 +80,11 @@ def test_discovery(self, client): # GetFeatureTable Check actual_get_feature_table = client.get_feature_table(name="dev_featuretable") - assert actual_get_feature_table.name == self.basic_ft_spec.name - assert actual_get_feature_table.entities == self.basic_ft_spec.entities - assert actual_get_feature_table.features == self.basic_ft_spec.features + assert actual_get_feature_table == self.basic_ft_spec # ListFeatureTables Check actual_list_feature_table = client.list_feature_tables()[0] - assert actual_list_feature_table.name == self.basic_ft_spec.name - assert actual_list_feature_table.entities == self.basic_ft_spec.entities - assert actual_list_feature_table.features == self.basic_ft_spec.features + assert actual_list_feature_table == self.basic_ft_spec def test_basic_retrieval(self, client): # TODO: Add ingest and retrieval check @@ -174,15 +170,11 @@ def test_discovery(self, client): # GetFeatureTable Check actual_get_feature_table = client.get_feature_table(name="alltypes") - assert actual_get_feature_table.name == self.alltypes_ft_spec.name - assert actual_get_feature_table.entities == self.alltypes_ft_spec.entities - assert actual_get_feature_table.features 
== self.alltypes_ft_spec.features + assert actual_get_feature_table == self.alltypes_ft_spec # ListFeatureTables Check actual_list_feature_table = client.list_feature_tables()[0] - assert actual_list_feature_table.name == self.alltypes_ft_spec.name - assert actual_list_feature_table.entities == self.alltypes_ft_spec.entities - assert actual_list_feature_table.features == self.alltypes_ft_spec.features + assert actual_list_feature_table == self.alltypes_ft_spec def test_alltypes_retrieval(self, client): # TODO: Add ingest and retrieval check From f7c5e316f41a87dac0d042f3424361c1faf62213 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 15:57:53 +0800 Subject: [PATCH 10/34] Remove unused batch tests Signed-off-by: Terence --- .prow/config.yaml | 91 -- Makefile | 2 +- .../scripts/test-end-to-end-batch-dataflow.sh | 307 ------- infra/scripts/test-end-to-end-batch.sh | 153 ---- tests/e2e/bq/bq-batch-retrieval.py | 819 ------------------ tests/e2e/bq/feature-stats.py | 256 ------ tests/e2e/bq/testutils.py | 55 -- 7 files changed, 1 insertion(+), 1682 deletions(-) delete mode 100755 infra/scripts/test-end-to-end-batch-dataflow.sh delete mode 100755 infra/scripts/test-end-to-end-batch.sh delete mode 100644 tests/e2e/bq/bq-batch-retrieval.py delete mode 100644 tests/e2e/bq/feature-stats.py delete mode 100644 tests/e2e/bq/testutils.py diff --git a/.prow/config.yaml b/.prow/config.yaml index d2269fcc6d..39c275603d 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -254,97 +254,6 @@ presubmits: branches: - ^v0\.(3|4)-branch$ - - name: test-end-to-end-batch - decorate: true - always_run: true - spec: - volumes: - - name: service-account - secret: - secretName: feast-service-account - containers: - - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end-batch.sh"] - resources: - requests: - cpu: "6" - memory: "6144Mi" - volumeMounts: - - name: service-account - mountPath: "/etc/service-account" - skip_branches: - - ^v0\.(3|4)-branch$ - - - name: test-end-to-end-batch-fs-update - decorate: true - always_run: false - spec: - volumes: - - name: service-account - secret: - secretName: feast-service-account - containers: - - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end-batch.sh", "-m", "fs_update"] - resources: - requests: - cpu: "6" - memory: "6144Mi" - volumeMounts: - - name: service-account - mountPath: "/etc/service-account" - skip_branches: - - ^v0\.(3|4)-branch$ - - - name: test-end-to-end-batch-java-8 - decorate: true - always_run: true - spec: - volumes: - - name: service-account - secret: - secretName: feast-service-account - containers: - - image: maven:3.6-jdk-8 - command: ["infra/scripts/test-end-to-end-batch.sh"] - resources: - requests: - cpu: "6" - memory: "6144Mi" - volumeMounts: - - name: service-account - mountPath: "/etc/service-account" - branches: - - ^v0\.(3|4)-branch$ - - - name: test-end-to-end-batch-dataflow - decorate: true - always_run: true - spec: - volumes: - - name: service-account-df - secret: - secretName: feast-e2e-service-account - - name: docker-socket - hostPath: - path: /var/run/docker.sock - containers: - - image: google/cloud-sdk:302.0.0 - command: ["infra/scripts/test-end-to-end-batch-dataflow.sh"] - resources: - requests: - cpu: "6" - memory: "6144Mi" - volumeMounts: - - name: service-account-df - mountPath: "/etc/service-account-df" - - name: docker-socket - mountPath: /var/run/docker.sock - securityContext: - privileged: true - skip_branches: - - ^v0\.(3|4)-branch$ - postsubmits: feast-dev/feast: - name: 
publish-python-sdk diff --git a/Makefile b/Makefile index 8fffe20816..f159ad624d 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ lint-python: cd ${ROOT_DIR}/sdk/python; flake8 feast/ tests/ cd ${ROOT_DIR}/sdk/python; black --check feast tests - cd ${ROOT_DIR}/tests/e2e; mypy bq/ redis/ + cd ${ROOT_DIR}/tests/e2e; mypy redis/ cd ${ROOT_DIR}/tests/e2e; isort . --check-only cd ${ROOT_DIR}/tests/e2e; flake8 . cd ${ROOT_DIR}/tests/e2e; black --check . diff --git a/infra/scripts/test-end-to-end-batch-dataflow.sh b/infra/scripts/test-end-to-end-batch-dataflow.sh deleted file mode 100755 index 363ba7dc47..0000000000 --- a/infra/scripts/test-end-to-end-batch-dataflow.sh +++ /dev/null @@ -1,307 +0,0 @@ -#!/usr/bin/env bash -echo "Preparing environment variables..." - -set -e -set -o pipefail - -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account-df/service-account-df.json" -test -z ${GCLOUD_PROJECT} && GCLOUD_PROJECT="kf-feast" -test -z ${GCLOUD_REGION} && GCLOUD_REGION="us-central1" -test -z ${GCLOUD_NETWORK} && GCLOUD_NETWORK="default" -test -z ${GCLOUD_SUBNET} && GCLOUD_SUBNET="default" -test -z ${TEMP_BUCKET} && TEMP_BUCKET="kf-feast-dataflow-temp" -test -z ${K8_CLUSTER_NAME} && K8_CLUSTER_NAME="feast-e2e-dataflow" -test -z ${HELM_RELEASE_NAME} && HELM_RELEASE_NAME="pr-$PULL_NUMBER" -test -z ${HELM_COMMON_NAME} && HELM_COMMON_NAME="deps" -test -z ${DATASET_NAME} && DATASET_NAME=feast_e2e_$(date +%s) -test -z ${SPECS_TOPIC} && SPECS_TOPIC=feast-specs-$(date +%s) -test -z ${FEATURES_TOPIC} && FEATURES_TOPIC=feast-$(date +%s) - - -feast_kafka_1_ip_name="feast-kafka-1" -feast_kafka_2_ip_name="feast-kafka-2" -feast_kafka_3_ip_name="feast-kafka-3" -feast_redis_ip_name="feast-redis" -feast_statsd_ip_name="feast-statsd" - -echo " -This script will run end-to-end tests for Feast Core and Batch Serving using Dataflow Runner. - -1. Setup K8s cluster (optional, if it was not created before) -2. Reuse existing IP addresses or generate new ones for stateful services -3. Install stateful services (kafka, redis, postgres, etc) (optional) -4. Build core & serving docker images (optional) -5. Create temporary BQ table for Feast Serving. -6. Rollout target images to cluster via helm in dedicated namespace (pr-{number}) -7. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from - tests/e2e via pytest. -8. Tear down feast services, keep stateful services. 
-" - -ORIGINAL_DIR=$(pwd) -echo $ORIGINAL_DIR - -echo "Environment:" -printenv - -export GOOGLE_APPLICATION_CREDENTIALS -gcloud auth activate-service-account --key-file ${GOOGLE_APPLICATION_CREDENTIALS} -gcloud -q auth configure-docker - -gcloud config set project ${GCLOUD_PROJECT} -gcloud config set compute/region ${GCLOUD_REGION} -gcloud config list - -apt-get -qq update -apt-get -y install wget build-essential gettext-base curl - -curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 -chmod 700 $ORIGINAL_DIR/get_helm.sh -$ORIGINAL_DIR/get_helm.sh - - -function getPublicAddresses() { - existing_addresses=$(gcloud compute addresses list --filter="region:($GCLOUD_REGION) name:kafka" --format "list(name)") - if [[ -z "$existing_addresses" ]]; then - echo " -============================================================ -Reserving IP addresses for Feast dependencies -============================================================ -" - - gcloud compute addresses create \ - $feast_kafka_1_ip_name $feast_kafka_2_ip_name $feast_kafka_3_ip_name $feast_redis_ip_name $feast_statsd_ip_name \ - --region ${GCLOUD_REGION} --subnet ${GCLOUD_SUBNET} - fi - - - export feast_kafka_1_ip=$(gcloud compute addresses describe $feast_kafka_1_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_kafka_2_ip=$(gcloud compute addresses describe $feast_kafka_2_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_kafka_3_ip=$(gcloud compute addresses describe $feast_kafka_3_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_redis_ip=$(gcloud compute addresses describe $feast_redis_ip_name --region=${GCLOUD_REGION} --format "value(address)") - export feast_statsd_ip=$(gcloud compute addresses describe $feast_statsd_ip_name --region=${GCLOUD_REGION} --format "value(address)") -} - -function createKubeCluster() { - echo " -============================================================ -Creating GKE nodepool for Feast e2e test with DataflowRunner -============================================================ -" - gcloud container clusters create ${K8_CLUSTER_NAME} --region ${GCLOUD_REGION} \ - --enable-cloud-logging \ - --enable-cloud-monitoring \ - --network ${GCLOUD_NETWORK} \ - --subnetwork ${GCLOUD_SUBNET} \ - --scopes https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,\ -https://www.googleapis.com/auth/monitoring,https://www.googleapis.com/auth/service.management.readonly,\ -https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/trace.append,\ -https://www.googleapis.com/auth/bigquery \ - --machine-type n1-standard-2 - - echo " -============================================================ -Create feast-postgres-database Secret in GKE nodepool -============================================================ -" - kubectl create secret generic feast-postgresql --from-literal=postgresql-password=password - - echo " -============================================================ -Create feast-gcp-service-account Secret in GKE nodepool -============================================================ -" - cd $ORIGINAL_DIR/infra/scripts - kubectl create secret generic feast-gcp-service-account --from-file=credentials.json=${GOOGLE_APPLICATION_CREDENTIALS} -} - -function installDependencies() { - echo " -============================================================ -Helm install common parts (kafka, redis, etc) 
-============================================================ -" - cd $ORIGINAL_DIR/infra/charts/feast - - helm install --replace --wait --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ - --set "feast-core.enabled=false" \ - --set "feast-online-serving.enabled=false" \ - --set "feast-batch-serving.enabled=false" \ - --set "postgresql.enabled=false" - "$HELM_COMMON_NAME" . - -} - -function buildAndPushImage() -{ - echo docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR - docker build -t $1:$2 --build-arg REVISION=$2 -f $3 $ORIGINAL_DIR - docker push $1:$2 -} - -function buildTarget() { - buildAndPushImage "gcr.io/kf-feast/feast-core" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/core/Dockerfile" - buildAndPushImage "gcr.io/kf-feast/feast-serving" "$PULL_NUMBER" "$ORIGINAL_DIR/infra/docker/serving/Dockerfile" -} - -function installTarget() { - echo " -============================================================ -Helm install feast -============================================================ -" - cd $ORIGINAL_DIR/infra/charts/feast - - helm install --wait --timeout 300s --debug --values="values-end-to-end-batch-dataflow-updated.yaml" \ - --set "kafka.enabled=false" \ - --set "redis.enabled=false" \ - --set "prometheus-statsd-exporter.enabled=false" \ - --set "prometheus.enabled=false" \ - "$HELM_RELEASE_NAME" . - -} - -function clean() { - echo " - ============================================================ - Cleaning up - ============================================================ - " - cd $ORIGINAL_DIR/tests/e2e - - # Remove BQ Dataset - bq rm -r -f ${GCLOUD_PROJECT}:${DATASET_NAME} - - # Uninstall helm release before clearing PVCs - helm uninstall ${HELM_RELEASE_NAME} - - kubectl delete pvc data-${HELM_RELEASE_NAME}-postgresql-0 - - # Stop Dataflow jobs from retrieved Dataflow job ids in ingesting_jobs.txt - if [ -f ingesting_jobs.txt ]; then - while read line - do - echo $line - gcloud dataflow jobs cancel $line --region=${GCLOUD_REGION} - done < ingesting_jobs.txt - fi -} - -# 1. -existing_cluster=$(gcloud container clusters list --format "list(name)" --filter "name:$K8_CLUSTER_NAME") -if [[ -z $existing_cluster ]]; then - createKubeCluster "$@" -else - gcloud container clusters get-credentials $K8_CLUSTER_NAME --region $GCLOUD_REGION --project $GCLOUD_PROJECT -fi - -# 2. 
-getPublicAddresses "$@" - -echo " -============================================================ -Export required environment variables -============================================================ -" - -export TEMP_BUCKET=$TEMP_BUCKET/$HELM_RELEASE_NAME/$(date +%s) -export DATASET_NAME=$DATASET_NAME -export GCLOUD_PROJECT=$GCLOUD_PROJECT -export GCLOUD_NETWORK=$GCLOUD_NETWORK -export GCLOUD_SUBNET=$GCLOUD_SUBNET -export GCLOUD_REGION=$GCLOUD_REGION -export HELM_COMMON_NAME=$HELM_COMMON_NAME -export IMAGE_TAG=$PULL_PULL_SHA -export SPECS_TOPIC=$SPECS_TOPIC -export FEATURES_TOPIC=$FEATURES_TOPIC - -export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) -export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts -source ${SCRIPTS_DIR}/setup-common-functions.sh - -wait_for_docker_image gcr.io/kf-feast/feast-core:"${IMAGE_TAG}" -wait_for_docker_image gcr.io/kf-feast/feast-serving:"${IMAGE_TAG}" - -envsubst $'$TEMP_BUCKET $DATASET_NAME $GCLOUD_PROJECT $GCLOUD_NETWORK $SPECS_TOPIC $FEATURES_TOPIC \ - $GCLOUD_SUBNET $GCLOUD_REGION $IMAGE_TAG $HELM_COMMON_NAME $feast_kafka_1_ip - $feast_kafka_2_ip $feast_kafka_3_ip $feast_redis_ip $feast_statsd_ip' < $ORIGINAL_DIR/infra/scripts/test-templates/values-end-to-end-batch-dataflow.yaml > $ORIGINAL_DIR/infra/charts/feast/values-end-to-end-batch-dataflow-updated.yaml - - -# 3. -existing_deps=$(helm list --filter deps -q) -if [[ -z $existing_deps ]]; then - installDependencies "$@" -fi - -# 4. -# buildTarget "$@" - -# 5. -echo " -============================================================ -Creating temp BQ table for Feast Serving -============================================================ -" - -bq --location=US --project_id=${GCLOUD_PROJECT} mk \ - --dataset \ - --default_table_expiration 86400 \ - ${GCLOUD_PROJECT}:${DATASET_NAME} - - -# 6. - -set +e -installTarget "$@" - -# 7. -echo " -============================================================ -Installing Python 3.7 with Miniconda and Feast SDK -============================================================ -" -cd $ORIGINAL_DIR -# Install Python 3.7 with Miniconda -wget -q https://repo.continuum.io/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh \ - -O /tmp/miniconda.sh -bash /tmp/miniconda.sh -b -p /root/miniconda -f -/root/miniconda/bin/conda init -source ~/.bashrc - -# Install Feast Python SDK and test requirements -cd $ORIGINAL_DIR -make compile-protos-python -pip install -qe sdk/python -pip install -qr tests/e2e/requirements.txt - -echo " -============================================================ -Running end-to-end tests with pytest at 'tests/e2e' -============================================================ -" -# Default artifact location setting in Prow jobs -LOGS_ARTIFACT_PATH=/logs/artifacts - -cd $ORIGINAL_DIR/tests/e2e - -core_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-core) -serving_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-batch-serving) -jobcontroller_ip=$(kubectl get -o jsonpath="{.status.loadBalancer.ingress[0].ip}" service ${HELM_RELEASE_NAME}-feast-jobcontroller) - -set +e -pytest -s -v bq/bq-batch-retrieval.py -m dataflow_runner --core_url "$core_ip:6565" --serving_url "$serving_ip:6566" \ - --jobcontroller_url "$jobcontroller_ip:6570" --gcs_path "gs://${TEMP_BUCKET}" --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml -TEST_EXIT_CODE=$? 
- -if [[ ${TEST_EXIT_CODE} != 0 ]]; then - echo "[DEBUG] Printing logs" - ls -ltrh /var/log/feast* - cat /var/log/feast-serving-warehouse.log /var/log/feast-core.log - - echo "[DEBUG] Printing Python packages list" - pip list -else - clean "$@" -fi - -exit ${TEST_EXIT_CODE} diff --git a/infra/scripts/test-end-to-end-batch.sh b/infra/scripts/test-end-to-end-batch.sh deleted file mode 100755 index c741fe7168..0000000000 --- a/infra/scripts/test-end-to-end-batch.sh +++ /dev/null @@ -1,153 +0,0 @@ -#!/usr/bin/env bash - -set -e -set -o pipefail - -PYTEST_MARK='direct_runner' #default - -print_usage() { - printf "Usage: ./test-end-to-end-batch -m pytest_mark" -} - -while getopts 'm:' flag; do - case "${flag}" in - m) PYTEST_MARK="${OPTARG}" ;; - *) print_usage - exit 1 ;; - esac -done - -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account/service-account.json" -test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" -test -z ${GOOGLE_CLOUD_PROJECT} && GOOGLE_CLOUD_PROJECT="kf-feast" -test -z ${TEMP_BUCKET} && TEMP_BUCKET="feast-templocation-kf-feast" -test -z ${JOBS_STAGING_LOCATION} && JOBS_STAGING_LOCATION="gs://${TEMP_BUCKET}/staging-location/$(date +%s)" - -# Get the current build version using maven (and pom.xml) -export FEAST_BUILD_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout) -echo Building version: $FEAST_BUILD_VERSION - -# Get Feast project repository root and scripts directory -export PROJECT_ROOT_DIR=$(git rev-parse --show-toplevel) -export SCRIPTS_DIR=${PROJECT_ROOT_DIR}/infra/scripts - -echo " -This script will run end-to-end tests for Feast Core and Batch Serving. - -1. Install gcloud SDK -2. Install Redis as the job store for Feast Batch Serving. -4. Install Postgres for persisting Feast metadata. -5. Install Kafka and Zookeeper as the Source in Feast. -6. Install Python 3.7.4, Feast Python SDK and run end-to-end tests from - tests/e2e via pytest. -" - -source ${SCRIPTS_DIR}/setup-common-functions.sh - -install_test_tools -install_gcloud_sdk -install_and_start_local_redis -install_and_start_local_postgres -install_and_start_local_zookeeper_and_kafka - -if [[ ${SKIP_BUILD_JARS} != "true" ]]; then - build_feast_core_and_serving -else - echo "[DEBUG] Skipping building jars" -fi - -DATASET_NAME=feast_$(date +%s) -bq --location=US --project_id=${GOOGLE_CLOUD_PROJECT} mk \ - --dataset \ - --default_table_expiration 86400 \ - ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} - -# Start Feast Core in background -cat < /tmp/jc.warehouse.application.yml -feast: - core-host: localhost - core-port: 6565 - jobs: - polling_interval_milliseconds: 10000 - active_runner: direct - consolidate-jobs-per-source: true - runners: - - name: direct - type: DirectRunner - options: - tempLocation: gs://${TEMP_BUCKET}/tempLocation - -EOF - -cat < /tmp/serving.warehouse.application.yml -feast: - # GRPC service address for Feast Core - # Feast Serving requires connection to Feast Core to retrieve and reload Feast metadata (e.g. FeatureSpecs, Store information) - core-host: localhost - core-grpc-port: 6565 - - # Indicates the active store. Only a single store in the last can be active at one time. 
In the future this key - # will be deprecated in order to allow multiple stores to be served from a single serving instance - active_store: historical - - # List of store configurations - stores: - - name: historical - type: BIGQUERY - config: - project_id: ${GOOGLE_CLOUD_PROJECT} - dataset_id: ${DATASET_NAME} - staging_location: ${JOBS_STAGING_LOCATION} - initial_retry_delay_seconds: 1 - total_timeout_seconds: 21600 - write_triggering_frequency_seconds: 1 - subscriptions: - - name: "*" - project: "*" - version: "*" - - job_store: - redis_host: localhost - redis_port: 6379 - - tracing: - enabled: false - -server: - port: 8081 - -EOF - -cat /tmp/jc.warehouse.application.yml /tmp/serving.warehouse.application.yml - -start_feast_core -start_feast_jobcontroller /tmp/jc.warehouse.application.yml -start_feast_serving /tmp/serving.warehouse.application.yml - -install_python_with_miniconda_and_feast_sdk - -print_banner "Running end-to-end tests with pytest at 'tests/e2e'" -# Default artifact location setting in Prow jobs -LOGS_ARTIFACT_PATH=/logs/artifacts - -ORIGINAL_DIR=$(pwd) -cd tests/e2e - -set +e -pytest bq/* -v -m ${PYTEST_MARK} --gcs_path ${JOBS_STAGING_LOCATION} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml -TEST_EXIT_CODE=$? - -if [[ ${TEST_EXIT_CODE} != 0 ]]; then - echo "[DEBUG] Printing logs" - ls -ltrh /var/log/feast* - cat /var/log/feast-serving-online.log /var/log/feast-core.log /var/log/feast-jobcontroller.log - - echo "[DEBUG] Printing Python packages list" - pip list -else - print_banner "Cleaning up" - - bq rm -r -f ${GOOGLE_CLOUD_PROJECT}:${DATASET_NAME} -fi - -exit ${TEST_EXIT_CODE} diff --git a/tests/e2e/bq/bq-batch-retrieval.py b/tests/e2e/bq/bq-batch-retrieval.py deleted file mode 100644 index 2d94d2e6cf..0000000000 --- a/tests/e2e/bq/bq-batch-retrieval.py +++ /dev/null @@ -1,819 +0,0 @@ -import math -import os -import random -import time -import uuid -from datetime import datetime, timedelta -from urllib.parse import urlparse - -import numpy as np -import pandas as pd -import pytest -import pytz -import tensorflow_data_validation as tfdv -from google.cloud import bigquery, storage -from google.cloud.storage import Blob -from google.protobuf.duration_pb2 import Duration -from pandavro import to_avro - -from bq.testutils import assert_stats_equal, clear_unsupported_fields -from feast.client import Client -from feast.contrib.job_controller.client import Client as JCClient -from feast.core.CoreService_pb2 import ListStoresRequest -from feast.core.FeatureSet_pb2 import FeatureSetStatus -from feast.core.IngestionJob_pb2 import IngestionJobStatus -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_set import FeatureSet -from feast.type_map import ValueType -from feast.wait import wait_retry_backoff - -pd.set_option("display.max_columns", None) - -PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] - - -@pytest.fixture(scope="module") -def core_url(pytestconfig): - return pytestconfig.getoption("core_url") - - -@pytest.fixture(scope="module") -def serving_url(pytestconfig): - return pytestconfig.getoption("serving_url") - - -@pytest.fixture(scope="module") -def jobcontroller_url(pytestconfig): - return pytestconfig.getoption("jobcontroller_url") - - -@pytest.fixture(scope="module") -def allow_dirty(pytestconfig): - return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False - - -@pytest.fixture(scope="module") -def gcs_path(pytestconfig): - return pytestconfig.getoption("gcs_path") - - 
-@pytest.fixture(scope="module") -def client(core_url, serving_url, allow_dirty): - # Get client for core and serving - client = Client(core_url=core_url, serving_url=serving_url) - client.create_project(PROJECT_NAME) - client.set_project(PROJECT_NAME) - - # Ensure Feast core is active, but empty - if not allow_dirty: - feature_sets = client.list_feature_sets() - if len(feature_sets) > 0: - raise Exception( - "Feast cannot have existing feature sets registered. Exiting tests." - ) - - return client - - -def wait_for(fn, timeout: timedelta, sleep=5): - until = datetime.now() + timeout - last_exc = BaseException() - - while datetime.now() <= until: - try: - fn() - except Exception as exc: - last_exc = exc - else: - return - time.sleep(sleep) - - raise last_exc - - -@pytest.mark.first -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=1) -def test_batch_apply_all_featuresets(client): - client.set_project(PROJECT_NAME) - - file_fs1 = FeatureSet( - "file_feature_set", - features=[Feature("feature_value1", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(file_fs1) - - gcs_fs1 = FeatureSet( - "gcs_feature_set", - features=[Feature("feature_value2", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(gcs_fs1) - - proc_time_fs = FeatureSet( - "processing_time", - features=[Feature("feature_value3", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(proc_time_fs) - - add_cols_fs = FeatureSet( - "additional_columns", - features=[Feature("feature_value4", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(add_cols_fs) - - historical_fs = FeatureSet( - "historical", - features=[Feature("feature_value5", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(historical_fs) - - fs1 = FeatureSet( - "feature_set_1", - features=[Feature("feature_value6", ValueType.STRING)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - - fs2 = FeatureSet( - "feature_set_2", - features=[Feature("other_feature_value7", ValueType.INT64)], - entities=[Entity("other_entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(fs1) - client.apply(fs2) - - no_max_age_fs = FeatureSet( - "no_max_age", - features=[Feature("feature_value8", ValueType.INT64)], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=0), - ) - client.apply(no_max_age_fs) - - -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=10) -def test_batch_get_historical_features_with_file(client): - file_fs1 = client.get_feature_set(name="file_feature_set") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value1": [f"{i}" for i in range(N_ROWS)], - } - ) - - # feature set may be ready (direct runner set ready right after job submitted), - # but kafka consumer is not configured - # give some time to warm up ingestion job - wait_retry_backoff( - retry_fn=( - lambda: ( - None, - client.get_feature_set(name="file_feature_set").status - == FeatureSetStatus.STATUS_READY, - ) - ), - timeout_secs=480, - timeout_msg="Wait for 
FeatureSet to be READY", - ) - time.sleep(20) - - client.ingest(file_fs1, features_1_df, timeout=480) - - # Rename column (datetime -> event_timestamp) - features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) - - to_avro( - df=features_1_df[["event_timestamp", "entity_id"]], - file_path_or_buffer="file_feature_set.avro", - ) - - time.sleep(10) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows="file://file_feature_set.avro", - feature_refs=["feature_value1"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value1"].to_list() - ] - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=10)) - - -@pytest.mark.direct_runner -@pytest.mark.dataflow_runner -@pytest.mark.run(order=11) -def test_batch_get_historical_features_with_gs_path(client, gcs_path): - gcs_fs1 = client.get_feature_set(name="gcs_feature_set") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value2": [f"{i}" for i in range(N_ROWS)], - } - ) - client.ingest(gcs_fs1, features_1_df, timeout=360) - - # Rename column (datetime -> event_timestamp) - features_1_df = features_1_df.rename(columns={"datetime": "event_timestamp"}) - - # Output file to local - file_name = "gcs_feature_set.avro" - to_avro( - df=features_1_df[["event_timestamp", "entity_id"]], - file_path_or_buffer=file_name, - ) - - uri = urlparse(gcs_path) - bucket = uri.hostname - ts = int(time.time()) - remote_path = str(uri.path).strip("/") + f"/{ts}/{file_name}" - - # Upload file to gcs - storage_client = storage.Client(project=None) - bucket = storage_client.get_bucket(bucket) - blob = bucket.blob(remote_path) - blob.upload_from_filename(file_name) - - time.sleep(10) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=f"{gcs_path}/{ts}/*", - feature_refs=["feature_value2"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value2"].to_list() - ] - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - blob.delete() - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=12) -def test_batch_order_by_creation_time(client): - proc_time_fs = client.get_feature_set(name="processing_time") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_ROWS = 10 - incorrect_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value3": ["WRONG"] * N_ROWS, - } - ) - correct_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value3": ["CORRECT"] * N_ROWS, - } - ) - client.ingest(proc_time_fs, incorrect_df) - time.sleep(15) - client.ingest(proc_time_fs, correct_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=incorrect_df[["datetime", "entity_id"]], - feature_refs=["feature_value3"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["feature_value3"].to_list() == ["CORRECT"] * N_ROWS - - 
clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=13) -def test_batch_additional_columns_in_entity_table(client): - add_cols_fs = client.get_feature_set(name="additional_columns") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value4": ["abc"] * N_ROWS, - } - ) - client.ingest(add_cols_fs, features_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "additional_string_col": ["hello im extra"] * N_ROWS, - "additional_float_col": [random.random() for i in range(N_ROWS)], - } - ) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value4"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head(10)) - - assert np.allclose( - output["additional_float_col"], entity_df["additional_float_col"] - ) - assert ( - output["additional_string_col"].to_list() - == entity_df["additional_string_col"].to_list() - ) - assert ( - output["feature_value4"].to_list() - == features_df["feature_value4"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=14) -def test_batch_point_in_time_correctness_join(client): - historical_fs = client.get_feature_set(name="historical") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_EXAMPLES = 10 - historical_df = pd.DataFrame( - { - "datetime": [ - time_offset - timedelta(seconds=50), - time_offset - timedelta(seconds=30), - time_offset - timedelta(seconds=10), - ] - * N_EXAMPLES, - "entity_id": [i for i in range(N_EXAMPLES) for _ in range(3)], - "feature_value5": ["WRONG", "WRONG", "CORRECT"] * N_EXAMPLES, - } - ) - entity_df = pd.DataFrame( - { - "datetime": [time_offset - timedelta(seconds=10)] * N_EXAMPLES, - "entity_id": [i for i in range(N_EXAMPLES)], - } - ) - - client.ingest(historical_fs, historical_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value5"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["feature_value5"].to_list() == ["CORRECT"] * N_EXAMPLES - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=15) -def test_batch_multiple_featureset_joins(client): - fs1 = client.get_feature_set(name="feature_set_1") - fs2 = client.get_feature_set(name="feature_set_2") - - N_ROWS = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value6": [f"{i}" for i in range(N_ROWS)], - } - ) - client.ingest(fs1, features_1_df) - - features_2_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "other_entity_id": [i for i in range(N_ROWS)], - "other_feature_value7": [i for i in range(N_ROWS)], - } - ) - client.ingest(fs2, features_2_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in 
range(N_ROWS)], - "other_entity_id": [N_ROWS - 1 - i for i in range(N_ROWS)], - } - ) - - # Test retrieve with different variations of the string feature refs - # ie feature set inference for feature refs without specified feature set - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == [ - int(i) for i in output["feature_value6"].to_list() - ] - assert ( - output["other_entity_id"].to_list() - == output["feature_set_2__other_feature_value7"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=16) -def test_batch_no_max_age(client): - no_max_age_fs = client.get_feature_set(name="no_max_age") - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - N_ROWS = 10 - features_8_df = pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "entity_id": [i for i in range(N_ROWS)], - "feature_value8": [i for i in range(N_ROWS)], - } - ) - client.ingest(no_max_age_fs, features_8_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=features_8_df[["datetime", "entity_id"]], - feature_refs=["feature_value8"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head()) - - assert output["entity_id"].to_list() == output["feature_value8"].to_list() - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.fixture(scope="module", autouse=True) -def infra_teardown(pytestconfig, jobcontroller_url): - client = JCClient(jobcontroller_url=jobcontroller_url) - - marker = pytestconfig.getoption("-m") - yield marker - if marker == "dataflow_runner": - ingest_jobs = client.list_ingest_jobs() - ingest_jobs = [ - client.list_ingest_jobs(job.id)[0].external_id - for job in ingest_jobs - if job.status == IngestionJobStatus.RUNNING - ] - - cwd = os.getcwd() - with open(f"{cwd}/ingesting_jobs.txt", "w+") as output: - for job in ingest_jobs: - output.write("%s\n" % job) - else: - print("Cleaning up not required") - - -""" -This suite of tests tests the apply feature set - update feature set - retrieve -event sequence. It ensures that when a feature set is updated, tombstoned features -are no longer retrieved, and added features are null for previously ingested -rows. - -It is marked separately because of the length of time required -to perform this test, due to bigquery schema caching for streaming writes. 
-""" - - -@pytest.fixture(scope="module") -def update_featureset_dataframe(): - n_rows = 10 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - return pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [i for i in range(n_rows)], - "update_feature1": ["a" for i in range(n_rows)], - "update_feature2": [i + 2 for i in range(n_rows)], - "update_feature3": [i for i in range(n_rows)], - "update_feature4": ["b" for i in range(n_rows)], - } - ) - - -@pytest.mark.fs_update -@pytest.mark.run(order=20) -def test_update_featureset_apply_featureset_and_ingest_first_subset( - client, update_featureset_dataframe -): - subset_columns = ["datetime", "entity_id", "update_feature1", "update_feature2"] - subset_df = update_featureset_dataframe.iloc[:5][subset_columns] - update_fs = FeatureSet( - "update_fs", - entities=[Entity(name="entity_id", dtype=ValueType.INT64)], - max_age=Duration(seconds=432000), - ) - update_fs.infer_fields_from_df(subset_df) - client.apply(update_fs) - - client.ingest(feature_set=update_fs, source=subset_df) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[:5], - feature_refs=["update_feature1", "update_feature2"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head()) - - assert ( - output["update_feature1"].to_list() - == subset_df["update_feature1"].to_list() - ) - assert ( - output["update_feature2"].to_list() - == subset_df["update_feature2"].to_list() - ) - - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.fs_update -@pytest.mark.timeout(600) -@pytest.mark.run(order=21) -def test_update_featureset_update_featureset_and_ingest_second_subset( - client, update_featureset_dataframe -): - subset_columns = [ - "datetime", - "entity_id", - "update_feature1", - "update_feature3", - "update_feature4", - ] - subset_df = update_featureset_dataframe.iloc[5:][subset_columns] - update_fs = FeatureSet( - "update_fs", - entities=[Entity(name="entity_id", dtype=ValueType.INT64)], - max_age=Duration(seconds=432000), - ) - update_fs.infer_fields_from_df(subset_df) - client.apply(update_fs) - - # We keep retrying this ingestion until all values make it into the buffer. - # This is a necessary step because bigquery streaming caches table schemas - # and as a result, rows may be lost. - while True: - ingestion_id = client.ingest(feature_set=update_fs, source=subset_df) - time.sleep(15) # wait for rows to get written to bq - rows_ingested = get_rows_ingested(client, update_fs, ingestion_id) - if rows_ingested == len(subset_df): - print(f"Number of rows successfully ingested: {rows_ingested}. Continuing.") - break - print( - f"Number of rows successfully ingested: {rows_ingested}. Retrying ingestion." 
- ) - time.sleep(30) - - def check(): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]].iloc[5:], - feature_refs=["update_feature1", "update_feature3", "update_feature4"], - project=PROJECT_NAME, - ) - - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - print(output.head()) - - assert ( - output["update_feature1"].to_list() - == subset_df["update_feature1"].to_list() - ) - assert ( - output["update_feature3"].to_list() - == subset_df["update_feature3"].to_list() - ) - assert ( - output["update_feature4"].to_list() - == subset_df["update_feature4"].to_list() - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - wait_for(check, timedelta(minutes=5)) - - -@pytest.mark.fs_update -@pytest.mark.run(order=22) -def test_update_featureset_retrieve_all_fields(client, update_featureset_dataframe): - with pytest.raises(Exception): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], - feature_refs=[ - "update_feature1", - "update_feature2", - "update_feature3", - "update_feature4", - ], - project=PROJECT_NAME, - ) - feature_retrieval_job.result() - - -@pytest.mark.fs_update -@pytest.mark.run(order=23) -def test_update_featureset_retrieve_valid_fields(client, update_featureset_dataframe): - feature_retrieval_job = client.get_historical_features( - entity_rows=update_featureset_dataframe[["datetime", "entity_id"]], - feature_refs=["update_feature1", "update_feature3", "update_feature4"], - project=PROJECT_NAME, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180).sort_values( - by=["entity_id"] - ) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - print(output.head(10)) - assert ( - output["update_feature1"].to_list() - == update_featureset_dataframe["update_feature1"].to_list() - ) - # we have to convert to float because the column contains np.NaN - assert [math.isnan(i) for i in output["update_feature3"].to_list()[:5]] == [ - True - ] * 5 - assert output["update_feature3"].to_list()[5:] == [ - float(i) for i in update_featureset_dataframe["update_feature3"].to_list()[5:] - ] - assert ( - output["update_feature4"].to_list() - == [None] * 5 + update_featureset_dataframe["update_feature4"].to_list()[5:] - ) - - -@pytest.mark.direct_runner -@pytest.mark.run(order=31) -@pytest.mark.timeout(600) -def test_batch_dataset_statistics(client): - fs1 = client.get_feature_set(name="feature_set_1") - fs2 = client.get_feature_set(name="feature_set_2") - id_offset = 20 - - n_rows = 21 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - features_1_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [id_offset + i for i in range(n_rows)], - "feature_value6": ["a" for i in range(n_rows)], - } - ) - ingestion_id1 = client.ingest(fs1, features_1_df) - - features_2_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "other_entity_id": [id_offset + i for i in range(n_rows)], - "other_feature_value7": [int(i) % 10 for i in range(0, n_rows)], - } - ) - ingestion_id2 = client.ingest(fs2, features_2_df) - - entity_df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [id_offset + i for i in range(n_rows)], - "other_entity_id": [id_offset + i for i in range(n_rows)], - } - ) - - time.sleep(15) # wait for rows to get written to bq - while True: - rows_ingested1 = get_rows_ingested(client, fs1, ingestion_id1) - 
rows_ingested2 = get_rows_ingested(client, fs2, ingestion_id2) - if rows_ingested1 == len(features_1_df) and rows_ingested2 == len( - features_2_df - ): - print( - f"Number of rows successfully ingested: {rows_ingested1}, {rows_ingested2}. Continuing." - ) - break - time.sleep(30) - - feature_retrieval_job = client.get_historical_features( - entity_rows=entity_df, - feature_refs=["feature_value6", "feature_set_2:other_feature_value7"], - project=PROJECT_NAME, - compute_statistics=True, - ) - output = feature_retrieval_job.to_dataframe(timeout_sec=180) - print(output.head(10)) - stats = feature_retrieval_job.statistics(timeout_sec=180) - clear_unsupported_fields(stats) - - expected_stats = tfdv.generate_statistics_from_dataframe( - output[["feature_value6", "feature_set_2__other_feature_value7"]] - ) - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = output[name].std() - feature.num_stats.std_dev = std - - assert_stats_equal(expected_stats, stats) - clean_up_remote_files(feature_retrieval_job.get_avro_files()) - - -def get_rows_ingested( - client: Client, feature_set: FeatureSet, ingestion_id: str -) -> int: - response = client._core_service.ListStores( - ListStoresRequest(filter=ListStoresRequest.Filter(name="historical")) - ) - bq_config = response.store[0].bigquery_config - project = bq_config.project_id - dataset = bq_config.dataset_id - table = f"{PROJECT_NAME}_{feature_set.name}" - - bq_client = bigquery.Client(project=project) - rows = bq_client.query( - f'SELECT COUNT(*) as count FROM `{project}.{dataset}.{table}` WHERE ingestion_id = "{ingestion_id}"' - ).result() - - return list(rows)[0]["count"] - - -def clean_up_remote_files(files): - storage_client = storage.Client() - for file_uri in files: - if file_uri.scheme == "gs": - blob = Blob.from_string(file_uri.geturl(), client=storage_client) - blob.delete() diff --git a/tests/e2e/bq/feature-stats.py b/tests/e2e/bq/feature-stats.py deleted file mode 100644 index 226dc358f1..0000000000 --- a/tests/e2e/bq/feature-stats.py +++ /dev/null @@ -1,256 +0,0 @@ -import os -import time -import uuid -from datetime import datetime, timedelta - -import pandas as pd -import pytest -import pytz -import tensorflow_data_validation as tfdv -from google.protobuf.duration_pb2 import Duration - -from bq.testutils import ( - assert_stats_equal, - clear_unsupported_agg_fields, - clear_unsupported_fields, -) -from feast.client import Client -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_set import FeatureSet -from feast.type_map import ValueType - -pd.set_option("display.max_columns", None) - -PROJECT_NAME = "batch_" + uuid.uuid4().hex.upper()[0:6] -STORE_NAME = "historical" -os.environ["CUDA_VISIBLE_DEVICES"] = "0" - - -@pytest.fixture(scope="module") -def core_url(pytestconfig): - return pytestconfig.getoption("core_url") - - -@pytest.fixture(scope="module") -def serving_url(pytestconfig): - return pytestconfig.getoption("serving_url") - - -@pytest.fixture(scope="module") -def allow_dirty(pytestconfig): - return True if pytestconfig.getoption("allow_dirty").lower() == "true" else False - - -@pytest.fixture(scope="module") -def gcs_path(pytestconfig): - return pytestconfig.getoption("gcs_path") - - -@pytest.fixture(scope="module") -def client(core_url, allow_dirty): - # Get client for core and serving - client = Client(core_url=core_url) - 
client.create_project(PROJECT_NAME) - client.set_project(PROJECT_NAME) - - # Ensure Feast core is active, but empty - if not allow_dirty: - feature_sets = client.list_feature_sets() - if len(feature_sets) > 0: - raise Exception( - "Feast cannot have existing feature sets registered. Exiting tests." - ) - - return client - - -@pytest.fixture(scope="module") -def feature_stats_feature_set(client): - fv_fs = FeatureSet( - "feature_stats", - features=[ - Feature("strings", ValueType.STRING), - Feature("ints", ValueType.INT64), - Feature("floats", ValueType.FLOAT), - ], - entities=[Entity("entity_id", ValueType.INT64)], - max_age=Duration(seconds=100), - ) - client.apply(fv_fs) - return fv_fs - - -@pytest.fixture(scope="module") -def feature_stats_dataset_basic(client, feature_stats_feature_set): - - n_rows = 20 - - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - df = pd.DataFrame( - { - "datetime": [time_offset] * n_rows, - "entity_id": [i for i in range(n_rows)], - "strings": ["a", "b"] * int(n_rows / 2), - "ints": [int(i) for i in range(n_rows)], - "floats": [10.5 - i for i in range(n_rows)], - } - ) - - expected_stats = tfdv.generate_statistics_from_dataframe( - df[["strings", "ints", "floats"]] - ) - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = df[name].std() - feature.num_stats.std_dev = std - - ingestion_id = client.ingest(feature_stats_feature_set, df) - time.sleep(10) - return { - "df": df, - "id": ingestion_id, - "date": datetime(time_offset.year, time_offset.month, time_offset.day).replace( - tzinfo=pytz.utc - ), - "stats": expected_stats, - } - - -@pytest.fixture(scope="module") -def feature_stats_dataset_agg(client, feature_stats_feature_set): - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - start_date = time_offset - timedelta(days=10) - end_date = time_offset - timedelta(days=7) - df1 = pd.DataFrame( - { - "datetime": [start_date] * 5, - "entity_id": [i for i in range(5)], - "strings": ["a", "b", "b", "b", "a"], - "ints": [4, 3, 2, 6, 3], - "floats": [2.1, 5.2, 4.3, 0.6, 0.1], - } - ) - ingestion_id_1 = client.ingest(feature_stats_feature_set, df1) - df2 = pd.DataFrame( - { - "datetime": [start_date + timedelta(days=1)] * 3, - "entity_id": [i for i in range(3)], - "strings": ["a", "b", "c"], - "ints": [2, 6, 7], - "floats": [1.6, 2.4, 2], - } - ) - ingestion_id_2 = client.ingest(feature_stats_feature_set, df2) - - combined_df = pd.concat([df1, df2])[["strings", "ints", "floats"]] - expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) - clear_unsupported_agg_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = combined_df[name].std() - feature.num_stats.std_dev = std - - time.sleep(10) - - return { - "ids": [ingestion_id_1, ingestion_id_2], - "start_date": datetime( - start_date.year, start_date.month, start_date.day - ).replace(tzinfo=pytz.utc), - "end_date": datetime(end_date.year, end_date.month, end_date.day).replace( - tzinfo=pytz.utc - ), - "stats": expected_stats, - } - - -def test_feature_stats_retrieval_by_single_dataset(client, feature_stats_dataset_basic): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - ingestion_ids=[feature_stats_dataset_basic["id"]], - ) 
- - assert_stats_equal(feature_stats_dataset_basic["stats"], stats) - - -def test_feature_stats_by_date(client, feature_stats_dataset_basic): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - start_date=feature_stats_dataset_basic["date"], - end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), - ) - assert_stats_equal(feature_stats_dataset_basic["stats"], stats) - - -def test_feature_stats_agg_over_datasets(client, feature_stats_dataset_agg): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - ingestion_ids=feature_stats_dataset_agg["ids"], - ) - assert_stats_equal(feature_stats_dataset_agg["stats"], stats) - - -def test_feature_stats_agg_over_dates(client, feature_stats_dataset_agg): - stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store=STORE_NAME, - start_date=feature_stats_dataset_agg["start_date"], - end_date=feature_stats_dataset_agg["end_date"], - ) - assert_stats_equal(feature_stats_dataset_agg["stats"], stats) - - -def test_feature_stats_force_refresh( - client, feature_stats_dataset_basic, feature_stats_feature_set -): - df = feature_stats_dataset_basic["df"] - - df2 = pd.DataFrame( - { - "datetime": [df.iloc[0].datetime], - "entity_id": [10], - "strings": ["c"], - "ints": [2], - "floats": [1.3], - } - ) - client.ingest(feature_stats_feature_set, df2) - time.sleep(10) - - actual_stats = client.get_statistics( - "feature_stats", - features=["strings", "ints", "floats"], - store="historical", - start_date=feature_stats_dataset_basic["date"], - end_date=feature_stats_dataset_basic["date"] + timedelta(days=1), - force_refresh=True, - ) - - combined_df = pd.concat([df, df2]) - expected_stats = tfdv.generate_statistics_from_dataframe(combined_df) - - clear_unsupported_fields(expected_stats) - - # Since TFDV computes population std dev - for feature in expected_stats.datasets[0].features: - if feature.HasField("num_stats"): - name = feature.path.step[0] - std = combined_df[name].std() - feature.num_stats.std_dev = std - - assert_stats_equal(expected_stats, actual_stats) diff --git a/tests/e2e/bq/testutils.py b/tests/e2e/bq/testutils.py deleted file mode 100644 index 9ac678bc59..0000000000 --- a/tests/e2e/bq/testutils.py +++ /dev/null @@ -1,55 +0,0 @@ -from deepdiff import DeepDiff -from google.protobuf.json_format import MessageToDict - - -def clear_unsupported_fields(datasets): - dataset = datasets.datasets[0] - for feature in dataset.features: - if feature.HasField("num_stats"): - feature.num_stats.common_stats.ClearField("num_values_histogram") - # Since difference in how BQ and TFDV compute histogram values make them - # approximate but uncomparable - feature.num_stats.ClearField("histograms") - elif feature.HasField("string_stats"): - feature.string_stats.common_stats.ClearField("num_values_histogram") - for bucket in feature.string_stats.rank_histogram.buckets: - bucket.ClearField("low_rank") - bucket.ClearField("high_rank") - elif feature.HasField("struct_stats"): - feature.string_stats.struct_stats.ClearField("num_values_histogram") - elif feature.HasField("bytes_stats"): - feature.string_stats.bytes_stats.ClearField("num_values_histogram") - - -def clear_unsupported_agg_fields(datasets): - dataset = datasets.datasets[0] - for feature in dataset.features: - if feature.HasField("num_stats"): - feature.num_stats.common_stats.ClearField("num_values_histogram") - 
feature.num_stats.ClearField("histograms") - feature.num_stats.ClearField("median") - elif feature.HasField("string_stats"): - feature.string_stats.common_stats.ClearField("num_values_histogram") - feature.string_stats.ClearField("rank_histogram") - feature.string_stats.ClearField("top_values") - feature.string_stats.ClearField("unique") - elif feature.HasField("struct_stats"): - feature.struct_stats.ClearField("num_values_histogram") - elif feature.HasField("bytes_stats"): - feature.bytes_stats.ClearField("num_values_histogram") - feature.bytes_stats.ClearField("unique") - - -def assert_stats_equal(left, right): - left_stats = MessageToDict(left)["datasets"][0] - right_stats = MessageToDict(right)["datasets"][0] - assert ( - left_stats["numExamples"] == right_stats["numExamples"] - ), f"Number of examples do not match. Expected {left_stats['numExamples']}, got {right_stats['numExamples']}" - - left_features = sorted(left_stats["features"], key=lambda k: k["path"]["step"][0]) - right_features = sorted(right_stats["features"], key=lambda k: k["path"]["step"][0]) - diff = DeepDiff(left_features, right_features, significant_digits=3) - assert ( - len(diff) == 0 - ), f"Feature statistics do not match: \nwanted: {left_features}\n got: {right_features}" From 28830deb746f9b28197dd4cd0e67442554e11c3e Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 16:44:25 +0800 Subject: [PATCH 11/34] Fix docker-compose test Signed-off-by: Terence --- infra/scripts/test-docker-compose.sh | 2 +- tests/e2e/redis/parallel-ingest-redis-serving.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/infra/scripts/test-docker-compose.sh b/infra/scripts/test-docker-compose.sh index 45105d4839..173c796eb0 100755 --- a/infra/scripts/test-docker-compose.sh +++ b/infra/scripts/test-docker-compose.sh @@ -63,4 +63,4 @@ export FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS=$(docker inspect -f '{{range .N ${PROJECT_ROOT_DIR}/infra/scripts/wait-for-it.sh ${FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS}:6566 --timeout=120 # Run e2e tests for Redis -docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e/redis && pytest --verbose -rs parallel-ingest-redis-serving.py --core_url core:6565 --serving_url=online_serving:6566 --kafka_brokers=kafka:9092' +docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e/redis && pytest -n 1 --dist=loadscope parallel-ingest-redis-serving.py --core_url core:6565 --serving_url=online_serving:6566 --kafka_brokers=kafka:9092' diff --git a/tests/e2e/redis/parallel-ingest-redis-serving.py b/tests/e2e/redis/parallel-ingest-redis-serving.py index dd98f66063..53967aaa74 100644 --- a/tests/e2e/redis/parallel-ingest-redis-serving.py +++ b/tests/e2e/redis/parallel-ingest-redis-serving.py @@ -83,7 +83,9 @@ def test_discovery(self, client): assert actual_get_feature_table == self.basic_ft_spec # ListFeatureTables Check - actual_list_feature_table = client.list_feature_tables()[0] + actual_list_feature_table = [ + ft for ft in client.list_feature_tables() if ft.name == "dev_featuretable" + ][0] assert actual_list_feature_table == self.basic_ft_spec def test_basic_retrieval(self, client): @@ -173,7 +175,9 @@ def test_discovery(self, client): assert actual_get_feature_table == self.alltypes_ft_spec # ListFeatureTables Check - actual_list_feature_table = client.list_feature_tables()[0] + actual_list_feature_table = [ + ft for ft in client.list_feature_tables() if ft.name == "alltypes" + ][0] assert actual_list_feature_table == self.alltypes_ft_spec def test_alltypes_retrieval(self, 
client): From d1c434668d192702fb810a86e3046dc305b64449 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 18:37:20 +0800 Subject: [PATCH 12/34] Address PR comments Signed-off-by: Terence --- Makefile | 2 +- .../scripts/test-end-to-end-redis-cluster.sh | 2 +- infra/scripts/test-end-to-end.sh | 2 +- sdk/python/feast/client.py | 6 +- sdk/python/feast/feature_table.py | 34 +++- sdk/python/feast/loaders/ingest.py | 2 +- sdk/python/tests/test_client.py | 24 ++- sdk/python/tests/test_feature_table.py | 14 +- .../redis/parallel-ingest-redis-serving.py | 185 ------------------ tests/e2e/setup.cfg | 2 +- .../{redis => src}/specifications/dev_ft.yaml | 0 tests/e2e/src/test-register-ingest.py | 158 +++++++++++++++ 12 files changed, 209 insertions(+), 222 deletions(-) delete mode 100644 tests/e2e/redis/parallel-ingest-redis-serving.py rename tests/e2e/{redis => src}/specifications/dev_ft.yaml (100%) create mode 100644 tests/e2e/src/test-register-ingest.py diff --git a/Makefile b/Makefile index f159ad624d..ad755d70d3 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ lint-python: cd ${ROOT_DIR}/sdk/python; flake8 feast/ tests/ cd ${ROOT_DIR}/sdk/python; black --check feast tests - cd ${ROOT_DIR}/tests/e2e; mypy redis/ + cd ${ROOT_DIR}/tests/e2e; mypy src/ cd ${ROOT_DIR}/tests/e2e; isort . --check-only cd ${ROOT_DIR}/tests/e2e; flake8 . cd ${ROOT_DIR}/tests/e2e; black --check . diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index 083079a32b..1b67986742 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -103,7 +103,7 @@ cd tests/e2e set +e CORE_NO=$(nproc --all) -pytest redis/parallel-ingest-redis-serving.py -n ${CORE_NO} --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest src/* -n ${CORE_NO} --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/infra/scripts/test-end-to-end.sh b/infra/scripts/test-end-to-end.sh index a7dadd5a1f..474c98de42 100755 --- a/infra/scripts/test-end-to-end.sh +++ b/infra/scripts/test-end-to-end.sh @@ -120,7 +120,7 @@ cd tests/e2e set +e export GOOGLE_APPLICATION_CREDENTIALS=/etc/gcloud/service-account.json CORE_NO=$(nproc --all) -pytest redis/parallel-ingest-redis-serving.py -n ${CORE_NO} --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest src/* -n ${CORE_NO} --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? 
if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index bc21a6f2e4..d53ccd7599 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -68,7 +68,7 @@ from feast.grpc.grpc import create_grpc_channel from feast.loaders.ingest import ( BATCH_INGESTION_PRODUCTION_TIMEOUT, - check_field_mappings, + _check_field_mappings, ) from feast.serving.ServingService_pb2 import GetFeastServingInfoRequest from feast.serving.ServingService_pb2_grpc import ServingServiceStub @@ -625,7 +625,7 @@ def ingest( >>> ) >>> client.set_project("project1") >>> - >>> driver_ft = client.get_feature_table(name="driver") + >>> driver_ft = client.get_feature_table("driver") >>> client.ingest(driver_ft, ft_df) """ @@ -670,7 +670,7 @@ def ingest( ) # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table - check_field_mappings( + _check_field_mappings( column_names, name, feature_table.batch_source.field_mapping ) diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index eafa4260ef..bb35eb534d 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, MutableMapping, Optional, Union +from typing import Dict, List, MutableMapping, Optional import yaml from google.protobuf import json_format @@ -20,6 +20,7 @@ from google.protobuf.json_format import MessageToDict, MessageToJson from google.protobuf.timestamp_pb2 import Timestamp +from feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto @@ -33,6 +34,7 @@ ) from feast.feature import Feature from feast.loaders import yaml as feast_yaml +from feast.value_type import ValueType class FeatureTable: @@ -43,8 +45,8 @@ class FeatureTable: def __init__( self, name: str, - entities: Union[str, List[str]], - features: Union[Feature, List[Feature]], + entities: List[str], + features: List[Feature], batch_source: Optional[DataSource] = None, stream_source: Optional[DataSource] = None, max_age: Optional[Duration] = None, @@ -52,9 +54,19 @@ def __init__( ): self._name = name self._entities = entities - self._features = features - self._batch_source = batch_source - self._stream_source = stream_source + self._features = [ + feature.to_proto() for feature in features if isinstance(feature, Feature) + ] + self._batch_source = ( + batch_source.to_proto() + if isinstance(batch_source, DataSource) + else batch_source + ) + self._stream_source = ( + stream_source.to_proto() + if isinstance(stream_source, DataSource) + else stream_source + ) if labels is None: self._labels = dict() # type: MutableMapping[str, str] else: @@ -141,7 +153,7 @@ def batch_source(self): return self._batch_source @batch_source.setter - def batch_source(self, batch_source: DataSource): + def batch_source(self, batch_source: DataSourceProto): """ Sets the batch source of this feature table """ @@ -155,7 +167,7 @@ def stream_source(self): return self._stream_source @stream_source.setter - def stream_source(self, stream_source: DataSource): + def stream_source(self, stream_source: DataSourceProto): """ Sets the stream source of this feature table """ @@ -322,7 
+334,11 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): name=feature_table_proto.spec.name, entities=[entity for entity in feature_table_proto.spec.entities], features=[ - Feature.from_proto(feature).to_proto() + Feature( + name=feature.name, + dtype=ValueType(feature.value_type), + labels=feature.labels, + ) for feature in feature_table_proto.spec.features ], labels=feature_table_proto.spec.labels, diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 0d1c3e5e31..56bb839eb8 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -8,7 +8,7 @@ BATCH_INGESTION_PRODUCTION_TIMEOUT = 120 # type: int -def check_field_mappings( +def _check_field_mappings( column_names: List[str], feature_table_name: str, feature_table_field_mappings: Dict[str, str], diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index c152d6d400..4964a84c96 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -39,7 +39,7 @@ from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto -from feast.data_source import DataSource, FileOptions, KafkaOptions, SourceType +from feast.data_source import DataSource, FileOptions, KafkaOptions from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -457,7 +457,7 @@ def test_get_feature_table(self, mocked_client, mocker): ], entities=["my_entity_1"], batch_source=DataSourceProto( - type=SourceType(1).name, + type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -498,7 +498,7 @@ def test_list_feature_tables(self, mocked_client, mocker): ) batch_source = DataSourceProto( - type=SourceType(1).name, + type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -569,7 +569,7 @@ def test_apply_feature_table_success(self, test_client): # Create Feature Tables batch_source = DataSource( - type=SourceType(1).name, + type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -580,7 +580,7 @@ def test_apply_feature_table_success(self, test_client): ) stream_source = DataSource( - type=SourceType(3).name, + type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -596,17 +596,15 @@ def test_apply_feature_table_success(self, test_client): ft1 = FeatureTable( name="my-feature-table-1", features=[ - Feature(name="fs1-my-feature-1", dtype=ValueType.INT64).to_proto(), - Feature(name="fs1-my-feature-2", dtype=ValueType.STRING).to_proto(), - Feature( - name="fs1-my-feature-3", dtype=ValueType.STRING_LIST - ).to_proto(), - Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST).to_proto(), + Feature(name="fs1-my-feature-1", dtype=ValueType.INT64), + Feature(name="fs1-my-feature-2", dtype=ValueType.STRING), + Feature(name="fs1-my-feature-3", dtype=ValueType.STRING_LIST), + Feature(name="fs1-my-feature-4", dtype=ValueType.BYTES_LIST), ], entities=["fs1-my-entity-1"], labels={"team": "matchmaking"}, - batch_source=batch_source.to_proto(), - stream_source=stream_source.to_proto(), + batch_source=batch_source, + stream_source=stream_source, ) # Register Feature Table with Core diff --git a/sdk/python/tests/test_feature_table.py 
b/sdk/python/tests/test_feature_table.py index 8a1059bcb6..8d9891e67d 100644 --- a/sdk/python/tests/test_feature_table.py +++ b/sdk/python/tests/test_feature_table.py @@ -21,7 +21,7 @@ from feast.client import Client from feast.core import CoreService_pb2_grpc as Core -from feast.data_source import DataSource, FileOptions, KafkaOptions, SourceType +from feast.data_source import DataSource, FileOptions, KafkaOptions from feast.feature import Feature from feast.feature_table import FeatureTable from feast.value_type import ValueType @@ -55,7 +55,7 @@ def client(self, server): def test_feature_table_import_export_yaml(self): batch_source = DataSource( - type=SourceType(1).name, + type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -66,7 +66,7 @@ def test_feature_table_import_export_yaml(self): ) stream_source = DataSource( - type=SourceType(3).name, + type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -82,13 +82,13 @@ def test_feature_table_import_export_yaml(self): test_feature_table = FeatureTable( name="car_driver", features=[ - Feature(name="ride_distance", dtype=ValueType.FLOAT).to_proto(), - Feature(name="ride_duration", dtype=ValueType.STRING).to_proto(), + Feature(name="ride_distance", dtype=ValueType.FLOAT), + Feature(name="ride_duration", dtype=ValueType.STRING), ], entities=["car_driver_entity"], labels={"team": "matchmaking"}, - batch_source=batch_source.to_proto(), - stream_source=stream_source.to_proto(), + batch_source=batch_source, + stream_source=stream_source, ) # Create a string YAML representation of the feature table diff --git a/tests/e2e/redis/parallel-ingest-redis-serving.py b/tests/e2e/redis/parallel-ingest-redis-serving.py deleted file mode 100644 index 53967aaa74..0000000000 --- a/tests/e2e/redis/parallel-ingest-redis-serving.py +++ /dev/null @@ -1,185 +0,0 @@ -import os -import uuid -from datetime import datetime - -import pytest -from google.protobuf.duration_pb2 import Duration - -from feast.client import Client -from feast.data_source import DataSource, FileOptions, SourceType -from feast.entity import Entity -from feast.feature import Feature -from feast.feature_table import FeatureTable -from feast.value_type import ValueType - -DIR_PATH = os.path.dirname(os.path.realpath(__file__)) -PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] - - -@pytest.fixture(scope="module") -def client(pytestconfig): - core_url = pytestconfig.getoption("core_url") - serving_url = pytestconfig.getoption("serving_url") - - client = Client(core_url=core_url, serving_url=serving_url,) - - client.set_project(PROJECT_NAME) - - return client - - -@pytest.mark.incremental -class TestBasicIngestionRetrieval: - def setup_class(cls): - prefix = "basic_ingestion" - suffix = str(int(datetime.now().timestamp())) - cls.customer_ft_name = f"{prefix}_customer_{suffix}" - cls.driver_ft_name = f"{prefix}_driver_{suffix}" - - cls.customer_entity = Entity( - name="customer_id", - description="Customer entity for rides", - value_type=ValueType.STRING, - labels={"team": "customer_service", "common_key": "common_val"}, - ) - - cls.driver_entity = Entity( - name="driver_id", - description="Driver entity for car rides", - value_type=ValueType.STRING, - labels={"team": "matchmaking", "common_key": "common_val"}, - ) - - cls.basic_ft_spec = FeatureTable.from_yaml( - f"{DIR_PATH}/specifications/dev_ft.yaml" - ) - - def test_discovery(self, client): - - # ApplyEntity - 
client.apply_entity(self.customer_entity) - client.apply_entity(self.driver_entity) - - # GetEntity Check - assert client.get_entity(name="customer_id") == self.customer_entity - assert client.get_entity(name="driver_id") == self.driver_entity - - # ListEntities Check - common_filtering_labels = {"common_key": "common_val"} - matchmaking_filtering_labels = {"team": "matchmaking"} - - actual_common_entities = client.list_entities(labels=common_filtering_labels) - actual_matchmaking_entities = client.list_entities( - labels=matchmaking_filtering_labels - ) - assert len(actual_common_entities) == 2 - assert len(actual_matchmaking_entities) == 1 - - # ApplyFeatureTable - client.apply_feature_table(self.basic_ft_spec, PROJECT_NAME) - - # GetFeatureTable Check - actual_get_feature_table = client.get_feature_table(name="dev_featuretable") - assert actual_get_feature_table == self.basic_ft_spec - - # ListFeatureTables Check - actual_list_feature_table = [ - ft for ft in client.list_feature_tables() if ft.name == "dev_featuretable" - ][0] - assert actual_list_feature_table == self.basic_ft_spec - - def test_basic_retrieval(self, client): - # TODO: Add ingest and retrieval check - pass - - -@pytest.mark.incremental -class TestAllTypesIngestionRetrieval: - def setup_class(cls): - prefix = "alltypes_ingestion" - suffix = str(int(datetime.now().timestamp())) - batch_source = DataSource( - type=SourceType(1).name, - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, - options=FileOptions(file_format="parquet", file_url="file://feast/*"), - timestamp_column="ts_col", - date_partition_column="date_partition_col", - ) - - cls.alltypes_entity = Entity( - name="alltypes_id", - description="Driver entity for car rides", - value_type=ValueType.STRING, - labels={"cat": "alltypes"}, - ) - - cls.alltypes_ft_name = f"{prefix}_alltypes_{suffix}" - cls.alltypes_ft_spec = FeatureTable( - name="alltypes", - entities=["alltypes_id"], - features=[ - Feature(name="float_feature", dtype=ValueType.FLOAT).to_proto(), - Feature(name="int64_feature", dtype=ValueType.INT64).to_proto(), - Feature(name="int32_feature", dtype=ValueType.INT32).to_proto(), - Feature(name="string_feature", dtype=ValueType.STRING).to_proto(), - Feature(name="bytes_feature", dtype=ValueType.BYTES).to_proto(), - Feature(name="bool_feature", dtype=ValueType.BOOL).to_proto(), - Feature(name="double_feature", dtype=ValueType.DOUBLE).to_proto(), - Feature( - name="double_list_feature", dtype=ValueType.DOUBLE_LIST - ).to_proto(), - Feature( - name="float_list_feature", dtype=ValueType.FLOAT_LIST - ).to_proto(), - Feature( - name="int64_list_feature", dtype=ValueType.INT64_LIST - ).to_proto(), - Feature( - name="int32_list_feature", dtype=ValueType.INT32_LIST - ).to_proto(), - Feature( - name="string_list_feature", dtype=ValueType.STRING_LIST - ).to_proto(), - Feature( - name="bytes_list_feature", dtype=ValueType.BYTES_LIST - ).to_proto(), - Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST).to_proto(), - ], - max_age=Duration(seconds=3600), - batch_source=batch_source.to_proto(), - labels={"cat": "alltypes"}, - ) - - def test_discovery(self, client): - # ApplyEntity - client.apply_entity(self.alltypes_entity) - - # GetEntity Check - assert client.get_entity(name="alltypes_id") == self.alltypes_entity - - # ListEntities Check - alltypes_filtering_labels = {"cat": "alltypes"} - actual_alltypes_entities = client.list_entities( - labels=alltypes_filtering_labels - ) - assert len(actual_alltypes_entities) == 1 
- - # ApplyFeatureTable - client.apply_feature_table(self.alltypes_ft_spec, PROJECT_NAME) - - # GetFeatureTable Check - actual_get_feature_table = client.get_feature_table(name="alltypes") - assert actual_get_feature_table == self.alltypes_ft_spec - - # ListFeatureTables Check - actual_list_feature_table = [ - ft for ft in client.list_feature_tables() if ft.name == "alltypes" - ][0] - assert actual_list_feature_table == self.alltypes_ft_spec - - def test_alltypes_retrieval(self, client): - # TODO: Add ingest and retrieval check - pass diff --git a/tests/e2e/setup.cfg b/tests/e2e/setup.cfg index 2e0bf6860b..0c3d9bee74 100644 --- a/tests/e2e/setup.cfg +++ b/tests/e2e/setup.cfg @@ -14,5 +14,5 @@ max-complexity = 20 select = B,C,E,F,W,T4 [mypy] -files=bq,redis +files=src ignore_missing_imports=true \ No newline at end of file diff --git a/tests/e2e/redis/specifications/dev_ft.yaml b/tests/e2e/src/specifications/dev_ft.yaml similarity index 100% rename from tests/e2e/redis/specifications/dev_ft.yaml rename to tests/e2e/src/specifications/dev_ft.yaml diff --git a/tests/e2e/src/test-register-ingest.py b/tests/e2e/src/test-register-ingest.py new file mode 100644 index 0000000000..ef3ac2c7ce --- /dev/null +++ b/tests/e2e/src/test-register-ingest.py @@ -0,0 +1,158 @@ +import os +import uuid + +import pytest +from google.protobuf.duration_pb2 import Duration + +from feast.client import Client +from feast.data_source import DataSource, FileOptions +from feast.entity import Entity +from feast.feature import Feature +from feast.feature_table import FeatureTable +from feast.value_type import ValueType + +DIR_PATH = os.path.dirname(os.path.realpath(__file__)) +PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] + + +@pytest.fixture(scope="module") +def client(pytestconfig): + core_url = pytestconfig.getoption("core_url") + serving_url = pytestconfig.getoption("serving_url") + + client = Client(core_url=core_url, serving_url=serving_url,) + + client.set_project(PROJECT_NAME) + + return client + + +@pytest.fixture +def customer_entity(): + return Entity( + name="customer_id", + description="Customer entity for rides", + value_type=ValueType.STRING, + labels={"team": "customer_service", "common_key": "common_val"}, + ) + + +@pytest.fixture +def driver_entity(): + return Entity( + name="driver_id", + description="Driver entity for car rides", + value_type=ValueType.STRING, + labels={"team": "matchmaking", "common_key": "common_val"}, + ) + + +@pytest.fixture +def alltypes_entity(): + return Entity( + name="alltypes_id", + description="Driver entity for car rides", + value_type=ValueType.STRING, + labels={"cat": "alltypes"}, + ) + + +@pytest.fixture +def alltypes_featuretable(): + batch_source = DataSource( + type="BATCH_FILE", + field_mapping={ + "ride_distance": "ride_distance", + "ride_duration": "ride_duration", + }, + options=FileOptions(file_format="parquet", file_url="file://feast/*"), + timestamp_column="ts_col", + date_partition_column="date_partition_col", + ) + return FeatureTable( + name="alltypes", + entities=["alltypes_id"], + features=[ + Feature(name="float_feature", dtype=ValueType.FLOAT), + Feature(name="int64_feature", dtype=ValueType.INT64), + Feature(name="int32_feature", dtype=ValueType.INT32), + Feature(name="string_feature", dtype=ValueType.STRING), + Feature(name="bytes_feature", dtype=ValueType.BYTES), + Feature(name="bool_feature", dtype=ValueType.BOOL), + Feature(name="double_feature", dtype=ValueType.DOUBLE), + Feature(name="double_list_feature", dtype=ValueType.DOUBLE_LIST), + 
Feature(name="float_list_feature", dtype=ValueType.FLOAT_LIST), + Feature(name="int64_list_feature", dtype=ValueType.INT64_LIST), + Feature(name="int32_list_feature", dtype=ValueType.INT32_LIST), + Feature(name="string_list_feature", dtype=ValueType.STRING_LIST), + Feature(name="bytes_list_feature", dtype=ValueType.BYTES_LIST), + Feature(name="bool_list_feature", dtype=ValueType.BOOL_LIST), + ], + max_age=Duration(seconds=3600), + batch_source=batch_source, + labels={"cat": "alltypes"}, + ) + + +def test_get_list_basic(client: Client, customer_entity: Entity, driver_entity: Entity): + basic_ft_spec = FeatureTable.from_yaml(f"{DIR_PATH}/specifications/dev_ft.yaml") + + # ApplyEntity + client.apply_entity(customer_entity) + client.apply_entity(driver_entity) + + # GetEntity Check + assert client.get_entity(name="customer_id") == customer_entity + assert client.get_entity(name="driver_id") == driver_entity + + # ListEntities Check + common_filtering_labels = {"common_key": "common_val"} + matchmaking_filtering_labels = {"team": "matchmaking"} + + actual_common_entities = client.list_entities(labels=common_filtering_labels) + actual_matchmaking_entities = client.list_entities( + labels=matchmaking_filtering_labels + ) + assert len(actual_common_entities) == 2 + assert len(actual_matchmaking_entities) == 1 + + # ApplyFeatureTable + client.apply_feature_table(basic_ft_spec) + + # GetFeatureTable Check + actual_get_feature_table = client.get_feature_table(name="dev_featuretable") + assert actual_get_feature_table == basic_ft_spec + + # ListFeatureTables Check + actual_list_feature_table = [ + ft for ft in client.list_feature_tables() if ft.name == "dev_featuretable" + ][0] + assert actual_list_feature_table == basic_ft_spec + + +def test_get_list_alltypes( + client: Client, alltypes_entity: Entity, alltypes_featuretable: FeatureTable +): + # ApplyEntity + client.apply_entity(alltypes_entity) + + # GetEntity Check + assert client.get_entity(name="alltypes_id") == alltypes_entity + + # ListEntities Check + alltypes_filtering_labels = {"cat": "alltypes"} + actual_alltypes_entities = client.list_entities(labels=alltypes_filtering_labels) + assert len(actual_alltypes_entities) == 1 + + # ApplyFeatureTable + client.apply_feature_table(alltypes_featuretable) + + # GetFeatureTable Check + actual_get_feature_table = client.get_feature_table(name="alltypes") + assert actual_get_feature_table == alltypes_featuretable + + # ListFeatureTables Check + actual_list_feature_table = [ + ft for ft in client.list_feature_tables() if ft.name == "alltypes" + ][0] + assert actual_list_feature_table == alltypes_featuretable From c917762e7e35f8b2532ebe22a1424e10ee8b228a Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 18:47:46 +0800 Subject: [PATCH 13/34] Remove src folder for e2e tests Signed-off-by: Terence --- Makefile | 2 +- tests/e2e/setup.cfg | 1 - tests/e2e/{src => }/specifications/dev_ft.yaml | 0 tests/e2e/{src => }/test-register-ingest.py | 0 4 files changed, 1 insertion(+), 2 deletions(-) rename tests/e2e/{src => }/specifications/dev_ft.yaml (100%) rename tests/e2e/{src => }/test-register-ingest.py (100%) diff --git a/Makefile b/Makefile index ad755d70d3..85f3fd53e6 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,7 @@ lint-python: cd ${ROOT_DIR}/sdk/python; flake8 feast/ tests/ cd ${ROOT_DIR}/sdk/python; black --check feast tests - cd ${ROOT_DIR}/tests/e2e; mypy src/ + cd ${ROOT_DIR}/tests/e2e; mypy . cd ${ROOT_DIR}/tests/e2e; isort . --check-only cd ${ROOT_DIR}/tests/e2e; flake8 . 
cd ${ROOT_DIR}/tests/e2e; black --check . diff --git a/tests/e2e/setup.cfg b/tests/e2e/setup.cfg index 0c3d9bee74..3026e38be1 100644 --- a/tests/e2e/setup.cfg +++ b/tests/e2e/setup.cfg @@ -14,5 +14,4 @@ max-complexity = 20 select = B,C,E,F,W,T4 [mypy] -files=src ignore_missing_imports=true \ No newline at end of file diff --git a/tests/e2e/src/specifications/dev_ft.yaml b/tests/e2e/specifications/dev_ft.yaml similarity index 100% rename from tests/e2e/src/specifications/dev_ft.yaml rename to tests/e2e/specifications/dev_ft.yaml diff --git a/tests/e2e/src/test-register-ingest.py b/tests/e2e/test-register-ingest.py similarity index 100% rename from tests/e2e/src/test-register-ingest.py rename to tests/e2e/test-register-ingest.py From e4af10107d3d160fef771ec103ecda008cc2f280 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 20:34:57 +0800 Subject: [PATCH 14/34] Address PR comments Signed-off-by: Terence --- infra/scripts/test-docker-compose.sh | 2 +- infra/scripts/test-end-to-end-redis-cluster.sh | 2 +- infra/scripts/test-end-to-end.sh | 2 +- sdk/python/feast/cli.py | 6 ++++++ sdk/python/feast/client.py | 6 ++---- sdk/python/feast/loaders/ingest.py | 4 +++- 6 files changed, 14 insertions(+), 8 deletions(-) diff --git a/infra/scripts/test-docker-compose.sh b/infra/scripts/test-docker-compose.sh index 173c796eb0..d669f3b655 100755 --- a/infra/scripts/test-docker-compose.sh +++ b/infra/scripts/test-docker-compose.sh @@ -63,4 +63,4 @@ export FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS=$(docker inspect -f '{{range .N ${PROJECT_ROOT_DIR}/infra/scripts/wait-for-it.sh ${FEAST_ONLINE_SERVING_CONTAINER_IP_ADDRESS}:6566 --timeout=120 # Run e2e tests for Redis -docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e/redis && pytest -n 1 --dist=loadscope parallel-ingest-redis-serving.py --core_url core:6565 --serving_url=online_serving:6566 --kafka_brokers=kafka:9092' +docker exec feast_jupyter_1 bash -c 'cd /feast/tests/e2e && pytest *.py --core_url core:6565 --serving_url=online_serving:6566 --kafka_brokers=kafka:9092' diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index 1b67986742..0e5aa5879a 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -103,7 +103,7 @@ cd tests/e2e set +e CORE_NO=$(nproc --all) -pytest src/* -n ${CORE_NO} --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest *.py -n ${CORE_NO} --dist=loadscope --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/infra/scripts/test-end-to-end.sh b/infra/scripts/test-end-to-end.sh index 474c98de42..51b55b1763 100755 --- a/infra/scripts/test-end-to-end.sh +++ b/infra/scripts/test-end-to-end.sh @@ -120,7 +120,7 @@ cd tests/e2e set +e export GOOGLE_APPLICATION_CREDENTIALS=/etc/gcloud/service-account.json CORE_NO=$(nproc --all) -pytest src/* -n ${CORE_NO} --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml +pytest *.py -n ${CORE_NO} --dist=loadscope --enable_auth=${ENABLE_AUTH} --junitxml=${LOGS_ARTIFACT_PATH}/python-sdk-test-report.xml TEST_EXIT_CODE=$? 
if [[ ${TEST_EXIT_CODE} != 0 ]]; then diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index f041b4c12f..788541ad2e 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -215,6 +215,12 @@ def feature_table(): def _get_labels_dict(label_str: str) -> Dict[str, str]: """ Converts CLI input labels string to dictionary format if provided string is valid. + + Args: + label_str: A comma-separated string of key-value pairs + + Returns: + Dict of key-value label pairs """ labels_dict: Dict[str, str] = {} labels_kv = label_str.split(",") diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index d53ccd7599..8d4d4b455a 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -588,11 +588,11 @@ def ingest( timeout: int = BATCH_INGESTION_PRODUCTION_TIMEOUT, ) -> None: """ - Batch load feature data into batch source of a specific feature table. + Batch load feature data into a FeatureTable. Args: feature_table (typing.Union[str, feast.feature_table.FeatureTable]): - Feature table object or the string name of the feature table + FeatureTable object or the string name of the feature table source (typing.Union[pd.DataFrame, str]): Either a file path or Pandas Dataframe to ingest into Feast @@ -634,13 +634,11 @@ def ingest( if isinstance(feature_table, FeatureTable): name = feature_table.name - # Read table and get row count dir_path, dest_path, column_names = _read_table_from_source( source, chunk_size, max_workers ) current_time = time.time() - print("Waiting for feature table to be ready for ingestion...") while True: if timeout is not None and time.time() - current_time >= timeout: raise TimeoutError("Timed out waiting for feature table to be ready") diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 56bb839eb8..14ed290f54 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -28,7 +28,9 @@ def _check_field_mappings( f'Provided data source does not contain entity "datetime" in columns {column_names}' ) - specified_field_mappings = [v for k, v in feature_table_field_mappings.items()] + specified_field_mappings = list() + for k, v in feature_table_field_mappings.items(): + specified_field_mappings.append(v) is_valid = all(col_name in column_names for col_name in specified_field_mappings) From 69bda0290871cd1c48d9a4b972d9a3e537fe77d7 Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 22:58:25 +0800 Subject: [PATCH 15/34] Expose Source instead of Options native class Signed-off-by: Terence --- sdk/python/feast/__init__.py | 16 +- sdk/python/feast/client.py | 21 +- sdk/python/feast/data_source.py | 268 ++++++++++++++++++------- sdk/python/feast/feature_table.py | 54 ++--- sdk/python/tests/test_client.py | 17 +- sdk/python/tests/test_feature_table.py | 17 +- 6 files changed, 260 insertions(+), 133 deletions(-) diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index 298b8ac975..adf2aaf181 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -2,11 +2,11 @@ from .client import Client from .data_source import ( - BigQueryOptions, + BigQuerySource, DataSource, - FileOptions, - KafkaOptions, - KinesisOptions, + FileSource, + KafkaSource, + KinesisSource, SourceType, ) from .entity import Entity @@ -24,10 +24,10 @@ "Client", "Entity", "DataSource", - "BigQueryOptions", - "FileOptions", - "KafkaOptions", - "KinesisOptions", + "BigQuerySource", + "FileSource", + "KafkaSource", + "KinesisSource", "Feature", 
"FeatureTable", "SourceType", diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 8d4d4b455a..283f878074 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -638,20 +638,13 @@ def ingest( source, chunk_size, max_workers ) - current_time = time.time() - while True: - if timeout is not None and time.time() - current_time >= timeout: - raise TimeoutError("Timed out waiting for feature table to be ready") - fetched_feature_table: Optional[FeatureTable] = self.get_feature_table( - name, project - ) - if fetched_feature_table is not None: - feature_table = fetched_feature_table - break - time.sleep(3) - - if timeout is not None: - timeout = timeout - int(time.time() - current_time) + fetched_feature_table: Optional[FeatureTable] = self.get_feature_table( + name, project + ) + if fetched_feature_table is not None: + feature_table = fetched_feature_table + else: + raise Exception(f"FeatureTable, {name} cannot be found.") # Check 1) Only parquet file format for FeatureTable batch source is supported if ( diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 59020f8ec9..38e9e695e0 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -14,7 +14,7 @@ import enum -from typing import Dict, Optional, Union +from typing import Dict, Optional from feast.core.DataSource_pb2 import DataSource as DataSourceProto @@ -354,13 +354,11 @@ def __init__( self, type: str, field_mapping: Dict[str, str], - options: Union[BigQueryOptions, FileOptions, KafkaOptions, KinesisOptions], timestamp_column: str, date_partition_column: Optional[str] = "", ): self._type = type self._field_mapping = field_mapping - self._options = options self._timestamp_column = timestamp_column self._date_partition_column = date_partition_column @@ -392,20 +390,6 @@ def field_mapping(self, field_mapping): """ self._field_mapping = field_mapping - @property - def options(self): - """ - Returns the options of this data source - """ - return self._options - - @options.setter - def options(self, options): - """ - Sets the options of this data source - """ - self._options = options - @property def timestamp_column(self): """ @@ -437,31 +421,50 @@ def date_partition_column(self, date_partition_column): @classmethod def from_proto(cls, data_source_proto: DataSourceProto): """ - Creates a DataSource from a protobuf representation of an data source + Creates a DataSource from a protobuf representation of a data source + """ + raise NotImplementedError - Args: - data_source_proto: A protobuf representation of a DataSource + def to_proto(self) -> DataSourceProto: + """ + Converts an DataSourceProto object to its protobuf representation. + """ + raise NotImplementedError - Returns: - Returns a DataSource object based on the data_source protobuf - """ - - if isinstance(cls.options, FileOptions): - data_source = cls(file_options=data_source_proto.options,) - if isinstance(cls.options, BigQueryOptions): - data_source = cls(bigquery_options=data_source_proto.options,) - if isinstance(cls.options, KafkaOptions): - data_source = cls(kafka_options=data_source_proto.options,) - if isinstance(cls.options, KinesisOptions): - data_source = cls(kinesis_options=data_source_proto.options,) - else: - raise TypeError( - "DataSource.from_proto: Provided DataSource option is invalid. Only FileOptions, BigQueryOptions, KafkaOptions and KinesisOptions are supported currently." 
- ) + +class FileSource(DataSource): + def __init__( + self, + type, + field_mapping, + timestamp_column, + file_format, + file_url, + date_partition_column="", + ): + super().__init__(type, field_mapping, timestamp_column, date_partition_column) + self._file_options = FileOptions(file_format=file_format, file_url=file_url) + + @property + def file_options(self): + """ + Returns the file options of this data source + """ + return self._file_options + + @file_options.setter + def file_options(self, file_options): + """ + Sets the file options of this data source + """ + self._file_options = file_options + + def from_proto(cls, data_source_proto): data_source = cls( type=data_source_proto.type, field_mapping=data_source_proto.field_mapping, + file_options=cls.file_options, timestamp_column=data_source_proto.timestamp_column, date_partition_column=data_source_proto.date_partition_column, ) @@ -469,42 +472,167 @@ def from_proto(cls, data_source_proto: DataSourceProto): return data_source def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=self.type, + field_mapping=self.field_mapping, + file_options=self.file_options.to_proto(), + ) + + data_source_proto.timestamp_column = self.timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + +class BigQuerySource(DataSource): + def __init__( + self, type, field_mapping, timestamp_column, table_ref, date_partition_column="" + ): + super().__init__(type, field_mapping, timestamp_column, date_partition_column) + self._bigquery_options = BigQueryOptions(table_ref=table_ref,) + + @property + def bigquery_options(self): """ - Converts an DataSourceProto object to its protobuf representation. - Used when passing DataSourceProto object to Feast request. + Returns the bigquery options of this data source + """ + return self._bigquery_options - Returns: - DataSourceProto protobuf - """ - - if isinstance(self.options, FileOptions): - data_source_proto = DataSourceProto( - type=self.type, - field_mapping=self.field_mapping, - file_options=self.options.to_proto(), - ) - elif isinstance(self.options, BigQueryOptions): - data_source_proto = DataSourceProto( - type=self.type, - field_mapping=self.field_mapping, - bigquery_options=self.options.to_proto(), - ) - elif isinstance(self.options, KafkaOptions): - data_source_proto = DataSourceProto( - type=self.type, - field_mapping=self.field_mapping, - kafka_options=self.options.to_proto(), - ) - elif isinstance(self.options, KinesisOptions): - data_source_proto = DataSourceProto( - type=self.type, - field_mapping=self.field_mapping, - kinesis_options=self.options.to_proto(), - ) - else: - raise TypeError( - "DataSource.to_proto: Provided DataSource option is invalid. Only FileOptions, BigQueryOptions, KafkaOptions and KinesisOptions are supported currently." 
- ) + @bigquery_options.setter + def bigquery_options(self, bigquery_options): + """ + Sets the bigquery options of this data source + """ + self._bigquery_options = bigquery_options + + def from_proto(cls, data_source_proto): + + data_source = cls( + type=data_source_proto.type, + field_mapping=data_source_proto.field_mapping, + bigquery_options=cls.bigquery_options, + timestamp_column=data_source_proto.timestamp_column, + date_partition_column=data_source_proto.date_partition_column, + ) + + return data_source + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=self.type, + field_mapping=self.field_mapping, + bigquery_options=self.bigquery_options.to_proto(), + ) + + data_source_proto.timestamp_column = self.timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + +class KafkaSource(DataSource): + def __init__( + self, + type, + field_mapping, + timestamp_column, + bootstrap_servers, + class_path, + topic, + date_partition_column="", + ): + super().__init__(type, field_mapping, timestamp_column, date_partition_column) + self._kafka_options = KafkaOptions( + bootstrap_servers=bootstrap_servers, class_path=class_path, topic=topic + ) + + @property + def kafka_options(self): + """ + Returns the kafka options of this data source + """ + return self._kafka_options + + @kafka_options.setter + def kafka_options(self, kafka_options): + """ + Sets the kafka options of this data source + """ + self._kafka_options = kafka_options + + def from_proto(cls, data_source_proto): + + data_source = cls( + type=data_source_proto.type, + field_mapping=data_source_proto.field_mapping, + kafka_options=cls.kafka_options, + timestamp_column=data_source_proto.timestamp_column, + date_partition_column=data_source_proto.date_partition_column, + ) + + return data_source + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=self.type, + field_mapping=self.field_mapping, + kafka_options=self.kafka_options.to_proto(), + ) + + data_source_proto.timestamp_column = self.timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + +class KinesisSource(DataSource): + def __init__( + self, + type, + field_mapping, + timestamp_column, + class_path, + region, + stream_name, + date_partition_column="", + ): + super().__init__(type, field_mapping, timestamp_column, date_partition_column) + self._kinesis_options = KinesisOptions( + class_path=class_path, region=region, stream_name=stream_name + ) + + @property + def kinesis_options(self): + """ + Returns the kinesis options of this data source + """ + return self._kinesis_options + + @kinesis_options.setter + def kinesis_options(self, kinesis_options): + """ + Sets the kinesis options of this data source + """ + self._kinesis_options = kinesis_options + + def from_proto(cls, data_source_proto): + + data_source = cls( + type=data_source_proto.type, + field_mapping=data_source_proto.field_mapping, + kinesis_options=cls.kinesis_options, + timestamp_column=data_source_proto.timestamp_column, + date_partition_column=data_source_proto.date_partition_column, + ) + + return data_source + + def to_proto(self) -> DataSourceProto: + data_source_proto = DataSourceProto( + type=self.type, + field_mapping=self.field_mapping, + kinesis_options=self.kinesis_options.to_proto(), + ) data_source_proto.timestamp_column = self.timestamp_column data_source_proto.date_partition_column = 
self.date_partition_column diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index bb35eb534d..b1f722c40d 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, MutableMapping, Optional +from typing import Dict, List, MutableMapping, Optional, Union import yaml from google.protobuf import json_format @@ -25,11 +25,11 @@ from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto from feast.data_source import ( - BigQueryOptions, + BigQuerySource, DataSource, - FileOptions, - KafkaOptions, - KinesisOptions, + FileSource, + KafkaSource, + KinesisSource, SourceType, ) from feast.feature import Feature @@ -47,8 +47,8 @@ def __init__( name: str, entities: List[str], features: List[Feature], - batch_source: Optional[DataSource] = None, - stream_source: Optional[DataSource] = None, + batch_source: Union[BigQuerySource, FileSource] = None, + stream_source: Optional[Union[KafkaSource, KinesisSource]] = None, max_age: Optional[Duration] = None, labels: Optional[MutableMapping[str, str]] = None, ): @@ -275,47 +275,55 @@ def _to_data_source(cls, data_source): and data_source.file_options.file_format and data_source.file_options.file_url ): - data_source_options = FileOptions( + data_source_proto = FileSource( + type=data_source.type, + field_mapping=data_source.field_mapping, file_format=data_source.file_options.file_format, file_url=data_source.file_options.file_url, - ) + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ).to_proto() elif source_type == "BATCH_BIGQUERY" and data_source.bigquery_options.table_ref: - data_source_options = BigQueryOptions( + data_source_proto = BigQuerySource( + type=data_source.type, + field_mapping=data_source.field_mapping, table_ref=data_source.bigquery_options.table_ref, - ) + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ).to_proto() elif ( source_type == "STREAM_KAFKA" and data_source.kafka_options.bootstrap_servers and data_source.kafka_options.topic and data_source.kafka_options.class_path ): - data_source_options = KafkaOptions( + data_source_proto = KafkaSource( + type=data_source.type, + field_mapping=data_source.field_mapping, bootstrap_servers=data_source.kafka_options.bootstrap_servers, class_path=data_source.kafka_options.class_path, topic=data_source.kafka_options.topic, - ) + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ).to_proto() elif ( source_type == "STREAM_KINESIS" and data_source.kinesis_options.class_path and data_source.kinesis_options.region and data_source.kinesis_options.stream_name ): - data_source_options = KinesisOptions( + data_source_proto = KinesisSource( + type=data_source.type, + field_mapping=data_source.field_mapping, class_path=data_source.kinesis_options.class_path, region=data_source.kinesis_options.region, stream_name=data_source.kinesis_options.stream_name, - ) + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ).to_proto() else: raise ValueError("Could not identify the source type being added") - data_source_proto = DataSource( - type=data_source.type, - 
field_mapping=data_source.field_mapping, - options=data_source_options, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ).to_proto() - return data_source_proto @classmethod diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index 4964a84c96..cd83d10fcd 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -39,7 +39,7 @@ from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto -from feast.data_source import DataSource, FileOptions, KafkaOptions +from feast.data_source import FileSource, KafkaSource from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -568,28 +568,27 @@ def test_apply_feature_table_success(self, test_client): test_client.set_project("project1") # Create Feature Tables - batch_source = DataSource( + batch_source = FileSource( type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", }, - options=FileOptions(file_format="avro", file_url="data/test.avro"), + file_format="parquet", + file_url="file://feast/*", timestamp_column="ts_col", date_partition_column="date_partition_col", ) - stream_source = DataSource( + stream_source = KafkaSource( type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", }, - options=KafkaOptions( - bootstrap_servers="localhost:9094", - class_path="random/path/to/class", - topic="test_topic", - ), + bootstrap_servers="localhost:9094", + class_path="random/path/to/class", + topic="test_topic", timestamp_column="ts_col", ) diff --git a/sdk/python/tests/test_feature_table.py b/sdk/python/tests/test_feature_table.py index 8d9891e67d..d4085b61e0 100644 --- a/sdk/python/tests/test_feature_table.py +++ b/sdk/python/tests/test_feature_table.py @@ -21,7 +21,7 @@ from feast.client import Client from feast.core import CoreService_pb2_grpc as Core -from feast.data_source import DataSource, FileOptions, KafkaOptions +from feast.data_source import FileSource, KafkaSource from feast.feature import Feature from feast.feature_table import FeatureTable from feast.value_type import ValueType @@ -54,28 +54,27 @@ def client(self, server): def test_feature_table_import_export_yaml(self): - batch_source = DataSource( + batch_source = FileSource( type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", }, - options=FileOptions(file_format="avro", file_url="data/test.avro"), + file_format="parquet", + file_url="file://feast/*", timestamp_column="ts_col", date_partition_column="date_partition_col", ) - stream_source = DataSource( + stream_source = KafkaSource( type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", }, - options=KafkaOptions( - bootstrap_servers="localhost:9094", - class_path="random/path/to/class", - topic="test_topic", - ), + bootstrap_servers="localhost:9094", + class_path="random/path/to/class", + topic="test_topic", timestamp_column="ts_col", ) From 7f794c3c9e0d9c176c3b2d4470c7ecdd4ffa712b Mon Sep 17 00:00:00 2001 From: Terence Date: Mon, 5 Oct 2020 22:58:52 +0800 Subject: [PATCH 16/34] Refactor e2e tests without specification Signed-off-by: Terence --- tests/e2e/specifications/dev_ft.yaml | 38 ----------- 
...st-register-ingest.py => test-register.py} | 65 ++++++++++++++++--- 2 files changed, 55 insertions(+), 48 deletions(-) delete mode 100644 tests/e2e/specifications/dev_ft.yaml rename tests/e2e/{test-register-ingest.py => test-register.py} (71%) diff --git a/tests/e2e/specifications/dev_ft.yaml b/tests/e2e/specifications/dev_ft.yaml deleted file mode 100644 index 59072b73b9..0000000000 --- a/tests/e2e/specifications/dev_ft.yaml +++ /dev/null @@ -1,38 +0,0 @@ -spec: - name: dev_featuretable - entities: - - driver_id - - customer_id - features: - - name: dev_feature_float - valueType: FLOAT - - name: dev_feature_string - valueType: STRING - labels: - feature_key1: feature_val1 - batchSource: - type: BATCH_FILE - fieldMapping: - dev_entity: dev_entity_field - dev_feature_float: dev_feature_float_field - dev_feature_string: dev_feature_string_field - timestampColumn: datetime_col - datePartitionColumn: datetime - file_options: - file_format: PARQUET - file_url: gs://example/feast/* - streamSource: - type: STREAM_KAFKA - field_mapping: - dev_entity: dev_entity_field - dev_feature_float: dev_feature_float_field - dev_feature_string: dev_feature_string_field - timestampColumn: datetime_col - kafka_options: - bootstrap_servers: "localhost:9094" - topic: test_topic - class_path: random/path/to/test - maxAge: 14400s - labels: - key1: val1 - key2: val2 \ No newline at end of file diff --git a/tests/e2e/test-register-ingest.py b/tests/e2e/test-register.py similarity index 71% rename from tests/e2e/test-register-ingest.py rename to tests/e2e/test-register.py index ef3ac2c7ce..c587a15879 100644 --- a/tests/e2e/test-register-ingest.py +++ b/tests/e2e/test-register.py @@ -5,7 +5,7 @@ from google.protobuf.duration_pb2 import Duration from feast.client import Client -from feast.data_source import DataSource, FileOptions +from feast.data_source import FileSource, KafkaSource from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -47,6 +47,46 @@ def driver_entity(): ) +@pytest.fixture +def basic_featuretable(): + batch_source = FileSource( + type="BATCH_FILE", + field_mapping={ + "dev_entity": "dev_entity_field", + "dev_feature_float": "dev_feature_float_field", + "dev_feature_string": "dev_feature_string_field", + }, + file_format="PARQUET", + file_url="gs://example/feast/*", + timestamp_column="datetime_col", + date_partition_column="datetime", + ) + stream_source = KafkaSource( + type="STREAM_KAFKA", + field_mapping={ + "dev_entity": "dev_entity_field", + "dev_feature_float": "dev_feature_float_field", + "dev_feature_string": "dev_feature_string_field", + }, + bootstrap_servers="localhost:9094", + class_path="random/path/to/class", + topic="test_topic", + timestamp_column="datetime_col", + ) + return FeatureTable( + name="basic_featuretable", + entities=["driver_id", "customer_id"], + features=[ + Feature(name="dev_feature_float", dtype=ValueType.FLOAT), + Feature(name="dev_feature_string", dtype=ValueType.STRING), + ], + max_age=Duration(seconds=3600), + batch_source=batch_source, + stream_source=stream_source, + labels={"key1": "val1", "key2": "val2"}, + ) + + @pytest.fixture def alltypes_entity(): return Entity( @@ -59,13 +99,14 @@ def alltypes_entity(): @pytest.fixture def alltypes_featuretable(): - batch_source = DataSource( + batch_source = FileSource( type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", }, - options=FileOptions(file_format="parquet", file_url="file://feast/*"), + 
file_format="parquet", + file_url="file://feast/*", timestamp_column="ts_col", date_partition_column="date_partition_col", ) @@ -94,8 +135,12 @@ def alltypes_featuretable(): ) -def test_get_list_basic(client: Client, customer_entity: Entity, driver_entity: Entity): - basic_ft_spec = FeatureTable.from_yaml(f"{DIR_PATH}/specifications/dev_ft.yaml") +def test_get_list_basic( + client: Client, + customer_entity: Entity, + driver_entity: Entity, + basic_featuretable: FeatureTable, +): # ApplyEntity client.apply_entity(customer_entity) @@ -117,17 +162,17 @@ def test_get_list_basic(client: Client, customer_entity: Entity, driver_entity: assert len(actual_matchmaking_entities) == 1 # ApplyFeatureTable - client.apply_feature_table(basic_ft_spec) + client.apply_feature_table(basic_featuretable) # GetFeatureTable Check - actual_get_feature_table = client.get_feature_table(name="dev_featuretable") - assert actual_get_feature_table == basic_ft_spec + actual_get_feature_table = client.get_feature_table(name="basic_featuretable") + assert actual_get_feature_table == basic_featuretable # ListFeatureTables Check actual_list_feature_table = [ - ft for ft in client.list_feature_tables() if ft.name == "dev_featuretable" + ft for ft in client.list_feature_tables() if ft.name == "basic_featuretable" ][0] - assert actual_list_feature_table == basic_ft_spec + assert actual_list_feature_table == basic_featuretable def test_get_list_alltypes( From a3d5d94192e69e52a13b55e4c4a27402ffab1f61 Mon Sep 17 00:00:00 2001 From: Terence Date: Tue, 6 Oct 2020 11:43:26 +0800 Subject: [PATCH 17/34] Fix data partitioning for ingest method Signed-off-by: Terence --- sdk/python/feast/client.py | 58 ++++++++++++++---------- sdk/python/feast/loaders/ingest.py | 72 ++++++++++++++++++++++++++---- 2 files changed, 99 insertions(+), 31 deletions(-) diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 283f878074..4da2854734 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import datetime import logging import multiprocessing import os @@ -69,6 +68,7 @@ from feast.loaders.ingest import ( BATCH_INGESTION_PRODUCTION_TIMEOUT, _check_field_mappings, + _partition_by_date, ) from feast.serving.ServingService_pb2 import GetFeastServingInfoRequest from feast.serving.ServingService_pb2_grpc import ServingServiceStub @@ -634,10 +634,6 @@ def ingest( if isinstance(feature_table, FeatureTable): name = feature_table.name - dir_path, dest_path, column_names = _read_table_from_source( - source, chunk_size, max_workers - ) - fetched_feature_table: Optional[FeatureTable] = self.get_feature_table( name, project ) @@ -646,6 +642,10 @@ def ingest( else: raise Exception(f"FeatureTable, {name} cannot be found.") + dir_path, dest_path, column_names = _read_table_from_source( + source, chunk_size, max_workers + ) + # Check 1) Only parquet file format for FeatureTable batch source is supported if ( feature_table.batch_source @@ -662,7 +662,14 @@ def ingest( # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table _check_field_mappings( - column_names, name, feature_table.batch_source.field_mapping + column_names, + name, + feature_table.batch_source.timestamp_column, + feature_table.batch_source.field_mapping, + ) + # Partition dataset by date + date_partition_dest_path = _partition_by_date( + column_names, feature_table, dest_path, ) batch_source_type = SourceType(feature_table.batch_source.type).name @@ -675,36 +682,41 @@ def ingest( uri = urlparse(file_url) staging_client = get_staging_client(uri.scheme) - file_name = dest_path.split("/")[-1] - date_today = datetime.datetime.today().strftime("%Y-%m-%d") - - staging_client.upload_file( - dest_path, - uri.hostname, - str(uri.path).strip("/") + "/" + f"date={date_today}/" + file_name, - ) + file_paths = list() + for (dirpath, dirnames, filenames) in os.walk(date_partition_dest_path): + file_paths += [os.path.join(dirpath, file) for file in filenames] + for path in file_paths: + file_name = path.split("/")[-1] + partition_date = path.split("/")[-2].split("=")[-1] + staging_client.upload_file( + path, + uri.hostname, + str(uri.path).strip("/") + + "/" + + f"date={partition_date}/" + + file_name, + ) if batch_source_type == "BATCH_BIGQUERY": from google.cloud import bigquery bq_table_ref = feature_table.batch_source.bigquery_options.table_ref gcp_project, dataset_table = bq_table_ref.split(":") - dataset, table = dataset_table.split(".") client = bigquery.Client(project=gcp_project) - table_ref = client.dataset(dataset).table(table) + bq_table_ref = bq_table_ref.replace(":", ".") + table = bigquery.table.Table(bq_table_ref) + job_config = bigquery.LoadJobConfig() job_config.source_format = bigquery.SourceFormat.PARQUET - # Check for date partitioning column in FeatureTable spec - if feature_table.batch_source.date_partition_column: - time_partitioning_obj = bigquery.table.TimePartitioning( - field=feature_table.batch_source.date_partition_column - ) - job_config.time_partitioning = time_partitioning_obj + time_partitioning_obj = bigquery.table.TimePartitioning( + field=feature_table.batch_source.timestamp_column + ) + job_config.time_partitioning = time_partitioning_obj with open(dest_path, "rb") as source_file: client.load_table_from_file( - source_file, table_ref, job_config=job_config + source_file, table, job_config=job_config ) finally: # Remove parquet file(s) that were created earlier diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 
14ed290f54..a6407f6e15 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -1,5 +1,12 @@ +import tempfile from typing import Dict, List +import pandas as pd +import pyarrow as pa +from pyarrow import parquet as pq + +from feast.feature_table import FeatureTable + GRPC_CONNECTION_TIMEOUT_DEFAULT = 3 # type: int GRPC_CONNECTION_TIMEOUT_APPLY = 300 # type: int FEAST_SERVING_URL_ENV_KEY = "FEAST_SERVING_URL" # type: str @@ -11,21 +18,22 @@ def _check_field_mappings( column_names: List[str], feature_table_name: str, + feature_table_timestamp_column: str, feature_table_field_mappings: Dict[str, str], ) -> None: """ - Checks that all specified field mappings in FeatureTable can be found in - column names of specified ingestion source. + Checks that all specified field mappings in FeatureTable can be found in + column names of specified ingestion source. - Args: - column_names: Column names in provided ingestion source - feature_table_name: Name of FeatureTable - feature_table_field_mappings: Field mappings of FeatureTable + Args: + column_names: Column names in provided ingestion source + feature_table_name: Name of FeatureTable + feature_table_field_mappings: Field mappings of FeatureTable """ - if "datetime" not in column_names: + if feature_table_timestamp_column not in column_names: raise ValueError( - f'Provided data source does not contain entity "datetime" in columns {column_names}' + f"Provided data source does not contain timestamp column {feature_table_timestamp_column} in columns {column_names}" ) specified_field_mappings = list() @@ -39,3 +47,51 @@ def _check_field_mappings( f"Provided data source does not contain all field mappings previously " f"defined for FeatureTable, {feature_table_name}." ) + + +def _partition_by_date( + column_names: List[str], feature_table: FeatureTable, file_path: str, +) -> str: + """ + Partitions dataset by date based on timestamp_column. + Assumes date_partition_column is in date format if provided. + + Args: + column_names: Column names in provided ingestion source + feature_table: FeatureTable + file_path: File path to existing parquet file that's not yet partitioned + + Returns: + str: + Root directory which contains date partitioned files. 
+ """ + df = pd.read_parquet(file_path) + # Date-partitioned dataset temp path + dir_path = tempfile.mkdtemp() + + # Case: date_partition_column is provided and dataset contains it + if ( + feature_table.batch_source.date_partition_column + and feature_table.batch_source.date_partition_column in column_names + ): + table = pa.Table.from_pandas(df) + pq.write_to_dataset( + table=table, + root_path=dir_path, + partition_cols=[feature_table.batch_source.date_partition_column], + ) + return dir_path + + # Case: date_partition_column is provided and dataset does not contain it + if feature_table.batch_source.date_partition_column: + feast_partition_col = feature_table.batch_source.date_partition_column + else: + feast_partition_col = "feast_partition_col" + + df[feast_partition_col] = df[feature_table.batch_source.timestamp_column].dt.date + table = pa.Table.from_pandas(df) + pq.write_to_dataset( + table=table, root_path=dir_path, partition_cols=[feast_partition_col] + ) + + return dir_path From 6ce6e4169e3569dd5f8c554a5c800407bbc4a923 Mon Sep 17 00:00:00 2001 From: Terence Date: Tue, 6 Oct 2020 16:33:07 +0800 Subject: [PATCH 18/34] Cleanup date partition logic and add ingest test Signed-off-by: Terence --- sdk/python/feast/client.py | 46 +++++++---- sdk/python/feast/loaders/ingest.py | 33 +++----- sdk/python/feast/staging/storage_client.py | 8 +- sdk/python/tests/test_client.py | 90 ++++++++++++++++++++++ 4 files changed, 139 insertions(+), 38 deletions(-) diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 4da2854734..b4e060f607 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -668,9 +668,14 @@ def ingest( feature_table.batch_source.field_mapping, ) # Partition dataset by date - date_partition_dest_path = _partition_by_date( - column_names, feature_table, dest_path, - ) + date_partition_dest_path = None + if feature_table.batch_source.date_partition_column: + date_partition_dest_path = _partition_by_date( + column_names, + feature_table.batch_source.date_partition_column, + feature_table.batch_source.timestamp_column, + dest_path, + ) batch_source_type = SourceType(feature_table.batch_source.type).name @@ -682,19 +687,32 @@ def ingest( uri = urlparse(file_url) staging_client = get_staging_client(uri.scheme) - file_paths = list() - for (dirpath, dirnames, filenames) in os.walk(date_partition_dest_path): - file_paths += [os.path.join(dirpath, file) for file in filenames] - for path in file_paths: - file_name = path.split("/")[-1] - partition_date = path.split("/")[-2].split("=")[-1] + if date_partition_dest_path is not None: + file_paths = list() + for (dirpath, dirnames, filenames) in os.walk( + date_partition_dest_path + ): + file_paths += [ + os.path.join(dirpath, file) for file in filenames + ] + for path in file_paths: + file_name = path.split("/")[-1] + partition_col = path.split("/")[-2] + staging_client.upload_file( + path, + uri.hostname, + str(uri.path).strip("/") + + "/" + + partition_col + + "/" + + file_name, + ) + else: + file_name = dest_path.split("/")[-1] staging_client.upload_file( - path, + dest_path, uri.hostname, - str(uri.path).strip("/") - + "/" - + f"date={partition_date}/" - + file_name, + str(uri.path).strip("/") + "/" + file_name, ) if batch_source_type == "BATCH_BIGQUERY": from google.cloud import bigquery diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index a6407f6e15..f817a27cd3 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -5,8 +5,6 
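Before the new ingest test below, a short sketch (assumed paths and column names, not taken from the patch) of how a dataset written with write_to_dataset reads back: pq.read_table walks the partition folders and returns the partition column as string/categorical values, which is why the test re-parses that column with pd.to_datetime before comparing frames.

# Minimal sketch (assumptions only): reading back a date-partitioned parquet
# dataset that was written with pq.write_to_dataset.
import pandas as pd
from pyarrow import parquet as pq

pq_df = pq.read_table("feast/").to_pandas()  # collects every date_partition_col=*/ file
# Partition values come back as strings/categories, so normalise the dtype
# before comparing against the original frame.
pq_df["date_partition_col"] = pd.to_datetime(
    pq_df["date_partition_col"].astype(str), utc=True
)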
@@ import pyarrow as pa from pyarrow import parquet as pq -from feast.feature_table import FeatureTable - GRPC_CONNECTION_TIMEOUT_DEFAULT = 3 # type: int GRPC_CONNECTION_TIMEOUT_APPLY = 300 # type: int FEAST_SERVING_URL_ENV_KEY = "FEAST_SERVING_URL" # type: str @@ -50,7 +48,10 @@ def _check_field_mappings( def _partition_by_date( - column_names: List[str], feature_table: FeatureTable, file_path: str, + column_names: List[str], + feature_table_date_partition_column: str, + feature_table_timestamp_column: str, + file_path: str, ) -> str: """ Partitions dataset by date based on timestamp_column. @@ -69,29 +70,17 @@ def _partition_by_date( # Date-partitioned dataset temp path dir_path = tempfile.mkdtemp() - # Case: date_partition_column is provided and dataset contains it - if ( - feature_table.batch_source.date_partition_column - and feature_table.batch_source.date_partition_column in column_names - ): - table = pa.Table.from_pandas(df) - pq.write_to_dataset( - table=table, - root_path=dir_path, - partition_cols=[feature_table.batch_source.date_partition_column], - ) - return dir_path - # Case: date_partition_column is provided and dataset does not contain it - if feature_table.batch_source.date_partition_column: - feast_partition_col = feature_table.batch_source.date_partition_column - else: - feast_partition_col = "feast_partition_col" + if feature_table_date_partition_column not in column_names: + df[feature_table_date_partition_column] = df[ + feature_table_timestamp_column + ].dt.date - df[feast_partition_col] = df[feature_table.batch_source.timestamp_column].dt.date table = pa.Table.from_pandas(df) pq.write_to_dataset( - table=table, root_path=dir_path, partition_cols=[feast_partition_col] + table=table, + root_path=dir_path, + partition_cols=[feature_table_date_partition_column], ) return dir_path diff --git a/sdk/python/feast/staging/storage_client.py b/sdk/python/feast/staging/storage_client.py index 3b391410b2..a10558b38c 100644 --- a/sdk/python/feast/staging/storage_client.py +++ b/sdk/python/feast/staging/storage_client.py @@ -14,7 +14,9 @@ # limitations under the License. +import os import re +import shutil from abc import ABC, ABCMeta, abstractmethod from tempfile import TemporaryFile from typing import List @@ -227,8 +229,10 @@ def download_file(self, uri: ParseResult) -> IO[bytes]: def list_files(self, bucket: str, path: str) -> List[str]: raise NotImplementedError("list files not implemented for Local file") - def upload_file(self, local_path: str, bucket: str, remote_path: str): - pass # For test cases + def upload_file(self, local_path: str, folder: str, remote_path: str): + dest_fpath = os.path.join(folder + "/" + remote_path) + os.makedirs(os.path.dirname(dest_fpath), exist_ok=True) + shutil.copy(local_path, dest_fpath) storage_clients = {GS: GCSClient, S3: S3Client, LOCAL_FILE: LocalFSClient} diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index cd83d10fcd..b38b1102e3 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -11,15 +11,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os import pkgutil import socket from concurrent import futures +from datetime import datetime, timedelta from unittest import mock import grpc +import numpy as np +import pandas as pd import pytest +import pytz from google.protobuf.duration_pb2 import Duration from mock import MagicMock, patch +from pandas.util.testing import assert_frame_equal +from pyarrow import parquet as pq from pytest_lazyfixture import lazy_fixture from feast.client import Client @@ -628,6 +635,89 @@ def test_apply_feature_table_success(self, test_client): and feature_tables[0].entities[0] == "fs1-my-entity-1" ) + @pytest.mark.parametrize( + "mocked_client", [lazy_fixture("mock_client")], + ) + def test_ingest(self, mocked_client, mocker): + mocked_client._core_service_stub = Core.CoreServiceStub( + grpc.insecure_channel("") + ) + + N_ROWS = 100 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + final_offset = ( + [time_offset] * 33 + + [time_offset - timedelta(days=1)] * 33 + + [time_offset - timedelta(days=2)] * 34 + ) + final_part_offset = ( + [time_offset - timedelta(days=99)] * 33 + + [time_offset - timedelta(days=100)] * 33 + + [time_offset - timedelta(days=101)] * 34 + ) + ft_df = pd.DataFrame( + { + "datetime": final_offset, + "datetime_col": final_part_offset, + "dev_feature_float": [np.float(row) for row in range(N_ROWS)], + "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], + } + ) + + mocker.patch.object( + mocked_client._core_service_stub, + "GetFeatureTable", + return_value=GetFeatureTableResponse( + table=FeatureTableProto( + spec=FeatureTableSpecProto( + name="ingest_featuretable", + max_age=Duration(seconds=3600), + features=[ + FeatureSpecProto( + name="dev_feature_float", + value_type=ValueProto.ValueType.FLOAT, + ), + FeatureSpecProto( + name="dev_feature_string", + value_type=ValueProto.ValueType.STRING, + ), + ], + entities=["dev_entity"], + batch_source=DataSourceProto( + type="BATCH_FILE", + field_mapping={ + "dev_feature_float": "dev_feature_float", + "dev_feature_string": "dev_feature_string", + }, + file_options=DataSourceProto.FileOptions( + file_format="parquet", file_url="file://feast/*" + ), + timestamp_column="datetime", + date_partition_column="datetime_col", + ), + ), + meta=FeatureTableMetaProto(), + ) + ), + ) + + mocked_client.set_project("my_project") + ft = mocked_client.get_feature_table("ingest_featuretable") + mocked_client.ingest(ft, ft_df, timeout=600) + + dest_fpath = os.path.join("feast/") + pq_df = pq.read_table(dest_fpath).to_pandas() + + ft_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) + ft_df = ft_df.reindex(sorted(ft_df.columns), axis=1) + ft_df.reset_index(drop=True, inplace=True) + pq_df.reset_index(drop=True, inplace=True) + pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") + + assert_frame_equal(ft_df, pq_df) + @patch("grpc.channel_ready_future") def test_secure_channel_creation_with_secure_client( self, _mocked_obj, core_server, serving_server From 4ec077cac6a93f1d3777763c3a0362e8324a69f1 Mon Sep 17 00:00:00 2001 From: Terence Date: Tue, 6 Oct 2020 17:30:24 +0800 Subject: [PATCH 19/34] Remove type field from datasource classes Signed-off-by: Terence --- sdk/python/feast/client.py | 17 +++++------ sdk/python/feast/data_source.py | 41 ++++++-------------------- sdk/python/feast/feature_table.py | 21 +++---------- sdk/python/tests/test_client.py | 2 -- 
sdk/python/tests/test_feature_table.py | 2 -- tests/e2e/test-register.py | 3 -- 6 files changed, 21 insertions(+), 65 deletions(-) diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index b4e060f607..7f502afe19 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -60,7 +60,7 @@ ListProjectsResponse, ) from feast.core.CoreService_pb2_grpc import CoreServiceStub -from feast.data_source import SourceType +from feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.entity import Entity from feast.feature_table import FeatureTable from feast.grpc import auth as feast_auth @@ -649,7 +649,7 @@ def ingest( # Check 1) Only parquet file format for FeatureTable batch source is supported if ( feature_table.batch_source - and SourceType(feature_table.batch_source.type).name == "BATCH_FILE" + and feature_table.batch_source.type == DataSourceProto.BATCH_FILE and "".join( feature_table.batch_source.file_options.file_format.split() ).lower() @@ -677,10 +677,11 @@ def ingest( dest_path, ) - batch_source_type = SourceType(feature_table.batch_source.type).name - try: - if batch_source_type == "BATCH_FILE": + if ( + feature_table.batch_source.file_options.file_format + and feature_table.batch_source.file_options.file_url + ): from urllib.parse import urlparse file_url = feature_table.batch_source.file_options.file_url[:-1] @@ -714,7 +715,7 @@ def ingest( uri.hostname, str(uri.path).strip("/") + "/" + file_name, ) - if batch_source_type == "BATCH_BIGQUERY": + if feature_table.batch_source.bigquery_options.table_ref: from google.cloud import bigquery bq_table_ref = feature_table.batch_source.bigquery_options.table_ref @@ -741,9 +742,7 @@ def ingest( print("Removing temporary file(s)...") shutil.rmtree(dir_path) - print( - f"Data has been successfully ingested into FeatureTable {batch_source_type} batch source." 
- ) + print("Data has been successfully ingested into FeatureTable batch source.") def _get_grpc_metadata(self): """ diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 38e9e695e0..35954b8ca3 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -352,30 +352,14 @@ class DataSource: def __init__( self, - type: str, field_mapping: Dict[str, str], timestamp_column: str, date_partition_column: Optional[str] = "", ): - self._type = type self._field_mapping = field_mapping self._timestamp_column = timestamp_column self._date_partition_column = date_partition_column - @property - def type(self): - """ - Returns the type of this data source - """ - return self._type - - @type.setter - def type(self, type): - """ - Sets the type of this data source - """ - self._type = type - @property def field_mapping(self): """ @@ -435,14 +419,13 @@ def to_proto(self) -> DataSourceProto: class FileSource(DataSource): def __init__( self, - type, field_mapping, timestamp_column, file_format, file_url, date_partition_column="", ): - super().__init__(type, field_mapping, timestamp_column, date_partition_column) + super().__init__(field_mapping, timestamp_column, date_partition_column) self._file_options = FileOptions(file_format=file_format, file_url=file_url) @property @@ -462,7 +445,6 @@ def file_options(self, file_options): def from_proto(cls, data_source_proto): data_source = cls( - type=data_source_proto.type, field_mapping=data_source_proto.field_mapping, file_options=cls.file_options, timestamp_column=data_source_proto.timestamp_column, @@ -473,7 +455,7 @@ def from_proto(cls, data_source_proto): def to_proto(self) -> DataSourceProto: data_source_proto = DataSourceProto( - type=self.type, + type=DataSourceProto.BATCH_FILE, field_mapping=self.field_mapping, file_options=self.file_options.to_proto(), ) @@ -486,9 +468,9 @@ def to_proto(self) -> DataSourceProto: class BigQuerySource(DataSource): def __init__( - self, type, field_mapping, timestamp_column, table_ref, date_partition_column="" + self, field_mapping, timestamp_column, table_ref, date_partition_column="" ): - super().__init__(type, field_mapping, timestamp_column, date_partition_column) + super().__init__(field_mapping, timestamp_column, date_partition_column) self._bigquery_options = BigQueryOptions(table_ref=table_ref,) @property @@ -508,7 +490,6 @@ def bigquery_options(self, bigquery_options): def from_proto(cls, data_source_proto): data_source = cls( - type=data_source_proto.type, field_mapping=data_source_proto.field_mapping, bigquery_options=cls.bigquery_options, timestamp_column=data_source_proto.timestamp_column, @@ -519,7 +500,7 @@ def from_proto(cls, data_source_proto): def to_proto(self) -> DataSourceProto: data_source_proto = DataSourceProto( - type=self.type, + type=DataSourceProto.BATCH_BIGQUERY, field_mapping=self.field_mapping, bigquery_options=self.bigquery_options.to_proto(), ) @@ -533,7 +514,6 @@ def to_proto(self) -> DataSourceProto: class KafkaSource(DataSource): def __init__( self, - type, field_mapping, timestamp_column, bootstrap_servers, @@ -541,7 +521,7 @@ def __init__( topic, date_partition_column="", ): - super().__init__(type, field_mapping, timestamp_column, date_partition_column) + super().__init__(field_mapping, timestamp_column, date_partition_column) self._kafka_options = KafkaOptions( bootstrap_servers=bootstrap_servers, class_path=class_path, topic=topic ) @@ -563,7 +543,6 @@ def kafka_options(self, kafka_options): def from_proto(cls, 
data_source_proto): data_source = cls( - type=data_source_proto.type, field_mapping=data_source_proto.field_mapping, kafka_options=cls.kafka_options, timestamp_column=data_source_proto.timestamp_column, @@ -574,7 +553,7 @@ def from_proto(cls, data_source_proto): def to_proto(self) -> DataSourceProto: data_source_proto = DataSourceProto( - type=self.type, + type=DataSourceProto.STREAM_KAFKA, field_mapping=self.field_mapping, kafka_options=self.kafka_options.to_proto(), ) @@ -588,7 +567,6 @@ def to_proto(self) -> DataSourceProto: class KinesisSource(DataSource): def __init__( self, - type, field_mapping, timestamp_column, class_path, @@ -596,7 +574,7 @@ def __init__( stream_name, date_partition_column="", ): - super().__init__(type, field_mapping, timestamp_column, date_partition_column) + super().__init__(field_mapping, timestamp_column, date_partition_column) self._kinesis_options = KinesisOptions( class_path=class_path, region=region, stream_name=stream_name ) @@ -618,7 +596,6 @@ def kinesis_options(self, kinesis_options): def from_proto(cls, data_source_proto): data_source = cls( - type=data_source_proto.type, field_mapping=data_source_proto.field_mapping, kinesis_options=cls.kinesis_options, timestamp_column=data_source_proto.timestamp_column, @@ -629,7 +606,7 @@ def from_proto(cls, data_source_proto): def to_proto(self) -> DataSourceProto: data_source_proto = DataSourceProto( - type=self.type, + type=DataSourceProto.STREAM_KINESIS, field_mapping=self.field_mapping, kinesis_options=self.kinesis_options.to_proto(), ) diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index b1f722c40d..1448791627 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -30,7 +30,6 @@ FileSource, KafkaSource, KinesisSource, - SourceType, ) from feast.feature import Feature from feast.loaders import yaml as feast_yaml @@ -268,37 +267,27 @@ def _to_data_source(cls, data_source): Convert dict to data source. 
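For illustration, a minimal sketch of what declaring a batch source looks like once the explicit type argument is gone; the keyword names follow the FileSource signature in this patch, the file URL is a made-up example, and the proto enum is assumed to be pinned inside to_proto() as in the hunks above:

    from feast.core.DataSource_pb2 import DataSource as DataSourceProto
    from feast.data_source import FileSource

    # No type= argument any more; using FileSource itself implies BATCH_FILE.
    batch_source = FileSource(
        field_mapping={"ride_distance": "ride_distance"},
        timestamp_column="ts_col",
        file_format="parquet",
        file_url="file://feast/*",
    )

    assert batch_source.to_proto().type == DataSourceProto.BATCH_FILE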
""" - source_type = SourceType(data_source.type).name - - if ( - source_type == "BATCH_FILE" - and data_source.file_options.file_format - and data_source.file_options.file_url - ): + if data_source.file_options.file_format and data_source.file_options.file_url: data_source_proto = FileSource( - type=data_source.type, field_mapping=data_source.field_mapping, file_format=data_source.file_options.file_format, file_url=data_source.file_options.file_url, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, ).to_proto() - elif source_type == "BATCH_BIGQUERY" and data_source.bigquery_options.table_ref: + elif data_source.bigquery_options.table_ref: data_source_proto = BigQuerySource( - type=data_source.type, field_mapping=data_source.field_mapping, table_ref=data_source.bigquery_options.table_ref, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, ).to_proto() elif ( - source_type == "STREAM_KAFKA" - and data_source.kafka_options.bootstrap_servers + data_source.kafka_options.bootstrap_servers and data_source.kafka_options.topic and data_source.kafka_options.class_path ): data_source_proto = KafkaSource( - type=data_source.type, field_mapping=data_source.field_mapping, bootstrap_servers=data_source.kafka_options.bootstrap_servers, class_path=data_source.kafka_options.class_path, @@ -307,13 +296,11 @@ def _to_data_source(cls, data_source): date_partition_column=data_source.date_partition_column, ).to_proto() elif ( - source_type == "STREAM_KINESIS" - and data_source.kinesis_options.class_path + data_source.kinesis_options.class_path and data_source.kinesis_options.region and data_source.kinesis_options.stream_name ): data_source_proto = KinesisSource( - type=data_source.type, field_mapping=data_source.field_mapping, class_path=data_source.kinesis_options.class_path, region=data_source.kinesis_options.region, diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index b38b1102e3..70ba3fc22a 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -576,7 +576,6 @@ def test_apply_feature_table_success(self, test_client): # Create Feature Tables batch_source = FileSource( - type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -588,7 +587,6 @@ def test_apply_feature_table_success(self, test_client): ) stream_source = KafkaSource( - type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", diff --git a/sdk/python/tests/test_feature_table.py b/sdk/python/tests/test_feature_table.py index d4085b61e0..7a50b7e58f 100644 --- a/sdk/python/tests/test_feature_table.py +++ b/sdk/python/tests/test_feature_table.py @@ -55,7 +55,6 @@ def client(self, server): def test_feature_table_import_export_yaml(self): batch_source = FileSource( - type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", @@ -67,7 +66,6 @@ def test_feature_table_import_export_yaml(self): ) stream_source = KafkaSource( - type="STREAM_KAFKA", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", diff --git a/tests/e2e/test-register.py b/tests/e2e/test-register.py index c587a15879..665936eb5d 100644 --- a/tests/e2e/test-register.py +++ b/tests/e2e/test-register.py @@ -50,7 +50,6 @@ def driver_entity(): @pytest.fixture def basic_featuretable(): batch_source = FileSource( - type="BATCH_FILE", field_mapping={ "dev_entity": 
"dev_entity_field", "dev_feature_float": "dev_feature_float_field", @@ -62,7 +61,6 @@ def basic_featuretable(): date_partition_column="datetime", ) stream_source = KafkaSource( - type="STREAM_KAFKA", field_mapping={ "dev_entity": "dev_entity_field", "dev_feature_float": "dev_feature_float_field", @@ -100,7 +98,6 @@ def alltypes_entity(): @pytest.fixture def alltypes_featuretable(): batch_source = FileSource( - type="BATCH_FILE", field_mapping={ "ride_distance": "ride_distance", "ride_duration": "ride_duration", From e6cb88f0d9d9b68713b53e142ac2d2336a6fad21 Mon Sep 17 00:00:00 2001 From: Terence Date: Tue, 6 Oct 2020 17:37:13 +0800 Subject: [PATCH 20/34] Remove not so useful test Signed-off-by: Terence --- sdk/python/tests/test_client.py | 226 -------------------------------- 1 file changed, 226 deletions(-) diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index 70ba3fc22a..a8c9a71fd4 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -32,16 +32,10 @@ from feast.client import Client from feast.core import CoreService_pb2_grpc as Core from feast.core.CoreService_pb2 import ( - GetEntityResponse, GetFeastCoreVersionResponse, GetFeatureTableResponse, - ListEntitiesResponse, - ListFeatureTablesResponse, ) from feast.core.DataSource_pb2 import DataSource as DataSourceProto -from feast.core.Entity_pb2 import Entity as EntityProto -from feast.core.Entity_pb2 import EntityMeta as EntityMetaProto -from feast.core.Entity_pb2 import EntitySpecV2 as EntitySpecProto from feast.core.Feature_pb2 import FeatureSpecV2 as FeatureSpecProto from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto @@ -319,91 +313,6 @@ def test_get_online_features(self, mocked_client, auth_metadata, mocker): def test_get_historical_features(self, mocked_client, mocker): assert 1 == 1 - @pytest.mark.parametrize( - "mocked_client", - [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], - ) - def test_get_entity(self, mocked_client, mocker): - mocked_client._core_service_stub = Core.CoreServiceStub( - grpc.insecure_channel("") - ) - - entity_proto = EntityProto( - spec=EntitySpecProto( - name="driver_car_id", - description="Car driver id", - value_type=ValueProto.ValueType.STRING, - labels={"key1": "val1", "key2": "val2"}, - ), - meta=EntityMetaProto(), - ) - - mocker.patch.object( - mocked_client._core_service_stub, - "GetEntity", - return_value=GetEntityResponse(entity=entity_proto), - ) - mocked_client.set_project("my_project") - entity = mocked_client.get_entity("my_entity") - - assert ( - entity.name == "driver_car_id" - and entity.description == "Car driver id" - and entity.value_type == ValueType(ValueProto.ValueType.STRING).name - and "key1" in entity.labels - and entity.labels["key1"] == "val1" - ) - - @pytest.mark.parametrize( - "mocked_client", - [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], - ) - def test_list_entities(self, mocked_client, mocker): - mocker.patch.object( - mocked_client, - "_core_service_stub", - return_value=Core.CoreServiceStub(grpc.insecure_channel("")), - ) - - entity_1_proto = EntityProto( - spec=EntitySpecProto( - name="driver_car_id", - description="Car driver id", - value_type=ValueProto.ValueType.INT64, - labels={"key1": "val1", "key2": "val2"}, - ) - ) - entity_2_proto = EntityProto( - spec=EntitySpecProto( - name="driver_ride_id", - description="Ride driver id", - 
value_type=ValueProto.ValueType.STRING, - labels={"key3": "val3", "key4": "val4"}, - ) - ) - - mocker.patch.object( - mocked_client._core_service_stub, - "ListEntities", - return_value=ListEntitiesResponse( - entities=[entity_1_proto, entity_2_proto] - ), - ) - - entities = mocked_client.list_entities(labels={"key1": "val1"}) - assert len(entities) == 2 - - entity = entities[1] - assert ( - entity.name == "driver_ride_id" - and entity.description == "Ride driver id" - and entity.value_type == ValueType(ValueProto.ValueType.STRING).name - and "key3" in entity.labels - and entity.labels["key3"] == "val3" - and "key4" in entity.labels - and entity.labels["key4"] == "val4" - ) - @pytest.mark.parametrize( "test_client", [lazy_fixture("client"), lazy_fixture("secure_client")], ) @@ -432,141 +341,6 @@ def test_apply_entity_success(self, test_client): and entity.labels["team"] == "matchmaking" ) - @pytest.mark.parametrize( - "mocked_client", - [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], - ) - def test_get_feature_table(self, mocked_client, mocker): - mocked_client._core_service_stub = Core.CoreServiceStub( - grpc.insecure_channel("") - ) - - from google.protobuf.duration_pb2 import Duration - - mocker.patch.object( - mocked_client._core_service_stub, - "GetFeatureTable", - return_value=GetFeatureTableResponse( - table=FeatureTableProto( - spec=FeatureTableSpecProto( - name="my_feature_table", - max_age=Duration(seconds=3600), - labels={"key1": "val1", "key2": "val2"}, - features=[ - FeatureSpecProto( - name="my_feature_1", - value_type=ValueProto.ValueType.FLOAT, - ), - FeatureSpecProto( - name="my_feature_2", - value_type=ValueProto.ValueType.FLOAT, - ), - ], - entities=["my_entity_1"], - batch_source=DataSourceProto( - type="BATCH_FILE", - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, - file_options=DataSourceProto.FileOptions( - file_format="avro", file_url="data/test.avro" - ), - timestamp_column="ts_col", - date_partition_column="date_partition_col", - ), - ), - meta=FeatureTableMetaProto(), - ) - ), - ) - mocked_client.set_project("my_project") - feature_table = mocked_client.get_feature_table("my_feature_table") - - assert ( - feature_table.name == "my_feature_table" - and "key1" in feature_table.labels - and feature_table.labels["key1"] == "val1" - and "key2" in feature_table.labels - and feature_table.labels["key2"] == "val2" - and len(feature_table.features) == 2 - and len(feature_table.entities) == 1 - ) - - @pytest.mark.parametrize( - "mocked_client", - [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], - ) - def test_list_feature_tables(self, mocked_client, mocker): - mocker.patch.object( - mocked_client, - "_core_service_stub", - return_value=Core.CoreServiceStub(grpc.insecure_channel("")), - ) - - batch_source = DataSourceProto( - type="BATCH_FILE", - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, - file_options=DataSourceProto.FileOptions( - file_format="avro", file_url="data/test.avro" - ), - timestamp_column="ts_col", - date_partition_column="date_partition_col", - ) - - feature_table_1_proto = FeatureTableProto( - spec=FeatureTableSpecProto( - name="driver_car", - max_age=Duration(seconds=3600), - labels={"key1": "val1", "key2": "val2"}, - features=[ - FeatureSpecProto( - name="feature_1", value_type=ValueProto.ValueType.FLOAT - ) - ], - entities=["driver_car_id"], - batch_source=batch_source, - ) - ) - feature_table_2_proto = FeatureTableProto( - 
spec=FeatureTableSpecProto( - name="driver_ride", - max_age=Duration(seconds=3600), - labels={"key1": "val1"}, - features=[ - FeatureSpecProto( - name="feature_1", value_type=ValueProto.ValueType.FLOAT - ) - ], - entities=["driver_ride_id"], - batch_source=batch_source, - ) - ) - - mocker.patch.object( - mocked_client._core_service_stub, - "ListFeatureTables", - return_value=ListFeatureTablesResponse( - tables=[feature_table_1_proto, feature_table_2_proto] - ), - ) - - feature_tables = mocked_client.list_feature_tables(labels={"key1": "val1"}) - assert len(feature_tables) == 2 - - feature_table = feature_tables[0] - assert ( - feature_table.name == "driver_car" - and "key1" in feature_table.labels - and feature_table.labels["key1"] == "val1" - and "key2" in feature_table.labels - and feature_table.labels["key2"] == "val2" - and len(feature_table.features) == 1 - ) - @pytest.mark.parametrize( "test_client", [lazy_fixture("client"), lazy_fixture("secure_client")], ) From f09c65e9746202d79c6f058181917bcbe16d0a02 Mon Sep 17 00:00:00 2001 From: Terence Date: Tue, 6 Oct 2020 22:12:31 +0800 Subject: [PATCH 21/34] Address PR comments Signed-off-by: Terence --- sdk/python/feast/data_source.py | 52 ++++++++++++++++-------------- sdk/python/feast/feature_table.py | 8 ++--- sdk/python/feast/loaders/ingest.py | 1 + sdk/python/tests/test_client.py | 12 ------- tests/e2e/test-register.py | 4 --- 5 files changed, 33 insertions(+), 44 deletions(-) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 35954b8ca3..21014c8008 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -352,12 +352,12 @@ class DataSource: def __init__( self, - field_mapping: Dict[str, str], timestamp_column: str, + field_mapping: Optional[Dict[str, str]] = dict(), date_partition_column: Optional[str] = "", ): - self._field_mapping = field_mapping self._timestamp_column = timestamp_column + self._field_mapping = field_mapping self._date_partition_column = date_partition_column @property @@ -419,13 +419,13 @@ def to_proto(self) -> DataSourceProto: class FileSource(DataSource): def __init__( self, - field_mapping, - timestamp_column, - file_format, - file_url, - date_partition_column="", + timestamp_column: str, + file_format: str, + file_url: str, + field_mapping: Optional[Dict[str, str]] = dict(), + date_partition_column: Optional[str] = "", ): - super().__init__(field_mapping, timestamp_column, date_partition_column) + super().__init__(timestamp_column, field_mapping, date_partition_column) self._file_options = FileOptions(file_format=file_format, file_url=file_url) @property @@ -468,9 +468,13 @@ def to_proto(self) -> DataSourceProto: class BigQuerySource(DataSource): def __init__( - self, field_mapping, timestamp_column, table_ref, date_partition_column="" + self, + timestamp_column: str, + table_ref: str, + field_mapping: Optional[Dict[str, str]] = dict(), + date_partition_column: Optional[str] = "", ): - super().__init__(field_mapping, timestamp_column, date_partition_column) + super().__init__(timestamp_column, field_mapping, date_partition_column) self._bigquery_options = BigQueryOptions(table_ref=table_ref,) @property @@ -514,14 +518,14 @@ def to_proto(self) -> DataSourceProto: class KafkaSource(DataSource): def __init__( self, - field_mapping, - timestamp_column, - bootstrap_servers, - class_path, - topic, - date_partition_column="", + timestamp_column: str, + bootstrap_servers: str, + class_path: str, + topic: str, + field_mapping: Optional[Dict[str, str]] = 
dict(), + date_partition_column: Optional[str] = "", ): - super().__init__(field_mapping, timestamp_column, date_partition_column) + super().__init__(timestamp_column, field_mapping, date_partition_column) self._kafka_options = KafkaOptions( bootstrap_servers=bootstrap_servers, class_path=class_path, topic=topic ) @@ -567,14 +571,14 @@ def to_proto(self) -> DataSourceProto: class KinesisSource(DataSource): def __init__( self, - field_mapping, - timestamp_column, - class_path, - region, - stream_name, - date_partition_column="", + timestamp_column: str, + class_path: str, + region: str, + stream_name: str, + field_mapping: Optional[Dict[str, str]] = dict(), + date_partition_column: Optional[str] = "", ): - super().__init__(field_mapping, timestamp_column, date_partition_column) + super().__init__(timestamp_column, field_mapping, date_partition_column) self._kinesis_options = KinesisOptions( class_path=class_path, region=region, stream_name=stream_name ) diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index 1448791627..3312b7845c 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -262,9 +262,9 @@ def from_dict(cls, ft_dict): return cls.from_proto(feature_table_proto) @classmethod - def _to_data_source(cls, data_source): + def _get_data_source_proto(cls, data_source): """ - Convert dict to data source. + Convert data source config in FeatureTable spec to a DataSource proto. """ if data_source.file_options.file_format and data_source.file_options.file_url: @@ -346,12 +346,12 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): batch_source=( None if not feature_table_proto.spec.batch_source.ByteSize() - else cls._to_data_source(feature_table_proto.spec.batch_source) + else cls._get_data_source_proto(feature_table_proto.spec.batch_source) ), stream_source=( None if not feature_table_proto.spec.stream_source.ByteSize() - else cls._to_data_source(feature_table_proto.spec.stream_source) + else cls._get_data_source_proto(feature_table_proto.spec.stream_source) ), ) diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index f817a27cd3..8cd658c2ef 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -26,6 +26,7 @@ def _check_field_mappings( Args: column_names: Column names in provided ingestion source feature_table_name: Name of FeatureTable + feature_table_timestamp_column: Timestamp column of FeatureTable feature_table_field_mappings: Field mappings of FeatureTable """ diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index a8c9a71fd4..a5f2198416 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -350,10 +350,6 @@ def test_apply_feature_table_success(self, test_client): # Create Feature Tables batch_source = FileSource( - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, file_format="parquet", file_url="file://feast/*", timestamp_column="ts_col", @@ -361,10 +357,6 @@ def test_apply_feature_table_success(self, test_client): ) stream_source = KafkaSource( - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, bootstrap_servers="localhost:9094", class_path="random/path/to/class", topic="test_topic", @@ -457,10 +449,6 @@ def test_ingest(self, mocked_client, mocker): entities=["dev_entity"], batch_source=DataSourceProto( type="BATCH_FILE", - field_mapping={ - "dev_feature_float": "dev_feature_float", - 
"dev_feature_string": "dev_feature_string", - }, file_options=DataSourceProto.FileOptions( file_format="parquet", file_url="file://feast/*" ), diff --git a/tests/e2e/test-register.py b/tests/e2e/test-register.py index 665936eb5d..11f64fb854 100644 --- a/tests/e2e/test-register.py +++ b/tests/e2e/test-register.py @@ -98,10 +98,6 @@ def alltypes_entity(): @pytest.fixture def alltypes_featuretable(): batch_source = FileSource( - field_mapping={ - "ride_distance": "ride_distance", - "ride_duration": "ride_duration", - }, file_format="parquet", file_url="file://feast/*", timestamp_column="ts_col", From ab47c408ddf300f9bd41594b176dbd0ec39a612c Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 09:13:56 +0800 Subject: [PATCH 22/34] Cleanup the way protos are converted Signed-off-by: Terence --- sdk/python/feast/client.py | 11 ++-- sdk/python/feast/data_source.py | 67 +++++++++++++++++++++++ sdk/python/feast/feature_table.py | 91 ++++++++++++++++++------------- sdk/python/tests/test_client.py | 10 ++-- 4 files changed, 127 insertions(+), 52 deletions(-) diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 7f502afe19..8635940580 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -60,7 +60,7 @@ ListProjectsResponse, ) from feast.core.CoreService_pb2_grpc import CoreServiceStub -from feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.data_source import BigQuerySource, FileSource from feast.entity import Entity from feast.feature_table import FeatureTable from feast.grpc import auth as feast_auth @@ -649,7 +649,7 @@ def ingest( # Check 1) Only parquet file format for FeatureTable batch source is supported if ( feature_table.batch_source - and feature_table.batch_source.type == DataSourceProto.BATCH_FILE + and issubclass(type(feature_table.batch_source), FileSource) and "".join( feature_table.batch_source.file_options.file_format.split() ).lower() @@ -678,10 +678,7 @@ def ingest( ) try: - if ( - feature_table.batch_source.file_options.file_format - and feature_table.batch_source.file_options.file_url - ): + if issubclass(type(feature_table.batch_source), FileSource): from urllib.parse import urlparse file_url = feature_table.batch_source.file_options.file_url[:-1] @@ -715,7 +712,7 @@ def ingest( uri.hostname, str(uri.path).strip("/") + "/" + file_name, ) - if feature_table.batch_source.bigquery_options.table_ref: + if issubclass(type(feature_table.batch_source), BigQuerySource): from google.cloud import bigquery bq_table_ref = feature_table.batch_source.bigquery_options.table_ref diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 21014c8008..40dfc8e3b2 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -360,6 +360,19 @@ def __init__( self._field_mapping = field_mapping self._date_partition_column = date_partition_column + def __eq__(self, other): + if not isinstance(other, DataSource): + raise TypeError("Comparisons should only involve DataSource class objects.") + + if ( + self.timestamp_column != other.timestamp_column + or self.field_mapping != other.field_mapping + or self.date_partition_column != other.date_partition_column + ): + return False + + return True + @property def field_mapping(self): """ @@ -428,6 +441,18 @@ def __init__( super().__init__(timestamp_column, field_mapping, date_partition_column) self._file_options = FileOptions(file_format=file_format, file_url=file_url) + def __eq__(self, other): + if not isinstance(other, FileSource): + 
raise TypeError("Comparisons should only involve FileSource class objects.") + + if ( + self.file_options.file_url != other.file_options.file_url + or self.file_options.file_format != other.file_options.file_format + ): + return False + + return True + @property def file_options(self): """ @@ -477,6 +502,17 @@ def __init__( super().__init__(timestamp_column, field_mapping, date_partition_column) self._bigquery_options = BigQueryOptions(table_ref=table_ref,) + def __eq__(self, other): + if not isinstance(other, BigQuerySource): + raise TypeError( + "Comparisons should only involve BigQuerySource class objects." + ) + + if self.bigquery_options.table_ref != other.bigquery_options.table_ref: + return False + + return True + @property def bigquery_options(self): """ @@ -530,6 +566,22 @@ def __init__( bootstrap_servers=bootstrap_servers, class_path=class_path, topic=topic ) + def __eq__(self, other): + if not isinstance(other, KafkaSource): + raise TypeError( + "Comparisons should only involve KafkaSource class objects." + ) + + if ( + self.kafka_options.bootstrap_servers + != other.kafka_options.bootstrap_servers + or self.kafka_options.class_path != other.kafka_options.class_path + or self.kafka_options.topic != other.kafka_options.topic + ): + return False + + return True + @property def kafka_options(self): """ @@ -583,6 +635,21 @@ def __init__( class_path=class_path, region=region, stream_name=stream_name ) + def __eq__(self, other): + if not isinstance(other, KinesisSource): + raise TypeError( + "Comparisons should only involve KinesisSource class objects." + ) + + if ( + self.kinesis_options.class_path != other.kinesis_options.class_path + or self.kinesis_options.region != other.kinesis_options.region + or self.kinesis_options.stream_name != other.kinesis_options.stream_name + ): + return False + + return True + @property def kinesis_options(self): """ diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index 3312b7845c..3f10003182 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -20,7 +20,6 @@ from google.protobuf.json_format import MessageToDict, MessageToJson from google.protobuf.timestamp_pb2 import Timestamp -from feast.core.DataSource_pb2 import DataSource as DataSourceProto from feast.core.FeatureTable_pb2 import FeatureTable as FeatureTableProto from feast.core.FeatureTable_pb2 import FeatureTableMeta as FeatureTableMetaProto from feast.core.FeatureTable_pb2 import FeatureTableSpec as FeatureTableSpecProto @@ -53,19 +52,9 @@ def __init__( ): self._name = name self._entities = entities - self._features = [ - feature.to_proto() for feature in features if isinstance(feature, Feature) - ] - self._batch_source = ( - batch_source.to_proto() - if isinstance(batch_source, DataSource) - else batch_source - ) - self._stream_source = ( - stream_source.to_proto() - if isinstance(stream_source, DataSource) - else stream_source - ) + self._features = features + self._batch_source = batch_source + self._stream_source = stream_source if labels is None: self._labels = dict() # type: MutableMapping[str, str] else: @@ -110,7 +99,7 @@ def name(self): return self._name @name.setter - def name(self, name): + def name(self, name: str): """ Sets the name of this feature table """ @@ -124,7 +113,7 @@ def entities(self): return self._entities @entities.setter - def entities(self, entities): + def entities(self, entities: List[str]): """ Sets the entities of this feature table """ @@ -138,7 +127,7 @@ def features(self): return 
self._features @features.setter - def features(self, features): + def features(self, features: List[Feature]): """ Sets the features of this feature table """ @@ -152,7 +141,7 @@ def batch_source(self): return self._batch_source @batch_source.setter - def batch_source(self, batch_source: DataSourceProto): + def batch_source(self, batch_source: Union[BigQuerySource, FileSource]): """ Sets the batch source of this feature table """ @@ -166,7 +155,7 @@ def stream_source(self): return self._stream_source @stream_source.setter - def stream_source(self, stream_source: DataSourceProto): + def stream_source(self, stream_source: Union[KafkaSource, KinesisSource]): """ Sets the stream source of this feature table """ @@ -182,7 +171,7 @@ def max_age(self): return self._max_age @max_age.setter - def max_age(self, max_age): + def max_age(self, max_age: Duration): """ Set the maximum age for this feature table """ @@ -262,56 +251,56 @@ def from_dict(cls, ft_dict): return cls.from_proto(feature_table_proto) @classmethod - def _get_data_source_proto(cls, data_source): + def _get_data_source(cls, data_source): """ - Convert data source config in FeatureTable spec to a DataSource proto. + Convert data source config in FeatureTable spec to a DataSource class object. """ if data_source.file_options.file_format and data_source.file_options.file_url: - data_source_proto = FileSource( + data_source_obj = FileSource( field_mapping=data_source.field_mapping, file_format=data_source.file_options.file_format, file_url=data_source.file_options.file_url, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, - ).to_proto() + ) elif data_source.bigquery_options.table_ref: - data_source_proto = BigQuerySource( + data_source_obj = BigQuerySource( field_mapping=data_source.field_mapping, table_ref=data_source.bigquery_options.table_ref, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, - ).to_proto() + ) elif ( data_source.kafka_options.bootstrap_servers and data_source.kafka_options.topic and data_source.kafka_options.class_path ): - data_source_proto = KafkaSource( + data_source_obj = KafkaSource( field_mapping=data_source.field_mapping, bootstrap_servers=data_source.kafka_options.bootstrap_servers, class_path=data_source.kafka_options.class_path, topic=data_source.kafka_options.topic, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, - ).to_proto() + ) elif ( data_source.kinesis_options.class_path and data_source.kinesis_options.region and data_source.kinesis_options.stream_name ): - data_source_proto = KinesisSource( + data_source_obj = KinesisSource( field_mapping=data_source.field_mapping, class_path=data_source.kinesis_options.class_path, region=data_source.kinesis_options.region, stream_name=data_source.kinesis_options.stream_name, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, - ).to_proto() + ) else: raise ValueError("Could not identify the source type being added") - return data_source_proto + return data_source_obj @classmethod def from_proto(cls, feature_table_proto: FeatureTableProto): @@ -346,12 +335,12 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): batch_source=( None if not feature_table_proto.spec.batch_source.ByteSize() - else cls._get_data_source_proto(feature_table_proto.spec.batch_source) + else cls._get_data_source(feature_table_proto.spec.batch_source) ), stream_source=( 
None if not feature_table_proto.spec.stream_source.ByteSize() - else cls._get_data_source_proto(feature_table_proto.spec.stream_source) + else cls._get_data_source(feature_table_proto.spec.stream_source) ), ) @@ -375,11 +364,23 @@ def to_proto(self) -> FeatureTableProto: spec = FeatureTableSpecProto( name=self.name, entities=self.entities, - features=self.features, + features=[ + feature.to_proto() + for feature in self.features + if type(feature) == Feature + ], labels=self.labels, max_age=self.max_age, - batch_source=self.batch_source, - stream_source=self.stream_source, + batch_source=( + self.batch_source.to_proto() + if issubclass(type(self.batch_source), DataSource) + else self.batch_source + ), + stream_source=( + self.stream_source.to_proto() + if issubclass(type(self.stream_source), DataSource) + else self.stream_source + ), ) return FeatureTableProto(spec=spec, meta=meta) @@ -396,11 +397,23 @@ def to_spec_proto(self) -> FeatureTableSpecProto: spec = FeatureTableSpecProto( name=self.name, entities=self.entities, - features=self.features, + features=[ + feature.to_proto() + for feature in self.features + if type(feature) == Feature + ], labels=self.labels, max_age=self.max_age, - batch_source=self.batch_source, - stream_source=self.stream_source, + batch_source=( + self.batch_source.to_proto() + if issubclass(type(self.batch_source), DataSource) + else self.batch_source + ), + stream_source=( + self.stream_source.to_proto() + if issubclass(type(self.stream_source), DataSource) + else self.stream_source + ), ) return spec diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index a5f2198416..8daa5db5db 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -387,15 +387,13 @@ def test_apply_feature_table_success(self, test_client): len(feature_tables) == 1 and feature_tables[0].name == "my-feature-table-1" and feature_tables[0].features[0].name == "fs1-my-feature-1" - and feature_tables[0].features[0].value_type == ValueProto.ValueType.INT64 + and feature_tables[0].features[0].dtype == ValueType.INT64 and feature_tables[0].features[1].name == "fs1-my-feature-2" - and feature_tables[0].features[1].value_type == ValueProto.ValueType.STRING + and feature_tables[0].features[1].dtype == ValueType.STRING and feature_tables[0].features[2].name == "fs1-my-feature-3" - and feature_tables[0].features[2].value_type - == ValueProto.ValueType.STRING_LIST + and feature_tables[0].features[2].dtype == ValueType.STRING_LIST and feature_tables[0].features[3].name == "fs1-my-feature-4" - and feature_tables[0].features[3].value_type - == ValueProto.ValueType.BYTES_LIST + and feature_tables[0].features[3].dtype == ValueType.BYTES_LIST and feature_tables[0].entities[0] == "fs1-my-entity-1" ) From 40c3e29740cc9fac824e172225acb09e83e33b4b Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 16:06:00 +0800 Subject: [PATCH 23/34] Split ingest function and add more tests Signed-off-by: Terence --- sdk/python/feast/client.py | 168 +++------------- sdk/python/feast/loaders/ingest.py | 164 +++++++++++++++- sdk/python/tests/data/dev_featuretable.csv | 101 ++++++++++ sdk/python/tests/test_client.py | 215 ++++++++++++++++++--- 4 files changed, 474 insertions(+), 174 deletions(-) create mode 100644 sdk/python/tests/data/dev_featuretable.csv diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 8635940580..96ed8e7538 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -13,17 +13,11 @@ # limitations under the 
License. import logging import multiprocessing -import os import shutil -import tempfile -import time -from math import ceil -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Union import grpc import pandas as pd -import pyarrow as pa -from pyarrow import parquet as pq from feast.config import Config from feast.constants import ( @@ -68,11 +62,14 @@ from feast.loaders.ingest import ( BATCH_INGESTION_PRODUCTION_TIMEOUT, _check_field_mappings, - _partition_by_date, + _read_table_from_source, + _upload_to_bq_source, + _upload_to_file_source, + _write_non_partitioned_table_from_source, + _write_partitioned_table_from_source, ) from feast.serving.ServingService_pb2 import GetFeastServingInfoRequest from feast.serving.ServingService_pb2_grpc import ServingServiceStub -from feast.staging.storage_client import get_staging_client _logger = logging.getLogger(__name__) @@ -642,10 +639,6 @@ def ingest( else: raise Exception(f"FeatureTable, {name} cannot be found.") - dir_path, dest_path, column_names = _read_table_from_source( - source, chunk_size, max_workers - ) - # Check 1) Only parquet file format for FeatureTable batch source is supported if ( feature_table.batch_source @@ -660,6 +653,7 @@ def ingest( f"Only BATCH_FILE source with parquet format is supported for batch ingestion." ) + pyarrow_table, column_names = _read_table_from_source(source) # Check 2) Check if FeatureTable batch source field mappings can be found in provided source table _check_field_mappings( column_names, @@ -667,77 +661,42 @@ def ingest( feature_table.batch_source.timestamp_column, feature_table.batch_source.field_mapping, ) - # Partition dataset by date - date_partition_dest_path = None - if feature_table.batch_source.date_partition_column: - date_partition_dest_path = _partition_by_date( + + dir_path = None + to_partition = False + if feature_table.batch_source.date_partition_column and issubclass( + type(feature_table.batch_source), FileSource + ): + to_partition = True + dest_path = _write_partitioned_table_from_source( column_names, + pyarrow_table, feature_table.batch_source.date_partition_column, feature_table.batch_source.timestamp_column, - dest_path, + ) + else: + dir_path, dest_path = _write_non_partitioned_table_from_source( + column_names, pyarrow_table, chunk_size, max_workers, ) try: if issubclass(type(feature_table.batch_source), FileSource): - from urllib.parse import urlparse - file_url = feature_table.batch_source.file_options.file_url[:-1] - uri = urlparse(file_url) - staging_client = get_staging_client(uri.scheme) - - if date_partition_dest_path is not None: - file_paths = list() - for (dirpath, dirnames, filenames) in os.walk( - date_partition_dest_path - ): - file_paths += [ - os.path.join(dirpath, file) for file in filenames - ] - for path in file_paths: - file_name = path.split("/")[-1] - partition_col = path.split("/")[-2] - staging_client.upload_file( - path, - uri.hostname, - str(uri.path).strip("/") - + "/" - + partition_col - + "/" - + file_name, - ) - else: - file_name = dest_path.split("/")[-1] - staging_client.upload_file( - dest_path, - uri.hostname, - str(uri.path).strip("/") + "/" + file_name, - ) + _upload_to_file_source(file_url, to_partition, dest_path) if issubclass(type(feature_table.batch_source), BigQuerySource): - from google.cloud import bigquery - bq_table_ref = feature_table.batch_source.bigquery_options.table_ref - gcp_project, dataset_table = bq_table_ref.split(":") - - client = bigquery.Client(project=gcp_project) - - 
bq_table_ref = bq_table_ref.replace(":", ".") - table = bigquery.table.Table(bq_table_ref) - - job_config = bigquery.LoadJobConfig() - job_config.source_format = bigquery.SourceFormat.PARQUET + feature_table_timestamp_column = ( + feature_table.batch_source.timestamp_column + ) - time_partitioning_obj = bigquery.table.TimePartitioning( - field=feature_table.batch_source.timestamp_column + _upload_to_bq_source( + bq_table_ref, feature_table_timestamp_column, dest_path ) - job_config.time_partitioning = time_partitioning_obj - with open(dest_path, "rb") as source_file: - client.load_table_from_file( - source_file, table, job_config=job_config - ) finally: # Remove parquet file(s) that were created earlier print("Removing temporary file(s)...") - shutil.rmtree(dir_path) + if dir_path: + shutil.rmtree(dir_path) print("Data has been successfully ingested into FeatureTable batch source.") @@ -751,74 +710,3 @@ def _get_grpc_metadata(self): if self._config.getboolean(CONFIG_ENABLE_AUTH_KEY) and self._auth_metadata: return self._auth_metadata.get_signed_meta() return () - - -def _read_table_from_source( - source: Union[pd.DataFrame, str], chunk_size: int, max_workers: int -) -> Tuple[str, str, List[str]]: - """ - Infers a data source type (path or Pandas DataFrame) and reads it in as - a PyArrow Table. - - The PyArrow Table that is read will be written to a parquet file with row - group size determined by the minimum of: - * (table.num_rows / max_workers) - * chunk_size - - The parquet file that is created will be passed as file path to the - multiprocessing pool workers. - - Args: - source (Union[pd.DataFrame, str]): - Either a string path or Pandas DataFrame. - - chunk_size (int): - Number of worker processes to use to encode values. - - max_workers (int): - Amount of rows to load and ingest at a time. - - Returns: - Tuple[str, str, List[str]]: - Tuple containing parent directory path, destination path to - parquet file and column names of pyarrow table. 
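Taken together, the refactored path can be exercised end to end roughly as in the sketch below. This is only an illustrative sequence, not code from the patch: it assumes a running Feast core, a project and a FeatureTable named as shown that is already registered with a parquet FileSource batch source, and made-up column names.

    import pandas as pd

    from feast.client import Client

    client = Client(core_url="localhost:6565")
    client.set_project("my_project")

    # Fetch the registered table, then ingest a DataFrame into its batch source.
    driver_ft = client.get_feature_table("driver_trips")
    df = pd.DataFrame(
        {
            "datetime": pd.to_datetime(["2020-10-05T06:39:35Z"] * 3, utc=True),
            "driver_id": [1, 2, 3],
            "trips_today": [5.0, 1.0, 3.0],
        }
    )
    client.ingest(driver_ft, df)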
- """ - - # Pandas DataFrame detected - if isinstance(source, pd.DataFrame): - table = pa.Table.from_pandas(df=source) - - # Inferring a string path - elif isinstance(source, str): - file_path = source - filename, file_ext = os.path.splitext(file_path) - - if ".csv" in file_ext: - from pyarrow import csv - - table = csv.read_csv(filename) - elif ".json" in file_ext: - from pyarrow import json - - table = json.read_json(filename) - else: - table = pq.read_table(file_path) - else: - raise ValueError(f"Unknown data source provided for ingestion: {source}") - - # Ensure that PyArrow table is initialised - assert isinstance(table, pa.lib.Table) - - # Write table as parquet file with a specified row_group_size - dir_path = tempfile.mkdtemp() - tmp_table_name = f"{int(time.time())}.parquet" - dest_path = f"{dir_path}/{tmp_table_name}" - row_group_size = min(ceil(table.num_rows / max_workers), chunk_size) - pq.write_table(table=table, where=dest_path, row_group_size=row_group_size) - - column_names = table.column_names - - # Remove table from memory - del table - - return dir_path, dest_path, column_names diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 8cd658c2ef..528e47fa7c 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -1,10 +1,15 @@ +import os import tempfile -from typing import Dict, List +import time +from math import ceil +from typing import Dict, List, Tuple, Union import pandas as pd import pyarrow as pa from pyarrow import parquet as pq +from feast.staging.storage_client import get_staging_client + GRPC_CONNECTION_TIMEOUT_DEFAULT = 3 # type: int GRPC_CONNECTION_TIMEOUT_APPLY = 300 # type: int FEAST_SERVING_URL_ENV_KEY = "FEAST_SERVING_URL" # type: str @@ -48,11 +53,42 @@ def _check_field_mappings( ) -def _partition_by_date( +def _write_non_partitioned_table_from_source( + column_names: List[str], table: pa.Table, chunk_size: int, max_workers: int +) -> Tuple[str, str]: + """ + Partitions dataset by date based on timestamp_column. + Assumes date_partition_column is in date format if provided. + + Args: + column_names: Column names in provided ingestion source + table: PyArrow table of Dataset + chunk_size: Number of worker processes to use to encode values. + max_workers: Amount of rows to load and ingest at a time. + Returns: + Tuple[str,str]: + Tuple containing parent directory path, destination path to + parquet file. + """ + dir_path = tempfile.mkdtemp() + + # Write table as parquet file with a specified row_group_size + tmp_table_name = f"{int(time.time())}.parquet" + dest_path = f"{dir_path}/{tmp_table_name}" + row_group_size = min(ceil(table.num_rows / max_workers), chunk_size) + pq.write_table(table=table, where=dest_path, row_group_size=row_group_size) + + # Remove table from memory + del table + + return dir_path, dest_path + + +def _write_partitioned_table_from_source( column_names: List[str], + table: pa.Table, feature_table_date_partition_column: str, feature_table_timestamp_column: str, - file_path: str, ) -> str: """ Partitions dataset by date based on timestamp_column. 
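The row group sizing used by the non-partitioned writer above is simply min(ceil(num_rows / max_workers), chunk_size). A tiny worked example with hypothetical values (these numbers are not defaults from the patch):

    from math import ceil

    num_rows, max_workers, chunk_size = 100, 4, 10000
    row_group_size = min(ceil(num_rows / max_workers), chunk_size)
    assert row_group_size == 25  # the temporary parquet file gets four 25-row groups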
@@ -60,28 +96,138 @@ def _partition_by_date( Args: column_names: Column names in provided ingestion source - feature_table: FeatureTable - file_path: File path to existing parquet file that's not yet partitioned - + table: PyArrow table of Dataset + feature_table_date_partition_column: Date-partition column of FeatureTable + feature_table_timestamp_column: Timestamp column of FeatureTable Returns: str: Root directory which contains date partitioned files. """ - df = pd.read_parquet(file_path) - # Date-partitioned dataset temp path dir_path = tempfile.mkdtemp() # Case: date_partition_column is provided and dataset does not contain it if feature_table_date_partition_column not in column_names: + df = table.to_pandas() df[feature_table_date_partition_column] = df[ feature_table_timestamp_column ].dt.date + table = pa.Table.from_pandas(df) - table = pa.Table.from_pandas(df) pq.write_to_dataset( table=table, root_path=dir_path, partition_cols=[feature_table_date_partition_column], ) + # Remove table from memory + del table + return dir_path + + +def _read_table_from_source( + source: Union[pd.DataFrame, str] +) -> Tuple[pa.Table, List[str]]: + """ + Infers a data source type (path or Pandas DataFrame) and reads it in as + a PyArrow Table. + + Args: + source (Union[pd.DataFrame, str]): + Either a string path or Pandas DataFrame. + + Returns: + Tuple[pa.Table, List[str]]: + Tuple containing PyArrow table of dataset, and column names of PyArrow table. + """ + + # Pandas DataFrame detected + if isinstance(source, pd.DataFrame): + table = pa.Table.from_pandas(df=source) + + # Inferring a string path + elif isinstance(source, str): + file_path = source + filename, file_ext = os.path.splitext(file_path) + + if ".csv" in file_ext: + from pyarrow import csv + + table = csv.read_csv(filename) + elif ".json" in file_ext: + from pyarrow import json + + table = json.read_json(filename) + else: + table = pq.read_table(file_path) + else: + raise ValueError(f"Unknown data source provided for ingestion: {source}") + + # Ensure that PyArrow table is initialised + assert isinstance(table, pa.lib.Table) + + column_names = table.column_names + + return table, column_names + + +def _upload_to_file_source(file_url: str, to_partition: bool, dest_path: str) -> None: + """ + Uploads data into a FileSource. Currently supports GCS, S3 and Local FS. + + Args: + file_url: file url of FileSource defined for FeatureTable + """ + from urllib.parse import urlparse + + uri = urlparse(file_url) + staging_client = get_staging_client(uri.scheme) + + if to_partition: + file_paths = list() + for (dirpath, dirnames, filenames) in os.walk(dest_path): + file_paths += [os.path.join(dirpath, file) for file in filenames] + for path in file_paths: + file_name = path.split("/")[-1] + partition_col = path.split("/")[-2] + staging_client.upload_file( + path, + uri.hostname, + str(uri.path).strip("/") + "/" + partition_col + "/" + file_name, + ) + else: + file_name = dest_path.split("/")[-1] + staging_client.upload_file( + dest_path, uri.hostname, str(uri.path).strip("/") + "/" + file_name, + ) + + +def _upload_to_bq_source( + bq_table_ref: str, feature_table_timestamp_column: str, dest_path: str +) -> None: + """ + Uploads data into a BigQuerySource. 
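For a standalone feel of what the partitioned write produces, a small sketch using throwaway column names: pyarrow lays the files out hive-style under the temporary root, and the file upload helper above then mirrors each partition directory under the batch source's file_url.

    import os
    import tempfile

    import pandas as pd
    import pyarrow as pa
    from pyarrow import parquet as pq

    df = pd.DataFrame(
        {
            "datetime": pd.to_datetime(["2020-10-05", "2020-10-06"], utc=True),
            "feature": [1.0, 2.0],
        }
    )
    df["date_col"] = df["datetime"].dt.date  # derived partition column, as in the helper above
    table = pa.Table.from_pandas(df)

    root = tempfile.mkdtemp()
    pq.write_to_dataset(table=table, root_path=root, partition_cols=["date_col"])

    # Prints paths such as <root>/date_col=2020-10-05/<uuid>.parquet
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            print(os.path.join(dirpath, name))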
+ + Args: + bq_table_ref: BigQuery table reference of format "project:dataset_name.table_name" defined for FeatureTable + feature_table_timestamp_column: Timestamp column of FeatureTable + dest_path: File path to existing parquet file + """ + from google.cloud import bigquery + + gcp_project, _ = bq_table_ref.split(":") + + bq_client = bigquery.Client(project=gcp_project) + + bq_table_ref = bq_table_ref.replace(":", ".") + table = bigquery.table.Table(bq_table_ref) + + job_config = bigquery.LoadJobConfig() + job_config.source_format = bigquery.SourceFormat.PARQUET + + time_partitioning_obj = bigquery.table.TimePartitioning( + field=feature_table_timestamp_column + ) + job_config.time_partitioning = time_partitioning_obj + with open(dest_path, "rb") as source_file: + bq_client.load_table_from_file(source_file, table, job_config=job_config) diff --git a/sdk/python/tests/data/dev_featuretable.csv b/sdk/python/tests/data/dev_featuretable.csv new file mode 100644 index 0000000000..61fc785299 --- /dev/null +++ b/sdk/python/tests/data/dev_featuretable.csv @@ -0,0 +1,101 @@ +datetime,datetime_col,dev_feature_float,dev_feature_string +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,0.0,feat_0 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,1.0,feat_1 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,2.0,feat_2 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,3.0,feat_3 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,4.0,feat_4 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,5.0,feat_5 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,6.0,feat_6 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,7.0,feat_7 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,8.0,feat_8 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,9.0,feat_9 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,10.0,feat_10 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,11.0,feat_11 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,12.0,feat_12 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,13.0,feat_13 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,14.0,feat_14 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,15.0,feat_15 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,16.0,feat_16 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,17.0,feat_17 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,18.0,feat_18 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,19.0,feat_19 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,20.0,feat_20 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,21.0,feat_21 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,22.0,feat_22 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,23.0,feat_23 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,24.0,feat_24 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,25.0,feat_25 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,26.0,feat_26 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,27.0,feat_27 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,28.0,feat_28 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,29.0,feat_29 +2020-10-07 
06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,30.0,feat_30 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,31.0,feat_31 +2020-10-07 06:39:35.998951+00:00,2020-06-30 06:39:35.998951+00:00,32.0,feat_32 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,33.0,feat_33 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,34.0,feat_34 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,35.0,feat_35 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,36.0,feat_36 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,37.0,feat_37 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,38.0,feat_38 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,39.0,feat_39 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,40.0,feat_40 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,41.0,feat_41 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,42.0,feat_42 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,43.0,feat_43 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,44.0,feat_44 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,45.0,feat_45 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,46.0,feat_46 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,47.0,feat_47 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,48.0,feat_48 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,49.0,feat_49 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,50.0,feat_50 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,51.0,feat_51 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,52.0,feat_52 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,53.0,feat_53 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,54.0,feat_54 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,55.0,feat_55 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,56.0,feat_56 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,57.0,feat_57 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,58.0,feat_58 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,59.0,feat_59 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,60.0,feat_60 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,61.0,feat_61 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,62.0,feat_62 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,63.0,feat_63 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,64.0,feat_64 +2020-10-06 06:39:35.998951+00:00,2020-06-29 06:39:35.998951+00:00,65.0,feat_65 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,66.0,feat_66 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,67.0,feat_67 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,68.0,feat_68 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,69.0,feat_69 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,70.0,feat_70 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,71.0,feat_71 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,72.0,feat_72 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,73.0,feat_73 +2020-10-05 06:39:35.998951+00:00,2020-06-28 
06:39:35.998951+00:00,74.0,feat_74 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,75.0,feat_75 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,76.0,feat_76 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,77.0,feat_77 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,78.0,feat_78 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,79.0,feat_79 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,80.0,feat_80 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,81.0,feat_81 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,82.0,feat_82 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,83.0,feat_83 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,84.0,feat_84 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,85.0,feat_85 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,86.0,feat_86 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,87.0,feat_87 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,88.0,feat_88 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,89.0,feat_89 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,90.0,feat_90 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,91.0,feat_91 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,92.0,feat_92 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,93.0,feat_93 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,94.0,feat_94 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,95.0,feat_95 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,96.0,feat_96 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,97.0,feat_97 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,98.0,feat_98 +2020-10-05 06:39:35.998951+00:00,2020-06-28 06:39:35.998951+00:00,99.0,feat_99 diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index 8daa5db5db..a9f6c628e5 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -251,6 +251,43 @@ def client(self, core_server, serving_server): serving_url=f"localhost:{serving_server}", ) + @pytest.fixture + def partitioned_df(self): + # Partitioned DataFrame + N_ROWS = 100 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + final_offset = ( + [time_offset] * 33 + + [time_offset - timedelta(days=1)] * 33 + + [time_offset - timedelta(days=2)] * 34 + ) + final_part_offset = ( + [time_offset - timedelta(days=99)] * 33 + + [time_offset - timedelta(days=100)] * 33 + + [time_offset - timedelta(days=101)] * 34 + ) + return pd.DataFrame( + { + "datetime": final_offset, + "datetime_col": final_part_offset, + "dev_feature_float": [np.float(row) for row in range(N_ROWS)], + "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], + } + ) + + @pytest.fixture + def non_partitioned_df(self): + # Non-Partitioned DataFrame + N_ROWS = 100 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + return pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "dev_feature_float": [np.float(row) for row in range(N_ROWS)], + "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], + } + ) + @pytest.mark.parametrize( "mocked_client", [lazy_fixture("mock_client"), lazy_fixture("secure_mock_client")], @@ -400,30 +437,147 @@ def test_apply_feature_table_success(self, 
test_client): @pytest.mark.parametrize( "mocked_client", [lazy_fixture("mock_client")], ) - def test_ingest(self, mocked_client, mocker): + def test_ingest_dataframe_partition(self, mocked_client, mocker, partitioned_df): + """ + Test ingestion with local FileSource, using DataFrame. + Partition column stated but not provided in Dataset. + """ mocked_client._core_service_stub = Core.CoreServiceStub( grpc.insecure_channel("") ) - N_ROWS = 100 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - final_offset = ( - [time_offset] * 33 - + [time_offset - timedelta(days=1)] * 33 - + [time_offset - timedelta(days=2)] * 34 + mocker.patch.object( + mocked_client._core_service_stub, + "GetFeatureTable", + return_value=GetFeatureTableResponse( + table=FeatureTableProto( + spec=FeatureTableSpecProto( + name="ingest_featuretable", + max_age=Duration(seconds=3600), + features=[ + FeatureSpecProto( + name="dev_feature_float", + value_type=ValueProto.ValueType.FLOAT, + ), + FeatureSpecProto( + name="dev_feature_string", + value_type=ValueProto.ValueType.STRING, + ), + ], + entities=["dev_entity"], + batch_source=DataSourceProto( + type="BATCH_FILE", + file_options=DataSourceProto.FileOptions( + file_format="parquet", file_url="file://feast/*" + ), + timestamp_column="datetime", + date_partition_column="datetime_col", + ), + ), + meta=FeatureTableMetaProto(), + ) + ), ) - final_part_offset = ( - [time_offset - timedelta(days=99)] * 33 - + [time_offset - timedelta(days=100)] * 33 - + [time_offset - timedelta(days=101)] * 34 + + mocked_client.set_project("my_project") + ft = mocked_client.get_feature_table("ingest_featuretable") + mocked_client.ingest(ft, partitioned_df, timeout=600) + + dest_fpath = os.path.join("feast/") + pq_df = pq.read_table(dest_fpath).to_pandas() + + # Format Dataframes before comparing them + partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) + partitioned_df = partitioned_df.reindex(sorted(partitioned_df.columns), axis=1) + partitioned_df.reset_index(drop=True, inplace=True) + pq_df.reset_index(drop=True, inplace=True) + pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") + + assert_frame_equal(partitioned_df, pq_df) + + @pytest.mark.parametrize( + "mocked_client", [lazy_fixture("mock_client")], + ) + def test_ingest_dataframe_no_partition( + self, mocked_client, mocker, non_partitioned_df + ): + """ + Test ingestion with local FileSource, using DataFrame. + Partition column not stated. 
+ """ + mocked_client._core_service_stub = Core.CoreServiceStub( + grpc.insecure_channel("") ) - ft_df = pd.DataFrame( - { - "datetime": final_offset, - "datetime_col": final_part_offset, - "dev_feature_float": [np.float(row) for row in range(N_ROWS)], - "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], - } + + mocker.patch.object( + mocked_client._core_service_stub, + "GetFeatureTable", + return_value=GetFeatureTableResponse( + table=FeatureTableProto( + spec=FeatureTableSpecProto( + name="ingest_featuretable", + max_age=Duration(seconds=3600), + features=[ + FeatureSpecProto( + name="dev_feature_float", + value_type=ValueProto.ValueType.FLOAT, + ), + FeatureSpecProto( + name="dev_feature_string", + value_type=ValueProto.ValueType.STRING, + ), + ], + entities=["dev_entity"], + batch_source=DataSourceProto( + type="BATCH_FILE", + file_options=DataSourceProto.FileOptions( + file_format="parquet", file_url="file://feast2/*" + ), + timestamp_column="datetime", + ), + ), + meta=FeatureTableMetaProto(), + ) + ), + ) + + mocked_client.set_project("my_project") + ft = mocked_client.get_feature_table("ingest_featuretable") + mocked_client.ingest(ft, non_partitioned_df, timeout=600) + + # Since not partitioning, we're only looking for single file + dest_fpath = os.path.join("feast2/") + single_file = [ + f + for f in os.listdir(dest_fpath) + if os.path.isfile(os.path.join(dest_fpath, f)) + ][0] + pq_df = pq.read_table(dest_fpath + single_file).to_pandas() + + # Format Dataframes before comparing them + non_partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) + non_partitioned_df = non_partitioned_df.reindex( + sorted(non_partitioned_df.columns), axis=1 + ) + non_partitioned_df.reset_index(drop=True, inplace=True) + pq_df.reset_index(drop=True, inplace=True) + + assert_frame_equal(non_partitioned_df, pq_df) + + @pytest.mark.parametrize( + "mocked_client", [lazy_fixture("mock_client")], + ) + def test_ingest_csv(self, mocked_client, mocker): + """ + Test ingestion with local FileSource, using CSV file. + Partition column is provided. 
+ """ + mocked_client._core_service_stub = Core.CoreServiceStub( + grpc.insecure_channel("") ) mocker.patch.object( @@ -448,7 +602,7 @@ def test_ingest(self, mocked_client, mocker): batch_source=DataSourceProto( type="BATCH_FILE", file_options=DataSourceProto.FileOptions( - file_format="parquet", file_url="file://feast/*" + file_format="parquet", file_url="file://feast3/*" ), timestamp_column="datetime", date_partition_column="datetime_col", @@ -459,22 +613,33 @@ def test_ingest(self, mocked_client, mocker): ), ) + partitioned_df = pd.read_csv( + os.path.join( + os.path.dirname(os.path.realpath(__file__)), + "./data/dev_featuretable.csv", + ) + ) + mocked_client.set_project("my_project") ft = mocked_client.get_feature_table("ingest_featuretable") - mocked_client.ingest(ft, ft_df, timeout=600) + mocked_client.ingest(ft, partitioned_df, timeout=600) - dest_fpath = os.path.join("feast/") + dest_fpath = os.path.join("feast3/") pq_df = pq.read_table(dest_fpath).to_pandas() - ft_df.sort_values(by=["dev_feature_float"], inplace=True) + # Format Dataframes before comparing them + partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) pq_df.sort_values(by=["dev_feature_float"], inplace=True) pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) - ft_df = ft_df.reindex(sorted(ft_df.columns), axis=1) - ft_df.reset_index(drop=True, inplace=True) + partitioned_df = partitioned_df.reindex(sorted(partitioned_df.columns), axis=1) + partitioned_df.reset_index(drop=True, inplace=True) pq_df.reset_index(drop=True, inplace=True) + partitioned_df["datetime_col"] = pd.to_datetime( + partitioned_df.datetime_col + ).dt.tz_convert("UTC") pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") - assert_frame_equal(ft_df, pq_df) + assert_frame_equal(partitioned_df, pq_df) @patch("grpc.channel_ready_future") def test_secure_channel_creation_with_secure_client( From c9506c3c4364d16969f6a119a5fc32fd8078e27a Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 16:07:28 +0800 Subject: [PATCH 24/34] Cleanup FeatureTable and DataSource Signed-off-by: Terence --- sdk/python/feast/__init__.py | 2 - sdk/python/feast/data_source.py | 44 ++++++++++++++++++++ sdk/python/feast/feature_table.py | 69 +++---------------------------- 3 files changed, 50 insertions(+), 65 deletions(-) diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index adf2aaf181..5ac3658d18 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -3,7 +3,6 @@ from .client import Client from .data_source import ( BigQuerySource, - DataSource, FileSource, KafkaSource, KinesisSource, @@ -23,7 +22,6 @@ __all__ = [ "Client", "Entity", - "DataSource", "BigQuerySource", "FileSource", "KafkaSource", diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 40dfc8e3b2..bf6a0c0157 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -686,3 +686,47 @@ def to_proto(self) -> DataSourceProto: data_source_proto.date_partition_column = self.date_partition_column return data_source_proto + + +def _get_data_source(data_source): + """ + Convert data source config in FeatureTable spec to a DataSource class object. 
+ """ + + if issubclass(type(data_source), FileSource): + data_source_obj = FileSource( + field_mapping=data_source.field_mapping, + file_format=data_source.file_options.file_format, + file_url=data_source.file_options.file_url, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif issubclass(type(data_source), BigQuerySource): + data_source_obj = BigQuerySource( + field_mapping=data_source.field_mapping, + table_ref=data_source.bigquery_options.table_ref, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif issubclass(type(data_source), KafkaSource): + data_source_obj = KafkaSource( + field_mapping=data_source.field_mapping, + bootstrap_servers=data_source.kafka_options.bootstrap_servers, + class_path=data_source.kafka_options.class_path, + topic=data_source.kafka_options.topic, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif issubclass(type(data_source), KinesisSource): + data_source_obj = KinesisSource( + field_mapping=data_source.field_mapping, + class_path=data_source.kinesis_options.class_path, + region=data_source.kinesis_options.region, + stream_name=data_source.kinesis_options.stream_name, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + else: + raise ValueError("Could not identify the source type being added") + + return data_source_obj diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index 3f10003182..692894e077 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -29,6 +29,7 @@ FileSource, KafkaSource, KinesisSource, + _get_data_source, ) from feast.feature import Feature from feast.loaders import yaml as feast_yaml @@ -80,7 +81,7 @@ def __eq__(self, other): ): return False - if sorted(self.entities) != sorted(other.entities): + if self.entities != other.entities: return False if self.features != other.features: return False @@ -250,58 +251,6 @@ def from_dict(cls, ft_dict): return cls.from_proto(feature_table_proto) - @classmethod - def _get_data_source(cls, data_source): - """ - Convert data source config in FeatureTable spec to a DataSource class object. 
- """ - - if data_source.file_options.file_format and data_source.file_options.file_url: - data_source_obj = FileSource( - field_mapping=data_source.field_mapping, - file_format=data_source.file_options.file_format, - file_url=data_source.file_options.file_url, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif data_source.bigquery_options.table_ref: - data_source_obj = BigQuerySource( - field_mapping=data_source.field_mapping, - table_ref=data_source.bigquery_options.table_ref, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif ( - data_source.kafka_options.bootstrap_servers - and data_source.kafka_options.topic - and data_source.kafka_options.class_path - ): - data_source_obj = KafkaSource( - field_mapping=data_source.field_mapping, - bootstrap_servers=data_source.kafka_options.bootstrap_servers, - class_path=data_source.kafka_options.class_path, - topic=data_source.kafka_options.topic, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif ( - data_source.kinesis_options.class_path - and data_source.kinesis_options.region - and data_source.kinesis_options.stream_name - ): - data_source_obj = KinesisSource( - field_mapping=data_source.field_mapping, - class_path=data_source.kinesis_options.class_path, - region=data_source.kinesis_options.region, - stream_name=data_source.kinesis_options.stream_name, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - else: - raise ValueError("Could not identify the source type being added") - - return data_source_obj - @classmethod def from_proto(cls, feature_table_proto: FeatureTableProto): """ @@ -332,15 +281,11 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): and feature_table_proto.spec.max_age.nanos == 0 else feature_table_proto.spec.max_age ), - batch_source=( - None - if not feature_table_proto.spec.batch_source.ByteSize() - else cls._get_data_source(feature_table_proto.spec.batch_source) - ), + batch_source=_get_data_source(feature_table_proto.spec.batch_source), stream_source=( None if not feature_table_proto.spec.stream_source.ByteSize() - else cls._get_data_source(feature_table_proto.spec.stream_source) + else _get_data_source(feature_table_proto.spec.stream_source) ), ) @@ -365,9 +310,8 @@ def to_proto(self) -> FeatureTableProto: name=self.name, entities=self.entities, features=[ - feature.to_proto() + feature.to_proto() if type(feature) == Feature else feature for feature in self.features - if type(feature) == Feature ], labels=self.labels, max_age=self.max_age, @@ -398,9 +342,8 @@ def to_spec_proto(self) -> FeatureTableSpecProto: name=self.name, entities=self.entities, features=[ - feature.to_proto() + feature.to_proto() if type(feature) == Feature else feature for feature in self.features - if type(feature) == Feature ], labels=self.labels, max_age=self.max_age, From f1fa274a664aadea5cf438395cd4446db90b5267 Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 16:20:49 +0800 Subject: [PATCH 25/34] Some fixes Signed-off-by: Terence --- sdk/python/feast/data_source.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index bf6a0c0157..346a5b883f 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -693,7 +693,7 @@ def 
_get_data_source(data_source): Convert data source config in FeatureTable spec to a DataSource class object. """ - if issubclass(type(data_source), FileSource): + if data_source.file_options.file_format and data_source.file_options.file_url: data_source_obj = FileSource( field_mapping=data_source.field_mapping, file_format=data_source.file_options.file_format, @@ -701,14 +701,18 @@ def _get_data_source(data_source): timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, ) - elif issubclass(type(data_source), BigQuerySource): + elif data_source.bigquery_options.table_ref: data_source_obj = BigQuerySource( field_mapping=data_source.field_mapping, table_ref=data_source.bigquery_options.table_ref, timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, ) - elif issubclass(type(data_source), KafkaSource): + elif ( + data_source.kafka_options.bootstrap_servers + and data_source.kafka_options.topic + and data_source.kafka_options.class_path + ): data_source_obj = KafkaSource( field_mapping=data_source.field_mapping, bootstrap_servers=data_source.kafka_options.bootstrap_servers, @@ -717,7 +721,11 @@ def _get_data_source(data_source): timestamp_column=data_source.timestamp_column, date_partition_column=data_source.date_partition_column, ) - elif issubclass(type(data_source), KinesisSource): + elif ( + data_source.kinesis_options.class_path + and data_source.kinesis_options.region + and data_source.kinesis_options.stream_name + ): data_source_obj = KinesisSource( field_mapping=data_source.field_mapping, class_path=data_source.kinesis_options.class_path, From d43bb44e53d934f3fac0fce2bbd9610e96157f20 Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 17:48:22 +0800 Subject: [PATCH 26/34] Add BQ source test Signed-off-by: Terence --- tests/e2e/test-register.py | 56 +++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/tests/e2e/test-register.py b/tests/e2e/test-register.py index 11f64fb854..99ecd1b339 100644 --- a/tests/e2e/test-register.py +++ b/tests/e2e/test-register.py @@ -1,11 +1,15 @@ import os import uuid +from datetime import datetime +import numpy as np +import pandas as pd import pytest +import pytz from google.protobuf.duration_pb2 import Duration from feast.client import Client -from feast.data_source import FileSource, KafkaSource +from feast.data_source import BigQuerySource, FileSource, KafkaSource from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -13,6 +17,7 @@ DIR_PATH = os.path.dirname(os.path.realpath(__file__)) PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] +SUFFIX = str(int(datetime.now().timestamp())) @pytest.fixture(scope="module") @@ -85,6 +90,36 @@ def basic_featuretable(): ) +@pytest.fixture +def bq_dataset(): + N_ROWS = 100 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + return pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "dev_feature_float": [np.float(row) for row in range(N_ROWS)], + "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], + } + ) + + +@pytest.fixture +def bq_featuretable(): + batch_source = BigQuerySource( + table_ref=f"kf-feast:feaste2e.table{SUFFIX}", timestamp_column="datetime", + ) + return FeatureTable( + name="basic_featuretable", + entities=["driver_id", "customer_id"], + features=[ + Feature(name="dev_feature_float", dtype=ValueType.FLOAT), + Feature(name="dev_feature_string", 
dtype=ValueType.STRING), + ], + max_age=Duration(seconds=3600), + batch_source=batch_source, + ) + + @pytest.fixture def alltypes_entity(): return Entity( @@ -194,3 +229,22 @@ def test_get_list_alltypes( ft for ft in client.list_feature_tables() if ft.name == "alltypes" ][0] assert actual_list_feature_table == alltypes_featuretable + + +def test_ingest( + client: Client, bq_featuretable: FeatureTable, bq_dataset: pd.DataFrame +): + # ApplyFeatureTable + client.apply_feature_table(bq_featuretable) + client.ingest(bq_featuretable, bq_dataset, timeout=600) + + from google.cloud import bigquery + + bq_client = bigquery.Client(project="kf-feast") + query_string = f"SELECT COUNT(*) FROM `kf-feast.feaste2e.table{SUFFIX}`" + + job = bq_client.query(query_string) + results = job.result() + + for row in results: + assert row[0] == 100 From 33bd9dba56b29498d99ff9791770e03b723e27cb Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 18:26:57 +0800 Subject: [PATCH 27/34] Revert "Add BQ source test" This reverts commit d567937eaf80190cde59128c19af4644c810e7d9. Signed-off-by: Terence --- tests/e2e/test-register.py | 56 +------------------------------------- 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/tests/e2e/test-register.py b/tests/e2e/test-register.py index 99ecd1b339..11f64fb854 100644 --- a/tests/e2e/test-register.py +++ b/tests/e2e/test-register.py @@ -1,15 +1,11 @@ import os import uuid -from datetime import datetime -import numpy as np -import pandas as pd import pytest -import pytz from google.protobuf.duration_pb2 import Duration from feast.client import Client -from feast.data_source import BigQuerySource, FileSource, KafkaSource +from feast.data_source import FileSource, KafkaSource from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -17,7 +13,6 @@ DIR_PATH = os.path.dirname(os.path.realpath(__file__)) PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] -SUFFIX = str(int(datetime.now().timestamp())) @pytest.fixture(scope="module") @@ -90,36 +85,6 @@ def basic_featuretable(): ) -@pytest.fixture -def bq_dataset(): - N_ROWS = 100 - time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) - return pd.DataFrame( - { - "datetime": [time_offset] * N_ROWS, - "dev_feature_float": [np.float(row) for row in range(N_ROWS)], - "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], - } - ) - - -@pytest.fixture -def bq_featuretable(): - batch_source = BigQuerySource( - table_ref=f"kf-feast:feaste2e.table{SUFFIX}", timestamp_column="datetime", - ) - return FeatureTable( - name="basic_featuretable", - entities=["driver_id", "customer_id"], - features=[ - Feature(name="dev_feature_float", dtype=ValueType.FLOAT), - Feature(name="dev_feature_string", dtype=ValueType.STRING), - ], - max_age=Duration(seconds=3600), - batch_source=batch_source, - ) - - @pytest.fixture def alltypes_entity(): return Entity( @@ -229,22 +194,3 @@ def test_get_list_alltypes( ft for ft in client.list_feature_tables() if ft.name == "alltypes" ][0] assert actual_list_feature_table == alltypes_featuretable - - -def test_ingest( - client: Client, bq_featuretable: FeatureTable, bq_dataset: pd.DataFrame -): - # ApplyFeatureTable - client.apply_feature_table(bq_featuretable) - client.ingest(bq_featuretable, bq_dataset, timeout=600) - - from google.cloud import bigquery - - bq_client = bigquery.Client(project="kf-feast") - query_string = f"SELECT COUNT(*) FROM `kf-feast.feaste2e.table{SUFFIX}`" - - job = bq_client.query(query_string) - 
results = job.result() - - for row in results: - assert row[0] == 100 From 99821c5dedb3f691bf3ad665cf0fd27c1cc89b97 Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 22:14:48 +0800 Subject: [PATCH 28/34] Add BQ source test Signed-off-by: Terence --- tests/e2e/test-register.py | 77 +++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/tests/e2e/test-register.py b/tests/e2e/test-register.py index 11f64fb854..3581ae8891 100644 --- a/tests/e2e/test-register.py +++ b/tests/e2e/test-register.py @@ -1,11 +1,17 @@ import os +import time import uuid +from datetime import datetime +import numpy as np +import pandas as pd import pytest +import pytz from google.protobuf.duration_pb2 import Duration +from pandas.testing import assert_frame_equal from feast.client import Client -from feast.data_source import FileSource, KafkaSource +from feast.data_source import BigQuerySource, FileSource, KafkaSource from feast.entity import Entity from feast.feature import Feature from feast.feature_table import FeatureTable @@ -13,6 +19,7 @@ DIR_PATH = os.path.dirname(os.path.realpath(__file__)) PROJECT_NAME = "basic_" + uuid.uuid4().hex.upper()[0:6] +SUFFIX = str(int(datetime.now().timestamp())) @pytest.fixture(scope="module") @@ -27,6 +34,11 @@ def client(pytestconfig): return client +@pytest.fixture +def bq_table_id(): + return f"kf-feast:feaste2e.table{SUFFIX}" + + @pytest.fixture def customer_entity(): return Entity( @@ -85,6 +97,34 @@ def basic_featuretable(): ) +@pytest.fixture +def bq_dataset(): + N_ROWS = 100 + time_offset = datetime.utcnow().replace(tzinfo=pytz.utc) + return pd.DataFrame( + { + "datetime": [time_offset] * N_ROWS, + "dev_feature_float": [np.float(row) for row in range(N_ROWS)], + "dev_feature_string": ["feat_" + str(row) for row in range(N_ROWS)], + } + ) + + +@pytest.fixture +def bq_featuretable(bq_table_id): + batch_source = BigQuerySource(table_ref=bq_table_id, timestamp_column="datetime",) + return FeatureTable( + name="basic_featuretable", + entities=["driver_id", "customer_id"], + features=[ + Feature(name="dev_feature_float", dtype=ValueType.FLOAT), + Feature(name="dev_feature_string", dtype=ValueType.STRING), + ], + max_age=Duration(seconds=3600), + batch_source=batch_source, + ) + + @pytest.fixture def alltypes_entity(): return Entity( @@ -194,3 +234,38 @@ def test_get_list_alltypes( ft for ft in client.list_feature_tables() if ft.name == "alltypes" ][0] assert actual_list_feature_table == alltypes_featuretable + + +def test_ingest( + client: Client, + customer_entity: Entity, + driver_entity: Entity, + bq_featuretable: FeatureTable, + bq_dataset: pd.DataFrame, + bq_table_id: str, +): + gcp_project, _ = bq_table_id.split(":") + bq_table_id = bq_table_id.replace(":", ".") + + # ApplyEntity + client.apply_entity(customer_entity) + client.apply_entity(driver_entity) + + # ApplyFeatureTable + client.apply_feature_table(bq_featuretable) + client.ingest(bq_featuretable, bq_dataset, timeout=120) + + # Give time to allow data to propagate to BQ table + time.sleep(15) + + from google.cloud import bigquery + + bq_client = bigquery.Client(project=gcp_project) + query_string = f"SELECT * FROM `{bq_table_id}`" + + job = bq_client.query(query_string) + query_df = job.to_dataframe() + + assert_frame_equal(query_df, bq_dataset) + + bq_client.delete_table(bq_table_id, not_found_ok=True) From 5367a480201acf3ae87d4072330fc2f27210cb30 Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 22:32:47 +0800 Subject: [PATCH 29/34] Update perms 
for bq test Signed-off-by: Terence --- .prow/config.yaml | 4 ++-- infra/scripts/test-end-to-end-redis-cluster.sh | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.prow/config.yaml b/.prow/config.yaml index 39c275603d..4155adad91 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -147,7 +147,7 @@ presubmits: spec: containers: - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end.sh"] + command: ["infra/scripts/test-end-to-end.sh", "True"] resources: requests: cpu: "6" @@ -199,7 +199,7 @@ presubmits: spec: containers: - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end-redis-cluster.sh"] + command: ["infra/scripts/test-end-to-end-redis-cluster.sh", "True"] resources: requests: cpu: "6" diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index 0e5aa5879a..ffb4c21aca 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -2,6 +2,8 @@ set -e set -o pipefail +[[ $1 == "True" ]] && ENABLE_AUTH="true" || ENABLE_AUTH="false" +echo "Authenication enabled : ${ENABLE_AUTH}" test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/gcloud/service-account.json" test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" From d8ade3765835c1cc86aaf9ea9d648b861e0f47ad Mon Sep 17 00:00:00 2001 From: Terence Date: Wed, 7 Oct 2020 23:16:59 +0800 Subject: [PATCH 30/34] Revert "Update perms for bq test" This reverts commit 7e74e9069f97af9c0e108aba8f4bd1197ba5c3ed. Signed-off-by: Terence --- .prow/config.yaml | 40 ++++--------------- .../scripts/test-end-to-end-redis-cluster.sh | 5 +-- 2 files changed, 9 insertions(+), 36 deletions(-) diff --git a/.prow/config.yaml b/.prow/config.yaml index 4155adad91..381fcccea8 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -147,23 +147,11 @@ presubmits: spec: containers: - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end.sh", "True"] + command: ["infra/scripts/test-end-to-end.sh"] resources: requests: cpu: "6" memory: "6144Mi" - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /etc/gcloud/service-account.json - volumeMounts: - - mountPath: /etc/gcloud/service-account.json - name: service-account - readOnly: true - subPath: service-account.json - volumes: - - name: service-account - secret: - secretName: feast-service-account skip_branches: - ^v0\.(3|4)-branch$ @@ -198,26 +186,14 @@ presubmits: always_run: true spec: containers: - - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end-redis-cluster.sh", "True"] - resources: - requests: - cpu: "6" - memory: "6144Mi" - env: - - name: GOOGLE_APPLICATION_CREDENTIALS - value: /etc/gcloud/service-account.json - volumeMounts: - - mountPath: /etc/gcloud/service-account.json - name: service-account - readOnly: true - subPath: service-account.json - volumes: - - name: service-account - secret: - secretName: feast-service-account + - image: maven:3.6-jdk-11 + command: ["infra/scripts/test-end-to-end-redis-cluster.sh"] + resources: + requests: + cpu: "6" + memory: "6144Mi" skip_branches: - - ^v0\.(3|4)-branch$ + - ^v0\.(3|4)-branch$ - name: test-end-to-end-java-8 decorate: true diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index ffb4c21aca..12e7dc8b45 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -2,10 +2,8 @@ set -e set -o pipefail -[[ $1 == "True" ]] && 
ENABLE_AUTH="true" || ENABLE_AUTH="false" -echo "Authenication enabled : ${ENABLE_AUTH}" -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/gcloud/service-account.json" +test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account/service-account.json" test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" test -z ${GOOGLE_CLOUD_PROJECT} && GOOGLE_CLOUD_PROJECT="kf-feast" test -z ${TEMP_BUCKET} && TEMP_BUCKET="feast-templocation-kf-feast" @@ -32,7 +30,6 @@ This script will run end-to-end tests for Feast Core and Online Serving. source ${SCRIPTS_DIR}/setup-common-functions.sh install_test_tools -install_gcloud_sdk install_and_start_local_redis_cluster install_and_start_local_postgres install_and_start_local_zookeeper_and_kafka From 20cd4d93b538af4ddd003468c3487879d9e3afed Mon Sep 17 00:00:00 2001 From: Terence Date: Thu, 8 Oct 2020 09:39:57 +0800 Subject: [PATCH 31/34] Cleanup datasource Signed-off-by: Terence --- sdk/python/feast/data_source.py | 109 +++++++++++++++--------------- sdk/python/feast/feature_table.py | 5 +- 2 files changed, 55 insertions(+), 59 deletions(-) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index 346a5b883f..04f4752c37 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -415,12 +415,57 @@ def date_partition_column(self, date_partition_column): """ self._date_partition_column = date_partition_column - @classmethod - def from_proto(cls, data_source_proto: DataSourceProto): + @staticmethod + def from_proto(data_source): """ - Creates a DataSource from a protobuf representation of a data source + Convert data source config in FeatureTable spec to a DataSource class object. """ - raise NotImplementedError + + if data_source.file_options.file_format and data_source.file_options.file_url: + data_source_obj = FileSource( + field_mapping=data_source.field_mapping, + file_format=data_source.file_options.file_format, + file_url=data_source.file_options.file_url, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif data_source.bigquery_options.table_ref: + data_source_obj = BigQuerySource( + field_mapping=data_source.field_mapping, + table_ref=data_source.bigquery_options.table_ref, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif ( + data_source.kafka_options.bootstrap_servers + and data_source.kafka_options.topic + and data_source.kafka_options.class_path + ): + data_source_obj = KafkaSource( + field_mapping=data_source.field_mapping, + bootstrap_servers=data_source.kafka_options.bootstrap_servers, + class_path=data_source.kafka_options.class_path, + topic=data_source.kafka_options.topic, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + elif ( + data_source.kinesis_options.class_path + and data_source.kinesis_options.region + and data_source.kinesis_options.stream_name + ): + data_source_obj = KinesisSource( + field_mapping=data_source.field_mapping, + class_path=data_source.kinesis_options.class_path, + region=data_source.kinesis_options.region, + stream_name=data_source.kinesis_options.stream_name, + timestamp_column=data_source.timestamp_column, + date_partition_column=data_source.date_partition_column, + ) + else: + raise ValueError("Could not identify the source type being added") + + return data_source_obj def to_proto(self) -> 
DataSourceProto: """ @@ -467,6 +512,7 @@ def file_options(self, file_options): """ self._file_options = file_options + @classmethod def from_proto(cls, data_source_proto): data_source = cls( @@ -527,6 +573,7 @@ def bigquery_options(self, bigquery_options): """ self._bigquery_options = bigquery_options + @classmethod def from_proto(cls, data_source_proto): data_source = cls( @@ -596,6 +643,7 @@ def kafka_options(self, kafka_options): """ self._kafka_options = kafka_options + @classmethod def from_proto(cls, data_source_proto): data_source = cls( @@ -664,6 +712,7 @@ def kinesis_options(self, kinesis_options): """ self._kinesis_options = kinesis_options + @classmethod def from_proto(cls, data_source_proto): data_source = cls( @@ -686,55 +735,3 @@ def to_proto(self) -> DataSourceProto: data_source_proto.date_partition_column = self.date_partition_column return data_source_proto - - -def _get_data_source(data_source): - """ - Convert data source config in FeatureTable spec to a DataSource class object. - """ - - if data_source.file_options.file_format and data_source.file_options.file_url: - data_source_obj = FileSource( - field_mapping=data_source.field_mapping, - file_format=data_source.file_options.file_format, - file_url=data_source.file_options.file_url, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif data_source.bigquery_options.table_ref: - data_source_obj = BigQuerySource( - field_mapping=data_source.field_mapping, - table_ref=data_source.bigquery_options.table_ref, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif ( - data_source.kafka_options.bootstrap_servers - and data_source.kafka_options.topic - and data_source.kafka_options.class_path - ): - data_source_obj = KafkaSource( - field_mapping=data_source.field_mapping, - bootstrap_servers=data_source.kafka_options.bootstrap_servers, - class_path=data_source.kafka_options.class_path, - topic=data_source.kafka_options.topic, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - elif ( - data_source.kinesis_options.class_path - and data_source.kinesis_options.region - and data_source.kinesis_options.stream_name - ): - data_source_obj = KinesisSource( - field_mapping=data_source.field_mapping, - class_path=data_source.kinesis_options.class_path, - region=data_source.kinesis_options.region, - stream_name=data_source.kinesis_options.stream_name, - timestamp_column=data_source.timestamp_column, - date_partition_column=data_source.date_partition_column, - ) - else: - raise ValueError("Could not identify the source type being added") - - return data_source_obj diff --git a/sdk/python/feast/feature_table.py b/sdk/python/feast/feature_table.py index 692894e077..b1401ec97a 100644 --- a/sdk/python/feast/feature_table.py +++ b/sdk/python/feast/feature_table.py @@ -29,7 +29,6 @@ FileSource, KafkaSource, KinesisSource, - _get_data_source, ) from feast.feature import Feature from feast.loaders import yaml as feast_yaml @@ -281,11 +280,11 @@ def from_proto(cls, feature_table_proto: FeatureTableProto): and feature_table_proto.spec.max_age.nanos == 0 else feature_table_proto.spec.max_age ), - batch_source=_get_data_source(feature_table_proto.spec.batch_source), + batch_source=DataSource.from_proto(feature_table_proto.spec.batch_source), stream_source=( None if not feature_table_proto.spec.stream_source.ByteSize() - else 
_get_data_source(feature_table_proto.spec.stream_source) + else DataSource.from_proto(feature_table_proto.spec.stream_source) ), ) From 404cc364c3ce8b302bafa3bf9c65c4fa6a3a2d11 Mon Sep 17 00:00:00 2001 From: Terence Date: Thu, 8 Oct 2020 10:23:27 +0800 Subject: [PATCH 32/34] Renaming and optimize file search loop Signed-off-by: Terence --- sdk/python/feast/client.py | 11 ++++++----- sdk/python/feast/loaders/ingest.py | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/sdk/python/feast/client.py b/sdk/python/feast/client.py index 96ed8e7538..0b418e303d 100644 --- a/sdk/python/feast/client.py +++ b/sdk/python/feast/client.py @@ -663,11 +663,12 @@ def ingest( ) dir_path = None - to_partition = False - if feature_table.batch_source.date_partition_column and issubclass( - type(feature_table.batch_source), FileSource + with_partitions = False + if ( + issubclass(type(feature_table.batch_source), FileSource) + and feature_table.batch_source.date_partition_column ): - to_partition = True + with_partitions = True dest_path = _write_partitioned_table_from_source( column_names, pyarrow_table, @@ -682,7 +683,7 @@ def ingest( try: if issubclass(type(feature_table.batch_source), FileSource): file_url = feature_table.batch_source.file_options.file_url[:-1] - _upload_to_file_source(file_url, to_partition, dest_path) + _upload_to_file_source(file_url, with_partitions, dest_path) if issubclass(type(feature_table.batch_source), BigQuerySource): bq_table_ref = feature_table.batch_source.bigquery_options.table_ref feature_table_timestamp_column = ( diff --git a/sdk/python/feast/loaders/ingest.py b/sdk/python/feast/loaders/ingest.py index 528e47fa7c..581958f5c0 100644 --- a/sdk/python/feast/loaders/ingest.py +++ b/sdk/python/feast/loaders/ingest.py @@ -1,3 +1,4 @@ +import glob import os import tempfile import time @@ -171,7 +172,9 @@ def _read_table_from_source( return table, column_names -def _upload_to_file_source(file_url: str, to_partition: bool, dest_path: str) -> None: +def _upload_to_file_source( + file_url: str, with_partitions: bool, dest_path: str +) -> None: """ Uploads data into a FileSource. Currently supports GCS, S3 and Local FS. 
@@ -183,11 +186,8 @@ def _upload_to_file_source(file_url: str, to_partition: bool, dest_path: str) -> uri = urlparse(file_url) staging_client = get_staging_client(uri.scheme) - if to_partition: - file_paths = list() - for (dirpath, dirnames, filenames) in os.walk(dest_path): - file_paths += [os.path.join(dirpath, file) for file in filenames] - for path in file_paths: + if with_partitions: + for path in glob.glob(os.path.join(dest_path, "**/*")): file_name = path.split("/")[-1] partition_col = path.split("/")[-2] staging_client.upload_file( From 6990f16ef0f535ec9dd128369aa98704deb72aa4 Mon Sep 17 00:00:00 2001 From: Terence Date: Thu, 8 Oct 2020 11:02:00 +0800 Subject: [PATCH 33/34] Remove duplicated code in pytest Signed-off-by: Terence --- sdk/python/tests/test_client.py | 185 +++++++++++++------------------- 1 file changed, 77 insertions(+), 108 deletions(-) diff --git a/sdk/python/tests/test_client.py b/sdk/python/tests/test_client.py index a9f6c628e5..05e598ec34 100644 --- a/sdk/python/tests/test_client.py +++ b/sdk/python/tests/test_client.py @@ -16,6 +16,7 @@ import socket from concurrent import futures from datetime import datetime, timedelta +from typing import Tuple from unittest import mock import grpc @@ -449,33 +450,8 @@ def test_ingest_dataframe_partition(self, mocked_client, mocker, partitioned_df) mocker.patch.object( mocked_client._core_service_stub, "GetFeatureTable", - return_value=GetFeatureTableResponse( - table=FeatureTableProto( - spec=FeatureTableSpecProto( - name="ingest_featuretable", - max_age=Duration(seconds=3600), - features=[ - FeatureSpecProto( - name="dev_feature_float", - value_type=ValueProto.ValueType.FLOAT, - ), - FeatureSpecProto( - name="dev_feature_string", - value_type=ValueProto.ValueType.STRING, - ), - ], - entities=["dev_entity"], - batch_source=DataSourceProto( - type="BATCH_FILE", - file_options=DataSourceProto.FileOptions( - file_format="parquet", file_url="file://feast/*" - ), - timestamp_column="datetime", - date_partition_column="datetime_col", - ), - ), - meta=FeatureTableMetaProto(), - ) + return_value=_ingest_test_getfeaturetable_mocked_resp( + "file://feast/*", "datetime_col" ), ) @@ -486,14 +462,9 @@ def test_ingest_dataframe_partition(self, mocked_client, mocker, partitioned_df) dest_fpath = os.path.join("feast/") pq_df = pq.read_table(dest_fpath).to_pandas() - # Format Dataframes before comparing them - partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) - partitioned_df = partitioned_df.reindex(sorted(partitioned_df.columns), axis=1) - partitioned_df.reset_index(drop=True, inplace=True) - pq_df.reset_index(drop=True, inplace=True) - pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") + partitioned_df, pq_df = _ingest_test_format_dataframes( + partitioned_df, pq_df, True + ) assert_frame_equal(partitioned_df, pq_df) @@ -514,33 +485,7 @@ def test_ingest_dataframe_no_partition( mocker.patch.object( mocked_client._core_service_stub, "GetFeatureTable", - return_value=GetFeatureTableResponse( - table=FeatureTableProto( - spec=FeatureTableSpecProto( - name="ingest_featuretable", - max_age=Duration(seconds=3600), - features=[ - FeatureSpecProto( - name="dev_feature_float", - value_type=ValueProto.ValueType.FLOAT, - ), - FeatureSpecProto( - name="dev_feature_string", - value_type=ValueProto.ValueType.STRING, - ), - ], - entities=["dev_entity"], - batch_source=DataSourceProto( - 
type="BATCH_FILE", - file_options=DataSourceProto.FileOptions( - file_format="parquet", file_url="file://feast2/*" - ), - timestamp_column="datetime", - ), - ), - meta=FeatureTableMetaProto(), - ) - ), + return_value=_ingest_test_getfeaturetable_mocked_resp("file://feast2/*"), ) mocked_client.set_project("my_project") @@ -556,15 +501,9 @@ def test_ingest_dataframe_no_partition( ][0] pq_df = pq.read_table(dest_fpath + single_file).to_pandas() - # Format Dataframes before comparing them - non_partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) - non_partitioned_df = non_partitioned_df.reindex( - sorted(non_partitioned_df.columns), axis=1 + non_partitioned_df, pq_df = _ingest_test_format_dataframes( + non_partitioned_df, pq_df ) - non_partitioned_df.reset_index(drop=True, inplace=True) - pq_df.reset_index(drop=True, inplace=True) assert_frame_equal(non_partitioned_df, pq_df) @@ -583,33 +522,8 @@ def test_ingest_csv(self, mocked_client, mocker): mocker.patch.object( mocked_client._core_service_stub, "GetFeatureTable", - return_value=GetFeatureTableResponse( - table=FeatureTableProto( - spec=FeatureTableSpecProto( - name="ingest_featuretable", - max_age=Duration(seconds=3600), - features=[ - FeatureSpecProto( - name="dev_feature_float", - value_type=ValueProto.ValueType.FLOAT, - ), - FeatureSpecProto( - name="dev_feature_string", - value_type=ValueProto.ValueType.STRING, - ), - ], - entities=["dev_entity"], - batch_source=DataSourceProto( - type="BATCH_FILE", - file_options=DataSourceProto.FileOptions( - file_format="parquet", file_url="file://feast3/*" - ), - timestamp_column="datetime", - date_partition_column="datetime_col", - ), - ), - meta=FeatureTableMetaProto(), - ) + return_value=_ingest_test_getfeaturetable_mocked_resp( + "file://feast3/*", "datetime_col" ), ) @@ -627,17 +541,9 @@ def test_ingest_csv(self, mocked_client, mocker): dest_fpath = os.path.join("feast3/") pq_df = pq.read_table(dest_fpath).to_pandas() - # Format Dataframes before comparing them - partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df.sort_values(by=["dev_feature_float"], inplace=True) - pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) - partitioned_df = partitioned_df.reindex(sorted(partitioned_df.columns), axis=1) - partitioned_df.reset_index(drop=True, inplace=True) - pq_df.reset_index(drop=True, inplace=True) - partitioned_df["datetime_col"] = pd.to_datetime( - partitioned_df.datetime_col - ).dt.tz_convert("UTC") - pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") + partitioned_df, pq_df = _ingest_test_format_dataframes( + partitioned_df, pq_df, True + ) assert_frame_equal(partitioned_df, pq_df) @@ -710,3 +616,66 @@ def test_no_auth_sent_when_auth_disabled( ): client = Client(core_url=f"localhost:{insecure_core_server_that_blocks_auth}") client.list_feature_tables() + + +def _ingest_test_getfeaturetable_mocked_resp( + file_url: str, date_partition_col: str = None +): + return GetFeatureTableResponse( + table=FeatureTableProto( + spec=FeatureTableSpecProto( + name="ingest_featuretable", + max_age=Duration(seconds=3600), + features=[ + FeatureSpecProto( + name="dev_feature_float", value_type=ValueProto.ValueType.FLOAT, + ), + FeatureSpecProto( + name="dev_feature_string", + value_type=ValueProto.ValueType.STRING, + ), + ], + entities=["dev_entity"], + batch_source=DataSourceProto( + file_options=DataSourceProto.FileOptions( + 
file_format="parquet", file_url=file_url + ), + timestamp_column="datetime", + date_partition_column=date_partition_col + if date_partition_col is not None + else None, + ), + ), + meta=FeatureTableMetaProto(), + ) + ) + + +def _ingest_test_format_dataframes( + partitioned_df: pd.DataFrame, pq_df: pd.DataFrame, with_partitions: bool = False +) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Format Dataframes before comparing them through assertion. + + Args: + partitioned_df: DataFrame from pytest fixture + pq_df: DataFrame from parquet files + with_partitions: Flag to indicate if data has been partitioned + + Returns: + Formatted DataFrames for comparison + """ + partitioned_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df.sort_values(by=["dev_feature_float"], inplace=True) + pq_df = pq_df.reindex(sorted(pq_df.columns), axis=1) + partitioned_df = partitioned_df.reindex(sorted(partitioned_df.columns), axis=1) + partitioned_df.reset_index(drop=True, inplace=True) + pq_df.reset_index(drop=True, inplace=True) + + if with_partitions: + partitioned_df["datetime_col"] = pd.to_datetime( + partitioned_df.datetime_col + ).dt.tz_convert("UTC") + pq_df["datetime_col"] = pd.to_datetime(pq_df.datetime_col).dt.tz_convert("UTC") + + return partitioned_df, pq_df From b6ee24bded0f0a055f3129485ec77d87d26dd9ce Mon Sep 17 00:00:00 2001 From: Terence Date: Thu, 8 Oct 2020 11:10:45 +0800 Subject: [PATCH 34/34] Fix prow config Signed-off-by: Terence --- .prow/config.yaml | 38 +++++++++++++++---- .../scripts/test-end-to-end-redis-cluster.sh | 3 +- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/.prow/config.yaml b/.prow/config.yaml index 381fcccea8..39c275603d 100644 --- a/.prow/config.yaml +++ b/.prow/config.yaml @@ -152,6 +152,18 @@ presubmits: requests: cpu: "6" memory: "6144Mi" + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /etc/gcloud/service-account.json + volumeMounts: + - mountPath: /etc/gcloud/service-account.json + name: service-account + readOnly: true + subPath: service-account.json + volumes: + - name: service-account + secret: + secretName: feast-service-account skip_branches: - ^v0\.(3|4)-branch$ @@ -186,14 +198,26 @@ presubmits: always_run: true spec: containers: - - image: maven:3.6-jdk-11 - command: ["infra/scripts/test-end-to-end-redis-cluster.sh"] - resources: - requests: - cpu: "6" - memory: "6144Mi" + - image: maven:3.6-jdk-11 + command: ["infra/scripts/test-end-to-end-redis-cluster.sh"] + resources: + requests: + cpu: "6" + memory: "6144Mi" + env: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /etc/gcloud/service-account.json + volumeMounts: + - mountPath: /etc/gcloud/service-account.json + name: service-account + readOnly: true + subPath: service-account.json + volumes: + - name: service-account + secret: + secretName: feast-service-account skip_branches: - - ^v0\.(3|4)-branch$ + - ^v0\.(3|4)-branch$ - name: test-end-to-end-java-8 decorate: true diff --git a/infra/scripts/test-end-to-end-redis-cluster.sh b/infra/scripts/test-end-to-end-redis-cluster.sh index 12e7dc8b45..0e5aa5879a 100755 --- a/infra/scripts/test-end-to-end-redis-cluster.sh +++ b/infra/scripts/test-end-to-end-redis-cluster.sh @@ -3,7 +3,7 @@ set -e set -o pipefail -test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/service-account/service-account.json" +test -z ${GOOGLE_APPLICATION_CREDENTIALS} && GOOGLE_APPLICATION_CREDENTIALS="/etc/gcloud/service-account.json" test -z ${SKIP_BUILD_JARS} && SKIP_BUILD_JARS="false" test -z ${GOOGLE_CLOUD_PROJECT} && 
GOOGLE_CLOUD_PROJECT="kf-feast" test -z ${TEMP_BUCKET} && TEMP_BUCKET="feast-templocation-kf-feast" @@ -30,6 +30,7 @@ This script will run end-to-end tests for Feast Core and Online Serving. source ${SCRIPTS_DIR}/setup-common-functions.sh install_test_tools +install_gcloud_sdk install_and_start_local_redis_cluster install_and_start_local_postgres install_and_start_local_zookeeper_and_kafka
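A minimal usage sketch of the DataSource.from_proto dispatch introduced in the datasource cleanup above, assuming the Feast Python SDK built from this patch series is installed. The file URL and the empty field mapping are illustrative values only, not taken from the patches; the feature table tests in this series use the same "parquet" format and "datetime"/"datetime_col" columns.

    from feast.data_source import DataSource, FileSource

    # Build a batch FileSource, mirroring the options exercised in the tests.
    batch_source = FileSource(
        field_mapping={},
        file_format="parquet",
        file_url="file:///tmp/driver_features/*",  # illustrative path
        timestamp_column="datetime",
        date_partition_column="datetime_col",
    )

    # DataSource.from_proto inspects which *_options fields are populated on the
    # proto and returns the matching concrete source class, so a
    # to_proto()/from_proto() round trip should hand back a FileSource with the
    # same timestamp and partition columns.
    restored = DataSource.from_proto(batch_source.to_proto())
    assert isinstance(restored, FileSource)
    assert restored.timestamp_column == "datetime"
    assert restored.date_partition_column == "datetime_col"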