In Historical Retrieval from BQ join between source & entities is performed inside BQ #1110

Merged
Changes from all commits
36 commits
9477574
create bq view with join query
pyalex Oct 29, 2020
5e1f389
import batch_source fixture
pyalex Oct 29, 2020
7d71f33
bq dataset fixture
pyalex Oct 29, 2020
6ad2f00
bq dataset fixture
pyalex Oct 29, 2020
2e6e463
use bq dataset fixture through request
pyalex Oct 29, 2020
15761b0
move bq package to standalone launcher
pyalex Oct 29, 2020
dd97707
revert extra options
pyalex Oct 29, 2020
416906b
e2e fail fast
pyalex Oct 29, 2020
727d3d6
fix dataset id
pyalex Oct 29, 2020
6905b4f
fix bq source
pyalex Oct 29, 2020
1b4ffd7
serving to wait core is running
pyalex Oct 29, 2020
3ef5615
delete contents
pyalex Oct 29, 2020
de1833e
fix staging functions
pyalex Oct 29, 2020
983e862
add bq jar
pyalex Oct 29, 2020
3d70460
add created_timestamp
pyalex Oct 29, 2020
5b68446
test ingestion from bq view
pyalex Oct 29, 2020
23e0aed
make test-integration executable
pyalex Oct 29, 2020
3f66292
add tests timeout
pyalex Oct 29, 2020
23c8aaa
fix bq staging test
pyalex Oct 29, 2020
401ad07
fix test online e2e
pyalex Oct 29, 2020
0ed9a7e
format
pyalex Oct 29, 2020
0ccdf8f
wait until bq table created
pyalex Oct 29, 2020
eb26370
fix verify
pyalex Oct 29, 2020
c79844a
add explicit data sources to historical retrieval via JS
pyalex Oct 29, 2020
5fea06a
debug
pyalex Oct 29, 2020
b0e9eec
correct dataframe ingested
pyalex Oct 29, 2020
59a6512
debug
pyalex Oct 29, 2020
0134758
boundaries from original df
pyalex Oct 30, 2020
a488770
cleanup & online retrieval timeout
pyalex Oct 30, 2020
2627aab
bq replacement moved to launcher
pyalex Nov 2, 2020
43881e9
add passing it test to pass checks
pyalex Nov 2, 2020
cafc468
add timeout to expected arguments
pyalex Nov 2, 2020
52914ff
rerun client fixture on auth switch
pyalex Nov 2, 2020
2d39760
enable_auth tombstone
pyalex Nov 2, 2020
76472fe
update default timeout
pyalex Nov 2, 2020
ef3b8aa
check entities mapping
pyalex Nov 2, 2020
5 changes: 3 additions & 2 deletions infra/scripts/test-end-to-end-gcp.sh
@@ -10,7 +10,8 @@ python -m pip install --upgrade pip setuptools wheel
make install-python
python -m pip install -qr tests/requirements.txt

su -p postgres -c "PATH=$PATH HOME=/tmp pytest tests/e2e/ \
su -p postgres -c "PATH=$PATH HOME=/tmp pytest -v tests/e2e/ \
--feast-version develop --env=gcloud --dataproc-cluster-name feast-e2e \
--dataproc-project kf-feast --dataproc-region us-central1 \
--redis-url 10.128.0.105:6379 --redis-cluster --kafka-brokers 10.128.0.103:9094"
--redis-url 10.128.0.105:6379 --redis-cluster --kafka-brokers 10.128.0.103:9094 \
--bq-project kf-feast"
2 changes: 1 addition & 1 deletion infra/scripts/test-end-to-end.sh
@@ -7,4 +7,4 @@ python -m pip install --upgrade pip setuptools wheel
make install-python
python -m pip install -qr tests/requirements.txt

su -p postgres -c "PATH=$PATH HOME=/tmp pytest tests/e2e/ --feast-version develop"
su -p postgres -c "PATH=$PATH HOME=/tmp pytest -v tests/e2e/ --feast-version develop"
Empty file modified infra/scripts/test-integration.sh
100644 → 100755
Empty file.
3 changes: 3 additions & 0 deletions protos/feast/core/JobService.proto
@@ -128,6 +128,9 @@ message GetHistoricalFeaturesRequest {
// Export to AWS S3 - s3://path/to/features
// Export to GCP GCS - gs://path/to/features
string output_location = 4;

// Specify format name for output, e.g. parquet
string output_format = 5;
}

message GetHistoricalFeaturesResponse {
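For context, the SDK now populates this field alongside output_location when building the request. A minimal sketch using the generated Python stubs — the feature reference, project, and bucket are placeholders, and entity_source is omitted (a real request also carries a serialized DataSource):

```python
from feast.core.JobService_pb2 import GetHistoricalFeaturesRequest

# Illustrative request showing the new output_format field (field 5).
request = GetHistoricalFeaturesRequest(
    feature_refs=["driver_statistics:avg_daily_trips"],  # placeholder feature ref
    project="default",
    output_location="gs://example-bucket/historical/output",  # placeholder path
    output_format="parquet",
)
```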
55 changes: 30 additions & 25 deletions sdk/python/feast/client.py
@@ -15,12 +15,10 @@
import multiprocessing
import os
import shutil
import tempfile
import uuid
from datetime import datetime
from itertools import groupby
from typing import Any, Dict, List, Optional, Union
from urllib.parse import urlparse

import grpc
import pandas as pd
@@ -101,7 +99,11 @@
GetOnlineFeaturesRequestV2,
)
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
from feast.staging.storage_client import get_staging_client
from feast.staging.entities import (
stage_entities_to_bq,
stage_entities_to_fs,
table_reference_from_string,
)

_logger = logging.getLogger(__name__)

@@ -855,6 +857,7 @@ def get_online_features(
entity_rows=_infer_online_entity_rows(entity_rows),
project=project if project is not None else self.project,
),
timeout=self._config.getint(CONFIG_GRPC_CONNECTION_TIMEOUT_DEFAULT_KEY),
metadata=self._get_grpc_metadata(),
)
except grpc.RpcError as e:
@@ -879,8 +882,11 @@ def get_historical_features(
"feature_table:feature" where "feature_table" & "feature" refer to
the feature and feature table names respectively.
entity_source (Union[pd.DataFrame, FileSource, BigQuerySource]): Source for the entity rows.
If entity_source is a Panda DataFrame, the dataframe will be exported to the staging
location as parquet file. It is also assumed that the column event_timestamp is present
If entity_source is a Pandas DataFrame, the dataframe will be staged
so that it is accessible by Spark workers.
If one of the feature tables' sources is in BigQuery, the entities will be uploaded to BQ.
Otherwise they are uploaded to remote file storage (derived from the configured staging location).
It is also assumed that the column event_timestamp is present
in the dataframe, and is of type datetime without timezone information.

The user needs to make sure that the source (or staging location, if entity_source is
@@ -916,25 +922,27 @@ str(uuid.uuid4()),
str(uuid.uuid4()),
)
output_format = self._config.get(CONFIG_SPARK_HISTORICAL_FEATURE_OUTPUT_FORMAT)
feature_sources = [
feature_table.batch_source for feature_table in feature_tables
]

if isinstance(entity_source, pd.DataFrame):
staging_location = self._config.get(CONFIG_SPARK_STAGING_LOCATION)
entity_staging_uri = urlparse(
os.path.join(staging_location, str(uuid.uuid4()))
)
staging_client = get_staging_client(entity_staging_uri.scheme)
with tempfile.NamedTemporaryFile() as df_export_path:
entity_source.to_parquet(df_export_path.name)
bucket = (
None
if entity_staging_uri.scheme == "file"
else entity_staging_uri.netloc
if any(isinstance(source, BigQuerySource) for source in feature_sources):
first_bq_source = [
source
for source in feature_sources
if isinstance(source, BigQuerySource)
][0]
source_ref = table_reference_from_string(
first_bq_source.bigquery_options.table_ref
)
staging_client.upload_file(
df_export_path.name, bucket, entity_staging_uri.path.lstrip("/")
entity_source = stage_entities_to_bq(
entity_source, source_ref.project, source_ref.dataset_id
)
entity_source = FileSource(
"event_timestamp", ParquetFormat(), entity_staging_uri.geturl(),
else:
entity_source = stage_entities_to_fs(
entity_source,
staging_location=self._config.get(CONFIG_SPARK_STAGING_LOCATION),
)

if self._use_job_service:
@@ -943,6 +951,7 @@ def get_historical_features(
feature_refs=feature_refs,
entity_source=entity_source.to_proto(),
project=project,
output_format=output_format,
output_location=output_location,
),
**self._extra_grpc_params(),
@@ -955,11 +964,7 @@
)
else:
return start_historical_feature_retrieval_job(
self,
entity_source,
feature_tables,
output_format,
os.path.join(output_location, str(uuid.uuid4())),
self, entity_source, feature_tables, output_format, output_location,
)

def get_historical_features_df(
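To see the new staging behaviour from the caller's side, here is a hedged sketch of a historical retrieval with a DataFrame entity source; the endpoints, feature reference, and entity column are illustrative placeholders rather than values from this PR:

```python
import pandas as pd
from feast import Client

# Placeholder endpoints; adjust to your deployment.
client = Client(core_url="localhost:6565", serving_url="localhost:6566")

entity_df = pd.DataFrame(
    {
        "driver_id": [1001, 1002],
        # event_timestamp must be a timezone-naive datetime, per the docstring above.
        "event_timestamp": [pd.Timestamp("2020-10-01"), pd.Timestamp("2020-10-02")],
    }
)

# If any referenced feature table has a BigQuery batch source, the dataframe is
# staged into BigQuery; otherwise it is written to the configured staging location.
job = client.get_historical_features(
    feature_refs=["driver_statistics:avg_daily_trips"],
    entity_source=entity_df,
)
output_uri = job.get_output_file_uri(block=True)  # wait for the Spark job to finish
```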
15 changes: 10 additions & 5 deletions sdk/python/feast/job_service.py
@@ -7,6 +7,7 @@
from feast.core import JobService_pb2_grpc
from feast.core.JobService_pb2 import (
CancelJobResponse,
GetHistoricalFeaturesRequest,
GetHistoricalFeaturesResponse,
GetJobResponse,
)
@@ -20,6 +21,7 @@
SparkJobStatus,
StreamIngestionJob,
)
from feast.pyspark.launcher import start_historical_feature_retrieval_job
from feast.third_party.grpc.health.v1 import HealthService_pb2_grpc
from feast.third_party.grpc.health.v1.HealthService_pb2 import (
HealthCheckResponse,
@@ -64,13 +66,16 @@ def StartOfflineToOnlineIngestionJob(self, request, context):
context.set_details("Method not implemented!")
raise NotImplementedError("Method not implemented!")

def GetHistoricalFeatures(self, request, context):
def GetHistoricalFeatures(self, request: GetHistoricalFeaturesRequest, context):
"""Produce a training dataset, return a job id that will provide a file reference"""
job = self.client.get_historical_features(
request.feature_refs,
job = start_historical_feature_retrieval_job(
client=self.client,
entity_source=DataSource.from_proto(request.entity_source),
project=request.project,
output_location=request.output_location,
feature_tables=self.client._get_feature_tables_from_feature_refs(
list(request.feature_refs), request.project
),
output_format=request.output_format,
output_path=request.output_location,
)

output_file_uri = job.get_output_file_uri(block=False)
30 changes: 18 additions & 12 deletions sdk/python/feast/pyspark/historical_feature_retrieval_job.py
@@ -149,25 +149,31 @@ def spark_format(self) -> str:
def spark_path(self) -> str:
return f"{self.project}:{self.dataset}.{self.table}"

@property
def spark_read_options(self) -> Dict[str, str]:
return {**super().spark_read_options, "viewsEnabled": "true"}


def _source_from_dict(dct: Dict) -> Source:
if "file" in dct.keys():
return FileSource(
FileSource.PROTO_FORMAT_TO_SPARK[dct["file"]["format"]["json_class"]],
dct["file"]["path"],
dct["file"]["event_timestamp_column"],
dct["file"].get("created_timestamp_column"),
dct["file"].get("field_mapping"),
dct["file"].get("options"),
format=FileSource.PROTO_FORMAT_TO_SPARK[
dct["file"]["format"]["json_class"]
],
path=dct["file"]["path"],
event_timestamp_column=dct["file"]["event_timestamp_column"],
created_timestamp_column=dct["file"].get("created_timestamp_column"),
field_mapping=dct["file"].get("field_mapping"),
options=dct["file"].get("options"),
)
else:
return BigQuerySource(
dct["bq"]["project"],
dct["bq"]["dataset"],
dct["bq"]["table"],
dct["bq"].get("field_mapping", {}),
dct["bq"]["event_timestamp_column"],
dct["bq"].get("created_timestamp_column"),
project=dct["bq"]["project"],
dataset=dct["bq"]["dataset"],
table=dct["bq"]["table"],
field_mapping=dct["bq"].get("field_mapping", {}),
event_timestamp_column=dct["bq"]["event_timestamp_column"],
created_timestamp_column=dct["bq"].get("created_timestamp_column"),
)


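As a rough illustration of the dictionary shape _source_from_dict now consumes via keyword arguments — assuming "ParquetFormat" is a valid key of FileSource.PROTO_FORMAT_TO_SPARK, and with all values below as placeholders:

```python
# Illustrative inputs for _source_from_dict above (values are placeholders).
bq_source_dict = {
    "bq": {
        "project": "example-project",
        "dataset": "feast_dataset",
        "table": "driver_statistics",
        "event_timestamp_column": "event_timestamp",
        "created_timestamp_column": "created_timestamp",
    }
}

file_source_dict = {
    "file": {
        # "ParquetFormat" is assumed to be a PROTO_FORMAT_TO_SPARK key.
        "format": {"json_class": "ParquetFormat"},
        "path": "gs://example-bucket/entities/",
        "event_timestamp_column": "event_timestamp",
    }
}
```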
48 changes: 43 additions & 5 deletions sdk/python/feast/pyspark/launcher.py
@@ -35,6 +35,7 @@
StreamIngestionJob,
StreamIngestionJobParameters,
)
from feast.staging.entities import create_bq_view_of_joined_features_and_entities
from feast.staging.storage_client import get_staging_client
from feast.value_type import ValueType

@@ -106,7 +107,11 @@ def _source_to_argument(source: DataSource):
return {"file": properties}

if isinstance(source, BigQuerySource):
properties["table_ref"] = source.bigquery_options.table_ref
project, dataset_and_table = source.bigquery_options.table_ref.split(":")
dataset, table = dataset_and_table.split(".")
properties["project"] = project
properties["dataset"] = dataset
properties["table"] = table
return {"bq": properties}

if isinstance(source, KafkaSource):
@@ -171,13 +176,17 @@ def start_historical_feature_retrieval_job(
output_path: str,
) -> RetrievalJob:
launcher = resolve_launcher(client._config)
feature_sources = [
_source_to_argument(
replace_bq_table_with_joined_view(feature_table, entity_source)
)
for feature_table in feature_tables
]

return launcher.historical_feature_retrieval(
RetrievalJobParameters(
entity_source=_source_to_argument(entity_source),
feature_tables_sources=[
_source_to_argument(feature_table.batch_source)
for feature_table in feature_tables
],
feature_tables_sources=feature_sources,
feature_tables=[
_feature_table_to_argument(client, feature_table)
for feature_table in feature_tables
@@ -188,6 +197,35 @@
)


def replace_bq_table_with_joined_view(
feature_table: FeatureTable, entity_source: Union[FileSource, BigQuerySource],
) -> Union[FileSource, BigQuerySource]:
"""
Applies an optimization to historical retrieval. Instead of pulling all data from the batch source,
this optimization joins feature values & entities on the data warehouse side (improving data locality).
Several conditions must be met to enable this optimization:
* entities are staged to BigQuery
* feature values are in BigQuery
* entity columns are not mapped (ToDo: fix this limitation)
:return: replacement for the feature source
"""
if not isinstance(feature_table.batch_source, BigQuerySource):
return feature_table.batch_source

if not isinstance(entity_source, BigQuerySource):
return feature_table.batch_source

if any(
entity in feature_table.batch_source.field_mapping
for entity in feature_table.entities
):
return feature_table.batch_source

return create_bq_view_of_joined_features_and_entities(
feature_table.batch_source, entity_source, feature_table.entities,
)


def _download_jar(remote_jar: str) -> str:
remote_jar_parts = urlparse(remote_jar)

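A quick sketch of the new table_ref handling in _source_to_argument: a reference of the form "project:dataset.table" is decomposed before being handed to the Spark job. The reference below is made up, and the shared properties set earlier in the function (timestamp columns, field mapping) are omitted:

```python
# Sketch of the table_ref decomposition performed in _source_to_argument.
table_ref = "example-project:example_dataset.driver_statistics"
project, dataset_and_table = table_ref.split(":")
dataset, table = dataset_and_table.split(".")

bq_argument = {"bq": {"project": project, "dataset": dataset, "table": table}}
assert bq_argument == {
    "bq": {
        "project": "example-project",
        "dataset": "example_dataset",
        "table": "driver_statistics",
    }
}
```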
5 changes: 4 additions & 1 deletion sdk/python/feast/pyspark/launchers/gcloud/dataproc.py
@@ -104,6 +104,8 @@ class DataprocClusterLauncher(JobLauncher):
addition to the Feast SDK.
"""

EXTERNAL_JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]

def __init__(
self, cluster_name: str, staging_location: str, region: str, project_id: str,
):
@@ -157,7 +159,7 @@ def dataproc_submit(self, job_params: SparkJobParameters) -> Operation:
job_config.update(
{
"spark_job": {
"jar_file_uris": [main_file_uri],
"jar_file_uris": [main_file_uri] + self.EXTERNAL_JARS,
"main_class": job_params.get_class_name(),
"args": job_params.get_arguments(),
}
@@ -168,6 +170,7 @@
{
"pyspark_job": {
"main_python_file_uri": main_file_uri,
"jar_file_uris": self.EXTERNAL_JARS,
"args": job_params.get_arguments(),
}
}
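For reference, a submitted PySpark job config now carries the connector jar next to the main file. A rough sketch with placeholder URIs — real configs contain more fields than shown:

```python
# Illustrative fragment of the Dataproc job config built in dataproc_submit.
EXTERNAL_JARS = ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"]

job_config = {
    "pyspark_job": {
        "main_python_file_uri": "gs://example-bucket/artifacts/historical_retrieval.py",
        "jar_file_uris": EXTERNAL_JARS,
        "args": ["--job-id", "example-job"],  # placeholder arguments
    }
}
```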
19 changes: 19 additions & 0 deletions sdk/python/feast/pyspark/launchers/standalone/local.py
@@ -148,6 +148,8 @@ class StandaloneClusterLauncher(JobLauncher):
Submits jobs to a standalone Spark cluster in client mode.
"""

BQ_CONNECTOR_VERSION = "2.12:0.17.3"

def __init__(self, master_url: str, spark_home: str = None):
"""
This launcher executes the spark-submit script in a subprocess. The subprocess
@@ -184,6 +186,23 @@ def spark_submit(
if ui_port:
submission_cmd.extend(["--conf", f"spark.ui.port={ui_port}"])

# Workaround for https://github.com/apache/spark/pull/26552
# Fixes running Spark jobs with the (shaded) BigQuery connector on JDK 9+
submission_cmd.extend(
[
"--conf",
"spark.executor.extraJavaOptions="
"-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
"--conf",
"spark.driver.extraJavaOptions="
"-Dcom.google.cloud.spark.bigquery.repackaged.io.netty.tryReflectionSetAccessible=true -Duser.timezone=GMT",
"--conf",
"spark.sql.session.timeZone=UTC", # ignore local timezone
"--packages",
f"com.google.cloud.spark:spark-bigquery-with-dependencies_{self.BQ_CONNECTOR_VERSION}",
]
)

if job_params.get_extra_options():
submission_cmd.extend(job_params.get_extra_options().split(" "))

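The --packages coordinate assembled above resolves to the shaded BigQuery connector built for Scala 2.12, version 0.17.3. A tiny sketch of the string that ends up on the spark-submit command line:

```python
# The Maven coordinate passed via --packages in spark_submit above.
BQ_CONNECTOR_VERSION = "2.12:0.17.3"
package = f"com.google.cloud.spark:spark-bigquery-with-dependencies_{BQ_CONNECTOR_VERSION}"
assert package == "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.17.3"
```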