From f359ee30538643b480633b2ad07fc262c2c53f2b Mon Sep 17 00:00:00 2001 From: David Miller Date: Thu, 27 Jan 2022 14:33:21 -0800 Subject: [PATCH 01/19] Add backticks to left_table_query_string (#2250) Signed-off-by: david Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/offline_stores/bigquery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index 42a1a83907..f34a997718 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -558,7 +558,7 @@ def _get_bigquery_client(project: Optional[str] = None, location: Optional[str] ,CAST({{entity_df_event_timestamp_col}} AS STRING) AS {{featureview.name}}__entity_row_unique_id {% endif %} {% endfor %} - FROM {{ left_table_query_string }} + FROM `{{ left_table_query_string }}` ), {% for featureview in featureviews %} From 49c5536667ada69e776a0b695fc35c14aefacc10 Mon Sep 17 00:00:00 2001 From: Oleksii Moskalenko Date: Fri, 28 Jan 2022 09:25:54 +0200 Subject: [PATCH 02/19] Delete entity key from Redis only when all attached feature views are gone (#2240) * Delete entity from redis when the last attached feature view is deleted Signed-off-by: pyalex * Delete entity key from Redis only when all attached feature views are gone Signed-off-by: pyalex * make lint happy Signed-off-by: pyalex * make lint happy Signed-off-by: pyalex * one more try with mypy Signed-off-by: pyalex Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/online_stores/redis.py | 24 ++++-- .../online_store/test_universal_online.py | 74 +++++++++++++++++++ 2 files changed, 90 insertions(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/redis.py b/sdk/python/feast/infra/online_stores/redis.py index 493c6ab462..752ed7d009 100644 --- a/sdk/python/feast/infra/online_stores/redis.py +++ b/sdk/python/feast/infra/online_stores/redis.py @@ -72,11 +72,11 @@ class RedisOnlineStoreConfig(FeastConfigBaseModel): class RedisOnlineStore(OnlineStore): _client: Optional[Union[Redis, RedisCluster]] = None - def delete_table_values(self, config: RepoConfig, table: FeatureView): + def delete_entity_values(self, config: RepoConfig, join_keys: List[str]): client = self._get_client(config.online_store) deleted_count = 0 pipeline = client.pipeline() - prefix = _redis_key_prefix(table.entities) + prefix = _redis_key_prefix(join_keys) for _k in client.scan_iter( b"".join([prefix, b"*", config.project.encode("utf8")]) @@ -85,7 +85,7 @@ def delete_table_values(self, config: RepoConfig, table: FeatureView): deleted_count += 1 pipeline.execute() - logger.debug(f"Deleted {deleted_count} keys for {table.name}") + logger.debug(f"Deleted {deleted_count} rows for entity {', '.join(join_keys)}") @log_exceptions_and_usage(online_store="redis") def update( @@ -98,10 +98,16 @@ def update( partial: bool, ): """ - We delete the keys in redis for tables/views being removed. + Look for join_keys (list of entities) that are not in use anymore + (usually this happens when the last feature view that was using specific compound key is deleted) + and remove all features attached to this "join_keys". 
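To make the cleanup rule above concrete, a small illustrative sketch (hypothetical feature views, not part of this patch) of the set difference the method computes:

```python
# Two feature views are keyed on ("driver",) and one on ("zone",). Deleting one driver
# view must not wipe driver keys while another driver view still uses them.
join_keys_to_keep = {("driver",)}               # driver_hourly_stats stays registered
join_keys_to_delete = {("driver",), ("zone",)}  # a second driver view and the zone view go away

assert join_keys_to_delete - join_keys_to_keep == {("zone",)}  # only zone rows are removed
```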
""" - for table in tables_to_delete: - self.delete_table_values(config, table) + join_keys_to_keep = set(tuple(table.entities) for table in tables_to_keep) + + join_keys_to_delete = set(tuple(table.entities) for table in tables_to_delete) + + for join_keys in join_keys_to_delete - join_keys_to_keep: + self.delete_entity_values(config, list(join_keys)) def teardown( self, @@ -112,8 +118,10 @@ def teardown( """ We delete the keys in redis for tables/views being removed. """ - for table in tables: - self.delete_table_values(config, table) + join_keys_to_delete = set(tuple(table.entities) for table in tables) + + for join_keys in join_keys_to_delete: + self.delete_entity_values(config, list(join_keys)) @staticmethod def _parse_connection_string(connection_string: str): diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index b23c68033e..f483d54f6b 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -28,6 +28,7 @@ ) from tests.integration.feature_repos.universal.feature_views import ( create_driver_hourly_stats_feature_view, + driver_feature_view, ) from tests.utils.data_source_utils import prep_file_source @@ -503,6 +504,79 @@ def test_online_retrieval(environment, universal_data_sources, full_feature_name ) +@pytest.mark.integration +@pytest.mark.universal +def test_online_store_cleanup(environment, universal_data_sources): + """ + Some online store implementations (like Redis) keep features from different features views + but with common entities together. + This might end up with deletion of all features attached to the entity, + when only one feature view was deletion target (see https://github.com/feast-dev/feast/issues/2150). + + Plan: + 1. Register two feature views with common entity "driver" + 2. Materialize data + 3. Check if features are available (via online retrieval) + 4. Delete one feature view + 5. Check that features for other are still available + 6. Delete another feature view (and create again) + 7. 
Verify that features for both feature view were deleted + """ + fs = environment.feature_store + entities, datasets, data_sources = universal_data_sources + driver_stats_fv = construct_universal_feature_views(data_sources)["driver"] + + df = pd.DataFrame( + { + "ts_1": [environment.end_date] * len(entities["driver"]), + "created_ts": [environment.end_date] * len(entities["driver"]), + "driver_id": entities["driver"], + "value": np.random.random(size=len(entities["driver"])), + } + ) + + ds = environment.data_source_creator.create_data_source( + df, destination_name="simple_driver_dataset" + ) + + simple_driver_fv = driver_feature_view( + data_source=ds, name="test_universal_online_simple_driver" + ) + + fs.apply([driver(), simple_driver_fv, driver_stats_fv]) + + fs.materialize( + environment.start_date - timedelta(days=1), + environment.end_date + timedelta(days=1), + ) + expected_values = df.sort_values(by="driver_id") + + features = [f"{simple_driver_fv.name}:value"] + entity_rows = [{"driver": driver_id} for driver_id in sorted(entities["driver"])] + + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert np.allclose(expected_values["value"], online_features["value"]) + + fs.apply( + objects=[simple_driver_fv], objects_to_delete=[driver_stats_fv], partial=False + ) + + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert np.allclose(expected_values["value"], online_features["value"]) + + fs.apply(objects=[], objects_to_delete=[simple_driver_fv], partial=False) + fs.apply([simple_driver_fv]) + + online_features = fs.get_online_features( + features=features, entity_rows=entity_rows + ).to_dict() + assert all(v is None for v in online_features["value"]) + + def response_feature_name(feature: str, full_feature_names: bool) -> str: if ( feature in {"current_balance", "avg_passenger_count", "lifetime_trip_count"} From 68d2c1a4a43c724236a16f54f3af4f977e7603a1 Mon Sep 17 00:00:00 2001 From: Michelle Rascati <44408275+michelle-rascati-sp@users.noreply.github.com> Date: Fri, 28 Jan 2022 01:57:54 -0600 Subject: [PATCH 03/19] historical_field_mappings2 merge for one sign off commit (#2252) Signed-off-by: Michelle Rascati Signed-off-by: sfc-gh-madkins --- CONTRIBUTING.md | 2 +- sdk/python/feast/driver_test_data.py | 26 ++++++++++++++ .../feast/infra/offline_stores/bigquery.py | 4 +-- .../infra/offline_stores/offline_utils.py | 12 +++++-- .../feast/infra/offline_stores/redshift.py | 4 +-- sdk/python/setup.py | 2 +- .../feature_repos/repo_configuration.py | 14 ++++++++ .../feature_repos/universal/feature_views.py | 10 ++++++ .../test_universal_historical_retrieval.py | 36 +++++++++++++++++++ 9 files changed, 102 insertions(+), 8 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6918d7f1de..dbf44d4bef 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -50,7 +50,7 @@ Setting up your development environment for Feast Python SDK / CLI: 3. 
_Recommended:_ Create a virtual environment to isolate development dependencies to be installed ```sh # create & activate a virtual environment -python -v venv venv/ +python -m venv venv/ source venv/bin/activate ``` diff --git a/sdk/python/feast/driver_test_data.py b/sdk/python/feast/driver_test_data.py index 1c9a1dd20b..117bfcbd9c 100644 --- a/sdk/python/feast/driver_test_data.py +++ b/sdk/python/feast/driver_test_data.py @@ -264,3 +264,29 @@ def create_global_daily_stats_df(start_date, end_date) -> pd.DataFrame: # TODO: Remove created timestamp in order to test whether its really optional df_daily["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) return df_daily + + +def create_field_mapping_df(start_date, end_date) -> pd.DataFrame: + """ + Example df generated by this function: + | event_timestamp | column_name | created | + |------------------+-------------+------------------| + | 2021-03-17 19:00 | 99 | 2021-03-24 19:38 | + | 2021-03-17 19:00 | 22 | 2021-03-24 19:38 | + | 2021-03-17 19:00 | 7 | 2021-03-24 19:38 | + | 2021-03-17 19:00 | 45 | 2021-03-24 19:38 | + """ + size = 10 + df = pd.DataFrame() + df["column_name"] = np.random.randint(1, 100, size=size).astype(np.int32) + df[DEFAULT_ENTITY_DF_EVENT_TIMESTAMP_COL] = [ + _convert_event_timestamp( + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms"), + EventTimestampType(idx % 4), + ) + for idx, dt in enumerate( + pd.date_range(start=start_date, end=end_date, periods=size) + ) + ] + df["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms")) + return df diff --git a/sdk/python/feast/infra/offline_stores/bigquery.py b/sdk/python/feast/infra/offline_stores/bigquery.py index f34a997718..44e62d6ad1 100644 --- a/sdk/python/feast/infra/offline_stores/bigquery.py +++ b/sdk/python/feast/infra/offline_stores/bigquery.py @@ -598,7 +598,7 @@ def _get_bigquery_client(project: Optional[str] = None, location: Optional[str] {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %} {% for feature in featureview.features %} - {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %} {% endfor %} FROM {{ featureview.table_subquery }} WHERE {{ featureview.event_timestamp_column }} <= '{{ featureview.max_event_timestamp }}' @@ -699,7 +699,7 @@ def _get_bigquery_client(project: Optional[str] = None, location: Optional[str] SELECT {{featureview.name}}__entity_row_unique_id {% for feature in featureview.features %} - ,{% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %} + ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %} {% endfor %} FROM {{ featureview.name }}__cleaned ) USING ({{featureview.name}}__entity_row_unique_id) diff --git a/sdk/python/feast/infra/offline_stores/offline_utils.py b/sdk/python/feast/infra/offline_stores/offline_utils.py index 0b60c3493d..eaf4925266 100644 --- a/sdk/python/feast/infra/offline_stores/offline_utils.py +++ 
b/sdk/python/feast/infra/offline_stores/offline_utils.py @@ -85,6 +85,7 @@ class FeatureViewQueryContext: ttl: int entities: List[str] features: List[str] # feature reference format + field_mapping: Dict[str, str] event_timestamp_column: str created_timestamp_column: Optional[str] table_subquery: str @@ -144,7 +145,10 @@ def get_feature_view_query_context( name=feature_view.projection.name_to_use(), ttl=ttl_seconds, entities=join_keys, - features=features, + features=[ + reverse_field_mapping.get(feature, feature) for feature in features + ], + field_mapping=feature_view.input.field_mapping, event_timestamp_column=reverse_field_mapping.get( event_timestamp_column, event_timestamp_column ), @@ -175,7 +179,11 @@ def build_point_in_time_query( final_output_feature_names = list(entity_df_columns) final_output_feature_names.extend( [ - (f"{fv.name}__{feature}" if full_feature_names else feature) + ( + f"{fv.name}__{fv.field_mapping.get(feature, feature)}" + if full_feature_names + else fv.field_mapping.get(feature, feature) + ) for fv in feature_view_query_contexts for feature in fv.features ] diff --git a/sdk/python/feast/infra/offline_stores/redshift.py b/sdk/python/feast/infra/offline_stores/redshift.py index 2aa3d5c41c..3efd45bc74 100644 --- a/sdk/python/feast/infra/offline_stores/redshift.py +++ b/sdk/python/feast/infra/offline_stores/redshift.py @@ -563,7 +563,7 @@ def _get_entity_df_event_timestamp_range( {{ featureview.created_timestamp_column ~ ' as created_timestamp,' if featureview.created_timestamp_column else '' }} {{ featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %} {% for feature in featureview.features %} - {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %}{% if loop.last %}{% else %}, {% endif %} + {{ feature }} as {% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %}{% if loop.last %}{% else %}, {% endif %} {% endfor %} FROM {{ featureview.table_subquery }} WHERE {{ featureview.event_timestamp_column }} <= '{{ featureview.max_event_timestamp }}' @@ -664,7 +664,7 @@ def _get_entity_df_event_timestamp_range( SELECT {{featureview.name}}__entity_row_unique_id {% for feature in featureview.features %} - ,{% if full_feature_names %}{{ featureview.name }}__{{feature}}{% else %}{{ feature }}{% endif %} + ,{% if full_feature_names %}{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}{% else %}{{ featureview.field_mapping.get(feature, feature) }}{% endif %} {% endfor %} FROM {{ featureview.name }}__cleaned ) USING ({{featureview.name}}__entity_row_unique_id) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index d35ee9de11..4f01c7b4e0 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -132,7 +132,7 @@ + AWS_REQUIRED ) -DEV_REQUIRED = ["mypy-protobuf==1.*", "grpcio-testing==1.*"] + CI_REQUIRED +DEV_REQUIRED = ["mypy-protobuf>=1.*", "grpcio-testing==1.*"] + CI_REQUIRED # Get git repo root directory repo_root = str(pathlib.Path(__file__).resolve().parent.parent.parent) diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index f66a92c9d6..f0fb0b28fd 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -35,6 +35,7 @@ 
create_customer_daily_profile_feature_view, create_driver_age_request_feature_view, create_driver_hourly_stats_feature_view, + create_field_mapping_feature_view, create_global_stats_feature_view, create_location_stats_feature_view, create_order_feature_view, @@ -126,6 +127,7 @@ def construct_universal_datasets( order_count=20, ) global_df = driver_test_data.create_global_daily_stats_df(start_time, end_time) + field_mapping_df = driver_test_data.create_field_mapping_df(start_time, end_time) entity_df = orders_df[ [ "customer_id", @@ -143,6 +145,7 @@ def construct_universal_datasets( "location": location_df, "orders": orders_df, "global": global_df, + "field_mapping": field_mapping_df, "entity": entity_df, } @@ -180,12 +183,20 @@ def construct_universal_data_sources( event_timestamp_column="event_timestamp", created_timestamp_column="created", ) + field_mapping_ds = data_source_creator.create_data_source( + datasets["field_mapping"], + destination_name="field_mapping", + event_timestamp_column="event_timestamp", + created_timestamp_column="created", + field_mapping={"column_name": "feature_name"}, + ) return { "customer": customer_ds, "driver": driver_ds, "location": location_ds, "orders": orders_ds, "global": global_ds, + "field_mapping": field_mapping_ds, } @@ -210,6 +221,9 @@ def construct_universal_feature_views( "driver_age_request_fv": create_driver_age_request_feature_view(), "order": create_order_feature_view(data_sources["orders"]), "location": create_location_stats_feature_view(data_sources["location"]), + "field_mapping": create_field_mapping_feature_view( + data_sources["field_mapping"] + ), } diff --git a/sdk/python/tests/integration/feature_repos/universal/feature_views.py b/sdk/python/tests/integration/feature_repos/universal/feature_views.py index f68add88cb..b0dc34197f 100644 --- a/sdk/python/tests/integration/feature_repos/universal/feature_views.py +++ b/sdk/python/tests/integration/feature_repos/universal/feature_views.py @@ -217,3 +217,13 @@ def create_location_stats_feature_view(source, infer_features: bool = False): ttl=timedelta(days=2), ) return location_stats_feature_view + + +def create_field_mapping_feature_view(source): + return FeatureView( + name="field_mapping", + entities=[], + features=[Feature(name="feature_name", dtype=ValueType.INT32)], + batch_source=source, + ttl=timedelta(days=2), + ) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 5e4bd00460..147e20aee1 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -82,6 +82,8 @@ def get_expected_training_df( location_fv: FeatureView, global_df: pd.DataFrame, global_fv: FeatureView, + field_mapping_df: pd.DataFrame, + field_mapping_fv: FeatureView, entity_df: pd.DataFrame, event_timestamp: str, full_feature_names: bool = False, @@ -102,6 +104,10 @@ def get_expected_training_df( global_records = convert_timestamp_records_to_utc( global_df.to_dict("records"), global_fv.batch_source.event_timestamp_column ) + field_mapping_records = convert_timestamp_records_to_utc( + field_mapping_df.to_dict("records"), + field_mapping_fv.batch_source.event_timestamp_column, + ) entity_rows = convert_timestamp_records_to_utc( entity_df.to_dict("records"), event_timestamp ) @@ -156,6 +162,13 @@ def get_expected_training_df( ts_end=order_record[event_timestamp], ) + 
field_mapping_record = find_asof_record( + field_mapping_records, + ts_key=field_mapping_fv.batch_source.event_timestamp_column, + ts_start=order_record[event_timestamp] - field_mapping_fv.ttl, + ts_end=order_record[event_timestamp], + ) + entity_row.update( { ( @@ -197,6 +210,16 @@ def get_expected_training_df( } ) + # get field_mapping_record by column name, but label by feature name + entity_row.update( + { + ( + f"field_mapping__{feature}" if full_feature_names else feature + ): field_mapping_record.get(column, None) + for (column, feature) in field_mapping_fv.input.field_mapping.items() + } + ) + # Convert records back to pandas dataframe expected_df = pd.DataFrame(entity_rows) @@ -213,6 +236,7 @@ def get_expected_training_df( "customer_profile__current_balance": "float32", "customer_profile__avg_passenger_count": "float32", "global_stats__avg_ride_length": "float32", + "field_mapping__feature_name": "int32", } else: expected_column_types = { @@ -221,6 +245,7 @@ def get_expected_training_df( "current_balance": "float32", "avg_passenger_count": "float32", "avg_ride_length": "float32", + "feature_name": "int32", } for col, typ in expected_column_types.items(): @@ -311,6 +336,8 @@ def test_historical_features(environment, universal_data_sources, full_feature_n feature_views["location"], datasets["global"], feature_views["global"], + datasets["field_mapping"], + feature_views["field_mapping"], entity_df_with_request_data, event_timestamp, full_feature_names, @@ -336,6 +363,7 @@ def test_historical_features(environment, universal_data_sources, full_feature_n "global_stats:num_rides", "global_stats:avg_ride_length", "driver_age:driver_age", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) @@ -404,6 +432,7 @@ def test_historical_features_with_missing_request_data( "conv_rate_plus_100:conv_rate_plus_val_to_add", "global_stats:num_rides", "global_stats:avg_ride_length", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) @@ -419,6 +448,7 @@ def test_historical_features_with_missing_request_data( "driver_age:driver_age", "global_stats:num_rides", "global_stats:avg_ride_length", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) @@ -452,6 +482,7 @@ def test_historical_features_with_entities_from_query( "order:order_is_success", "global_stats:num_rides", "global_stats:avg_ride_length", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) @@ -477,6 +508,8 @@ def test_historical_features_with_entities_from_query( feature_views["location"], datasets["global"], feature_views["global"], + datasets["field_mapping"], + feature_views["field_mapping"], datasets["entity"], event_timestamp, full_feature_names, @@ -538,6 +571,7 @@ def test_historical_features_persisting( "order:order_is_success", "global_stats:num_rides", "global_stats:avg_ride_length", + "field_mapping:feature_name", ], full_feature_names=full_feature_names, ) @@ -561,6 +595,8 @@ def test_historical_features_persisting( feature_views["location"], datasets["global"], feature_views["global"], + datasets["field_mapping"], + feature_views["field_mapping"], entity_df, event_timestamp, full_feature_names, From 507ca6bf4c489925729581ff764434d6f73227cd Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Fri, 28 Jan 2022 16:21:54 +0000 Subject: [PATCH 04/19] Correct inconsistent dependency (#2255) Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> Signed-off-by: sfc-gh-madkins --- 
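The inconsistency being corrected is that the development requirement and the build-time requirement pinned different mypy-protobuf versions; an illustrative check (an aside, not part of the patch) that a single release now satisfies both specifiers:

```python
# Illustrative sanity check of the two specifiers touched by this patch.
from packaging.specifiers import SpecifierSet

dev_requirement = SpecifierSet(">=3.1.0")    # DEV_REQUIRED entry
setup_requirement = SpecifierSet("==3.1.0")  # setup_requires entry
print("3.1.0" in dev_requirement and "3.1.0" in setup_requirement)  # True
```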
sdk/python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index 4f01c7b4e0..bae1695bf1 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -132,7 +132,7 @@ + AWS_REQUIRED ) -DEV_REQUIRED = ["mypy-protobuf>=1.*", "grpcio-testing==1.*"] + CI_REQUIRED +DEV_REQUIRED = ["mypy-protobuf>=3.1.0", "grpcio-testing==1.*"] + CI_REQUIRED # Get git repo root directory repo_root = str(pathlib.Path(__file__).resolve().parent.parent.parent) @@ -244,7 +244,7 @@ def run(self): ], entry_points={"console_scripts": ["feast=feast.cli:cli"]}, use_scm_version=use_scm_version, - setup_requires=["setuptools_scm", "grpcio", "grpcio-tools==1.34.0", "mypy-protobuf==1.*", "sphinx!=4.0.0"], + setup_requires=["setuptools_scm", "grpcio", "grpcio-tools==1.34.0", "mypy-protobuf==3.1.0", "sphinx!=4.0.0"], package_data={ "": [ "protos/feast/**/*.proto", From 660a7d0ec18c87da9c47cfae1ff5c2302e03450b Mon Sep 17 00:00:00 2001 From: sfc-gh-madkins <82121043+sfc-gh-madkins@users.noreply.github.com> Date: Sat, 29 Jan 2022 09:01:31 -0600 Subject: [PATCH 05/19] Add snowflake environment variables to allow testing on snowflake infra (#2258) * add snowflake environment vars to test framework Signed-off-by: sfc-gh-madkins * add snowflake environment vars to test framework Signed-off-by: sfc-gh-madkins --- .github/workflows/master_only.yml | 5 +++++ .github/workflows/pr_integration_tests.yml | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/.github/workflows/master_only.yml b/.github/workflows/master_only.yml index 42f0383832..66b5c62073 100644 --- a/.github/workflows/master_only.yml +++ b/.github/workflows/master_only.yml @@ -125,6 +125,11 @@ jobs: FEAST_SERVER_DOCKER_IMAGE_TAG: ${{ needs.build-lambda-docker-image.outputs.DOCKER_IMAGE_TAG }} FEAST_USAGE: "False" IS_TEST: "True" + SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} + SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} + SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} + SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} run: pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration --durations=5 - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 diff --git a/.github/workflows/pr_integration_tests.yml b/.github/workflows/pr_integration_tests.yml index 8a910f943c..e04b78ec32 100644 --- a/.github/workflows/pr_integration_tests.yml +++ b/.github/workflows/pr_integration_tests.yml @@ -151,6 +151,11 @@ jobs: FEAST_SERVER_DOCKER_IMAGE_TAG: ${{ needs.build-docker-image.outputs.DOCKER_IMAGE_TAG }} FEAST_USAGE: "False" IS_TEST: "True" + SNOWFLAKE_CI_DEPLOYMENT: ${{ secrets.SNOWFLAKE_CI_DEPLOYMENT }} + SNOWFLAKE_CI_USER: ${{ secrets.SNOWFLAKE_CI_USER }} + SNOWFLAKE_CI_PASSWORD: ${{ secrets.SNOWFLAKE_CI_PASSWORD }} + SNOWFLAKE_CI_ROLE: ${{ secrets.SNOWFLAKE_CI_ROLE }} + SNOWFLAKE_CI_WAREHOUSE: ${{ secrets.SNOWFLAKE_CI_WAREHOUSE }} run: pytest -n 8 --cov=./ --cov-report=xml --verbose --color=yes sdk/python/tests --integration --durations=5 - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 From d44561b9f36d8d34988fea025809c89d9628eca2 Mon Sep 17 00:00:00 2001 From: Judah Rand <17158624+judahrand@users.noreply.github.com> Date: Sat, 29 Jan 2022 21:38:32 +0000 Subject: [PATCH 06/19] Return `UNIX_TIMESTAMP` as Python `datetime` (#2244) * Refactor `UNIX_TIMESTAMP` conversion Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> * 
Return `UNIX_TIMESTAMP` types as `datetime` to user Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> * Fix linting errors Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> * Rename variable to something more sensible Signed-off-by: Judah Rand <17158624+judahrand@users.noreply.github.com> Signed-off-by: sfc-gh-madkins --- sdk/python/feast/type_map.py | 88 ++++++++++++------- .../registration/test_universal_types.py | 2 +- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 74c4cb17ed..599be85fdf 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -13,8 +13,20 @@ # limitations under the License. import re -from datetime import datetime -from typing import Any, Dict, List, Optional, Set, Sized, Tuple, Type +from datetime import datetime, timezone +from typing import ( + Any, + Dict, + List, + Optional, + Sequence, + Set, + Sized, + Tuple, + Type, + Union, + cast, +) import numpy as np import pandas as pd @@ -49,8 +61,17 @@ def feast_value_type_to_python_type(field_value_proto: ProtoValue) -> Any: if val_attr is None: return None val = getattr(field_value_proto, val_attr) + + # If it's a _LIST type extract the list. if hasattr(val, "val"): val = list(val.val) + + # Convert UNIX_TIMESTAMP values to `datetime` + if val_attr == "unix_timestamp_list_val": + val = [datetime.fromtimestamp(v, tz=timezone.utc) for v in val] + elif val_attr == "unix_timestamp_val": + val = datetime.fromtimestamp(val, tz=timezone.utc) + return val @@ -240,6 +261,28 @@ def _type_err(item, dtype): } +def _python_datetime_to_int_timestamp( + values: Sequence[Any], +) -> Sequence[Union[int, np.int_]]: + # Fast path for Numpy array. + if isinstance(values, np.ndarray) and isinstance(values.dtype, np.datetime64): + if values.ndim != 1: + raise ValueError("Only 1 dimensional arrays are supported.") + return cast(Sequence[np.int_], values.astype("datetime64[s]").astype(np.int_)) + + int_timestamps = [] + for value in values: + if isinstance(value, datetime): + int_timestamps.append(int(value.timestamp())) + elif isinstance(value, Timestamp): + int_timestamps.append(int(value.ToSeconds())) + elif isinstance(value, np.datetime64): + int_timestamps.append(value.astype("datetime64[s]").astype(np.int_)) + else: + int_timestamps.append(int(value)) + return int_timestamps + + def _python_value_to_proto_value( feast_value_type: ValueType, values: List[Any] ) -> List[ProtoValue]: @@ -275,22 +318,14 @@ def _python_value_to_proto_value( raise _type_err(first_invalid, valid_types[0]) if feast_value_type == ValueType.UNIX_TIMESTAMP_LIST: - converted_values = [] - for value in values: - converted_sub_values = [] - for sub_value in value: - if isinstance(sub_value, datetime): - converted_sub_values.append(int(sub_value.timestamp())) - elif isinstance(sub_value, Timestamp): - converted_sub_values.append(int(sub_value.ToSeconds())) - elif isinstance(sub_value, np.datetime64): - converted_sub_values.append( - sub_value.astype("datetime64[s]").astype("int") - ) - else: - converted_sub_values.append(sub_value) - converted_values.append(converted_sub_values) - values = converted_values + int_timestamps_lists = ( + _python_datetime_to_int_timestamp(value) for value in values + ) + return [ + # ProtoValue does actually accept `np.int_` but the typing complains. 
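For orientation, a brief sketch (not part of the diff) of the round trip this patch implements: UNIX_TIMESTAMP values are stored on the proto as UNIX seconds and handed back to callers as timezone-aware datetimes.

```python
# Round trip of a UNIX_TIMESTAMP value, mirroring the conversion helpers in this patch.
from datetime import datetime, timezone

dt = datetime(2022, 1, 29, 12, 0, tzinfo=timezone.utc)
seconds = int(dt.timestamp())                                # what gets stored in the proto
restored = datetime.fromtimestamp(seconds, tz=timezone.utc)  # what the SDK now returns
assert restored == dt
```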
+ ProtoValue(unix_timestamp_list_val=Int64List(val=ts)) # type: ignore + for ts in int_timestamps_lists + ] return [ ProtoValue(**{field_name: proto_type(val=value)}) # type: ignore @@ -302,20 +337,9 @@ def _python_value_to_proto_value( # Handle scalar types below else: if feast_value_type == ValueType.UNIX_TIMESTAMP: - if isinstance(sample, datetime): - return [ - ProtoValue(int64_val=int(value.timestamp())) for value in values - ] - elif isinstance(sample, Timestamp): - return [ - ProtoValue(int64_val=int(value.ToSeconds())) for value in values - ] - elif isinstance(sample, np.datetime64): - return [ - ProtoValue(int64_val=value.astype("datetime64[s]").astype("int")) - for value in values - ] - return [ProtoValue(int64_val=int(value)) for value in values] + int_timestamps = _python_datetime_to_int_timestamp(values) + # ProtoValue does actually accept `np.int_` but the typing complains. + return [ProtoValue(unix_timestamp_val=ts) for ts in int_timestamps] # type: ignore if feast_value_type in PYTHON_SCALAR_VALUE_TYPE_TO_PROTO_VALUE: ( diff --git a/sdk/python/tests/integration/registration/test_universal_types.py b/sdk/python/tests/integration/registration/test_universal_types.py index 8cb21e6384..5c782306e6 100644 --- a/sdk/python/tests/integration/registration/test_universal_types.py +++ b/sdk/python/tests/integration/registration/test_universal_types.py @@ -234,7 +234,7 @@ def test_feature_get_online_features_types_match(online_types_test_fixtures): "float": float, "string": str, "bool": bool, - "datetime": int, + "datetime": datetime, } expected_dtype = feature_list_dtype_to_expected_online_response_value_type[ config.feature_dtype From e617a0698cbb7ae782eac1c4b6672333d29b5ab2 Mon Sep 17 00:00:00 2001 From: Felix Wang Date: Sun, 30 Jan 2022 09:02:52 -0800 Subject: [PATCH 07/19] Feast plan clean up (#2256) * Run validation and inference on views and entities during plan Signed-off-by: Felix Wang * Do not log objects that are unchanged Signed-off-by: Felix Wang * Rename Fco to FeastObject Signed-off-by: Felix Wang * Remove useless method Signed-off-by: Felix Wang * Lint Signed-off-by: Felix Wang * Always initialize registry during feature store initialization Signed-off-by: Felix Wang * Fix usage test Signed-off-by: Felix Wang * Remove print statements Signed-off-by: Felix Wang Signed-off-by: sfc-gh-madkins --- sdk/python/feast/diff/infra_diff.py | 8 + .../diff/{FcoDiff.py => registry_diff.py} | 153 +++++++++--------- sdk/python/feast/feature_store.py | 63 +++----- sdk/python/feast/inference.py | 12 +- sdk/python/feast/repo_operations.py | 3 +- .../tests/integration/e2e/test_usage_e2e.py | 10 +- ...test_fco_diff.py => test_registry_diff.py} | 67 ++------ 7 files changed, 145 insertions(+), 171 deletions(-) rename sdk/python/feast/diff/{FcoDiff.py => registry_diff.py} (63%) rename sdk/python/tests/unit/diff/{test_fco_diff.py => test_registry_diff.py} (53%) diff --git a/sdk/python/feast/diff/infra_diff.py b/sdk/python/feast/diff/infra_diff.py index d5bcbbc44a..a09eaf39eb 100644 --- a/sdk/python/feast/diff/infra_diff.py +++ b/sdk/python/feast/diff/infra_diff.py @@ -71,12 +71,20 @@ def to_string(self): TransitionType.UPDATE: ("Updated", Fore.YELLOW), } for infra_object_diff in self.infra_object_diffs: + if infra_object_diff.transition_type == TransitionType.UNCHANGED: + continue action, color = message_action_map[infra_object_diff.transition_type] log_string += f"{action} {infra_object_diff.infra_object_type} {Style.BRIGHT + color}{infra_object_diff.name}{Style.RESET_ALL}\n" if 
infra_object_diff.transition_type == TransitionType.UPDATE: for _p in infra_object_diff.infra_object_property_diffs: log_string += f"\t{_p.property_name}: {Style.BRIGHT + color}{_p.val_existing}{Style.RESET_ALL} -> {Style.BRIGHT + Fore.LIGHTGREEN_EX}{_p.val_declared}{Style.RESET_ALL}\n" + log_string = ( + f"{Style.BRIGHT + Fore.LIGHTBLUE_EX}No changes to infrastructure" + if not log_string + else log_string + ) + return log_string diff --git a/sdk/python/feast/diff/FcoDiff.py b/sdk/python/feast/diff/registry_diff.py similarity index 63% rename from sdk/python/feast/diff/FcoDiff.py rename to sdk/python/feast/diff/registry_diff.py index 1ea66ec659..1f68d3ff65 100644 --- a/sdk/python/feast/diff/FcoDiff.py +++ b/sdk/python/feast/diff/registry_diff.py @@ -20,28 +20,28 @@ from feast.registry import FEAST_OBJECT_TYPES, FeastObjectType, Registry from feast.repo_contents import RepoContents -Fco = TypeVar("Fco", Entity, BaseFeatureView, FeatureService) +FeastObject = TypeVar("FeastObject", Entity, BaseFeatureView, FeatureService) @dataclass -class FcoDiff(Generic[Fco]): +class FeastObjectDiff(Generic[FeastObject]): name: str - fco_type: FeastObjectType - current_fco: Fco - new_fco: Fco - fco_property_diffs: List[PropertyDiff] + feast_object_type: FeastObjectType + current_feast_object: FeastObject + new_feast_object: FeastObject + feast_object_property_diffs: List[PropertyDiff] transition_type: TransitionType @dataclass class RegistryDiff: - fco_diffs: List[FcoDiff] + feast_object_diffs: List[FeastObjectDiff] def __init__(self): - self.fco_diffs = [] + self.feast_object_diffs = [] - def add_fco_diff(self, fco_diff: FcoDiff): - self.fco_diffs.append(fco_diff) + def add_feast_object_diff(self, feast_object_diff: FeastObjectDiff): + self.feast_object_diffs.append(feast_object_diff) def to_string(self): from colorama import Fore, Style @@ -54,21 +54,29 @@ def to_string(self): TransitionType.UNCHANGED: ("Unchanged", Fore.LIGHTBLUE_EX), TransitionType.UPDATE: ("Updated", Fore.YELLOW), } - for fco_diff in self.fco_diffs: - if fco_diff.name == DUMMY_ENTITY_NAME: + for feast_object_diff in self.feast_object_diffs: + if feast_object_diff.name == DUMMY_ENTITY_NAME: continue - action, color = message_action_map[fco_diff.transition_type] - log_string += f"{action} {fco_diff.fco_type.value} {Style.BRIGHT + color}{fco_diff.name}{Style.RESET_ALL}\n" - if fco_diff.transition_type == TransitionType.UPDATE: - for _p in fco_diff.fco_property_diffs: + if feast_object_diff.transition_type == TransitionType.UNCHANGED: + continue + action, color = message_action_map[feast_object_diff.transition_type] + log_string += f"{action} {feast_object_diff.feast_object_type.value} {Style.BRIGHT + color}{feast_object_diff.name}{Style.RESET_ALL}\n" + if feast_object_diff.transition_type == TransitionType.UPDATE: + for _p in feast_object_diff.feast_object_property_diffs: log_string += f"\t{_p.property_name}: {Style.BRIGHT + color}{_p.val_existing}{Style.RESET_ALL} -> {Style.BRIGHT + Fore.LIGHTGREEN_EX}{_p.val_declared}{Style.RESET_ALL}\n" + log_string = ( + f"{Style.BRIGHT + Fore.LIGHTBLUE_EX}No changes to registry" + if not log_string + else log_string + ) + return log_string def tag_objects_for_keep_delete_update_add( - existing_objs: Iterable[Fco], desired_objs: Iterable[Fco] -) -> Tuple[Set[Fco], Set[Fco], Set[Fco], Set[Fco]]: + existing_objs: Iterable[FeastObject], desired_objs: Iterable[FeastObject] +) -> Tuple[Set[FeastObject], Set[FeastObject], Set[FeastObject], Set[FeastObject]]: existing_obj_names = {e.name for e in 
existing_objs} desired_obj_names = {e.name for e in desired_objs} @@ -80,8 +88,8 @@ def tag_objects_for_keep_delete_update_add( return objs_to_keep, objs_to_delete, objs_to_update, objs_to_add -FcoProto = TypeVar( - "FcoProto", +FeastObjectProto = TypeVar( + "FeastObjectProto", EntityProto, FeatureViewProto, FeatureServiceProto, @@ -90,25 +98,12 @@ def tag_objects_for_keep_delete_update_add( ) -def tag_proto_objects_for_keep_delete_add( - existing_objs: Iterable[FcoProto], desired_objs: Iterable[FcoProto] -) -> Tuple[Iterable[FcoProto], Iterable[FcoProto], Iterable[FcoProto]]: - existing_obj_names = {e.spec.name for e in existing_objs} - desired_obj_names = {e.spec.name for e in desired_objs} - - objs_to_add = [e for e in desired_objs if e.spec.name not in existing_obj_names] - objs_to_keep = [e for e in desired_objs if e.spec.name in existing_obj_names] - objs_to_delete = [e for e in existing_objs if e.spec.name not in desired_obj_names] - - return objs_to_keep, objs_to_delete, objs_to_add - - FIELDS_TO_IGNORE = {"project"} def diff_registry_objects( - current: Fco, new: Fco, object_type: FeastObjectType -) -> FcoDiff: + current: FeastObject, new: FeastObject, object_type: FeastObjectType +) -> FeastObjectDiff: current_proto = current.to_proto() new_proto = new.to_proto() assert current_proto.DESCRIPTOR.full_name == new_proto.DESCRIPTOR.full_name @@ -129,12 +124,12 @@ def diff_registry_objects( getattr(new_proto.spec, _field.name), ) ) - return FcoDiff( + return FeastObjectDiff( name=new_proto.spec.name, - fco_type=object_type, - current_fco=current, - new_fco=new, - fco_property_diffs=property_diffs, + feast_object_type=object_type, + current_feast_object=current, + new_feast_object=new, + feast_object_property_diffs=property_diffs, transition_type=transition, ) @@ -142,10 +137,10 @@ def diff_registry_objects( def extract_objects_for_keep_delete_update_add( registry: Registry, current_project: str, desired_repo_contents: RepoContents, ) -> Tuple[ - Dict[FeastObjectType, Set[Fco]], - Dict[FeastObjectType, Set[Fco]], - Dict[FeastObjectType, Set[Fco]], - Dict[FeastObjectType, Set[Fco]], + Dict[FeastObjectType, Set[FeastObject]], + Dict[FeastObjectType, Set[FeastObject]], + Dict[FeastObjectType, Set[FeastObject]], + Dict[FeastObjectType, Set[FeastObject]], ]: """ Returns the objects in the registry that must be modified to achieve the desired repo state. 
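The keep/delete/update/add split computed by these helpers can be summarized with a small worked example (view names are hypothetical):

```python
# The registry currently holds views {"a", "b"}; the repo now declares {"b", "c"}.
existing, desired = {"a", "b"}, {"b", "c"}

to_add = desired - existing     # {"c"}: declared but not yet registered
to_delete = existing - desired  # {"a"}: registered but no longer declared
to_keep = desired & existing    # {"b"}: present on both sides; each such pair is then
                                # property-diffed to decide between UPDATE and UNCHANGED
```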
@@ -215,30 +210,32 @@ def diff_between( objects_to_add = objs_to_add[object_type] for e in objects_to_add: - diff.add_fco_diff( - FcoDiff( + diff.add_feast_object_diff( + FeastObjectDiff( name=e.name, - fco_type=object_type, - current_fco=None, - new_fco=e, - fco_property_diffs=[], + feast_object_type=object_type, + current_feast_object=None, + new_feast_object=e, + feast_object_property_diffs=[], transition_type=TransitionType.CREATE, ) ) for e in objects_to_delete: - diff.add_fco_diff( - FcoDiff( + diff.add_feast_object_diff( + FeastObjectDiff( name=e.name, - fco_type=object_type, - current_fco=e, - new_fco=None, - fco_property_diffs=[], + feast_object_type=object_type, + current_feast_object=e, + new_feast_object=None, + feast_object_property_diffs=[], transition_type=TransitionType.DELETE, ) ) for e in objects_to_update: current_obj = [_e for _e in objects_to_keep if _e.name == e.name][0] - diff.add_fco_diff(diff_registry_objects(current_obj, e, object_type)) + diff.add_feast_object_diff( + diff_registry_objects(current_obj, e, object_type) + ) return diff @@ -255,39 +252,47 @@ def apply_diff_to_registry( project: Feast project to be updated. commit: Whether the change should be persisted immediately """ - for fco_diff in registry_diff.fco_diffs: - # There is no need to delete the FCO on an update, since applying the new FCO - # will automatically delete the existing FCO. - if fco_diff.transition_type == TransitionType.DELETE: - if fco_diff.fco_type == FeastObjectType.ENTITY: - registry.delete_entity(fco_diff.current_fco.name, project, commit=False) - elif fco_diff.fco_type == FeastObjectType.FEATURE_SERVICE: + for feast_object_diff in registry_diff.feast_object_diffs: + # There is no need to delete the object on an update, since applying the new object + # will automatically delete the existing object. 
+ if feast_object_diff.transition_type == TransitionType.DELETE: + if feast_object_diff.feast_object_type == FeastObjectType.ENTITY: + registry.delete_entity( + feast_object_diff.current_feast_object.name, project, commit=False + ) + elif feast_object_diff.feast_object_type == FeastObjectType.FEATURE_SERVICE: registry.delete_feature_service( - fco_diff.current_fco.name, project, commit=False + feast_object_diff.current_feast_object.name, project, commit=False ) - elif fco_diff.fco_type in [ + elif feast_object_diff.feast_object_type in [ FeastObjectType.FEATURE_VIEW, FeastObjectType.ON_DEMAND_FEATURE_VIEW, FeastObjectType.REQUEST_FEATURE_VIEW, ]: registry.delete_feature_view( - fco_diff.current_fco.name, project, commit=False, + feast_object_diff.current_feast_object.name, project, commit=False, ) - if fco_diff.transition_type in [ + if feast_object_diff.transition_type in [ TransitionType.CREATE, TransitionType.UPDATE, ]: - if fco_diff.fco_type == FeastObjectType.ENTITY: - registry.apply_entity(fco_diff.new_fco, project, commit=False) - elif fco_diff.fco_type == FeastObjectType.FEATURE_SERVICE: - registry.apply_feature_service(fco_diff.new_fco, project, commit=False) - elif fco_diff.fco_type in [ + if feast_object_diff.feast_object_type == FeastObjectType.ENTITY: + registry.apply_entity( + feast_object_diff.new_feast_object, project, commit=False + ) + elif feast_object_diff.feast_object_type == FeastObjectType.FEATURE_SERVICE: + registry.apply_feature_service( + feast_object_diff.new_feast_object, project, commit=False + ) + elif feast_object_diff.feast_object_type in [ FeastObjectType.FEATURE_VIEW, FeastObjectType.ON_DEMAND_FEATURE_VIEW, FeastObjectType.REQUEST_FEATURE_VIEW, ]: - registry.apply_feature_view(fco_diff.new_fco, project, commit=False) + registry.apply_feature_view( + feast_object_diff.new_feast_object, project, commit=False + ) if commit: registry.commit() diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 01b1dc0f0c..6b1dadde5c 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -39,9 +39,8 @@ from feast import feature_server, flags, flags_helper, utils from feast.base_feature_view import BaseFeatureView -from feast.diff.FcoDiff import RegistryDiff, apply_diff_to_registry, diff_between from feast.diff.infra_diff import InfraDiff, diff_infra_protos -from feast.diff.property_diff import TransitionType +from feast.diff.registry_diff import RegistryDiff, apply_diff_to_registry, diff_between from feast.entity import Entity from feast.errors import ( EntityNotFoundException, @@ -75,7 +74,7 @@ ) from feast.protos.feast.types.EntityKey_pb2 import EntityKey as EntityKeyProto from feast.protos.feast.types.Value_pb2 import RepeatedValue, Value -from feast.registry import FeastObjectType, Registry +from feast.registry import Registry from feast.repo_config import RepoConfig, load_repo_config from feast.repo_contents import RepoContents from feast.request_feature_view import RequestFeatureView @@ -126,6 +125,7 @@ def __init__( registry_config = self.config.get_registry_config() self._registry = Registry(registry_config, repo_path=self.repo_path) + self._registry._initialize_registry() self._provider = get_provider(self.config, self.repo_path) @log_exceptions @@ -429,8 +429,10 @@ def _make_inferences( [view.batch_source for view in views_to_update], self.config ) + # New feature views may reference previously applied entities. 
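A sketch of the situation this comment refers to, with hypothetical repo paths and names: the entity is applied in one call, and the view that references it only by name arrives in a later call.

```python
from datetime import timedelta
from feast import Entity, FeatureStore, FeatureView, FileSource, ValueType

store = FeatureStore(repo_path="feature_repo")  # assumed existing repo
driver = Entity(name="driver_id", value_type=ValueType.INT64)
source = FileSource(path="data/driver_stats.parquet", event_timestamp_column="event_timestamp")
stats_view = FeatureView(
    name="driver_hourly_stats", entities=["driver_id"], batch_source=source, ttl=timedelta(days=1)
)

store.apply([driver])      # the entity is registered here ...
store.apply([stats_view])  # ... so this later call has to pull it back from the registry
                           # to infer the view's features from the entity's join key
```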
+ entities = self._list_entities() update_feature_views_with_inferred_features( - views_to_update, entities_to_update, self.config + views_to_update, entities + entities_to_update, self.config ) for odfv in odfvs_to_update: @@ -476,10 +478,26 @@ def _plan( ... ) >>> registry_diff, infra_diff, new_infra = fs._plan(RepoContents({driver_hourly_stats_view}, set(), set(), {driver}, set())) # register entity and feature view """ + # Validate and run inference on all the objects to be registered. + self._validate_all_feature_views( + list(desired_repo_contents.feature_views), + list(desired_repo_contents.on_demand_feature_views), + list(desired_repo_contents.request_feature_views), + ) + self._make_inferences( + list(desired_repo_contents.entities), + list(desired_repo_contents.feature_views), + list(desired_repo_contents.on_demand_feature_views), + ) + + # Compute the desired difference between the current objects in the registry and + # the desired repo state. registry_diff = diff_between( self._registry, self.project, desired_repo_contents ) + # Compute the desired difference between the current infra, as stored in the registry, + # and the desired infra. self._registry.refresh() current_infra_proto = ( self._registry.cached_registry_proto.infra.__deepcopy__() @@ -504,43 +522,6 @@ def _apply_diffs( infra_diff: The diff between the current infra and the desired infra. new_infra: The desired infra. """ - entities_to_update = [ - fco_diff.new_fco - for fco_diff in registry_diff.fco_diffs - if fco_diff.fco_type == FeastObjectType.ENTITY - and fco_diff.transition_type - in [TransitionType.CREATE, TransitionType.UPDATE] - ] - views_to_update = [ - fco_diff.new_fco - for fco_diff in registry_diff.fco_diffs - if fco_diff.fco_type == FeastObjectType.FEATURE_VIEW - and fco_diff.transition_type - in [TransitionType.CREATE, TransitionType.UPDATE] - ] - odfvs_to_update = [ - fco_diff.new_fco - for fco_diff in registry_diff.fco_diffs - if fco_diff.fco_type == FeastObjectType.ON_DEMAND_FEATURE_VIEW - and fco_diff.transition_type - in [TransitionType.CREATE, TransitionType.UPDATE] - ] - request_views_to_update = [ - fco_diff.new_fco - for fco_diff in registry_diff.fco_diffs - if fco_diff.fco_type == FeastObjectType.REQUEST_FEATURE_VIEW - and fco_diff.transition_type - in [TransitionType.CREATE, TransitionType.UPDATE] - ] - - # TODO(felixwang9817): move validation logic into _plan. - # Validate all feature views and make inferences. - self._validate_all_feature_views( - views_to_update, odfvs_to_update, request_views_to_update - ) - self._make_inferences(entities_to_update, views_to_update, odfvs_to_update) - - # Apply infra and registry changes. infra_diff.update() apply_diff_to_registry( self._registry, registry_diff, self.project, commit=False diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 39a77264bc..642a3c6442 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -13,7 +13,12 @@ def update_entities_with_inferred_types_from_feature_views( entities: List[Entity], feature_views: List[FeatureView], config: RepoConfig ) -> None: """ - Infer entity value type by examining schema of feature view batch sources + Infers the types of the entities by examining the schemas of feature view batch sources. + + Args: + entities: The entities to be updated. + feature_views: A list containing feature views associated with the entities. + config: The config for the current feature store. 
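As a concrete illustration of this inference step (hypothetical schema): an entity declared without a value type picks up the type of its join key column in the referencing view's batch source.

```python
# The "driver" entity has no explicit value_type; its join key "driver_id" appears as an
# INT64 column in the batch source of a view that uses it, so INT64 is inferred.
source_schema = {"driver_id": "INT64", "conv_rate": "DOUBLE", "event_timestamp": "TIMESTAMP"}
join_key = "driver_id"

inferred_entity_type = source_schema[join_key]  # "INT64", i.e. ValueType.INT64
```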
""" incomplete_entities = { entity.name: entity @@ -127,6 +132,11 @@ def update_feature_views_with_inferred_features( Infers the set of features associated to each FeatureView and updates the FeatureView with those features. Inference occurs through considering each column of the underlying data source as a feature except columns that are associated with the data source's timestamp columns and the FeatureView's entity columns. + + Args: + fvs: The feature views to be updated. + entities: A list containing entities associated with the feature views. + config: The config for the current feature store. """ entity_name_to_join_key_map = {entity.name: entity.join_key for entity in entities} diff --git a/sdk/python/feast/repo_operations.py b/sdk/python/feast/repo_operations.py index f34346871d..8a3a202c6d 100644 --- a/sdk/python/feast/repo_operations.py +++ b/sdk/python/feast/repo_operations.py @@ -12,7 +12,7 @@ import click from click.exceptions import BadParameter -from feast.diff.FcoDiff import extract_objects_for_keep_delete_update_add +from feast.diff.registry_diff import extract_objects_for_keep_delete_update_add from feast.entity import Entity from feast.feature_service import FeatureService from feast.feature_store import FeatureStore @@ -147,7 +147,6 @@ def _prepare_registry_and_repo(repo_config, repo_path): ) sys.exit(1) registry = store.registry - registry._initialize_registry() sys.dont_write_bytecode = True repo = parse_repo(repo_path) return project, registry, repo, store diff --git a/sdk/python/tests/integration/e2e/test_usage_e2e.py b/sdk/python/tests/integration/e2e/test_usage_e2e.py index f55fbce55c..0bae973063 100644 --- a/sdk/python/tests/integration/e2e/test_usage_e2e.py +++ b/sdk/python/tests/integration/e2e/test_usage_e2e.py @@ -66,10 +66,16 @@ def test_usage_on(dummy_exporter, enabling_toggle): test_feature_store.apply([entity]) - assert len(dummy_exporter) == 1 + assert len(dummy_exporter) == 3 assert { - "entrypoint": "feast.feature_store.FeatureStore.apply" + "entrypoint": "feast.infra.local.LocalRegistryStore.get_registry_proto" }.items() <= dummy_exporter[0].items() + assert { + "entrypoint": "feast.infra.local.LocalRegistryStore.update_registry_proto" + }.items() <= dummy_exporter[1].items() + assert { + "entrypoint": "feast.feature_store.FeatureStore.apply" + }.items() <= dummy_exporter[2].items() @pytest.mark.integration diff --git a/sdk/python/tests/unit/diff/test_fco_diff.py b/sdk/python/tests/unit/diff/test_registry_diff.py similarity index 53% rename from sdk/python/tests/unit/diff/test_fco_diff.py rename to sdk/python/tests/unit/diff/test_registry_diff.py index fa3c84d035..0322ab47ab 100644 --- a/sdk/python/tests/unit/diff/test_fco_diff.py +++ b/sdk/python/tests/unit/diff/test_registry_diff.py @@ -1,54 +1,11 @@ -from feast.diff.FcoDiff import ( +from feast.diff.registry_diff import ( diff_registry_objects, tag_objects_for_keep_delete_update_add, - tag_proto_objects_for_keep_delete_add, ) from feast.feature_view import FeatureView from tests.utils.data_source_utils import prep_file_source -def test_tag_proto_objects_for_keep_delete_add(simple_dataset_1): - with prep_file_source( - df=simple_dataset_1, event_timestamp_column="ts_1" - ) as file_source: - to_delete = FeatureView( - name="to_delete", entities=["id"], batch_source=file_source, ttl=None, - ).to_proto() - unchanged_fv = FeatureView( - name="fv1", entities=["id"], batch_source=file_source, ttl=None, - ).to_proto() - pre_changed = FeatureView( - name="fv2", - entities=["id"], - 
batch_source=file_source, - ttl=None, - tags={"when": "before"}, - ).to_proto() - post_changed = FeatureView( - name="fv2", - entities=["id"], - batch_source=file_source, - ttl=None, - tags={"when": "after"}, - ).to_proto() - to_add = FeatureView( - name="to_add", entities=["id"], batch_source=file_source, ttl=None, - ).to_proto() - - keep, delete, add = tag_proto_objects_for_keep_delete_add( - [unchanged_fv, pre_changed, to_delete], [unchanged_fv, post_changed, to_add] - ) - - assert len(list(keep)) == 2 - assert unchanged_fv in keep - assert post_changed in keep - assert pre_changed not in keep - assert len(list(delete)) == 1 - assert to_delete in delete - assert len(list(add)) == 1 - assert to_add in add - - def test_tag_objects_for_keep_delete_update_add(simple_dataset_1): with prep_file_source( df=simple_dataset_1, event_timestamp_column="ts_1" @@ -114,12 +71,20 @@ def test_diff_registry_objects_feature_views(simple_dataset_1): tags={"when": "after"}, ) - fco_diffs = diff_registry_objects(pre_changed, pre_changed, "feature view") - assert len(fco_diffs.fco_property_diffs) == 0 + feast_object_diffs = diff_registry_objects( + pre_changed, pre_changed, "feature view" + ) + assert len(feast_object_diffs.feast_object_property_diffs) == 0 - fco_diffs = diff_registry_objects(pre_changed, post_changed, "feature view") - assert len(fco_diffs.fco_property_diffs) == 1 + feast_object_diffs = diff_registry_objects( + pre_changed, post_changed, "feature view" + ) + assert len(feast_object_diffs.feast_object_property_diffs) == 1 - assert fco_diffs.fco_property_diffs[0].property_name == "tags" - assert fco_diffs.fco_property_diffs[0].val_existing == {"when": "before"} - assert fco_diffs.fco_property_diffs[0].val_declared == {"when": "after"} + assert feast_object_diffs.feast_object_property_diffs[0].property_name == "tags" + assert feast_object_diffs.feast_object_property_diffs[0].val_existing == { + "when": "before" + } + assert feast_object_diffs.feast_object_property_diffs[0].val_declared == { + "when": "after" + } From bb17968ef1d2d0a8114067e72afd3ee20a5a4439 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 17:40:22 -0500 Subject: [PATCH 08/19] Squash commits Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- README.md | 8 +- docs/README.md | 2 +- docs/SUMMARY.md | 5 +- .../third-party-integrations.md | 4 +- .../README.md | 0 .../build-a-training-dataset.md | 0 .../create-a-feature-repository.md | 18 +- .../deploy-a-feature-store.md | 0 .../install-feast.md | 6 + .../load-data-into-the-online-store.md | 0 .../read-features-from-the-online-store.md | 0 docs/reference/data-sources/README.md | 3 +- docs/reference/data-sources/snowflake.md | 44 ++ docs/reference/offline-stores/README.md | 3 +- docs/reference/offline-stores/snowflake.md | 30 + docs/reference/offline-stores/untitled.md | 26 - docs/reference/online-stores/README.md | 1 - docs/reference/providers/README.md | 1 - docs/roadmap.md | 2 + docs/specs/offline_store_format.md | 22 +- .../tutorials/driver-stats-using-snowflake.md | 143 ++++ docs/tutorials/tutorials-overview.md | 1 + protos/feast/core/DataSource.proto | 24 +- protos/feast/core/SavedDataset.proto | 1 + sdk/python/feast/__init__.py | 2 + sdk/python/feast/cli.py | 2 +- sdk/python/feast/data_source.py | 7 + sdk/python/feast/errors.py | 10 + sdk/python/feast/inference.py | 17 +- .../feast/infra/offline_stores/snowflake.py | 632 ++++++++++++++++++ .../infra/offline_stores/snowflake_source.py | 315 +++++++++ .../feast/infra/utils/snowflake_utils.py | 282 
++++++++ sdk/python/feast/repo_config.py | 2 + .../feast/templates/snowflake/bootstrap.py | 91 +++ .../feast/templates/snowflake/driver_repo.py | 64 ++ .../templates/snowflake/feature_store.yaml | 11 + sdk/python/feast/templates/snowflake/test.py | 65 ++ sdk/python/feast/type_map.py | 24 + .../requirements/py3.7-ci-requirements.txt | 27 +- .../requirements/py3.8-ci-requirements.txt | 27 +- .../requirements/py3.9-ci-requirements.txt | 27 +- sdk/python/setup.py | 6 + .../feature_repos/repo_configuration.py | 9 + .../universal/data_sources/snowflake.py | 82 +++ .../test_universal_historical_retrieval.py | 2 +- 45 files changed, 1994 insertions(+), 54 deletions(-) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/README.md (100%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/build-a-training-dataset.md (100%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/create-a-feature-repository.md (84%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/deploy-a-feature-store.md (100%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/install-feast.md (80%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/load-data-into-the-online-store.md (100%) rename docs/how-to-guides/{feast-gcp-aws => feast-snowflake-gcp-aws}/read-features-from-the-online-store.md (100%) create mode 100644 docs/reference/data-sources/snowflake.md create mode 100644 docs/reference/offline-stores/snowflake.md delete mode 100644 docs/reference/offline-stores/untitled.md create mode 100644 docs/tutorials/driver-stats-using-snowflake.md create mode 100644 sdk/python/feast/infra/offline_stores/snowflake.py create mode 100644 sdk/python/feast/infra/offline_stores/snowflake_source.py create mode 100644 sdk/python/feast/infra/utils/snowflake_utils.py create mode 100644 sdk/python/feast/templates/snowflake/bootstrap.py create mode 100644 sdk/python/feast/templates/snowflake/driver_repo.py create mode 100644 sdk/python/feast/templates/snowflake/feature_store.yaml create mode 100644 sdk/python/feast/templates/snowflake/test.py create mode 100644 sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py diff --git a/README.md b/README.md index 649bb909fa..7ede0c612a 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ The list below contains the functionality that contributors are planning to deve * Want to speak to a Feast contributor? We are more than happy to jump on a call. Please schedule a time using [Calendly](https://calendly.com/d/x2ry-g5bb/meet-with-feast-team). 
* **Data Sources** + * [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) @@ -143,9 +144,9 @@ The list below contains the functionality that contributors are planning to deve * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (community plugin)](https://github.com/nossrannug/feast-postgres) * [x] Kafka source (with [push support into the online store](reference/alpha-stream-ingestion.md)) - * [x] [Snowflake source (community plugin)](https://github.com/sfc-gh-madkins/feast-snowflake) * [ ] HTTP source * **Offline Stores** + * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) @@ -153,7 +154,6 @@ The list below contains the functionality that contributors are planning to deve * [x] [Postgres (community plugin)](https://github.com/nossrannug/feast-postgres) * [x] [In-memory / Pandas](https://docs.feast.dev/reference/offline-stores/file) * [x] [Custom offline store support](https://docs.feast.dev/how-to-guides/adding-a-new-offline-store) - * [x] [Snowflake (community plugin)](https://github.com/sfc-gh-madkins/feast-snowflake) * [x] [Trino (communiuty plugin)](https://github.com/Shopify/feast-trino) * **Online Stores** * [x] [DynamoDB](https://docs.feast.dev/reference/online-stores/dynamodb) @@ -208,7 +208,7 @@ The list below contains the functionality that contributors are planning to deve Please refer to the official documentation at [Documentation](https://docs.feast.dev/) * [Quickstart](https://docs.feast.dev/getting-started/quickstart) * [Tutorials](https://docs.feast.dev/tutorials/tutorials-overview) - * [Running Feast with GCP/AWS](https://docs.feast.dev/how-to-guides/feast-gcp-aws) + * [Running Feast with Snowflake/GCP/AWS](https://docs.feast.dev/how-to-guides/feast-snowflake-gcp-aws) * [Change Log](https://github.com/feast-dev/feast/blob/master/CHANGELOG.md) * [Slack (#Feast)](https://slack.feast.dev/) @@ -224,4 +224,4 @@ Thanks goes to these incredible people: - \ No newline at end of file + diff --git a/docs/README.md b/docs/README.md index 1a76adbde3..d5c5177a18 100644 --- a/docs/README.md +++ b/docs/README.md @@ -52,6 +52,6 @@ Explore the following resources to get started with Feast: * [Concepts](getting-started/concepts/) describes all important Feast API concepts * [Architecture](getting-started/architecture-and-components/) describes Feast's overall architecture. * [Tutorials](tutorials/tutorials-overview.md) shows full examples of using Feast in machine learning applications. -* [Running Feast with GCP/AWS](how-to-guides/feast-gcp-aws/) provides a more in-depth guide to using Feast. +* [Running Feast with Snowflake/GCP/AWS](how-to-guides/feast-snowflake-gcp-aws/) provides a more in-depth guide to using Feast. * [Reference](reference/feast-cli-commands.md) contains detailed API and design documents. * [Contributing](project/contributing.md) contains resources for anyone who wants to contribute to Feast. 
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index ae23cd5d40..e1343ec485 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -33,10 +33,11 @@ * [Driver ranking](tutorials/driver-ranking-with-feast.md) * [Fraud detection on GCP](tutorials/fraud-detection.md) * [Real-time credit scoring on AWS](tutorials/real-time-credit-scoring-on-aws.md) +* [Driver Stats using Snowflake](tutorials/driver-stats-using-snowflake.md) ## How-to Guides -* [Running Feast with GCP/AWS](how-to-guides/feast-gcp-aws/README.md) +* [Running Feast with Snowflake/GCP/AWS](how-to-guides/feast-snowflake-gcp-aws/README.md) * [Install Feast](how-to-guides/feast-gcp-aws/install-feast.md) * [Create a feature repository](how-to-guides/feast-gcp-aws/create-a-feature-repository.md) * [Deploy a feature store](how-to-guides/feast-gcp-aws/deploy-a-feature-store.md) @@ -54,10 +55,12 @@ * [Data sources](reference/data-sources/README.md) * [File](reference/data-sources/file.md) + * [Snowflake](reference/data-sources/snowflake.md) * [BigQuery](reference/data-sources/bigquery.md) * [Redshift](reference/data-sources/redshift.md) * [Offline stores](reference/offline-stores/README.md) * [File](reference/offline-stores/file.md) + * [Snowflake](reference/offline-stores/snowflake.md) * [BigQuery](reference/offline-stores/bigquery.md) * [Redshift](reference/offline-stores/redshift.md) * [Online stores](reference/online-stores/README.md) diff --git a/docs/getting-started/third-party-integrations.md b/docs/getting-started/third-party-integrations.md index 31b6acdc88..a3a41bb836 100644 --- a/docs/getting-started/third-party-integrations.md +++ b/docs/getting-started/third-party-integrations.md @@ -13,6 +13,7 @@ Don't see your offline store or online store of choice here? Check out our guide ### **Data Sources** +* [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) @@ -20,11 +21,11 @@ Don't see your offline store or online store of choice here? Check out our guide * [x] [Hive (community plugin)](https://github.com/baineng/feast-hive) * [x] [Postgres (community plugin)](https://github.com/nossrannug/feast-postgres) * [x] Kafka source (with [push support into the online store](https://docs.feast.dev/reference/alpha-stream-ingestion)) -* [x] [Snowflake source (community plugin)](https://github.com/sfc-gh-madkins/feast-snowflake) * [ ] HTTP source ### Offline Stores +* [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) @@ -32,7 +33,6 @@ Don't see your offline store or online store of choice here? 
Check out our guide * [x] [Postgres (community plugin)](https://github.com/nossrannug/feast-postgres) * [x] [In-memory / Pandas](https://docs.feast.dev/reference/offline-stores/file) * [x] [Custom offline store support](https://docs.feast.dev/how-to-guides/adding-a-new-offline-store) -* [x] [Snowflake source (community plugin)](https://github.com/sfc-gh-madkins/feast-snowflake) * [x] [Trino (communiuty plugin)](https://github.com/Shopify/feast-trino) ### Online Stores diff --git a/docs/how-to-guides/feast-gcp-aws/README.md b/docs/how-to-guides/feast-snowflake-gcp-aws/README.md similarity index 100% rename from docs/how-to-guides/feast-gcp-aws/README.md rename to docs/how-to-guides/feast-snowflake-gcp-aws/README.md diff --git a/docs/how-to-guides/feast-gcp-aws/build-a-training-dataset.md b/docs/how-to-guides/feast-snowflake-gcp-aws/build-a-training-dataset.md similarity index 100% rename from docs/how-to-guides/feast-gcp-aws/build-a-training-dataset.md rename to docs/how-to-guides/feast-snowflake-gcp-aws/build-a-training-dataset.md diff --git a/docs/how-to-guides/feast-gcp-aws/create-a-feature-repository.md b/docs/how-to-guides/feast-snowflake-gcp-aws/create-a-feature-repository.md similarity index 84% rename from docs/how-to-guides/feast-gcp-aws/create-a-feature-repository.md rename to docs/how-to-guides/feast-snowflake-gcp-aws/create-a-feature-repository.md index 1add0a92e8..8754bc051a 100644 --- a/docs/how-to-guides/feast-gcp-aws/create-a-feature-repository.md +++ b/docs/how-to-guides/feast-snowflake-gcp-aws/create-a-feature-repository.md @@ -13,6 +13,21 @@ Creating a new Feast repository in /<...>/tiny_pika. ``` {% endtab %} +{% tabs %} +{% tab title="Snowflake template" %} +```bash +feast init -t snowflake +Snowflake Deployment URL: ... +Snowflake User Name: ... +Snowflake Password: ... +Snowflake Role Name: ... +Snowflake Warehouse Name: ... +Snowflake Database Name: ... + +Creating a new Feast repository in /<...>/tiny_pika. +``` +{% endtab %} + {% tab title="GCP template" %} ```text feast init -t gcp @@ -30,7 +45,7 @@ Redshift Database Name: ... Redshift User Name: ... Redshift S3 Staging Location (s3://*): ... Redshift IAM Role for S3 (arn:aws:iam::*:role/*): ... -Should I upload example data to Redshift (overwriting 'feast_driver_hourly_stats' table)? (Y/n): +Should I upload example data to Redshift (overwriting 'feast_driver_hourly_stats' table)? (Y/n): Creating a new Feast repository in /<...>/tiny_pika. ``` @@ -63,4 +78,3 @@ You can now use this feature repository for development. You can try the followi * Run `feast apply` to apply these definitions to Feast. * Edit the example feature definitions in `example.py` and run `feast apply` again to change feature definitions. * Initialize a git repository in the same directory and checking the feature repository into version control. 
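As a quick sanity check after running `feast apply` in the newly created repository, a short sketch like the one below can confirm what was registered. This is not part of the patch; it is a minimal illustration assuming the generated example definitions were applied unchanged and that it is run from the repository directory.

```python
from feast import FeatureStore

# Load the repository created by `feast init` (run from inside the repo
# directory, after `feast apply` has populated the registry).
store = FeatureStore(repo_path=".")

# Print each registered feature view and its feature names.
for fv in store.list_feature_views():
    print(fv.name, [feature.name for feature in fv.features])
```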
-
diff --git a/docs/how-to-guides/feast-gcp-aws/deploy-a-feature-store.md b/docs/how-to-guides/feast-snowflake-gcp-aws/deploy-a-feature-store.md
similarity index 100%
rename from docs/how-to-guides/feast-gcp-aws/deploy-a-feature-store.md
rename to docs/how-to-guides/feast-snowflake-gcp-aws/deploy-a-feature-store.md
diff --git a/docs/how-to-guides/feast-gcp-aws/install-feast.md b/docs/how-to-guides/feast-snowflake-gcp-aws/install-feast.md
similarity index 80%
rename from docs/how-to-guides/feast-gcp-aws/install-feast.md
rename to docs/how-to-guides/feast-snowflake-gcp-aws/install-feast.md
index 019231be09..26d95c6117 100644
--- a/docs/how-to-guides/feast-gcp-aws/install-feast.md
+++ b/docs/how-to-guides/feast-snowflake-gcp-aws/install-feast.md
@@ -6,6 +6,12 @@ Install Feast using [pip](https://pip.pypa.io):
 pip install feast
 ```
 
+Install Feast with Snowflake dependencies (required when using Snowflake):
+
+```
+pip install 'feast[snowflake]'
+```
+
 Install Feast with GCP dependencies (required when using BigQuery or Firestore):
 
 ```
diff --git a/docs/how-to-guides/feast-gcp-aws/load-data-into-the-online-store.md b/docs/how-to-guides/feast-snowflake-gcp-aws/load-data-into-the-online-store.md
similarity index 100%
rename from docs/how-to-guides/feast-gcp-aws/load-data-into-the-online-store.md
rename to docs/how-to-guides/feast-snowflake-gcp-aws/load-data-into-the-online-store.md
diff --git a/docs/how-to-guides/feast-gcp-aws/read-features-from-the-online-store.md b/docs/how-to-guides/feast-snowflake-gcp-aws/read-features-from-the-online-store.md
similarity index 100%
rename from docs/how-to-guides/feast-gcp-aws/read-features-from-the-online-store.md
rename to docs/how-to-guides/feast-snowflake-gcp-aws/read-features-from-the-online-store.md
diff --git a/docs/reference/data-sources/README.md b/docs/reference/data-sources/README.md
index 6732fc16a0..fc6e136a9c 100644
--- a/docs/reference/data-sources/README.md
+++ b/docs/reference/data-sources/README.md
@@ -4,7 +4,8 @@ Please see [Data Source](../../getting-started/concepts/feature-view.md#data-sou
 
 {% page-ref page="file.md" %}
 
+{% page-ref page="snowflake.md" %}
+
 {% page-ref page="bigquery.md" %}
 
 {% page-ref page="redshift.md" %}
-
diff --git a/docs/reference/data-sources/snowflake.md b/docs/reference/data-sources/snowflake.md
new file mode 100644
index 0000000000..0f5304b6cd
--- /dev/null
+++ b/docs/reference/data-sources/snowflake.md
@@ -0,0 +1,44 @@
+# Snowflake
+
+## Description
+
+Snowflake data sources allow for the retrieval of historical feature values from Snowflake for building training datasets as well as materializing features into an online store.
+
+* Either a table reference or a SQL query can be provided.
+
+## Examples
+
+Using a table reference
+
+```python
+from feast import SnowflakeSource
+
+my_snowflake_source = SnowflakeSource(
+    database="FEAST",
+    schema="PUBLIC",
+    table="FEATURE_TABLE",
+)
+```
+
+Using a query
+
+```python
+from feast import SnowflakeSource
+
+my_snowflake_source = SnowflakeSource(
+    query="""
+    SELECT
+        timestamp_column AS "ts",
+        "created",
+        "f1",
+        "f2"
+    FROM
+        "FEAST"."PUBLIC"."FEATURE_TABLE"
+    """,
+)
+```
+
+One thing to remember is how Snowflake handles table and column name conventions: unquoted identifiers resolve to uppercase, while quoted identifiers are case sensitive.
+You can read more about quoted identifiers [here](https://docs.snowflake.com/en/sql-reference/identifiers-syntax.html).
+
+Configuration options are available [here](https://rtd.feast.dev/en/latest/index.html#feast.data_source.SnowflakeSource).
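To show how such a source is typically consumed, here is a minimal sketch that attaches the `SnowflakeSource` from the example above to a feature view using the `FeatureView`/`Feature` API seen elsewhere in this change. The entity name, feature names, and TTL are illustrative assumptions, not part of the patch.

```python
from datetime import timedelta

from feast import Feature, FeatureView, ValueType

# Hypothetical feature view over the SnowflakeSource defined above;
# "entity_id", "f1", and "f2" are placeholder names.
feature_table_fv = FeatureView(
    name="feature_table",
    entities=["entity_id"],
    ttl=timedelta(days=1),
    features=[
        Feature(name="f1", dtype=ValueType.FLOAT),
        Feature(name="f2", dtype=ValueType.FLOAT),
    ],
    batch_source=my_snowflake_source,
)
```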
diff --git a/docs/reference/offline-stores/README.md b/docs/reference/offline-stores/README.md index 1260fe8b29..141a34d03b 100644 --- a/docs/reference/offline-stores/README.md +++ b/docs/reference/offline-stores/README.md @@ -4,7 +4,8 @@ Please see [Offline Store](../../getting-started/architecture-and-components/off {% page-ref page="file.md" %} +{% page-ref page="snowflake.md" %} + {% page-ref page="bigquery.md" %} {% page-ref page="redshift.md" %} - diff --git a/docs/reference/offline-stores/snowflake.md b/docs/reference/offline-stores/snowflake.md new file mode 100644 index 0000000000..fcf9a7a6fd --- /dev/null +++ b/docs/reference/offline-stores/snowflake.md @@ -0,0 +1,30 @@ +# Snowflake + +## Description + +The Snowflake offline store provides support for reading [SnowflakeSources](../data-sources/snowflake.md). + +* Snowflake tables and views are allowed as sources. +* All joins happen within Snowflake. +* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be uploaded to Snowflake in order to complete join operations. +* A [SnowflakeRetrievalJob](https://github.com/feast-dev/feast/blob/bf557bcb72c7878a16dccb48443bbbe9dc3efa49/sdk/python/feast/infra/offline_stores/snowflake.py#L185) is returned when calling `get_historical_features()`. + +## Example + +{% code title="feature_store.yaml" %} +```yaml +project: my_feature_repo +registry: data/registry.db +provider: local +offline_store: + type: snowflake.offline + account: snowflake_deployment.us-east-1 + user: user_login + password: user_password + role: sysadmin + warehouse: demo_wh + database: FEAST +``` +{% endcode %} + +Configuration options are available [here](https://github.com/feast-dev/feast/blob/bf557bcb72c7878a16dccb48443bbbe9dc3efa49/sdk/python/feast/infra/offline_stores/snowflake.py#L39). diff --git a/docs/reference/offline-stores/untitled.md b/docs/reference/offline-stores/untitled.md deleted file mode 100644 index 8ffa566a70..0000000000 --- a/docs/reference/offline-stores/untitled.md +++ /dev/null @@ -1,26 +0,0 @@ -# BigQuery - -### Description - -The BigQuery offline store provides support for reading [BigQuerySources](../data-sources/bigquery.md). - -* BigQuery tables and views are allowed as sources. -* All joins happen within BigQuery. -* Entity dataframes can be provided as a SQL query or can be provided as a Pandas dataframe. Pandas dataframes will be uploaded to BigQuery in order to complete join operations. -* A [BigQueryRetrievalJob](https://github.com/feast-dev/feast/blob/c50a36ec1ad5b8d81c6f773c23204db7c7a7d218/sdk/python/feast/infra/offline_stores/bigquery.py#L210) is returned when calling `get_historical_features()`. - -### Example - -{% code title="feature\_store.yaml" %} -```yaml -project: my_feature_repo -registry: gs://my-bucket/data/registry.db -provider: gcp -offline_store: - type: bigquery - dataset: feast_bq_dataset -``` -{% endcode %} - -Configuration options are available [here](https://rtd.feast.dev/en/latest/#feast.repo_config.BigQueryOfflineStoreConfig). 
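Since the Snowflake offline store documented above accepts either a Pandas dataframe or a SQL query as the entity dataframe, the SQL-string path might look like the following sketch. The entity table, column names, and feature reference are assumptions for illustration only; the offline store creates a temporary table from the query and performs the point-in-time join inside Snowflake.

```python
from feast import FeatureStore

store = FeatureStore(repo_path=".")

# Hypothetical entity SQL; replace the table and columns with your own.
# The query must return the entity join keys and an event timestamp column.
entity_sql = """
    SELECT "driver_id", "event_timestamp"
    FROM "FEAST"."PUBLIC"."ENTITY_TABLE"
"""

training_df = store.get_historical_features(
    entity_df=entity_sql,
    features=["driver_hourly_stats:conv_rate"],
).to_df()
```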
- diff --git a/docs/reference/online-stores/README.md b/docs/reference/online-stores/README.md index aadcc0eb65..2c2902bc57 100644 --- a/docs/reference/online-stores/README.md +++ b/docs/reference/online-stores/README.md @@ -9,4 +9,3 @@ Please see [Online Store](../../getting-started/architecture-and-components/onli {% page-ref page="datastore.md" %} {% page-ref page="dynamodb.md" %} - diff --git a/docs/reference/providers/README.md b/docs/reference/providers/README.md index 7eb992d5ac..dc52d92726 100644 --- a/docs/reference/providers/README.md +++ b/docs/reference/providers/README.md @@ -7,4 +7,3 @@ Please see [Provider](../../getting-started/architecture-and-components/provider {% page-ref page="google-cloud-platform.md" %} {% page-ref page="amazon-web-services.md" %} - diff --git a/docs/roadmap.md b/docs/roadmap.md index 723bfba82a..42da01fcba 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -8,6 +8,7 @@ The list below contains the functionality that contributors are planning to deve * Want to speak to a Feast contributor? We are more than happy to jump on a call. Please schedule a time using [Calendly](https://calendly.com/d/x2ry-g5bb/meet-with-feast-team). * **Data Sources** + * [x] [Snowflake source](https://docs.feast.dev/reference/data-sources/snowflake) * [x] [Redshift source](https://docs.feast.dev/reference/data-sources/redshift) * [x] [BigQuery source](https://docs.feast.dev/reference/data-sources/bigquery) * [x] [Parquet file source](https://docs.feast.dev/reference/data-sources/file) @@ -18,6 +19,7 @@ The list below contains the functionality that contributors are planning to deve * [x] [Snowflake source (community plugin)](https://github.com/sfc-gh-madkins/feast-snowflake) * [ ] HTTP source * **Offline Stores** + * [x] [Snowflake](https://docs.feast.dev/reference/offline-stores/snowflake) * [x] [Redshift](https://docs.feast.dev/reference/offline-stores/redshift) * [x] [BigQuery](https://docs.feast.dev/reference/offline-stores/bigquery) * [x] [Synapse (community plugin)](https://github.com/Azure/feast-azure) diff --git a/docs/specs/offline_store_format.md b/docs/specs/offline_store_format.md index 6826c50190..ac829dd52f 100644 --- a/docs/specs/offline_store_format.md +++ b/docs/specs/offline_store_format.md @@ -7,8 +7,8 @@ One of the design goals of Feast is being able to plug seamlessly into existing Feast provides first class support for the following data warehouses (DWH) to store feature data offline out of the box: * [BigQuery](https://cloud.google.com/bigquery) -* [Snowflake](https://www.snowflake.com/) (Coming Soon) -* [Redshift](https://aws.amazon.com/redshift/) (Coming Soon) +* [Snowflake](https://www.snowflake.com/) +* [Redshift](https://aws.amazon.com/redshift/) The integration between Feast and the DWH is highly configurable, but at the same time there are some non-configurable implications and assumptions that Feast imposes on table schemas and mapping between database-native types and Feast type system. This is what this document is about. @@ -28,14 +28,14 @@ Feature data is stored in tables in the DWH. 
There is one DWH table per Feast Fe ## Type mappings #### Pandas types -Here's how Feast types map to Pandas types for Feast APIs that take in or return a Pandas dataframe: +Here's how Feast types map to Pandas types for Feast APIs that take in or return a Pandas dataframe: | Feast Type | Pandas Type | |-------------|--| | Event Timestamp | `datetime64[ns]` | | BYTES | `bytes` | | STRING | `str` , `category`| -| INT32 | `int32`, `uint32` | +| INT32 | `int16`, `uint16`, `int32`, `uint32` | | INT64 | `int64`, `uint64` | | UNIX_TIMESTAMP | `datetime64[ns]`, `datetime64[ns, tz]` | | DOUBLE | `float64` | @@ -80,3 +80,17 @@ Here's how Feast types map to BigQuery types when using BigQuery for offline sto | BOOL\_LIST | `ARRAY`| Values that are not specified by the table above will cause an error on conversion. + +#### Snowflake Types +Here's how Feast types map to Snowflake types when using Snowflake for offline storage +See source here: +https://docs.snowflake.com/en/user-guide/python-connector-pandas.html#snowflake-to-pandas-data-mapping + +| Feast Type | Snowflake Python Type | +|-------------|--| +| Event Timestamp | `DATETIME64[NS]` | +| UNIX_TIMESTAMP | `DATETIME64[NS]` | +| STRING | `STR` | +| INT32 | `INT8 / UINT8 / INT16 / UINT16 / INT32 / UINT32` | +| INT64 | `INT64 / UINT64` | +| DOUBLE | `FLOAT64` | diff --git a/docs/tutorials/driver-stats-using-snowflake.md b/docs/tutorials/driver-stats-using-snowflake.md new file mode 100644 index 0000000000..e95dd655ae --- /dev/null +++ b/docs/tutorials/driver-stats-using-snowflake.md @@ -0,0 +1,143 @@ +--- +description: >- + Initial demonstration of using Snowflake with Feast as both and Offline & Online store + using the snowflake demo template. +--- + +# Drivers Stats using Snowflake + +In the following steps below, we will setup a sample feast project that leverages Snowflake +as an Offline Store. + +Starting with data in a Snowflake table, we will register that table to the feature store and +define features associated with the columns in that table. From there, we will generate historical +training data based on those feature definitions. We then will materialize the latest feature values +given our feature definitions into our online feature store. Lastly, we will then call +for those latest feature values. + +Our template that you will leverage will generate new data related to driver statistics. +From there, we will show you code snippets that will call to the offline store for generating +training datasets, and then the code for calling the online store to serve you the +latest feature values to serve models in production. + +## Snowflake Offline/Online Store Example + +#### Install feast-snowflake + +```shell +pip install feast[snowflake] +``` + +#### Get a Snowflake Trial Account (Optional) + +[Snowflake Trial Account](trial.snowflake.com) + +#### Create a feature repository + +```shell +feast init -t snowflake {feature_repo_name} +Snowflake Deployment URL (exclude .snowflakecomputing.com): +Snowflake User Name:: +Snowflake Password:: +Snowflake Role Name (Case Sensitive):: +Snowflake Warehouse Name (Case Sensitive):: +Snowflake Database Name (Case Sensitive):: +Should I upload example data to Snowflake (overwrite table)? 
[Y/n]: Y +cd {feature_repo_name} +``` + +The following files will automatically be created in your project folder: + +* feature_store.yaml -- This is your main configuration file +* driver_repo.py -- This is your main feature definition file +* test.py -- This is a file to test your feature store configuration + +* registry.db -- (Inactive) This file contains the metadata related to your feature store operations +* data/ -- (Inactive) This folder contains the sample data that we will use + +#### Inspect `feature_store.yaml` + +Here you will see the information that you entered. This template will look to use +Snowflake as both an Offline & Online store. The main thing to remember is by default, +Snowflake Objects have ALL CAPS names unless lower case was specified. + +{% code title="feature_store.yaml" %} +```yaml +project: ... +registry: ... +provider: local +offline_store: + type: snowflake.offline + account: SNOWFLAKE_DEPLOYMENT_URL #drop .snowflakecomputing.com + user: USERNAME + password: PASSWORD + role: ROLE_NAME #case sensitive + warehouse: WAREHOUSE_NAME #case sensitive + database: DATABASE_NAME #case cap sensitive +``` +{% endcode %} + +#### Run our test python script `test.py` + +```shell +python test.py +``` + +## What we did in `test.py` + +#### Initialize our Feature Store +{% code title="test.py" %} +```python +from datetime import datetime, timedelta + +import pandas as pd +from driver_repo import driver, driver_stats_fv + +from feast import FeatureStore + +fs = FeatureStore(repo_path=".") + +fs.apply([driver, driver_stats_fv]) +``` +{% endcode %} + +#### Create a dummy training dataframe, then call our Offline store to add additional columns +{% code title="test.py" %} +```python +entity_df = pd.DataFrame( + { + "event_timestamp": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=datetime.now() - timedelta(days=3), + end=datetime.now(), + periods=3, + ) + ], + "driver_id": [1001, 1002, 1003], + } +) + +features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] + +training_df = fs.get_historical_features( + features=features, entity_df=entity_df +).to_df() +``` +{% endcode %} + +#### Materialize the latest feature values into our Online store +{% code title="test.py" %} +```python +fs.materialize_incremental(end_date=datetime.now()) +``` +{% endcode %} + +#### Retrieve the latest values from our Online store based on our Entity Key +{% code title="test.py" %} +```python +online_features = fs.get_online_features( + features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], +).to_dict() +``` +{% endcode %} diff --git a/docs/tutorials/tutorials-overview.md b/docs/tutorials/tutorials-overview.md index a523f9b38e..86a8c25371 100644 --- a/docs/tutorials/tutorials-overview.md +++ b/docs/tutorials/tutorials-overview.md @@ -8,3 +8,4 @@ These Feast tutorials showcase how to use Feast to simplify end to end model tra {% page-ref page="real-time-credit-scoring-on-aws.md" %} +{% page-ref page="driver-stats-using-snowflake.md" %} diff --git a/protos/feast/core/DataSource.proto b/protos/feast/core/DataSource.proto index ee5c6939d7..41bba6443f 100644 --- a/protos/feast/core/DataSource.proto +++ b/protos/feast/core/DataSource.proto @@ -32,19 +32,22 @@ message DataSource { reserved 6 to 10; // Type of Data Source. 
+ // Next available id: 9 enum SourceType { INVALID = 0; BATCH_FILE = 1; + BATCH_SNOWFLAKE = 8; BATCH_BIGQUERY = 2; + BATCH_REDSHIFT = 5; STREAM_KAFKA = 3; STREAM_KINESIS = 4; - BATCH_REDSHIFT = 5; CUSTOM_SOURCE = 6; REQUEST_SOURCE = 7; + } SourceType type = 1; - // Defines mapping between fields in the sourced data + // Defines mapping between fields in the sourced data // and fields in parent FeatureTable. map field_mapping = 2; @@ -128,6 +131,22 @@ message DataSource { string schema = 3; } + // Defines options for DataSource that sources features from a Snowflake Query + message SnowflakeOptions { + // Snowflake table name + string table = 1; + + // SQL query that returns a table containing feature data. Must contain an event_timestamp column, and respective + // entity columns + string query = 2; + + // Snowflake schema name + string schema = 3; + + // Snowflake schema name + string database = 4; + } + // Defines configuration for custom third-party data sources. message CustomSourceOptions { // Serialized configuration information for the data source. The implementer of the custom data source is @@ -153,5 +172,6 @@ message DataSource { RedshiftOptions redshift_options = 15; RequestDataOptions request_data_options = 18; CustomSourceOptions custom_options = 16; + SnowflakeOptions snowflake_options = 19; } } diff --git a/protos/feast/core/SavedDataset.proto b/protos/feast/core/SavedDataset.proto index 6ec9df0835..ebd2e56d35 100644 --- a/protos/feast/core/SavedDataset.proto +++ b/protos/feast/core/SavedDataset.proto @@ -53,6 +53,7 @@ message SavedDatasetStorage { DataSource.FileOptions file_storage = 4; DataSource.BigQueryOptions bigquery_storage = 5; DataSource.RedshiftOptions redshift_storage = 6; + DataSource.SnowflakeOptions snowflake_storage = 7; } } diff --git a/sdk/python/feast/__init__.py b/sdk/python/feast/__init__.py index eada13f995..9f78f9d98b 100644 --- a/sdk/python/feast/__init__.py +++ b/sdk/python/feast/__init__.py @@ -5,6 +5,7 @@ from feast.infra.offline_stores.bigquery_source import BigQuerySource from feast.infra.offline_stores.file_source import FileSource from feast.infra.offline_stores.redshift_source import RedshiftSource +from feast.infra.offline_stores.snowflake_source import SnowflakeSource from .data_source import KafkaSource, KinesisSource, SourceType from .entity import Entity @@ -43,4 +44,5 @@ "BigQuerySource", "FileSource", "RedshiftSource", + "SnowflakeSource", ] diff --git a/sdk/python/feast/cli.py b/sdk/python/feast/cli.py index 4950977e2a..f6d326410a 100644 --- a/sdk/python/feast/cli.py +++ b/sdk/python/feast/cli.py @@ -477,7 +477,7 @@ def materialize_incremental_command(ctx: click.Context, end_ts: str, views: List @click.option( "--template", "-t", - type=click.Choice(["local", "gcp", "aws"], case_sensitive=False), + type=click.Choice(["local", "gcp", "aws", "snowflake"], case_sensitive=False), help="Specify a template for the created project", default="local", ) diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index b30340f0d2..f7ab2f04d4 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -34,6 +34,7 @@ class SourceType(enum.Enum): BATCH_BIGQUERY = 2 STREAM_KAFKA = 3 STREAM_KINESIS = 4 + BATCH_SNOWFLAKE = 8 class KafkaOptions: @@ -360,6 +361,12 @@ def from_proto(data_source: DataSourceProto) -> Any: from feast.infra.offline_stores.redshift_source import RedshiftSource data_source_obj = RedshiftSource.from_proto(data_source) + + elif data_source.snowflake_options.table or 
data_source.snowflake_options.query: + from feast.infra.offline_stores.snowflake_source import SnowflakeSource + + data_source_obj = SnowflakeSource.from_proto(data_source) + elif ( data_source.kafka_options.bootstrap_servers and data_source.kafka_options.topic diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index 3fc8c7571e..2dc4576b37 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -250,6 +250,16 @@ def __init__(self, table_name: str): ) +class SnowflakeCredentialsError(Exception): + def __init__(self): + super().__init__("Snowflake Connector failed due to incorrect credentials") + + +class SnowflakeQueryError(Exception): + def __init__(self, details): + super().__init__(f"Snowflake SQL Query failed to finish. Details: {details}") + + class EntityTimestampInferenceException(Exception): def __init__(self, expected_column_name: str): super().__init__( diff --git a/sdk/python/feast/inference.py b/sdk/python/feast/inference.py index 642a3c6442..ce8fa919f1 100644 --- a/sdk/python/feast/inference.py +++ b/sdk/python/feast/inference.py @@ -1,7 +1,14 @@ import re from typing import List -from feast import BigQuerySource, Entity, Feature, FileSource, RedshiftSource +from feast import ( + BigQuerySource, + Entity, + Feature, + FileSource, + RedshiftSource, + SnowflakeSource, +) from feast.data_source import DataSource from feast.errors import RegistryInferenceFailure from feast.feature_view import FeatureView @@ -83,6 +90,8 @@ def update_data_sources_with_inferred_event_timestamp_col( ts_column_type_regex_pattern = "TIMESTAMP|DATETIME" elif isinstance(data_source, RedshiftSource): ts_column_type_regex_pattern = "TIMESTAMP[A-Z]*" + elif isinstance(data_source, SnowflakeSource): + ts_column_type_regex_pattern = "TIMESTAMP_[A-Z]*" else: raise RegistryInferenceFailure( "DataSource", @@ -92,8 +101,10 @@ def update_data_sources_with_inferred_event_timestamp_col( """, ) # for informing the type checker - assert isinstance(data_source, FileSource) or isinstance( - data_source, BigQuerySource + assert ( + isinstance(data_source, FileSource) + or isinstance(data_source, BigQuerySource) + or isinstance(data_source, SnowflakeSource) ) # loop through table columns to find singular match diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py new file mode 100644 index 0000000000..b2f014ea0d --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -0,0 +1,632 @@ +import contextlib +import os +from datetime import datetime +from pathlib import Path +from typing import ( + Callable, + ContextManager, + Dict, + Iterator, + List, + Optional, + Tuple, + Union, +) + +import numpy as np +import pandas as pd +import pyarrow as pa +from pydantic import Field +from pydantic.typing import Literal +from pytz import utc + +from feast import OnDemandFeatureView +from feast.data_source import DataSource +from feast.errors import InvalidEntityType +from feast.feature_view import DUMMY_ENTITY_ID, DUMMY_ENTITY_VAL, FeatureView +from feast.infra.offline_stores import offline_utils +from feast.infra.offline_stores.offline_store import ( + OfflineStore, + RetrievalJob, + RetrievalMetadata, +) +from feast.infra.offline_stores.snowflake_source import ( + SavedDatasetSnowflakeStorage, + SnowflakeSource, +) +from feast.infra.utils.snowflake_utils import ( + execute_snowflake_statement, + get_snowflake_conn, + write_pandas, +) +from feast.registry import Registry +from feast.repo_config import 
FeastConfigBaseModel, RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.usage import log_exceptions_and_usage + +try: + from snowflake.connector import SnowflakeConnection +except ImportError as e: + from feast.errors import FeastExtrasDependencyImportError + + raise FeastExtrasDependencyImportError("snowflake", str(e)) + + +class SnowflakeOfflineStoreConfig(FeastConfigBaseModel): + """ Offline store config for Snowflake """ + + type: Literal["snowflake.offline"] = "snowflake.offline" + """ Offline store type selector""" + + config_path: Optional[str] = ( + Path(os.environ["HOME"]) / ".snowsql/config" + ).__str__() + """ Snowflake config path -- absolute path required (Cant use ~)""" + + account: Optional[str] = None + """ Snowflake deployment identifier -- drop .snowflakecomputing.com""" + + user: Optional[str] = None + """ Snowflake user name """ + + password: Optional[str] = None + """ Snowflake password """ + + role: Optional[str] = None + """ Snowflake role name""" + + warehouse: Optional[str] = None + """ Snowflake warehouse name """ + + database: Optional[str] = None + """ Snowflake database name """ + + schema_: Optional[str] = Field("PUBLIC", alias="schema") + """ Snowflake schema name """ + + class Config: + allow_population_by_field_name = True + + +class SnowflakeOfflineStore(OfflineStore): + @staticmethod + @log_exceptions_and_usage(offline_store="snowflake") + def pull_latest_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + created_timestamp_column: Optional[str], + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert isinstance(data_source, SnowflakeSource) + assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + + from_expression = ( + data_source.get_table_query_string() + ) # returns schema.table as a string + + if join_key_columns: + partition_by_join_key_string = '"' + '", "'.join(join_key_columns) + '"' + partition_by_join_key_string = ( + "PARTITION BY " + partition_by_join_key_string + ) + else: + partition_by_join_key_string = "" + + timestamp_columns = [event_timestamp_column] + if created_timestamp_column: + timestamp_columns.append(created_timestamp_column) + + timestamp_desc_string = '"' + '" DESC, "'.join(timestamp_columns) + '" DESC' + field_string = ( + '"' + + '", "'.join(join_key_columns + feature_name_columns + timestamp_columns) + + '"' + ) + + snowflake_conn = get_snowflake_conn(config.offline_store) + + query = f""" + SELECT + {field_string} + {f''', TRIM({repr(DUMMY_ENTITY_VAL)}::VARIANT,'"') AS "{DUMMY_ENTITY_ID}"''' if not join_key_columns else ""} + FROM ( + SELECT {field_string}, + ROW_NUMBER() OVER({partition_by_join_key_string} ORDER BY {timestamp_desc_string}) AS "_feast_row" + FROM {from_expression} + WHERE "{event_timestamp_column}" BETWEEN TO_TIMESTAMP_NTZ({start_date.timestamp()}) AND TO_TIMESTAMP_NTZ({end_date.timestamp()}) + ) + WHERE "_feast_row" = 1 + """ + + return SnowflakeRetrievalJob( + query=query, + snowflake_conn=snowflake_conn, + config=config, + full_feature_names=False, + on_demand_feature_views=None, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="snowflake") + def pull_all_from_table_or_query( + config: RepoConfig, + data_source: DataSource, + join_key_columns: List[str], + feature_name_columns: List[str], + event_timestamp_column: str, + start_date: datetime, + end_date: datetime, + ) -> RetrievalJob: + assert 
isinstance(data_source, SnowflakeSource) + from_expression = data_source.get_table_query_string() + + field_string = ( + '"' + + '", "'.join( + join_key_columns + feature_name_columns + [event_timestamp_column] + ) + + '"' + ) + + snowflake_conn = get_snowflake_conn(config.offline_store) + + start_date = start_date.astimezone(tz=utc) + end_date = end_date.astimezone(tz=utc) + + query = f""" + SELECT {field_string} + FROM {from_expression} + WHERE "{event_timestamp_column}" BETWEEN TIMESTAMP '{start_date}' AND TIMESTAMP '{end_date}' + """ + + return SnowflakeRetrievalJob( + query=query, + snowflake_conn=snowflake_conn, + config=config, + full_feature_names=False, + ) + + @staticmethod + @log_exceptions_and_usage(offline_store="snowflake") + def get_historical_features( + config: RepoConfig, + feature_views: List[FeatureView], + feature_refs: List[str], + entity_df: Union[pd.DataFrame, str], + registry: Registry, + project: str, + full_feature_names: bool = False, + ) -> RetrievalJob: + assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + + snowflake_conn = get_snowflake_conn(config.offline_store) + + entity_schema = _get_entity_schema(entity_df, snowflake_conn, config) + + entity_df_event_timestamp_col = offline_utils.infer_event_timestamp_from_entity_df( + entity_schema + ) + + entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( + entity_df, entity_df_event_timestamp_col, snowflake_conn, config, + ) + + @contextlib.contextmanager + def query_generator() -> Iterator[str]: + + table_name = offline_utils.get_temp_entity_table_name() + + _upload_entity_df(entity_df, snowflake_conn, config, table_name) + + expected_join_keys = offline_utils.get_expected_join_keys( + project, feature_views, registry + ) + + offline_utils.assert_expected_columns_in_entity_df( + entity_schema, expected_join_keys, entity_df_event_timestamp_col + ) + + # Build a query context containing all information required to template the Snowflake SQL query + query_context = offline_utils.get_feature_view_query_context( + feature_refs, + feature_views, + registry, + project, + entity_df_event_timestamp_range, + ) + + query_context = _fix_entity_selections_identifiers(query_context) + + # Generate the Snowflake SQL query from the query context + query = offline_utils.build_point_in_time_query( + query_context, + left_table_query_string=table_name, + entity_df_event_timestamp_col=entity_df_event_timestamp_col, + entity_df_columns=entity_schema.keys(), + query_template=MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN, + full_feature_names=full_feature_names, + ) + + yield query + + return SnowflakeRetrievalJob( + query=query_generator, + snowflake_conn=snowflake_conn, + config=config, + full_feature_names=full_feature_names, + on_demand_feature_views=OnDemandFeatureView.get_requested_odfvs( + feature_refs, project, registry + ), + metadata=RetrievalMetadata( + features=feature_refs, + keys=list(entity_schema.keys() - {entity_df_event_timestamp_col}), + min_event_timestamp=entity_df_event_timestamp_range[0], + max_event_timestamp=entity_df_event_timestamp_range[1], + ), + ) + + +class SnowflakeRetrievalJob(RetrievalJob): + def __init__( + self, + query: Union[str, Callable[[], ContextManager[str]]], + snowflake_conn: SnowflakeConnection, + config: RepoConfig, + full_feature_names: bool, + on_demand_feature_views: Optional[List[OnDemandFeatureView]] = None, + metadata: Optional[RetrievalMetadata] = None, + ): + + if not isinstance(query, str): + self._query_generator = query + else: + + 
@contextlib.contextmanager + def query_generator() -> Iterator[str]: + assert isinstance(query, str) + yield query + + self._query_generator = query_generator + + self.snowflake_conn = snowflake_conn + self.config = config + self._full_feature_names = full_feature_names + self._on_demand_feature_views = ( + on_demand_feature_views if on_demand_feature_views else [] + ) + self._metadata = metadata + + @property + def full_feature_names(self) -> bool: + return self._full_feature_names + + @property + def on_demand_feature_views(self) -> Optional[List[OnDemandFeatureView]]: + return self._on_demand_feature_views + + def _to_df_internal(self) -> pd.DataFrame: + with self._query_generator() as query: + + df = execute_snowflake_statement( + self.snowflake_conn, query + ).fetch_pandas_all() + + return df + + def _to_arrow_internal(self) -> pa.Table: + with self._query_generator() as query: + + pa_table = execute_snowflake_statement( + self.snowflake_conn, query + ).fetch_arrow_all() + + if pa_table: + + return pa_table + else: + empty_result = execute_snowflake_statement(self.snowflake_conn, query) + + return pa.Table.from_pandas( + pd.DataFrame(columns=[md.name for md in empty_result.description]) + ) + + def to_snowflake(self, table_name: str) -> None: + """ Save dataset as a new Snowflake table """ + if self.on_demand_feature_views is not None: + transformed_df = self.to_df() + + write_pandas( + self.snowflake_conn, transformed_df, table_name, auto_create_table=True + ) + + return None + + with self._query_generator() as query: + query = f'CREATE TABLE IF NOT EXISTS "{table_name}" AS ({query});\n' + + execute_snowflake_statement(self.snowflake_conn, query) + + def to_sql(self) -> str: + """ + Returns the SQL query that will be executed in Snowflake to build the historical feature table. 
+ """ + with self._query_generator() as query: + return query + + def to_arrow_chunks(self, arrow_options: Optional[Dict] = None) -> list: + with self._query_generator() as query: + + arrow_batches = execute_snowflake_statement( + self.snowflake_conn, query + ).get_result_batches() + + return arrow_batches + + def persist(self, storage: SavedDatasetStorage): + assert isinstance(storage, SavedDatasetSnowflakeStorage) + self.to_snowflake(table_name=storage.snowflake_options.table) + + @property + def metadata(self) -> Optional[RetrievalMetadata]: + return self._metadata + + +def _get_entity_schema( + entity_df: Union[pd.DataFrame, str], + snowflake_conn: SnowflakeConnection, + config: RepoConfig, +) -> Dict[str, np.dtype]: + + if isinstance(entity_df, pd.DataFrame): + + return dict(zip(entity_df.columns, entity_df.dtypes)) + + else: + + query = f"SELECT * FROM ({entity_df}) LIMIT 1" + limited_entity_df = execute_snowflake_statement( + snowflake_conn, query + ).fetch_pandas_all() + + return dict(zip(limited_entity_df.columns, limited_entity_df.dtypes)) + + +def _upload_entity_df( + entity_df: Union[pd.DataFrame, str], + snowflake_conn: SnowflakeConnection, + config: RepoConfig, + table_name: str, +) -> None: + + if isinstance(entity_df, pd.DataFrame): + # Write the data from the DataFrame to the table + write_pandas( + snowflake_conn, + entity_df, + table_name, + auto_create_table=True, + create_temp_table=True, + ) + + return None + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), create a Snowflake table out of it, + query = f'CREATE TEMPORARY TABLE "{table_name}" AS ({entity_df})' + execute_snowflake_statement(snowflake_conn, query) + + return None + else: + raise InvalidEntityType(type(entity_df)) + + +def _fix_entity_selections_identifiers(query_context) -> list: + + for i, qc in enumerate(query_context): + for j, es in enumerate(qc.entity_selections): + query_context[i].entity_selections[j] = f'"{es}"'.replace(" AS ", '" AS "') + + return query_context + + +def _get_entity_df_event_timestamp_range( + entity_df: Union[pd.DataFrame, str], + entity_df_event_timestamp_col: str, + snowflake_conn: SnowflakeConnection, + config: RepoConfig, +) -> Tuple[datetime, datetime]: + if isinstance(entity_df, pd.DataFrame): + entity_df_event_timestamp = entity_df.loc[ + :, entity_df_event_timestamp_col + ].infer_objects() + if pd.api.types.is_string_dtype(entity_df_event_timestamp): + entity_df_event_timestamp = pd.to_datetime( + entity_df_event_timestamp, utc=True + ) + entity_df_event_timestamp_range = ( + entity_df_event_timestamp.min().to_pydatetime(), + entity_df_event_timestamp.max().to_pydatetime(), + ) + elif isinstance(entity_df, str): + # If the entity_df is a string (SQL query), determine range + # from table + query = f'SELECT MIN("{entity_df_event_timestamp_col}") AS "min_value", MAX("{entity_df_event_timestamp_col}") AS "max_value" FROM ({entity_df})' + results = execute_snowflake_statement(snowflake_conn, query).fetchall() + + entity_df_event_timestamp_range = results[0] + else: + raise InvalidEntityType(type(entity_df)) + + return entity_df_event_timestamp_range + + +MULTIPLE_FEATURE_VIEW_POINT_IN_TIME_JOIN = """ +/* + Compute a deterministic hash for the `left_table_query_string` that will be used throughout + all the logic as the field to GROUP BY the data +*/ +WITH "entity_dataframe" AS ( + SELECT *, + "{{entity_df_event_timestamp_col}}" AS "entity_timestamp" + {% for featureview in featureviews %} + {% if featureview.entities %} + ,( + {% for entity in 
featureview.entities %} + CAST("{{entity}}" AS VARCHAR) || + {% endfor %} + CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR) + ) AS "{{featureview.name}}__entity_row_unique_id" + {% else %} + ,CAST("{{entity_df_event_timestamp_col}}" AS VARCHAR) AS "{{featureview.name}}__entity_row_unique_id" + {% endif %} + {% endfor %} + FROM "{{ left_table_query_string }}" +), + +{% for featureview in featureviews %} + +"{{ featureview.name }}__entity_dataframe" AS ( + SELECT + {{ featureview.entities | map('tojson') | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + "entity_timestamp", + "{{featureview.name}}__entity_row_unique_id" + FROM "entity_dataframe" + GROUP BY + {{ featureview.entities | map('tojson') | join(', ')}}{% if featureview.entities %},{% else %}{% endif %} + "entity_timestamp", + "{{featureview.name}}__entity_row_unique_id" +), + +/* + This query template performs the point-in-time correctness join for a single feature set table + to the provided entity table. + + 1. We first join the current feature_view to the entity dataframe that has been passed. + This JOIN has the following logic: + - For each row of the entity dataframe, only keep the rows where the `event_timestamp_column` + is less than the one provided in the entity dataframe + - If there a TTL for the current feature_view, also keep the rows where the `event_timestamp_column` + is higher the the one provided minus the TTL + - For each row, Join on the entity key and retrieve the `entity_row_unique_id` that has been + computed previously + + The output of this CTE will contain all the necessary information and already filtered out most + of the data that is not relevant. +*/ + +"{{ featureview.name }}__subquery" AS ( + SELECT + "{{ featureview.event_timestamp_column }}" as "event_timestamp", + {{'"' ~ featureview.created_timestamp_column ~ '" as "created_timestamp",' if featureview.created_timestamp_column else '' }} + {{featureview.entity_selections | join(', ')}}{% if featureview.entity_selections %},{% else %}{% endif %} + {% for feature in featureview.features %} + "{{ feature }}" as {% if full_feature_names %}"{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}"{% else %}"{{ featureview.field_mapping.get(feature, feature) }}"{% endif %}{% if loop.last %}{% else %}, {% endif %} + {% endfor %} + FROM {{ featureview.table_subquery }} + WHERE "{{ featureview.event_timestamp_column }}" <= '{{ featureview.max_event_timestamp }}' + {% if featureview.ttl == 0 %}{% else %} + AND "{{ featureview.event_timestamp_column }}" >= '{{ featureview.min_event_timestamp }}' + {% endif %} +), + +"{{ featureview.name }}__base" AS ( + SELECT + "subquery".*, + "entity_dataframe"."entity_timestamp", + "entity_dataframe"."{{featureview.name}}__entity_row_unique_id" + FROM "{{ featureview.name }}__subquery" AS "subquery" + INNER JOIN "{{ featureview.name }}__entity_dataframe" AS "entity_dataframe" + ON TRUE + AND "subquery"."event_timestamp" <= "entity_dataframe"."entity_timestamp" + + {% if featureview.ttl == 0 %}{% else %} + AND "subquery"."event_timestamp" >= TIMESTAMPADD(second,-{{ featureview.ttl }},"entity_dataframe"."entity_timestamp") + {% endif %} + + {% for entity in featureview.entities %} + AND "subquery"."{{ entity }}" = "entity_dataframe"."{{ entity }}" + {% endfor %} +), + +/* + 2. If the `created_timestamp_column` has been set, we need to + deduplicate the data first. This is done by calculating the + `MAX(created_at_timestamp)` for each event_timestamp. 
+ We then join the data on the next CTE +*/ +{% if featureview.created_timestamp_column %} +"{{ featureview.name }}__dedup" AS ( + SELECT + "{{featureview.name}}__entity_row_unique_id", + "event_timestamp", + MAX("created_timestamp") AS "created_timestamp" + FROM "{{ featureview.name }}__base" + GROUP BY "{{featureview.name}}__entity_row_unique_id", "event_timestamp" +), +{% endif %} + +/* + 3. The data has been filtered during the first CTE "*__base" + Thus we only need to compute the latest timestamp of each feature. +*/ +"{{ featureview.name }}__latest" AS ( + SELECT + "event_timestamp", + {% if featureview.created_timestamp_column %}"created_timestamp",{% endif %} + "{{featureview.name}}__entity_row_unique_id" + FROM + ( + SELECT *, + ROW_NUMBER() OVER( + PARTITION BY "{{featureview.name}}__entity_row_unique_id" + ORDER BY "event_timestamp" DESC{% if featureview.created_timestamp_column %},"created_timestamp" DESC{% endif %} + ) AS "row_number" + FROM "{{ featureview.name }}__base" + {% if featureview.created_timestamp_column %} + INNER JOIN "{{ featureview.name }}__dedup" + USING ("{{featureview.name}}__entity_row_unique_id", "event_timestamp", "created_timestamp") + {% endif %} + ) + WHERE "row_number" = 1 +), + +/* + 4. Once we know the latest value of each feature for a given timestamp, + we can join again the data back to the original "base" dataset +*/ +"{{ featureview.name }}__cleaned" AS ( + SELECT "base".* + FROM "{{ featureview.name }}__base" AS "base" + INNER JOIN "{{ featureview.name }}__latest" + USING( + "{{featureview.name}}__entity_row_unique_id", + "event_timestamp" + {% if featureview.created_timestamp_column %} + ,"created_timestamp" + {% endif %} + ) +){% if loop.last %}{% else %}, {% endif %} + + +{% endfor %} +/* + Joins the outputs of multiple time travel joins to a single table. + The entity_dataframe dataset being our source of truth here. 
+ */ + +SELECT "{{ final_output_feature_names | join('", "')}}" +FROM "entity_dataframe" +{% for featureview in featureviews %} +LEFT JOIN ( + SELECT + "{{featureview.name}}__entity_row_unique_id" + {% for feature in featureview.features %} + ,{% if full_feature_names %}"{{ featureview.name }}__{{featureview.field_mapping.get(feature, feature)}}"{% else %}"{{ featureview.field_mapping.get(feature, feature) }}"{% endif %} + {% endfor %} + FROM "{{ featureview.name }}__cleaned" +) "{{ featureview.name }}__cleaned" USING ("{{featureview.name}}__entity_row_unique_id") +{% endfor %} +""" diff --git a/sdk/python/feast/infra/offline_stores/snowflake_source.py b/sdk/python/feast/infra/offline_stores/snowflake_source.py new file mode 100644 index 0000000000..b5d50be0f4 --- /dev/null +++ b/sdk/python/feast/infra/offline_stores/snowflake_source.py @@ -0,0 +1,315 @@ +from typing import Callable, Dict, Iterable, Optional, Tuple + +from feast import type_map +from feast.data_source import DataSource +from feast.protos.feast.core.DataSource_pb2 import DataSource as DataSourceProto +from feast.protos.feast.core.SavedDataset_pb2 import ( + SavedDatasetStorage as SavedDatasetStorageProto, +) +from feast.repo_config import RepoConfig +from feast.saved_dataset import SavedDatasetStorage +from feast.value_type import ValueType + + +class SnowflakeSource(DataSource): + def __init__( + self, + database: Optional[str] = None, + schema: Optional[str] = None, + table: Optional[str] = None, + query: Optional[str] = None, + event_timestamp_column: Optional[str] = "", + created_timestamp_column: Optional[str] = "", + field_mapping: Optional[Dict[str, str]] = None, + date_partition_column: Optional[str] = "", + ): + """ + Creates a SnowflakeSource object. + + Args: + database (optional): Snowflake database where the features are stored. + schema (optional): Snowflake schema in which the table is located. + table (optional): Snowflake table where the features are stored. + event_timestamp_column (optional): Event timestamp column used for point in + time joins of feature values. + query (optional): The query to be executed to obtain the features. + created_timestamp_column (optional): Timestamp column indicating when the + row was created, used for deduplicating rows. + field_mapping (optional): A dictionary mapping of column names in this data + source to column names in a feature table or view. + date_partition_column (optional): Timestamp column used for partitioning. + + """ + super().__init__( + event_timestamp_column, + created_timestamp_column, + field_mapping, + date_partition_column, + ) + + # The default Snowflake schema is named "PUBLIC". + _schema = "PUBLIC" if (database and table and not schema) else schema + + self._snowflake_options = SnowflakeOptions( + database=database, schema=_schema, table=table, query=query + ) + + @staticmethod + def from_proto(data_source: DataSourceProto): + """ + Creates a SnowflakeSource from a protobuf representation of a SnowflakeSource. + + Args: + data_source: A protobuf representation of a SnowflakeSource + + Returns: + A SnowflakeSource object based on the data_source protobuf. 
+ """ + return SnowflakeSource( + field_mapping=dict(data_source.field_mapping), + database=data_source.snowflake_options.database, + schema=data_source.snowflake_options.schema, + table=data_source.snowflake_options.table, + event_timestamp_column=data_source.event_timestamp_column, + created_timestamp_column=data_source.created_timestamp_column, + date_partition_column=data_source.date_partition_column, + query=data_source.snowflake_options.query, + ) + + def __eq__(self, other): + if not isinstance(other, SnowflakeSource): + raise TypeError( + "Comparisons should only involve SnowflakeSource class objects." + ) + + return ( + self.snowflake_options.database == other.snowflake_options.database + and self.snowflake_options.schema == other.snowflake_options.schema + and self.snowflake_options.table == other.snowflake_options.table + and self.snowflake_options.query == other.snowflake_options.query + and self.event_timestamp_column == other.event_timestamp_column + and self.created_timestamp_column == other.created_timestamp_column + and self.field_mapping == other.field_mapping + ) + + @property + def database(self): + """Returns the database of this snowflake source.""" + return self._snowflake_options.database + + @property + def schema(self): + """Returns the schema of this snowflake source.""" + return self._snowflake_options.schema + + @property + def table(self): + """Returns the table of this snowflake source.""" + return self._snowflake_options.table + + @property + def query(self): + """Returns the snowflake options of this snowflake source.""" + return self._snowflake_options.query + + @property + def snowflake_options(self): + """Returns the snowflake options of this snowflake source.""" + return self._snowflake_options + + @snowflake_options.setter + def snowflake_options(self, _snowflake_options): + """Sets the snowflake options of this snowflake source.""" + self._snowflake_options = _snowflake_options + + def to_proto(self) -> DataSourceProto: + """ + Converts a SnowflakeSource object to its protobuf representation. + + Returns: + A DataSourceProto object. + """ + data_source_proto = DataSourceProto( + type=DataSourceProto.BATCH_SNOWFLAKE, + field_mapping=self.field_mapping, + snowflake_options=self.snowflake_options.to_proto(), + ) + + data_source_proto.event_timestamp_column = self.event_timestamp_column + data_source_proto.created_timestamp_column = self.created_timestamp_column + data_source_proto.date_partition_column = self.date_partition_column + + return data_source_proto + + def validate(self, config: RepoConfig): + # As long as the query gets successfully executed, or the table exists, + # the data source is validated. We don't need the results though. + self.get_table_column_names_and_types(config) + + def get_table_query_string(self) -> str: + """Returns a string that can directly be used to reference this table in SQL.""" + if self.database and self.table: + return f'"{self.database}"."{self.schema}"."{self.table}"' + elif self.table: + return f'"{self.table}"' + else: + return f"({self.query})" + + @staticmethod + def source_datatype_to_feast_value_type() -> Callable[[str], ValueType]: + return type_map.snowflake_python_type_to_feast_value_type + + def get_table_column_names_and_types( + self, config: RepoConfig + ) -> Iterable[Tuple[str, str]]: + """ + Returns a mapping of column names to types for this snowflake source. 
+ + Args: + config: A RepoConfig describing the feature repo + """ + + from feast.infra.offline_stores.snowflake import SnowflakeOfflineStoreConfig + from feast.infra.utils.snowflake_utils import ( + execute_snowflake_statement, + get_snowflake_conn, + ) + + assert isinstance(config.offline_store, SnowflakeOfflineStoreConfig) + + snowflake_conn = get_snowflake_conn(config.offline_store) + + if self.database and self.table: + query = f'SELECT * FROM "{self.database}"."{self.schema}"."{self.table}" LIMIT 1' + elif self.table: + query = f'SELECT * FROM "{self.table}" LIMIT 1' + else: + query = f"SELECT * FROM ({self.query}) LIMIT 1" + + result = execute_snowflake_statement(snowflake_conn, query).fetch_pandas_all() + + if not result.empty: + metadata = result.dtypes.apply(str) + return list(zip(metadata.index, metadata)) + else: + raise ValueError("The following source:\n" + query + "\n ... is empty") + + +class SnowflakeOptions: + """ + DataSource snowflake options used to source features from snowflake query. + """ + + def __init__( + self, + database: Optional[str], + schema: Optional[str], + table: Optional[str], + query: Optional[str], + ): + self._database = database + self._schema = schema + self._table = table + self._query = query + + @property + def query(self): + """Returns the snowflake SQL query referenced by this source.""" + return self._query + + @query.setter + def query(self, query): + """Sets the snowflake SQL query referenced by this source.""" + self._query = query + + @property + def database(self): + """Returns the database name of this snowflake table.""" + return self._database + + @database.setter + def database(self, database): + """Sets the database ref of this snowflake table.""" + self._database = database + + @property + def schema(self): + """Returns the schema name of this snowflake table.""" + return self._schema + + @schema.setter + def schema(self, schema): + """Sets the schema of this snowflake table.""" + self._schema = schema + + @property + def table(self): + """Returns the table name of this snowflake table.""" + return self._table + + @table.setter + def table(self, table): + """Sets the table ref of this snowflake table.""" + self._table = table + + @classmethod + def from_proto(cls, snowflake_options_proto: DataSourceProto.SnowflakeOptions): + """ + Creates a SnowflakeOptions from a protobuf representation of a snowflake option. + + Args: + snowflake_options_proto: A protobuf representation of a DataSource + + Returns: + A SnowflakeOptions object based on the snowflake_options protobuf. + """ + snowflake_options = cls( + database=snowflake_options_proto.database, + schema=snowflake_options_proto.schema, + table=snowflake_options_proto.table, + query=snowflake_options_proto.query, + ) + + return snowflake_options + + def to_proto(self) -> DataSourceProto.SnowflakeOptions: + """ + Converts an SnowflakeOptionsProto object to its protobuf representation. + + Returns: + A SnowflakeOptionsProto protobuf. 
+ """ + snowflake_options_proto = DataSourceProto.SnowflakeOptions( + database=self.database, + schema=self.schema, + table=self.table, + query=self.query, + ) + + return snowflake_options_proto + + +class SavedDatasetSnowflakeStorage(SavedDatasetStorage): + _proto_attr_name = "snowflake_storage" + + snowflake_options: SnowflakeOptions + + def __init__(self, table_ref: str): + self.snowflake_options = SnowflakeOptions( + database=None, schema=None, table=table_ref, query=None + ) + + @staticmethod + def from_proto(storage_proto: SavedDatasetStorageProto) -> SavedDatasetStorage: + + return SavedDatasetSnowflakeStorage( + table_ref=SnowflakeOptions.from_proto(storage_proto.snowflake_storage).table + ) + + def to_proto(self) -> SavedDatasetStorageProto: + return SavedDatasetStorageProto( + snowflake_storage=self.snowflake_options.to_proto() + ) + + def to_data_source(self) -> DataSource: + return SnowflakeSource(table=self.snowflake_options.table) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py new file mode 100644 index 0000000000..9660a0fc28 --- /dev/null +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -0,0 +1,282 @@ +import configparser +import os +import random +import string +from logging import getLogger +from tempfile import TemporaryDirectory +from typing import Iterator, Optional, Sequence, Tuple, TypeVar + +import pandas as pd +import snowflake.connector +from snowflake.connector import ProgrammingError, SnowflakeConnection + +getLogger("snowflake.connector.cursor").disabled = True +getLogger("snowflake.connector.connection").disabled = True +getLogger("snowflake.connector.network").disabled = True +logger = getLogger(__name__) + + +def execute_snowflake_statement(conn: SnowflakeConnection, query): + return conn.cursor().execute(query) + + +def get_snowflake_conn(config, autocommit=True): + + if config.type == "snowflake.offline": + config_header = "connections.feast_offline_store" + + config = dict(config) + + # read config file + config_reader = configparser.ConfigParser() + config_reader.read([config["config_path"]]) + if config_reader.has_section(config_header): + kwargs = dict(config_reader[config_header]) + else: + kwargs = {} + + kwargs.update((k, v) for k, v in config.items() if v is not None) + + try: + conn = snowflake.connector.connect( + account=kwargs["account"], + user=kwargs["user"], + password=kwargs["password"], + role=f'''"{kwargs['role']}"''', + warehouse=f'''"{kwargs['warehouse']}"''', + database=f'''"{kwargs['database']}"''', + schema=f'''"{kwargs['schema_']}"''', + application="feast", + autocommit=autocommit, + ) + + return conn + except KeyError as e: + print(f"{e} not defined in a config file or feature_store.yaml file") + + +def write_pandas( + conn: SnowflakeConnection, + df: pd.DataFrame, + table_name: str, + database: Optional[str] = None, + schema: Optional[str] = None, + chunk_size: Optional[int] = None, + compression: str = "gzip", + on_error: str = "abort_statement", + parallel: int = 4, + quote_identifiers: bool = True, + auto_create_table: bool = False, + create_temp_table: bool = False, +) -> Tuple[ + bool, + int, + int, + Sequence[ + Tuple[ + str, + str, + int, + int, + int, + int, + Optional[str], + Optional[int], + Optional[int], + Optional[str], + ] + ], +]: + """Allows users to most efficiently write back a pandas DataFrame to Snowflake. + + It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table. 
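+    Concretely, the helper creates a temporary stage, PUTs one Parquet file per
+    chunk into that stage, and then issues a single COPY INTO against the target
+    table (creating the table first via infer_schema when auto_create_table is set).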
+ + Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested + with all of the COPY INTO command's output for debugging purposes. + + Example usage: + import pandas + from snowflake.connector.pandas_tools import write_pandas + + df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance']) + success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers') + + Args: + conn: Connection to be used to communicate with Snowflake. + df: Dataframe we'd like to write back. + table_name: Table name where we want to insert into. + database: Database schema and table is in, if not provided the default one will be used (Default value = None). + schema: Schema table is in, if not provided the default one will be used (Default value = None). + chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once + (Default value = None). + compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a + better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip'). + on_error: Action to take when COPY INTO statements fail, default follows documentation at: + https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions + (Default value = 'abort_statement'). + parallel: Number of threads to be used when uploading chunks, default follows documentation at: + https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4). + quote_identifiers: By default, identifiers, specifically database, schema, table and column names + (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting. + I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True) + auto_create_table: When true, will automatically create a table with corresponding columns for each column in + the passed in DataFrame. The table will not be created if it already exists + create_temp_table: Will make the auto-created table as a temporary table + + Returns: + Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were + ingested correctly, # of chunks, # of ingested rows, and ingest's output. + """ + if database is not None and schema is None: + raise ProgrammingError( + "Schema has to be provided to write_pandas when a database is provided" + ) + # This dictionary maps the compression algorithm to Snowflake put copy into command type + # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet + compression_map = {"gzip": "auto", "snappy": "snappy"} + if compression not in compression_map.keys(): + raise ProgrammingError( + "Invalid compression '{}', only acceptable values are: {}".format( + compression, compression_map.keys() + ) + ) + if quote_identifiers: + location = ( + (('"' + database + '".') if database else "") + + (('"' + schema + '".') if schema else "") + + ('"' + table_name + '"') + ) + else: + location = ( + (database + "." if database else "") + + (schema + "." 
if schema else "") + + (table_name) + ) + if chunk_size is None: + chunk_size = len(df) + cursor = conn.cursor() + while True: + try: + stage_name = "".join( + random.choice(string.ascii_lowercase) for _ in range(5) + ) + create_stage_sql = ( + "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + '"{stage_name}"' + ).format(stage_name=stage_name) + logger.debug(f"creating stage with '{create_stage_sql}'") + cursor.execute(create_stage_sql, _is_internal=True).fetchall() + break + except ProgrammingError as pe: + if pe.msg.endswith("already exists."): + continue + raise + + with TemporaryDirectory() as tmp_folder: + for i, chunk in chunk_helper(df, chunk_size): + chunk_path = os.path.join(tmp_folder, "file{}.txt".format(i)) + # Dump chunk into parquet file + chunk.to_parquet( + chunk_path, + compression=compression, + use_deprecated_int96_timestamps=True, + ) + # Upload parquet file + upload_sql = ( + "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + "'file://{path}' @\"{stage_name}\" PARALLEL={parallel}" + ).format( + path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"), + stage_name=stage_name, + parallel=parallel, + ) + logger.debug(f"uploading files with '{upload_sql}'") + cursor.execute(upload_sql, _is_internal=True) + # Remove chunk file + os.remove(chunk_path) + if quote_identifiers: + columns = '"' + '","'.join(list(df.columns)) + '"' + else: + columns = ",".join(list(df.columns)) + + if auto_create_table: + while True: + try: + file_format_name = ( + '"' + + "".join(random.choice(string.ascii_lowercase) for _ in range(5)) + + '"' + ) + file_format_sql = ( + f"CREATE FILE FORMAT {file_format_name} " + f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ " + f"TYPE=PARQUET COMPRESSION={compression_map[compression]}" + ) + logger.debug(f"creating file format with '{file_format_sql}'") + cursor.execute(file_format_sql, _is_internal=True) + break + except ProgrammingError as pe: + if pe.msg.endswith("already exists."): + continue + raise + infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))" + logger.debug(f"inferring schema with '{infer_schema_sql}'") + column_type_mapping = dict( + cursor.execute(infer_schema_sql, _is_internal=True).fetchall() + ) + # Infer schema can return the columns out of order depending on the chunking we do when uploading + # so we have to iterate through the dataframe columns to make sure we create the table with its + # columns in order + quote = '"' if quote_identifiers else "" + create_table_columns = ", ".join( + [f"{quote}{c}{quote} {column_type_mapping[c]}" for c in df.columns] + ) + create_table_sql = ( + f"CREATE {'TEMP ' if create_temp_table else ''}TABLE IF NOT EXISTS {location} " + f"({create_table_columns})" + f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + ) + logger.debug(f"auto creating table with '{create_table_sql}'") + cursor.execute(create_table_sql, _is_internal=True) + drop_file_format_sql = f"DROP FILE FORMAT IF EXISTS {file_format_name}" + logger.debug(f"dropping file format with '{drop_file_format_sql}'") + cursor.execute(drop_file_format_sql, _is_internal=True) + + # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly + # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html) + if quote_identifiers: + parquet_columns = "$1:" + ",$1:".join(f'"{c}"' for c in df.columns) + else: + 
parquet_columns = "$1:" + ",$1:".join(df.columns) + copy_into_sql = ( + "COPY INTO {location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + "({columns}) " + 'FROM (SELECT {parquet_columns} FROM @"{stage_name}") ' + "FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression}) " + "PURGE=TRUE ON_ERROR={on_error}" + ).format( + location=location, + columns=columns, + parquet_columns=parquet_columns, + stage_name=stage_name, + compression=compression_map[compression], + on_error=on_error, + ) + logger.debug("copying into with '{}'".format(copy_into_sql)) + copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall() + cursor.close() + return ( + all(e[1] == "LOADED" for e in copy_results), + len(copy_results), + sum(int(e[3]) for e in copy_results), + copy_results, + ) + + +T = TypeVar("T", bound=Sequence) + + +def chunk_helper(lst: T, n: int) -> Iterator[Tuple[int, T]]: + """Helper generator to chunk a sequence efficiently with current index like if enumerate was called on sequence.""" + for i in range(0, len(lst), n): + yield int(i / n), lst[i : i + n] diff --git a/sdk/python/feast/repo_config.py b/sdk/python/feast/repo_config.py index e8ba180568..3f32d18b80 100644 --- a/sdk/python/feast/repo_config.py +++ b/sdk/python/feast/repo_config.py @@ -31,12 +31,14 @@ "datastore": "feast.infra.online_stores.datastore.DatastoreOnlineStore", "redis": "feast.infra.online_stores.redis.RedisOnlineStore", "dynamodb": "feast.infra.online_stores.dynamodb.DynamoDBOnlineStore", + "snowflake.online": "feast.infra.online_stores.snowflake.SnowflakeOnlineStore", } OFFLINE_STORE_CLASS_FOR_TYPE = { "file": "feast.infra.offline_stores.file.FileOfflineStore", "bigquery": "feast.infra.offline_stores.bigquery.BigQueryOfflineStore", "redshift": "feast.infra.offline_stores.redshift.RedshiftOfflineStore", + "snowflake.offline": "feast.infra.offline_stores.snowflake.SnowflakeOfflineStore", } FEATURE_SERVER_CONFIG_CLASS_FOR_TYPE = { diff --git a/sdk/python/feast/templates/snowflake/bootstrap.py b/sdk/python/feast/templates/snowflake/bootstrap.py new file mode 100644 index 0000000000..3712651a5d --- /dev/null +++ b/sdk/python/feast/templates/snowflake/bootstrap.py @@ -0,0 +1,91 @@ +import click +import snowflake.connector + +from feast.infra.utils.snowflake_utils import write_pandas + + +def bootstrap(): + # Bootstrap() will automatically be called from the init_repo() during `feast init` + + import pathlib + from datetime import datetime, timedelta + + from feast.driver_test_data import create_driver_hourly_stats_df + + repo_path = pathlib.Path(__file__).parent.absolute() + config_file = repo_path / "feature_store.yaml" + + project_name = str(repo_path)[str(repo_path).rfind("/") + 1 :] + + end_date = datetime.now().replace(microsecond=0, second=0, minute=0) + start_date = end_date - timedelta(days=15) + + driver_entities = [1001, 1002, 1003, 1004, 1005] + driver_df = create_driver_hourly_stats_df(driver_entities, start_date, end_date) + + repo_path = pathlib.Path(__file__).parent.absolute() + data_path = repo_path / "data" + data_path.mkdir(exist_ok=True) + driver_stats_path = data_path / "driver_stats.parquet" + driver_df.to_parquet(path=str(driver_stats_path), allow_truncated_timestamps=True) + + snowflake_deployment_url = click.prompt( + "Snowflake Deployment URL (exclude .snowflakecomputing.com):" + ) + snowflake_user = click.prompt("Snowflake User Name:") + snowflake_password = click.prompt("Snowflake Password:", hide_input=True) + snowflake_role = click.prompt("Snowflake Role Name (Case 
Sensitive):") + snowflake_warehouse = click.prompt("Snowflake Warehouse Name (Case Sensitive):") + snowflake_database = click.prompt("Snowflake Database Name (Case Sensitive):") + + if click.confirm( + f'Should I upload example data to Snowflake (overwriting "{project_name}_feast_driver_hourly_stats" table)?', + default=True, + ): + + conn = snowflake.connector.connect( + account=snowflake_deployment_url, + user=snowflake_user, + password=snowflake_password, + role=snowflake_role, + warehouse=snowflake_warehouse, + application="feast", + ) + + cur = conn.cursor() + cur.execute(f'CREATE DATABASE IF NOT EXISTS "{snowflake_database}"') + cur.execute(f'USE DATABASE "{snowflake_database}"') + cur.execute('CREATE SCHEMA IF NOT EXISTS "PUBLIC"') + cur.execute('USE SCHEMA "PUBLIC"') + cur.execute(f'DROP TABLE IF EXISTS "{project_name}_feast_driver_hourly_stats"') + write_pandas( + conn, + driver_df, + f"{project_name}_feast_driver_hourly_stats", + auto_create_table=True, + ) + conn.close() + + repo_path = pathlib.Path(__file__).parent.absolute() + config_file = repo_path / "feature_store.yaml" + + replace_str_in_file( + config_file, "SNOWFLAKE_DEPLOYMENT_URL", snowflake_deployment_url + ) + replace_str_in_file(config_file, "SNOWFLAKE_USER", snowflake_user) + replace_str_in_file(config_file, "SNOWFLAKE_PASSWORD", snowflake_password) + replace_str_in_file(config_file, "SNOWFLAKE_ROLE", snowflake_role) + replace_str_in_file(config_file, "SNOWFLAKE_WAREHOUSE", snowflake_warehouse) + replace_str_in_file(config_file, "SNOWFLAKE_DATABASE", snowflake_database) + + +def replace_str_in_file(file_path, match_str, sub_str): + with open(file_path, "r") as f: + contents = f.read() + contents = contents.replace(match_str, sub_str) + with open(file_path, "wt") as f: + f.write(contents) + + +if __name__ == "__main__": + bootstrap() diff --git a/sdk/python/feast/templates/snowflake/driver_repo.py b/sdk/python/feast/templates/snowflake/driver_repo.py new file mode 100644 index 0000000000..a63c6cb503 --- /dev/null +++ b/sdk/python/feast/templates/snowflake/driver_repo.py @@ -0,0 +1,64 @@ +from datetime import timedelta + +import yaml + +from feast import Entity, Feature, FeatureView, SnowflakeSource, ValueType + +# Define an entity for the driver. Entities can be thought of as primary keys used to +# retrieve features. Entities are also used to join multiple tables/views during the +# construction of feature vectors +driver = Entity( + # Name of the entity. Must be unique within a project + name="driver_id", + # The join key of an entity describes the storage level field/column on which + # features can be looked up. The join key is also used to join feature + # tables/views when building feature vectors + join_key="driver_id", +) + +# Indicates a data source from which feature values can be retrieved. Sources are queried when building training +# datasets or materializing features into an online store. 
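+# Here the Snowflake database comes from feature_store.yaml and the table name is
+# derived from the project, matching the "<project>_feast_driver_hourly_stats"
+# table that bootstrap.py uploads during `feast init`.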
+project_name = yaml.safe_load(open("feature_store.yaml"))["project"] + +driver_stats_source = SnowflakeSource( + # The Snowflake table where features can be found + database=yaml.safe_load(open("feature_store.yaml"))["offline_store"]["database"], + table=f"{project_name}_feast_driver_hourly_stats", + # The event timestamp is used for point-in-time joins and for ensuring only + # features within the TTL are returned + event_timestamp_column="event_timestamp", + # The (optional) created timestamp is used to ensure there are no duplicate + # feature rows in the offline store or when building training datasets + created_timestamp_column="created", +) + +# Feature views are a grouping based on how features are stored in either the +# online or offline store. +driver_stats_fv = FeatureView( + # The unique name of this feature view. Two feature views in a single + # project cannot have the same name + name="driver_hourly_stats", + # The list of entities specifies the keys required for joining or looking + # up features from this feature view. The reference provided in this field + # correspond to the name of a defined entity (or entities) + entities=["driver_id"], + # The timedelta is the maximum age that each feature value may have + # relative to its lookup time. For historical features (used in training), + # TTL is relative to each timestamp provided in the entity dataframe. + # TTL also allows for eviction of keys from online stores and limits the + # amount of historical scanning required for historical feature values + # during retrieval + ttl=timedelta(weeks=52), + # The list of features defined below act as a schema to both define features + # for both materialization of features into a store, and are used as references + # during retrieval for building a training dataset or serving features + features=[ + Feature(name="conv_rate", dtype=ValueType.FLOAT), + Feature(name="acc_rate", dtype=ValueType.FLOAT), + Feature(name="avg_daily_trips", dtype=ValueType.INT64), + ], + # Batch sources are used to find feature values. 
In the case of this feature + # view we will query a source table on Redshift for driver statistics + # features + batch_source=driver_stats_source, +) diff --git a/sdk/python/feast/templates/snowflake/feature_store.yaml b/sdk/python/feast/templates/snowflake/feature_store.yaml new file mode 100644 index 0000000000..9757ea2ead --- /dev/null +++ b/sdk/python/feast/templates/snowflake/feature_store.yaml @@ -0,0 +1,11 @@ +project: my_project +registry: registry.db +provider: local +offline_store: + type: snowflake.offline + account: SNOWFLAKE_DEPLOYMENT_URL + user: SNOWFLAKE_USER + password: SNOWFLAKE_PASSWORD + role: SNOWFLAKE_ROLE + warehouse: SNOWFLAKE_WAREHOUSE + database: SNOWFLAKE_DATABASE diff --git a/sdk/python/feast/templates/snowflake/test.py b/sdk/python/feast/templates/snowflake/test.py new file mode 100644 index 0000000000..32aa6380d5 --- /dev/null +++ b/sdk/python/feast/templates/snowflake/test.py @@ -0,0 +1,65 @@ +from datetime import datetime, timedelta + +import pandas as pd +from driver_repo import driver, driver_stats_fv + +from feast import FeatureStore + + +def main(): + pd.set_option("display.max_columns", None) + pd.set_option("display.width", 1000) + + # Load the feature store from the current path + fs = FeatureStore(repo_path=".") + + # Deploy the feature store to Snowflake + print("Deploying feature store to Snowflake...") + fs.apply([driver, driver_stats_fv]) + + # Select features + features = ["driver_hourly_stats:conv_rate", "driver_hourly_stats:acc_rate"] + + # Create an entity dataframe. This is the dataframe that will be enriched with historical features + entity_df = pd.DataFrame( + { + "event_timestamp": [ + pd.Timestamp(dt, unit="ms", tz="UTC").round("ms") + for dt in pd.date_range( + start=datetime.now() - timedelta(days=3), + end=datetime.now(), + periods=3, + ) + ], + "driver_id": [1001, 1002, 1003], + } + ) + + print("Retrieving training data...") + + # Retrieve historical features by joining the entity dataframe to the Snowflake table source + training_df = fs.get_historical_features( + features=features, entity_df=entity_df + ).to_df() + + print() + print(training_df) + + print() + print("Loading features into the online store...") + fs.materialize_incremental(end_date=datetime.now()) + + print() + print("Retrieving online features...") + + # Retrieve features from the online store + online_features = fs.get_online_features( + features=features, entity_rows=[{"driver_id": 1001}, {"driver_id": 1002}], + ).to_dict() + + print() + print(pd.DataFrame.from_dict(online_features)) + + +if __name__ == "__main__": + main() diff --git a/sdk/python/feast/type_map.py b/sdk/python/feast/type_map.py index 599be85fdf..e39a4ecb81 100644 --- a/sdk/python/feast/type_map.py +++ b/sdk/python/feast/type_map.py @@ -126,6 +126,8 @@ def python_type_to_feast_value_type( "uint64": ValueType.INT64, "int32": ValueType.INT32, "uint32": ValueType.INT32, + "int16": ValueType.INT32, + "uint16": ValueType.INT32, "uint8": ValueType.INT32, "int8": ValueType.INT32, "bool": ValueType.BOOL, @@ -480,6 +482,28 @@ def redshift_to_feast_value_type(redshift_type_as_str: str) -> ValueType: return type_map[redshift_type_as_str.lower()] +def snowflake_python_type_to_feast_value_type( + snowflake_python_type_as_str: str, +) -> ValueType: + + type_map = { + "str": ValueType.STRING, + "float64": ValueType.DOUBLE, + "int64": ValueType.INT64, + "uint64": ValueType.INT64, + "int32": ValueType.INT32, + "uint32": ValueType.INT32, + "int16": ValueType.INT32, + "uint16": ValueType.INT32, + "uint8": 
ValueType.INT32, + "int8": ValueType.INT32, + "datetime64[ns]": ValueType.UNIX_TIMESTAMP, + "object": ValueType.UNKNOWN, + } + + return type_map[snowflake_python_type_as_str.lower()] + + def pa_to_redshift_value_type(pa_type: pyarrow.DataType) -> str: # PyArrow types: https://arrow.apache.org/docs/python/api/datatypes.html # Redshift type: https://docs.aws.amazon.com/redshift/latest/dg/c_Supported_data_types.html diff --git a/sdk/python/requirements/py3.7-ci-requirements.txt b/sdk/python/requirements/py3.7-ci-requirements.txt index 87ab9f9813..293b44e053 100644 --- a/sdk/python/requirements/py3.7-ci-requirements.txt +++ b/sdk/python/requirements/py3.7-ci-requirements.txt @@ -26,6 +26,10 @@ appdirs==1.4.4 # via black asgiref==3.4.1 # via uvicorn +asn1crypto==1.4.0 + # via + # oscrypto + # snowflake-connector-python assertpy==1.1 # via feast (setup.py) async-timeout==4.0.2 @@ -73,16 +77,19 @@ certifi==2021.10.8 # minio # msrest # requests + # snowflake-connector-python cffi==1.15.0 # via # azure-datalake-store # cryptography + # snowflake-connector-python cfgv==3.3.1 # via pre-commit charset-normalizer==2.0.10 # via # aiohttp # requests + # snowflake-connector-python click==8.0.3 # via # black @@ -101,6 +108,8 @@ cryptography==3.3.2 # feast (setup.py) # moto # msal + # pyopenssl + # snowflake-connector-python decorator==5.1.1 # via gcsfs deprecated==1.2.13 @@ -229,6 +238,7 @@ idna==3.3 # via # anyio # requests + # snowflake-connector-python # yarl imagesize==1.3.0 # via sphinx @@ -316,6 +326,8 @@ numpy==1.21.5 # pyarrow oauthlib==3.1.1 # via requests-oauthlib +oscrypto==1.2.1 + # via snowflake-connector-python packaging==21.3 # via # deprecation @@ -329,6 +341,7 @@ pandas==1.3.5 # via # feast (setup.py) # pandavro + # snowflake-connector-python pandavro==1.5.2 # via feast (setup.py) pathspec==0.9.0 @@ -373,7 +386,9 @@ py==1.11.0 py-cpuinfo==8.0.0 # via pytest-benchmark pyarrow==6.0.1 - # via feast (setup.py) + # via + # feast (setup.py) + # snowflake-connector-python pyasn1==0.4.8 # via # pyasn1-modules @@ -384,6 +399,8 @@ pycodestyle==2.8.0 # via flake8 pycparser==2.21 # via cffi +pycryptodomex==3.13.0 + # via snowflake-connector-python pydantic==1.9.0 # via # fastapi @@ -396,6 +413,9 @@ pyjwt[crypto]==2.3.0 # via # adal # msal + # snowflake-connector-python +pyopenssl==21.0.0 + # via snowflake-connector-python pyparsing==3.0.7 # via # httplib2 @@ -444,6 +464,7 @@ pytz==2021.3 # google-api-core # moto # pandas + # snowflake-connector-python pyyaml==6.0 # via # feast (setup.py) @@ -471,6 +492,7 @@ requests==2.27.1 # msrest # requests-oauthlib # responses + # snowflake-connector-python # sphinx requests-oauthlib==1.3.0 # via @@ -497,6 +519,7 @@ six==1.16.0 # mock # msrestazure # pandavro + # pyopenssl # python-dateutil # responses # virtualenv @@ -504,6 +527,8 @@ sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 # via sphinx +snowflake-connector-python[pandas]==2.7.3 + # via feast (setup.py) sphinx==4.3.2 # via # feast (setup.py) diff --git a/sdk/python/requirements/py3.8-ci-requirements.txt b/sdk/python/requirements/py3.8-ci-requirements.txt index 851a0b7054..3cdc118144 100644 --- a/sdk/python/requirements/py3.8-ci-requirements.txt +++ b/sdk/python/requirements/py3.8-ci-requirements.txt @@ -26,6 +26,10 @@ appdirs==1.4.4 # via black asgiref==3.4.1 # via uvicorn +asn1crypto==1.4.0 + # via + # oscrypto + # snowflake-connector-python assertpy==1.1 # via feast (setup.py) async-timeout==4.0.2 @@ -71,16 +75,19 @@ certifi==2021.10.8 # minio # msrest # requests + # snowflake-connector-python 
cffi==1.15.0 # via # azure-datalake-store # cryptography + # snowflake-connector-python cfgv==3.3.1 # via pre-commit charset-normalizer==2.0.10 # via # aiohttp # requests + # snowflake-connector-python click==8.0.3 # via # black @@ -99,6 +106,8 @@ cryptography==3.3.2 # feast (setup.py) # moto # msal + # pyopenssl + # snowflake-connector-python decorator==5.1.1 # via gcsfs deprecated==1.2.13 @@ -227,6 +236,7 @@ idna==3.3 # via # anyio # requests + # snowflake-connector-python # yarl imagesize==1.3.0 # via sphinx @@ -302,6 +312,8 @@ numpy==1.22.1 # pyarrow oauthlib==3.1.1 # via requests-oauthlib +oscrypto==1.2.1 + # via snowflake-connector-python packaging==21.3 # via # deprecation @@ -315,6 +327,7 @@ pandas==1.3.5 # via # feast (setup.py) # pandavro + # snowflake-connector-python pandavro==1.5.2 # via feast (setup.py) pathspec==0.9.0 @@ -359,7 +372,9 @@ py==1.11.0 py-cpuinfo==8.0.0 # via pytest-benchmark pyarrow==6.0.1 - # via feast (setup.py) + # via + # feast (setup.py) + # snowflake-connector-python pyasn1==0.4.8 # via # pyasn1-modules @@ -370,6 +385,8 @@ pycodestyle==2.8.0 # via flake8 pycparser==2.21 # via cffi +pycryptodomex==3.13.0 + # via snowflake-connector-python pydantic==1.9.0 # via # fastapi @@ -382,6 +399,9 @@ pyjwt[crypto]==2.3.0 # via # adal # msal + # snowflake-connector-python +pyopenssl==21.0.0 + # via snowflake-connector-python pyparsing==3.0.7 # via # httplib2 @@ -430,6 +450,7 @@ pytz==2021.3 # google-api-core # moto # pandas + # snowflake-connector-python pyyaml==6.0 # via # feast (setup.py) @@ -457,6 +478,7 @@ requests==2.27.1 # msrest # requests-oauthlib # responses + # snowflake-connector-python # sphinx requests-oauthlib==1.3.0 # via @@ -483,6 +505,7 @@ six==1.16.0 # mock # msrestazure # pandavro + # pyopenssl # python-dateutil # responses # virtualenv @@ -490,6 +513,8 @@ sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 # via sphinx +snowflake-connector-python[pandas]==2.7.3 + # via feast (setup.py) sphinx==4.3.2 # via # feast (setup.py) diff --git a/sdk/python/requirements/py3.9-ci-requirements.txt b/sdk/python/requirements/py3.9-ci-requirements.txt index 76ed9f1237..69247a2c7d 100644 --- a/sdk/python/requirements/py3.9-ci-requirements.txt +++ b/sdk/python/requirements/py3.9-ci-requirements.txt @@ -26,6 +26,10 @@ appdirs==1.4.4 # via black asgiref==3.4.1 # via uvicorn +asn1crypto==1.4.0 + # via + # oscrypto + # snowflake-connector-python assertpy==1.1 # via feast (setup.py) async-timeout==4.0.2 @@ -71,16 +75,19 @@ certifi==2021.10.8 # minio # msrest # requests + # snowflake-connector-python cffi==1.15.0 # via # azure-datalake-store # cryptography + # snowflake-connector-python cfgv==3.3.1 # via pre-commit charset-normalizer==2.0.10 # via # aiohttp # requests + # snowflake-connector-python click==8.0.3 # via # black @@ -99,6 +106,8 @@ cryptography==3.3.2 # feast (setup.py) # moto # msal + # pyopenssl + # snowflake-connector-python decorator==5.1.1 # via gcsfs deprecated==1.2.13 @@ -227,6 +236,7 @@ idna==3.3 # via # anyio # requests + # snowflake-connector-python # yarl imagesize==1.3.0 # via sphinx @@ -300,6 +310,8 @@ numpy==1.22.1 # pyarrow oauthlib==3.1.1 # via requests-oauthlib +oscrypto==1.2.1 + # via snowflake-connector-python packaging==21.3 # via # deprecation @@ -313,6 +325,7 @@ pandas==1.3.5 # via # feast (setup.py) # pandavro + # snowflake-connector-python pandavro==1.5.2 # via feast (setup.py) pathspec==0.9.0 @@ -357,7 +370,9 @@ py==1.11.0 py-cpuinfo==8.0.0 # via pytest-benchmark pyarrow==6.0.1 - # via feast (setup.py) + # via + # feast (setup.py) + # 
snowflake-connector-python pyasn1==0.4.8 # via # pyasn1-modules @@ -368,6 +383,8 @@ pycodestyle==2.8.0 # via flake8 pycparser==2.21 # via cffi +pycryptodomex==3.13.0 + # via snowflake-connector-python pydantic==1.9.0 # via # fastapi @@ -380,6 +397,9 @@ pyjwt[crypto]==2.3.0 # via # adal # msal + # snowflake-connector-python +pyopenssl==21.0.0 + # via snowflake-connector-python pyparsing==3.0.7 # via # httplib2 @@ -428,6 +448,7 @@ pytz==2021.3 # google-api-core # moto # pandas + # snowflake-connector-python pyyaml==6.0 # via # feast (setup.py) @@ -455,6 +476,7 @@ requests==2.27.1 # msrest # requests-oauthlib # responses + # snowflake-connector-python # sphinx requests-oauthlib==1.3.0 # via @@ -481,6 +503,7 @@ six==1.16.0 # mock # msrestazure # pandavro + # pyopenssl # python-dateutil # responses # virtualenv @@ -488,6 +511,8 @@ sniffio==1.2.0 # via anyio snowballstemmer==2.2.0 # via sphinx +snowflake-connector-python[pandas]==2.7.3 + # via feast (setup.py) sphinx==4.3.2 # via # feast (setup.py) diff --git a/sdk/python/setup.py b/sdk/python/setup.py index bae1695bf1..cb5381813b 100644 --- a/sdk/python/setup.py +++ b/sdk/python/setup.py @@ -86,6 +86,10 @@ "docker>=5.0.2", ] +SNOWFLAKE_REQUIRED = [ + "snowflake-connector-python[pandas]>=2.7.3", +] + CI_REQUIRED = ( [ "cryptography==3.3.2", @@ -130,6 +134,7 @@ + GCP_REQUIRED + REDIS_REQUIRED + AWS_REQUIRED + + SNOWFLAKE_REQUIRED ) DEV_REQUIRED = ["mypy-protobuf>=3.1.0", "grpcio-testing==1.*"] + CI_REQUIRED @@ -231,6 +236,7 @@ def run(self): "gcp": GCP_REQUIRED, "aws": AWS_REQUIRED, "redis": REDIS_REQUIRED, + "snowflake": SNOWFLAKE_REQUIRED }, include_package_data=True, license="Apache", diff --git a/sdk/python/tests/integration/feature_repos/repo_configuration.py b/sdk/python/tests/integration/feature_repos/repo_configuration.py index f0fb0b28fd..a9953d5977 100644 --- a/sdk/python/tests/integration/feature_repos/repo_configuration.py +++ b/sdk/python/tests/integration/feature_repos/repo_configuration.py @@ -29,6 +29,9 @@ from tests.integration.feature_repos.universal.data_sources.redshift import ( RedshiftDataSourceCreator, ) +from tests.integration.feature_repos.universal.data_sources.snowflake import ( + SnowflakeDataSourceCreator, +) from tests.integration.feature_repos.universal.feature_views import ( conv_rate_plus_100_feature_view, create_conv_rate_request_data_source, @@ -83,6 +86,12 @@ offline_store_creator=RedshiftDataSourceCreator, online_store=REDIS_CONFIG, ), + # Snowflake configurations + IntegrationTestRepoConfig( + provider="aws", # no list features, no feature server + offline_store_creator=SnowflakeDataSourceCreator, + online_store=REDIS_CONFIG, + ), ] ) full_repo_configs_module = os.environ.get(FULL_REPO_CONFIGS_MODULE_ENV_NAME) diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py new file mode 100644 index 0000000000..ef6902802e --- /dev/null +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py @@ -0,0 +1,82 @@ +import os +import uuid +from typing import Dict, List, Optional + +import pandas as pd + +from feast import SnowflakeSource +from feast.data_source import DataSource +from feast.infra.offline_stores.snowflake import SnowflakeOfflineStoreConfig +from feast.infra.offline_stores.snowflake_source import SavedDatasetSnowflakeStorage +from feast.infra.utils.snowflake_utils import get_snowflake_conn, write_pandas +from feast.repo_config import FeastConfigBaseModel +from 
tests.integration.feature_repos.universal.data_source_creator import ( + DataSourceCreator, +) + + +class SnowflakeDataSourceCreator(DataSourceCreator): + + tables: List[str] = [] + + def __init__(self, project_name: str): + super().__init__() + self.project_name = project_name + self.offline_store_config = SnowflakeOfflineStoreConfig( + type="snowflake.offline", + account=os.environ["SNOWFLAKE_CI_DEPLOYMENT"], + user=os.environ["SNOWFLAKE_CI_USER"], + password=os.environ["SNOWFLAKE_CI_PASSWORD"], + role=os.environ["SNOWFLAKE_CI_ROLE"], + warehouse=os.environ["SNOWFLAKE_CI_WAREHOUSE"], + database="FEAST", + ) + + def create_data_source( + self, + df: pd.DataFrame, + destination_name: str, + suffix: Optional[str] = None, + event_timestamp_column="ts", + created_timestamp_column="created_ts", + field_mapping: Dict[str, str] = None, + ) -> DataSource: + + snowflake_conn = get_snowflake_conn(self.offline_store_config) + + destination_name = self.get_prefixed_table_name(destination_name) + + write_pandas(snowflake_conn, df, destination_name, auto_create_table=True) + + self.tables.append(destination_name) + + return SnowflakeSource( + table=destination_name, + event_timestamp_column=event_timestamp_column, + created_timestamp_column=created_timestamp_column, + date_partition_column="", + field_mapping=field_mapping or {"ts_1": "ts"}, + ) + + def create_saved_dataset_destination(self) -> SavedDatasetSnowflakeStorage: + table = self.get_prefixed_table_name( + f"persisted_ds_{str(uuid.uuid4()).replace('-', '_')}" + ) + self.tables.append(table) + + return SavedDatasetSnowflakeStorage(table_ref=table) + + def create_offline_store_config(self) -> FeastConfigBaseModel: + return self.offline_store_config + + def get_prefixed_table_name(self, suffix: str) -> str: + return f"{self.project_name}_{suffix}" + + def teardown(self): + + snowflake_conn = get_snowflake_conn(self.offline_store_config) + + with snowflake_conn as conn: + cur = conn.cursor() + for table in self.tables: + cur.execute(f'DROP TABLE IF EXISTS "{table}"') diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 147e20aee1..6300ea5f5d 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -469,7 +469,7 @@ def test_historical_features_with_entities_from_query( if not orders_table: raise pytest.skip("Offline source is not sql-based") - entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}" + entity_df_query = f'''SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp" FROM "{orders_table}"''' store.apply([driver(), customer(), location(), *feature_views.values()]) From 1152687dec78107a9f22b7d3b91809594de55efd Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 18:14:09 -0500 Subject: [PATCH 09/19] Add error type and refactor query execution to have retries Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/errors.py | 5 + .../feast/infra/utils/snowflake_utils.py | 93 +++++++++++-------- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index 2dc4576b37..39d741a350 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -320,3 +320,8 @@ def 
__init__(self, actual_class: str, expected_class: str): class FeastInvalidInfraObjectType(Exception): def __init__(self): super().__init__("Could not identify the type of the InfraObject.") + + +class SnowflakeIncompleteConfig(Exception): + def __init__(self, e: KeyError): + super().__init__(f"{e} not defined in a config file or feature_store.yaml file") \ No newline at end of file diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 9660a0fc28..1ba276aaec 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -4,11 +4,15 @@ import string from logging import getLogger from tempfile import TemporaryDirectory -from typing import Iterator, Optional, Sequence, Tuple, TypeVar +from typing import Iterator, Optional, Sequence, Tuple, TypeVar, Union, Dict import pandas as pd import snowflake.connector from snowflake.connector import ProgrammingError, SnowflakeConnection +from snowflake.connector.cursor import SnowflakeCursor +from tenacity import retry, wait_exponential, retry_if_exception_type, stop_after_attempt + +from sdk.python.feast.errors import SnowflakeIncompleteConfig getLogger("snowflake.connector.cursor").disabled = True getLogger("snowflake.connector.connection").disabled = True @@ -16,12 +20,11 @@ logger = getLogger(__name__) -def execute_snowflake_statement(conn: SnowflakeConnection, query): +def execute_snowflake_statement(conn: SnowflakeConnection, query) -> Optional[Union["SnowflakeCursor", None]]: return conn.cursor().execute(query) -def get_snowflake_conn(config, autocommit=True): - +def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: if config.type == "snowflake.offline": config_header = "connections.feast_offline_store" @@ -52,7 +55,7 @@ def get_snowflake_conn(config, autocommit=True): return conn except KeyError as e: - print(f"{e} not defined in a config file or feature_store.yaml file") + raise SnowflakeIncompleteConfig(e) def write_pandas( @@ -154,23 +157,8 @@ def write_pandas( ) if chunk_size is None: chunk_size = len(df) - cursor = conn.cursor() - while True: - try: - stage_name = "".join( - random.choice(string.ascii_lowercase) for _ in range(5) - ) - create_stage_sql = ( - "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ " - '"{stage_name}"' - ).format(stage_name=stage_name) - logger.debug(f"creating stage with '{create_stage_sql}'") - cursor.execute(create_stage_sql, _is_internal=True).fetchall() - break - except ProgrammingError as pe: - if pe.msg.endswith("already exists."): - continue - raise + cursor: SnowflakeCursor = conn.cursor() + stage_name = create_temporary_sfc_stage(cursor) with TemporaryDirectory() as tmp_folder: for i, chunk in chunk_helper(df, chunk_size): @@ -200,25 +188,7 @@ def write_pandas( columns = ",".join(list(df.columns)) if auto_create_table: - while True: - try: - file_format_name = ( - '"' - + "".join(random.choice(string.ascii_lowercase) for _ in range(5)) - + '"' - ) - file_format_sql = ( - f"CREATE FILE FORMAT {file_format_name} " - f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ " - f"TYPE=PARQUET COMPRESSION={compression_map[compression]}" - ) - logger.debug(f"creating file format with '{file_format_sql}'") - cursor.execute(file_format_sql, _is_internal=True) - break - except ProgrammingError as pe: - if pe.msg.endswith("already exists."): - continue - raise + file_format_name = create_file_format(compression, compression_map, cursor) 
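+        # Both create_temporary_sfc_stage (used above) and create_file_format now
+        # retry ProgrammingError (e.g. name collisions) with exponential backoff
+        # and give up after five attempts instead of retrying in an unbounded loop.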
infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))" logger.debug(f"inferring schema with '{infer_schema_sql}'") column_type_mapping = dict( @@ -273,6 +243,47 @@ def write_pandas( ) +@retry( + wait=wait_exponential(multiplier=1, max=4), + retry=retry_if_exception_type(ProgrammingError), + stop=stop_after_attempt(5), + reraise=True, +) +def create_file_format(compression: str, compression_map: Dict[str, str], cursor: SnowflakeCursor) -> str: + file_format_name = ( + '"' + + "".join(random.choice(string.ascii_lowercase) for _ in range(5)) + + '"' + ) + file_format_sql = ( + f"CREATE FILE FORMAT {file_format_name} " + f"/* Python:snowflake.connector.pandas_tools.write_pandas() */ " + f"TYPE=PARQUET COMPRESSION={compression_map[compression]}" + ) + logger.debug(f"creating file format with '{file_format_sql}'") + cursor.execute(file_format_sql, _is_internal=True) + return file_format_name + + +@retry( + wait=wait_exponential(multiplier=1, max=4), + retry=retry_if_exception_type(ProgrammingError), + stop=stop_after_attempt(5), + reraise=True, +) +def create_temporary_sfc_stage(cursor: SnowflakeCursor) -> str: + stage_name = "".join( + random.choice(string.ascii_lowercase) for _ in range(5) + ) + create_stage_sql = ( + "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ " + '"{stage_name}"' + ).format(stage_name=stage_name) + logger.debug(f"creating stage with '{create_stage_sql}'") + cursor.execute(create_stage_sql, _is_internal=True).fetchall() + return stage_name + + T = TypeVar("T", bound=Sequence) From 9a2e0a5af889f5a0b075f1c62cfc7cfd75cb6be0 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 18:34:44 -0500 Subject: [PATCH 10/19] Handle more snowflake errors Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/errors.py | 7 ++++- .../feast/infra/utils/snowflake_utils.py | 29 ++++++++++++++----- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index 39d741a350..c955e3b474 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -324,4 +324,9 @@ def __init__(self): class SnowflakeIncompleteConfig(Exception): def __init__(self, e: KeyError): - super().__init__(f"{e} not defined in a config file or feature_store.yaml file") \ No newline at end of file + super().__init__(f"{e} not defined in a config file or feature_store.yaml file") + + +class SnowflakeQueryUnknownError(Exception): + def __init__(self, query: str): + super().__init__(f"Snowflake query failed: {query}") \ No newline at end of file diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 1ba276aaec..dce5101f86 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -12,7 +12,7 @@ from snowflake.connector.cursor import SnowflakeCursor from tenacity import retry, wait_exponential, retry_if_exception_type, stop_after_attempt -from sdk.python.feast.errors import SnowflakeIncompleteConfig +from sdk.python.feast.errors import SnowflakeIncompleteConfig, SnowflakeQueryUnknownError getLogger("snowflake.connector.cursor").disabled = True getLogger("snowflake.connector.connection").disabled = True @@ -20,8 +20,11 @@ logger = getLogger(__name__) -def execute_snowflake_statement(conn: SnowflakeConnection, query) -> Optional[Union["SnowflakeCursor", None]]: - return 
conn.cursor().execute(query) +def execute_snowflake_statement(conn: SnowflakeConnection, query) -> SnowflakeCursor: + cursor = conn.cursor().execute(query) + if cursor is None: + raise SnowflakeQueryUnknownError(query) + return cursor def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: @@ -191,8 +194,11 @@ def write_pandas( file_format_name = create_file_format(compression, compression_map, cursor) infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@\"{stage_name}\"', file_format=>'{file_format_name}'))" logger.debug(f"inferring schema with '{infer_schema_sql}'") - column_type_mapping = dict( - cursor.execute(infer_schema_sql, _is_internal=True).fetchall() + result_cursor = cursor.execute(infer_schema_sql, _is_internal=True) + if result_cursor is None: + raise SnowflakeQueryUnknownError(infer_schema_sql) + column_type_mapping: Dict[str, str] = dict( + result_cursor.fetchall() ) # Infer schema can return the columns out of order depending on the chunking we do when uploading # so we have to iterate through the dataframe columns to make sure we create the table with its @@ -233,8 +239,12 @@ def write_pandas( on_error=on_error, ) logger.debug("copying into with '{}'".format(copy_into_sql)) - copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall() - cursor.close() + # Snowflake returns the original cursor if the query execution succeeded. + result_cursor = cursor.execute(copy_into_sql, _is_internal=True) + if result_cursor is None: + raise SnowflakeQueryUnknownError(copy_into_sql) + copy_results = result_cursor.fetchall() + result_cursor.close() return ( all(e[1] == "LOADED" for e in copy_results), len(copy_results), @@ -280,7 +290,10 @@ def create_temporary_sfc_stage(cursor: SnowflakeCursor) -> str: '"{stage_name}"' ).format(stage_name=stage_name) logger.debug(f"creating stage with '{create_stage_sql}'") - cursor.execute(create_stage_sql, _is_internal=True).fetchall() + result_cursor = cursor.execute(create_stage_sql, _is_internal=True) + if result_cursor is None: + raise SnowflakeQueryUnknownError(create_stage_sql) + result_cursor.fetchall() return stage_name From 7c24117f032af8e8dfdc2859e87f2428b86df8ba Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 19:10:11 -0500 Subject: [PATCH 11/19] Fix lint errors Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/errors.py | 2 +- .../feast/infra/offline_stores/snowflake.py | 8 +- .../feast/infra/utils/snowflake_utils.py | 80 ++++++++----------- 3 files changed, 37 insertions(+), 53 deletions(-) diff --git a/sdk/python/feast/errors.py b/sdk/python/feast/errors.py index c955e3b474..17147f8a60 100644 --- a/sdk/python/feast/errors.py +++ b/sdk/python/feast/errors.py @@ -329,4 +329,4 @@ def __init__(self, e: KeyError): class SnowflakeQueryUnknownError(Exception): def __init__(self, query: str): - super().__init__(f"Snowflake query failed: {query}") \ No newline at end of file + super().__init__(f"Snowflake query failed: {query}") diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py index b2f014ea0d..ee8cd71ce0 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake.py +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -11,6 +11,7 @@ Optional, Tuple, Union, + cast, ) import numpy as np @@ -212,7 +213,7 @@ def get_historical_features( ) entity_df_event_timestamp_range = _get_entity_df_event_timestamp_range( - entity_df, entity_df_event_timestamp_col, snowflake_conn, 
config, + entity_df, entity_df_event_timestamp_col, snowflake_conn, ) @contextlib.contextmanager @@ -357,7 +358,7 @@ def to_sql(self) -> str: with self._query_generator() as query: return query - def to_arrow_chunks(self, arrow_options: Optional[Dict] = None) -> list: + def to_arrow_chunks(self, arrow_options: Optional[Dict] = None) -> Optional[List]: with self._query_generator() as query: arrow_batches = execute_snowflake_statement( @@ -436,7 +437,6 @@ def _get_entity_df_event_timestamp_range( entity_df: Union[pd.DataFrame, str], entity_df_event_timestamp_col: str, snowflake_conn: SnowflakeConnection, - config: RepoConfig, ) -> Tuple[datetime, datetime]: if isinstance(entity_df, pd.DataFrame): entity_df_event_timestamp = entity_df.loc[ @@ -456,7 +456,7 @@ def _get_entity_df_event_timestamp_range( query = f'SELECT MIN("{entity_df_event_timestamp_col}") AS "min_value", MAX("{entity_df_event_timestamp_col}") AS "max_value" FROM ({entity_df})' results = execute_snowflake_statement(snowflake_conn, query).fetchall() - entity_df_event_timestamp_range = results[0] + entity_df_event_timestamp_range = cast(Tuple[datetime, datetime], results[0]) else: raise InvalidEntityType(type(entity_df)) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index dce5101f86..b9430fd6d3 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -4,15 +4,33 @@ import string from logging import getLogger from tempfile import TemporaryDirectory -from typing import Iterator, Optional, Sequence, Tuple, TypeVar, Union, Dict +from typing import ( + Any, + Dict, + Iterator, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) import pandas as pd import snowflake.connector +from sdk.python.feast.errors import ( + SnowflakeIncompleteConfig, + SnowflakeQueryUnknownError, +) from snowflake.connector import ProgrammingError, SnowflakeConnection from snowflake.connector.cursor import SnowflakeCursor -from tenacity import retry, wait_exponential, retry_if_exception_type, stop_after_attempt - -from sdk.python.feast.errors import SnowflakeIncompleteConfig, SnowflakeQueryUnknownError +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) getLogger("snowflake.connector.cursor").disabled = True getLogger("snowflake.connector.connection").disabled = True @@ -74,25 +92,7 @@ def write_pandas( quote_identifiers: bool = True, auto_create_table: bool = False, create_temp_table: bool = False, -) -> Tuple[ - bool, - int, - int, - Sequence[ - Tuple[ - str, - str, - int, - int, - int, - int, - Optional[str], - Optional[int], - Optional[int], - Optional[str], - ] - ], -]: +): """Allows users to most efficiently write back a pandas DataFrame to Snowflake. It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table. @@ -128,10 +128,6 @@ def write_pandas( auto_create_table: When true, will automatically create a table with corresponding columns for each column in the passed in DataFrame. The table will not be created if it already exists create_temp_table: Will make the auto-created table as a temporary table - - Returns: - Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were - ingested correctly, # of chunks, # of ingested rows, and ingest's output. 
""" if database is not None and schema is None: raise ProgrammingError( @@ -197,9 +193,8 @@ def write_pandas( result_cursor = cursor.execute(infer_schema_sql, _is_internal=True) if result_cursor is None: raise SnowflakeQueryUnknownError(infer_schema_sql) - column_type_mapping: Dict[str, str] = dict( - result_cursor.fetchall() - ) + result = cast(List[Tuple[str, str]], result_cursor.fetchall()) + column_type_mapping: Dict[str, str] = dict(result) # Infer schema can return the columns out of order depending on the chunking we do when uploading # so we have to iterate through the dataframe columns to make sure we create the table with its # columns in order @@ -243,14 +238,8 @@ def write_pandas( result_cursor = cursor.execute(copy_into_sql, _is_internal=True) if result_cursor is None: raise SnowflakeQueryUnknownError(copy_into_sql) - copy_results = result_cursor.fetchall() + copy_results = cast(List[Tuple], result_cursor.fetchall()) result_cursor.close() - return ( - all(e[1] == "LOADED" for e in copy_results), - len(copy_results), - sum(int(e[3]) for e in copy_results), - copy_results, - ) @retry( @@ -259,11 +248,11 @@ def write_pandas( stop=stop_after_attempt(5), reraise=True, ) -def create_file_format(compression: str, compression_map: Dict[str, str], cursor: SnowflakeCursor) -> str: +def create_file_format( + compression: str, compression_map: Dict[str, str], cursor: SnowflakeCursor +) -> str: file_format_name = ( - '"' - + "".join(random.choice(string.ascii_lowercase) for _ in range(5)) - + '"' + '"' + "".join(random.choice(string.ascii_lowercase) for _ in range(5)) + '"' ) file_format_sql = ( f"CREATE FILE FORMAT {file_format_name} " @@ -282,9 +271,7 @@ def create_file_format(compression: str, compression_map: Dict[str, str], cursor reraise=True, ) def create_temporary_sfc_stage(cursor: SnowflakeCursor) -> str: - stage_name = "".join( - random.choice(string.ascii_lowercase) for _ in range(5) - ) + stage_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) create_stage_sql = ( "create temporary stage /* Python:snowflake.connector.pandas_tools.write_pandas() */ " '"{stage_name}"' @@ -297,10 +284,7 @@ def create_temporary_sfc_stage(cursor: SnowflakeCursor) -> str: return stage_name -T = TypeVar("T", bound=Sequence) - - -def chunk_helper(lst: T, n: int) -> Iterator[Tuple[int, T]]: +def chunk_helper(lst: pd.DataFrame, n: int) -> Iterator[Tuple[int, pd.DataFrame]]: """Helper generator to chunk a sequence efficiently with current index like if enumerate was called on sequence.""" for i in range(0, len(lst), n): yield int(i / n), lst[i : i + n] From 5f8fbf4bca2e34a3ac74130fb6db7f9101754f7e Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 19:11:11 -0500 Subject: [PATCH 12/19] Fix lint errors Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/utils/snowflake_utils.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index b9430fd6d3..1afcd17cc8 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -5,15 +5,11 @@ from logging import getLogger from tempfile import TemporaryDirectory from typing import ( - Any, Dict, Iterator, List, Optional, - Sequence, Tuple, - TypeVar, - Union, cast, ) @@ -238,7 +234,6 @@ def write_pandas( result_cursor = cursor.execute(copy_into_sql, _is_internal=True) if result_cursor is None: raise SnowflakeQueryUnknownError(copy_into_sql) - 
copy_results = cast(List[Tuple], result_cursor.fetchall()) result_cursor.close() From 80627c59db9dc292d254a3e48f733c9016838347 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 19:11:38 -0500 Subject: [PATCH 13/19] Fix lint errors Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/utils/snowflake_utils.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 1afcd17cc8..1a520f8f9c 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -4,14 +4,7 @@ import string from logging import getLogger from tempfile import TemporaryDirectory -from typing import ( - Dict, - Iterator, - List, - Optional, - Tuple, - cast, -) +from typing import Dict, Iterator, List, Optional, Tuple, cast import pandas as pd import snowflake.connector From 12c92f1f216eda8e9b3f87bd98f6df8a20deb095 Mon Sep 17 00:00:00 2001 From: Danny Chiao Date: Sun, 30 Jan 2022 19:17:34 -0500 Subject: [PATCH 14/19] Fix wrong import Signed-off-by: Danny Chiao Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/utils/snowflake_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 1a520f8f9c..01a7761bc7 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -8,10 +8,6 @@ import pandas as pd import snowflake.connector -from sdk.python.feast.errors import ( - SnowflakeIncompleteConfig, - SnowflakeQueryUnknownError, -) from snowflake.connector import ProgrammingError, SnowflakeConnection from snowflake.connector.cursor import SnowflakeCursor from tenacity import ( @@ -21,6 +17,8 @@ wait_exponential, ) +from feast.errors import SnowflakeIncompleteConfig, SnowflakeQueryUnknownError + getLogger("snowflake.connector.cursor").disabled = True getLogger("snowflake.connector.connection").disabled = True getLogger("snowflake.connector.network").disabled = True From 17874d7e9d60d349116831ddcd952cbe155ccfe5 Mon Sep 17 00:00:00 2001 From: Nalin Mehra <37969183+NalinGHub@users.noreply.github.com> Date: Sun, 30 Jan 2022 17:52:52 -0500 Subject: [PATCH 15/19] modify registry.db s3 object initialization to work in S3 subdirectory with Java Feast Server (#2259) Signed-off-by: NalinGHub Signed-off-by: sfc-gh-madkins --- .../src/main/java/feast/serving/registry/S3RegistryFile.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/java/serving/src/main/java/feast/serving/registry/S3RegistryFile.java b/java/serving/src/main/java/feast/serving/registry/S3RegistryFile.java index 486e2ca39c..4b122a5de0 100644 --- a/java/serving/src/main/java/feast/serving/registry/S3RegistryFile.java +++ b/java/serving/src/main/java/feast/serving/registry/S3RegistryFile.java @@ -33,7 +33,8 @@ public S3RegistryFile(AmazonS3 s3Client, String url) { this.s3Client = s3Client; String[] split = url.replace("s3://", "").split("/"); - this.s3Object = this.s3Client.getObject(split[0], split[1]); + String objectPath = String.join("/", java.util.Arrays.copyOfRange(split, 1, split.length)); + this.s3Object = this.s3Client.getObject(split[0], objectPath); } @Override From 61172b380ddc2d7d18c7471d05fbba3bdcf64220 Mon Sep 17 00:00:00 2001 From: sfc-gh-madkins Date: Sun, 30 Jan 2022 21:36:26 -0600 Subject: [PATCH 16/19] clean up docs Signed-off-by: 
sfc-gh-madkins --- docs/tutorials/driver-stats-using-snowflake.md | 3 --- sdk/python/feast/data_source.py | 1 - sdk/python/feast/infra/utils/snowflake_utils.py | 4 +++- .../feature_repos/universal/data_sources/snowflake.py | 1 - 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/tutorials/driver-stats-using-snowflake.md b/docs/tutorials/driver-stats-using-snowflake.md index e95dd655ae..c51fc9b1ce 100644 --- a/docs/tutorials/driver-stats-using-snowflake.md +++ b/docs/tutorials/driver-stats-using-snowflake.md @@ -52,9 +52,6 @@ The following files will automatically be created in your project folder: * driver_repo.py -- This is your main feature definition file * test.py -- This is a file to test your feature store configuration -* registry.db -- (Inactive) This file contains the metadata related to your feature store operations -* data/ -- (Inactive) This folder contains the sample data that we will use - #### Inspect `feature_store.yaml` Here you will see the information that you entered. This template will look to use diff --git a/sdk/python/feast/data_source.py b/sdk/python/feast/data_source.py index f7ab2f04d4..94910c6c08 100644 --- a/sdk/python/feast/data_source.py +++ b/sdk/python/feast/data_source.py @@ -34,7 +34,6 @@ class SourceType(enum.Enum): BATCH_BIGQUERY = 2 STREAM_KAFKA = 3 STREAM_KINESIS = 4 - BATCH_SNOWFLAKE = 8 class KafkaOptions: diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index 01a7761bc7..d675fbd84b 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -65,7 +65,9 @@ def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: except KeyError as e: raise SnowflakeIncompleteConfig(e) - +# TO DO -- sfc-gh-madkins +#Remove dependency on write_pandas function by falling back to native snowflake python connector +#Current issue is datetime[ns] types are read incorrectly in Snowflake, need to coerce to datetime[ns, UTC] def write_pandas( conn: SnowflakeConnection, df: pd.DataFrame, diff --git a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py index ef6902802e..1ecae0317b 100644 --- a/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py +++ b/sdk/python/tests/integration/feature_repos/universal/data_sources/snowflake.py @@ -73,7 +73,6 @@ def get_prefixed_table_name(self, suffix: str) -> str: return f"{self.project_name}_{suffix}" def teardown(self): - snowflake_conn = get_snowflake_conn(self.offline_store_config) with snowflake_conn as conn: From 9e952b21759f8abcf1913ba72301e8ba92833f4f Mon Sep 17 00:00:00 2001 From: sfc-gh-madkins Date: Sun, 30 Jan 2022 22:28:23 -0600 Subject: [PATCH 17/19] lint-python Signed-off-by: sfc-gh-madkins --- sdk/python/feast/infra/utils/snowflake_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/utils/snowflake_utils.py b/sdk/python/feast/infra/utils/snowflake_utils.py index d675fbd84b..f280cfa218 100644 --- a/sdk/python/feast/infra/utils/snowflake_utils.py +++ b/sdk/python/feast/infra/utils/snowflake_utils.py @@ -65,9 +65,10 @@ def get_snowflake_conn(config, autocommit=True) -> SnowflakeConnection: except KeyError as e: raise SnowflakeIncompleteConfig(e) + # TO DO -- sfc-gh-madkins -#Remove dependency on write_pandas function by falling back to native snowflake python connector -#Current issue is 
datetime[ns] types are read incorrectly in Snowflake, need to coerce to datetime[ns, UTC] +# Remove dependency on write_pandas function by falling back to native snowflake python connector +# Current issue is datetime[ns] types are read incorrectly in Snowflake, need to coerce to datetime[ns, UTC] def write_pandas( conn: SnowflakeConnection, df: pd.DataFrame, From 3ddaf0d1e7383de1930b599bd999c5c439f8de5f Mon Sep 17 00:00:00 2001 From: sfc-gh-madkins Date: Sun, 30 Jan 2022 22:40:22 -0600 Subject: [PATCH 18/19] fixed historical test Signed-off-by: sfc-gh-madkins --- .../offline_store/test_universal_historical_retrieval.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index 6300ea5f5d..fba012ad1c 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -469,7 +469,13 @@ def test_historical_features_with_entities_from_query( if not orders_table: raise pytest.skip("Offline source is not sql-based") - entity_df_query = f'''SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp" FROM "{orders_table}"''' + if ( + environment.test_repo_config.offline_store_creator.__name__ + == SnowflakeDataSourceCreator.__name__ + ): + entity_df_query = f'''SELECT "customer_id", "driver_id", "order_id", "origin_id", "destination_id", "event_timestamp" FROM "{orders_table}"''' + else: + entity_df_query = f"SELECT customer_id, driver_id, order_id, origin_id, destination_id, event_timestamp FROM {orders_table}" store.apply([driver(), customer(), location(), *feature_views.values()]) From 6c29bfb0cf98e4bce86843557471772f0ee20300 Mon Sep 17 00:00:00 2001 From: sfc-gh-madkins Date: Sun, 30 Jan 2022 22:46:13 -0600 Subject: [PATCH 19/19] fixed historical test Signed-off-by: sfc-gh-madkins --- .../offline_store/test_universal_historical_retrieval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py index fba012ad1c..4a396c7e4d 100644 --- a/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py +++ b/sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py @@ -26,6 +26,9 @@ construct_universal_feature_views, table_name_from_data_source, ) +from tests.integration.feature_repos.universal.data_sources.snowflake import ( + SnowflakeDataSourceCreator, +) from tests.integration.feature_repos.universal.entities import ( customer, driver,