From 5f38daf30d7b0d4bbb75a06957c5d1aa4c7107fd Mon Sep 17 00:00:00 2001 From: "john.lemmon" Date: Mon, 12 Feb 2024 14:09:26 -0600 Subject: [PATCH] Bugfix for grabbing historical data from Snowflake with array type features that are null for an entity. Signed-off-by: john.lemmon Update docs to reflect array support in Snowflake Signed-off-by: john.lemmon --- docs/reference/data-sources/overview.md | 20 +++++++------- docs/reference/data-sources/snowflake.md | 2 +- .../feast/infra/offline_stores/snowflake.py | 4 ++- .../infra/offline_stores/test_snowflake.py | 26 +++++++++++++++++++ 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/docs/reference/data-sources/overview.md b/docs/reference/data-sources/overview.md index 112d4168d3..302c19b049 100644 --- a/docs/reference/data-sources/overview.md +++ b/docs/reference/data-sources/overview.md @@ -19,13 +19,13 @@ Details for each specific data source can be found [here](README.md). Below is a matrix indicating which data sources support which types. | | File | BigQuery | Snowflake | Redshift | Postgres | Spark | Trino | -| :-------------------------------- | :-- | :-- | :-- | :-- | :-- | :-- | :-- | -| `bytes` | yes | yes | yes | yes | yes | yes | yes | -| `string` | yes | yes | yes | yes | yes | yes | yes | -| `int32` | yes | yes | yes | yes | yes | yes | yes | -| `int64` | yes | yes | yes | yes | yes | yes | yes | -| `float32` | yes | yes | yes | yes | yes | yes | yes | -| `float64` | yes | yes | yes | yes | yes | yes | yes | -| `bool` | yes | yes | yes | yes | yes | yes | yes | -| `timestamp` | yes | yes | yes | yes | yes | yes | yes | -| array types | yes | yes | no | no | yes | yes | no | \ No newline at end of file +| :-------------------------------- | :-- | :-- |:----------| :-- | :-- | :-- | :-- | +| `bytes` | yes | yes | yes | yes | yes | yes | yes | +| `string` | yes | yes | yes | yes | yes | yes | yes | +| `int32` | yes | yes | yes | yes | yes | yes | yes | +| `int64` | yes | yes | yes | yes | yes | yes | yes | +| `float32` | yes | yes | yes | yes | yes | yes | yes | +| `float64` | yes | yes | yes | yes | yes | yes | yes | +| `bool` | yes | yes | yes | yes | yes | yes | yes | +| `timestamp` | yes | yes | yes | yes | yes | yes | yes | +| array types | yes | yes | yes | no | yes | yes | no | \ No newline at end of file diff --git a/docs/reference/data-sources/snowflake.md b/docs/reference/data-sources/snowflake.md index 82bf5cb4d4..98a56e09f8 100644 --- a/docs/reference/data-sources/snowflake.md +++ b/docs/reference/data-sources/snowflake.md @@ -46,5 +46,5 @@ The full set of configuration options is available [here](https://rtd.feast.dev/ ## Supported Types -Snowflake data sources support all eight primitive types, but currently do not support array types. +Snowflake data sources support all eight primitive types. Array types are also supported but not with type inference. For a comparison against other batch data sources, please see [here](overview.md#functionality-matrix). diff --git a/sdk/python/feast/infra/offline_stores/snowflake.py b/sdk/python/feast/infra/offline_stores/snowflake.py index 32cda2d6b6..14752fd857 100644 --- a/sdk/python/feast/infra/offline_stores/snowflake.py +++ b/sdk/python/feast/infra/offline_stores/snowflake.py @@ -463,7 +463,9 @@ def _to_df_internal(self, timeout: Optional[int] = None) -> pd.DataFrame: Array(Float32), Array(Bool), ]: - df[feature.name] = [json.loads(x) for x in df[feature.name]] + df[feature.name] = [ + json.loads(x) if x else None for x in df[feature.name] + ] return df diff --git a/sdk/python/tests/unit/infra/offline_stores/test_snowflake.py b/sdk/python/tests/unit/infra/offline_stores/test_snowflake.py index afc3ae97ae..ac55f123bb 100644 --- a/sdk/python/tests/unit/infra/offline_stores/test_snowflake.py +++ b/sdk/python/tests/unit/infra/offline_stores/test_snowflake.py @@ -1,14 +1,18 @@ import re from unittest.mock import ANY, MagicMock, patch +import pandas as pd import pytest +from pytest_mock import MockFixture +from feast import FeatureView, Field, FileSource from feast.infra.offline_stores.snowflake import ( SnowflakeOfflineStoreConfig, SnowflakeRetrievalJob, ) from feast.infra.online_stores.sqlite import SqliteOnlineStoreConfig from feast.repo_config import RepoConfig +from feast.types import Array, String @pytest.fixture(params=["s3", "s3gov"]) @@ -55,3 +59,25 @@ def test_to_remote_storage(retrieval_job): mock_get_file_names_from_copy.assert_called_once_with(ANY, ANY) native_path = mock_get_file_names_from_copy.call_args[0][1] assert re.match("^s3://.*", native_path), "path should be s3://*" + + +def test_snowflake_to_df_internal( + retrieval_job: SnowflakeRetrievalJob, mocker: MockFixture +): + mock_execute = mocker.patch( + "feast.infra.offline_stores.snowflake.execute_snowflake_statement" + ) + mock_execute.return_value.fetch_pandas_all.return_value = pd.DataFrame.from_dict( + {"feature1": ['["1", "2", "3"]', None, "[]"]} # For Valid, Null, and Empty + ) + + feature_view = FeatureView( + name="my-feature-view", + entities=[], + schema=[ + Field(name="feature1", dtype=Array(String)), + ], + source=FileSource(path="dummy.path"), # Dummy value + ) + retrieval_job._feature_views = [feature_view] + retrieval_job._to_df_internal()