From dc1cf1bc231e2ce1c002b390e00aa61e9e5396a2 Mon Sep 17 00:00:00 2001 From: Achal Shah Date: Thu, 29 Jul 2021 16:55:48 -0700 Subject: [PATCH] Document how pandas deals with missing values Signed-off-by: Achal Shah --- sdk/python/feast/infra/offline_stores/file.py | 8 ++++++-- sdk/python/feast/infra/utils/aws_utils.py | 7 ++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py index 0e4657eec5..4461ef842b 100644 --- a/sdk/python/feast/infra/offline_stores/file.py +++ b/sdk/python/feast/infra/offline_stores/file.py @@ -111,7 +111,7 @@ def evaluate_historical_retrieval(): feature_view.batch_source.created_timestamp_column ) - # Read offline parquet data in pyarrow format + # Read offline parquet data in pyarrow format. table = pyarrow.parquet.read_table(feature_view.batch_source.path) # Rename columns by the field mapping dictionary if it exists @@ -120,7 +120,11 @@ def evaluate_historical_retrieval(): table, feature_view.batch_source.field_mapping ) - # Convert pyarrow table to pandas dataframe + # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values, + # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean. + # If the dtype is 'object', then missing values are inferred as python `None`s. + # More details at: + # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing df_to_join = table.to_pandas() # Make sure all timestamp fields are tz-aware. 
We default tz-naive fields to UTC diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py index 8ea636dc3e..fcd84d4b6f 100644 --- a/sdk/python/feast/infra/utils/aws_utils.py +++ b/sdk/python/feast/infra/utils/aws_utils.py @@ -178,7 +178,12 @@ def upload_df_to_redshift( # Drop the index so that we dont have unnecessary columns df.reset_index(drop=True, inplace=True) - # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema + # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema. + # Note, if the underlying data has missing values, + # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean. + # If the dtype is 'object', then missing values are inferred as python `None`s. + # More details at: + # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing table = pa.Table.from_pandas(df) column_names, column_types = [], [] for field in table.schema: