From dc1cf1bc231e2ce1c002b390e00aa61e9e5396a2 Mon Sep 17 00:00:00 2001
From: Achal Shah <achals@gmail.com>
Date: Thu, 29 Jul 2021 16:55:48 -0700
Subject: [PATCH] Document how pandas deals with missing values

Signed-off-by: Achal Shah <achals@gmail.com>
---
 sdk/python/feast/infra/offline_stores/file.py | 8 ++++++--
 sdk/python/feast/infra/utils/aws_utils.py     | 7 ++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py
index 0e4657eec5..4461ef842b 100644
--- a/sdk/python/feast/infra/offline_stores/file.py
+++ b/sdk/python/feast/infra/offline_stores/file.py
@@ -111,7 +111,7 @@ def evaluate_historical_retrieval():
                     feature_view.batch_source.created_timestamp_column
                 )
 
-                # Read offline parquet data in pyarrow format
+                # Read offline parquet data in pyarrow format.
                 table = pyarrow.parquet.read_table(feature_view.batch_source.path)
 
                 # Rename columns by the field mapping dictionary if it exists
@@ -120,7 +120,11 @@ def evaluate_historical_retrieval():
                         table, feature_view.batch_source.field_mapping
                     )
 
-                # Convert pyarrow table to pandas dataframe
+                # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
+                # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
+                # If the dtype is 'object', then missing values are inferred as python `None`s.
+                # More details at:
+                # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
                 df_to_join = table.to_pandas()
 
                 # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py
index 8ea636dc3e..fcd84d4b6f 100644
--- a/sdk/python/feast/infra/utils/aws_utils.py
+++ b/sdk/python/feast/infra/utils/aws_utils.py
@@ -178,7 +178,12 @@ def upload_df_to_redshift(
     # Drop the index so that we dont have unnecessary columns
     df.reset_index(drop=True, inplace=True)
 
-    # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema
+    # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema.
+    # Note, if the underlying data has missing values,
+    # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
+    # If the dtype is 'object', then missing values are inferred as python `None`s.
+    # More details at:
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
     table = pa.Table.from_pandas(df)
     column_names, column_types = [], []
     for field in table.schema: