Document how pandas deals with missing values

Signed-off-by: Achal Shah <[email protected]>
feast-dev · Jul 30, 2021 · dc1cf1b · dc1cf1b
1 parent de8a1b6
commit dc1cf1b
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 3 deletions.
diff --git a/sdk/python/feast/infra/offline_stores/file.py b/sdk/python/feast/infra/offline_stores/file.py
@@ -111,7 +111,7 @@ def evaluate_historical_retrieval():
                     feature_view.batch_source.created_timestamp_column
                 )
 
-                # Read offline parquet data in pyarrow format
+                # Read offline parquet data in pyarrow format.
                 table = pyarrow.parquet.read_table(feature_view.batch_source.path)
 
                 # Rename columns by the field mapping dictionary if it exists
@@ -120,7 +120,11 @@ def evaluate_historical_retrieval():
                         table, feature_view.batch_source.field_mapping
                     )
 
-                # Convert pyarrow table to pandas dataframe
+                # Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
+                # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
+                # If the dtype is 'object', then missing values are inferred as python `None`s.
+                # More details at:
+                # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
                 df_to_join = table.to_pandas()
 
                 # Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC

diff --git a/sdk/python/feast/infra/utils/aws_utils.py b/sdk/python/feast/infra/utils/aws_utils.py
@@ -178,7 +178,12 @@ def upload_df_to_redshift(
     # Drop the index so that we dont have unnecessary columns
     df.reset_index(drop=True, inplace=True)
 
-    # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema
+    # Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema.
+    # Note, if the underlying data has missing values,
+    # pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
+    # If the dtype is 'object', then missing values are inferred as python `None`s.
+    # More details at:
+    # https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
     table = pa.Table.from_pandas(df)
     column_names, column_types = [], []
     for field in table.schema: