Skip to content

Commit

Permalink
Document how pandas deals with missing values
Browse files Browse the repository at this point in the history
Signed-off-by: Achal Shah <[email protected]>
  • Loading branch information
achals committed Jul 30, 2021
1 parent de8a1b6 commit dc1cf1b
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
8 changes: 6 additions & 2 deletions sdk/python/feast/infra/offline_stores/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def evaluate_historical_retrieval():
feature_view.batch_source.created_timestamp_column
)

# Read offline parquet data in pyarrow format
# Read offline parquet data in pyarrow format.
table = pyarrow.parquet.read_table(feature_view.batch_source.path)

# Rename columns by the field mapping dictionary if it exists
Expand All @@ -120,7 +120,11 @@ def evaluate_historical_retrieval():
table, feature_view.batch_source.field_mapping
)

# Convert pyarrow table to pandas dataframe
# Convert pyarrow table to pandas dataframe. Note, if the underlying data has missing values,
# pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
# If the dtype is 'object', then missing values are inferred as Python `None`s.
# More details at:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
df_to_join = table.to_pandas()

# Make sure all timestamp fields are tz-aware. We default tz-naive fields to UTC
Expand Down
7 changes: 6 additions & 1 deletion sdk/python/feast/infra/utils/aws_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,12 @@ def upload_df_to_redshift(
# Drop the index so that we don't have unnecessary columns
df.reset_index(drop=True, inplace=True)

# Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema
# Convert Pandas DataFrame into PyArrow table and compile the Redshift table schema.
# Note, if the underlying data has missing values,
# pandas will convert those values to np.nan if the dtypes are numerical (floats, ints, etc.) or boolean.
# If the dtype is 'object', then missing values are inferred as Python `None`s.
# More details at:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html#values-considered-missing
table = pa.Table.from_pandas(df)
column_names, column_types = [], []
for field in table.schema:
Expand Down

0 comments on commit dc1cf1b

Please sign in to comment.