From 1165d871e572622e94457c6561bdcccd011aeade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gunnar=20Sv=20Sigurbj=C3=B6rnsson?= Date: Thu, 28 Oct 2021 13:22:51 +0000 Subject: [PATCH] Improve performance of _convert_arrow_to_proto MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _convert_arrow_to_proto does multiple lookups in the list of column names for the arrow table to find the index of columns. This is done for each row so the time complexity grows quickly. Rather than search the list of names for the index we construct a dictionary {column_name: index}. This allows faster lookups for the index of a column and speeds up the method significantly which affects e.g. materialization. Signed-off-by: Gunnar Sv Sigurbjörnsson --- sdk/python/feast/infra/provider.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 4b7c8069f2..da3fb2c628 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -304,27 +304,28 @@ def _coerce_datetime(ts): else: return ts + column_names_idx = {k: i for i, k in enumerate(table.column_names)} for row in zip(*table.to_pydict().values()): entity_key = EntityKeyProto() for join_key in join_keys: entity_key.join_keys.append(join_key) - idx = table.column_names.index(join_key) + idx = column_names_idx[join_key] value = python_value_to_proto_value(row[idx]) entity_key.entity_values.append(value) feature_dict = {} for feature in feature_view.features: - idx = table.column_names.index(feature.name) + idx = column_names_idx[feature.name] value = python_value_to_proto_value(row[idx], feature.dtype) feature_dict[feature.name] = value - event_timestamp_idx = table.column_names.index( + event_timestamp_idx = column_names_idx[ feature_view.batch_source.event_timestamp_column - ) + ] event_timestamp = _coerce_datetime(row[event_timestamp_idx]) if feature_view.batch_source.created_timestamp_column: - created_timestamp_idx = table.column_names.index( + created_timestamp_idx = column_names_idx[ feature_view.batch_source.created_timestamp_column - ) + ] created_timestamp = _coerce_datetime(row[created_timestamp_idx]) else: created_timestamp = None