From f420faa09f8350782011a744edd332de4c16a2d9 Mon Sep 17 00:00:00 2001 From: Chitti Ankith Date: Wed, 25 Oct 2023 20:30:47 -0400 Subject: [PATCH] Convert nested join in Vector Queries to Pandas Merge. (#1298) Profiling on Vector Scan showed that we are spending a lot of time in the post-processing logic doing a Nested Join. This is an initial commit to change that into a Join using Pandas. Change showed ~50% improvement in Similarity Queries. --- evadb/executor/vector_index_scan_executor.py | 38 +++++++++++++------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/evadb/executor/vector_index_scan_executor.py b/evadb/executor/vector_index_scan_executor.py index b2e1bb219e..0d6ee58c4f 100644 --- a/evadb/executor/vector_index_scan_executor.py +++ b/evadb/executor/vector_index_scan_executor.py @@ -114,7 +114,6 @@ def _evadb_vector_index_scan(self, *args, **kwargs): # todo support queries over distance as well # distance_list = index_result.similarities row_num_np = index_result.ids - # Load projected columns from disk and join with search results. row_num_col_name = None @@ -126,20 +125,35 @@ def _evadb_vector_index_scan(self, *args, **kwargs): f"The index {self.index_name} returned only {num_required_results} results, which is fewer than the required {self.limit_count.value}." ) - res_row_list = [None for _ in range(num_required_results)] + final_df = pd.DataFrame() + res_data_list = [] + row_num_df = pd.DataFrame({"row_num_np": row_num_np}) for batch in self.children[0].exec(**kwargs): - column_list = batch.columns if not row_num_col_name: + column_list = batch.columns row_num_alias = get_row_num_column_alias(column_list) row_num_col_name = "{}.{}".format(row_num_alias, ROW_NUM_COLUMN) - # Nested join. - for _, row in batch.frames.iterrows(): - for idx, row_num in enumerate(row_num_np): - if row_num == row[row_num_col_name]: - res_row = dict() - for col_name in column_list: - res_row[col_name] = row[col_name] - res_row_list[idx] = res_row + if not batch.frames[row_num_col_name].isin(row_num_df["row_num_np"]).any(): + continue + + for index, row in batch.frames.iterrows(): + row_dict = row.to_dict() + res_data_list.append(row_dict) + + result_df = pd.DataFrame(res_data_list) + result_df.set_index(row_num_col_name, inplace=True) + result_df = result_df.reindex(row_num_np) + row_num_df.set_index(pd.Index(row_num_np), inplace=True) + + final_df = pd.merge( + row_num_df, + result_df, + left_index=True, + right_index=True, + how="left", + ) - yield Batch(pd.DataFrame(res_row_list)) + if "row_num_np" in final_df: + del final_df["row_num_np"] + yield Batch(final_df)