Skip to content

Commit

Permalink
Convert nested join in Vector Queries to Pandas Merge. (#1298)
Browse files Browse the repository at this point in the history
Profiling the vector scan showed that we are spending a lot of time in
the post-processing logic performing a nested join. This is an initial commit
that replaces the nested join with a Pandas merge. The change showed a ~50%
improvement in similarity queries.
  • Loading branch information
Chitti-Ankith authored Oct 26, 2023
1 parent 703dc94 commit f420faa
Showing 1 changed file with 26 additions and 12 deletions.
38 changes: 26 additions & 12 deletions evadb/executor/vector_index_scan_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,6 @@ def _evadb_vector_index_scan(self, *args, **kwargs):
# todo support queries over distance as well
# distance_list = index_result.similarities
row_num_np = index_result.ids

# Load projected columns from disk and join with search results.
row_num_col_name = None

Expand All @@ -126,20 +125,35 @@ def _evadb_vector_index_scan(self, *args, **kwargs):
f"The index {self.index_name} returned only {num_required_results} results, which is fewer than the required {self.limit_count.value}."
)

res_row_list = [None for _ in range(num_required_results)]
final_df = pd.DataFrame()
res_data_list = []
row_num_df = pd.DataFrame({"row_num_np": row_num_np})
for batch in self.children[0].exec(**kwargs):
column_list = batch.columns
if not row_num_col_name:
column_list = batch.columns
row_num_alias = get_row_num_column_alias(column_list)
row_num_col_name = "{}.{}".format(row_num_alias, ROW_NUM_COLUMN)

# Nested join.
for _, row in batch.frames.iterrows():
for idx, row_num in enumerate(row_num_np):
if row_num == row[row_num_col_name]:
res_row = dict()
for col_name in column_list:
res_row[col_name] = row[col_name]
res_row_list[idx] = res_row
if not batch.frames[row_num_col_name].isin(row_num_df["row_num_np"]).any():
continue

for index, row in batch.frames.iterrows():
row_dict = row.to_dict()
res_data_list.append(row_dict)

result_df = pd.DataFrame(res_data_list)
result_df.set_index(row_num_col_name, inplace=True)
result_df = result_df.reindex(row_num_np)
row_num_df.set_index(pd.Index(row_num_np), inplace=True)

final_df = pd.merge(
row_num_df,
result_df,
left_index=True,
right_index=True,
how="left",
)

yield Batch(pd.DataFrame(res_row_list))
if "row_num_np" in final_df:
del final_df["row_num_np"]
yield Batch(final_df)

0 comments on commit f420faa

Please sign in to comment.