Skip to content

Commit

Permalink
Merge nested loop join build vectors
Browse files Browse the repository at this point in the history
Summary: When the build vectors in a nested loop join are small, we should merge them to get better performance. In some extreme cases, the performance difference can be more than 100 times.

Differential Revision: D65450017
  • Loading branch information
Yuhta authored and facebook-github-bot committed Nov 4, 2024
1 parent c95f1e0 commit 7c3a23f
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 0 deletions.
29 changes: 29 additions & 0 deletions velox/exec/NestedLoopJoinBuild.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,34 @@ BlockingReason NestedLoopJoinBuild::isBlocked(ContinueFuture* future) {
return BlockingReason::kWaitForJoinBuild;
}

std::vector<RowVectorPtr> NestedLoopJoinBuild::mergeDataVectors() const {
const auto maxBatchRows =
operatorCtx_->task()->queryCtx()->queryConfig().maxOutputBatchRows();
std::vector<RowVectorPtr> merged;
for (int i = 0; i < dataVectors_.size();) {
auto batchSize = dataVectors_[i]->size();
auto j = i + 1;
while (j < dataVectors_.size() &&
batchSize + dataVectors_[j]->size() <= maxBatchRows) {
batchSize += dataVectors_[j++]->size();
}
if (j == i + 1) {
merged.push_back(dataVectors_[i++]);
} else {
auto batch = BaseVector::create<RowVector>(
dataVectors_[i]->type(), batchSize, pool());
batchSize = 0;
while (i < j) {
auto* source = dataVectors_[i++].get();
batch->copy(source, batchSize, 0, source->size());
batchSize += source->size();
}
merged.push_back(std::move(batch));
}
}
return merged;
}

void NestedLoopJoinBuild::noMoreInput() {
Operator::noMoreInput();
std::vector<ContinuePromise> promises;
Expand Down Expand Up @@ -105,6 +133,7 @@ void NestedLoopJoinBuild::noMoreInput() {
}
}

dataVectors_ = mergeDataVectors();
operatorCtx_->task()
->getNestedLoopJoinBridge(
operatorCtx_->driverCtx()->splitGroupId, planNodeId())
Expand Down
2 changes: 2 additions & 0 deletions velox/exec/NestedLoopJoinBuild.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class NestedLoopJoinBuild : public Operator {
}

private:
std::vector<RowVectorPtr> mergeDataVectors() const;

std::vector<RowVectorPtr> dataVectors_;

// Future for synchronizing with other Drivers of the same pipeline. All build
Expand Down

0 comments on commit 7c3a23f

Please sign in to comment.