From 4731ed93eb243f821e5d94d3bf15548568e376a6 Mon Sep 17 00:00:00 2001 From: "jie.wang" <38901892+jievince@users.noreply.github.com> Date: Fri, 2 Dec 2022 10:05:41 +0800 Subject: [PATCH] fix duplicate output column names of join (#4965) --- .../executor/query/InnerJoinExecutor.cpp | 12 -- src/graph/executor/query/InnerJoinExecutor.h | 3 - src/graph/executor/query/JoinExecutor.cpp | 43 ++++- src/graph/executor/query/JoinExecutor.h | 6 + src/graph/executor/test/JoinTest.cpp | 164 ++++++++++++++++++ src/graph/planner/plan/Query.cpp | 8 +- .../features/match/MultiQueryParts.feature | 67 +++++++ 7 files changed, 277 insertions(+), 26 deletions(-) diff --git a/src/graph/executor/query/InnerJoinExecutor.cpp b/src/graph/executor/query/InnerJoinExecutor.cpp index ee835b20478..0379b0f558a 100644 --- a/src/graph/executor/query/InnerJoinExecutor.cpp +++ b/src/graph/executor/query/InnerJoinExecutor.cpp @@ -244,18 +244,6 @@ void InnerJoinExecutor::buildNewRow(const std::unordered_mapkind() == PlanNode::Kind::kBiInnerJoin) { return node_->asNode()->leftInputVar(); diff --git a/src/graph/executor/query/InnerJoinExecutor.h b/src/graph/executor/query/InnerJoinExecutor.h index f7deb1162f6..4cdbf95d9db 100644 --- a/src/graph/executor/query/InnerJoinExecutor.h +++ b/src/graph/executor/query/InnerJoinExecutor.h @@ -48,9 +48,6 @@ class InnerJoinExecutor : public JoinExecutor { Row rRow, DataSet& ds) const; - // concat rows - Row newRow(Row left, Row right) const; - const std::string& leftVar() const; const std::string& rightVar() const; diff --git a/src/graph/executor/query/JoinExecutor.cpp b/src/graph/executor/query/JoinExecutor.cpp index b35d03cae8e..897cacf848c 100644 --- a/src/graph/executor/query/JoinExecutor.cpp +++ b/src/graph/executor/query/JoinExecutor.cpp @@ -49,6 +49,25 @@ Status JoinExecutor::checkBiInputDataSets() { return Status::Error(ss.str()); } colSize_ = join->colNames().size(); + + auto& lhsResult = ectx_->getResult(join->leftInputVar()); + auto& lhsColNames = lhsResult.valuePtr()->getDataSet().colNames; + auto lhsColSize = lhsColNames.size(); + + auto& rhsResult = ectx_->getResult(join->rightInputVar()); + auto& rhsColNames = rhsResult.valuePtr()->getDataSet().colNames; + auto rhsColSize = rhsColNames.size(); + + DCHECK_LE(colSize_, lhsColSize + rhsColSize); + if (colSize_ < lhsColSize + rhsColSize) { + for (size_t i = 0; i < rhsColSize; ++i) { + auto it = std::find(lhsColNames.begin(), lhsColNames.end(), rhsColNames[i]); + if (it == lhsColNames.end()) { + rhsOutputColIdxs_.push_back(i); + } + } + } + return Status::OK(); } @@ -83,15 +102,21 @@ void JoinExecutor::buildSingleKeyHashTable( } Row JoinExecutor::newRow(Row left, Row right) const { - Row r; - r.reserve(left.size() + right.size()); - r.values.insert(r.values.end(), - std::make_move_iterator(left.values.begin()), - std::make_move_iterator(left.values.end())); - r.values.insert(r.values.end(), - std::make_move_iterator(right.values.begin()), - std::make_move_iterator(right.values.end())); - return r; + Row row; + row.reserve(left.size() + right.size()); + row.values.insert(row.values.end(), + std::make_move_iterator(left.values.begin()), + std::make_move_iterator(left.values.end())); + if (!rhsOutputColIdxs_.empty()) { + for (auto idx : rhsOutputColIdxs_) { + row.values.emplace_back(std::move(right.values[idx])); + } + } else { + row.values.insert(row.values.end(), + std::make_move_iterator(right.values.begin()), + std::make_move_iterator(right.values.end())); + } + return row; } } // namespace graph diff --git a/src/graph/executor/query/JoinExecutor.h b/src/graph/executor/query/JoinExecutor.h index e759565b649..68a6078bc95 100644 --- a/src/graph/executor/query/JoinExecutor.h +++ b/src/graph/executor/query/JoinExecutor.h @@ -33,7 +33,13 @@ class JoinExecutor : public Executor { std::unique_ptr lhsIter_; std::unique_ptr rhsIter_; + // colSize_ is the size of the output columns of join executor. + // If the join is natural join, which means the left and right columns have duplicated names, + // the output columns will be a intersection of the left and right columns. size_t colSize_{0}; + // If the join is natural join, rhsOutputColIdxs_ will be used to record the output column index + // of the right. If not, rhsOutputColIdxs_ will be empty. + std::vector rhsOutputColIdxs_; std::unordered_map> hashTable_; std::unordered_map> listHashTable_; }; diff --git a/src/graph/executor/test/JoinTest.cpp b/src/graph/executor/test/JoinTest.cpp index d5547b63ea4..98fecc66c00 100644 --- a/src/graph/executor/test/JoinTest.cpp +++ b/src/graph/executor/test/JoinTest.cpp @@ -9,6 +9,7 @@ #include "graph/executor/query/InnerJoinExecutor.h" #include "graph/executor/query/LeftJoinExecutor.h" #include "graph/executor/test/QueryTestBase.h" +#include "graph/planner/plan/Logic.h" #include "graph/planner/plan/Query.h" namespace nebula { @@ -68,6 +69,43 @@ class JoinTest : public QueryTestBase { qctx_->symTable()->newVariable("empty_var2"); qctx_->ectx()->setResult("empty_var2", ResultBuilder().value(Value(std::move(ds))).build()); } + { + DataSet ds; + ds.colNames = {"v1", "e1", "v2", "v3"}; + Row row1; + row1.values.emplace_back(1); + row1.values.emplace_back(2); + row1.values.emplace_back(3); + row1.values.emplace_back(4); + ds.rows.emplace_back(std::move(row1)); + + Row row2; + row2.values.emplace_back(11); + row2.values.emplace_back(12); + row2.values.emplace_back(3); + row2.values.emplace_back(4); + ds.rows.emplace_back(std::move(row2)); + + qctx_->symTable()->newVariable("var4"); + qctx_->ectx()->setResult("var4", ResultBuilder().value(Value(std::move(ds))).build()); + } + { + DataSet ds; + ds.colNames = {"v2", "e2", "v3"}; + Row row1; + row1.values.emplace_back(3); + row1.values.emplace_back(0); + row1.values.emplace_back(4); + ds.rows.emplace_back(std::move(row1)); + + Row row2; + row2.values.emplace_back(3); + row2.values.emplace_back(0); + row2.values.emplace_back(5); + ds.rows.emplace_back(std::move(row2)); + qctx_->symTable()->newVariable("var5"); + qctx_->ectx()->setResult("var5", ResultBuilder().value(Value(std::move(ds))).build()); + } } void testInnerJoin(std::string left, std::string right, DataSet& expected, int64_t line); @@ -171,6 +209,65 @@ TEST_F(JoinTest, InnerJoin) { testInnerJoin("var2", "var1", expected, __LINE__); } +TEST_F(JoinTest, BiInnerJoin) { + DataSet expected; + expected.colNames = {"v1", "e1", "v2", "v3", "e2"}; + Row row1; + row1.values.emplace_back(1); + row1.values.emplace_back(2); + row1.values.emplace_back(3); + row1.values.emplace_back(4); + row1.values.emplace_back(0); + expected.rows.emplace_back(std::move(row1)); + + Row row2; + row2.values.emplace_back(11); + row2.values.emplace_back(12); + row2.values.emplace_back(3); + row2.values.emplace_back(4); + row2.values.emplace_back(0); + expected.rows.emplace_back(std::move(row2)); + + // $var4 inner join $var5 on $var4.v2 = $var5.v2, $var4.v3 = $var4.v3 + auto key1 = VariablePropertyExpression::make(pool_, "var4", "v2"); + auto key2 = VariablePropertyExpression::make(pool_, "var4", "v3"); + std::vector hashKeys = {key1, key2}; + auto probe1 = VariablePropertyExpression::make(pool_, "var5", "v2"); + auto probe2 = VariablePropertyExpression::make(pool_, "var5", "v3"); + std::vector probeKeys = {probe1, probe2}; + + auto lhs = Project::make(qctx_.get(), nullptr, nullptr); + lhs->setOutputVar("var4"); + lhs->setColNames({"v1", "e1", "v2", "v3"}); + auto rhs = Project::make(qctx_.get(), nullptr, nullptr); + rhs->setOutputVar("var5"); + rhs->setColNames({"v2", "e2", "v3"}); + + auto* join = BiInnerJoin::make(qctx_.get(), lhs, rhs, std::move(hashKeys), std::move(probeKeys)); + + auto joinExe = std::make_unique(join, qctx_.get()); + auto future = joinExe->execute(); + auto status = std::move(future).get(); + EXPECT_TRUE(status.ok()); + auto& result = qctx_->ectx()->getResult(join->outputVar()); + + DataSet resultDs; + resultDs.colNames = {"v1", "e1", "v2", "v3", "e2"}; + auto iter = result.iter(); + for (; iter->valid(); iter->next()) { + const auto& cols = *iter->row(); + Row row; + for (size_t i = 0; i < cols.size(); ++i) { + Value col = cols[i]; + row.values.emplace_back(std::move(col)); + } + resultDs.rows.emplace_back(std::move(row)); + } + + EXPECT_EQ(resultDs, expected); + EXPECT_EQ(result.state(), Result::State::kSuccess); +} + TEST_F(JoinTest, InnerJoinTwice) { std::string joinOutput; { @@ -283,6 +380,73 @@ TEST_F(JoinTest, LeftJoin) { testLeftJoin("var1", "var2", expected, __LINE__); } +TEST_F(JoinTest, BiLeftJoin) { + DataSet expected; + expected.colNames = {"v2", "e2", "v3", "v1", "e1"}; + Row row1; + row1.values.emplace_back(3); + row1.values.emplace_back(0); + row1.values.emplace_back(4); + row1.values.emplace_back(1); + row1.values.emplace_back(2); + expected.rows.emplace_back(std::move(row1)); + + Row row2; + row2.values.emplace_back(3); + row2.values.emplace_back(0); + row2.values.emplace_back(4); + row2.values.emplace_back(11); + row2.values.emplace_back(12); + expected.rows.emplace_back(std::move(row2)); + + Row row3; + row3.values.emplace_back(3); + row3.values.emplace_back(0); + row3.values.emplace_back(5); + row3.values.emplace_back(Value::kNullValue); + row3.values.emplace_back(Value::kNullValue); + expected.rows.emplace_back(std::move(row3)); + + // $var5 left join $var4 on $var5.v2 = $var4.v2, $var5.v3 = $var4.v3 + auto key1 = VariablePropertyExpression::make(pool_, "var5", "v2"); + auto key2 = VariablePropertyExpression::make(pool_, "var5", "v3"); + std::vector hashKeys = {key1, key2}; + auto probe1 = VariablePropertyExpression::make(pool_, "var4", "v2"); + auto probe2 = VariablePropertyExpression::make(pool_, "var4", "v3"); + std::vector probeKeys = {probe1, probe2}; + + auto lhs = Project::make(qctx_.get(), nullptr, nullptr); + lhs->setOutputVar("var5"); + lhs->setColNames({"v2", "e2", "v3"}); + auto rhs = Project::make(qctx_.get(), nullptr, nullptr); + rhs->setOutputVar("var4"); + rhs->setColNames({"v1", "e1", "v2", "v3"}); + + auto* join = BiLeftJoin::make(qctx_.get(), lhs, rhs, std::move(hashKeys), std::move(probeKeys)); + + auto joinExe = std::make_unique(join, qctx_.get()); + auto future = joinExe->execute(); + auto status = std::move(future).get(); + EXPECT_TRUE(status.ok()); + auto& result = qctx_->ectx()->getResult(join->outputVar()); + + DataSet resultDs; + resultDs.colNames = {"v2", "e2", "v3", "v1", "e1"}; + auto iter = result.iter(); + for (; iter->valid(); iter->next()) { + const auto& cols = *iter->row(); + Row row; + for (size_t i = 0; i < cols.size(); ++i) { + Value col = cols[i]; + row.values.emplace_back(std::move(col)); + } + resultDs.rows.emplace_back(std::move(row)); + } + + EXPECT_EQ(resultDs, expected); + EXPECT_EQ(result.state(), Result::State::kSuccess); +} + TEST_F(JoinTest, LeftJoinTwice) { std::string joinOutput; { diff --git a/src/graph/planner/plan/Query.cpp b/src/graph/planner/plan/Query.cpp index 2d347fce136..db3e8e56e56 100644 --- a/src/graph/planner/plan/Query.cpp +++ b/src/graph/planner/plan/Query.cpp @@ -867,8 +867,12 @@ BiJoin::BiJoin(QueryContext* qctx, hashKeys_(std::move(hashKeys)), probeKeys_(std::move(probeKeys)) { auto lColNames = left->colNames(); - auto rColNames = right->colNames(); - lColNames.insert(lColNames.end(), rColNames.begin(), rColNames.end()); + auto& rColNames = right->colNames(); + for (auto& rColName : rColNames) { + if (std::find(lColNames.begin(), lColNames.end(), rColName) == lColNames.end()) { + lColNames.emplace_back(rColName); + } + } setColNames(lColNames); } diff --git a/tests/tck/features/match/MultiQueryParts.feature b/tests/tck/features/match/MultiQueryParts.feature index 5a82164b147..a77ef51184f 100644 --- a/tests/tck/features/match/MultiQueryParts.feature +++ b/tests/tck/features/match/MultiQueryParts.feature @@ -140,8 +140,75 @@ Feature: Multi Query Parts | o.player.name | | "Tim Duncan" | | "Tim Duncan" | + # Both the 2 match statement contains variable v1 and v4 + When profiling query: + """ + MATCH (v1:player)-[:like*2..2]->(v2)-[e3:like]->(v4) where id(v1) == "Tony Parker" + MATCH (v3:player)-[:like]->(v1)<-[e5]-(v4) where id(v3) == "Tim Duncan" return * + """ + Then the result should be, in any order, with relax comparison: + | v1 | v2 | e3 | v4 | v3 | e5 | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Tim Duncan" @0 {likeness: 95}] | ("Tim Duncan") | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Manu Ginobili") | [:like "Manu Ginobili"->"Tim Duncan" @0 {likeness: 90}] | ("Tim Duncan") | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tim Duncan" ) | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tim Duncan" ) | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"LaMarcus Aldridge" @0 {likeness: 90}] | ("LaMarcus Aldridge") | ("Tim Duncan") | [:like "LaMarcus Aldridge"->"Tony Parker" @0 {}] | + # The redudant Project after BiInnerJoin is removed now + And the execution plan should be: + | id | name | dependencies | profiling data | operator info | + | 19 | BiInnerJoin | 7,14 | | | + | 7 | Project | 6 | | | + | 6 | AppendVertices | 5 | | | + | 5 | Traverse | 20 | | | + | 20 | Traverse | 2 | | | + | 2 | Dedup | 1 | | | + | 1 | PassThrough | 3 | | | + | 3 | Start | | | | + | 14 | Project | 13 | | | + | 13 | AppendVertices | 12 | | | + | 12 | Traverse | 21 | | | + | 21 | Traverse | 9 | | | + | 9 | Dedup | 8 | | | + | 8 | PassThrough | 10 | | | + | 10 | Start | | | | Scenario: Optional Match + When profiling query: + """ + MATCH (v1:player)-[:like*2..2]->(v2)-[e3:like]->(v4) where id(v1) == "Tony Parker" + OPTIONAL MATCH (v3:player)-[:like]->(v1)<-[e5]-(v4) where id(v3) == "Tim Duncan" return * + """ + Then the result should be, in any order, with relax comparison: + | v1 | v2 | e3 | v4 | v3 | e5 | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"LaMarcus Aldridge" @0 {likeness: 90}] | ("LaMarcus Aldridge") | ("Tim Duncan") | [:like "LaMarcus Aldridge"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tim Duncan") | [:like "Tim Duncan"->"Tony Parker" @0 {likeness: 95}] | ("Tony Parker") | __NULL__ | __NULL__ | + | ("Tony Parker") | ("Tim Duncan") | [:like "Tim Duncan"->"Tony Parker" @0 {likeness: 95}] | ("Tony Parker") | __NULL__ | __NULL__ | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Tim Duncan" @0 {likeness: 95}] | ("Tim Duncan") | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Manu Ginobili" :player{age: 41, name: "Manu Ginobili"}) | [:like "Manu Ginobili"->"Tim Duncan" @0 {likeness: 90}] | ("Tim Duncan") | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tony Parker") | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tim Duncan") | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + | ("Tony Parker") | ("Tim Duncan") | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}] | ("Manu Ginobili") | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] | + # The redudant Project after BiLeftJoin is removed now + And the execution plan should be: + | id | name | dependencies | profiling data | operator info | + | 19 | BiLeftJoin | 7,14 | | | + | 7 | Project | 6 | | | + | 6 | AppendVertices | 5 | | | + | 5 | Traverse | 20 | | | + | 20 | Traverse | 2 | | | + | 2 | Dedup | 1 | | | + | 1 | PassThrough | 3 | | | + | 3 | Start | | | | + | 14 | Project | 13 | | | + | 13 | AppendVertices | 12 | | | + | 12 | Traverse | 21 | | | + | 21 | Traverse | 9 | | | + | 9 | Dedup | 8 | | | + | 8 | PassThrough | 10 | | | + | 10 | Start | | | | When executing query: """ MATCH (m)-[]-(n) WHERE id(m)=="Tim Duncan"