fix duplicate output column names of join (#4965)

vesoft-inc · Dec 2, 2022 · 4731ed9 · 4731ed9
1 parent 4f1fe76
commit 4731ed9
Show file tree

Hide file tree

Showing 7 changed files with 277 additions and 26 deletions.
diff --git a/src/graph/executor/query/InnerJoinExecutor.cpp b/src/graph/executor/query/InnerJoinExecutor.cpp
@@ -244,18 +244,6 @@ void InnerJoinExecutor::buildNewRow(const std::unordered_map<T, std::vector<cons
   }
 }
 
-Row InnerJoinExecutor::newRow(Row left, Row right) const {
-  Row r;
-  r.reserve(left.size() + right.size());
-  r.values.insert(r.values.end(),
-                  std::make_move_iterator(left.values.begin()),
-                  std::make_move_iterator(left.values.end()));
-  r.values.insert(r.values.end(),
-                  std::make_move_iterator(right.values.begin()),
-                  std::make_move_iterator(right.values.end()));
-  return r;
-}
-
 const std::string& InnerJoinExecutor::leftVar() const {
   if (node_->kind() == PlanNode::Kind::kBiInnerJoin) {
     return node_->asNode<BiJoin>()->leftInputVar();

diff --git a/src/graph/executor/query/InnerJoinExecutor.h b/src/graph/executor/query/InnerJoinExecutor.h
@@ -48,9 +48,6 @@ class InnerJoinExecutor : public JoinExecutor {
                    Row rRow,
                    DataSet& ds) const;
 
-  // concat rows
-  Row newRow(Row left, Row right) const;
-
   const std::string& leftVar() const;
 
   const std::string& rightVar() const;

diff --git a/src/graph/executor/query/JoinExecutor.cpp b/src/graph/executor/query/JoinExecutor.cpp
@@ -49,6 +49,25 @@ Status JoinExecutor::checkBiInputDataSets() {
     return Status::Error(ss.str());
   }
   colSize_ = join->colNames().size();
+
+  auto& lhsResult = ectx_->getResult(join->leftInputVar());
+  auto& lhsColNames = lhsResult.valuePtr()->getDataSet().colNames;
+  auto lhsColSize = lhsColNames.size();
+
+  auto& rhsResult = ectx_->getResult(join->rightInputVar());
+  auto& rhsColNames = rhsResult.valuePtr()->getDataSet().colNames;
+  auto rhsColSize = rhsColNames.size();
+
+  DCHECK_LE(colSize_, lhsColSize + rhsColSize);
+  if (colSize_ < lhsColSize + rhsColSize) {
+    for (size_t i = 0; i < rhsColSize; ++i) {
+      auto it = std::find(lhsColNames.begin(), lhsColNames.end(), rhsColNames[i]);
+      if (it == lhsColNames.end()) {
+        rhsOutputColIdxs_.push_back(i);
+      }
+    }
+  }
+
   return Status::OK();
 }
 
@@ -83,15 +102,21 @@ void JoinExecutor::buildSingleKeyHashTable(
 }
 
 Row JoinExecutor::newRow(Row left, Row right) const {
-  Row r;
-  r.reserve(left.size() + right.size());
-  r.values.insert(r.values.end(),
-                  std::make_move_iterator(left.values.begin()),
-                  std::make_move_iterator(left.values.end()));
-  r.values.insert(r.values.end(),
-                  std::make_move_iterator(right.values.begin()),
-                  std::make_move_iterator(right.values.end()));
-  return r;
+  Row row;
+  row.reserve(left.size() + right.size());
+  row.values.insert(row.values.end(),
+                    std::make_move_iterator(left.values.begin()),
+                    std::make_move_iterator(left.values.end()));
+  if (!rhsOutputColIdxs_.empty()) {
+    for (auto idx : rhsOutputColIdxs_) {
+      row.values.emplace_back(std::move(right.values[idx]));
+    }
+  } else {
+    row.values.insert(row.values.end(),
+                      std::make_move_iterator(right.values.begin()),
+                      std::make_move_iterator(right.values.end()));
+  }
+  return row;
 }
 
 }  // namespace graph

diff --git a/src/graph/executor/query/JoinExecutor.h b/src/graph/executor/query/JoinExecutor.h
@@ -33,7 +33,13 @@ class JoinExecutor : public Executor {
 
   std::unique_ptr<Iterator> lhsIter_;
   std::unique_ptr<Iterator> rhsIter_;
+  // colSize_ is the size of the output columns of join executor.
+  // If the join is natural join, which means the left and right columns have duplicated names,
+  // the output columns will be a intersection of the left and right columns.
   size_t colSize_{0};
+  // If the join is natural join, rhsOutputColIdxs_ will be used to record the output column index
+  // of the right. If not, rhsOutputColIdxs_ will be empty.
+  std::vector<size_t> rhsOutputColIdxs_;
   std::unordered_map<Value, std::vector<const Row*>> hashTable_;
   std::unordered_map<List, std::vector<const Row*>> listHashTable_;
 };

diff --git a/src/graph/executor/test/JoinTest.cpp b/src/graph/executor/test/JoinTest.cpp
@@ -9,6 +9,7 @@
 #include "graph/executor/query/InnerJoinExecutor.h"
 #include "graph/executor/query/LeftJoinExecutor.h"
 #include "graph/executor/test/QueryTestBase.h"
+#include "graph/planner/plan/Logic.h"
 #include "graph/planner/plan/Query.h"
 
 namespace nebula {
@@ -68,6 +69,43 @@ class JoinTest : public QueryTestBase {
       qctx_->symTable()->newVariable("empty_var2");
       qctx_->ectx()->setResult("empty_var2", ResultBuilder().value(Value(std::move(ds))).build());
     }
+    {
+      DataSet ds;
+      ds.colNames = {"v1", "e1", "v2", "v3"};
+      Row row1;
+      row1.values.emplace_back(1);
+      row1.values.emplace_back(2);
+      row1.values.emplace_back(3);
+      row1.values.emplace_back(4);
+      ds.rows.emplace_back(std::move(row1));
+
+      Row row2;
+      row2.values.emplace_back(11);
+      row2.values.emplace_back(12);
+      row2.values.emplace_back(3);
+      row2.values.emplace_back(4);
+      ds.rows.emplace_back(std::move(row2));
+
+      qctx_->symTable()->newVariable("var4");
+      qctx_->ectx()->setResult("var4", ResultBuilder().value(Value(std::move(ds))).build());
+    }
+    {
+      DataSet ds;
+      ds.colNames = {"v2", "e2", "v3"};
+      Row row1;
+      row1.values.emplace_back(3);
+      row1.values.emplace_back(0);
+      row1.values.emplace_back(4);
+      ds.rows.emplace_back(std::move(row1));
+
+      Row row2;
+      row2.values.emplace_back(3);
+      row2.values.emplace_back(0);
+      row2.values.emplace_back(5);
+      ds.rows.emplace_back(std::move(row2));
+      qctx_->symTable()->newVariable("var5");
+      qctx_->ectx()->setResult("var5", ResultBuilder().value(Value(std::move(ds))).build());
+    }
   }
 
   void testInnerJoin(std::string left, std::string right, DataSet& expected, int64_t line);
@@ -171,6 +209,65 @@ TEST_F(JoinTest, InnerJoin) {
   testInnerJoin("var2", "var1", expected, __LINE__);
 }
 
+TEST_F(JoinTest, BiInnerJoin) {
+  DataSet expected;
+  expected.colNames = {"v1", "e1", "v2", "v3", "e2"};
+  Row row1;
+  row1.values.emplace_back(1);
+  row1.values.emplace_back(2);
+  row1.values.emplace_back(3);
+  row1.values.emplace_back(4);
+  row1.values.emplace_back(0);
+  expected.rows.emplace_back(std::move(row1));
+
+  Row row2;
+  row2.values.emplace_back(11);
+  row2.values.emplace_back(12);
+  row2.values.emplace_back(3);
+  row2.values.emplace_back(4);
+  row2.values.emplace_back(0);
+  expected.rows.emplace_back(std::move(row2));
+
+  // $var4 inner join $var5 on $var4.v2 = $var5.v2, $var4.v3 = $var4.v3
+  auto key1 = VariablePropertyExpression::make(pool_, "var4", "v2");
+  auto key2 = VariablePropertyExpression::make(pool_, "var4", "v3");
+  std::vector<Expression*> hashKeys = {key1, key2};
+  auto probe1 = VariablePropertyExpression::make(pool_, "var5", "v2");
+  auto probe2 = VariablePropertyExpression::make(pool_, "var5", "v3");
+  std::vector<Expression*> probeKeys = {probe1, probe2};
+
+  auto lhs = Project::make(qctx_.get(), nullptr, nullptr);
+  lhs->setOutputVar("var4");
+  lhs->setColNames({"v1", "e1", "v2", "v3"});
+  auto rhs = Project::make(qctx_.get(), nullptr, nullptr);
+  rhs->setOutputVar("var5");
+  rhs->setColNames({"v2", "e2", "v3"});
+
+  auto* join = BiInnerJoin::make(qctx_.get(), lhs, rhs, std::move(hashKeys), std::move(probeKeys));
+
+  auto joinExe = std::make_unique<BiInnerJoinExecutor>(join, qctx_.get());
+  auto future = joinExe->execute();
+  auto status = std::move(future).get();
+  EXPECT_TRUE(status.ok());
+  auto& result = qctx_->ectx()->getResult(join->outputVar());
+
+  DataSet resultDs;
+  resultDs.colNames = {"v1", "e1", "v2", "v3", "e2"};
+  auto iter = result.iter();
+  for (; iter->valid(); iter->next()) {
+    const auto& cols = *iter->row();
+    Row row;
+    for (size_t i = 0; i < cols.size(); ++i) {
+      Value col = cols[i];
+      row.values.emplace_back(std::move(col));
+    }
+    resultDs.rows.emplace_back(std::move(row));
+  }
+
+  EXPECT_EQ(resultDs, expected);
+  EXPECT_EQ(result.state(), Result::State::kSuccess);
+}
+
 TEST_F(JoinTest, InnerJoinTwice) {
   std::string joinOutput;
   {
@@ -283,6 +380,73 @@ TEST_F(JoinTest, LeftJoin) {
   testLeftJoin("var1", "var2", expected, __LINE__);
 }
 
+TEST_F(JoinTest, BiLeftJoin) {
+  DataSet expected;
+  expected.colNames = {"v2", "e2", "v3", "v1", "e1"};
+  Row row1;
+  row1.values.emplace_back(3);
+  row1.values.emplace_back(0);
+  row1.values.emplace_back(4);
+  row1.values.emplace_back(1);
+  row1.values.emplace_back(2);
+  expected.rows.emplace_back(std::move(row1));
+
+  Row row2;
+  row2.values.emplace_back(3);
+  row2.values.emplace_back(0);
+  row2.values.emplace_back(4);
+  row2.values.emplace_back(11);
+  row2.values.emplace_back(12);
+  expected.rows.emplace_back(std::move(row2));
+
+  Row row3;
+  row3.values.emplace_back(3);
+  row3.values.emplace_back(0);
+  row3.values.emplace_back(5);
+  row3.values.emplace_back(Value::kNullValue);
+  row3.values.emplace_back(Value::kNullValue);
+  expected.rows.emplace_back(std::move(row3));
+
+  // $var5 left join $var4 on $var5.v2 = $var4.v2, $var5.v3 = $var4.v3
+  auto key1 = VariablePropertyExpression::make(pool_, "var5", "v2");
+  auto key2 = VariablePropertyExpression::make(pool_, "var5", "v3");
+  std::vector<Expression*> hashKeys = {key1, key2};
+  auto probe1 = VariablePropertyExpression::make(pool_, "var4", "v2");
+  auto probe2 = VariablePropertyExpression::make(pool_, "var4", "v3");
+  std::vector<Expression*> probeKeys = {probe1, probe2};
+
+  auto lhs = Project::make(qctx_.get(), nullptr, nullptr);
+  lhs->setOutputVar("var5");
+  lhs->setColNames({"v2", "e2", "v3"});
+  auto rhs = Project::make(qctx_.get(), nullptr, nullptr);
+  rhs->setOutputVar("var4");
+  rhs->setColNames({"v1", "e1", "v2", "v3"});
+
+  auto* join = BiLeftJoin::make(qctx_.get(), lhs, rhs, std::move(hashKeys), std::move(probeKeys));
+
+  auto joinExe = std::make_unique<BiLeftJoinExecutor>(join, qctx_.get());
+  auto future = joinExe->execute();
+  auto status = std::move(future).get();
+  EXPECT_TRUE(status.ok());
+  auto& result = qctx_->ectx()->getResult(join->outputVar());
+
+  DataSet resultDs;
+  resultDs.colNames = {"v2", "e2", "v3", "v1", "e1"};
+  auto iter = result.iter();
+  for (; iter->valid(); iter->next()) {
+    const auto& cols = *iter->row();
+    Row row;
+    for (size_t i = 0; i < cols.size(); ++i) {
+      Value col = cols[i];
+      row.values.emplace_back(std::move(col));
+    }
+    resultDs.rows.emplace_back(std::move(row));
+  }
+
+  EXPECT_EQ(resultDs, expected);
+  EXPECT_EQ(result.state(), Result::State::kSuccess);
+}
+
 TEST_F(JoinTest, LeftJoinTwice) {
   std::string joinOutput;
   {

diff --git a/src/graph/planner/plan/Query.cpp b/src/graph/planner/plan/Query.cpp
@@ -867,8 +867,12 @@ BiJoin::BiJoin(QueryContext* qctx,
       hashKeys_(std::move(hashKeys)),
       probeKeys_(std::move(probeKeys)) {
   auto lColNames = left->colNames();
-  auto rColNames = right->colNames();
-  lColNames.insert(lColNames.end(), rColNames.begin(), rColNames.end());
+  auto& rColNames = right->colNames();
+  for (auto& rColName : rColNames) {
+    if (std::find(lColNames.begin(), lColNames.end(), rColName) == lColNames.end()) {
+      lColNames.emplace_back(rColName);
+    }
+  }
   setColNames(lColNames);
 }
 

diff --git a/tests/tck/features/match/MultiQueryParts.feature b/tests/tck/features/match/MultiQueryParts.feature
@@ -140,8 +140,75 @@ Feature: Multi Query Parts
       | o.player.name |
       | "Tim Duncan"  |
       | "Tim Duncan"  |
+    # Both the 2 match statement contains variable v1 and v4
+    When profiling query:
+      """
+      MATCH (v1:player)-[:like*2..2]->(v2)-[e3:like]->(v4) where id(v1) == "Tony Parker"
+      MATCH (v3:player)-[:like]->(v1)<-[e5]-(v4) where id(v3) == "Tim Duncan" return *
+      """
+    Then the result should be, in any order, with relax comparison:
+      | v1              | v2                | e3                                                           | v4                    | v3             | e5                                               |
+      | ("Tony Parker") | ("Tony Parker")   | [:like "Tony Parker"->"Tim Duncan" @0 {likeness: 95}]        | ("Tim Duncan")        | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}]    |
+      | ("Tony Parker") | ("Manu Ginobili") | [:like "Manu Ginobili"->"Tim Duncan" @0 {likeness: 90}]      | ("Tim Duncan")        | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}]    |
+      | ("Tony Parker") | ("Tony Parker")   | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}]     | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tony Parker")   | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}]     | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tim Duncan" )   | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}]      | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tim Duncan" )   | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}]      | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tony Parker")   | [:like "Tony Parker"->"LaMarcus Aldridge" @0 {likeness: 90}] | ("LaMarcus Aldridge") | ("Tim Duncan") | [:like "LaMarcus Aldridge"->"Tony Parker" @0 {}] |
+    # The redudant Project after BiInnerJoin is removed now
+    And the execution plan should be:
+      | id | name           | dependencies | profiling data | operator info |
+      | 19 | BiInnerJoin    | 7,14         |                |               |
+      | 7  | Project        | 6            |                |               |
+      | 6  | AppendVertices | 5            |                |               |
+      | 5  | Traverse       | 20           |                |               |
+      | 20 | Traverse       | 2            |                |               |
+      | 2  | Dedup          | 1            |                |               |
+      | 1  | PassThrough    | 3            |                |               |
+      | 3  | Start          |              |                |               |
+      | 14 | Project        | 13           |                |               |
+      | 13 | AppendVertices | 12           |                |               |
+      | 12 | Traverse       | 21           |                |               |
+      | 21 | Traverse       | 9            |                |               |
+      | 9  | Dedup          | 8            |                |               |
+      | 8  | PassThrough    | 10           |                |               |
+      | 10 | Start          |              |                |               |
 
   Scenario: Optional Match
+    When profiling query:
+      """
+      MATCH (v1:player)-[:like*2..2]->(v2)-[e3:like]->(v4) where id(v1) == "Tony Parker"
+      OPTIONAL MATCH (v3:player)-[:like]->(v1)<-[e5]-(v4) where id(v3) == "Tim Duncan" return *
+      """
+    Then the result should be, in any order, with relax comparison:
+      | v1              | v2                                                        | e3                                                           | v4                    | v3             | e5                                               |
+      | ("Tony Parker") | ("Tony Parker")                                           | [:like "Tony Parker"->"LaMarcus Aldridge" @0 {likeness: 90}] | ("LaMarcus Aldridge") | ("Tim Duncan") | [:like "LaMarcus Aldridge"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tim Duncan")                                            | [:like "Tim Duncan"->"Tony Parker" @0 {likeness: 95}]        | ("Tony Parker")       | __NULL__       | __NULL__                                         |
+      | ("Tony Parker") | ("Tim Duncan")                                            | [:like "Tim Duncan"->"Tony Parker" @0 {likeness: 95}]        | ("Tony Parker")       | __NULL__       | __NULL__                                         |
+      | ("Tony Parker") | ("Tony Parker")                                           | [:like "Tony Parker"->"Tim Duncan" @0 {likeness: 95}]        | ("Tim Duncan")        | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}]    |
+      | ("Tony Parker") | ("Manu Ginobili" :player{age: 41, name: "Manu Ginobili"}) | [:like "Manu Ginobili"->"Tim Duncan" @0 {likeness: 90}]      | ("Tim Duncan")        | ("Tim Duncan") | [:teammate "Tim Duncan"->"Tony Parker" @0 {}]    |
+      | ("Tony Parker") | ("Tony Parker")                                           | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}]     | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tony Parker")                                           | [:like "Tony Parker"->"Manu Ginobili" @0 {likeness: 95}]     | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tim Duncan")                                            | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}]      | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+      | ("Tony Parker") | ("Tim Duncan")                                            | [:like "Tim Duncan"->"Manu Ginobili" @0 {likeness: 95}]      | ("Manu Ginobili")     | ("Tim Duncan") | [:teammate "Manu Ginobili"->"Tony Parker" @0 {}] |
+    # The redudant Project after BiLeftJoin is removed now
+    And the execution plan should be:
+      | id | name           | dependencies | profiling data | operator info |
+      | 19 | BiLeftJoin     | 7,14         |                |               |
+      | 7  | Project        | 6            |                |               |
+      | 6  | AppendVertices | 5            |                |               |
+      | 5  | Traverse       | 20           |                |               |
+      | 20 | Traverse       | 2            |                |               |
+      | 2  | Dedup          | 1            |                |               |
+      | 1  | PassThrough    | 3            |                |               |
+      | 3  | Start          |              |                |               |
+      | 14 | Project        | 13           |                |               |
+      | 13 | AppendVertices | 12           |                |               |
+      | 12 | Traverse       | 21           |                |               |
+      | 21 | Traverse       | 9            |                |               |
+      | 9  | Dedup          | 8            |                |               |
+      | 8  | PassThrough    | 10           |                |               |
+      | 10 | Start          |              |                |               |
     When executing query:
       """
       MATCH (m)-[]-(n) WHERE id(m)=="Tim Duncan"