From e422852b94cd1ecf97f9b70d5b30cf4a938a9dc1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 17 Dec 2024 00:18:31 +0100 Subject: [PATCH] Fix bug for materialized join results with lazy input with local vocabs (#1684) This fixes a subtle bug which led to a segmentation fault in the following scenario: 1. The result of a join is fully materialized, but at least one of its inputs is lazy 2. The blocks of the unique input contain local vocabs with unique entries (not shared between the blocks) 3. There is at least one block in the lazy input that has no matching row in the other input, and therefore its local vocab is not used 4. There is at least one block before the block from item 3 that is needed (including its local vocab) Previously, the local vocabs from all blocks that fulfill item 3 were not part of the input and caused segfaults, when the corresponding local vocab pointer was dereferenced. Fixes #1679 --- src/engine/AddCombinedRowToTable.h | 9 ++++++-- test/AddCombinedRowToTableTest.cpp | 36 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index c43e298a88..45f51fb527 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -155,8 +155,13 @@ class AddCombinedRowToIdTable { if (nextIndex_ != 0) { AD_CORRECTNESS_CHECK(inputLeftAndRight_.has_value()); flush(); - } else { - // Clear vocab when no rows were written. + } else if (resultTable_.empty()) { + // Clear local vocab when no rows were written. + // + // TODO This is a conservative approach. We could + // optimize this case (clear the local vocab more often, but still + // correctly) by considering the situation after all the relevant inputs + // have been processed. mergedVocab_ = LocalVocab{}; } } diff --git a/test/AddCombinedRowToTableTest.cpp b/test/AddCombinedRowToTableTest.cpp index 2c409a830b..4bf72adcf9 100644 --- a/test/AddCombinedRowToTableTest.cpp +++ b/test/AddCombinedRowToTableTest.cpp @@ -323,3 +323,39 @@ TEST(AddCombinedRowToTable, verifyLocalVocabIsRetainedWhenNotMoving) { EXPECT_TRUE(vocabContainsString(localVocab, "b")); EXPECT_THAT(localVocab.getAllWordsForTesting(), ::testing::SizeIs(2)); } + +// _____________________________________________________________________________ +TEST(AddCombinedRowToTable, localVocabIsOnlyClearedWhenLegal) { + auto outputTable = makeIdTableFromVector({}); + outputTable.setNumColumns(3); + ad_utility::AddCombinedRowToIdTable adder{ + 1, std::move(outputTable), + std::make_shared>(), 1}; + + IdTableWithVocab input1{makeIdTableFromVector({{0, 1}}), + createVocabWithSingleString("a")}; + IdTableWithVocab input2{makeIdTableFromVector({{0, 2}}), + createVocabWithSingleString("b")}; + + adder.setInput(input1, input2); + adder.addRow(0, 0); + IdTableWithVocab input3{makeIdTableFromVector({{3, 1}}), + createVocabWithSingleString("c")}; + IdTableWithVocab input4{makeIdTableFromVector({{3, 2}}), + createVocabWithSingleString("d")}; + // NOTE: This seemingly redundant call to `setInput` is important, as it tests + // a previous bug: Each call to `setInput` implicitly also calls `flush` and + // also possibly clears the local vocab if it is not used anymore. In this + // case however we may not clear the local vocab, as the result of the + // previous calls to `addRow` has not yet been extracted. + adder.setInput(input1, input2); + adder.setInput(input3, input4); + adder.addRow(0, 0); + auto localVocab = adder.localVocab().clone(); + + EXPECT_TRUE(vocabContainsString(localVocab, "a")); + EXPECT_TRUE(vocabContainsString(localVocab, "b")); + EXPECT_TRUE(vocabContainsString(localVocab, "c")); + EXPECT_TRUE(vocabContainsString(localVocab, "d")); + EXPECT_THAT(localVocab.getAllWordsForTesting(), ::testing::SizeIs(4)); +}