Skip to content

Commit

Permalink
GH-39583: [C++] Fix the issue of ExecBatchBuilder when appending cons…
Browse files Browse the repository at this point in the history
…ecutive tail rows with the same id may exceed buffer boundary (for fixed size types) (#39585)

### Rationale for this change

#39583 is a subsequent issue of #32570 (fixed by #39234). The last issue and fixed only resolved var length types. It turns out fixed size types have the same issue.

### What changes are included in this PR?

Do the same fix of #39234 for fixed size types.

### Are these changes tested?

UT included.

### Are there any user-facing changes?

* Closes: #39583

Authored-by: zanmato1984 <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
zanmato1984 authored Jan 17, 2024
1 parent 4a33d2f commit 1dc3b81
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 19 deletions.
21 changes: 8 additions & 13 deletions cpp/src/arrow/compute/light_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -383,27 +383,22 @@ int ExecBatchBuilder::NumRowsToSkip(const std::shared_ptr<ArrayData>& column,

KeyColumnMetadata column_metadata =
ColumnMetadataFromDataType(column->type).ValueOrDie();
ARROW_DCHECK(!column_metadata.is_fixed_length || column_metadata.fixed_length > 0);

int num_rows_left = num_rows;
int num_bytes_skipped = 0;
while (num_rows_left > 0 && num_bytes_skipped < num_tail_bytes_to_skip) {
--num_rows_left;
int row_id_removed = row_ids[num_rows_left];
if (column_metadata.is_fixed_length) {
if (column_metadata.fixed_length == 0) {
num_rows_left = std::max(num_rows_left, 8) - 8;
++num_bytes_skipped;
} else {
--num_rows_left;
num_bytes_skipped += column_metadata.fixed_length;
}
num_bytes_skipped += column_metadata.fixed_length;
} else {
--num_rows_left;
int row_id_removed = row_ids[num_rows_left];
const int32_t* offsets = column->GetValues<int32_t>(1);
num_bytes_skipped += offsets[row_id_removed + 1] - offsets[row_id_removed];
// Skip consecutive rows with the same id
while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
--num_rows_left;
}
}
// Skip consecutive rows with the same id
while (num_rows_left > 0 && row_id_removed == row_ids[num_rows_left - 1]) {
--num_rows_left;
}
}

Expand Down
75 changes: 69 additions & 6 deletions cpp/src/arrow/compute/light_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -474,15 +474,18 @@ TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
TEST(ExecBatchBuilder, AppendBatchDupRows) {
std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
MemoryPool* pool = owned_pool.get();

// Case of cross-word copying for the last row, which may exceed the buffer boundary.
// This is a simplified case of GH-32570
//
{
// This is a simplified case of GH-32570
// 64-byte data fully occupying one minimal 64-byte aligned memory region.
ExecBatch batch_string = JSONToExecBatch({binary()}, R"([["123456789ABCDEF0"],
["123456789ABCDEF0"],
["123456789ABCDEF0"],
["ABCDEF0"],
["123456789"]])"); // 9-byte tail row, larger than a word.
ExecBatch batch_string = JSONToExecBatch({binary()}, R"([
["123456789ABCDEF0"],
["123456789ABCDEF0"],
["123456789ABCDEF0"],
["ABCDEF0"],
["123456789"]])"); // 9-byte tail row, larger than a word.
ASSERT_EQ(batch_string[0].array()->buffers[1]->capacity(), 64);
ASSERT_EQ(batch_string[0].array()->buffers[2]->capacity(), 64);
ExecBatchBuilder builder;
Expand All @@ -494,6 +497,66 @@ TEST(ExecBatchBuilder, AppendBatchDupRows) {
ASSERT_EQ(batch_string_appended, built);
ASSERT_NE(0, pool->bytes_allocated());
}

{
// This is a simplified case of GH-39583, using fsb(3) type.
// 63-byte data occupying almost one minimal 64-byte aligned memory region.
ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(3)}, R"([
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["000"],
["123"]])"); // 3-byte tail row, not aligned to a word.
ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
ExecBatchBuilder builder;
uint16_t row_ids[4] = {20, 20, 20,
20}; // Get the last row 4 times, 3 to skip a word.
ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 4, row_ids, /*num_cols=*/1));
ExecBatch built = builder.Flush();
ExecBatch batch_fsb_appended = JSONToExecBatch(
{fixed_size_binary(3)}, R"([["123"], ["123"], ["123"], ["123"]])");
ASSERT_EQ(batch_fsb_appended, built);
ASSERT_NE(0, pool->bytes_allocated());
}

{
// This is a simplified case of GH-39583, using fsb(9) type.
// 63-byte data occupying almost one minimal 64-byte aligned memory region.
ExecBatch batch_fsb = JSONToExecBatch({fixed_size_binary(9)}, R"([
["000000000"],
["000000000"],
["000000000"],
["000000000"],
["000000000"],
["000000000"],
["123456789"]])"); // 9-byte tail row, not aligned to a word.
ASSERT_EQ(batch_fsb[0].array()->buffers[1]->capacity(), 64);
ExecBatchBuilder builder;
uint16_t row_ids[2] = {6, 6}; // Get the last row 2 times, 1 to skip a word.
ASSERT_OK(builder.AppendSelected(pool, batch_fsb, 2, row_ids, /*num_cols=*/1));
ExecBatch built = builder.Flush();
ExecBatch batch_fsb_appended =
JSONToExecBatch({fixed_size_binary(9)}, R"([["123456789"], ["123456789"]])");
ASSERT_EQ(batch_fsb_appended, built);
ASSERT_NE(0, pool->bytes_allocated());
}

ASSERT_EQ(0, pool->bytes_allocated());
}

Expand Down

0 comments on commit 1dc3b81

Please sign in to comment.