From 40af4f699f0a3ff4c4c168d0eaf0777b9eb3f7f8 Mon Sep 17 00:00:00 2001 From: Andrew Lumsdaine Date: Mon, 17 Jun 2024 21:49:29 -0700 Subject: [PATCH] Implement bin partitioning function (#5092) To write tiles into a temporary array that will fit into fixed-sized blocks on reading, this PR implements a bin partitioning function that returns a `std::vector` of cell partitions, such that each group of cells has a total byte count less than a specified number of bytes. Also returned is a `std::vector` of the bin sizes (in number of bytes). This is the first file to go into the new sm/external_sort, so it also creates the directory and populates it with a test subdirectory and appropriate CMakeLists.txt files. A previous PR creates a doc subdirectory and includes the svg image of our design for external sort. --- TYPE: FEATURE DESC: Implement a partitioning function to partition cells to fit into fixed size bins --------- Co-authored-by: Luc Rancourt --- tiledb/sm/query/CMakeLists.txt | 1 + tiledb/sm/query/external_sort/CMakeLists.txt | 29 ++++++ tiledb/sm/query/external_sort/partition.h | 98 +++++++++++++++++++ .../query/external_sort/test/CMakeLists.txt | 33 +++++++ .../external_sort/test/unit_partition.cc | 77 +++++++++++++++ 5 files changed, 238 insertions(+) create mode 100644 tiledb/sm/query/external_sort/CMakeLists.txt create mode 100644 tiledb/sm/query/external_sort/partition.h create mode 100644 tiledb/sm/query/external_sort/test/CMakeLists.txt create mode 100644 tiledb/sm/query/external_sort/test/unit_partition.cc diff --git a/tiledb/sm/query/CMakeLists.txt b/tiledb/sm/query/CMakeLists.txt index cccb774ebad..4084966eaa4 100644 --- a/tiledb/sm/query/CMakeLists.txt +++ b/tiledb/sm/query/CMakeLists.txt @@ -27,6 +27,7 @@ include(common NO_POLICY_SCOPE) add_subdirectory(ast) add_subdirectory(deletes_and_updates) +add_subdirectory(external_sort) add_subdirectory(readers) # diff --git a/tiledb/sm/query/external_sort/CMakeLists.txt 
b/tiledb/sm/query/external_sort/CMakeLists.txt new file mode 100644 index 00000000000..ef98a1193d3 --- /dev/null +++ b/tiledb/sm/query/external_sort/CMakeLists.txt @@ -0,0 +1,29 @@ +# +# tiledb/sm/query/external_sort/CMakeLists.txt +# +# The MIT License +# +# Copyright (c) 2024 TileDB, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +include(common NO_POLICY_SCOPE) + +add_test_subdirectory() diff --git a/tiledb/sm/query/external_sort/partition.h b/tiledb/sm/query/external_sort/partition.h new file mode 100644 index 00000000000..08e7e79aae3 --- /dev/null +++ b/tiledb/sm/query/external_sort/partition.h @@ -0,0 +1,98 @@ +/** + * @file tiledb/sm/query/external_sort/partition.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + */ + +#ifndef TILEDB_PARTITION_H +#define TILEDB_PARTITION_H + +#include <cassert> +#include <list> +#include <stdexcept> +#include <tuple> +#include <vector> + +/** + * @brief Partition a list of sizes into bins that are less than or equal to a + * given number of bytes. The sizes are the number of elements in each cell, + * which are assumed to be of type `char`. + * @param bin_size The maximum number of bytes in a bin. + * @param num_cells The total number of cells to be partitioned. + * @param fixed_bytes_per_cell The number of fixed bytes per cell. This + * includes all of the non varlength elements in each cell, including the + * elements that specify the sizes. + * @param sizes The number of varlength elements in each cell. These are + * assumed to correspond to chars, so the number of bytes in each cell is + * the same as the number of elements in the cell.
/**
 * @brief Greedily partition consecutive cells into bins whose total byte
 * count is less than or equal to `bin_size`.
 *
 * The byte count of cell `i` is `fixed_bytes_per_cell` plus, for every
 * iterator `it` in `sizes`, `it[i] * sizeof(char)`. Cells are visited in
 * order; a bin is closed as soon as adding the next cell would push it past
 * `bin_size`, and that cell then starts the next bin.
 *
 * The function is `inline` because it is defined in a header; without it,
 * including this header from more than one translation unit produces
 * multiple-definition link errors.
 *
 * @pre `bin_size > 0`, `num_cells > 0` and `fixed_bytes_per_cell > 0`
 * (checked with `assert`).
 * @pre Every iterator in `sizes` must be indexable over `[0, num_cells)`.
 *
 * @param bin_size The maximum number of bytes in a bin.
 * @param num_cells The total number of cells to be partitioned.
 * @param fixed_bytes_per_cell The number of fixed bytes per cell.
 * @param sizes One iterator per variable-length field; element `i` of each is
 * the number of variable-length `char` elements that field contributes to
 * cell `i`.
 * @return A tuple `(bins, bin_sizes)`. `bins` holds the bin boundaries as
 * cell indices: it starts with 0, ends with `num_cells`, and bin `b` covers
 * cells `[bins[b], bins[b + 1])`. `bin_sizes[b]` is the total byte count of
 * bin `b`, so `bins.size() == bin_sizes.size() + 1`.
 * @throws std::invalid_argument if a single cell is larger than `bin_size`.
 * Such a cell can never be placed; the previous implementation looped forever
 * (appending empty bins until memory was exhausted) in that situation.
 */
inline auto bin_partition(
    size_t bin_size,
    size_t num_cells,
    size_t fixed_bytes_per_cell,
    std::list<std::vector<size_t>::iterator>& sizes) {
  assert(bin_size > 0);
  assert(num_cells > 0);
  assert(fixed_bytes_per_cell > 0);

  // `bins` always begins with the boundary of the first bin.
  std::vector<size_t> bins{0};
  std::vector<size_t> bin_sizes;

  // Bytes accumulated so far in the bin that is currently being filled.
  size_t current_size{0};

  for (size_t cell = 0; cell < num_cells; ++cell) {
    // Total footprint of this cell: fixed part plus every var-length field.
    size_t cell_bytes{fixed_bytes_per_cell};
    for (const auto& field : sizes) {
      cell_bytes += field[cell] * sizeof(char);
    }

    // A cell that does not fit into an empty bin cannot be partitioned at
    // all. Fail loudly before emitting anything for it.
    if (cell_bytes > bin_size) {
      throw std::invalid_argument(
          "bin_partition: a single cell is larger than bin_size");
    }

    // Close the current bin if this cell would overflow it; the cell then
    // becomes the first one of the next bin.
    if (current_size + cell_bytes > bin_size) {
      bins.push_back(cell);
      bin_sizes.push_back(current_size);
      current_size = 0;
    }
    current_size += cell_bytes;
  }

  // Close the final (necessarily non-empty) bin.
  bins.push_back(num_cells);
  bin_sizes.push_back(current_size);

  return std::make_tuple(std::move(bins), std::move(bin_sizes));
}
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +include(unit_test) + +commence(unit_test partition) +this_target_sources( + unit_partition.cc +) +conclude(unit_test) \ No newline at end of file diff --git a/tiledb/sm/query/external_sort/test/unit_partition.cc b/tiledb/sm/query/external_sort/test/unit_partition.cc new file mode 100644 index 00000000000..f3ee50f2181 --- /dev/null +++ b/tiledb/sm/query/external_sort/test/unit_partition.cc @@ -0,0 +1,77 @@ +/** + * @file unit_partition.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2024 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file implements unit tests for the bin_partition function.
+ */ + +#include +#include +#include "tiledb/common/util/var_length_util.h" +#include "tiledb/sm/query/external_sort/partition.h" + +TEST_CASE("partition: Null test", "[partition][null_test]") { + REQUIRE(true); +} + +TEST_CASE("partition: sized", "[partition]") { + std::vector o{8, 6, 7, 5, 3, 0, 9}; + std::vector p{3, 1, 4, 1, 5, 9, 2}; + + REQUIRE(o.size() == p.size()); + size_t num_cells = size(o); + size_t bin_size = 256; + auto fixed_bytes_per_cell = 24; + + std::vector o_bytes{64, 48, 56, 40, 24, 0, 72}; + std::vector p_bytes{24, 8, 32, 8, 40, 72, 16}; + for (size_t i = 0; i < num_cells; ++i) { + o_bytes[i] *= 8; + p_bytes[i] *= 8; + o[i] *= 8; + p[i] *= 8; + } + std::vector sum_bytes(num_cells); + for (size_t i = 0; i < num_cells; ++i) { + sum_bytes[i] = o_bytes[i] + p_bytes[i] + fixed_bytes_per_cell; + } + std::vector byte_offsets(num_cells + 1); + lengths_to_offsets(sum_bytes, byte_offsets); + // {112, 192, /**/ 304, 376, /**/ 464, 560, /**/ 672}; + // {112, 192, /**/ 112, 184, /**/ 88, 184, /**/ 112}; + + std::list::iterator> sizes{begin(o), begin(p)}; + + auto&& [x, y] = + bin_partition(bin_size, num_cells, fixed_bytes_per_cell, sizes); + std::vector expected_bins{0, 2, 4, 6, 7}; + std::vector expected_sizes{192, 184, 184, 112}; + + CHECK(x == expected_bins); + CHECK(y == expected_sizes); +}