-
Notifications
You must be signed in to change notification settings - Fork 184
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement bin partitioning function (#5092)
To write tiles into a temporary array that will fit into fixed-size blocks on reading, this PR implements a bin partitioning function that returns a `std::vector` of cell partition boundaries, such that each group of cells has a total byte count no greater than a specified number of bytes. Also returned is a `std::vector` of the bin sizes (in number of bytes). This is the first file to go into the new sm/query/external_sort directory, so this PR also creates that directory and populates it with a test subdirectory and appropriate CMakeLists.txt files. A previous PR created a doc subdirectory and included the SVG image of our design for external sort. --- TYPE: FEATURE DESC: Implement a partitioning function to partition cells to fit into fixed size bins --------- Co-authored-by: Luc Rancourt <[email protected]>
- Loading branch information
Showing
5 changed files
with
238 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# | ||
# tiledb/sm/query/external_sort/CMakeLists.txt | ||
# | ||
# The MIT License | ||
# | ||
# Copyright (c) 2024 TileDB, Inc. | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in | ||
# all copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
# THE SOFTWARE. | ||
# | ||
|
||
# Load the project's `common` CMake module. NO_POLICY_SCOPE lets any policy | ||
# settings made by that module apply to this directory's scope as well. | ||
include(common NO_POLICY_SCOPE) | ||
|
||
# Project-defined command (not shown here); presumably registers this | ||
# directory's `test` subdirectory with the build -- confirm in `common`. | ||
add_test_subdirectory() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
/** | ||
* @file tiledb/sm/query/external_sort/partition.h | ||
* | ||
* @section LICENSE | ||
* | ||
* The MIT License | ||
* | ||
* @copyright Copyright (c) 2024 TileDB, Inc. | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
* | ||
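 * @section DESCRIPTION | ||
 * | ||
 * This file defines the `bin_partition` function, which splits a sequence of | ||
 * cells into consecutive groups (bins) such that the total number of bytes in | ||
 * each bin does not exceed a given bin size. | ||
 * | ||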
*/ | ||
|
||
#ifndef TILEDB_PARTITION_H | ||
#define TILEDB_PARTITION_H | ||
|
||
#include <cassert> | ||
#include <cstddef> | ||
#include <cstdint> | ||
#include <list> | ||
#include <stdexcept> | ||
#include <tuple> | ||
#include <vector> | ||
|
||
/**
 * @brief Partition consecutive cells into bins whose total size is less than
 * or equal to a given number of bytes.
 *
 * Cells are assigned greedily, in order: a cell is appended to the current bin
 * if it still fits, otherwise the current bin is closed and the cell starts a
 * new one. The size of a cell is `fixed_bytes_per_cell` plus the sum, over all
 * entries of `sizes`, of that entry's value at the cell's index. The varlength
 * elements are assumed to be of type `char`, so element counts are byte counts.
 *
 * The function is `inline` because it is defined in a header; without it,
 * including this header from more than one translation unit would produce
 * multiple definitions.
 *
 * @param bin_size The maximum number of bytes in a bin. Must be positive.
 * @param num_cells The total number of cells to be partitioned. Must be
 * positive.
 * @param fixed_bytes_per_cell The number of fixed bytes per cell. This
 * includes all of the non varlength elements in each cell, including the
 * elements that specify the sizes. Must be positive.
 * @param sizes One iterator per varlength field. Each iterator is indexed by
 * cell, so it must refer to a sequence holding at least `num_cells` values:
 * the number of varlength elements (bytes) of that field in each cell.
 * @return A tuple `(bins, bin_sizes)`. `bins` holds the bin boundaries as cell
 * indices: it starts with 0, ends with `num_cells`, and bin `i` covers the
 * half-open cell range `[bins[i], bins[i + 1])`. `bin_sizes[i]` is the total
 * number of bytes of the cells in bin `i`, so
 * `bins.size() == bin_sizes.size() + 1`.
 * @throws std::invalid_argument if a single cell is larger than `bin_size`,
 * since no partition can satisfy the size limit in that case.
 */
inline auto bin_partition(
    size_t bin_size,
    size_t num_cells,
    size_t fixed_bytes_per_cell,
    std::list<std::vector<uint64_t>::iterator>& sizes) {
  assert(bin_size > 0);
  assert(num_cells > 0);
  assert(fixed_bytes_per_cell > 0);

  // `bins` always starts with the first cell of the first bin.
  std::vector<uint64_t> bins{0};
  std::vector<uint64_t> bin_sizes;

  // Number of bytes accumulated so far in the bin currently being filled.
  size_t current_size{0};

  for (size_t cell = 0; cell < num_cells; ++cell) {
    // Total footprint of this cell: fixed part plus every varlength field.
    size_t cell_bytes{fixed_bytes_per_cell};
    for (const auto& field_sizes : sizes) {
      cell_bytes += field_sizes[cell] * sizeof(char);
    }

    // A cell that cannot fit even in an empty bin can never be placed.
    // Report it instead of retrying the same cell forever.
    if (cell_bytes > bin_size) {
      throw std::invalid_argument(
          "bin_partition: a single cell is larger than the bin size");
    }

    // Close the current bin if this cell would overflow it; the cell then
    // becomes the first cell of the next bin.
    if (current_size + cell_bytes > bin_size) {
      bins.push_back(cell);
      bin_sizes.push_back(current_size);
      current_size = 0;
    }
    current_size += cell_bytes;
  }

  // Close the final bin.
  bins.push_back(num_cells);
  bin_sizes.push_back(current_size);

  return std::make_tuple(std::move(bins), std::move(bin_sizes));
}
|
||
#endif // TILEDB_PARTITION_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# | ||
# tiledb/sm/query/external_sort/test/CMakeLists.txt | ||
# | ||
# The MIT License | ||
# | ||
# Copyright (c) 2024 TileDB, Inc. | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in | ||
# all copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
# THE SOFTWARE. | ||
# | ||
|
||
# Load the project's `unit_test` CMake module, which presumably provides the | ||
# commence/this_target_sources/conclude commands used below -- confirm. | ||
include(unit_test) | ||
|
||
# Declare the `partition` unit test; its only listed source is | ||
# unit_partition.cc. NOTE(review): the exact target name produced by these | ||
# project macros is not visible from this file. | ||
commence(unit_test partition) | ||
this_target_sources( | ||
unit_partition.cc | ||
) | ||
conclude(unit_test) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/** | ||
* @file unit_partition.cc | ||
* | ||
* @section LICENSE | ||
* | ||
* The MIT License | ||
* | ||
* @copyright Copyright (c) 2024 TileDB, Inc. | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
* | ||
* @section DESCRIPTION | ||
* | ||
 * This file implements unit tests for the bin_partition function. | ||
*/ | ||
|
||
#include <catch2/catch_all.hpp> | ||
#include <vector> | ||
#include "tiledb/common/util/var_length_util.h" | ||
#include "tiledb/sm/query/external_sort/partition.h" | ||
|
||
// Always-passing placeholder case: it asserts only `true`, so it checks that | ||
// this test file builds and runs, not any behavior of bin_partition. | ||
TEST_CASE("partition: Null test", "[partition][null_test]") { | ||
REQUIRE(true); | ||
} |
|
||
/**
 * Checks `bin_partition` on seven cells with two varlength fields, a fixed
 * cost of 24 bytes per cell and a bin size of 256 bytes.
 */
TEST_CASE("partition: sized", "[partition]") {
  // Element counts of the two varlength fields, one entry per cell.
  std::vector<uint64_t> o{8, 6, 7, 5, 3, 0, 9};
  std::vector<uint64_t> p{3, 1, 4, 1, 5, 9, 2};

  REQUIRE(o.size() == p.size());
  const size_t num_cells = o.size();
  const size_t bin_size = 256;
  const size_t fixed_bytes_per_cell = 24;

  // Scale the counts by 8 to obtain the number of bytes each field
  // contributes to its cell.
  for (size_t i = 0; i < num_cells; ++i) {
    o[i] *= 8;
    p[i] *= 8;
  }

  // Bytes per cell (o + p + fixed):
  //   {112, 80, 112, 72, 88, 96, 112}
  // Cumulative bytes over all cells, with /**/ marking where a bin closes:
  //   {112, 192, /**/ 304, 376, /**/ 464, 560, /**/ 672}
  // Running total within each bin:
  //   {112, 192, /**/ 112, 184, /**/ 88, 184, /**/ 112}

  std::list<std::vector<uint64_t>::iterator> sizes{begin(o), begin(p)};

  const auto [bins, bin_sizes] =
      bin_partition(bin_size, num_cells, fixed_bytes_per_cell, sizes);

  // Bin i covers cells [expected_bins[i], expected_bins[i + 1]).
  const std::vector<uint64_t> expected_bins{0, 2, 4, 6, 7};
  const std::vector<uint64_t> expected_sizes{192, 184, 184, 112};

  CHECK(bins == expected_bins);
  CHECK(bin_sizes == expected_sizes);
}