Skip to content

Commit

Permalink
Implement bin partitioning function (#5092)
Browse files Browse the repository at this point in the history
To write tiles into a temporary array that will fit into fixed-size
blocks on reading, this PR implements a bin partitioning function that
returns a `std::vector` of cell partition boundaries, such that each group
of cells has a total byte count no greater than a specified number of
bytes. Also returned is a `std::vector` of the bin sizes (in number of bytes).

This is the first file to go into the new sm/external_sort, so it also
creates the directory and populates it with a test subdirectory and
appropriate CMakeLists.txt files.

A previous PR creates a doc subdirectory and includes the svg image of
our design for external sort.

---
TYPE: FEATURE
DESC: Implement a partitioning function to partition cells to fit into
fixed size bins

---------

Co-authored-by: Luc Rancourt <[email protected]>
  • Loading branch information
lums658 and KiterLuc authored Jun 18, 2024
1 parent 25933f9 commit 40af4f6
Show file tree
Hide file tree
Showing 5 changed files with 238 additions and 0 deletions.
1 change: 1 addition & 0 deletions tiledb/sm/query/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
include(common NO_POLICY_SCOPE)
add_subdirectory(ast)
add_subdirectory(deletes_and_updates)
add_subdirectory(external_sort)
add_subdirectory(readers)

#
Expand Down
29 changes: 29 additions & 0 deletions tiledb/sm/query/external_sort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#
# tiledb/sm/query/external_sort/CMakeLists.txt
#
# The MIT License
#
# Copyright (c) 2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

# Load the `common` CMake module. NO_POLICY_SCOPE lets any policy settings made
# by that module remain in effect in this file.
include(common NO_POLICY_SCOPE)

# NOTE(review): add_test_subdirectory() is a project macro not defined in this
# file; presumably it adds the sibling `test` directory to the build when tests
# are enabled -- confirm against the `common` module.
add_test_subdirectory()
98 changes: 98 additions & 0 deletions tiledb/sm/query/external_sort/partition.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/**
* @file tiledb/sm/query/external_sort/partition.h
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
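* @section DESCRIPTION
*
* This file defines `bin_partition`, a function that groups consecutive cells
* into bins whose total byte counts do not exceed a given bin size.
*/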

#ifndef TILEDB_PARTITION_H
#define TILEDB_PARTITION_H

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <list>
#include <stdexcept>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

/**
 * @brief Partition a sequence of cells into consecutive bins whose total byte
 * counts are each less than or equal to a given number of bytes.
 *
 * Cells are assigned greedily in index order: a cell is added to the current
 * bin as long as the bin's total stays within `bin_size`; otherwise the
 * current bin is closed and a new bin is started at that cell.
 *
 * The byte count of cell `i` is `fixed_bytes_per_cell` plus the sum of
 * `it[i]` over every iterator `it` in `sizes`. The var-length elements are
 * assumed to be of type `char`, so an element count equals a byte count.
 *
 * @param bin_size The maximum number of bytes in a bin. Must be > 0.
 * @param num_cells The total number of cells to be partitioned. Must be > 0.
 * @param fixed_bytes_per_cell The number of fixed bytes per cell. This
 * includes all of the non var-length elements in each cell, including the
 * elements that specify the sizes. Must be > 0.
 * @param sizes One iterator per var-length field. Each iterator must be
 * indexable over `[0, num_cells)` and yields the number of var-length
 * elements (bytes) that field contributes to each cell.
 * @return A tuple `(bins, bin_sizes)`. `bins` holds the cell-index boundaries
 * of the partition: bin `k` covers cells `[bins[k], bins[k + 1])`, the first
 * entry is `0` and the last is `num_cells`. `bin_sizes[k]` is the total number
 * of bytes in bin `k`, so `bin_sizes.size() == bins.size() - 1`.
 * @throws std::invalid_argument if a single cell is larger than `bin_size`
 * and therefore cannot be placed in any bin.
 */
// `inline` because this is a non-template function defined in a header; without
// it, including the header from two translation units is a link error.
inline auto bin_partition(
    size_t bin_size,
    size_t num_cells,
    size_t fixed_bytes_per_cell,
    std::list<std::vector<uint64_t>::iterator>& sizes) {
  assert(bin_size > 0);
  assert(num_cells > 0);
  assert(fixed_bytes_per_cell > 0);

  // `bins` always starts with the left boundary of the first bin.
  std::vector<uint64_t> bins{0};
  std::vector<uint64_t> bin_sizes;

  // Number of bytes accumulated in the bin that is currently being filled.
  size_t current_size{0};

  for (size_t current_index = 0; current_index < num_cells; ++current_index) {
    // Total bytes of this cell: fixed part plus every var-length field.
    size_t cell_size{fixed_bytes_per_cell};
    for (const auto& field_sizes : sizes) {
      cell_size += field_sizes[current_index] * sizeof(char);
    }

    // A cell that does not fit into an empty bin can never be placed. Without
    // this check the partitioning would emit empty bins forever.
    if (cell_size > bin_size) {
      throw std::invalid_argument(
          "bin_partition: cell " + std::to_string(current_index) +
          " requires " + std::to_string(cell_size) +
          " bytes, which exceeds the bin size of " +
          std::to_string(bin_size) + " bytes");
    }

    // Close the current bin if this cell would overflow it; the cell then
    // becomes the first cell of the next bin.
    if (current_size + cell_size > bin_size) {
      bins.push_back(current_index);
      bin_sizes.push_back(current_size);
      current_size = 0;
    }
    current_size += cell_size;
  }

  // Close the final bin.
  bins.push_back(num_cells);
  bin_sizes.push_back(current_size);

  return std::make_tuple(std::move(bins), std::move(bin_sizes));
}

#endif // TILEDB_PARTITION_H
33 changes: 33 additions & 0 deletions tiledb/sm/query/external_sort/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# tiledb/sm/query/external_sort/test/CMakeLists.txt
#
# The MIT License
#
# Copyright (c) 2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

# Load the `unit_test` CMake module.
# NOTE(review): commence/this_target_sources/conclude are project macros not
# defined in this file; presumably they open, populate and close the definition
# of a unit-test target for `partition` -- confirm against the module.
include(unit_test)

commence(unit_test partition)
# The only source file listed for this target.
this_target_sources(
unit_partition.cc
)
conclude(unit_test)
77 changes: 77 additions & 0 deletions tiledb/sm/query/external_sort/test/unit_partition.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* @file unit_partition.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @section DESCRIPTION
*
* This file implements unit tests for the `bin_partition` function.
*/

#include <catch2/catch_all.hpp>
#include <vector>
#include "tiledb/common/util/var_length_util.h"
#include "tiledb/sm/query/external_sort/partition.h"

// Trivially passing test case: its only assertion is `REQUIRE(true)`, so it
// succeeds whenever this test file compiles and runs.
TEST_CASE("partition: Null test", "[partition][null_test]") {
REQUIRE(true);
}

// Partitions seven cells, each with two var-length fields and 24 fixed bytes,
// into bins of at most 256 bytes and checks the resulting boundaries and sizes.
TEST_CASE("partition: sized", "[partition]") {
  // Per-cell element counts of the two var-length fields; they are scaled by
  // 8 in the loop below before being handed to bin_partition.
  std::vector<uint64_t> o{8, 6, 7, 5, 3, 0, 9};
  std::vector<uint64_t> p{3, 1, 4, 1, 5, 9, 2};
  REQUIRE(o.size() == p.size());

  const size_t num_cells = o.size();
  const size_t bin_size = 256;
  const int fixed_bytes_per_cell = 24;

  // NOTE(review): o_bytes/p_bytes start out equal to the scaled o/p values and
  // are then scaled by 8 again, so sum_bytes/byte_offsets do not match the
  // totals listed below and are never asserted on -- confirm intent.
  std::vector<uint64_t> o_bytes{64, 48, 56, 40, 24, 0, 72};
  std::vector<uint64_t> p_bytes{24, 8, 32, 8, 40, 72, 16};
  std::vector<uint64_t> sum_bytes(num_cells);
  for (size_t cell = 0; cell != num_cells; ++cell) {
    o[cell] *= 8;
    p[cell] *= 8;
    o_bytes[cell] *= 8;
    p_bytes[cell] *= 8;
    sum_bytes[cell] = o_bytes[cell] + p_bytes[cell] + fixed_bytes_per_cell;
  }

  std::vector<uint64_t> byte_offsets(num_cells + 1);
  lengths_to_offsets(sum_bytes, byte_offsets);

  // Running totals of o[i] + p[i] + 24, with /**/ marking where a 256-byte
  // bin fills up:
  //   cumulative:      {112, 192, /**/ 304, 376, /**/ 464, 560, /**/ 672}
  //   reset per bin:   {112, 192, /**/ 112, 184, /**/ 88, 184, /**/ 112}
  std::list<std::vector<uint64_t>::iterator> sizes{o.begin(), p.begin()};

  const std::vector<uint64_t> expected_bins{0, 2, 4, 6, 7};
  const std::vector<uint64_t> expected_sizes{192, 184, 184, 112};

  auto&& [bins, bin_sizes] =
      bin_partition(bin_size, num_cells, fixed_bytes_per_cell, sizes);

  CHECK(bins == expected_bins);
  CHECK(bin_sizes == expected_sizes);
}

0 comments on commit 40af4f6

Please sign in to comment.