Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement bin partitioning function #5092

Merged
merged 3 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tiledb/sm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ add_subdirectory(compressors)
add_subdirectory(config)
add_subdirectory(consolidator)
add_subdirectory(crypto)
add_subdirectory(external_sort)
add_subdirectory(filesystem)
add_subdirectory(filter)
add_subdirectory(fragment)
Expand Down
29 changes: 29 additions & 0 deletions tiledb/sm/external_sort/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#
# tiledb/sm/external_sort/CMakeLists.txt
#
# The MIT License
#
# Copyright (c) 2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

# Load the project's `common` CMake module. NO_POLICY_SCOPE makes any policy
# settings made by that module stay in effect in this file after the include.
include(common NO_POLICY_SCOPE)

# NOTE(review): add_test_subdirectory() is a project macro defined outside
# this file; presumably it adds the sibling `test/` directory when tests are
# enabled -- confirm against its definition.
add_test_subdirectory()
98 changes: 98 additions & 0 deletions tiledb/sm/external_sort/partition.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/**
* @file tiledb/sm/external_sort/partition.h
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
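* @section DESCRIPTION
*
* This file defines `bin_partition`, which greedily groups consecutive cells
* into bins whose total byte size does not exceed a given limit.
*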
*/

#ifndef TILEDB_PARTITION_H
#define TILEDB_PARTITION_H

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <list>
#include <stdexcept>
#include <tuple>
#include <vector>

/**
 * @brief Partition a run of cells into consecutive bins, each of which holds
 * at most `bin_size` bytes.
 *
 * Cells are visited in order and assigned greedily: a cell is appended to the
 * current bin while the bin still fits within `bin_size`; otherwise the
 * current bin is closed and the cell starts a new one. The byte size of a
 * cell is `fixed_bytes_per_cell` plus the sum, over every entry of `sizes`,
 * of that entry's value for the cell.
 *
 * @param bin_size The maximum number of bytes in a bin. Must be positive.
 * @param num_cells The total number of cells to be partitioned. Must be
 * positive.
 * @param fixed_bytes_per_cell The number of fixed bytes per cell. This
 * includes all of the non varlength elements in each cell, including the
 * elements that specify the sizes. Must be positive.
 * @param sizes One iterator per variable-length field. Each iterator must be
 * indexable for `num_cells` entries; entry `i` is the number of
 * variable-length bytes that field contributes to cell `i` (the elements are
 * assumed to be `char`, so element count equals byte count). The list is not
 * modified.
 * @return A tuple `(bins, bin_sizes)`. `bins` holds the cell index at which
 * each bin starts, followed by `num_cells`, so bin `k` covers cells
 * `[bins[k], bins[k + 1])`. `bin_sizes[k]` is the number of bytes in bin `k`.
 * `bins` therefore has exactly one more entry than `bin_sizes`.
 * @throws std::invalid_argument if any single cell is larger than `bin_size`
 * and therefore cannot be placed in any bin.
 */
inline auto bin_partition(
    size_t bin_size,
    size_t num_cells,
    size_t fixed_bytes_per_cell,
    std::list<std::vector<uint64_t>::iterator>& sizes) {
  assert(bin_size > 0);
  assert(num_cells > 0);
  assert(fixed_bytes_per_cell > 0);

  // The first bin always starts at cell 0.
  std::vector<uint64_t> bins{0};
  std::vector<uint64_t> bin_sizes;

  size_t current_size{0};
  for (size_t cell = 0; cell < num_cells; ++cell) {
    size_t cell_size{fixed_bytes_per_cell};
    for (const auto& field : sizes) {
      cell_size += field[cell];
    }

    // A cell that cannot fit even in an empty bin can never be placed.
    // Report it instead of emitting empty bins forever.
    if (cell_size > bin_size) {
      throw std::invalid_argument(
          "bin_partition: a single cell is larger than the bin size");
    }

    // Close the current bin when this cell would overflow it; the cell then
    // becomes the first member of the next bin.
    if (current_size + cell_size > bin_size) {
      bins.push_back(cell);
      bin_sizes.push_back(current_size);
      current_size = 0;
    }
    current_size += cell_size;
  }

  // Close the final (possibly partially filled) bin.
  bins.push_back(num_cells);
  bin_sizes.push_back(current_size);

  return std::make_tuple(std::move(bins), std::move(bin_sizes));
}

#endif // TILEDB_PARTITION_H
33 changes: 33 additions & 0 deletions tiledb/sm/external_sort/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# tiledb/sm/external_sort/test/CMakeLists.txt
#
# The MIT License
#
# Copyright (c) 2024 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

# NOTE(review): `unit_test`, commence(), this_target_sources() and conclude()
# are project-provided and defined outside this file; the comments below
# describe the apparent intent -- confirm against their definitions.
include(unit_test)

# Presumably declares a unit-test target named `partition` whose only source
# is unit_partition.cc.
commence(unit_test partition)
this_target_sources(
unit_partition.cc
)
conclude(unit_test)
77 changes: 77 additions & 0 deletions tiledb/sm/external_sort/test/unit_partition.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* @file unit_partition.cc
*
* @section LICENSE
*
* The MIT License
*
* @copyright Copyright (c) 2024 TileDB, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
* @section DESCRIPTION
*
* This file implements unit tests for the bin_partition function.
*/

#include <catch2/catch_all.hpp>
#include <vector>
#include "tiledb/common/util/var_length_util.h"
#include "tiledb/sm/external_sort/partition.h"

// Smoke test: always passes. It exercises nothing in partition.h; it only
// shows that this test file compiles and that test cases in it run.
TEST_CASE("partition: Null test", "[partition][null_test]") {
REQUIRE(true);
}

// Partitions seven cells with two variable-length fields into 256-byte bins
// and checks both the bin boundaries and the bytes placed in each bin.
TEST_CASE("partition: sized", "[partition]") {
  // Element counts of the two variable-length fields, one entry per cell.
  std::vector<uint64_t> o{8, 6, 7, 5, 3, 0, 9};
  std::vector<uint64_t> p{3, 1, 4, 1, 5, 9, 2};
  REQUIRE(o.size() == p.size());

  const size_t num_cells = o.size();
  const size_t bin_size = 256;
  const size_t fixed_bytes_per_cell = 24;

  // bin_partition works in bytes; every element here is 8 bytes wide, so
  // convert the element counts in place and total up the bytes to partition.
  const uint64_t bytes_per_element = 8;
  uint64_t total_bytes = 0;
  for (size_t i = 0; i < num_cells; ++i) {
    o[i] *= bytes_per_element;
    p[i] *= bytes_per_element;
    total_bytes += o[i] + p[i] + fixed_bytes_per_cell;
  }
  // Bytes per cell:         {112,  80, 112,  72,  88,  96, 112}
  // Running total overall:  {112, 192, /**/ 304, 376, /**/ 464, 560, /**/ 672}
  // Running total per bin:  {112, 192, /**/ 112, 184, /**/  88, 184, /**/ 112}

  std::list<std::vector<uint64_t>::iterator> sizes{o.begin(), p.begin()};

  auto&& [bins, bin_sizes] =
      bin_partition(bin_size, num_cells, fixed_bytes_per_cell, sizes);

  const std::vector<uint64_t> expected_bins{0, 2, 4, 6, 7};
  const std::vector<uint64_t> expected_sizes{192, 184, 184, 112};
  CHECK(bins == expected_bins);
  CHECK(bin_sizes == expected_sizes);

  // Structural invariants of any valid partition: one more boundary than
  // bins, no bin over the limit, and no bytes lost or duplicated.
  REQUIRE(bins.size() == bin_sizes.size() + 1);
  uint64_t partitioned_bytes = 0;
  for (const uint64_t bytes_in_bin : bin_sizes) {
    CHECK(bytes_in_bin <= bin_size);
    partitioned_bytes += bytes_in_bin;
  }
  CHECK(partitioned_bytes == total_bytes);
}
Loading