From a8f12eaa45f19521f6a652215a4830fd1547d570 Mon Sep 17 00:00:00 2001 From: "Paul J. Davis" Date: Thu, 6 Apr 2023 11:17:53 -0500 Subject: [PATCH] Add Enumerated Data Types TODO: Write a better commit message --- CMakeLists.txt | 7 +- PJD_TODO.md | 33 + scripts/generate-coverage-report.py | 481 +++++++++ test/CMakeLists.txt | 1 + test/src/unit-cppapi-deletes.cc | 36 +- test/src/unit-enumerations.cc | 922 ++++++++++++++++++ .../src/array_schema_params_generator.h | 176 ++++ tiledb/CMakeLists.txt | 4 + tiledb/api/c_api/CMakeLists.txt | 3 + .../api/c_api/dimension_label/CMakeLists.txt | 1 + .../c_api/dimension_label/test/CMakeLists.txt | 1 + tiledb/api/c_api/enumeration/CMakeLists.txt | 40 + .../api/c_api/enumeration/enumeration_api.cc | 204 ++++ .../enumeration_api_experimental.h | 230 +++++ .../enumeration/enumeration_api_internal.h | 124 +++ .../api/c_api/enumeration/test/CMakeLists.txt | 32 + .../compile_capi_enumeration_stub_main.cc | 36 + .../enumeration/test/unit_capi_enumeration.cc | 430 ++++++++ tiledb/sm/array/array.cc | 26 + tiledb/sm/array/array.h | 5 + tiledb/sm/array/array_directory.cc | 37 +- tiledb/sm/array/array_directory.h | 12 + tiledb/sm/array/test/CMakeLists.txt | 2 +- tiledb/sm/array_schema/CMakeLists.txt | 25 +- tiledb/sm/array_schema/array_schema.cc | 95 +- tiledb/sm/array_schema/array_schema.h | 52 + .../sm/array_schema/array_schema_evolution.cc | 98 +- .../sm/array_schema/array_schema_evolution.h | 36 +- tiledb/sm/array_schema/attribute.cc | 36 +- tiledb/sm/array_schema/attribute.h | 18 +- tiledb/sm/array_schema/enumeration.cc | 230 +++++ tiledb/sm/array_schema/enumeration.h | 229 +++++ .../test/compile_enumeration_main.cc | 36 + tiledb/sm/c_api/tiledb.cc | 97 +- tiledb/sm/c_api/tiledb_experimental.h | 86 ++ tiledb/sm/cpp_api/array_experimental.h | 56 ++ tiledb/sm/cpp_api/array_schema_experimental.h | 21 + tiledb/sm/cpp_api/attribute_experimental.h | 67 ++ tiledb/sm/cpp_api/deleter.h | 4 + tiledb/sm/cpp_api/enumeration_experimental.h | 257 
+++++ tiledb/sm/cpp_api/tiledb_experimental | 3 + tiledb/sm/enums/layout.h | 4 +- tiledb/sm/misc/constants.cc | 11 + tiledb/sm/misc/constants.h | 12 + tiledb/sm/query/ast/CMakeLists.txt | 2 +- tiledb/sm/query/ast/query_ast.cc | 104 ++ tiledb/sm/query/ast/query_ast.h | 138 ++- tiledb/sm/query/query.cc | 26 + tiledb/sm/query/query_condition.cc | 24 + tiledb/sm/query/query_condition.h | 29 + tiledb/sm/storage_manager/storage_manager.cc | 39 +- tiledb/sm/tile/CMakeLists.txt | 7 +- tiledb/storage_format/uri/CMakeLists.txt | 2 +- 53 files changed, 4596 insertions(+), 91 deletions(-) create mode 100644 PJD_TODO.md create mode 100755 scripts/generate-coverage-report.py create mode 100644 test/src/unit-enumerations.cc create mode 100644 test/support/src/array_schema_params_generator.h create mode 100644 tiledb/api/c_api/enumeration/CMakeLists.txt create mode 100644 tiledb/api/c_api/enumeration/enumeration_api.cc create mode 100644 tiledb/api/c_api/enumeration/enumeration_api_experimental.h create mode 100644 tiledb/api/c_api/enumeration/enumeration_api_internal.h create mode 100644 tiledb/api/c_api/enumeration/test/CMakeLists.txt create mode 100644 tiledb/api/c_api/enumeration/test/compile_capi_enumeration_stub_main.cc create mode 100644 tiledb/api/c_api/enumeration/test/unit_capi_enumeration.cc create mode 100644 tiledb/sm/array_schema/enumeration.cc create mode 100644 tiledb/sm/array_schema/enumeration.h create mode 100644 tiledb/sm/array_schema/test/compile_enumeration_main.cc create mode 100644 tiledb/sm/cpp_api/array_experimental.h create mode 100644 tiledb/sm/cpp_api/attribute_experimental.h create mode 100644 tiledb/sm/cpp_api/enumeration_experimental.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 3910e00ef685..4a2b669127c3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,7 +219,12 @@ else() elseif (CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") add_compile_options(-DNDEBUG -O3 -g3 -ggdb3 -gdwarf-3) elseif (CMAKE_BUILD_TYPE MATCHES "Coverage") - 
add_compile_options(-DDEBUG -g3 -gdwarf-3 --coverage) + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-DDEBUG --coverage -fprofile-instr-generate -fcoverage-mapping) + add_link_options(--coverage -fprofile-instr-generate -fcoverage-mapping) + else() + add_compile_options(-DDEBUG -g3 -gdwarf-3 --coverage) + endif() endif() # Use -Wno-literal-suffix on Linux with C++ sources. diff --git a/PJD_TODO.md b/PJD_TODO.md new file mode 100644 index 000000000000..5667b5fb8fad --- /dev/null +++ b/PJD_TODO.md @@ -0,0 +1,33 @@ +Chores Left +=== + +* Testing + * Update C API unit tests to cover 100% of the Enumeration constructor + * Throw error if enumeration has longer length than the integer width of the attribute + * Require the attribute type to be integral when setting an enumeration + * Don't allow signed integer values for attributes with enumerations? Does R need this? + * Require cell_val_num == 1 for attributes + + * Errors (de?)serializing to disk + +* Miscellany + * Add Enumeration::dump(FILE* out) + * Update schema consolidation and vacuuming for enumerations + * When serializing array schema, check that any enumerations set on attributes actually exist + * Array::get_enumeration - might have to check all load schema versions? + * Do loaded enumerations need to be serialized? + * QueryAstNode rewrite_enumeration_condition - Need to adjust bit widths + * Document enumeration format changes in the storage format docs + +* REST Serialization TODOs + * Enumerations initial implementation + * Add ASTNode::use_enumeration to serialization code - If not, all remote queries are against enumeration values + * Add attribute enumeration name + +* Use generate_uri instead of reimplementing the thinger if I did that + + +* Coverage Reports + * Add totals row to TOC + * Do we generate files for headers with no executable lines? 
If not, track which ones are fake so that headers are green in the TOC when they only have non-executable changes + * Add jump-to-first-uncovered link diff --git a/scripts/generate-coverage-report.py b/scripts/generate-coverage-report.py new file mode 100755 index 000000000000..49054348a12c --- /dev/null +++ b/scripts/generate-coverage-report.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python3 + +import html +import os +import re +import subprocess as sp +import sys +import textwrap as tw +import time + +LLVM_PROFDATA = "/Library/Developer/CommandLineTools/usr/bin/llvm-profdata" +LLVM_COV = "/Library/Developer/CommandLineTools/usr/bin/llvm-cov" + +RANGE_RE = re.compile(r"@@\s-(\d+)(,(\d+))?\s+\+(?P<start>\d+)(,(?P<length>\d+))?\s+@@") + +# From: https://stackoverflow.com/a/14693789 +ANSI_CODE_RE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + +SOURCE_EXTENSIONS = set([ + ".h", + ".c", + ".c++", + ".cc", + ".cpp", + ".hpp" +]) + +def log(*args, **kwargs): + kwargs["file"] = sys.stderr + print(*args, **kwargs) + +def is_source_file(fname): + if fname is None: + return False + (_, ext) = os.path.splitext(fname) + return ext in SOURCE_EXTENSIONS + +def find_source_dir(): + if not os.path.exists("CMakeCache.txt"): + print("Error finding CMakeCache.txt") + print("Are you running in your build directory?") + exit(2) + with open("CMakeCache.txt") as handle: + for line in handle: + bits = line.split("=", 1) + if len(bits) != 2: + continue + if bits[0] == "CMAKE_HOME_DIRECTORY:INTERNAL": + return bits[1].strip() + print("Failed to find source directory in CMakeCache.txt") + exit(2) + +def generate_file_list(): + source_dir = find_source_dir() + cmd = [ + "git", + "-C", + source_dir, + "--no-pager", + "diff", + "--unified=0", + "FETCH_HEAD" + ] + output = sp.check_output(cmd).decode("utf-8") + ret = {} + fname = None + ranges = [] + for line in output.splitlines(): + if line.startswith("+++ b/"): + if is_source_file(fname): + ret[fname] = ranges + fname = 
line[len("+++ b/"):].strip() + ranges = [] + elif line.startswith("@@"): + match = RANGE_RE.search(line.strip()) + if match is None: + log("Failed to parse: '{}'".format(line.strip())) + exit(3) + add_start = match.group("start") + add_len = match.group("length") + if add_len is None: + add_len = 1 + else: + add_len = int(add_len) + if add_len == 0: + continue + source_marker = line.rsplit("@@")[-1].strip() + ranges.append((int(add_start), int(add_len), source_marker)) + if is_source_file(fname): + ret[fname] = ranges + return ret + +def update_profdata(): + cmd = [ + LLVM_PROFDATA, + "merge", + "-sparse", + "default.profraw", + "-o", "default.profdata" + ] + sp.check_call(cmd) + +def generate_text_reports(file_info, target="tiledb/test/tiledb_unit"): + file_list = file_info.keys() + file_list = list(sorted([os.path.join("..", fname) for fname in file_list])) + for fname in file_list: + log(fname) + cmd = [ + LLVM_COV, + "show", + "--instr-profile=default.profdata", + "--Xdemangler=c++filt", + "--Xdemangler=--no-strip-underscore", + "--output-dir=.", + target + ] + file_list + sp.check_call(cmd) + +def load_text_reports(file_info): + ret = {} + prev_was_dashes = None + curr_fname = None + curr_template = None + curr_lines = None + for path, dnames, fnames in os.walk("coverage"): + for fname in fnames: + if os.path.splitext(fname)[1] != ".txt": + continue + if curr_fname is not None: + ret[curr_fname] = curr_lines + fname = os.path.join(path, fname) + curr_fname = fname + curr_lines = [] + with open(fname) as handle: + data = handle.read() + data = ANSI_CODE_RE.sub('', data) + for (idx, line) in enumerate(data.splitlines()): + if idx < 3: + continue + line = line.strip() + if not line: + continue + if prev_was_dashes and line.startswith("|"): + curr_template = line[1:].strip().rstrip(":") + continue + else: + curr_template = None + if all(c == "-" for c in line): + prev_was_dashes = True + continue + else: + prev_was_dashes = False + bits = line.strip().split("|", 
2) + assert len(bits) == 3, line.strip() + " " + fname + line = int(bits[0]) + count = None + if bits[1].strip(): + count = bits[1] + if count.endswith("E"): + # I have no idea if this is correct, but the data seems to make + # sense that this is a bug where it should be E1 i.e., * 10^1 + count = int(float(count[:-1]) * 100) + elif count.endswith("k"): + count = int(float(count[:-1]) * 1000) + elif count.endswith("M"): + count = int(float(count[:-1]) * 1000000) + else: + count = int(bits[1].strip()) + curr_lines.append((line, count, curr_template, bits[2].rstrip())) + if curr_fname is not None: + ret[curr_fname] = curr_lines + for fname in file_info: + if coverage_key(fname) not in ret: + ret[coverage_key(fname)] = fake_txt_report(fname) + return ret + +def coverage_key(fname): + base = os.path.join("coverage", find_source_dir().lstrip(os.sep)) + return os.path.join(base, fname) + ".txt" + +def fake_txt_report(fname): + abs_path = os.path.join(find_source_dir(), fname) + ret = [] + with open(abs_path) as handle: + for (idx, line) in enumerate(handle): + ret.append((idx + 1, None, None, line.rstrip())) + return ret + +def render_html(file_info, txt_reports): + print(tw.dedent("""\ + + + + + + + + + """.format(css=css()))) + render_toc(file_info, txt_reports) + for fname in sorted(file_info.keys()): + render_file(fname, file_info[fname], txt_reports[coverage_key(fname)]) + print(tw.dedent("""\ + + + """)) + +def render_toc(file_info, txt_reports): + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + for fname in file_info: + (lines, executable, executed, coverage, row_class) = calculate_coverage(fname, file_info, txt_reports) + anchor = fname.replace(os.sep, "-") + print(' '.format(row_class)) + print(' '.format(anchor=anchor, fname=fname)) + print(' '.format(lines)) + print(' '.format(executable)) + print(' '.format(executed)) + print(' '.format(coverage)) + print(' ') + print("
FilenameChangesExecutableExecutedCovered
{fname}{}{}{}{}
") + + +def calculate_coverage(fname, file_info, txt_reports): + regions = list(sorted(file_info[fname])) + # No regions mean we're picking up a file that we only removed lines from + if not regions: + return "", "", "", "", "covered-file" + lines = 0 + executable = 0 + executed = 0 + txt_report = txt_reports[coverage_key(fname)] + for line in txt_report: + if not regions: + break + if line[0] < regions[0][0]: + continue + if line[0] <= regions[0][0] + regions[0][1] - 1: + lines += 1 + if line[1] is None: + continue + executable += 1 + if line[1] > 0: + executed += 1 + if line[0] >= regions[0][0] + regions[0][1] - 1: + regions.pop(0) + # We had changes in this file but zero lines executed which means it wasn't + # run at all. Report the issue visually even though the numbers don't + # make much sense. + if executable == 0: + return lines, "", "", "", "uncovered-file" + coverage = 100.0 * float(executed) / float(executable) + if coverage < 20.0: + return lines, executable, executed, "{:.2f}%".format(coverage), "covered-file-warn" + if coverage < 80.0: + return lines, executable, executed, "{:.2f}%".format(coverage), "covered-file" + else: + return lines, executable, executed, "{:.2f}%".format(coverage), "covered-file-good" + +def render_file(fname, file_info, txt_report): + anchor = fname.replace(os.sep, "-") + source_args = {"anchor": anchor, "fname": fname} + # Should already be sorted but why rely on that assumption? 
+ changes = list(sorted(file_info)) + regions = add_context(file_info, 3) + print(" ") + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + print(' ') + for line in txt_report: + # if regions: + # print(regions) + # print(line) + if not regions: + break + if line[0] < regions[0][0]: + continue + if regions and line[0] == regions[0][0]: + print(' ') + print(' ') + print(' ') + change_value = "" + if changes and line[0] >= changes[0][0] and line[0] < changes[0][0] + changes[0][1]: + change_value = "+" + if line[1] is None: + row_class = "uncovered-line" + elif line[1] == 0: + row_class = "unexecuted-line" + else: + row_class = "executed-line" + else: + row_class = "uncovered-line" + print(' '.format(row_class)) + print(' '.format(line=line[0])) + if line[1] is None: + print(' ') + else: + print(' '.format(count=line[1])) + print(' '.format(change_value)) + print(' '.format(code=html.escape(line[3]))) + print(' ') + # Update changes and regions if we're on the last line of either + if changes and line[0] >= changes[0][0] + changes[0][1] - 1: + changes.pop(0) + if regions and line[0] >= regions[0][0] + regions[0][1] - 1: + regions.pop(0) + print("
') + print('
') + print('
{fname}
'.format(**source_args)) + print('
') + print('
Line
Count
Source
') + print('
') + print('
{}
'.format(html.escape(regions[0][2]))) + print('
') + print('
{line}
{count}
{}
{code}
") + +def add_context(regions, num_lines): + ret = [] + for region in regions: + new_start = max(0, region[0] - num_lines) + new_len = region[1] + 2 * num_lines + if ret and new_start <= ret[-1][0] + ret[-1][1]: + end_line = region[0] + region[1] + num_lines + ret[-1][1] = end_line - ret[-1][0] + else: + ret.append([new_start, new_len, region[2]]) + return list(sorted(tuple(row) for row in ret)) + +def main(): + file_info = generate_file_list() + update_profdata() + generate_text_reports(file_info) + txt_reports = load_text_reports(file_info) + render_html(file_info, txt_reports) + +def css(): + # This CSS was cribbed from the llvm-cov output + return tw.dedent(""" + body { + font-family: monospace; + } + pre { + margin-top: 0px !important; + margin-bottom: 0px !important; + } + table { + border-collapse: collapse; + border: 1px solid gray; + margin: auto; + margin-bottom: 2em; + } + th { + text-align: left; + } + td { + vertical-align: top; + padding: 2px 8px; + border-collapse: collapse; + border-right: solid 1px #eee; + border-left: solid 1px #eee; + text-align: left; + } + td pre { + display: inline-block; + } + td:first-child { + border-left: none; + } + td:last-child { + border-right: none; + } + tr:hover { + background-color: #f0f0f0; + } + .lines { + text-align: right; + } + .executable { + text-align: right; + } + .executed { + text-align: right; + } + .coverage { + text-align: right; + } + .source-file { + padding: 5px 10px; + border-bottom: 1px; + background-color: aliceblue; + line-height: 35px; + } + .source-marker { + padding: 5px 10px; + border-bottom: 1px; + background-color: azure; + line-height: 20px; + } + .changed { + text-align: center; + } + .uncovered-file { + background: palegoldenrod; + } + .covered-file { + } + .covered-file-warn { + background: lightpink; + } + .covered-file-good { + background: palegreen; + } + .line-number { + text-align: right; + } + .line-count { + text-align: right; + } + .uncovered-line { + + } + .unexecuted-line { 
+ background: lightpink; + } + .executed-line { + + } + /* CSS OVERRIDES */ + body { + max-width: 1024px; + padding: 0 16px; + margin: 0 auto; + } + table { + border: none; + background-color: #f9fafb; + border-radius: 6px; + overflow: hidden; + width: 100%; + box-shadow: 0 1px 3px rgba(0,0,0,.16); + } + th { + color: #444; + font-size: 14px; + line-height: 1.5; + padding: 4px; + } + tr { + line-height: 1.5; + transition: background 0.3s ease-in-out; + color: #444; + } + a { + text-decoration: none; + color: rgb(0, 112, 240); + border-color: rgb(0, 112, 240); + border-width: 0; + } + a:hover { + border-bottom-width: 1px; + border-style: solid; + } + /* END OF CSS OVERRIDES */ + """) + +if __name__ == "__main__": + main() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6930e294ac4c..80905d430912 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -176,6 +176,7 @@ set(TILEDB_UNIT_TEST_SOURCES src/unit-dimension.cc src/unit-duplicates.cc src/unit-empty-var-length.cc + src/unit-enumerations.cc src/unit-filter-buffer.cc src/unit-filter-pipeline.cc src/unit-global-order.cc diff --git a/test/src/unit-cppapi-deletes.cc b/test/src/unit-cppapi-deletes.cc index f7a941d771ac..0a5f61d0df23 100644 --- a/test/src/unit-cppapi-deletes.cc +++ b/test/src/unit-cppapi-deletes.cc @@ -109,6 +109,7 @@ struct DeletesFx { void remove_sparse_array(); void remove_array(const std::string& array_name); bool is_array(const std::string& array_name); + std::vector list_schemas(const std::string& array_name); }; DeletesFx::DeletesFx() @@ -488,6 +489,29 @@ bool DeletesFx::is_array(const std::string& array_name) { return vfs_.is_dir(array_name); } +std::vector DeletesFx::list_schemas( + const std::string& array_name) { + auto& enum_dir = tiledb::sm::constants::array_enumerations_dir_name; + auto schemas = + vfs_.ls(array_name + "/" + tiledb::sm::constants::array_schema_dir_name); + + auto it = schemas.begin(); + for (; it != schemas.end(); ++it) { + // Skip entries too short to end with the enumerations directory name. + if ((*it).size() < 
enum_dir.size()) { + continue; + } + if ((*it).substr((*it).size() - enum_dir.size()) == enum_dir) { + break; + } + } + if (it != schemas.end()) { + schemas.erase(it); + } + + return schemas; +} + TEST_CASE_METHOD( DeletesFx, "CPP API: Test writing delete condition", @@ -1949,8 +1973,7 @@ TEST_CASE_METHOD( // Check write CHECK(tiledb::test::num_commits(SPARSE_ARRAY_NAME) == 4); CHECK(tiledb::test::num_fragments(SPARSE_ARRAY_NAME) == 4); - auto schemas = - vfs_.ls(array_name + "/" + tiledb::sm::constants::array_schema_dir_name); + auto schemas = list_schemas(array_name); CHECK(schemas.size() == 1); auto meta = vfs_.ls( array_name + "/" + tiledb::sm::constants::array_metadata_dir_name); @@ -1984,8 +2007,7 @@ TEST_CASE_METHOD( // Check working directory after delete REQUIRE(vfs_.is_file(extraneous_file_path)); CHECK(tiledb::test::num_fragments(SPARSE_ARRAY_NAME) == 0); - schemas = - vfs_.ls(array_name + "/" + tiledb::sm::constants::array_schema_dir_name); + schemas = list_schemas(array_name); CHECK(schemas.size() == 0); meta = vfs_.ls( array_name + "/" + tiledb::sm::constants::array_metadata_dir_name); @@ -2042,8 +2064,7 @@ TEST_CASE_METHOD( vfs_.touch(extraneous_file_path); // Check write - auto schemas = - vfs_.ls(array_name + "/" + tiledb::sm::constants::array_schema_dir_name); + auto schemas = list_schemas(array_name); CHECK(schemas.size() == 1); auto uris = vfs_.ls(array_name); bool ok_exists = false; @@ -2072,8 +2093,7 @@ TEST_CASE_METHOD( CHECK(!tiledb::sm::utils::parse::starts_with(uri, ok_prefix)); } REQUIRE(vfs_.is_file(extraneous_file_path)); - schemas = - vfs_.ls(array_name + "/" + tiledb::sm::constants::array_schema_dir_name); + schemas = list_schemas(array_name); CHECK(schemas.size() == 0); remove_sparse_array(); diff --git a/test/src/unit-enumerations.cc b/test/src/unit-enumerations.cc new file mode 100644 index 000000000000..5bd1481eba2d --- /dev/null +++ b/test/src/unit-enumerations.cc @@ -0,0 +1,922 @@ +/** + * @file unit-enumerations.cc + * 
+ * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests the C++ API for enumeration related functions. 
+ */ + +#include + +#include "test/support/tdb_catch.h" +#include "tiledb/sm/array/array_directory.h" +#include "tiledb/sm/array_schema/array_schema.h" +#include "tiledb/sm/array_schema/attribute.h" +#include "tiledb/sm/array_schema/dimension.h" +#include "tiledb/sm/array_schema/domain.h" +#include "tiledb/sm/array_schema/enumeration.h" +#include "tiledb/sm/config/config.h" +#include "tiledb/sm/enums/array_type.h" +#include "tiledb/sm/enums/encryption_type.h" +#include "tiledb/sm/enums/layout.h" +#include "tiledb/sm/storage_manager/context.h" + +using namespace tiledb::sm; + +struct EnumerationFx { + EnumerationFx(); + ~EnumerationFx(); + + template + shared_ptr create_enumeration( + const std::vector& values, + bool ordered = false, + Datatype type = static_cast(255)); + + template + void check_enumeration( + shared_ptr enmr, + const std::vector& values, + Datatype data_type, + uint32_t cell_val_num, + bool ordered); + + template + void check_storage_serialization(const std::vector& values); + + template + void check_storage_deserialization(const std::vector& values); + + storage_size_t calculate_serialized_size(shared_ptr enmr); + WriterTile serialize_to_tile(shared_ptr enmr); + + template + std::vector as_vector(shared_ptr enmr); + + void create_array(); + shared_ptr array_directory(); + shared_ptr load_array_schema_latest(); + + + void rm_array(); + + URI uri_; + Config cfg_; + Context ctx_; + EncryptionKey enc_key_; +}; + +/* ********************************* */ +/* Testing Enumeration */ +/* ********************************* */ + +TEST_CASE_METHOD( + EnumerationFx, + "Basic Fixed Size Enumeration Creation", + "[enumeration][basic][fixed]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + check_enumeration(enmr, values, Datatype::UINT32, 1, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Basic Variable Size Enumeration Creation", + "[enumeration][basic][fixed]") { + std::vector values = {"foo", "bar", "baz", "bingo", 
"bango"}; + auto enmr = create_enumeration(values); + check_enumeration( + enmr, values, Datatype::STRING_ASCII, constants::var_num, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Basic Variable Size With Empty Value Enumeration Creation", + "[enumeration][basic][fixed]") { + std::vector values = {"foo", "bar", "", "bingo", "bango"}; + auto e = create_enumeration(values); + check_enumeration( + e, values, Datatype::STRING_ASCII, constants::var_num, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation with Ordered", + "[enumeration][basic][var-size][ordered]") { + std::vector values = {"foo", "bar", "baz", "bingo", "bango"}; + auto enmr = create_enumeration(values, true); + check_enumeration( + enmr, values, Datatype::STRING_ASCII, constants::var_num, true); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation with Datatype", + "[enumeration][basic][var-size][custom-datatype]") { + std::vector values = {"foo", "bar", "baz", "bingo", "bango"}; + auto enmr = create_enumeration(values, false, Datatype::STRING_UTF8); + check_enumeration( + enmr, values, Datatype::STRING_UTF8, constants::var_num, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation with Multi-Cell Val Num", + "[enumeration][basic][fixed][multi-cell-val-num]") { + std::vector values = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + auto enmr = Enumeration::create( + Datatype::INT32, + 2, + false, + values.data(), + values.size() * sizeof(int), + nullptr, + 0); + check_enumeration(enmr, values, Datatype::INT32, 2, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Invalid cell val num", + "[enumeration][error][invalid-cell-val-num]") { + std::vector values = {1, 2, 3}; + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, + 0, + false, + values.data(), + values.size() * sizeof(int), + nullptr, + 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - No data pointer", + "[enumeration][error][data-nullptr]") { 
+ std::vector values = {1, 2, 3}; + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, + 1, + false, + nullptr, + values.size() * sizeof(int), + nullptr, + 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Zero data size", + "[enumeration][error][data-zero-size]") { + std::vector values = {1, 2, 3}; + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, 1, false, values.data(), 0, nullptr, 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - No offsets pointer", + "[enumeration][error][offsets-nullptr]") { + auto data = "foobarbazbam"; + std::vector offsets = {0, 3, 6, 9}; + REQUIRE_THROWS(Enumeration::create( + Datatype::STRING_ASCII, + constants::var_num, + false, + data, + strlen(data), + nullptr, + offsets.size() * sizeof(uint64_t))); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - No offsets size", + "[enumeration][error][offsets-zero-size]") { + auto data = "foobarbazbam"; + std::vector offsets = {0, 3, 6, 9}; + REQUIRE_THROWS(Enumeration::create( + Datatype::STRING_ASCII, + constants::var_num, + false, + data, + strlen(data), + offsets.data(), + 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Offsets not required, pointer provided", + "[enumeration][error][offsets-not-required]") { + std::vector values = {0, 1, 2, 3, 4}; + std::vector offsets = {0, 3, 6, 9}; + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, + 1, + false, + values.data(), + values.size() * sizeof(int), + offsets.data(), + 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Offsets not required, size provided", + "[enumeration][error][offsets-not-required]") { + std::vector values = {0, 1, 2, 3, 4}; + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, + 1, + false, + values.data(), + values.size() * sizeof(int), + nullptr, + 100)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Invalid offsests size provided", + 
"[enumeration][error][offsets-invalid-size]") { + auto data = "foobarbazbam"; + std::vector offsets = {0, 3, 6, 9}; + // Passing 3 for the offsets size is incorrect because the offsets size has + // to be a multiple of `sizeof(uint64_t)` + REQUIRE_THROWS(Enumeration::create( + Datatype::STRING_ASCII, + constants::var_num, + false, + data, + strlen(data), + offsets.data(), + 3)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Offsets to data beyond provided data size", + "[enumeration][error][invalid-offset-data]") { + auto data = "foobarbazbam"; + std::vector offsets = {0, 3, 6, 100}; + // The last offset is larger than data_size + REQUIRE_THROWS(Enumeration::create( + Datatype::STRING_ASCII, + constants::var_num, + false, + data, + strlen(data), + offsets.data(), + offsets.size() * sizeof(uint64_t))); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Invalid data size", + "[enumeration][error][invalid-data-size]") { + std::vector values = {1, 2, 3, 4, 5}; + // Passing 3 for the data size is invalid as its not a multiple of + // sizeof(int) + REQUIRE_THROWS(Enumeration::create( + Datatype::INT32, 1, false, values.data(), 3, nullptr, 0)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Repeated fixed sized values", + "[enumeration][error][repeated-values]") { + std::vector values = {1, 2, 3, 3, 4, 5}; + REQUIRE_THROWS(create_enumeration(values)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Repeated variable sized values", + "[enumeration][error][repeated-values]") { + std::vector values = {"foo", "bar", "bang", "bar"}; + REQUIRE_THROWS(create_enumeration(values)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Creation Error - Repeated empty variable sized values", + "[enumeration][error][repeated-values]") { + std::vector values = {"foo", "", "bang", ""}; + REQUIRE_THROWS(create_enumeration(values)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration 
Serialization - Fixed Size", + "[enumeration][serialization][fixed-size]") { + std::vector values = {1, 2, 3, 4, 5}; + check_storage_serialization(values); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Serialization - Variable Size", + "[enumeration][serialization][var-size]") { + std::vector values = {"foo", "bar", "baz", "bam", "cap"}; + check_storage_serialization(values); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Serialization Error - Invalid version", + "[enumeration][serialization][error][invalid-version]") { + std::vector values = {"foo", "bar", "baz", "bam", "cap"}; + auto enmr = create_enumeration(values); + + SizeComputationSerializer size_serializer; + REQUIRE_THROWS(enmr->serialize(size_serializer, 0)); + REQUIRE_THROWS(enmr->serialize( + size_serializer, constants::enumerations_min_version - 1)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Deserialization - Fixed Size", + "[enumeration][deserialization][fixed-size]") { + std::vector values = {1, 2, 3, 4, 5}; + check_storage_deserialization(values); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Deserialization - Variable Size", + "[enumeration][deserialization][var-size]") { + std::vector values = {"foo", "bar", "baz", "bam", "cap"}; + check_storage_deserialization(values); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Deserialization Error - Invalid version", + "[enumeration][deserialization][error][invalid-version]") { + std::vector values = {"foo", "bar", "baz", "bam", "cap"}; + auto enmr = create_enumeration(values); + auto tile = serialize_to_tile(enmr); + + REQUIRE(tile.size() > 4); + auto data = tile.data(); + memset(data, 0, 4); + + Deserializer deserializer(tile.data(), tile.size()); + REQUIRE_THROWS(Enumeration::deserialize(deserializer)); +} + +TEST_CASE_METHOD( + EnumerationFx, "Enumeration Set Name", "[enumeration][set-name]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + enmr->set_name("foo"); 
+ REQUIRE(enmr->name() == "foo"); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Set Name Error - Empty name", + "[enumeration][set-name][error][empty-name]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + REQUIRE_THROWS(enmr->set_name("")); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration Set Name Error - Name with slash", + "[enumeration][set-name][error][name-with-slash]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + REQUIRE_THROWS(enmr->set_name("slashes/are/not/allowed/here")); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration index_of - Fixed Size", + "[enumeration][index-of][fixed-size]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + for (uint64_t i = 0; i < values.size(); i++) { + int tmp = values[i]; + UntypedDatumView udv(&tmp, sizeof(int)); + REQUIRE(enmr->index_of(udv) == i); + } +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration index_of - Fixed Size Missing", + "[enumeration][index-of][fixed-size]") { + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + int zero = 0; + UntypedDatumView udv_zero(&zero, sizeof(int)); + REQUIRE(enmr->index_of(udv_zero) == constants::enumeration_missing_value); +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration index_of - Variable Size", + "[enumeration][index-of][var-size]") { + std::vector values = {"foo", "bar", "baz", "bang", "ohai"}; + auto enmr = create_enumeration(values); + + for (uint64_t i = 0; i < values.size(); i++) { + UntypedDatumView udv(values[i].data(), values[i].size()); + REQUIRE(enmr->index_of(udv) == i); + } +} + +TEST_CASE_METHOD( + EnumerationFx, + "Enumeration index_of - Variable Size Missing", + "[enumeration][index-of][var-size]") { + std::vector values = {"foo", "bar", "baz", "bang", "ohai"}; + auto enmr = create_enumeration(values); + + UntypedDatumView udv_empty("", 0); + REQUIRE(enmr->index_of(udv_empty) == 
constants::enumeration_missing_value); +} + +/* ********************************* */ +/* Testing ArraySchema */ +/* ********************************* */ + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Add Attribute - Attribute nullptr Error", + "[enumeration][array-schema][error]") { + auto schema = make_shared(HERE()); + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + REQUIRE_THROWS(schema->add_attribute(nullptr, enmr)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Add Attribute - Enumeration nullptr Error", + "[enumeration][array-schema][error]") { + auto schema = make_shared(HERE()); + auto attr = make_shared(HERE(), "bar", Datatype::INT16); + + REQUIRE_THROWS(schema->add_attribute(attr, nullptr)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Add Basic Enumeration", + "[enumeration][array-schema][basic]") { + auto schema = make_shared(HERE()); + auto attr = make_shared(HERE(), "foo", Datatype::INT8); + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + CHECK_NOTHROW(schema->add_attribute(attr, enmr)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Get Enumeration", + "[enumeration][array-schema][get]") { + auto schema = make_shared(HERE(), ArrayType::DENSE); + auto attr = make_shared(HERE(), "an_attr", Datatype::INT32); + + REQUIRE(attr->has_enumeration() == false); + REQUIRE(attr->get_enumeration_name() == ""); + + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + + schema->add_attribute(attr, enmr); + + REQUIRE(attr->has_enumeration() == true); + REQUIRE(attr->get_enumeration_name() != ""); + + auto enmr2 = schema->enumeration(attr->get_enumeration_name()); + check_enumeration(enmr2, values, Datatype::INT32, 1, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Get Missing Enumeration", + "[enumeration][array-schema][error]") { + auto schema = make_shared(HERE(), ArrayType::SPARSE); + 
REQUIRE_THROWS(schema->enumeration("foo")); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Get All Enumerations Empty", + "[enumeration][array-schema][get-all][empty]") { + auto schema = make_shared(HERE(), ArrayType::DENSE); + auto enumerations = schema->enumerations(); + REQUIRE(enumerations.size() == 0); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Get All Enumerations", + "[enumeration][array-schema][get-all]") { + auto schema = make_shared(HERE(), ArrayType::DENSE); + auto attr = make_shared(HERE(), "an_attr", Datatype::INT32); + std::vector values = {1.0, 1.1, 1.2, 1.3, 1.4}; + auto enmr = create_enumeration(values); + schema->add_attribute(attr, enmr); + + auto enumerations = schema->enumerations(); + REQUIRE(enumerations.size() == 1); + REQUIRE(enumerations[0] == enmr); + check_enumeration(enumerations[0], values, Datatype::FLOAT32, 1, false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Set Enumeration Error", + "[enumeration][array-schema][error][set-unknown-enumeration]") { + auto schema = make_shared(HERE(), ArrayType::DENSE); + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + REQUIRE_THROWS(schema->set_enumeration("foo", enmr)); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Set Enumeration", + "[enumeration][array-schema]") { + auto schema = make_shared(HERE(), ArrayType::DENSE); + auto attr = make_shared(HERE(), "ohai", Datatype::INT32); + std::vector values = {0, 1, 2, 100000000}; + auto enmr = create_enumeration(values); + schema->add_attribute(attr, enmr); + + // This probably looks funky due to this being test code. In real life this + // call is made when we dynamically load enumerations at query time after + // the array schema was deserialized. 
+ schema->set_enumeration(attr->get_enumeration_name(), enmr); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Has Enumeration Missing Attribute", + "[enumeration][array-schema][has-enumeration]") { + auto schema = make_shared(HERE(), ArrayType::SPARSE); + REQUIRE(schema->has_enumeration("foo") == false); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Has Enumeration", + "[enumeration][array-schema][has-enumeration]") { + auto schema = make_shared(HERE(), ArrayType::SPARSE); + auto attr = make_shared(HERE(), "ohai", Datatype::INT64); + std::vector values = {"a", "spot", "of", "tea", "perhaps?"}; + auto enmr = create_enumeration(values); + + REQUIRE(schema->has_enumeration("ohai") == false); + schema->add_attribute(attr, enmr); + REQUIRE(schema->has_enumeration("ohai") == true); +} + +TEST_CASE_METHOD( + EnumerationFx, + "ArraySchema - Schema Copy Constructor", + "[enumeration][array-schema][copy-ctor]") { + auto schema = make_shared(HERE(), ArrayType::SPARSE); + + auto dim = make_shared(HERE(), "d1", Datatype::INT32); + int range[2] = {0, 1000}; + throw_if_not_ok(dim->set_domain(range)); + + auto dom = make_shared(HERE()); + throw_if_not_ok(dom->add_dimension(dim)); + throw_if_not_ok(schema->set_domain(dom)); + + auto attr = make_shared(HERE(), "an_attr", Datatype::INT32); + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + schema->add_attribute(attr, enmr); + + CHECK_NOTHROW(schema->check()); + CHECK_NOTHROW(make_shared(HERE(), *(schema.get()))); +} + +/* ********************************* */ +/* Testing ArrayDirectory */ +/* ********************************* */ + +TEST_CASE_METHOD(EnumerationFx, "ArrayDirectory - Load Enumeration", "[enumeration][array-directory][load-enumeration]") { + create_array(); + + auto schema = load_array_schema_latest(); + auto ad = array_directory(); + auto enmr_name = schema->attribute("attr1")->get_enumeration_name(); + REQUIRE(enmr_name != ""); + + auto enmr = 
ad->load_enumeration(schema, enmr_name, enc_key_); + std::vector values = {1, 2, 3, 4, 5}; + check_enumeration(enmr, values, Datatype::INT32, 1, false); +} + +TEST_CASE_METHOD(EnumerationFx, "ArrayDirectory - Load Enumeration Error", "[enumeration][array-directory][error]") { + create_array(); + + auto schema = load_array_schema_latest(); + auto ad = array_directory(); + auto enmr_name = schema->attribute("attr1")->get_enumeration_name(); + REQUIRE(enmr_name != ""); + + auto enmr = ad->load_enumeration(schema, enmr_name, enc_key_); + std::vector values = {1, 2, 3, 4, 5}; + check_enumeration(enmr, values, Datatype::INT32, 1, false); +} + +/* ********************************* */ +/* TEST SUPPORT CODE */ +/* ********************************* */ + +struct TypeParams { + TypeParams(Datatype type, uint32_t cell_val_num) + : type_(type) + , cell_val_num_(cell_val_num) { + } + + template + static TypeParams get(const std::vector>&) { + return TypeParams(Datatype::STRING_ASCII, constants::var_num); + } + + static TypeParams get(const std::vector&) { + return TypeParams(Datatype::INT32, 1); + } + + static TypeParams get(const std::vector&) { + return TypeParams(Datatype::UINT32, 1); + } + + static TypeParams get(const std::vector&) { + return TypeParams(Datatype::FLOAT32, 1); + } + + Datatype type_; + uint32_t cell_val_num_; +}; + +EnumerationFx::EnumerationFx() + : uri_("enumeration_test_array") + , ctx_(cfg_) { + rm_array(); + throw_if_not_ok(enc_key_.set_key(EncryptionType::NO_ENCRYPTION, nullptr, 0)); +} + +EnumerationFx::~EnumerationFx() { + rm_array(); +} + +template +shared_ptr EnumerationFx::create_enumeration( + const std::vector& values, bool ordered, Datatype type) { + TypeParams tp = TypeParams::get(values); + + if (type != static_cast(255)) { + tp.type_ = type; + } + + if constexpr (std::is_pod_v) { + return Enumeration::create( + tp.type_, + tp.cell_val_num_, + ordered, + values.data(), + values.size() * sizeof(T), + nullptr, + 0); + } else { + uint64_t 
total_size = 0; + for (auto v : values) { + total_size += v.size(); + } + + uint8_t data[total_size]; + std::vector offsets; + offsets.reserve(values.size()); + uint64_t curr_offset = 0; + + for (auto v : values) { + std::memcpy(data + curr_offset, v.data(), v.size()); + offsets.push_back(curr_offset); + curr_offset += v.size(); + } + + return Enumeration::create( + tp.type_, + tp.cell_val_num_, + ordered, + data, + total_size, + offsets.data(), + offsets.size() * sizeof(uint64_t)); + } +} + +template +void EnumerationFx::check_enumeration( + shared_ptr enmr, + const std::vector& values, + Datatype data_type, + uint32_t cell_val_num, + bool ordered) { + REQUIRE(enmr->type() == data_type); + REQUIRE(enmr->cell_val_num() == cell_val_num); + REQUIRE(enmr->ordered() == ordered); + + std::vector data = as_vector(enmr); + REQUIRE(data == values); +} + +template +void EnumerationFx::check_storage_serialization(const std::vector& values) { + auto enmr = create_enumeration(values); + auto tile = serialize_to_tile(enmr); + REQUIRE(tile.size() == calculate_serialized_size(enmr)); +} + +template +void EnumerationFx::check_storage_deserialization( + const std::vector& values) { + auto enmr = create_enumeration(values); + auto tile = serialize_to_tile(enmr); + + Deserializer deserializer(tile.data(), tile.size()); + auto deserialized = Enumeration::deserialize(deserializer); + + REQUIRE(deserialized->type() == enmr->type()); + REQUIRE(deserialized->cell_val_num() == enmr->cell_val_num()); + REQUIRE(deserialized->ordered() == enmr->ordered()); + REQUIRE(deserialized->cell_size() == enmr->cell_size()); + REQUIRE(deserialized->var_size() == enmr->var_size()); + + auto [orig_data, orig_data_size] = enmr->data(); + auto [des_data, des_data_size] = deserialized->data(); + REQUIRE(des_data_size == orig_data_size); + REQUIRE(memcmp(des_data, orig_data, orig_data_size) == 0); + + if (enmr->var_size()) { + auto [orig_offsets, orig_offsets_size] = enmr->offsets(); + auto [des_offsets, 
des_offsets_size] = deserialized->offsets(); + REQUIRE(des_offsets_size == orig_offsets_size); + REQUIRE(memcmp(des_offsets, orig_offsets, orig_offsets_size) == 0); + } +} + +storage_size_t EnumerationFx::calculate_serialized_size( + shared_ptr enmr) { + storage_size_t num_bytes = 0; + + // Size is the sum of the following sizes: + + // uint32_t - version + num_bytes += sizeof(uint32_t); + + // uint8_t - data type + num_bytes += sizeof(uint8_t); + + // uint32_t - cell_val_num + num_bytes += sizeof(uint32_t); + + // bool - ordered + num_bytes += sizeof(bool); + + // uint64_t - data.size() + // data.size() bytes + auto [data, data_size] = enmr->data(); + num_bytes += sizeof(uint64_t); + num_bytes += data_size; + + // if var_sized: + if (enmr->var_size()) { + auto [offsets, offsets_size] = enmr->offsets(); + num_bytes += sizeof(uint64_t); + num_bytes += offsets_size; + } + + return num_bytes; +} + +WriterTile EnumerationFx::serialize_to_tile(shared_ptr enmr) { + SizeComputationSerializer size_serializer; + enmr->serialize(size_serializer, constants::format_version); + + WriterTile tile{WriterTile::from_generic(size_serializer.size())}; + Serializer serializer(tile.data(), tile.size()); + enmr->serialize(serializer, constants::format_version); + + return tile; +} + +template +std::vector EnumerationFx::as_vector(shared_ptr enmr) { + std::vector ret; + + if constexpr (std::is_pod_v) { + auto [data, data_size] = enmr->data(); + + const T* elems = static_cast(data); + size_t count = data_size / sizeof(T); + + ret.reserve(count); + for (size_t i = 0; i < count; i++) { + ret.push_back(elems[i]); + } + } else { + auto [data, data_size] = enmr->data(); + auto [offsets, offsets_size] = enmr->offsets(); + + auto str_data = static_cast(data); + auto elems = static_cast(offsets); + size_t count = offsets_size / sizeof(uint64_t); + + ret.reserve(count); + for (size_t i = 0; i < count; i++) { + uint64_t len; + if (i + 1 < count) { + len = elems[i + 1] - elems[i]; + } else { + len = 
data_size - elems[i]; + } + ret.emplace_back(str_data + elems[i], len); + } + } + + return ret; +} + +void EnumerationFx::create_array() { + auto schema = make_shared(HERE(), ArrayType::SPARSE); + + auto dim = make_shared(HERE(), "dim1", Datatype::INT32); + int range[2] = {0, 1000}; + throw_if_not_ok(dim->set_domain(range)); + + auto dom = make_shared(HERE()); + throw_if_not_ok(dom->add_dimension(dim)); + throw_if_not_ok(schema->set_domain(dom)); + + auto attr1 = make_shared(HERE(), "attr1", Datatype::INT32); + std::vector values = {1, 2, 3, 4, 5}; + auto enmr = create_enumeration(values); + schema->add_attribute(attr1, enmr); + + auto attr2 = make_shared(HERE(), "attr2", Datatype::STRING_ASCII); + throw_if_not_ok(schema->add_attribute(attr2)); + + throw_if_not_ok( + ctx_.storage_manager()->array_create(uri_, schema, enc_key_)); +} + +shared_ptr EnumerationFx::array_directory() { + return make_shared(HERE(), ctx_.resources(), uri_, 0, UINT64_MAX, ArrayDirectoryMode::READ); +} + +shared_ptr EnumerationFx::load_array_schema_latest() { + return array_directory()->load_array_schema_latest(enc_key_); +} + +void EnumerationFx::rm_array() { + bool is_dir; + throw_if_not_ok(ctx_.resources().vfs().is_dir(uri_, &is_dir)); + if (is_dir) { + throw_if_not_ok(ctx_.resources().vfs().remove_dir(uri_)); + } +} diff --git a/test/support/src/array_schema_params_generator.h b/test/support/src/array_schema_params_generator.h new file mode 100644 index 000000000000..689a76d1eb9e --- /dev/null +++ b/test/support/src/array_schema_params_generator.h @@ -0,0 +1,176 @@ +/** + * @file array_schema_params_generator.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares some test suite helper functions specific to generating + * array schema params structures. 
+ */ + +#ifndef TILEDB_ARRAY_SCHEMA_PARAMS_GENERATOR_H +#define TILEDB_ARRAY_SCHEMA_PARAMS_GENERATOR_H + +#include + +#include "test/support/tdb_catch.h" +#include "tiledb/sm/cpp_api/tiledb" +#include "tiledb/sm/cpp_api/tiledb_experimental" + +struct ArraySchemaParams { + ArraySchemaParams( + tiledb_array_type_t array_type, + tiledb_encryption_type_t enc_type, + tiledb_layout_t tile_order, + tiledb_layout_t cell_order, + uint64_t capacity, + bool allow_dups) + : array_type_(array_type) + , enc_type_(enc_type) + , tile_order_(tile_order) + , cell_order_(cell_order) + , capacity_(capacity) + , allow_dups_(allow_dups) { + } + + bool is_valid() { + // Allow dups is invalid for dense arrays + if (array_type_ == TILEDB_DENSE && allow_dups_) { + return false; + } + + return true; + } + + std::string stringify() const { + std::stringstream ss; + + ss << "ArraySchemaParams(" << array_type_ << ", " << enc_type_ << ", " + << tile_order_ << ", " << cell_order_ << ", " << capacity_ << ", " + << allow_dups_ << ")"; + + return ss.str(); + } + + tiledb_array_type_t array_type_; + tiledb_encryption_type_t enc_type_; + tiledb_layout_t tile_order_; + tiledb_layout_t cell_order_; + uint64_t capacity_; + bool allow_dups_; +}; + +inline std::ostream& operator<<( + std::ostream& os, const ArraySchemaParams& params) { + os << params.stringify(); + return os; +} + +namespace { + +class ArraySchemaParamsGenerator + : public Catch::Generators::IGenerator { + public: + ArraySchemaParamsGenerator( + std::vector array_types = + {TILEDB_DENSE, TILEDB_SPARSE}, + std::vector tile_orders = + {TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR, TILEDB_HILBERT}, + std::vector cell_orders = + {TILEDB_ROW_MAJOR, TILEDB_COL_MAJOR}, + std::vector capacities = {10, 100, 10000}, + std::vector allow_dups_vals = {true, false}, + std::vector enc_types = { + TILEDB_NO_ENCRYPTION, TILEDB_AES_256_GCM}); + + bool next() override; + ArraySchemaParams const& get() const override; + std::string stringifyImpl() const override; + + 
private: + std::vector params_; + std::vector::iterator params_iter_; +}; + +ArraySchemaParamsGenerator::ArraySchemaParamsGenerator( + std::vector array_types, + std::vector tile_orders, + std::vector cell_orders, + std::vector capacities, + std::vector allow_dups_vals, + std::vector enc_types) { + for (auto atype : array_types) { + for (auto etype : enc_types) { + for (auto t_order : tile_orders) { + for (auto c_order : cell_orders) { + for (auto cap : capacities) { + for (auto dups : allow_dups_vals) { + params_.emplace_back(atype, etype, t_order, c_order, cap, dups); + if (!params_.back().is_valid()) { + params_.pop_back(); + } + } + } + } + } + } + } + + params_iter_ = params_.begin(); +} + +bool ArraySchemaParamsGenerator::next() { + // Check if we're already at the end + if (params_iter_ == params_.end()) { + return false; + } + + ++params_iter_; + + // Check if we just arrived at the end. + if (params_iter_ == params_.end()) { + return false; + } + + return true; +} + +ArraySchemaParams const& ArraySchemaParamsGenerator::get() const { + return *params_iter_; +} + +std::string ArraySchemaParamsGenerator::stringifyImpl() const { + return "ArraySchemaParamsGenerator<" + params_iter_->stringify() + ">"; +} + +Catch::Generators::GeneratorWrapper array_schema_params() { + return Catch::Generators::GeneratorWrapper( + new ArraySchemaParamsGenerator()); +} + +} // namespace + +#endif // Included array_schema_params_generator.h diff --git a/tiledb/CMakeLists.txt b/tiledb/CMakeLists.txt index 37a9fdaab9f5..5226f5881b69 100644 --- a/tiledb/CMakeLists.txt +++ b/tiledb/CMakeLists.txt @@ -84,10 +84,12 @@ if (TILEDB_CPP_API) ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/tiledb_experimental ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array_deprecated.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array.h + ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array_schema.h 
${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array_schema_evolution.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/array_schema_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/attribute.h + ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/attribute_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/config.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/consolidation_plan_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/context.h @@ -96,6 +98,7 @@ if (TILEDB_CPP_API) ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/dimension.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/dimension_label_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/domain.h + ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/enumeration_experimental.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/error.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/exception.h ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/cpp_api/filter.h @@ -147,6 +150,7 @@ set(TILEDB_CORE_SOURCES ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/array_schema/dimension.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/array_schema/dimension_label.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/array_schema/domain.cc + ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/array_schema/enumeration.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/buffer/buffer.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/buffer/buffer_list.cc ${TILEDB_CORE_INCLUDE_DIR}/tiledb/sm/c_api/api_argument_validator.cc diff --git a/tiledb/api/c_api/CMakeLists.txt b/tiledb/api/c_api/CMakeLists.txt index 655bc3402249..a1d2cdf7e30d 100644 --- a/tiledb/api/c_api/CMakeLists.txt +++ b/tiledb/api/c_api/CMakeLists.txt @@ -76,6 +76,9 @@ add_subdirectory(data_order) # `dimension_label`: depends on `context`, `datatype`, `data_order` add_subdirectory(dimension_label) +# `enumeration`: depends on `buffer`, `constants`, and `context` +add_subdirectory(enumeration) + # `filesystem`: no dependencies add_subdirectory(filesystem) diff --git a/tiledb/api/c_api/dimension_label/CMakeLists.txt 
b/tiledb/api/c_api/dimension_label/CMakeLists.txt index 9cfe7b01e707..e8f9d0d214ba 100644 --- a/tiledb/api/c_api/dimension_label/CMakeLists.txt +++ b/tiledb/api/c_api/dimension_label/CMakeLists.txt @@ -46,6 +46,7 @@ target_link_libraries(capi_dimension_label_stub PUBLIC array_schema $(type); + + bool is_ordered = false; + if (ordered != 0) { + is_ordered = true; + } + + try { + *enumeration = tiledb_enumeration_handle_t::make_handle( + datatype, + cell_val_num, + is_ordered, + data, + data_size, + offsets, + offsets_size); + } catch (...) { + *enumeration = nullptr; + throw; + } + + // Success + return TILEDB_OK; +} + +void tiledb_enumeration_free(tiledb_enumeration_t** enumeration) { + ensure_output_pointer_is_valid(enumeration); + ensure_enumeration_is_valid(*enumeration); + tiledb_enumeration_handle_t::break_handle(*enumeration); +} + +capi_return_t tiledb_enumeration_get_type( + tiledb_enumeration_t* enumeration, tiledb_datatype_t* type) { + ensure_enumeration_is_valid(enumeration); + ensure_output_pointer_is_valid(type); + *type = static_cast(enumeration->type()); + return TILEDB_OK; +} + +capi_return_t tiledb_enumeration_get_cell_val_num( + tiledb_enumeration_t* enumeration, uint32_t* cell_val_num) { + ensure_enumeration_is_valid(enumeration); + ensure_output_pointer_is_valid(cell_val_num); + *cell_val_num = enumeration->cell_val_num(); + return TILEDB_OK; +} + +capi_return_t tiledb_enumeration_get_ordered( + tiledb_enumeration_t* enumeration, int* ordered) { + ensure_enumeration_is_valid(enumeration); + ensure_output_pointer_is_valid(ordered); + *ordered = static_cast(enumeration->ordered()); + return TILEDB_OK; +} + +capi_return_t tiledb_enumeration_get_data( + tiledb_enumeration_t* enumeration, const void** data, uint64_t* data_size) { + ensure_enumeration_is_valid(enumeration); + ensure_output_pointer_is_valid(data); + ensure_output_pointer_is_valid(data_size); + auto [d, ds] = enumeration->data(); + *data = d; + *data_size = ds; + return TILEDB_OK; +} + 
+capi_return_t tiledb_enumeration_get_offsets( + tiledb_enumeration_t* enumeration, + const void** offsets, + uint64_t* offsets_size) { + ensure_enumeration_is_valid(enumeration); + ensure_output_pointer_is_valid(offsets); + ensure_output_pointer_is_valid(offsets_size); + auto [o, os] = enumeration->offsets(); + *offsets = o; + *offsets_size = os; + return TILEDB_OK; +} + +} // namespace tiledb::api + +using tiledb::api::api_entry_context; +using tiledb::api::api_entry_void; + +capi_return_t tiledb_enumeration_alloc( + tiledb_ctx_t* ctx, + tiledb_datatype_t type, + uint32_t cell_val_num, + int ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size, + tiledb_enumeration_t** enumeration) noexcept { + return api_entry_context( + ctx, + type, + cell_val_num, + ordered, + data, + data_size, + offsets, + offsets_size, + enumeration); +} + +void tiledb_enumeration_free(tiledb_enumeration_t** enumeration) noexcept { + return api_entry_void(enumeration); +} + +capi_return_t tiledb_enumeration_get_type( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + tiledb_datatype_t* type) noexcept { + return api_entry_context( + ctx, enumeration, type); +} + +capi_return_t tiledb_enumeration_get_cell_val_num( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + uint32_t* cell_val_num) noexcept { + return api_entry_context( + ctx, enumeration, cell_val_num); +} + +capi_return_t tiledb_enumeration_get_ordered( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + int* ordered) noexcept { + return api_entry_context( + ctx, enumeration, ordered); +} + +capi_return_t tiledb_enumeration_get_data( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + const void** data, + uint64_t* data_size) noexcept { + return api_entry_context( + ctx, enumeration, data, data_size); +} + +capi_return_t tiledb_enumeration_get_offsets( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + const void** offsets, + uint64_t* 
offsets_size) noexcept { + return api_entry_context( + ctx, enumeration, offsets, offsets_size); +} diff --git a/tiledb/api/c_api/enumeration/enumeration_api_experimental.h b/tiledb/api/c_api/enumeration/enumeration_api_experimental.h new file mode 100644 index 000000000000..642cd4cf2489 --- /dev/null +++ b/tiledb/api/c_api/enumeration/enumeration_api_experimental.h @@ -0,0 +1,230 @@ +/** + * @file tiledb/api/c_api/enumeration/enumeration_api_experimental.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares the enumeration section of the C API for TileDB. 
+ */ + +#ifndef TILEDB_CAPI_ENUMERATION_EXPERIMENTAL_H +#define TILEDB_CAPI_ENUMERATION_EXPERIMENTAL_H + +#include "../api_external_common.h" +#include "../context/context_api_external.h" +#include "../datatype/datatype_api_external.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** A TileDB enumeration. */ +typedef struct tiledb_enumeration_handle_t tiledb_enumeration_t; + +/** + * Creates an Enumeration. + * + * **Example:** + * + * @code{.c} + * tiledb_enumeration_t* enumeration; + * void* data = get_data(); + * uint64_t data_size = get_data_size(); + * tiledb_enumeration_alloc( + * ctx, + * TILEDB_INT64, + * cell_val_num, + * FALSE, + * data, + * data_size, + * NULL, + * 0, + * &enumeration); + * @endcode + * + * @param ctx The TileDB context. + * @param type The enumeration type. + * @param cell_val_num The number of values per enumeration value + * @param ordered Whether this enumeration should be considered as ordered + * @param data A pointer to the enumeration value data + * @param data_size The length of the data buffer provided + * @param offsets A pointer to the offsets buffer if cell_val_num = UINT32_MAX + * @param offsets_size The length of the offsets buffer, zero if no offsets + * @param enumeration The newly allocated enumeration + * @return `TILEDB_OK` for success and `TILEDB_ERR` for error. + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_alloc( + tiledb_ctx_t* ctx, + tiledb_datatype_t type, + uint32_t cell_val_num, + int ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size, + tiledb_enumeration_t** enumeration) TILEDB_NOEXCEPT; + +/** + * Destroys a TileDB enumeration, freeing associated memory. + * + * **Example:** + * + * @code{.c} + * tiledb_enumeration_free(&enumeration); + * @endcode + * + * @param enumeration The enumeration to be destroyed.
+ */ +TILEDB_EXPORT void tiledb_enumeration_free(tiledb_enumeration_t** enumeration) + TILEDB_NOEXCEPT; + +/** + * Return the datatype of the enumeration values + * + * **Example:** + * + * @code{.c} + * tiledb_datatype_t type; + * tiledb_enumeration_get_type(ctx, enumeration, &type); + * @endcode + * + * @param ctx The TileDB context. + * @param enumeration The enumeration + * @param type The data type of the enumeration + * @return `TILEDB_OK` or `TILEDB_ERR` + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_get_type( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + tiledb_datatype_t* type) TILEDB_NOEXCEPT; + +/** + * Return the cell value number of the enumeration values + * + * **Example:** + * + * @code{.c} + * uint32_t cell_val_num; + * tiledb_enumeration_get_cell_val_num(ctx, enumeration, &cell_val_num); + * @endcode + * + * @param ctx The TileDB context. + * @param enumeration The enumeration + * @param cell_val_num The cell value number of the enumeration + * @return `TILEDB_OK` or `TILEDB_ERR` + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_get_cell_val_num( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + uint32_t* cell_val_num) TILEDB_NOEXCEPT; + +/** + * Return whether the enumeration values should be considered ordered. + * + * **Example:** + * + * @code{.c} + * int ordered; + * tiledb_enumeration_get_ordered(ctx, enumeration, &ordered); + * @endcode + * + * The cell values are considered ordered if the value stored in `ordered` + * is non-zero after `TILEDB_OK` is returned. I.e., this is standard `int` as `bool` behavior. + * + * @param ctx The TileDB context.
+ * @param enumeration The enumeration + * @param ordered A boolean value indicating whether the values are ordered + * @return `TILEDB_OK` or `TILEDB_ERR` + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_get_ordered( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + int* ordered) TILEDB_NOEXCEPT; + +/** + * Return a pointer to and size of the enumeration's underlying value data + * + * **Example:** + * + * @code{.c} + * const void* data = NULL; + * uint64_t data_size = 0; + * tiledb_enumeration_get_data(ctx, enumeration, &data, &data_size); + * @endcode + * + * The pointer returned from this function references internal data managed + * by the enumeration. As such, clients should not attempt to free it, or + * access it beyond the lifetime of the enumeration instance. + * + * @param ctx The TileDB context. + * @param enumeration The enumeration + * @param data The returned pointer to this enumeration value buffer + * @param data_size The length of the buffer pointed to by data + * @return `TILEDB_OK` or `TILEDB_ERR` + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_get_data( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + const void** data, + uint64_t* data_size) TILEDB_NOEXCEPT; + +/** + * Return a pointer to and size of the enumeration's underlying value offsets + * + * **Example:** + * + * @code{.c} + * const void* offsets = NULL; + * uint64_t offsets_size = 0; + * tiledb_enumeration_get_offsets(ctx, enumeration, &offsets, &offsets_size); + * @endcode + * + * The pointer returned from this function references internal data managed + * by the enumeration. As such, clients should not attempt to free it, or + * access it beyond the lifetime of the enumeration instance. + * + * If the enumeration values are var sized (i.e., cell_val_num is UINT32_MAX) + * the offsets buffer will contain a `uint64_t` value for the starting offset + * of each underlying enumeration value.
Note that the number of offsets is + * calculated as `offsets_size / sizeof(uint64_t)`. + * + * @param ctx The TileDB context. + * @param enumeration The enumeration + * @param offsets The returned pointer to this enumeration offsets buffer + * @param offsets_size The length of the buffer pointed to by offsets + * @return `TILEDB_OK` or `TILEDB_ERR` + */ +TILEDB_EXPORT capi_return_t tiledb_enumeration_get_offsets( + tiledb_ctx_t* ctx, + tiledb_enumeration_t* enumeration, + const void** offsets, + uint64_t* offsets_size) TILEDB_NOEXCEPT; + +#ifdef __cplusplus +} +#endif + +#endif // TILEDB_CAPI_ENUMERATION_EXPERIMENTAL_H diff --git a/tiledb/api/c_api/enumeration/enumeration_api_internal.h b/tiledb/api/c_api/enumeration/enumeration_api_internal.h new file mode 100644 index 000000000000..c484d13ce5c2 --- /dev/null +++ b/tiledb/api/c_api/enumeration/enumeration_api_internal.h @@ -0,0 +1,124 @@ +/** + * @file tiledb/api/c_api/enumeration/enumeration_api_internal.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares the enumeration section of the C API for TileDB. + */ + +#ifndef TILEDB_CAPI_ENUMERATION_INTERNAL_H +#define TILEDB_CAPI_ENUMERATION_INTERNAL_H + +#include "enumeration_api_experimental.h" +#include "tiledb/api/c_api_support/handle/handle.h" +#include "tiledb/common/common.h" +#include "tiledb/sm/array_schema/enumeration.h" + +/** + * Handle `struct` for API enumeration objects. + */ +struct tiledb_enumeration_handle_t + : public tiledb::api::CAPIHandle { + private: + shared_ptr enumeration_; + + public: + /** + * Type name + */ + static constexpr std::string_view object_type_name{"enumeration"}; + + template + tiledb_enumeration_handle_t(Arg&&... arg) + : enumeration_( + tiledb::sm::Enumeration::create(std::forward(arg)...)) { + } + + /** + * Constructor from `shared_ptr` copies the shared pointer. + */ + explicit tiledb_enumeration_handle_t(shared_ptr& e) + : enumeration_(e) { + } + + /** + * Copy the underlying enumeration object. + */ + [[nodiscard]] shared_ptr copy() { + return enumeration_; + } + + /** + * Return the data type of the enumeration values. + */ + [[nodiscard]] inline tiledb_datatype_t type() const { + return static_cast(enumeration_->type()); + } + + /** + * Return the cell_val_num of the enumeration values + */ + [[nodiscard]] inline uint32_t cell_val_num() const { + return enumeration_->cell_val_num(); + } + + /** + * Return a bool indicating whether the enumeration values are ordered. + */ + [[nodiscard]] inline bool ordered() const { + return enumeration_->ordered(); + } + + /** + * Return a pointer and size pair for the underlying data buffer.
+ */ + [[nodiscard]] inline const tuple data() const { + return enumeration_->data(); + } + + /** + * Return a pointer and size tuple for the underlying offsets buffer. + */ + [[nodiscard]] inline const tuple offsets() const { + return enumeration_->offsets(); + } +}; + +namespace tiledb::api { + +/** + * Returns after successfully validating an enumeration handle. Throws otherwise. + * + * @param e A possibly-valid enumeration handle + */ +inline void ensure_enumeration_is_valid(const tiledb_enumeration_handle_t* e) { + ensure_handle_is_valid(e); +} + +} // namespace tiledb::api + +#endif // TILEDB_CAPI_ENUMERATION_INTERNAL_H diff --git a/tiledb/api/c_api/enumeration/test/CMakeLists.txt b/tiledb/api/c_api/enumeration/test/CMakeLists.txt new file mode 100644 index 000000000000..7ace61d484d8 --- /dev/null +++ b/tiledb/api/c_api/enumeration/test/CMakeLists.txt @@ -0,0 +1,32 @@ +# +# tiledb/api/c_api/enumeration/test/CMakeLists.txt +# +# The MIT License +# +# Copyright (c) 2023 TileDB, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# + +include(unit_test) + +commence(unit_test capi_enumeration) + this_target_sources(unit_capi_enumeration.cc) + this_target_object_libraries(capi_enumeration_stub) +conclude(unit_test) diff --git a/tiledb/api/c_api/enumeration/test/compile_capi_enumeration_stub_main.cc b/tiledb/api/c_api/enumeration/test/compile_capi_enumeration_stub_main.cc new file mode 100644 index 000000000000..6ffbd2a494ee --- /dev/null +++ b/tiledb/api/c_api/enumeration/test/compile_capi_enumeration_stub_main.cc @@ -0,0 +1,36 @@ +/** + * @file tiledb/api/c_api/enumeration/test/compile_capi_enumeration_main.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "../enumeration_api_internal.h" +#include "tiledb/sm/enums/datatype.h" + +int main() { + tiledb_enumeration_handle_t e{ + tiledb::sm::Datatype::INT32, 1, 0, nullptr, 0, nullptr, 0}; + return 0; +} diff --git a/tiledb/api/c_api/enumeration/test/unit_capi_enumeration.cc b/tiledb/api/c_api/enumeration/test/unit_capi_enumeration.cc new file mode 100644 index 000000000000..65578aa71650 --- /dev/null +++ b/tiledb/api/c_api/enumeration/test/unit_capi_enumeration.cc @@ -0,0 +1,430 @@ +/** + * @file tiledb/api/c_api/enumeration/test/unit_capi_enumeration.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + */ + +#define CATCH_CONFIG_MAIN +#include +#include "../../../c_api_test_support/testsupport_capi_context.h" +#include "../enumeration_api_experimental.h" +using namespace tiledb::api::test_support; + +struct FixedSizeEnumeration { + FixedSizeEnumeration() { + uint32_t values[5] = {1, 2, 3, 4, 5}; + auto rc = tiledb_enumeration_alloc( + ctx_.context, + TILEDB_UINT32, + 1, + 0, + values, + sizeof(uint32_t) * 5, + nullptr, + 0, + &enumeration_); + REQUIRE(rc == TILEDB_OK); + } + + ~FixedSizeEnumeration() { + tiledb_enumeration_free(&enumeration_); + } + + ordinary_context ctx_; + tiledb_enumeration_t* enumeration_; +}; + +struct VarSizeEnumeration { + VarSizeEnumeration() { + const char* values = "foobarbazbingobango"; + uint64_t offsets[5] = {0, 3, 6, 9, 14}; + auto rc = tiledb_enumeration_alloc( + ctx_.context, + TILEDB_STRING_UTF8, + UINT32_MAX, + 0, + values, + strlen(values), + offsets, + 5 * sizeof(uint64_t), + &enumeration_); + REQUIRE(rc == TILEDB_OK); + } + + ~VarSizeEnumeration() { + tiledb_enumeration_free(&enumeration_); + } + + ordinary_context ctx_; + tiledb_enumeration_t* enumeration_; +}; + +TEST_CASE( + "C API: tiledb_enumeration_alloc argument validation", + "[capi][enumeration]") { + ordinary_context ctx{}; + tiledb_enumeration_t* enumeration; + + int32_t values[5] = {1, 2, 3, 4, 5}; + const char* data = "foobarbazbingobango"; + uint64_t offsets[5] = {0, 3, 6, 9, 14}; + + SECTION("success - fixed size") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_UINT32, + 1, + 0, + values, + sizeof(uint32_t) * 5, + nullptr, + 0, + &enumeration); + REQUIRE(rc == TILEDB_OK); + tiledb_enumeration_free(&enumeration); + } + + SECTION("success - var 
size") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_STRING_ASCII, + UINT32_MAX, + 0, + (void*)data, + strlen(data), + offsets, + sizeof(uint64_t) * 5, + &enumeration); + REQUIRE(rc == TILEDB_OK); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - null context") { + auto rc = tiledb_enumeration_alloc( + nullptr, + TILEDB_UINT32, + 1, + 0, + values, + sizeof(uint32_t) * 5, + nullptr, + 0, + &enumeration); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - invalid datatype") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + (tiledb_datatype_t)255, + 1, + 0, + values, + sizeof(uint32_t) * 5, + nullptr, + 0, + &enumeration); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - data nullptr") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_INT32, + 1, + 0, + nullptr, + sizeof(uint32_t) * 5, + nullptr, + 0, + &enumeration); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - data_size == 0") { + auto rc = tiledb_enumeration_alloc( + ctx.context, TILEDB_INT32, 1, 0, values, 0, nullptr, 0, &enumeration); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - enumeration nullptr") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_INT32, + 1, + 0, + values, + sizeof(uint32_t) * 5, + nullptr, + 0, + nullptr); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - offsets nullptr") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_STRING_ASCII, + UINT32_MAX, + 0, + (void*)data, + strlen(data), + nullptr, + sizeof(uint64_t) * 5, + &enumeration); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } + + SECTION("failure - offsets_size == 0") { + auto rc = tiledb_enumeration_alloc( + ctx.context, + TILEDB_STRING_ASCII, + UINT32_MAX, + 0, + 
(void*)data, + strlen(data), + offsets, + 0, + &enumeration); + REQUIRE(rc == TILEDB_ERR); + tiledb_enumeration_free(&enumeration); + } +} + +TEST_CASE( + "C API: tiledb_enumeration_free argument validation", + "[capi][enumeration]") { + REQUIRE_NOTHROW(tiledb_enumeration_free(nullptr)); +} + +TEST_CASE( + "C API: tiledb_enumeration_get_type argument validation", + "[capi][enumeration]") { + FixedSizeEnumeration fe; + VarSizeEnumeration ve; + tiledb_datatype_t dt; + + SECTION("success") { + auto rc = + tiledb_enumeration_get_type(fe.ctx_.context, fe.enumeration_, &dt); + REQUIRE(rc == TILEDB_OK); + REQUIRE(dt == TILEDB_UINT32); + + rc = tiledb_enumeration_get_type(ve.ctx_.context, ve.enumeration_, &dt); + REQUIRE(rc == TILEDB_OK); + REQUIRE(dt == TILEDB_STRING_UTF8); + } + + SECTION("failure - invalid context") { + auto rc = tiledb_enumeration_get_type(nullptr, fe.enumeration_, &dt); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + } + + SECTION("failure - invalid enumeration") { + auto rc = tiledb_enumeration_get_type(fe.ctx_.context, nullptr, &dt); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid type pointer") { + auto rc = + tiledb_enumeration_get_type(fe.ctx_.context, fe.enumeration_, nullptr); + REQUIRE(rc == TILEDB_ERR); + } +} + +TEST_CASE( + "C API: tiledb_enumeration_get_cell_val_num argument validation", + "[capi][enumeration]") { + FixedSizeEnumeration fe; + VarSizeEnumeration ve; + uint32_t cvn; + + SECTION("success") { + auto rc = tiledb_enumeration_get_cell_val_num( + fe.ctx_.context, fe.enumeration_, &cvn); + REQUIRE(rc == TILEDB_OK); + REQUIRE(cvn == 1); + + rc = tiledb_enumeration_get_cell_val_num( + ve.ctx_.context, ve.enumeration_, &cvn); + REQUIRE(rc == TILEDB_OK); + REQUIRE(cvn == UINT32_MAX); + } + + SECTION("failure - invalid context") { + auto rc = + tiledb_enumeration_get_cell_val_num(nullptr, fe.enumeration_, &cvn); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + } + + SECTION("failure - invalid enumeration") { + auto rc = + 
tiledb_enumeration_get_cell_val_num(fe.ctx_.context, nullptr, &cvn); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid cell_val_num pointer") { + auto rc = tiledb_enumeration_get_cell_val_num( + fe.ctx_.context, fe.enumeration_, nullptr); + REQUIRE(rc == TILEDB_ERR); + } +} + +TEST_CASE( + "C API: tiledb_enumeration_get_ordered argument validation", + "[capi][enumeration]") { + FixedSizeEnumeration fe; + VarSizeEnumeration ve; + int o; + + SECTION("success") { + auto rc = + tiledb_enumeration_get_ordered(fe.ctx_.context, fe.enumeration_, &o); + REQUIRE(rc == TILEDB_OK); + REQUIRE(!o); + + rc = tiledb_enumeration_get_ordered(ve.ctx_.context, ve.enumeration_, &o); + REQUIRE(rc == TILEDB_OK); + REQUIRE(!o); + } + + SECTION("failure - invalid context") { + auto rc = tiledb_enumeration_get_ordered(nullptr, fe.enumeration_, &o); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + } + + SECTION("failure - invalid enumeration") { + auto rc = tiledb_enumeration_get_ordered(fe.ctx_.context, nullptr, &o); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid ordered pointer") { + auto rc = tiledb_enumeration_get_ordered( + fe.ctx_.context, fe.enumeration_, nullptr); + REQUIRE(rc == TILEDB_ERR); + } +} + +TEST_CASE( + "C API: tiledb_enumeration_get_data argument validation", + "[capi][enumeration]") { + FixedSizeEnumeration fe; + VarSizeEnumeration ve; + const void* d; + uint64_t ds; + + uint32_t fixed_expect[5] = {1, 2, 3, 4, 5}; + const char* var_expect = "foobarbazbingobango"; + + SECTION("success") { + auto rc = + tiledb_enumeration_get_data(fe.ctx_.context, fe.enumeration_, &d, &ds); + REQUIRE(rc == TILEDB_OK); + REQUIRE(std::memcmp(fixed_expect, d, sizeof(uint32_t) * 5) == 0); + REQUIRE(ds == sizeof(uint32_t) * 5); + + rc = tiledb_enumeration_get_data(ve.ctx_.context, ve.enumeration_, &d, &ds); + REQUIRE(rc == TILEDB_OK); + REQUIRE(std::memcmp(var_expect, d, strlen(var_expect)) == 0); + REQUIRE(ds == strlen(var_expect)); + } + + SECTION("failure - invalid 
context") { + auto rc = tiledb_enumeration_get_data(nullptr, fe.enumeration_, &d, &ds); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + } + + SECTION("failure - invalid enumeration") { + auto rc = tiledb_enumeration_get_data(fe.ctx_.context, nullptr, &d, &ds); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid data pointer") { + auto rc = tiledb_enumeration_get_data( + fe.ctx_.context, fe.enumeration_, nullptr, &ds); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid data size pointer") { + auto rc = tiledb_enumeration_get_data( + fe.ctx_.context, fe.enumeration_, &d, nullptr); + REQUIRE(rc == TILEDB_ERR); + } +} + +TEST_CASE( + "C API: tiledb_enumeration_get_offsets argument validation", + "[capi][enumeration]") { + FixedSizeEnumeration fe; + VarSizeEnumeration ve; + const void* o; + uint64_t os; + + uint64_t var_expect[5] = {0, 3, 6, 9, 14}; + + SECTION("success") { + auto rc = tiledb_enumeration_get_offsets( + fe.ctx_.context, fe.enumeration_, &o, &os); + REQUIRE(rc == TILEDB_OK); + REQUIRE(o == nullptr); + REQUIRE(os == 0); + + rc = tiledb_enumeration_get_offsets( + ve.ctx_.context, ve.enumeration_, &o, &os); + REQUIRE(rc == TILEDB_OK); + REQUIRE(std::memcmp(var_expect, o, sizeof(uint64_t) * 5) == 0); + REQUIRE(os == sizeof(uint64_t) * 5); + } + + SECTION("failure - invalid context") { + auto rc = tiledb_enumeration_get_offsets(nullptr, fe.enumeration_, &o, &os); + REQUIRE(rc == TILEDB_INVALID_CONTEXT); + } + + SECTION("failure - invalid enumeration") { + auto rc = tiledb_enumeration_get_offsets(fe.ctx_.context, nullptr, &o, &os); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid offsets pointer") { + auto rc = tiledb_enumeration_get_offsets( + fe.ctx_.context, fe.enumeration_, nullptr, &os); + REQUIRE(rc == TILEDB_ERR); + } + + SECTION("failure - invalid offsets size pointer") { + auto rc = tiledb_enumeration_get_offsets( + fe.ctx_.context, fe.enumeration_, &o, nullptr); + REQUIRE(rc == TILEDB_ERR); + } +} diff --git 
a/tiledb/sm/array/array.cc b/tiledb/sm/array/array.cc index 43ee7ef43279..882cfe29d4c8 100644 --- a/tiledb/sm/array/array.cc +++ b/tiledb/sm/array/array.cc @@ -582,6 +582,32 @@ void Array::delete_fragments_list( } } +shared_ptr Array::get_enumeration(const std::string& attr_name) { + if (!is_open_) { + throw ArrayStatusException("Cannot get enumeration; Array is not open"); + } + + // Do we need to check all previous schema versions if the attribute has + // been dropped? + auto attr = array_schema_latest_->attribute(attr_name); + + if (attr == nullptr) { + return nullptr; + } + + if (!attr->has_enumeration()) { + return nullptr; + } + + auto ptr = array_schema_latest_->enumeration(attr->get_enumeration_name()); + if (ptr != nullptr) { + return ptr; + } + + return array_dir_.load_enumeration( + array_schema_latest_, attr->get_enumeration_name(), get_encryption_key()); +} + bool Array::is_empty() const { return fragment_metadata_.empty(); } diff --git a/tiledb/sm/array/array.h b/tiledb/sm/array/array.h index 92f2fccf14f4..3951aa88cd45 100644 --- a/tiledb/sm/array/array.h +++ b/tiledb/sm/array/array.h @@ -257,6 +257,11 @@ class Array { return fragment_metadata_; } + /** + * Get the enumeration for an attribute + */ + shared_ptr get_enumeration(const std::string& attr_name); + /** * Returns `true` if the array is empty at the time it is opened. * The funciton returns `false` if the array is not open. 
diff --git a/tiledb/sm/array/array_directory.cc b/tiledb/sm/array/array_directory.cc index 35984fffb0ca..6bbf6046e221 100644 --- a/tiledb/sm/array/array_directory.cc +++ b/tiledb/sm/array/array_directory.cc @@ -33,6 +33,7 @@ #include "tiledb/sm/array/array_directory.h" #include "tiledb/common/logger.h" #include "tiledb/common/stdx_string.h" +#include "tiledb/sm/array_schema/enumeration.h" #include "tiledb/sm/filesystem/vfs.h" #include "tiledb/sm/misc/constants.h" #include "tiledb/sm/misc/parallel_functions.h" @@ -185,6 +186,32 @@ ArrayDirectory::load_all_array_schemas( return array_schemas; } +shared_ptr ArrayDirectory::load_enumeration( + shared_ptr schema, + const std::string& enumeration_name, + const EncryptionKey& encryption_key) const { + auto timer_se = resources_.get().stats().start_timer("sm_load_enumeration"); + + // A subtle assertion that this schema knows about this enumeration + // name. If enumeration_name is unknown it will throw an exception. + schema->enumeration(enumeration_name); + + auto enum_uri = uri_.join_path(constants::array_schema_dir_name) + .join_path(constants::array_enumerations_dir_name) + .join_path(enumeration_name); + + auto&& tile = GenericTileIO::load(resources_, enum_uri, 0, encryption_key); + resources_.get().stats().add_counter("read_enumeration_size", tile.size()); + + Deserializer deserializer(tile.data(), tile.size()); + auto enum_ptr = Enumeration::deserialize(deserializer); + enum_ptr->set_name(enumeration_name); + + schema->set_enumeration(enumeration_name, enum_ptr); + + return enum_ptr; +} + const URI& ArrayDirectory::uri() const { return uri_; } @@ -1140,10 +1167,12 @@ Status ArrayDirectory::compute_array_schema_uris( if (!array_schema_dir_uris.empty()) { array_schema_uris_.reserve( array_schema_uris_.size() + array_schema_dir_uris.size()); - std::copy( - array_schema_dir_uris.begin(), - array_schema_dir_uris.end(), - std::back_inserter(array_schema_uris_)); + for (auto& uri : array_schema_dir_uris) { + if 
(uri.last_path_part() == constants::array_enumerations_dir_name) { + continue; + } + array_schema_uris_.push_back(uri); + } } return Status::Ok(); diff --git a/tiledb/sm/array/array_directory.h b/tiledb/sm/array/array_directory.h index c4e245314f46..f53f040f4588 100644 --- a/tiledb/sm/array/array_directory.h +++ b/tiledb/sm/array/array_directory.h @@ -385,6 +385,18 @@ class ArrayDirectory { std::unordered_map> load_all_array_schemas(const EncryptionKey& encryption_key) const; + /** + * Load an enumeration from schema with the given name. + * + * @param schema The ArraySchema that references the enumeration name. + * @param enumeration_name The name of the enumeration to load + * @return The loaded enumeration + */ + shared_ptr load_enumeration( + shared_ptr schema, + const std::string& enumeration_name, + const EncryptionKey& encryption_ley) const; + /** Returns the array URI. */ const URI& uri() const; diff --git a/tiledb/sm/array/test/CMakeLists.txt b/tiledb/sm/array/test/CMakeLists.txt index 3f06e3922eeb..62b222de11ea 100644 --- a/tiledb/sm/array/test/CMakeLists.txt +++ b/tiledb/sm/array/test/CMakeLists.txt @@ -28,7 +28,7 @@ include(unit_test) commence(unit_test array) this_target_sources(main.cc unit_array_directory.cc) - this_target_link_libraries(array) + this_target_link_libraries(array context_resources) conclude(unit_test) commence(unit_test consistency) diff --git a/tiledb/sm/array_schema/CMakeLists.txt b/tiledb/sm/array_schema/CMakeLists.txt index f9616f3b7092..df6ec2a47a47 100644 --- a/tiledb/sm/array_schema/CMakeLists.txt +++ b/tiledb/sm/array_schema/CMakeLists.txt @@ -32,7 +32,14 @@ include(object_library) # commence(object_library attribute) this_target_sources(attribute.cc) - this_target_object_libraries(baseline buffer constants filter_pipeline range stringx) + this_target_object_libraries( + baseline + buffer + constants + filter_pipeline + range + stringx + uuid) conclude(object_library) # @@ -51,13 +58,27 @@ commence(object_library domain) 
this_target_object_libraries(datum dimension math) conclude(object_library) +# +# `enumeration` object library +# +commence(object_library enumeration) + this_target_sources(enumeration.cc) + this_target_object_libraries(buffer constants) +conclude(object_library) + # # `array_schema` object library # commence(object_library array_schema) this_target_sources(array_schema.cc dimension_label.cc) this_target_object_libraries( - attribute domain time uri_format uuid vfs) + attribute domain enumeration time uri_format uuid vfs) conclude(object_library) +# This is linked outside the object_library scope because ContextResources +# is recompiled as part of the capi_context_stub. Including context_resources +# here like this prevents many headaches revolving around duplicate symbols +# when linking executables. +target_link_libraries(compile_array_schema PRIVATE context_resources generic_tile_io) + add_test_subdirectory() diff --git a/tiledb/sm/array_schema/array_schema.cc b/tiledb/sm/array_schema/array_schema.cc index 656f034f9466..8ec7df8ae0d3 100644 --- a/tiledb/sm/array_schema/array_schema.cc +++ b/tiledb/sm/array_schema/array_schema.cc @@ -39,6 +39,7 @@ #include "tiledb/sm/array_schema/dimension.h" #include "tiledb/sm/array_schema/dimension_label.h" #include "tiledb/sm/array_schema/domain.h" +#include "tiledb/sm/array_schema/enumeration.h" #include "tiledb/sm/buffer/buffer.h" #include "tiledb/sm/enums/array_type.h" #include "tiledb/sm/enums/compressor.h" @@ -50,6 +51,8 @@ #include "tiledb/sm/filter/webp_filter.h" #include "tiledb/sm/misc/hilbert.h" #include "tiledb/sm/misc/tdb_time.h" +#include "tiledb/sm/tile/generic_tile_io.h" +#include "tiledb/storage_format/uri/generate_uri.h" #include "tiledb/storage_format/uri/parse_uri.h" #include @@ -147,11 +150,15 @@ ArraySchema::ArraySchema( dim_map_[dim->name()] = dim; } - // Create attribute map + // Create attribute and enumeration map if (!attributes_.empty()) { for (auto attr_iter : attributes_) { auto attr = 
attr_iter.get(); attribute_map_[attr->name()] = attr; + if (attr->has_enumeration()) { + enumeration_map_[attr->get_enumeration_name()] = + shared_ptr(); + } } } @@ -196,8 +203,14 @@ ArraySchema::ArraySchema(const ArraySchema& array_schema) { throw_if_not_ok(set_domain(array_schema.domain_)); attribute_map_.clear(); - for (auto attr : array_schema.attributes_) + enumeration_map_.clear(); + for (auto attr : array_schema.attributes_) { throw_if_not_ok(add_attribute(attr, false)); + if (attr->has_enumeration()) { + enumeration_map_[attr->get_enumeration_name()] = + shared_ptr(); + } + } // Create dimension label map for (const auto& label : array_schema.dimension_labels_) { @@ -248,6 +261,40 @@ const std::vector>& ArraySchema::attributes() return attributes_; } +shared_ptr ArraySchema::enumeration( + const std::string& enumeration_name) const { + auto found = enumeration_map_.find(enumeration_name); + if (found == enumeration_map_.end()) { + throw ArraySchemaStatusException( + "Unknown enumeration '" + enumeration_name + "'"); + } + + return found->second; +} + +std::vector> ArraySchema::enumerations() const { + std::vector> ret; + ret.resize(enumeration_map_.size()); + + int idx = 0; + for (const auto& pair : enumeration_map_) { + ret[idx++] = pair.second; + } + + return ret; +} + +void ArraySchema::set_enumeration( + const std::string& enumeration_name, shared_ptr enumeration) { + auto found = enumeration_map_.find(enumeration_name); + if (found == enumeration_map_.end()) { + throw ArraySchemaStatusException( + "Unknown enumeration '" + enumeration_name + "'"); + } + + found->second = enumeration; +} + uint64_t ArraySchema::capacity() const { return capacity_; } @@ -697,6 +744,14 @@ bool ArraySchema::is_nullable(const std::string& name) const { return attr->nullable(); } +bool ArraySchema::has_enumeration(const std::string& name) const { + auto attr = this->attribute(name); + if (attr == nullptr) { + return false; + } + return attr->has_enumeration(); +} + // ===== 
FORMAT ===== // version (uint32_t) // allow_dups (bool) @@ -851,6 +906,41 @@ Status ArraySchema::add_attribute( return Status::Ok(); } +void ArraySchema::add_attribute( + shared_ptr attr, + shared_ptr enumeration, + bool check_special) { + if (attr == nullptr) { + throw ArraySchemaStatusException( + "Cannot add attribute; Input attribute is null"); + } + + if (enumeration == nullptr) { + throw ArraySchemaStatusException( + "Cannot add attribute; Input enumeration is null"); + } + + if (enumeration->name().empty()) { + auto name = storage_format::generate_uri( + timestamp_range_.first, timestamp_range_.second, write_version()); + if (name[0] == '/') { + name = name.substr(1); + } + + enumeration->set_name(name); + } + + auto found = enumeration_map_.find(enumeration->name()); + if (found == enumeration_map_.end()) { + enumerations_.push_back(enumeration); + enumeration_map_[enumeration->name()] = enumeration; + } + + attr->set_enumeration_name(enumeration->name()); + + throw_if_not_ok(add_attribute(attr, check_special)); +} + void ArraySchema::add_dimension_label( dimension_size_type dim_id, const std::string& name, @@ -1280,6 +1370,7 @@ const std::string& ArraySchema::name() const { /* ****************************** */ /* PRIVATE METHODS */ /* ****************************** */ + void ArraySchema::check_attribute_dimension_label_names() const { std::set names; // Check attribute and dimension names are unique. 
diff --git a/tiledb/sm/array_schema/array_schema.h b/tiledb/sm/array_schema/array_schema.h index d4a8bc64e15a..e9ba956c5da2 100644 --- a/tiledb/sm/array_schema/array_schema.h +++ b/tiledb/sm/array_schema/array_schema.h @@ -43,6 +43,7 @@ #include "tiledb/sm/misc/constants.h" #include "tiledb/sm/misc/hilbert.h" #include "tiledb/sm/misc/uuid.h" +#include "tiledb/sm/storage_manager/context_resources.h" using namespace tiledb::common; @@ -55,6 +56,7 @@ class ConstBuffer; class Dimension; class DimensionLabel; class Domain; +class Enumeration; enum class ArrayType : uint8_t; enum class Compressor : uint8_t; @@ -184,6 +186,25 @@ class ArraySchema { /** Returns the attributes. */ const std::vector>& attributes() const; + /** + * Returns a shared pointer to the named enumeration (nullptr if it has not + * been loaded yet). Throws if the schema does not know the name. + */ + shared_ptr enumeration(const std::string& name) const; + + /** Returns the enumerations. */ + std::vector> enumerations() const; + + /** + * Store a loaded enumeration + * + * @param enumeration_name The name of the enumeration to store. This must + * already be known to the array schema. + * @param enumeration The loaded enumeration to store + */ + void set_enumeration( + const std::string& enumeration_name, shared_ptr enumeration); + /** Returns the capacity. */ uint64_t capacity() const; @@ -301,6 +322,15 @@ class ArraySchema { /** Returns true if the input name is nullable. */ bool is_nullable(const std::string& name) const; + /** + * Returns true if the attribute with given name has an enumeration. Returns + * false if no attribute with name exists (rather than throws). + * + * @param name The name of the *attribute* that might have an enumeration. + * @return bool + */ + bool has_enumeration(const std::string& name) const; + /** * Serializes the array schema object into a buffer.
* @@ -338,6 +368,22 @@ class ArraySchema { Status add_attribute( shared_ptr attr, bool check_special = true); + /** + * Adds an attribute with an enumeration + * + * @param attr The attribute to be added + * @param enumeration The enumeration to set on the attribute + * @param check_special If `true` this function will check if the attribute + * is special (starting with `__`) and error if that's the case. Setting + * to `false` will allow adding attributes starting with `__`, noting + * that particular care must be taken (i.e., the user must know what + * they are doing in this case). + */ + void add_attribute( + shared_ptr attr, + shared_ptr enumeration, + bool check_special = true); + /** * Adds a dimension label to the array schema. * @@ -546,6 +592,12 @@ class ArraySchema { /** A map from the dimension label names to the label schemas. */ std::unordered_map dimension_label_map_; + /** The array enumerations */ + std::vector> enumerations_; + + /** A map of URIs to Enumerations */ + std::unordered_map> enumeration_map_; + /** The filter pipeline run on offset tiles for var-length attributes. 
*/ FilterPipeline cell_var_offsets_filters_; diff --git a/tiledb/sm/array_schema/array_schema_evolution.cc b/tiledb/sm/array_schema/array_schema_evolution.cc index bf285733c71e..c57f1094576e 100644 --- a/tiledb/sm/array_schema/array_schema_evolution.cc +++ b/tiledb/sm/array_schema/array_schema_evolution.cc @@ -35,10 +35,12 @@ #include "tiledb/common/common.h" #include "tiledb/common/heap_memory.h" #include "tiledb/common/logger.h" +#include "tiledb/common/status.h" #include "tiledb/sm/array_schema/array_schema.h" #include "tiledb/sm/array_schema/attribute.h" #include "tiledb/sm/array_schema/dimension.h" #include "tiledb/sm/array_schema/domain.h" +#include "tiledb/sm/array_schema/enumeration.h" #include "tiledb/sm/buffer/buffer.h" #include "tiledb/sm/enums/array_type.h" #include "tiledb/sm/enums/compressor.h" @@ -58,6 +60,14 @@ using namespace tiledb::common; namespace tiledb { namespace sm { +/** Class for locally generated exceptions. */ +class ArraySchemaEvolutionException : public StatusException { + public: + explicit ArraySchemaEvolutionException(const std::string& msg) + : StatusException("ArraySchemaEvolution", msg) { + } +}; + /* ****************************** */ /* CONSTRUCTORS & DESTRUCTORS */ /* ****************************** */ @@ -73,30 +83,32 @@ ArraySchemaEvolution::~ArraySchemaEvolution() { /* API */ /* ****************************** */ -tuple>> -ArraySchemaEvolution::evolve_schema( +shared_ptr ArraySchemaEvolution::evolve_schema( const shared_ptr& orig_schema) { std::lock_guard lock(mtx_); if (orig_schema == nullptr) { - return { - LOG_STATUS(Status_ArraySchemaEvolutionError( - "Cannot evolve schema; Input array schema is null")), - nullopt}; + throw ArraySchemaEvolutionException( + "Cannot evolve schema; Input array schema is null"); } auto schema = make_shared(HERE(), *(orig_schema.get())); // Add attributes. 
for (auto& attr : attributes_to_add_map_) { - RETURN_NOT_OK_TUPLE(schema->add_attribute(attr.second, false), nullopt); + auto eit = enumerations_to_add_map_.find(attr.first); + if (eit == enumerations_to_add_map_.end()) { + throw_if_not_ok(schema->add_attribute(attr.second, false)); + } else { + schema->add_attribute(attr.second, eit->second, false); + } } // Drop attributes. for (auto& attr_name : attributes_to_drop_) { bool has_attr = false; - RETURN_NOT_OK_TUPLE(schema->has_attribute(attr_name, &has_attr), nullopt); + throw_if_not_ok(schema->has_attribute(attr_name, &has_attr)); if (has_attr) { - RETURN_NOT_OK_TUPLE(schema->drop_attribute(attr_name), nullopt); + throw_if_not_ok(schema->drop_attribute(attr_name)); } } @@ -104,28 +116,27 @@ ArraySchemaEvolution::evolve_schema( // Set timestamp, if specified if (std::get<0>(timestamp_range_) != 0) { - RETURN_NOT_OK_TUPLE( - schema.get()->set_timestamp_range(timestamp_range_), nullopt); - RETURN_NOT_OK_TUPLE(schema->generate_uri(timestamp_range_), nullopt); + throw_if_not_ok(schema.get()->set_timestamp_range(timestamp_range_)); + throw_if_not_ok(schema->generate_uri(timestamp_range_)); } else { // Generate new schema URI - RETURN_NOT_OK_TUPLE(schema->generate_uri(), nullopt); + throw_if_not_ok(schema->generate_uri()); } - return {Status::Ok(), schema}; + return schema; } -Status ArraySchemaEvolution::add_attribute(const Attribute* attr) { +void ArraySchemaEvolution::add_attribute(const Attribute* attr) { std::lock_guard lock(mtx_); // Sanity check if (attr == nullptr) - return LOG_STATUS(Status_ArraySchemaEvolutionError( - "Cannot add attribute; Input attribute is null")); + throw ArraySchemaEvolutionException( + "Cannot add attribute; Input attribute is null"); if (attributes_to_add_map_.find(attr->name()) != attributes_to_add_map_.end()) { - return LOG_STATUS(Status_ArraySchemaEvolutionError( - "Cannot add attribute; Input attribute name is already there")); + throw ArraySchemaEvolutionException( + "Cannot add 
attribute; Input attribute name is already there"); } // Create new attribute and potentially set a default name @@ -134,8 +145,15 @@ Status ArraySchemaEvolution::add_attribute(const Attribute* attr) { if (attributes_to_drop_.find(attr->name()) != attributes_to_drop_.end()) { attributes_to_drop_.erase(attr->name()); } +} + +void ArraySchemaEvolution::add_attribute( + const Attribute* attr, shared_ptr enumeration) { + // Wait to acquire the lock because add_attribute takes the lock as well + add_attribute(attr); - return Status::Ok(); + std::lock_guard lock(mtx_); + enumerations_to_add_map_[attr->name()] = enumeration; } std::vector ArraySchemaEvolution::attribute_names_to_add() const { @@ -157,15 +175,41 @@ const Attribute* ArraySchemaEvolution::attribute_to_add( return (it == attributes_to_add_map_.end()) ? nullptr : it->second.get(); } -Status ArraySchemaEvolution::drop_attribute(const std::string& attribute_name) { +std::vector ArraySchemaEvolution::enumeration_names_to_add() + const { + std::lock_guard lock(mtx_); + std::vector names; + names.reserve(enumerations_to_add_map_.size()); + for (auto elem : enumerations_to_add_map_) { + names.push_back(elem.first); + } + return names; +} + +const Enumeration* ArraySchemaEvolution::enumeration_to_add( + const std::string& name) const { + std::lock_guard lock(mtx_); + auto it = enumerations_to_add_map_.find(name); + + if (it == enumerations_to_add_map_.end()) { + return nullptr; + } + + return it->second.get(); +} + +void ArraySchemaEvolution::drop_attribute(const std::string& attribute_name) { std::lock_guard lock(mtx_); attributes_to_drop_.insert(attribute_name); - if (attributes_to_add_map_.find(attribute_name) != - attributes_to_add_map_.end()) { + auto ait = attributes_to_add_map_.find(attribute_name); + if (ait != attributes_to_add_map_.end()) { // Reset the pointer and erase it - attributes_to_add_map_.erase(attribute_name); + attributes_to_add_map_.erase(ait); + } + auto eit = 
enumerations_to_add_map_.find(attribute_name); + if (eit != enumerations_to_add_map_.end()) { + enumerations_to_add_map_.erase(eit); } - return Status::Ok(); } std::vector ArraySchemaEvolution::attribute_names_to_drop() const { @@ -179,7 +223,7 @@ std::vector ArraySchemaEvolution::attribute_names_to_drop() const { return names; } -Status ArraySchemaEvolution::set_timestamp_range( +void ArraySchemaEvolution::set_timestamp_range( const std::pair& timestamp_range) { if (timestamp_range.first != timestamp_range.second) { throw std::runtime_error(std::string( @@ -188,7 +232,6 @@ Status ArraySchemaEvolution::set_timestamp_range( std::to_string(timestamp_range.second) + " are not equal!")); } timestamp_range_ = timestamp_range; - return Status::Ok(); } std::pair ArraySchemaEvolution::timestamp_range() const { @@ -202,6 +245,7 @@ std::pair ArraySchemaEvolution::timestamp_range() const { void ArraySchemaEvolution::clear() { attributes_to_add_map_.clear(); + enumerations_to_add_map_.clear(); attributes_to_drop_.clear(); timestamp_range_ = {0, 0}; } diff --git a/tiledb/sm/array_schema/array_schema_evolution.h b/tiledb/sm/array_schema/array_schema_evolution.h index 6ba64b754519..846dec6e5e7f 100644 --- a/tiledb/sm/array_schema/array_schema_evolution.h +++ b/tiledb/sm/array_schema/array_schema_evolution.h @@ -38,7 +38,6 @@ #include #include "tiledb/common/common.h" -#include "tiledb/common/status.h" #include "tiledb/sm/filesystem/uri.h" #include "tiledb/sm/filter/filter_pipeline.h" #include "tiledb/sm/misc/constants.h" @@ -55,6 +54,7 @@ class Buffer; class ConstBuffer; class Dimension; class Domain; +class Enumeration; class ArraySchema; enum class ArrayType : uint8_t; @@ -79,16 +79,24 @@ class ArraySchemaEvolution { /* API */ /* ********************************* */ - tuple>> evolve_schema( + shared_ptr evolve_schema( const shared_ptr& orig_schema); /** * Adds an attribute, copying the input. 
* * @param attr The attribute to be added - * @return Status */ - Status add_attribute(const Attribute* attr); + void add_attribute(const Attribute* attr); + + /** + * Adds an attribute with an enumeration, copying the attribute. + * + * @param attr The attribute to be added + * @param enumeration The enumeration to set on the attribute. + */ + void add_attribute( + const Attribute* attr, shared_ptr enumeration); /** Returns the names of attributes to add. */ std::vector attribute_names_to_add() const; @@ -99,19 +107,27 @@ class ArraySchemaEvolution { */ const Attribute* attribute_to_add(const std::string& name) const; + /** Returns the names of the attributes to add that have an enumeration. */ + std::vector enumeration_names_to_add() const; + + /** + * Returns a constant pointer to the selected enumeration or nullptr if + * it does not exist. + */ + const Enumeration* enumeration_to_add(const std::string& name) const; + /** * Drops an attribute. * * @param attr The attribute to be dropped - * @return Status */ - Status drop_attribute(const std::string& attribute_name); + void drop_attribute(const std::string& attribute_name); /** Returns the names of attributes to drop. */ std::vector attribute_names_to_drop() const; /** Set a timestamp range for the array schema evolution */ - Status set_timestamp_range( + void set_timestamp_range( const std::pair& timestamp_range); /** Returns the timestamp range. */ @@ -126,6 +142,10 @@ class ArraySchemaEvolution { /** It maps each attribute name to the corresponding attribute object. */ std::unordered_map> attributes_to_add_map_; + /** It maps each attribute name to the enumeration to add with it. */ + std::unordered_map> + enumerations_to_add_map_; + /** The names of array attributes to be dropped.
*/ std::unordered_set attributes_to_drop_; @@ -151,4 +171,4 @@ class ArraySchemaEvolution { } // namespace sm } // namespace tiledb -#endif // TILEDB_SCHEMA_EVOLUTION_H \ No newline at end of file +#endif // TILEDB_SCHEMA_EVOLUTION_H diff --git a/tiledb/sm/array_schema/attribute.cc b/tiledb/sm/array_schema/attribute.cc index 8c6132a5c925..055241584a56 100644 --- a/tiledb/sm/array_schema/attribute.cc +++ b/tiledb/sm/array_schema/attribute.cc @@ -38,6 +38,7 @@ #include "tiledb/sm/enums/filter_type.h" #include "tiledb/sm/filter/compression_filter.h" #include "tiledb/sm/misc/parse_argument.h" +#include "tiledb/sm/misc/uuid.h" #include "tiledb/type/range/range.h" #include @@ -115,7 +116,8 @@ Attribute::Attribute( const FilterPipeline& filter_pipeline, const ByteVecValue& fill_value, uint8_t fill_value_validity, - DataOrder order) + DataOrder order, + const std::string& enumeration_name) : cell_val_num_(cell_val_num) , nullable_(nullable) , filters_(filter_pipeline) @@ -123,7 +125,8 @@ Attribute::Attribute( , type_(type) , fill_value_(fill_value) , fill_value_validity_(fill_value_validity) - , order_(order) { + , order_(order) + , enumeration_name_(enumeration_name) { } Attribute::Attribute(const Attribute* attr) { @@ -136,6 +139,7 @@ Attribute::Attribute(const Attribute* attr) { fill_value_ = attr->fill_value_; fill_value_validity_ = attr->fill_value_validity_; order_ = attr->order_; + enumeration_name_ = attr->enumeration_name_; } Attribute::~Attribute() = default; @@ -204,6 +208,13 @@ Attribute Attribute::deserialize( order = data_order_from_int(deserializer.read()); } + std::string enumeration_name; + if (version >= 19) { + auto enum_name_length = deserializer.read(); + enumeration_name.resize(enum_name_length); + deserializer.read(enumeration_name.data(), enum_name_length); + } + return Attribute( name, datatype, @@ -212,7 +223,8 @@ Attribute Attribute::deserialize( filterpipeline, fill_value, fill_value_validity, - order); + order, + enumeration_name); } void 
Attribute::dump(FILE* out) const { @@ -299,6 +311,12 @@ void Attribute::serialize( if (version >= 17) { serializer.write(static_cast(order_)); } + + // Write enumeration name + if (version >= 19) { + serializer.write(enumeration_name_.size()); + serializer.write(enumeration_name_.data(), enumeration_name_.size()); + } } void Attribute::set_cell_val_num(unsigned int cell_val_num) { @@ -480,6 +498,18 @@ DataOrder Attribute::order() const { return order_; } +void Attribute::set_enumeration_name(const std::string& name) { + enumeration_name_ = name; +} + +const std::string& Attribute::get_enumeration_name() const { + return enumeration_name_; +} + +bool Attribute::has_enumeration() const { + return !enumeration_name_.empty(); +} + /* ********************************* */ /* PRIVATE METHODS */ /* ********************************* */ diff --git a/tiledb/sm/array_schema/attribute.h b/tiledb/sm/array_schema/attribute.h index 7e3e7a4dd302..1c5b463d14e1 100644 --- a/tiledb/sm/array_schema/attribute.h +++ b/tiledb/sm/array_schema/attribute.h @@ -51,6 +51,7 @@ namespace sm { class Buffer; class ConstBuffer; +class Enumeration; enum class Compressor : uint8_t; enum class Datatype : uint8_t; @@ -110,7 +111,8 @@ class Attribute { const FilterPipeline& filter_pipeline, const ByteVecValue& fill_value, uint8_t fill_value_validity, - DataOrder order = DataOrder::UNORDERED_DATA); + DataOrder order = DataOrder::UNORDERED_DATA, + const std::string& enumeration_name = {}); /** * Constructor. It clones the input attribute. @@ -242,6 +244,15 @@ class Attribute { static ByteVecValue default_fill_value( Datatype datatype, uint32_t cell_val_num); + /** Set the enumeration name for this attribute. */ + void set_enumeration_name(const std::string& name); + + /** Get the enumeration name for this attribute. */ + const std::string& get_enumeration_name() const; + + /** Check if this attribute has an enumeration.
*/ + bool has_enumeration() const; + private: /* ********************************* */ /* PRIVATE ATTRIBUTES */ @@ -271,8 +282,11 @@ class Attribute { /** The required order of the data stored in the attribute. */ DataOrder order_; + /** The name of the enumeration, or empty if there is none. */ + std::string enumeration_name_; + /* ********************************* */ - /* PRIVATE ATTRIBUTES */ + /* PRIVATE METHODS */ /* ********************************* */ /** Sets the default fill value. */ diff --git a/tiledb/sm/array_schema/enumeration.cc b/tiledb/sm/array_schema/enumeration.cc new file mode 100644 index 000000000000..89b043083915 --- /dev/null +++ b/tiledb/sm/array_schema/enumeration.cc @@ -0,0 +1,230 @@ +/** + * @file enumeration.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file implements class Enumeration.
+ */ + +#include + +#include "enumeration.h" + +namespace tiledb::sm { + +/** Class for locally generated status exceptions. */ +class EnumerationException : public StatusException { + public: + explicit EnumerationException(const std::string& msg) + : StatusException("Enumeration", msg) { + } +}; + +Enumeration::Enumeration( + Datatype type, + uint32_t cell_val_num, + bool ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size) + : type_(type) + , cell_val_num_(cell_val_num) + , ordered_(ordered) + , data_(data_size) + , offsets_(offsets_size) { + if (cell_val_num == 0) { + throw EnumerationException("Invalid cell_val_num in Enumeration"); + } + + if (data == nullptr || data_size == 0) { + throw EnumerationException("No attribute value data supplied."); + } + + if (var_size() && (offsets == nullptr || offsets_size == 0)) { + throw EnumerationException( + "Variable length datatype defined but offsets are not present"); + } else if (!var_size() && (offsets != nullptr || offsets_size > 0)) { + throw EnumerationException( + "Fixed length datatype defined but offsets are present"); + } + + if (var_size()) { + if (offsets_size % sizeof(uint64_t) != 0) { + throw EnumerationException( + "Invalid offsets size is not a multiple of sizeof(uint64_t)"); + } + auto offset_values = static_cast(offsets); + uint64_t last_offset = (offsets_size / sizeof(uint64_t)) - 1; + if (offset_values[last_offset] > data_size) { + throw EnumerationException( + "Provided data buffer size is too small for the provided offsets."); + } + } else { + if (data_size % cell_size() != 0) { + throw EnumerationException( + "Invalid data size is not a multiple of the cell size."); + } + } + + throw_if_not_ok(data_.write(data, 0, data_size)); + + if (offsets_size > 0) { + throw_if_not_ok(offsets_.write(offsets, 0, offsets_size)); + } + + generate_value_map(); +} + +shared_ptr Enumeration::deserialize(Deserializer& deserializer) { + auto disk_version = 
deserializer.read(); + if (disk_version < constants::enumerations_min_version) { + throw EnumerationException( + "Invalid Enumeration version '" + std::to_string(disk_version) + + "' is older than minimum version for enumerations '" + + std::to_string(constants::enumerations_min_version) + "'"); + } + + auto type = deserializer.read(); + auto cell_val_num = deserializer.read(); + auto ordered = deserializer.read(); + auto data_size = deserializer.read(); + + uint8_t data[data_size]; + deserializer.read(data, data_size); + + if (cell_val_num != constants::var_num) { + return Enumeration::create( + static_cast(type), + cell_val_num, + ordered, + data, + data_size, + nullptr, + 0); + } else { + uint64_t offsets_size = 0; + offsets_size = deserializer.read(); + + uint8_t offsets[offsets_size]; + deserializer.read(offsets, offsets_size); + + return Enumeration::create( + static_cast(type), + cell_val_num, + ordered, + data, + data_size, + offsets, + offsets_size); + } +} + +void Enumeration::serialize(Serializer& serializer, uint32_t version) const { + if (version < constants::enumerations_min_version) { + throw EnumerationException("Invalid version when serializing enumeration."); + } + + serializer.write(version); + + serializer.write(static_cast(type_)); + serializer.write(cell_val_num_); + serializer.write(ordered_); + serializer.write(data_.size()); + serializer.write(data_.data(), data_.size()); + + if (var_size()) { + serializer.write(offsets_.size()); + serializer.write(offsets_.data(), offsets_.size()); + } else { + assert(cell_val_num_ < constants::var_num); + assert(offsets_.size() == 0); + } +} + +void Enumeration::set_name(const std::string& name) { + if (name.empty()) { + throw EnumerationException("Enumeration name must not be empty"); + } + + if (name.find("/") != std::string::npos) { + throw EnumerationException( + "Enumeration name must not contain path separators"); + } + + name_ = name; +} + +uint64_t Enumeration::index_of(UntypedDatumView value) 
const { + std::string_view value_view( + static_cast(value.content()), value.size()); + + auto iter = value_map_.find(value_view); + if (iter == value_map_.end()) { + return std::numeric_limits::max(); + } + + return iter->second; +} + +void Enumeration::generate_value_map() { + auto char_data = data_.data_as(); + if (var_size()) { + auto offsets = offsets_.data_as(); + uint64_t num_offsets = offsets_.size() / sizeof(uint64_t); + + for (uint64_t i = 0; i < num_offsets; i++) { + uint64_t length = 0; + if (i < num_offsets - 1) { + length = offsets[i + 1] - offsets[i]; + } else { + length = data_.size() - offsets[i]; + } + + auto sv = std::string_view(char_data + offsets[i], length); + add_value_to_map(sv, i); + } + } else { + uint64_t i = 0; + auto stride = cell_size(); + while (i * stride < data_.size()) { + auto sv = std::string_view(char_data + i * stride, stride); + add_value_to_map(sv, i); + i += 1; + } + } +} + +void Enumeration::add_value_to_map(std::string_view& sv, uint64_t index) { + if (value_map_.find(sv) != value_map_.end()) { + throw EnumerationException( + "Invalid duplicated value in enumeration '" + std::string(sv) + "'"); + } + value_map_[sv] = index; +} + +} // namespace tiledb::sm diff --git a/tiledb/sm/array_schema/enumeration.h b/tiledb/sm/array_schema/enumeration.h new file mode 100644 index 000000000000..b158c06b1e24 --- /dev/null +++ b/tiledb/sm/array_schema/enumeration.h @@ -0,0 +1,229 @@ +/** + * @file enumeration.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file defines class Enumeration. + */ + +#ifndef TILEDB_ENUMERATION_H +#define TILEDB_ENUMERATION_H + +#include + +#include "tiledb/common/common.h" +#include "tiledb/common/types/untyped_datum.h" +#include "tiledb/sm/buffer/buffer.h" +#include "tiledb/sm/enums/datatype.h" +#include "tiledb/sm/filesystem/uri.h" +#include "tiledb/storage_format/serialization/serializers.h" + +namespace tiledb::sm { + +/** Defines an array enumeration */ +class Enumeration { + public: + /* ********************************* */ + /* CONSTRUCTORS & DESTRUCTORS */ + /* ********************************* */ + + /** No default constructor. Use the create factory method. */ + Enumeration() = delete; + + DISABLE_COPY(Enumeration); + DISABLE_MOVE(Enumeration); + + /** Destructor. 
*/ + ~Enumeration() = default; + + /* ********************************* */ + /* OPERATORS */ + /* ********************************* */ + + DISABLE_COPY_ASSIGN(Enumeration); + DISABLE_MOVE_ASSIGN(Enumeration); + + /* ********************************* */ + /* API */ + /* ********************************* */ + + /** Create a new Enumeration */ + shared_ptr static create( + Datatype type, + uint32_t cell_val_num, + bool ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size) { + struct EnableMakeShared : public Enumeration { + EnableMakeShared( + Datatype type, + uint32_t cell_val_num, + bool ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size) + : Enumeration( + type, + cell_val_num, + ordered, + data, + data_size, + offsets, + offsets_size) { + } + }; + return make_shared( + HERE(), + type, + cell_val_num, + ordered, + data, + data_size, + offsets, + offsets_size); + } + + /** + * Deserialize an enumeration + * + * @param deserializer The deserializer to deserialize from. + * @return A new Enumeration + */ + static shared_ptr deserialize(Deserializer& deserializer); + + /** + * Serializes the enumeration into a buffer. + * + * @param serializer The object the enumeration is serialized into. + */ + void serialize(Serializer& serializer, uint32_t version) const; + + // TODO: Add proxy calls to various Buffer::as_value/cur_data_as functions + // or just return buffers instead? + + /** + * Name stores the `__t1_t2_uuid_version` formatted string that is used + * when constructing URIs for Enumeration storage.
+ * + * @return The name of this Enumeration + */ + const std::string& name() const { + return name_; + } + + void set_name(const std::string& name); + + Datatype type() const { + return type_; + } + + uint32_t cell_val_num() const { + return cell_val_num_; + } + + uint64_t cell_size() const { + if (var_size()) { + return constants::var_size; + } + + return cell_val_num_ * datatype_size(type_); + } + + bool var_size() const { + return cell_val_num_ == constants::var_num; + } + + bool ordered() const { + return ordered_; + } + + tuple data() const { + return {data_.data(), data_.size()}; + } + + tuple offsets() const { + return {offsets_.data(), offsets_.size()}; + } + + uint64_t index_of(UntypedDatumView data) const; + + private: + /* ********************************* */ + /* PRIVATE CONSTRUCTORS */ + /* ********************************* */ + + Enumeration( + Datatype type, + uint32_t cell_val_num, + bool ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size); + + /* ********************************* */ + /* PRIVATE ATTRIBUTES */ + /* ********************************* */ + + /** The name of this Enumeration stored in the enumerations directory. */ + std::string name_; + + /** The type of enumerated values */ + Datatype type_; + + /** Number of values per enumeration value */ + uint32_t cell_val_num_; + + /** A flag which enables or disables inequality comparisons */ + bool ordered_; + + /** The list of enumeration values */ + Buffer data_; + + /** The offsets of each enumeration value if var sized. 
*/ + Buffer offsets_; + + /** Map of values to indices */ + std::unordered_map value_map_; + + /* ********************************* */ + /* PRIVATE METHODS */ + /* ********************************* */ + + /** Populate the value_map_ */ + void generate_value_map(); + + /** Add a value to value_map_ */ + void add_value_to_map(std::string_view& sv, uint64_t index); +}; + +} // namespace tiledb::sm + +#endif // TILEDB_ENUMERATION_H diff --git a/tiledb/sm/array_schema/test/compile_enumeration_main.cc b/tiledb/sm/array_schema/test/compile_enumeration_main.cc new file mode 100644 index 000000000000..dc407325d5b5 --- /dev/null +++ b/tiledb/sm/array_schema/test/compile_enumeration_main.cc @@ -0,0 +1,36 @@ +/** + * @file compile_enumeration_main.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.
+ */ + +#include "tiledb/sm/array_schema/enumeration.h" +#include "tiledb/sm/enums/datatype.h" + +int main(int, char*[]) { + tiledb::sm::Enumeration::create( + tiledb::sm::Datatype::INT32, 1, false, nullptr, 0, nullptr, 0); + return 0; +} diff --git a/tiledb/sm/c_api/tiledb.cc b/tiledb/sm/c_api/tiledb.cc index ed9eaf036925..140f1f17026c 100644 --- a/tiledb/sm/c_api/tiledb.cc +++ b/tiledb/sm/c_api/tiledb.cc @@ -40,6 +40,7 @@ #include "tiledb/api/c_api/buffer_list/buffer_list_api_internal.h" #include "tiledb/api/c_api/config/config_api_internal.h" #include "tiledb/api/c_api/dimension/dimension_api_internal.h" +#include "tiledb/api/c_api/enumeration/enumeration_api_internal.h" #include "tiledb/api/c_api/error/error_api_internal.h" #include "tiledb/api/c_api/filter_list/filter_list_api_internal.h" #include "tiledb/api/c_api/string/string_api_internal.h" @@ -710,6 +711,25 @@ int32_t tiledb_array_schema_timestamp_range( return TILEDB_OK; } +TILEDB_EXPORT int32_t tiledb_array_schema_add_attribute_with_enumeration( + tiledb_ctx_t* ctx, + tiledb_array_schema_t* array_schema, + tiledb_attribute_t* attr, + tiledb_enumeration_t* enumeration) { + if (sanity_check(ctx, array_schema) == TILEDB_ERR || + sanity_check(ctx, attr) == TILEDB_ERR) { + return TILEDB_ERR; + } + + api::ensure_enumeration_is_valid(enumeration); + + array_schema->array_schema_->add_attribute( + make_shared(HERE(), attr->attr_), + enumeration->copy()); + + return TILEDB_OK; +} + int32_t tiledb_array_schema_set_coords_filter_list( tiledb_ctx_t* ctx, tiledb_array_schema_t* array_schema, @@ -1255,12 +1275,8 @@ int32_t tiledb_array_schema_evolution_add_attribute( sanity_check(ctx, attr) == TILEDB_ERR) return TILEDB_ERR; - throw_if_not_ok( - array_schema_evolution->array_schema_evolution_->add_attribute( - attr->attr_)); - return TILEDB_OK; + array_schema_evolution->array_schema_evolution_->add_attribute(attr->attr_); - // Success return TILEDB_OK; } @@ -1272,9 +1288,8 @@ int32_t 
tiledb_array_schema_evolution_drop_attribute( sanity_check(ctx, array_schema_evolution) == TILEDB_ERR) return TILEDB_ERR; - throw_if_not_ok( - array_schema_evolution->array_schema_evolution_->drop_attribute( - attribute_name)); + array_schema_evolution->array_schema_evolution_->drop_attribute( + attribute_name); return TILEDB_OK; } @@ -1287,12 +1302,8 @@ TILEDB_EXPORT int32_t tiledb_array_schema_evolution_set_timestamp_range( sanity_check(ctx, array_schema_evolution) == TILEDB_ERR) return TILEDB_ERR; - throw_if_not_ok( - array_schema_evolution->array_schema_evolution_->set_timestamp_range( - {lo, hi})); - return TILEDB_OK; - - // Success + array_schema_evolution->array_schema_evolution_->set_timestamp_range( + {lo, hi}); return TILEDB_OK; } @@ -3396,6 +3407,26 @@ int32_t tiledb_array_evolve( return TILEDB_OK; } +int32_t tiledb_array_get_enumeration( + tiledb_ctx_t* ctx, + const tiledb_array_t* array, + const char* attr_name, + tiledb_enumeration_t** enumeration) { + if (sanity_check(ctx, array) == TILEDB_ERR) { + return TILEDB_ERR; + } + + if (attr_name == nullptr) { + throw api::CAPIStatusException("'attr_name' must not be null"); + } + + api::ensure_output_pointer_is_valid(enumeration); + auto ptr = array->array_->get_enumeration(attr_name); + *enumeration = tiledb_enumeration_handle_t::make_handle(ptr); + + return TILEDB_OK; +} + int32_t tiledb_array_upgrade_version( tiledb_ctx_t* ctx, const char* array_uri, tiledb_config_t* config) { // Sanity Checks @@ -5353,6 +5384,17 @@ int32_t tiledb_consolidation_plan_free_json_str(char** out) { return TILEDB_OK; } +int32_t tiledb_attribute_has_enumeration( + tiledb_attribute_t* attr, int* has_enumeration) { + if (attr->attr_->has_enumeration()) { + *has_enumeration = 1; + } else { + *has_enumeration = 0; + } + + return TILEDB_OK; +} + } // namespace tiledb::api /* ****************************** */ @@ -5604,6 +5646,14 @@ int32_t tiledb_attribute_get_fill_value_nullable( ctx, attr, value, size, valid); } +int32_t 
tiledb_attribute_has_enumeration( + tiledb_ctx_t* ctx, + tiledb_attribute_t* attr, + int* has_enumeration) noexcept { + return api_entry_context( + ctx, attr, has_enumeration); +} + /* ********************************* */ /* DOMAIN */ /* ********************************* */ @@ -5757,6 +5807,16 @@ int32_t tiledb_array_schema_timestamp_range( ctx, array_schema, lo, hi); } +int32_t tiledb_array_schema_add_attribute_with_enumeration( + tiledb_ctx_t* ctx, + tiledb_array_schema_t* array_schema, + tiledb_attribute_t* attr, + tiledb_enumeration_t* enumeration) noexcept { + return api_entry< + tiledb::api::tiledb_array_schema_add_attribute_with_enumeration>( + ctx, array_schema, attr, enumeration); +} + int32_t tiledb_array_schema_set_coords_filter_list( tiledb_ctx_t* ctx, tiledb_array_schema_t* array_schema, @@ -6933,6 +6993,15 @@ int32_t tiledb_array_evolve( ctx, array_uri, array_schema_evolution); } +int32_t tiledb_array_get_enumeration( + tiledb_ctx_t* ctx, + const tiledb_array_t* array, + const char* attr_name, + tiledb_enumeration_t** enumeration) noexcept { + return api_entry( + ctx, array, attr_name, enumeration); +} + int32_t tiledb_array_upgrade_version( tiledb_ctx_t* ctx, const char* array_uri, diff --git a/tiledb/sm/c_api/tiledb_experimental.h b/tiledb/sm/c_api/tiledb_experimental.h index f17d5eac0474..809c3468eb2f 100644 --- a/tiledb/sm/c_api/tiledb_experimental.h +++ b/tiledb/sm/c_api/tiledb_experimental.h @@ -41,6 +41,7 @@ /* * API sections */ +#include "tiledb/api/c_api/enumeration/enumeration_api_experimental.h" #include "tiledb/api/c_api/group/group_api_external_experimental.h" #include "tiledb/api/c_api/query_plan/query_plan_api_external_experimental.h" #include "tiledb_dimension_label_experimental.h" @@ -190,6 +191,64 @@ TILEDB_EXPORT int32_t tiledb_array_schema_timestamp_range( uint64_t* lo, uint64_t* hi) TILEDB_NOEXCEPT; +/** + * Adds an attribute with enumeration to an array schema. 
+ * + * **Example:** + * + * @code{.c} + * tiledb_attribute_t* attr; + * tiledb_attribute_alloc(ctx, "my_attr", TILEDB_INT32, &attr); + * tiledb_enumeration_t* enumeration; + * tiledb_enumeration_alloc( + * ctx, + * TILEDB_INT64, + * 1, + * FALSE, + * data, + * data_size, + * nullptr, + * 0, + * &enumeration); + * tiledb_array_schema_add_attribute_with_enumeration( + * ctx, array_schema, attr, enumeration); + * @endcode + * + * @param ctx The TileDB context. + * @param array_schema The array schema. + * @param attr The attribute to be added. + * @param enumeration The enumeration to add with the attribute + * @return `TILEDB_OK` for success and `TILEDB_ERR` for error. + */ +TILEDB_EXPORT int32_t tiledb_array_schema_add_attribute_with_enumeration( + tiledb_ctx_t* ctx, + tiledb_array_schema_t* array_schema, + tiledb_attribute_t* attr, + tiledb_enumeration_t* enumeration) TILEDB_NOEXCEPT; + +/* ********************************* */ +/* ATTRIBUTE ENUMERATIONS */ +/* ********************************* */ + +/** + * Check if the given attribute has an enumeration + * + * **Example:** + * + * @code{.c} + * tiledb_attribute_has_enumeration(ctx, attr, &has_enumeration); + * @endcode + * + * @param ctx The TileDB context. + * @param attr The target attribute. + * @param has_enumeration Set to 1 if the attribute has an enumeration, else 0 + * @return `TILEDB_OK` for success and `TILEDB_ERR` for error. + */ +TILEDB_EXPORT int32_t tiledb_attribute_has_enumeration( + tiledb_ctx_t* ctx, + tiledb_attribute_t* attr, + int* has_enumeration) TILEDB_NOEXCEPT; + /* ********************************* */ /* ARRAY */ /* ********************************* */ @@ -249,6 +308,33 @@ TILEDB_EXPORT int32_t tiledb_array_evolve( const char* array_uri, tiledb_array_schema_evolution_t* array_schema_evolution) TILEDB_NOEXCEPT; +/** + * Retrieves an attribute's enumeration given the attribute name (key). + * + * **Example:** + * + * The following retrieves the enumeration of the attribute named "attr_0".
+ * + * @code{.c} + * tiledb_enumeration_t* enumeration; + * tiledb_array_get_enumeration( + * ctx, array, "attr_0", &enumeration); + * // Make sure to free the retrieved enumeration in the end. + * @endcode + * + * @param ctx The TileDB context. + * @param array The array. + * @param name The name (key) of the attribute from which to + * retrieve the enumeration. + * @param enumeration The enumeration object to retrieve. + * @return `TILEDB_OK` for success and `TILEDB_ERR` for error. + */ +TILEDB_EXPORT int32_t tiledb_array_get_enumeration( + tiledb_ctx_t* ctx, + const tiledb_array_t* array, + const char* name, + tiledb_enumeration_t** enumeration) TILEDB_NOEXCEPT; + /** * Upgrades an array to the latest format version. * diff --git a/tiledb/sm/cpp_api/array_experimental.h b/tiledb/sm/cpp_api/array_experimental.h new file mode 100644 index 000000000000..9a7edb9cc645 --- /dev/null +++ b/tiledb/sm/cpp_api/array_experimental.h @@ -0,0 +1,56 @@ +/** + * @file array_experimental.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares the experimental C++ API for arrays. + */ + +#ifndef TILEDB_CPP_API_ARRAY_EXPERIMENTAL_H +#define TILEDB_CPP_API_ARRAY_EXPERIMENTAL_H + +#include "array.h" +#include "enumeration_experimental.h" +#include "tiledb_experimental.h" + +#include + +namespace tiledb { +class ArrayExperimental { + public: + static Enumeration get_enumeration( + const Context& ctx, const Array& array, const std::string& attr_name) { + tiledb_enumeration_t* enmr; + ctx.handle_error(tiledb_array_get_enumeration( + ctx.ptr().get(), array.ptr().get(), attr_name.c_str(), &enmr)); + return Enumeration(ctx, enmr); + } +}; + +} // namespace tiledb + +#endif diff --git a/tiledb/sm/cpp_api/array_schema_experimental.h b/tiledb/sm/cpp_api/array_schema_experimental.h index e712860126a6..5150ec83b74a 100644 --- a/tiledb/sm/cpp_api/array_schema_experimental.h +++ b/tiledb/sm/cpp_api/array_schema_experimental.h @@ -35,6 +35,7 @@ #include "array_schema.h" #include "dimension_label_experimental.h" +#include "enumeration_experimental.h" #include "filter_list.h" #include "tiledb_experimental.h" @@ -154,6 +155,26 @@ class ArraySchemaExperimental { ctx.ptr().get(), array_schema.ptr().get(), name.c_str(), &dim_label)); return DimensionLabel(ctx, dim_label); } + + /** + * Add an attribute with an enumeration to the array schema. 
+ * + * @param ctx TileDB context + * @param array_schema Target array schema + * @param attr The attribute to add + * @param enumeration The enumeration to add + */ + static void add_attribute( + const Context& ctx, + const ArraySchema& array_schema, + const Attribute& attr, + const Enumeration& enmr) { + ctx.handle_error(tiledb_array_schema_add_attribute_with_enumeration( + ctx.ptr().get(), + array_schema.ptr().get(), + attr.ptr().get(), + enmr.ptr().get())); + } }; } // namespace tiledb diff --git a/tiledb/sm/cpp_api/attribute_experimental.h b/tiledb/sm/cpp_api/attribute_experimental.h new file mode 100644 index 000000000000..896cb5055ba7 --- /dev/null +++ b/tiledb/sm/cpp_api/attribute_experimental.h @@ -0,0 +1,67 @@ +/** + * @file attribute_experimental.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + * @section DESCRIPTION + * + * This file declares the experimental C++ API for attributes. + */ + +#ifndef TILEDB_CPP_API_ATTRIBUTE_EXPERIMENTAL_H +#define TILEDB_CPP_API_ATTRIBUTE_EXPERIMENTAL_H + +#include "attribute.h" +#include "context.h" +#include "enumeration_experimental.h" + +#include + +namespace tiledb { +class AttributeExperimental { + public: + /** + * Return whether an attribute has an enumeration + * + * @param ctx TileDB context. + * @param attribute Target attribute. + * @return A bool indicating whether the attribute has an enumeration + */ + static bool has_enumeration(const Context& ctx, Attribute& attribute) { + int has_enum; + ctx.handle_error(tiledb_attribute_has_enumeration( + ctx.ptr().get(), attribute.ptr().get(), &has_enum)); + + if (has_enum) { + return true; + } else { + return false; + } + } +}; + +} // namespace tiledb + +#endif diff --git a/tiledb/sm/cpp_api/deleter.h b/tiledb/sm/cpp_api/deleter.h index c72d8ef71a1f..61f9a5d2c92f 100644 --- a/tiledb/sm/cpp_api/deleter.h +++ b/tiledb/sm/cpp_api/deleter.h @@ -111,6 +111,10 @@ class Deleter { tiledb_domain_free(&p); } + void operator()(tiledb_enumeration_t* p) const { + tiledb_enumeration_free(&p); + } + void operator()(tiledb_vfs_t* p) const { tiledb_vfs_free(&p); } diff --git a/tiledb/sm/cpp_api/enumeration_experimental.h b/tiledb/sm/cpp_api/enumeration_experimental.h new file mode 100644 index 000000000000..daee25339d42 --- /dev/null +++ b/tiledb/sm/cpp_api/enumeration_experimental.h @@ -0,0 +1,257 @@ +/** + * @file enumeration_experimental.h + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2023 TileDB, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * This file declares the C++ API for the TileDB Enumeration class. + */ + +#ifndef TILEDB_CPP_API_ENUMERATION_EXPERIMENTAL_H +#define TILEDB_CPP_API_ENUMERATION_EXPERIMENTAL_H + +#include "context.h" +#include "tiledb.h" +#include "tiledb_experimental.h" + +namespace tiledb { + +class Enumeration { + public: + Enumeration(const Context& ctx, tiledb_enumeration_t* enumeration) + : ctx_(ctx) { + enumeration_ = std::shared_ptr(enumeration, deleter_); + } + + Enumeration(const Enumeration&) = default; + Enumeration(Enumeration&&) = default; + Enumeration& operator=(const Enumeration&) = default; + Enumeration& operator=(Enumeration&&) = default; + + /** Destructor. */ + ~Enumeration() = default; + + /* ********************************* */ + /* API */ + /* ********************************* */ + + /** Returns the C TileDB enumeration object.
*/ + std::shared_ptr ptr() const { + return enumeration_; + } + + /** + * Returns the type of the enumeration values + */ + tiledb_datatype_t type() const { + tiledb_datatype_t ret; + tiledb_ctx_t* c_ctx = ctx_.get().ptr().get(); + ctx_.get().handle_error( + tiledb_enumeration_get_type(c_ctx, enumeration_.get(), &ret)); + return ret; + } + + /** + * Return the cell_val_num of the enumeration values + */ + uint32_t cell_val_num() const { + uint32_t ret; + tiledb_ctx_t* c_ctx = ctx_.get().ptr().get(); + ctx_.get().handle_error( + tiledb_enumeration_get_cell_val_num(c_ctx, enumeration_.get(), &ret)); + return ret; + } + + /** + * Return whether this enumeration is considered ordered. + */ + bool ordered() const { + int is_ordered; + tiledb_ctx_t* c_ctx = ctx_.get().ptr().get(); + ctx_.get().handle_error( + tiledb_enumeration_get_ordered(c_ctx, enumeration_.get(), &is_ordered)); + return is_ordered ? true : false; + } + + template * = nullptr> + void into_vector(std::vector& vec) { + tiledb_ctx_t* c_ctx = ctx_.get().ptr().get(); + + const void* data; + uint64_t data_size; + ctx_.get().handle_error(tiledb_enumeration_get_data( + c_ctx, enumeration_.get(), &data, &data_size)); + + const T* elems = static_cast(data); + size_t count = data_size / sizeof(T); + + vec.clear(); + vec.reserve(count); + for (size_t i = 0; i < count; i++) { + vec.push_back(elems[i]); + } + } + + template * = nullptr> + void into_vector(std::vector>& vec) { + tiledb_ctx_t* c_ctx = ctx_.get().ptr().get(); + + const void* data; + uint64_t data_size; + ctx_.get().handle_error(tiledb_enumeration_get_data( + c_ctx, enumeration_.get(), &data, &data_size)); + + const void* offsets; + uint64_t offsets_size; + ctx_.get().handle_error(tiledb_enumeration_get_offsets( + c_ctx, enumeration_.get(), &offsets, &offsets_size)); + + const T* str_data = static_cast(data); + const uint64_t* elems = static_cast(offsets); + size_t count = offsets_size / sizeof(uint64_t); + + vec.clear(); + vec.reserve(count); + for 
(size_t i = 0; i < count; i++) { + uint64_t len; /* length in bytes; converted to elements below */ + if (i + 1 < count) { + len = elems[i + 1] - elems[i]; + } else { + len = data_size - elems[i]; + } + vec.emplace_back(str_data + elems[i] / sizeof(T), len / sizeof(T)); + } + } + + /* ********************************* */ + /* STATIC FUNCTIONS */ + /* ********************************* */ + + template * = nullptr> + static Enumeration create( + const Context& ctx, + std::vector& values, + bool ordered = false, + tiledb_datatype_t type = static_cast(255)) { + using DataT = impl::TypeHandler; + + if (type == static_cast(255)) { + type = DataT::tiledb_type; + } + + return create( + ctx, + type, + DataT::tiledb_num, + ordered, + values.data(), + values.size() * sizeof(T), + nullptr, + 0); + } + + template * = nullptr> + static Enumeration create( + const Context& ctx, + std::vector>& values, + bool ordered = false, + tiledb_datatype_t type = static_cast(255)) { + using DataT = impl::TypeHandler; + static_assert( + DataT::tiledb_num == 1, "Enumeration string values cannot be compound"); + + if (type == static_cast(255)) { + type = DataT::tiledb_type; + } + + uint64_t total_size = 0; + for (const auto& v : values) { + total_size += v.size() * sizeof(T); + } + + std::vector<uint8_t> data(total_size); /* heap buffer, not a VLA */ + std::vector offsets; + offsets.reserve(values.size()); + uint64_t curr_offset = 0; + + for (const auto& v : values) { + std::memcpy(data.data() + curr_offset, v.data(), v.size() * sizeof(T)); + offsets.push_back(curr_offset); + curr_offset += v.size() * sizeof(T); + } + + return create( + ctx, + type, + std::numeric_limits::max(), + ordered, + data.data(), + total_size, + offsets.data(), + offsets.size() * sizeof(uint64_t)); + } + + static Enumeration create( + const Context& ctx, + tiledb_datatype_t type, + uint32_t cell_val_num, + bool ordered, + const void* data, + uint64_t data_size, + const void* offsets, + uint64_t offsets_size) { + tiledb_enumeration_t* enumeration; + ctx.handle_error(tiledb_enumeration_alloc( + ctx.ptr().get(), + type, + cell_val_num, + ordered, + data,
data_size, + offsets, + offsets_size, + &enumeration)); + return Enumeration(ctx, enumeration); + } + + private: + /* ********************************* */ + /* PRIVATE ATTRIBUTES */ + /* ********************************* */ + + /** The TileDB context. */ + std::reference_wrapper ctx_; + + /** Deleter wrapper. */ + impl::Deleter deleter_; + + /** Pointer to the TileDB C Enumeration object. */ + std::shared_ptr enumeration_; +}; + +} // namespace tiledb + +#endif // TILEDB_CPP_API_ENUMERATION_EXPERIMENTAL_H diff --git a/tiledb/sm/cpp_api/tiledb_experimental b/tiledb/sm/cpp_api/tiledb_experimental index df23d84ae518..0e213c4f0ec1 100644 --- a/tiledb/sm/cpp_api/tiledb_experimental +++ b/tiledb/sm/cpp_api/tiledb_experimental @@ -33,9 +33,12 @@ #ifndef TILEDB_EXPERIMENTAL_CPP_H #define TILEDB_EXPERIMENTAL_CPP_H +#include "array_experimental.h" #include "array_schema_evolution.h" #include "array_schema_experimental.h" +#include "attribute_experimental.h" #include "dimension_label_experimental.h" +#include "enumeration_experimental.h" #include "consolidation_plan_experimental.h" #include "group_experimental.h" #include "query_experimental.h" diff --git a/tiledb/sm/enums/layout.h b/tiledb/sm/enums/layout.h index 796519184a22..076b6625fde2 100644 --- a/tiledb/sm/enums/layout.h +++ b/tiledb/sm/enums/layout.h @@ -91,14 +91,14 @@ inline Status layout_enum(const std::string& layout_str, Layout* layout) { inline void ensure_tile_order_is_valid(uint8_t layout_enum) { if (layout_enum != 0 && layout_enum != 1) throw std::runtime_error( - "[Tile order] Invalid Layout enum " + std::to_string(layout_enum)); + "[Tile order] Invalid Tile Layout enum " + std::to_string(layout_enum)); } /* Throws error if cell order's enumeration is greater than 4. 
*/ inline void ensure_cell_order_is_valid(uint8_t layout_enum) { if (layout_enum > 4) throw std::runtime_error( - "[Cell order] Invalid Layout enum " + std::to_string(layout_enum)); + "[Cell order] Invalid Cell Layout enum " + std::to_string(layout_enum)); } } // namespace sm diff --git a/tiledb/sm/misc/constants.cc b/tiledb/sm/misc/constants.cc index 17e95b511ee9..3c8e399a075a 100644 --- a/tiledb/sm/misc/constants.cc +++ b/tiledb/sm/misc/constants.cc @@ -93,6 +93,9 @@ const std::string fragment_metadata_filename = "__fragment_metadata.tdb"; /** The array dimension labels directory name. */ const std::string array_dimension_labels_dir_name = "__labels"; +/** The array enumerations directory name. */ +const std::string array_enumerations_dir_name = "__enumerations"; + /** The default tile capacity. */ const uint64_t capacity = 10000; @@ -213,6 +216,9 @@ const uint32_t empty_ucs4 = 0; /** The special value for an empty ANY. */ const uint8_t empty_any = 0; +/** The return value for missing entries in an Enumeration */ +const uint64_t enumeration_missing_value = std::numeric_limits::max(); + /** The file suffix used in TileDB. */ const std::string file_suffix = ".tdb"; @@ -662,6 +668,9 @@ const format_version_t deletes_min_version = 16; /** The lowest version supported for updates. */ const format_version_t updates_min_version = 16; +/** The lowest version supported for enumerations. */ +const format_version_t enumerations_min_version = 19; + /** The maximum size of a tile chunk (unit of compression) in bytes.
*/ const uint64_t max_tile_chunk_size = 64 * 1024; @@ -813,6 +822,8 @@ const void* fill_value(Datatype type) { } const std::string config_delimiter = ","; + +const storage_size_t unhyphenated_uuid_size = 32; } // namespace constants } // namespace sm diff --git a/tiledb/sm/misc/constants.h b/tiledb/sm/misc/constants.h index d2796f86fe58..a419ccfd1b54 100644 --- a/tiledb/sm/misc/constants.h +++ b/tiledb/sm/misc/constants.h @@ -82,6 +82,9 @@ extern const std::string array_commits_dir_name; /** The array dimension labels directory name. */ extern const std::string array_dimension_labels_dir_name; +/** The array enumerations directory name. */ +extern const std::string array_enumerations_dir_name; + /** The default tile capacity. */ extern const uint64_t capacity; @@ -199,6 +202,9 @@ extern const uint32_t empty_ucs4; /** The special value for an empty ANY. */ extern const uint8_t empty_any; +/** The return value for missing entries in an Enumeration */ +extern const uint64_t enumeration_missing_value; + /** The file suffix used in TileDB. */ extern const std::string file_suffix; @@ -638,6 +644,9 @@ extern const format_version_t deletes_min_version; /** The lowest version supported for updates. */ extern const format_version_t updates_min_version; +/** The lowest version supported for enumerations. */ +extern const format_version_t enumerations_min_version; + /** The maximum size of a tile chunk (unit of compression) in bytes. */ extern const uint64_t max_tile_chunk_size; @@ -722,6 +731,9 @@ extern const uint64_t s3_min_multipart_part_size; */ extern const std::string s3_multipart_buffering_dirname; +/** The length in bytes of an unhyphenated UUID. 
*/ +extern const storage_size_t unhyphenated_uuid_size; + } // namespace constants } // namespace sm diff --git a/tiledb/sm/query/ast/CMakeLists.txt b/tiledb/sm/query/ast/CMakeLists.txt index 4568075fbeb7..c46c57fd7244 100644 --- a/tiledb/sm/query/ast/CMakeLists.txt +++ b/tiledb/sm/query/ast/CMakeLists.txt @@ -39,7 +39,7 @@ list(APPEND SOURCES # commence(object_library query_ast) this_target_sources(${SOURCES}) - this_target_object_libraries(array_schema baseline) + this_target_object_libraries(array_schema generic_tile_io baseline) conclude(object_library) add_test_subdirectory() diff --git a/tiledb/sm/query/ast/query_ast.cc b/tiledb/sm/query/ast/query_ast.cc index ddec36e555ea..454e68cc4efc 100644 --- a/tiledb/sm/query/ast/query_ast.cc +++ b/tiledb/sm/query/ast/query_ast.cc @@ -31,6 +31,8 @@ */ #include "query_ast.h" +#include "tiledb/common/unreachable.h" +#include "tiledb/sm/array_schema/enumeration.h" using namespace tiledb::common; @@ -60,10 +62,77 @@ void ASTNodeVal::get_field_names( field_name_set.insert(field_name_); } +void ASTNodeVal::get_enumeration_field_names( + std::unordered_set& field_name_set) const { + if (use_enumeration_) { + field_name_set.insert(field_name_); + } +} + bool ASTNodeVal::is_backwards_compatible() const { return true; } +void ASTNodeVal::rewrite_enumeration_conditions( + const ArraySchema& array_schema) { + if (!use_enumeration_) { + return; + } + + if (!array_schema.is_attr(field_name_)) { + return; + } + + auto attr = array_schema.attribute(field_name_); + if (!attr->has_enumeration()) { + return; + } + + auto enumeration = array_schema.enumeration(attr->get_enumeration_name()); + if (!enumeration) { + throw std::logic_error( + "Missing requried enumeration for field '" + field_name_ + "'"); + } + + if (!enumeration->ordered() && + (op_ != QueryConditionOp::EQ && op_ != QueryConditionOp::NE)) { + throw std::logic_error( + "Cannot apply an inequality operator against an unordered Enumeration"); + } + + auto idx = 
enumeration->index_of(condition_value_view_); + auto dt_size = datatype_size(attr->type()); + + if (dt_size != 1 && dt_size != 2 && dt_size != 4 && dt_size != 8) { + throw std::runtime_error("Invalid data type size for enumeration index."); + } + + // At this point we're mostly guaranteed to succeed other than maybe an + // error allocating a very small buffer in which case we're up a creek + // regardless. However, we don't want to re-apply this enumeration + // translation so we mark it as done. + use_enumeration_ = false; + + condition_value_data_ = ByteVecValue(dt_size); + condition_value_view_ = + UntypedDatumView(condition_value_data_.data(), dt_size); + + if (dt_size == 1) { + uint8_t idx_val = static_cast(idx); + memcpy(condition_value_data_.data(), &idx_val, dt_size); + } else if (dt_size == 2) { + uint16_t idx_val = static_cast(idx); + memcpy(condition_value_data_.data(), &idx_val, dt_size); + } else if (dt_size == 4) { + uint32_t idx_val = static_cast(idx); + memcpy(condition_value_data_.data(), &idx_val, dt_size); + } else if (dt_size == 8) { + memcpy(condition_value_data_.data(), &idx, dt_size); + } else { + stdx::unreachable(); + } +} + Status ASTNodeVal::check_node_validity(const ArraySchema& array_schema) const { const uint64_t condition_value_size = condition_value_data_.size(); @@ -186,6 +255,14 @@ const QueryConditionCombinationOp& ASTNodeVal::get_combination_op() const { "value node."); } +bool ASTNodeVal::use_enumeration() const { + return use_enumeration_; +} + +void ASTNodeVal::set_use_enumeration(bool use_enumeration) { + use_enumeration_ = use_enumeration; +} + bool ASTNodeExpr::is_expr() const { return true; } @@ -205,6 +282,13 @@ void ASTNodeExpr::get_field_names( } } +void ASTNodeExpr::get_enumeration_field_names( + std::unordered_set& field_name_set) const { + for (const auto& child : nodes_) { + child->get_enumeration_field_names(field_name_set); + } +} + bool ASTNodeExpr::is_backwards_compatible() const { if (combination_op_ != 
QueryConditionCombinationOp::AND) { return false; @@ -217,6 +301,13 @@ bool ASTNodeExpr::is_backwards_compatible() const { return true; } +void ASTNodeExpr::rewrite_enumeration_conditions( + const ArraySchema& array_schema) { + for (auto& child : nodes_) { + child->rewrite_enumeration_conditions(array_schema); + } +} + Status ASTNodeExpr::check_node_validity(const ArraySchema& array_schema) const { // If the node is a compound expression node, ensure there are at least // two children in the node and then run a check on each child node. @@ -275,9 +366,22 @@ const QueryConditionOp& ASTNodeExpr::get_op() const { const std::vector>& ASTNodeExpr::get_children() const { return nodes_; } + const QueryConditionCombinationOp& ASTNodeExpr::get_combination_op() const { return combination_op_; } +bool ASTNodeExpr::use_enumeration() const { + throw std::runtime_error( + "ASTNodeExpr::use_enumeration: Cannot get enumeration status from " + "an AST expression node."); +} + +void ASTNodeExpr::set_use_enumeration(bool use_enumeration) { + for (auto& child : nodes_) { + child->set_use_enumeration(use_enumeration); + } +} + } // namespace sm } // namespace tiledb diff --git a/tiledb/sm/query/ast/query_ast.h b/tiledb/sm/query/ast/query_ast.h index 30a08884d905..a1901207379e 100644 --- a/tiledb/sm/query/ast/query_ast.h +++ b/tiledb/sm/query/ast/query_ast.h @@ -91,11 +91,19 @@ class ASTNode { * @brief Gets the set of field names from all the value nodes in the ASTNode. * * @param field_name_set The set variable the function populates. - * @return std::unordered_set& Set of the field names in the - * node. */ virtual void get_field_names( std::unordered_set& field_name_set) const = 0; + + /** + * @brief Gets the set of field names from all the value nodes that reference + * an enumerated field. + * + * @param field_name_set The set variable the function populates. 
+ */ + virtual void get_enumeration_field_names( + std::unordered_set& field_name_set) const = 0; + /** * @brief Returns true if the AST is previously supported by previous versions * of TileDB. This means that the AST should only have AND combination ops, @@ -106,6 +114,15 @@ class ASTNode { */ virtual bool is_backwards_compatible() const = 0; + /** + * @brief Update an node value condition values that refer to enumerated + * attributes. + * + * @param array_schema The array schema with all relevant enumerations loaded. + */ + virtual void rewrite_enumeration_conditions( + const ArraySchema& array_schema) = 0; + /** * @brief Checks whether the node is valid based on the array schema of the * array that the query condition that contains this AST node will execute a @@ -176,6 +193,26 @@ class ASTNode { */ virtual const QueryConditionCombinationOp& get_combination_op() const = 0; + /** + * @brief Return whether this node's condition should be applied against + * the attributes enumerated values or the underlying index data if + * applicable for a given attribute. + * + * @return bool If true, apply this condition against the enumerated values + */ + virtual bool use_enumeration() const = 0; + + /** + * @brief By default, a query condition is applied against an enumeration's + * values. This can be disabled to apply a given condition against the + * underlying integer data stored for the attribute by passing `false` to + * this method. + * + * @param use_enumeration A bool indicating whether this condition should be + * applied against the enumerations values or not. + */ + virtual void set_use_enumeration(bool use_enumeration) = 0; + /** * @brief Default virtual destructor. 
*/ @@ -209,7 +246,8 @@ class ASTNodeVal : public ASTNode { (void*)"" : condition_value_data_.data()), condition_value_data_.size()) - , op_(op) { + , op_(op) + , use_enumeration_(true) { if (condition_value_view_.size() != 0) { memcpy( condition_value_data_.data(), condition_value, condition_value_size); @@ -228,7 +266,8 @@ class ASTNodeVal : public ASTNode { (void*)"" : condition_value_data_.data()), condition_value_data_.size()) - , op_(rhs.op_) { + , op_(rhs.op_) + , use_enumeration_(rhs.use_enumeration_) { } /** @@ -243,7 +282,8 @@ class ASTNodeVal : public ASTNode { (void*)"" : condition_value_data_.data()), condition_value_data_.size()) - , op_(negate_query_condition_op(rhs.op_)) { + , op_(negate_query_condition_op(rhs.op_)) + , use_enumeration_(rhs.use_enumeration_) { } /** @@ -284,12 +324,19 @@ class ASTNodeVal : public ASTNode { * @brief Gets the set of field names from all the value nodes in the ASTNode. * * @param field_name_set The set variable the function populates. - * @return std::unordered_set& Set of the field names in the - * node. */ void get_field_names( std::unordered_set& field_name_set) const override; + /** + * @brief Gets the set of field names from all the value nodes that reference + * an enumerated field. + * + * @param field_name_set The set variable the function populates. + */ + void get_enumeration_field_names( + std::unordered_set& field_name_set) const override; + /** * @brief Returns true if the AST is previously supported by previous versions * of TileDB. This means that the AST should only have AND combination ops, @@ -300,6 +347,14 @@ class ASTNodeVal : public ASTNode { */ bool is_backwards_compatible() const override; + /** + * @brief Update an node value condition values that refer to enumerated + * attributes. + * + * @param array_schema The array schema with all relevant enumerations loaded. 
+ */ + void rewrite_enumeration_conditions(const ArraySchema& array_schema) override; + /** * @brief Checks whether the node is valid based on the array schema of the * array that the query condition that contains this AST node will execute a @@ -370,6 +425,26 @@ class ASTNodeVal : public ASTNode { */ const QueryConditionCombinationOp& get_combination_op() const override; + /** + * @brief Return whether this node's condition should be applied against + * the attribute's enumerated values or the underlying index data if + * applicable for a given attribute. + * + * @return bool If true, apply this condition against the enumerated values + */ + bool use_enumeration() const override; + + /** + * @brief By default, a query condition is applied against an enumeration's + * values. This can be disabled to apply a given condition against the + * underlying integer data stored for the attribute by passing `false` to + * this method. + * + * @param use_enumeration A bool indicating whether this condition should be + * applied against the enumeration's values or not. + */ + void set_use_enumeration(bool use_enumeration) override; + private: /** The attribute name. */ std::string field_name_; @@ -382,6 +457,9 @@ class ASTNodeVal : public ASTNode { /** The comparison operator. */ QueryConditionOp op_; + + /** Whether this condition applies to enumerated values if applicable */ + bool use_enumeration_; }; /** @@ -461,12 +539,19 @@ class ASTNodeExpr : public ASTNode { * @brief Gets the set of field names from all the value nodes in the ASTNode. * * @param field_name_set The set variable the function populates. - * @return std::unordered_set& Set of the field names in the - * node. */ void get_field_names( std::unordered_set& field_name_set) const override; + /** + * @brief Gets the set of field names from all the value nodes that reference + * an enumerated field. + * + * @param field_name_set The set variable the function populates.
+ */ + void get_enumeration_field_names( + std::unordered_set& field_name_set) const override; + /** * @brief Returns true if the AST is previously supported by previous versions * of TileDB. This means that the AST should only have AND combination ops, @@ -477,6 +562,14 @@ class ASTNodeExpr : public ASTNode { */ bool is_backwards_compatible() const override; + /** + * @brief Update an node value condition values that refer to enumerated + * attributes. + * + * @param array_schema The array schema with all relevant enumerations loaded. + */ + void rewrite_enumeration_conditions(const ArraySchema& array_schema) override; + /** * @brief Checks whether the node is valid based on the array schema of the * array that the query condition that contains this AST node will execute a @@ -547,6 +640,31 @@ class ASTNodeExpr : public ASTNode { */ const QueryConditionCombinationOp& get_combination_op() const override; + /** + * @brief Return whether this node's condition should be applied against + * the attributes enumerated values or the underlying index data if + * applicable for a given attribute. + * + * This method always throws when called on an expression node. + * + * @return bool If true, apply this condition against the enumerated values + */ + bool use_enumeration() const override; + + /** + * @brief By default, a query condition is applied against an enumeration's + * values. This can be disabled to apply a given condition against the + * underlying integer data stored for the attribute by passing `false` to + * this method. + * + * When called on an expression node this value is recursively applied + * against all value nodes in the AST. + * + * @param use_enumeration A bool indicating whether this condition should be + * applied against the enumerations values or not. 
+ */ + void set_use_enumeration(bool use_enumeration) override; + private: /** The node list **/ std::vector> nodes_; @@ -558,4 +676,4 @@ class ASTNodeExpr : public ASTNode { } // namespace sm } // namespace tiledb -#endif // TILEDB_QUERY_AST_H \ No newline at end of file +#endif // TILEDB_QUERY_AST_H diff --git a/tiledb/sm/query/query.cc b/tiledb/sm/query/query.cc index 97b5edefb552..dd65a29f230d 100644 --- a/tiledb/sm/query/query.cc +++ b/tiledb/sm/query/query.cc @@ -846,6 +846,32 @@ Status Query::process() { } } + // PJD: This is the least worst place I can find to load the attribute + // enumeration values data. AFAICT this should run server side in a REST + // query as well as run before the query condition is checked for correctness + // or any other use of enumeration values is required. + if (condition_.has_value()) { + auto& names = condition_->enumeration_field_names(); + std::vector to_load; + to_load.reserve(names.size()); + for (auto name : names) { + if (array_schema_->has_enumeration(name)) { + to_load.push_back(name); + } + } + + throw_if_not_ok(parallel_for( + storage_manager_->compute_tp(), + 0, + to_load.size(), + [&](const uint64_t i) { + array_->get_enumeration(to_load[i]); + return Status::Ok(); + })); + + condition_->rewrite_enumeration_conditions(array_schema()); + } + // Update query status. 
status_ = QueryStatus::INPROGRESS; diff --git a/tiledb/sm/query/query_condition.cc b/tiledb/sm/query/query_condition.cc index a8c78c8ff409..b93f78e21af3 100644 --- a/tiledb/sm/query/query_condition.cc +++ b/tiledb/sm/query/query_condition.cc @@ -123,6 +123,15 @@ Status QueryCondition::init( return Status::Ok(); } +void QueryCondition::rewrite_enumeration_conditions( + const ArraySchema& array_schema) { + if (!tree_) { + return; + } + + tree_->rewrite_enumeration_conditions(array_schema); +} + Status QueryCondition::check(const ArraySchema& array_schema) const { if (!tree_) { return Status::Ok(); @@ -144,6 +153,7 @@ Status QueryCondition::combine( } combined_cond->field_names_.clear(); + combined_cond->enumeration_field_names_.clear(); combined_cond->tree_ = this->tree_->combine(rhs.tree_, combination_op); return Status::Ok(); } @@ -158,6 +168,7 @@ Status QueryCondition::negate( } combined_cond->field_names_.clear(); + combined_cond->enumeration_field_names_.clear(); combined_cond->tree_ = this->tree_->get_negated_tree(); return Status::Ok(); } @@ -174,6 +185,15 @@ std::unordered_set& QueryCondition::field_names() const { return field_names_; } +std::unordered_set& QueryCondition::enumeration_field_names() + const { + if (enumeration_field_names_.empty() && tree_ != nullptr) { + tree_->get_enumeration_field_names(enumeration_field_names_); + } + + return enumeration_field_names_; +} + uint64_t QueryCondition::condition_timestamp() const { if (condition_marker_.empty()) { return 0; @@ -188,6 +208,10 @@ uint64_t QueryCondition::condition_timestamp() const { return timestamps.first; } +void QueryCondition::set_use_enumeration(bool use_enumeration) { + tree_->set_use_enumeration(use_enumeration); +} + /** Full template specialization for `char*` and `QueryConditionOp::LT`. 
*/ template <> struct QueryCondition::BinaryCmpNullChecks { diff --git a/tiledb/sm/query/query_condition.h b/tiledb/sm/query/query_condition.h index 717f047ceab0..4349c19c765c 100644 --- a/tiledb/sm/query/query_condition.h +++ b/tiledb/sm/query/query_condition.h @@ -112,6 +112,15 @@ class QueryCondition { uint64_t condition_value_size, const QueryConditionOp& op); + /** + * Translate any query conditions against enumerated attributes to the + * underlying attribute type. + * + * @param array_schema The current array schema with all required enumerations + * loaded. + */ + void rewrite_enumeration_conditions(const ArraySchema& array_schema); + /** * Verifies that the current state contains supported comparison * operations. Currently, we support the following: @@ -160,6 +169,12 @@ class QueryCondition { */ std::unordered_set& field_names() const; + /** + * Returns a set of all unique field names that reference an enumerated + * attribute condition in the AST representing the query condition. + */ + std::unordered_set& enumeration_field_names() const; + /** * Returns the timestamp for this condition. */ @@ -245,6 +260,17 @@ class QueryCondition { */ uint64_t condition_index() const; + /** + * By default, a query condition is applied against the enumerated values + * of an attribute. Setting use_enumeration to false prevents the translation + * and applies this query condition directly against the underlying integral + * attribute data. + * + * @param use_enumeration A bool indicating whether to use the enumeration + * values + */ + void set_use_enumeration(bool use_enumeration); + private: /* ********************************* */ /* PRIVATE DATATYPES */ @@ -294,6 +320,9 @@ class QueryCondition { /** Caches all field names in the value nodes of the AST.
*/ mutable std::unordered_set field_names_; + /** Caches all field names that reference enumerations in the AST */ + mutable std::unordered_set enumeration_field_names_; + /* ********************************* */ /* PRIVATE METHODS */ /* ********************************* */ diff --git a/tiledb/sm/storage_manager/storage_manager.cc b/tiledb/sm/storage_manager/storage_manager.cc index 8759e89d67ef..6469f6448bc7 100644 --- a/tiledb/sm/storage_manager/storage_manager.cc +++ b/tiledb/sm/storage_manager/storage_manager.cc @@ -46,6 +46,7 @@ #include "tiledb/sm/array/array_directory.h" #include "tiledb/sm/array_schema/array_schema.h" #include "tiledb/sm/array_schema/array_schema_evolution.h" +#include "tiledb/sm/array_schema/enumeration.h" #include "tiledb/sm/consolidator/consolidator.h" #include "tiledb/sm/consolidator/fragment_consolidator.h" #include "tiledb/sm/enums/array_type.h" @@ -612,6 +613,11 @@ Status StorageManagerCanonical::array_create( array_uri.join_path(constants::array_schema_dir_name); RETURN_NOT_OK(vfs()->create_dir(array_schema_dir_uri)); + // Create the enumerations directory inside the array schema directory + URI array_enumerations_uri = + array_schema_dir_uri.join_path(constants::array_enumerations_dir_name); + RETURN_NOT_OK(vfs()->create_dir(array_enumerations_uri)); + // Create commit directory URI array_commit_uri = array_uri.join_path(constants::array_commits_dir_name); RETURN_NOT_OK(vfs()->create_dir(array_commit_uri)); @@ -718,11 +724,9 @@ Status StorageManager::array_evolve_schema( auto&& array_schema = array_dir.load_array_schema_latest(encryption_key); // Evolve schema - auto&& [st1, array_schema_evolved] = - schema_evolution->evolve_schema(array_schema); - RETURN_NOT_OK(st1); + auto array_schema_evolved = schema_evolution->evolve_schema(array_schema); - Status st = store_array_schema(array_schema_evolved.value(), encryption_key); + Status st = store_array_schema(array_schema_evolved, encryption_key); if (!st.ok()) {
logger_->status_no_return_value(st); return logger_->status(Status_StorageManagerError( @@ -1674,11 +1678,38 @@ Status StorageManagerCanonical::store_array_schema( URI array_schema_dir_uri = array_schema->array_uri().join_path(constants::array_schema_dir_name); RETURN_NOT_OK(vfs()->is_dir(array_schema_dir_uri, &schema_dir_exists)); + if (!schema_dir_exists) RETURN_NOT_OK(vfs()->create_dir(array_schema_dir_uri)); RETURN_NOT_OK(store_data_to_generic_tile(tile, schema_uri, encryption_key)); + bool enumerations_dir_exists = false; + URI array_enumerations_dir_uri = + array_schema_dir_uri.join_path(constants::array_enumerations_dir_name); + RETURN_NOT_OK( + vfs()->is_dir(array_enumerations_dir_uri, &enumerations_dir_exists)); + + if (!enumerations_dir_exists) { + RETURN_NOT_OK(vfs()->create_dir(array_enumerations_dir_uri)); + } + + for (auto enumeration : array_schema->enumerations()) { + SizeComputationSerializer enumeration_size_serializer; + enumeration->serialize( + enumeration_size_serializer, array_schema->write_version()); + + WriterTile tile{ + WriterTile::from_generic(enumeration_size_serializer.size())}; + Serializer serializer(tile.data(), tile.size()); + enumeration->serialize(serializer, array_schema->write_version()); + + auto abs_enumeration_uri = + array_enumerations_dir_uri.join_path(enumeration->name()); + RETURN_NOT_OK( + store_data_to_generic_tile(tile, abs_enumeration_uri, encryption_key)); + } + return Status::Ok(); } diff --git a/tiledb/sm/tile/CMakeLists.txt b/tiledb/sm/tile/CMakeLists.txt index 2a7180dd2557..280e545abe40 100644 --- a/tiledb/sm/tile/CMakeLists.txt +++ b/tiledb/sm/tile/CMakeLists.txt @@ -44,7 +44,6 @@ commence(object_library generic_tile_io) baseline buffer constants - context_resources crypto filter_pipeline tile @@ -52,4 +51,10 @@ commence(object_library generic_tile_io) ) conclude(object_library) +# This is linked outside the object_library scope because ContextResources +# is recompiled as part of the capi_context_stub. 
Including context_resources +# here like this prevents many headaches revolving around duplicate symbols +# when linking executables. +target_link_libraries(compile_generic_tile_io PRIVATE context_resources) + add_test_subdirectory() diff --git a/tiledb/storage_format/uri/CMakeLists.txt b/tiledb/storage_format/uri/CMakeLists.txt index 6923c26548d7..4dbd4dd3f319 100644 --- a/tiledb/storage_format/uri/CMakeLists.txt +++ b/tiledb/storage_format/uri/CMakeLists.txt @@ -30,6 +30,6 @@ include(object_library) # `uri_format` object library # commence(object_library uri_format) - this_target_sources(parse_uri.cc) + this_target_sources(parse_uri.cc generate_uri.cc) this_target_object_libraries(baseline time uuid vfs) conclude(object_library)