From e0f91da89dd8fa6418c4afe3c4b59b79750dca49 Mon Sep 17 00:00:00 2001 From: joe maley Date: Wed, 27 Jan 2021 14:23:19 -0500 Subject: [PATCH] Percent-encode reserved characters in file names (#2047) This bumps the format version from 7 to 8. For version 8, dimension and attribute names are percent-encoded to RFC 3986 + a few extra to handle reserved characters on Windows. This is intended to be a temporary measure. In the future, we will include a mapping between names and their ordered index in the array schema. Co-authored-by: Joe Maley --- HISTORY.md | 3 + format_spec/FORMAT_SPEC.md | 2 +- test/CMakeLists.txt | 1 + test/src/unit-capi-attributes.cc | 272 ++++++++++++++++++++++++ test/src/unit-capi-fragment_info.cc | 12 +- test/src/unit-cppapi-fragment_info.cc | 8 +- tiledb/sm/fragment/fragment_metadata.cc | 51 ++++- tiledb/sm/fragment/fragment_metadata.h | 9 + tiledb/sm/misc/constants.cc | 2 +- 9 files changed, 345 insertions(+), 15 deletions(-) create mode 100644 test/src/unit-capi-attributes.cc diff --git a/HISTORY.md b/HISTORY.md index 0c26cdf861d..9bef2da8358 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,9 @@ ## Improvements +* Parallelize across attributes when closing a write [#2048](https://github.com/TileDB-Inc/TileDB/pull/2048) +* Support for dimension/attribute names that contain commonly reserved filesystem characters [#2047](https://github.com/TileDB-Inc/TileDB/pull/2047) + ## Deprecations ## Bug fixes diff --git a/format_spec/FORMAT_SPEC.md b/format_spec/FORMAT_SPEC.md index 85fb159a8ca..9a74e6041d5 100644 --- a/format_spec/FORMAT_SPEC.md +++ b/format_spec/FORMAT_SPEC.md @@ -2,7 +2,7 @@ :information_source: **Notes:** -- The current TileDB format version number is **7** (`uint32_t`). +- The current TileDB format version number is **8** (`uint32_t`). - All data written by TileDB and referenced in this document is **little-endian**. ## Table of Contents diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a5f05b3f7b6..ed45f986df6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -106,6 +106,7 @@ set(TILEDB_TEST_SOURCES src/unit-capi-array.cc src/unit-capi-array_schema.cc src/unit-capi-async.cc + src/unit-capi-attributes.cc src/unit-capi-buffer.cc src/unit-capi-config.cc src/unit-capi-consolidation.cc diff --git a/test/src/unit-capi-attributes.cc b/test/src/unit-capi-attributes.cc new file mode 100644 index 00000000000..112b110d209 --- /dev/null +++ b/test/src/unit-capi-attributes.cc @@ -0,0 +1,272 @@ +/** + * @file unit-capi-attributes.cc + * + * @section LICENSE + * + * The MIT License + * + * @copyright Copyright (c) 2021 TileDB Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @section DESCRIPTION + * + * Tests of C API for attributes. + */ + +#include "catch.hpp" +#include "tiledb/sm/c_api/tiledb.h" + +#include + +#include "catch.hpp" +#include "test/src/helpers.h" +#include "test/src/vfs_helpers.h" +#ifdef _WIN32 +#include +#include "tiledb/sm/filesystem/win.h" +#else +#include "tiledb/sm/filesystem/posix.h" +#endif +#include "tiledb/sm/c_api/tiledb.h" +#include "tiledb/sm/misc/utils.h" + +#include +#include +#include +#include + +using namespace tiledb::test; + +struct Attributesfx { + // TileDB context + tiledb_ctx_t* ctx_; + tiledb_vfs_t* vfs_; + + // Vector of supported filesystems + const std::vector> fs_vec_; + + // Functions + Attributesfx(); + ~Attributesfx(); + void create_temp_dir(const std::string& path); + void remove_temp_dir(const std::string& path); + void create_dense_vector( + const std::string& path, const std::string& attr_name); + static std::string random_name(const std::string& prefix); +}; + +Attributesfx::Attributesfx() + : fs_vec_(vfs_test_get_fs_vec()) { + // Initialize vfs test + REQUIRE(vfs_test_init(fs_vec_, &ctx_, &vfs_).ok()); +} + +Attributesfx::~Attributesfx() { + // Close vfs test + REQUIRE(vfs_test_close(fs_vec_, ctx_, vfs_).ok()); + tiledb_vfs_free(&vfs_); + tiledb_ctx_free(&ctx_); +} + +void Attributesfx::create_temp_dir(const std::string& path) { + remove_temp_dir(path); + REQUIRE(tiledb_vfs_create_dir(ctx_, vfs_, path.c_str()) == TILEDB_OK); +} + +void Attributesfx::remove_temp_dir(const std::string& path) { + int is_dir = 0; + REQUIRE(tiledb_vfs_is_dir(ctx_, vfs_, path.c_str(), &is_dir) == TILEDB_OK); + if (is_dir) + REQUIRE(tiledb_vfs_remove_dir(ctx_, vfs_, path.c_str()) == TILEDB_OK); +} + +std::string Attributesfx::random_name(const std::string& prefix) { + std::stringstream ss; + ss << prefix << "-" << std::this_thread::get_id() << "-" + << TILEDB_TIMESTAMP_NOW_MS; + return ss.str(); +} + +void Attributesfx::create_dense_vector( + const std::string& path, const std::string& attr_name) { + int rc; + int64_t dim_domain[] = {1, 10}; + int64_t tile_extent = 2; + + // Create domain + tiledb_domain_t* domain; + rc = tiledb_domain_alloc(ctx_, &domain); + REQUIRE(rc == TILEDB_OK); + tiledb_dimension_t* dim; + rc = tiledb_dimension_alloc( + ctx_, "d1", TILEDB_INT64, dim_domain, &tile_extent, &dim); + REQUIRE(rc == TILEDB_OK); + rc = tiledb_domain_add_dimension(ctx_, domain, dim); + REQUIRE(rc == TILEDB_OK); + + tiledb_attribute_t* attr; + rc = tiledb_attribute_alloc(ctx_, attr_name.c_str(), TILEDB_INT32, &attr); + REQUIRE(rc == TILEDB_OK); + + // Create array schema + tiledb_array_schema_t* array_schema; + rc = tiledb_array_schema_alloc(ctx_, TILEDB_DENSE, &array_schema); + REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_schema_set_cell_order(ctx_, array_schema, TILEDB_ROW_MAJOR); + REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_schema_set_tile_order(ctx_, array_schema, TILEDB_ROW_MAJOR); + REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_schema_set_domain(ctx_, array_schema, domain); + REQUIRE(rc == TILEDB_OK); + rc = tiledb_array_schema_add_attribute(ctx_, array_schema, attr); + REQUIRE(rc == TILEDB_OK); + + rc = tiledb_array_schema_check(ctx_, array_schema); + REQUIRE(rc == TILEDB_OK); + + // Create array + rc = tiledb_array_create(ctx_, path.c_str(), array_schema); + + REQUIRE(rc == TILEDB_OK); + tiledb_attribute_free(&attr); + tiledb_dimension_free(&dim); + tiledb_array_schema_free(&array_schema); +} + +TEST_CASE_METHOD( + Attributesfx, + "C API: Test attributes with illegal filesystem characters in the name", + "[capi][attributes][illegal_name]") { + const std::vector attr_names = { + "miles!hour", "miles#hour", "miles$hour", "miles%hour", "miles&hour", + "miles'hour", "miles(hour", "miles)hour", "miles*hour", "miles+hour", + "miles,hour", "miles/hour", "miles:hour", "miles;hour", "miles=hour", + "miles?hour", "miles@hour", "miles[hour", "miles]hour", "miles[hour", + "miles\"hour", "mileshour", "miles\\hour", "miles|hour"}; + + for (const auto& attr_name : attr_names) { + for (const auto& fs : fs_vec_) { + // Skip this test for Azure and Windows because they are broken for + // certain characters. The path to handle illegal characters is a + // temporary fix. We will get this working on Azure and Windows in the + // future. + bool skip = false; + if (dynamic_cast(fs.get()) != nullptr) + skip = true; +#ifdef _WIN32 + if (dynamic_cast(fs.get()) != nullptr) + skip = true; +#endif + if (skip) + continue; + + std::string temp_dir = fs->temp_dir(); + std::string array_name = temp_dir + "array-illegal-char"; + + // Create new TileDB context with file lock config disabled, rest the + // same. + tiledb_ctx_free(&ctx_); + tiledb_vfs_free(&vfs_); + + tiledb_config_t* config = nullptr; + tiledb_error_t* error = nullptr; + REQUIRE(tiledb_config_alloc(&config, &error) == TILEDB_OK); + REQUIRE(error == nullptr); + + REQUIRE(vfs_test_init(fs_vec_, &ctx_, &vfs_, config).ok()); + + tiledb_config_free(&config); + + create_temp_dir(temp_dir); + + create_dense_vector(array_name, attr_name); + + // Prepare cell buffers + int buffer_a1[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + uint64_t buffer_a1_size = sizeof(buffer_a1); + + // Open array + int64_t subarray[] = {1, 10}; + tiledb_array_t* array; + int rc = tiledb_array_alloc(ctx_, array_name.c_str(), &array); + CHECK(rc == TILEDB_OK); + rc = tiledb_array_open(ctx_, array, TILEDB_WRITE); + CHECK(rc == TILEDB_OK); + + // Submit query + tiledb_query_t* query; + rc = tiledb_query_alloc(ctx_, array, TILEDB_WRITE, &query); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_layout(ctx_, query, TILEDB_GLOBAL_ORDER); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_subarray(ctx_, query, subarray); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_buffer( + ctx_, query, attr_name.c_str(), buffer_a1, &buffer_a1_size); + CHECK(rc == TILEDB_OK); + + rc = tiledb_query_submit(ctx_, query); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_finalize(ctx_, query); + CHECK(rc == TILEDB_OK); + + // Close array and clean up + rc = tiledb_array_close(ctx_, array); + CHECK(rc == TILEDB_OK); + tiledb_array_free(&array); + tiledb_query_free(&query); + + int buffer_read[10]; + uint64_t buffer_read_size = sizeof(buffer_read); + + // Open array + rc = tiledb_array_alloc(ctx_, array_name.c_str(), &array); + CHECK(rc == TILEDB_OK); + rc = tiledb_array_open(ctx_, array, TILEDB_READ); + CHECK(rc == TILEDB_OK); + + // Submit query + rc = tiledb_query_alloc(ctx_, array, TILEDB_READ, &query); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_layout(ctx_, query, TILEDB_ROW_MAJOR); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_subarray(ctx_, query, subarray); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_set_buffer( + ctx_, query, attr_name.c_str(), buffer_read, &buffer_read_size); + CHECK(rc == TILEDB_OK); + rc = tiledb_query_submit(ctx_, query); + CHECK(rc == TILEDB_OK); + + // Close array and clean up + rc = tiledb_array_close(ctx_, array); + CHECK(rc == TILEDB_OK); + tiledb_array_free(&array); + tiledb_query_free(&query); + + // Check correctness + int buffer_read_c[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + CHECK(!std::memcmp(buffer_read, buffer_read_c, sizeof(buffer_read_c))); + CHECK(buffer_read_size == sizeof(buffer_read_c)); + + remove_temp_dir(temp_dir); + } + } +} diff --git a/test/src/unit-capi-fragment_info.cc b/test/src/unit-capi-fragment_info.cc index b7af9309fa1..e0843b844c8 100644 --- a/test/src/unit-capi-fragment_info.cc +++ b/test/src/unit-capi-fragment_info.cc @@ -1001,16 +1001,16 @@ TEST_CASE("C API: Test fragment info, dump", "[capi][fragment_info][dump]") { "- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" + " > Type: dense\n" + " > Non-empty domain: [1, 6]\n" + " > Size: 1584\n" + " > Cell num: 10\n" + - " > Timestamp range: [1, 1]\n" + " > Format version: 7\n" + + " > Timestamp range: [1, 1]\n" + " > Format version: 8\n" + " > Has consolidated metadata: no\n" + "- Fragment #2:\n" + " > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" + " > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" + " > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" + - " > Format version: 7\n" + " > Has consolidated metadata: no\n" + + " > Format version: 8\n" + " > Has consolidated metadata: no\n" + "- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" + " > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" + " > Size: 1696\n" + " > Cell num: 3\n" + - " > Timestamp range: [3, 3]\n" + " > Format version: 7\n" + + " > Timestamp range: [3, 3]\n" + " > Format version: 8\n" + " > Has consolidated metadata: no\n"; FILE* gold_fout = fopen("gold_fout.txt", "w"); const char* dump = dump_str.c_str(); @@ -1130,7 +1130,7 @@ TEST_CASE( "- Fragment #1:\n" + " > URI: " + uri + "\n" + " > Type: dense\n" + " > Non-empty domain: [1, 10]\n" + " > Size: 1584\n" + " > Cell num: 10\n" + " > Timestamp range: [1, 3]\n" + - " > Format version: 7\n" + " > Has consolidated metadata: no\n"; + " > Format version: 8\n" + " > Has consolidated metadata: no\n"; FILE* gold_fout = fopen("gold_fout.txt", "w"); const char* dump = dump_str.c_str(); fwrite(dump, sizeof(char), strlen(dump), gold_fout); @@ -1212,7 +1212,7 @@ TEST_CASE( "- Fragment #1:\n" + " > URI: " + written_frag_uri + "\n" + " > Type: sparse\n" + " > Non-empty domain: [a, ddd]\n" + " > Size: 1833\n" + " > Cell num: 4\n" + - " > Timestamp range: [1, 1]\n" + " > Format version: 7\n" + + " > Timestamp range: [1, 1]\n" + " > Format version: 8\n" + " > Has consolidated metadata: no\n"; FILE* gold_fout = fopen("gold_fout.txt", "w"); const char* dump = dump_str.c_str(); @@ -1234,4 +1234,4 @@ TEST_CASE( tiledb_ctx_free(&ctx); tiledb_vfs_free(&vfs); tiledb_fragment_info_free(&fragment_info); -} \ No newline at end of file +} diff --git a/test/src/unit-cppapi-fragment_info.cc b/test/src/unit-cppapi-fragment_info.cc index 2df39cc56d1..ceb848541a3 100644 --- a/test/src/unit-cppapi-fragment_info.cc +++ b/test/src/unit-cppapi-fragment_info.cc @@ -626,16 +626,16 @@ TEST_CASE( "- Fragment #1:\n" + " > URI: " + written_frag_uri_1 + "\n" + " > Type: dense\n" + " > Non-empty domain: [1, 6]\n" + " > Size: 1584\n" + " > Cell num: 10\n" + - " > Timestamp range: [1, 1]\n" + " > Format version: 7\n" + + " > Timestamp range: [1, 1]\n" + " > Format version: 8\n" + " > Has consolidated metadata: no\n" + "- Fragment #2:\n" + " > URI: " + written_frag_uri_2 + "\n" + " > Type: sparse\n" + " > Non-empty domain: [1, 7]\n" + " > Size: 1708\n" + " > Cell num: 4\n" + " > Timestamp range: [2, 2]\n" + - " > Format version: 7\n" + " > Has consolidated metadata: no\n" + + " > Format version: 8\n" + " > Has consolidated metadata: no\n" + "- Fragment #3:\n" + " > URI: " + written_frag_uri_3 + "\n" + " > Type: sparse\n" + " > Non-empty domain: [2, 9]\n" + " > Size: 1696\n" + " > Cell num: 3\n" + - " > Timestamp range: [3, 3]\n" + " > Format version: 7\n" + + " > Timestamp range: [3, 3]\n" + " > Format version: 8\n" + " > Has consolidated metadata: no\n"; FILE* gold_fout = fopen("gold_fout.txt", "w"); const char* dump = dump_str.c_str(); @@ -654,4 +654,4 @@ TEST_CASE( // Clean up remove_dir(array_name, ctx.ptr().get(), vfs.ptr().get()); -} \ No newline at end of file +} diff --git a/tiledb/sm/fragment/fragment_metadata.cc b/tiledb/sm/fragment/fragment_metadata.cc index 8418d1f0f4b..fe3a888b662 100644 --- a/tiledb/sm/fragment/fragment_metadata.cc +++ b/tiledb/sm/fragment/fragment_metadata.cc @@ -590,16 +590,61 @@ uint64_t FragmentMetadata::tile_num() const { return sparse_tile_num_; } +std::string FragmentMetadata::encode_name(const std::string& name) const { + if (version_ <= 7) + return name; + + static const std::unordered_map percent_encoding{ + // RFC 3986 + {'!', "%21"}, + {'#', "%23"}, + {'$', "%24"}, + {'%', "%25"}, + {'&', "%26"}, + {'\'', "%27"}, + {'(', "%28"}, + {')', "%29"}, + {'*', "%2A"}, + {'+', "%2B"}, + {',', "%2C"}, + {'/', "%2F"}, + {':', "%3A"}, + {';', "%3B"}, + {'=', "%3D"}, + {'?', "%3F"}, + {'@', "%40"}, + {'[', "%5B"}, + {']', "%5D"}, + // Extra encodings to cover illegal characters on Windows + {'\"', "%22"}, + {'<', "%20"}, + {'>', "%2D"}, + {'\\', "%30"}, + {'|', "%3C"}}; + + std::stringstream percent_encoded_name; + for (const char c : name) { + if (percent_encoding.count(c) == 0) + percent_encoded_name << c; + else + percent_encoded_name << percent_encoding.at(c); + } + + return percent_encoded_name.str(); +} + URI FragmentMetadata::uri(const std::string& name) const { - return fragment_uri_.join_path(name + constants::file_suffix); + return fragment_uri_.join_path(encode_name(name) + constants::file_suffix); } URI FragmentMetadata::var_uri(const std::string& name) const { - return fragment_uri_.join_path(name + "_var" + constants::file_suffix); + return fragment_uri_.join_path( + encode_name(name) + "_var" + constants::file_suffix); } URI FragmentMetadata::validity_uri(const std::string& name) const { - return fragment_uri_.join_path(name + "_validity" + constants::file_suffix); + return fragment_uri_.join_path( + encode_name(name) + "_validity" + constants::file_suffix); } Status FragmentMetadata::load_tile_offsets( diff --git a/tiledb/sm/fragment/fragment_metadata.h b/tiledb/sm/fragment/fragment_metadata.h index 08ed65a8261..607fa098082 100644 --- a/tiledb/sm/fragment/fragment_metadata.h +++ b/tiledb/sm/fragment/fragment_metadata.h @@ -1091,6 +1091,15 @@ class FragmentMetadata { * fragment metadata file and unlocks the array. */ void clean_up(); + + /** + * Encodes a dimension/attribute name to use in a file name. The + * motiviation is to encode illegal/reserved file name characters. + * + * @param name The dimension/attribute name. + * return std::string The encoded dimension/attribute name. + */ + std::string encode_name(const std::string& name) const; }; } // namespace sm diff --git a/tiledb/sm/misc/constants.cc b/tiledb/sm/misc/constants.cc index a922f90f278..aeedb8ccfb2 100644 --- a/tiledb/sm/misc/constants.cc +++ b/tiledb/sm/misc/constants.cc @@ -467,7 +467,7 @@ const int32_t library_version[3] = { TILEDB_VERSION_MAJOR, TILEDB_VERSION_MINOR, TILEDB_VERSION_PATCH}; /** The TileDB serialization format version number. */ -const uint32_t format_version = 7; +const uint32_t format_version = 8; /** The maximum size of a tile chunk (unit of compression) in bytes. */ const uint64_t max_tile_chunk_size = 64 * 1024;