diff --git a/HISTORY.md b/HISTORY.md index 8e183870f8d..34e6e4cabdc 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,7 +6,8 @@ * Added config params `vfs.s3.aws_access_key_id` and `vfs.s3.aws_secret_access_key` for configure s3 access at runtime. [#1036](https://github.com/TileDB-Inc/TileDB/pull/1036) * Set LZ4, Zlib and Zstd compressors to build in release mode. [#1034](https://github.com/TileDB-Inc/TileDB/pull/1034) -* Added missing check if coordinates obey the global order in global order sparse writes. [#1039](https://github.com/TileDB-Inc/TileDB/pull/1039) +* Added missing check if coordinates obey the global order in global order sparse writes. [#1039](https://github.com/TileDB-Inc/TileDB/pull/1039) +* Changed coordinates to always be split before filtering. #1054 # TileDB v1.4.0 Release Notes diff --git a/doc/source/tutorials/format-description.rst b/doc/source/tutorials/format-description.rst index bddd9f2a6b3..7eac5e93567 100644 --- a/doc/source/tutorials/format-description.rst +++ b/doc/source/tutorials/format-description.rst @@ -9,7 +9,7 @@ describes the tile-based format shared by all files written by TileDB to each tile for filtering. The second section describes the byte format of the tile data written in each file in a TileDB array. -The current TileDB format version number is **1** (``uint32_t``). +The current TileDB format version number is **2** (``uint32_t``). .. note:: @@ -97,6 +97,12 @@ instead stored in the array schema and fragment metadata files), so an attribute | | | bytes. | +-------------------------+----------------------+---------------------------------+ +A coordinate ``Tile`` is additionally processed by "splitting" coordinate tuples +across dimensions. As an example, 3D coordinates are given by users in the form +``[x1, y1, z1, x2, y2, z2, ...]``. Before being filtered, the coordinate values +stored in the tile data are rearranged to be +``[x1, x2, ..., xN, y1, y2, ..., yN, z1, z2, ..., zN]``. + To account for filtering, some additional metadata is prepended in the tile data bytes in each tile. This filter pipeline metadata informs TileDB how the following tile bytes should be treated (for example, how to decompress it when diff --git a/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__coords.tdb b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__coords.tdb new file mode 100755 index 00000000000..7c1f1dec379 Binary files /dev/null and b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__coords.tdb differ diff --git a/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__fragment_metadata.tdb b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__fragment_metadata.tdb new file mode 100755 index 00000000000..bca735e8735 Binary files /dev/null and b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/__fragment_metadata.tdb differ diff --git a/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/a.tdb b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/a.tdb new file mode 100755 index 00000000000..e0f1ea25701 Binary files /dev/null and b/test/inputs/arrays/non_split_coords_v1_4_0/__5dc2cc9cce204a189f825fd4919d0c55_1541540586869/a.tdb differ diff --git a/test/inputs/arrays/non_split_coords_v1_4_0/__array_schema.tdb b/test/inputs/arrays/non_split_coords_v1_4_0/__array_schema.tdb new file mode 100755 index 00000000000..8e8614ce89d Binary files /dev/null and b/test/inputs/arrays/non_split_coords_v1_4_0/__array_schema.tdb differ diff --git a/test/inputs/arrays/non_split_coords_v1_4_0/__lock.tdb b/test/inputs/arrays/non_split_coords_v1_4_0/__lock.tdb new file mode 100755 index 00000000000..e69de29bb2d diff --git a/test/src/unit-backwards_compat.cc b/test/src/unit-backwards_compat.cc index 55f3dd96ca5..d55907b4876 100644 --- a/test/src/unit-backwards_compat.cc +++ b/test/src/unit-backwards_compat.cc @@ -61,6 +61,31 @@ TEST_CASE( } } +TEST_CASE( + "Backwards compatibility: Test reading 1.4.0 array with non-split coords", + "[backwards-compat]") { + Context ctx; + std::string array_uri(arrays_dir + "/non_split_coords_v1_4_0"); + Array array(ctx, array_uri, TILEDB_READ); + std::vector subarray = {1, 4, 10, 10}; + auto max_el = array.max_buffer_elements(subarray); + std::vector a_read; + a_read.resize(max_el["a"].second); + std::vector coords_read; + coords_read.resize(max_el[TILEDB_COORDS].second); + + Query query_r(ctx, array); + query_r.set_subarray(subarray) + .set_layout(TILEDB_ROW_MAJOR) + .set_buffer("a", a_read) + .set_coordinates(coords_read); + query_r.submit(); + array.close(); + + for (int i = 0; i < 4; i++) + REQUIRE(a_read[i] == i + 1); +} + TEST_CASE( "Backwards compatibility: Test reading arrays written with previous " "version of tiledb", diff --git a/tiledb/sm/filter/filter_pipeline.cc b/tiledb/sm/filter/filter_pipeline.cc index 9ab1327bffb..b508aebb0ac 100644 --- a/tiledb/sm/filter/filter_pipeline.cc +++ b/tiledb/sm/filter/filter_pipeline.cc @@ -353,9 +353,8 @@ Status FilterPipeline::run_forward(Tile* tile) const { current_tile_ = tile; - // Split the coords if the tile is compressed, otherwise it is wasted effort. - bool using_compression = get_filter() != nullptr; - if (using_compression && tile->stores_coords()) + // Split the coords if the tile stores coordinates. + if (tile->stores_coords()) tile->split_coordinates(); // Compute the chunks. @@ -426,10 +425,15 @@ Status FilterPipeline::run_reverse(Tile* tile) const { // Replace the tile's buffer with the unfiltered buffer. RETURN_NOT_OK(tile->buffer()->swap(unfiltered_tile)); - // Zip the coords if the tile was compressed, otherwise they were not split. - bool using_compression = get_filter() != nullptr; - if (using_compression && tile->stores_coords()) - tile->zip_coordinates(); + // Zip the coords. + if (tile->stores_coords()) { + // Note that format version < 2 only split the coordinates when compression + // was used. See https://github.com/TileDB-Inc/TileDB/issues/1053 + bool using_compression = get_filter() != nullptr; + if (tile->format_version() > 1 || using_compression) { + tile->zip_coordinates(); + } + } return Status::Ok(); diff --git a/tiledb/sm/fragment/fragment_metadata.cc b/tiledb/sm/fragment/fragment_metadata.cc index 5521f82217c..5cf1ff9c450 100644 --- a/tiledb/sm/fragment/fragment_metadata.cc +++ b/tiledb/sm/fragment/fragment_metadata.cc @@ -362,6 +362,10 @@ uint64_t FragmentMetadata::file_var_sizes(const std::string& attribute) const { return file_var_sizes_[attribute_id]; } +uint32_t FragmentMetadata::format_version() const { + return version_; +} + const URI& FragmentMetadata::fragment_uri() const { return fragment_uri_; } diff --git a/tiledb/sm/fragment/fragment_metadata.h b/tiledb/sm/fragment/fragment_metadata.h index cca8b2aef6f..f9a126c05b1 100644 --- a/tiledb/sm/fragment/fragment_metadata.h +++ b/tiledb/sm/fragment/fragment_metadata.h @@ -220,6 +220,9 @@ class FragmentMetadata { /** Returns the size of the input variable attribute. */ uint64_t file_var_sizes(const std::string& attribute) const; + /** Returns the format version of this fragment. */ + uint32_t format_version() const; + /** Returns the fragment URI. */ const URI& fragment_uri() const; diff --git a/tiledb/sm/misc/constants.cc b/tiledb/sm/misc/constants.cc index c7cba82c0f3..b87503cc3e3 100644 --- a/tiledb/sm/misc/constants.cc +++ b/tiledb/sm/misc/constants.cc @@ -373,7 +373,7 @@ const int32_t library_version[3] = { TILEDB_VERSION_MAJOR, TILEDB_VERSION_MINOR, TILEDB_VERSION_PATCH}; /** The TileDB serialization format version number. */ -const uint32_t format_version = 1; +const uint32_t format_version = 2; /** The maximum size of a tile chunk (unit of compression) in bytes. */ const uint64_t max_tile_chunk_size = 64 * 1024; diff --git a/tiledb/sm/query/reader.cc b/tiledb/sm/query/reader.cc index 28dd855b7b5..7be400b4e14 100644 --- a/tiledb/sm/query/reader.cc +++ b/tiledb/sm/query/reader.cc @@ -1675,7 +1675,8 @@ Status Reader::init_read_state() { return Status::Ok(); } -Status Reader::init_tile(const std::string& attribute, Tile* tile) const { +Status Reader::init_tile( + uint32_t format_version, const std::string& attribute, Tile* tile) const { // For easy reference auto domain = array_schema_->domain(); auto cell_size = array_schema_->cell_size(attribute); @@ -1688,13 +1689,17 @@ Status Reader::init_tile(const std::string& attribute, Tile* tile) const { auto tile_size = cell_num_per_tile * cell_size; // Initialize - RETURN_NOT_OK(tile->init(type, tile_size, cell_size, dim_num)); + RETURN_NOT_OK( + tile->init(format_version, type, tile_size, cell_size, dim_num)); return Status::Ok(); } Status Reader::init_tile( - const std::string& attribute, Tile* tile, Tile* tile_var) const { + uint32_t format_version, + const std::string& attribute, + Tile* tile, + Tile* tile_var) const { // For easy reference auto domain = array_schema_->domain(); auto capacity = array_schema_->capacity(); @@ -1705,11 +1710,13 @@ Status Reader::init_tile( // Initialize RETURN_NOT_OK(tile->init( + format_version, constants::cell_var_offset_type, tile_size, constants::cell_var_offset_size, 0)); - RETURN_NOT_OK(tile_var->init(type, tile_size, datatype_size(type), 0)); + RETURN_NOT_OK( + tile_var->init(format_version, type, tile_size, datatype_size(type), 0)); return Status::Ok(); } @@ -1860,10 +1867,12 @@ Status Reader::read_tiles( auto& tile_pair = it->second; auto& t = tile_pair.first; auto& t_var = tile_pair.second; + auto& fragment = fragment_metadata_[tile->fragment_idx_]; + auto format_version = fragment->format_version(); if (!var_size) { - RETURN_NOT_OK(init_tile(attribute, &t)); + RETURN_NOT_OK(init_tile(format_version, attribute, &t)); } else { - RETURN_NOT_OK(init_tile(attribute, &t, &t_var)); + RETURN_NOT_OK(init_tile(format_version, attribute, &t, &t_var)); } // Enqueue the read task in the Reader thread pool. diff --git a/tiledb/sm/query/reader.h b/tiledb/sm/query/reader.h index 015bed5ffb2..c0ea5b48447 100644 --- a/tiledb/sm/query/reader.h +++ b/tiledb/sm/query/reader.h @@ -813,22 +813,28 @@ class Reader { /** * Initializes a fixed-sized tile. * + * @param format_version The format version of the tile. * @param attribute The attribute the tile belongs to. * @param tile The tile to be initialized. * @return Status */ - Status init_tile(const std::string& attribute, Tile* tile) const; + Status init_tile( + uint32_t format_version, const std::string& attribute, Tile* tile) const; /** * Initializes a var-sized tile. * + * @param format_version The format version of the tile. * @param attribute The attribute the tile belongs to. * @param tile The offsets tile to be initialized. * @param tile_var The var-sized data tile to be initialized. * @return Status */ Status init_tile( - const std::string& attribute, Tile* tile, Tile* tile_var) const; + uint32_t format_version, + const std::string& attribute, + Tile* tile, + Tile* tile_var) const; /** * Initializes the fragment dense cell range iterators. There is one vector diff --git a/tiledb/sm/query/writer.cc b/tiledb/sm/query/writer.cc index 4f28040a43b..5301375aea4 100644 --- a/tiledb/sm/query/writer.cc +++ b/tiledb/sm/query/writer.cc @@ -1303,7 +1303,8 @@ Status Writer::init_tile(const std::string& attribute, Tile* tile) const { auto tile_size = cell_num_per_tile * cell_size; // Initialize - RETURN_NOT_OK(tile->init(type, tile_size, cell_size, dim_num)); + RETURN_NOT_OK(tile->init( + constants::format_version, type, tile_size, cell_size, dim_num)); return Status::Ok(); } @@ -1320,11 +1321,13 @@ Status Writer::init_tile( // Initialize RETURN_NOT_OK(tile->init( + constants::format_version, constants::cell_var_offset_type, tile_size, constants::cell_var_offset_size, 0)); - RETURN_NOT_OK(tile_var->init(type, tile_size, datatype_size(type), 0)); + RETURN_NOT_OK(tile_var->init( + constants::format_version, type, tile_size, datatype_size(type), 0)); return Status::Ok(); } diff --git a/tiledb/sm/tile/tile.cc b/tiledb/sm/tile/tile.cc index 56f5f4ef603..2a49b8b2665 100644 --- a/tiledb/sm/tile/tile.cc +++ b/tiledb/sm/tile/tile.cc @@ -95,10 +95,15 @@ uint64_t Tile::cell_num() const { return size() / cell_size_; } -Status Tile::init(Datatype type, uint64_t cell_size, unsigned int dim_num) { +Status Tile::init( + uint32_t format_version, + Datatype type, + uint64_t cell_size, + unsigned int dim_num) { cell_size_ = cell_size; dim_num_ = dim_num; type_ = type; + format_version_ = format_version; buffer_ = new Buffer(); if (buffer_ == nullptr) @@ -109,6 +114,7 @@ Status Tile::init(Datatype type, uint64_t cell_size, unsigned int dim_num) { } Status Tile::init( + uint32_t format_version, Datatype type, uint64_t tile_size, uint64_t cell_size, @@ -116,6 +122,7 @@ Status Tile::init( cell_size_ = cell_size; dim_num_ = dim_num; type_ = type; + format_version_ = format_version; buffer_ = new Buffer(); if (buffer_ == nullptr) @@ -162,6 +169,10 @@ bool Tile::filtered() const { return filtered_; } +uint32_t Tile::format_version() const { + return format_version_; +} + bool Tile::full() const { return (buffer_->size() != 0) && (buffer_->offset() == buffer_->alloced_size()); diff --git a/tiledb/sm/tile/tile.h b/tiledb/sm/tile/tile.h index d56bae71486..3afad6e2c0b 100644 --- a/tiledb/sm/tile/tile.h +++ b/tiledb/sm/tile/tile.h @@ -106,9 +106,14 @@ class Tile { * @param cell_size The cell size. * @param dim_num The number of dimensions in case the tile stores * coordinates. + * @param format_version The format version of the data in this tile. * @return Status */ - Status init(Datatype type, uint64_t cell_size, unsigned int dim_num); + Status init( + uint32_t format_version, + Datatype type, + uint64_t cell_size, + unsigned int dim_num); /** * Tile initializer. @@ -119,9 +124,11 @@ class Tile { * @param cell_size The cell size. * @param dim_num The number of dimensions in case the tile stores * coordinates. + * @param format_version The format version of the data in this tile. * @return Status */ Status init( + uint32_t format_version, Datatype type, uint64_t tile_size, uint64_t cell_size, @@ -165,6 +172,9 @@ class Tile { */ bool filtered() const; + /** Gets the format version number of the data in this Tile. */ + uint32_t format_version() const; + /** Checks if the tile is full. */ bool full() const; @@ -283,6 +293,9 @@ class Tile { /** The current state of the in-memory tile data with respect to filtering. */ bool filtered_; + /** The format version of the data in this tile. */ + uint32_t format_version_; + /** * If *true* the tile object will delete *buff* upon * destruction, otherwise it will not delete it. diff --git a/tiledb/sm/tile/tile_io.cc b/tiledb/sm/tile/tile_io.cc index ec390163954..993fe6f2d0b 100644 --- a/tiledb/sm/tile/tile_io.cc +++ b/tiledb/sm/tile/tile_io.cc @@ -123,7 +123,11 @@ Status TileIO::read_generic( *tile = new Tile(); RETURN_NOT_OK_ELSE( - (*tile)->init((Datatype)header.datatype, header.cell_size, 0), + (*tile)->init( + header.version_number, + (Datatype)header.datatype, + header.cell_size, + 0), delete *tile); auto tile_data_offset =