diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp index 37870cc300..af7e0e0174 100644 --- a/src/realm/array_mixed.cpp +++ b/src/realm/array_mixed.cpp @@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx) } } +std::optional ArrayMixed::get_string_id(size_t ndx) const +{ + int64_t val = m_composite.get(ndx); + if (val) { + const int64_t int_val = val >> s_data_shift; + const size_t payload_ndx{(size_t)int_val}; + const DataType type((val & s_data_type_mask) - 1); + if (type == type_String) { + ensure_string_array(); + REALM_ASSERT(size_t(int_val) < m_strings.size()); + return m_strings.get_string_id(payload_ndx); + } + } + return {}; +} + Mixed ArrayMixed::get(size_t ndx) const { int64_t val = m_composite.get(ndx); diff --git a/src/realm/array_mixed.hpp b/src/realm/array_mixed.hpp index 7fc544bc87..95afc264fe 100644 --- a/src/realm/array_mixed.hpp +++ b/src/realm/array_mixed.hpp @@ -97,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array { { return m_composite.get(ndx) == 0; } + std::optional get_string_id(size_t ndx) const; void clear(); void erase(size_t ndx); diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index d7277faedf..7d47862be7 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include using namespace realm; @@ -192,7 +193,7 @@ StringData ArrayString::get(size_t ndx) const return {}; } -std::optional realm::ArrayString::get_string_id(size_t ndx) const +std::optional ArrayString::get_string_id(size_t ndx) const { if (m_type == Type::interned_strings) { return StringID(static_cast(m_arr)->get(ndx)); diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index 25027dc3f0..ece4b291d9 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -629,6 +629,36 @@ BinaryData Obj::_get(ColKey::Idx col_ndx) const return ArrayBinary::get(alloc.translate(ref), m_row_ndx, alloc); } +std::optional Obj::get_string_id(ColKey col_key) const +{ + // 
we return a string id only if the property is string or mixed. + // And it got compressed. + + // only strings and mixed can have an interner + if (col_key.get_type() != col_type_String && col_key.get_type() != col_type_Mixed) + return {}; + + m_table->check_column(col_key); + _update_if_needed(); + + const auto col_ndx = col_key.get_index(); + const auto interner = m_table->get_string_interner(col_ndx); + ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); + + if (col_key.get_type() == col_type_Mixed) { + // mixed handling. Only strings in mixed may have a string id + ArrayMixed values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); + } + // must be string. + ArrayString values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); +} + Mixed Obj::get_any(ColKey col_key) const { m_table->check_column(col_key); diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp index cffc1b56a7..bb320f02c9 100644 --- a/src/realm/obj.hpp +++ b/src/realm/obj.hpp @@ -117,6 +117,11 @@ class Obj { template U get(ColKey col_key) const; + std::optional get_string_id(ColKey) const; + std::optional get_string_id(StringData col_name) const + { + return get_string_id(get_column_key(col_name)); + } Mixed get_any(ColKey col_key) const; Mixed get_any(StringData col_name) const { diff --git a/src/realm/path.hpp b/src/realm/path.hpp index 6124590271..0dfb83f96c 100644 --- a/src/realm/path.hpp +++ b/src/realm/path.hpp @@ -256,6 +256,10 @@ class ExtendedColumnKey { ObjKey get_link_target(const Obj& obj) const; Mixed get_value(const Obj& obj) const; + // get String ID for the obj, it makes sense to call this method only if the col_key type is either Mixed or + // String. 
+ std::optional get_string_id(const Obj& obj) const; + private: ColKey m_colkey; PathElement m_index; diff --git a/src/realm/sort_descriptor.cpp b/src/realm/sort_descriptor.cpp index 4d0e97c2bc..8e1258be04 100644 --- a/src/realm/sort_descriptor.cpp +++ b/src/realm/sort_descriptor.cpp @@ -23,9 +23,51 @@ #include #include #include +#include using namespace realm; +namespace { + +template +int compare(const T& i, const T& j, const Col& col) +{ + Mixed m_i = i.get_value(); + Mixed m_j = j.get_value(); + + // 1. not compressed + if (!i.compressed && !j.compressed) + return m_i.compare(m_j); + + ColKey ck{col.col_key}; + StringInterner* interner = col.table->get_string_interner(ck); + + // 2. two compressed strings + if (i.compressed && j.compressed) { + return interner->compare((StringID)m_i.get_int(), (StringID)m_j.get_int()); + } + + // 3. one index is a compressed string, and the other one is mixed. + if (i.compressed || j.compressed) { + if (m_i.is_type(type_String)) + return interner->compare(m_i.get_string(), (StringID)m_j.get_int()); + + if (m_j.is_type(type_String)) + return -interner->compare(m_j.get_string(), (StringID)m_i.get_int()); + } + + // 4. compare string vs any other non-string (since value comparison is triggered only if the type matches, we can + // skip fetching the actual values) + if (i.compressed) + m_i = Mixed{""}; + else + m_j = Mixed{""}; + + return m_i.compare(m_j); +} + +} // namespace + ConstTableRef ExtendedColumnKey::get_target_table(const Table* table) const { return (m_colkey.get_type() == col_type_Link) ? 
table->get_link_target(m_colkey) : ConstTableRef{}; @@ -85,6 +127,14 @@ Mixed ExtendedColumnKey::get_value(const Obj& obj) const return {}; } +std::optional ExtendedColumnKey::get_string_id(const Obj& obj) const +{ + const auto type = m_colkey.get_type(); + if (type != col_type_String && type != col_type_Mixed) + return {}; + return obj.get_string_id(m_colkey); +} + LinkPathPart::LinkPathPart(ColKey col_key, ConstTableRef source) : column_key(col_key) , from(source->get_key()) @@ -419,9 +469,8 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord } int c; - if (t == 0) { - c = i.cached_value.compare(j.cached_value); + c = compare(i, j, m_columns[t]); } else { if (m_cache[t - 1].empty()) { @@ -434,20 +483,25 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord const auto& obj = m_columns[t].table->get_object(key_i); const auto& col_key = m_columns[t].col_key; - cache_i.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_i.key = key_i; + const std::optional string_id = col_key.get_string_id(obj); + cache_i.compressed = string_id ? true : false; + cache_i.value = cache_i.compressed ? static_cast(*string_id) : col_key.get_value(obj); } - Mixed val_i = cache_i.value; if (cache_j.key != key_j) { const auto& obj = m_columns[t].table->get_object(key_j); const auto& col_key = m_columns[t].col_key; - cache_j.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_j.key = key_j; + const std::optional string_id = col_key.get_string_id(obj); + cache_j.compressed = string_id ? true : false; + cache_j.value = cache_j.compressed ? 
static_cast(*string_id) : col_key.get_value(obj); } - c = val_i.compare(cache_j.value); + c = compare(cache_i, cache_j, m_columns[t]); } // if c is negative i comes before j if (c) { @@ -476,9 +530,10 @@ void BaseDescriptor::Sorter::cache_first_column(IndexPairs& v) continue; } } - const auto obj = col.table->get_object(key); - index.cached_value = ck.get_value(obj); + const std::optional string_id = ck.get_string_id(obj); + index.compressed = string_id ? true : false; + index.cached_value = index.compressed ? static_cast(*string_id) : ck.get_value(obj); } } diff --git a/src/realm/sort_descriptor.hpp b/src/realm/sort_descriptor.hpp index 0224ea5de6..a3f03390b5 100644 --- a/src/realm/sort_descriptor.hpp +++ b/src/realm/sort_descriptor.hpp @@ -66,9 +66,18 @@ class BaseDescriptor { { return index_in_view < other.index_in_view; } + ObjKey get_key() const + { + return key_for_object; + } + Mixed get_value() const + { + return cached_value; + } ObjKey key_for_object; size_t index_in_view; Mixed cached_value; + bool compressed = false; }; class IndexPairs : public std::vector { public: @@ -115,6 +124,16 @@ class BaseDescriptor { struct ObjCache { ObjKey key; Mixed value; + bool compressed = false; + + ObjKey get_key() const + { + return key; + } + Mixed get_value() const + { + return value; + } }; using TableCache = std::vector; mutable std::vector m_cache; diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp index b086f07649..9f88cc2420 100644 --- a/src/realm/string_compressor.cpp +++ b/src/realm/string_compressor.cpp @@ -290,16 +290,17 @@ int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) if (code_A == code_B) continue; // symbols did not match: + // 1. both symbols are single characters if (code_A < 256 && code_B < 256) return code_A - code_B; - std::string a_str(code_A, 1); - auto str_A = std::string_view(code_A < 256 ? 
a_str : m_symbols[code_A - 256].expansion); - std::string b_str(code_B, 1); - auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion); - // to ensure comparison as StringData we need to convert the stringviews - StringData sd_a(str_A.data(), str_A.size()); - StringData sd_b(str_B.data(), str_B.size()); + + // 2. all the other possible cases: widen a code below 256 to a 1-char string + std::string str_a(1, static_cast<char>(code_A)); + std::string str_b(1, static_cast<char>(code_B)); + StringData sd_a = code_A < 256 ? str_a : m_symbols[code_A - 256].expansion; + StringData sd_b = code_B < 256 ? str_b : m_symbols[code_B - 256].expansion; + REALM_ASSERT_DEBUG(sd_a != sd_b); if (sd_a < sd_b) return -1; diff --git a/src/realm/string_data.hpp b/src/realm/string_data.hpp index 46e1df0713..63578350e7 100644 --- a/src/realm/string_data.hpp +++ b/src/realm/string_data.hpp @@ -34,6 +34,11 @@ namespace realm { +// Compressed strings have unique IDs, this defines a global alias +// for this. A StringID is an entry inside an array of N compressed strings. +// 0 means null, all the other ids [1, N] represent a valid string. +using StringID = size_t; + /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms. /// This is what libc++ does, and it is a good general choice for a /// non-cryptographic hash function (suitable for std::unordered_map etc.).
diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index bff3130901..dd6c27cab2 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -401,7 +401,7 @@ StringID StringInterner::intern(StringData sd) { REALM_ASSERT(m_top.is_attached()); std::lock_guard lock(m_mutex); - // special case for null string + // special case for null string if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); @@ -619,7 +619,7 @@ std::optional StringInterner::lookup(StringData sd) int StringInterner::compare(StringID A, StringID B) { std::lock_guard lock(m_mutex); - // 0 is null, the first index starts from 1. + // 0 is null, the first index starts from 1. REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); // comparisons against null diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index 93c1eec45b..7d53c12034 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -34,8 +34,6 @@ struct CompressedStringView; namespace realm { -using StringID = size_t; - class StringCompressor; struct CachedString { diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp index b8e52fc5d7..2db7982a39 100644 --- a/test/benchmark-common-tasks/main.cpp +++ b/test/benchmark-common-tasks/main.cpp @@ -1630,6 +1630,70 @@ struct BenchmarkSort : BenchmarkWithStrings { } }; +// benchmark for testing compressed string sorting. 
+// N is the size of the string to generate +// M is the number of times we want store the string (number of dups) +template +struct BenchmarkSortCompressed : BenchmarkWithStringsTable { + std::string compressed_benchmark_name; + + BenchmarkSortCompressed() + : BenchmarkWithStringsTable() + { + if constexpr (N <= 15) { + compressed_benchmark_name = util::format("SortCompressedSmall(%1,%2)", N, M); + } + else if constexpr (N <= 63) { + compressed_benchmark_name = util::format("SortCompressedMedium(%1,%2)", N, M); + } + else { + compressed_benchmark_name = util::format("SortCompressedLarge(%1,%2)", N, M); + } + } + + const char* name() const + { + return compressed_benchmark_name.c_str(); + } + + void before_all(DBRef group) + { + BenchmarkWithStringsTable::before_all(group); + WriteTransaction tr(group); + TableRef t = tr.get_table(name()); + + auto gen_string = [](size_t length) { + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; + }; + + std::string str = ""; + for (size_t i = 0; i < BASE_SIZE; ++i) { + if (i % M == 0) + str = gen_string(N); + + Obj obj = t->create_object(); + obj.set(m_col, str); + } + tr.commit(); + } + + void operator()(DBRef) + { + ConstTableRef table = m_table; + TableView view = table->get_sorted_view(m_col); + } +}; + struct BenchmarkEmptyCommit : Benchmark { const char* name() const { @@ -2663,6 +2727,11 @@ int benchmark_common_tasks_main() BENCH(IterateTableByIteratorIndex); BENCH(BenchmarkSort); + BENCH(BenchmarkSortCompressed<10, 500>); + BENCH(BenchmarkSortCompressed<50, 500>); + BENCH(BenchmarkSortCompressed<100, 500>); + BENCH(BenchmarkSortCompressed<1000, 5000>); + BENCH(BenchmarkSortInt); 
BENCH(BenchmarkSortIntList); BENCH(BenchmarkSortIntDictionary); diff --git a/test/test_group.cpp b/test/test_group.cpp index 3e58f9c274..27c9470222 100644 --- a/test/test_group.cpp +++ b/test/test_group.cpp @@ -2623,10 +2623,6 @@ TEST(Test_Commit_Compression_Strings) auto set_s = obj.get_set(col_key_set_string); auto dictionary_s = obj.get_dictionary(col_key_dict_string); - CHECK_EQUAL(list_s.size(), i + 1); - CHECK_EQUAL(set_s.size(), i + 1); - CHECK_EQUAL(dictionary_s.size(), i + 1); - CHECK_EQUAL(list_s.get_any(i), str); CHECK_NOT_EQUAL(set_s.find_any(str), not_found); CHECK_NOT_EQUAL(dictionary_s.find_any(str), not_found); diff --git a/test/test_query.cpp b/test/test_query.cpp index 6520085aeb..37b109bfaa 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -4107,7 +4107,6 @@ TEST(Query_LinkChainSortErrors) CHECK_LOGIC_ERROR(t1->get_sorted_view(SortDescriptor({{t1_linklist_col}})), ErrorCodes::InvalidSortDescriptor); } - TEST(Query_EmptyDescriptors) { Group g; @@ -5773,4 +5772,339 @@ TEST_TYPES(Query_IntCompressed, Equal, NotEqual, Less, LessEqual, Greater, Great } } +// Many of our tests just test the correctness of sorting strings. +// For compressed strings we can use the string ids to perform the +// same task. We just need to commit first and then run the query. +// These tests are mainly verifying the following: +// +// 1. Store N strings inside a Mixed. Commit and sort. +// 2. Store inside a Mixed integers and Strings. Commit and sort +// 3. Store N strings in compressed format inside a Mixed property, store another N strings uncompressed in another +// column. Sort using both columns. +// 4. Store N compressed strings inside a table, linked by another table. Sort over links. + +// Note: Strings and Mixed use the same logic for compressing strings. Thus these tests are solely using Mixed +// columns. 
+ + +static int gen_random_int(int min = 1, int max = 100) +{ + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(min, max); + return distribution(generator); +} + +static std::string gen_random_string(size_t length) +{ + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; +} + +TEST_TYPES(CompressedStrings_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = type::value; + + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; + return ascending ? 
m1 < m2 : m1 > m2; + }; + + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == strings.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(col_key), strings[i]); + } +} + +TEST_TYPES(CompressedStringsAndOtherMixed_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + std::vector ints; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_any = t->get_column_key("any"); + for (auto& s : strings) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{s}); + } + for (auto i : ints) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{i}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = type::value; + + auto cmp = [&ascending](Mixed m1, Mixed m2) { + return ascending ? 
m1 < m2 : m1 > m2; + }; + + std::vector data; + for (auto& str : strings) + data.push_back(Mixed{str}); + for (auto i : ints) + data.push_back(Mixed{i}); + + std::sort(data.begin(), data.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < data.size(); ++i) { /* verify every row: the N strings and the N ints */ + CHECK_EQUAL(tv[i].get_any(col_key), data[i]); + } +} + +TEST_TYPES(CompressedStrings_CompressedAndUncompressedStringColumns, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any_compressed"); + t->add_column(type_Mixed, "any_uncompressed"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any_compressed"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + // any_compressed Mixed is now using compressed strings + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_compressed = t->get_column_key("any_compressed"); + ColKey col_uncompressed = t->get_column_key("any_uncompressed"); + + // add N new objects but as long as we don't commit these strings + // will be in uncompressed format + for (auto& s : strings) { + t->create_object().set(col_uncompressed, Mixed{s}); + } + + // sort and verify both columns + bool ascending = type::value; + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; +
return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = t->where().find_all(); + + tv.sort(SortDescriptor({{col_compressed}, {col_uncompressed}}, {ascending, ascending})); + + CHECK(tv.size() == strings.size() * 2); + for (size_t i = 0; i < 2 * N; ++i) { + auto compressed_str = tv[i].get_any(col_compressed); + auto uncompressed_str = tv[i].get_any(col_uncompressed); + if (!compressed_str.is_null()) { + CHECK_EQUAL(compressed_str, strings[i % N]); + } + if (!uncompressed_str.is_null()) { + CHECK_EQUAL(uncompressed_str, strings[i % N]); + } + } + }); + // after this point both columns will be in compressed format +} + +TEST_TYPES(CompressedStrings_SortOverLinks, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + auto o = wt.add_table("Other"); + // store N ints in T.any + t->add_column(type_Mixed, "any"); + // link O to T + t->add_column(*o, "link"); + // store N compressed strings in O.any + o->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector ints; + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + // store N strings in O + TableRef table = wt.get_table("Other"); + ColKey col = table->get_column_key("any"); + for (const auto& s : strings) { + table->create_object().set(col, Mixed{s}); + } + }); + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef table = wt.get_table("Table"); + TableRef other = wt.get_table("Other"); + + ColKey col = 
table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + + // set N ints + for (auto i : ints) + table->create_object().set(col, Mixed{i}); + + // set N links to Other's objects storing compressed strings. + size_t i = 0; + for (Obj o : *other) + table->get_object(i++).set(link, o.get_key()); + }); + // any Mixed that contains strings is now pointing to a compressed string + rt->advance_read(); + + std::vector> data; + for (size_t i = 0; i < N; ++i) { + auto p = std::make_pair(Mixed{ints[i]}, Mixed{strings[i]}); + data.push_back(p); + } + + bool ascending = type::value; + auto cmp = [&ascending](auto& p1, auto& p2) { + // sort based on strings + Mixed m1 = p1.second; + Mixed m2 = p2.second; + return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(data.begin(), data.end(), cmp); + + TableRef table = rt->get_table("Table"); + TableRef other = rt->get_table("Other"); + ColKey t_any = table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + ColKey o_any = other->get_column_key("any"); + + TableView tv = table->where().find_all(); + tv.sort(SortDescriptor({{link, o_any}}, {ascending})); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(t_any), data[i].first); + } +} + #endif // TEST_QUERY diff --git a/test/test_string_compression.cpp b/test/test_string_compression.cpp index 017e81968c..cf104445b6 100644 --- a/test/test_string_compression.cpp +++ b/test/test_string_compression.cpp @@ -35,6 +35,7 @@ using namespace realm; TEST(StringInterner_Basic_Creation) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); StringData my_string = "aaaaaaaaaaaaaaa"; @@ -50,12 +51,12 @@ TEST(StringInterner_Basic_Creation) CHECK_EQUAL(my_string, origin_string); CHECK(interner.compare(*stored_id, id) == 0); // compare agaist self. 
- parent.destroy_deep(); } TEST(StringInterner_InternMultipleStrings) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -72,12 +73,12 @@ TEST(StringInterner_InternMultipleStrings) CHECK_EQUAL(*stored_id, id); CHECK_EQUAL(interner.compare(str, id), 0); } - parent.destroy_deep(); } TEST(StringInterner_TestLookup) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -96,13 +97,12 @@ TEST(StringInterner_TestLookup) CHECK(id); CHECK(interner.compare(StringData(s), *id) == 0); } - - parent.destroy_deep(); } TEST(StringInterner_VerifyComparison) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -173,13 +173,12 @@ TEST(StringInterner_VerifyComparison) res = interner.compare(test_upper_case_id, test_lower_case_id); CHECK_LESS(interner.get(test_upper_case_id), interner.get(test_lower_case_id)); CHECK_EQUAL(res, -1); - - parent.destroy_deep(); } TEST(StringInterner_VerifyInterningNull) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); auto null_id = interner.intern({}); @@ -203,13 +202,12 @@ TEST(StringInterner_VerifyInterningNull) CHECK_LESS(StringData{}, interner.get(str_id)); // compare via StringData CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), 1); CHECK_GREATER(StringData{"test"}, interner.get(null_id)); - - parent.destroy_deep(); } TEST(StringInterner_VerifyLongString) { Array 
parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -221,13 +219,12 @@ TEST(StringInterner_VerifyLongString) const auto stored_id = interner.lookup(StringData(long_string)); CHECK_EQUAL(stored_id, 1); CHECK(interner.compare(StringData(long_string), *stored_id) == 0); - - parent.destroy_deep(); } TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -249,6 +246,4 @@ TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) stored_id = interner.lookup(StringData(long_string)); CHECK_EQUAL(stored_id, id); CHECK(interner.compare(StringData(long_string), *stored_id) == 0); - - parent.destroy_deep(); } diff --git a/test/test_utf8.cpp b/test/test_utf8.cpp index ec4c913eaf..de456b2d69 100644 --- a/test/test_utf8.cpp +++ b/test/test_utf8.cpp @@ -24,10 +24,13 @@ #include #include #include +#include #include -#include #include +#include +#include +#include #include "test.hpp" @@ -86,10 +89,22 @@ const char* u16sur2 = "\xF0\xA0\x9C\xB1"; // same as above, with larger unicode TEST(UTF8_Compare_Strings) { + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + // Useful line for creating new unit test cases: // bool ret = std::locale("us_EN")(string("a"), std::string("b")); - auto str_compare = [](StringData a, StringData b) { - return a < b; + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = 
interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; // simplest test @@ -141,9 +156,21 @@ TEST(UTF8_Compare_Strings) TEST(UTF8_Compare_Core_utf8) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // single utf16 code points (tests mostly Windows) CHECK_EQUAL(false, str_compare(uae, uae)); CHECK_EQUAL(false, str_compare(uAE, uAE)); @@ -169,7 +196,6 @@ TEST(UTF8_Compare_Core_utf8) CHECK_EQUAL(false, str_compare(u16sur2, u16sur2)); } - TEST(UTF8_Compare_Core_utf8_invalid) { // Test that invalid utf8 won't make decisions on data beyond Realm payload. Do @@ -194,8 +220,17 @@ TEST(UTF8_Compare_Core_utf8_invalid) // that return value is arbitrary for invalid utf8 bool ret = i1 < i2; CHECK_EQUAL(ret, i2 < i1); // must sort the same as before regardless of succeeding data -} + // the same applies if the strings are interned. 
+ Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + auto id1 = interner.intern(invalid1); + auto id2 = interner.intern(invalid2); + bool ret_interned = interner.compare(id1, id2) < 0; + CHECK_EQUAL(ret_interned, ret); +} TEST(Compare_Core_utf8_invalid_crash) { @@ -218,12 +253,22 @@ TEST(Compare_Core_utf8_invalid_crash) } } - TEST(UTF8_Compare_Core_utf8_zero) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // Realm must support 0 characters in utf8 strings CHECK_EQUAL(false, str_compare(StringData("\0", 1), StringData("\0", 1))); CHECK_EQUAL(true, str_compare(StringData("\0", 1), StringData("a"))); @@ -238,7 +283,6 @@ TEST(UTF8_Compare_Core_utf8_zero) CHECK_EQUAL(true, str_compare(StringData("a\0", 2), StringData("a\0\0", 3))); CHECK_EQUAL(false, str_compare(StringData("a\0\0", 3), StringData("a\0", 2))); } - } // anonymous namespace #endif // TEST_UTF8