Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RCORE-2157 Avoid to decompress Strings while sorting them. Instead use fast comparison provided by the interner #7892

Merged
merged 24 commits into from
Aug 1, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
2d58549
initial test
nicola-cab Jul 15, 2024
8089f9f
add logic for fetching stringID to Descriptors (partially working)
nicola-cab Jul 16, 2024
2142660
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jul 18, 2024
34f8bd3
fixes core tests
nicola-cab Jul 18, 2024
865f599
ops I cannot use C++20 :-)
nicola-cab Jul 18, 2024
d94c474
fix test
nicola-cab Jul 18, 2024
87b3d1a
fix handling for array mixed when it holds strings
nicola-cab Jul 18, 2024
6434ceb
fix improper use of mixed when it is not string
nicola-cab Jul 19, 2024
7d62244
Compare only if both string ids are available
nicola-cab Jul 19, 2024
e22000b
works for strings
nicola-cab Jul 19, 2024
f3f0c76
enable mixed and fix bug in cmp function
nicola-cab Jul 19, 2024
2c706f0
Merge branch 'feature/string-compression' into nc/RCORE-2157
nicola-cab Jul 22, 2024
fd61aee
code cleanup
nicola-cab Jul 22, 2024
39adf24
remove alias dup
nicola-cab Jul 22, 2024
5caa311
fix cmp function + simplify comparison function for sorting
nicola-cab Jul 23, 2024
0c59fb5
lint
nicola-cab Jul 23, 2024
4d9c3d1
Merge branch 'feature/string-compression' into nc/RCORE-2157
nicola-cab Jul 23, 2024
3d6feb6
more readeable cmp function
nicola-cab Jul 24, 2024
f3d4fb5
test refactory, still missing cmp over links + mixed of diff type cmp
nicola-cab Jul 25, 2024
29b04b3
tests
nicola-cab Jul 29, 2024
ee2cbec
code review
nicola-cab Jul 31, 2024
8b68853
more tests for utf8
nicola-cab Jul 31, 2024
195716c
code review
nicola-cab Aug 1, 2024
1629181
code review
nicola-cab Aug 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/realm/array_mixed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx)
}
}

std::optional<StringID> ArrayMixed::get_string_id(size_t ndx) const
{
int64_t val = m_composite.get(ndx);
if (val) {
const int64_t int_val = val >> s_data_shift;
const size_t payload_ndx{(size_t)int_val};
const DataType type((val & s_data_type_mask) - 1);
if (type == type_String) {
ensure_string_array();
REALM_ASSERT(size_t(int_val) < m_strings.size());
return m_strings.get_string_id(payload_ndx);
}
}
return {};
}

Mixed ArrayMixed::get(size_t ndx) const
{
int64_t val = m_composite.get(ndx);
Expand Down
1 change: 1 addition & 0 deletions src/realm/array_mixed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array {
{
return m_composite.get(ndx) == 0;
}
std::optional<StringID> get_string_id(size_t ndx) const;

void clear();
void erase(size_t ndx);
Expand Down
3 changes: 2 additions & 1 deletion src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include <realm/impl/array_writer.hpp>
#include <realm/table.hpp>
#include <realm/string_interner.hpp>
#include <realm/string_compressor.hpp>
#include <realm/mixed.hpp>

using namespace realm;
Expand Down Expand Up @@ -192,7 +193,7 @@ StringData ArrayString::get(size_t ndx) const
return {};
}

std::optional<StringID> realm::ArrayString::get_string_id(size_t ndx) const
std::optional<StringID> ArrayString::get_string_id(size_t ndx) const
{
if (m_type == Type::interned_strings) {
return StringID(static_cast<Array*>(m_arr)->get(ndx));
Expand Down
30 changes: 30 additions & 0 deletions src/realm/obj.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,36 @@ BinaryData Obj::_get<BinaryData>(ColKey::Idx col_ndx) const
return ArrayBinary::get(alloc.translate(ref), m_row_ndx, alloc);
}

/// Fetch the interned-string ID stored for this object in column `col_key`.
///
/// Only String and Mixed columns are inspected. For every other column type
/// an empty optional is returned immediately, before the column key is
/// validated against the table.
///
/// @param col_key  Column to read.
/// @return Whatever the column's leaf (ArrayString or ArrayMixed) reports for
///         this object's row; empty for non String/Mixed columns.
std::optional<StringID> Obj::get_string_id(ColKey col_key) const
{
    // Only String and Mixed columns can have a string interner attached.
    const auto col_type = col_key.get_type();
    const bool is_mixed = (col_type == col_type_Mixed);
    if (!is_mixed && col_type != col_type_String) {
        return std::nullopt;
    }

    m_table->check_column(col_key);
    _update_if_needed();

    const auto col_ndx = col_key.get_index();
    const auto interner = m_table->get_string_interner(col_ndx);
    // The column's ref is read from slot `col_ndx.val + 1` of this object's memory block.
    const ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1));

    // Both leaf types need the same setup: attach the interner, bind the leaf
    // to its ref, then ask for the ID at this object's row.
    const auto read_id = [&](auto&& leaf) {
        leaf.set_string_interner(interner);
        leaf.init_from_ref(ref);
        return leaf.get_string_id(m_row_ndx);
    };

    if (is_mixed) {
        // In a Mixed column only string elements may carry a string ID.
        return read_id(ArrayMixed(get_alloc()));
    }
    // Otherwise the column must be a String column.
    return read_id(ArrayString(get_alloc()));
}

Mixed Obj::get_any(ColKey col_key) const
{
m_table->check_column(col_key);
Expand Down
5 changes: 5 additions & 0 deletions src/realm/obj.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ class Obj {
template <typename U>
U get(ColKey col_key) const;

std::optional<StringID> get_string_id(ColKey) const;
std::optional<StringID> get_string_id(StringData col_name) const
{
return get_string_id(get_column_key(col_name));
}
Mixed get_any(ColKey col_key) const;
Mixed get_any(StringData col_name) const
{
Expand Down
4 changes: 4 additions & 0 deletions src/realm/path.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ class ExtendedColumnKey {
ObjKey get_link_target(const Obj& obj) const;
Mixed get_value(const Obj& obj) const;

// get String ID for the obj, it makes sense to call this method only if the col_key type is either Mixed or
// String.
std::optional<StringID> get_string_id(const Obj& obj) const;

private:
ColKey m_colkey;
PathElement m_index;
Expand Down
87 changes: 80 additions & 7 deletions src/realm/sort_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <realm/util/assert.hpp>
#include <realm/list.hpp>
#include <realm/dictionary.hpp>
#include <realm/string_compressor.hpp>

using namespace realm;

Expand Down Expand Up @@ -85,6 +86,16 @@ Mixed ExtendedColumnKey::get_value(const Obj& obj) const
return {};
}

std::optional<StringID> ExtendedColumnKey::get_string_id(const Obj& obj) const
{
const auto type = m_colkey.get_type();
if (type != col_type_String && type != col_type_Mixed)
return {};
if (!has_index())
return obj.get_string_id(m_colkey);
return {};
}

LinkPathPart::LinkPathPart(ColKey col_key, ConstTableRef source)
: column_key(col_key)
, from(source->get_key())
Expand Down Expand Up @@ -391,6 +402,47 @@ void FilterDescriptor::execute(const Table& table, KeyValues& key_values, const
key_values = std::move(filtered);
}

template <typename T, typename U>
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
inline int compare(const T& i, const T& j, const U& col)
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
{
const auto id_i = i.cached_string_id;
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
const auto id_j = j.cached_string_id;

if (!id_i && !id_j)
// any other comparison that has nothing to do with strings
return i.get_value().compare(j.get_value());

if (id_i || id_j) {
// at least one is a compressed string.
const auto table = col.table;
const auto interner = table->get_string_interner(ColKey{col.col_key});

// two interned strings
if (id_i && id_j)
return interner->compare(*id_i, *id_j);
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved

// one compressed string and the other one could potentially be uncompressed.
const auto& index = id_i ? j : i;
const auto& other = id_i ? id_i : id_j;
const auto str = index.get_value().template get_if<StringData>();
if (str) {
// compressed string vs uncompressed string
const auto ret = interner->compare(*str, *other);
// if i is a compressed string, than reverse cmp result, since
// we always compare i vs j and not j vs i.
return id_i ? -ret : ret;
}
}

// mixed str vs mixed (any other value)
const auto& index = id_i ? i : j;
const auto& key = index.get_key();
const auto& obj = col.table->get_object(key);
const auto& col_key = col.col_key;
const auto val = col_key.get_value(obj);
return id_i ? val.compare(j.get_value()) : i.get_value().compare(val);
}

// This function must conform to 'is less' predicate - that is:
// return true if i is strictly smaller than j
bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ordering) const
Expand Down Expand Up @@ -419,9 +471,8 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord
}

int c;

if (t == 0) {
c = i.cached_value.compare(j.cached_value);
c = compare(i, j, m_columns[t]);
}
else {
if (m_cache[t - 1].empty()) {
Expand All @@ -434,20 +485,35 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord
const auto& obj = m_columns[t].table->get_object(key_i);
const auto& col_key = m_columns[t].col_key;

cache_i.value = col_key.get_value(obj);
// store stringID instead of the actual string if possible
cache_i.key = key_i;
cache_i.cached_string_id = col_key.get_string_id(obj);
Copy link
Contributor

@ironage ironage Jul 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will add unnecessary overhead per comparison when sorting on a non-string column. Ideally instead of doing the check inside the loop, we would have two separate specialized loops to keep other types fast. This might be complex to refactor though because there are already multiple optimizations piled in here.
ie. instead of this pattern:

for (..) {
    if interned string 
        do special comparison
    else 
        do value comparison
    }
}

we should do something like this:

if column supports string interning
    for (...) {
        do special intern comparison if possible
    }
} else {
    for (...) {
        do normal value comparison
    }
}

I think we should have some benchmarks on sorting integers that can back up my claim, I wonder how much this change costs them?

Copy link
Member Author

@nicola-cab nicola-cab Jul 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cost of fetching the String ID for non-mixed/non-string properties is only the cost of checking whether the property type may have a string interner associated (a check on the column type) plus an extra check on the optional.
Also, in the code we iterate column by column, so the check per column type can only be performed when we are checking the column itself. I can try to run the benchmarks and see.

Copy link
Member Author

@nicola-cab nicola-cab Jul 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I think statically it does not make a huge difference. We are probably going to slow down sorting a little bit for all properties that are not strings or mixed, but it is really difficult to gauge, and doesn't it also depend on the data we have?

Req runs:    6  SortInt (MemOnly, EncryptionOff):             min  57.16ms (+3.73%)            max  76.90ms (+30.61%)           med  66.13ms (+17.93%)           avg  66.87ms (+18.40%)           stddev  8.98ms (+586.99%)     

Req runs:    5  SortIntList (MemOnly, EncryptionOff):         min 321.11ms (+15.11%)           max 397.76ms (+27.72%)           med 333.63ms (+9.98%)            avg 353.77ms (+18.18%)           stddev 36.49ms (+194.74%)     

Req runs:   10  SortIntDictionary (MemOnly, EncryptionOff):     min  37.68ms (-0.91%)            max     56ms (+7.95%)            med  40.05ms (-5.75%)            avg  42.62ms (-1.18%)            stddev  6.44ms (+38.90%)      

Req runs:  225  SortThenLimit (MemOnly, EncryptionOff):         min   1.89ms (+14.04%)           max   3.82ms (+44.19%)           med   2.09ms (+21.53%)           avg   2.17ms (+23.60%)           stddev   284us (+144.66%) 

Another run (same ref data as starting cmp input)


Req runs:    7  SortInt (MemOnly, EncryptionOff):             min  56.62ms (+2.75%)            max  64.11ms (+8.90%)            med  57.26ms (+2.11%)            avg  58.38ms (+3.36%)            stddev  2.67ms (+104.06%)     

Req runs:    5  SortIntList (MemOnly, EncryptionOff):         min 298.46ms (+6.99%)            max 332.43ms (+6.74%)            med 304.79ms (+0.47%)            avg 309.70ms (+3.46%)            stddev 13.23ms (+6.88%)       

Req runs:   11  SortIntDictionary (MemOnly, EncryptionOff):     min  36.85ms (-3.08%)            max  39.14ms (-24.55%)           med  37.59ms (-11.52%)           avg  37.72ms (-12.54%)           stddev   629us (-86.44%)      

Req runs:  241  SortThenLimit (MemOnly, EncryptionOff):         min   1.87ms (+12.72%)           max   3.40ms (+28.36%)           med   1.98ms (+14.96%)           avg   2.06ms (+17.05%)           stddev   228us (+95.75%)   

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like a pretty big perf regression to me?

Copy link
Member Author

@nicola-cab nicola-cab Jul 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a bug in my code: I was constantly passing the Extended Column by copy. These are the right perf numbers:

Req runs:    8  SortInt (MemOnly, EncryptionOff):             min  58.66ms (+6.49%)            max  66.55ms (+11.33%)           med  60.24ms (+4.87%)            avg  61.30ms (+6.42%)            stddev  3.06ms (+70.90%)      

Req runs:    5  SortIntList (MemOnly, EncryptionOff):         min 290.56ms (+3.83%)            max 304.51ms (-1.68%)            med 302.37ms (+3.53%)            avg 299.28ms (+1.51%)            stddev  6.12ms (-45.79%)      

Req runs:   10  SortIntDictionary (MemOnly, EncryptionOff):     min  37.89ms (-3.70%)            max  45.80ms (-12.86%)           med  41.27ms (-0.43%)            avg  41.59ms (-3.61%)            stddev  2.53ms (-36.80%)      

Req runs:  231  SortThenLimit (MemOnly, EncryptionOff):         min   1.84ms (+10.23%)           max   2.93ms (-24.94%)           med   1.95ms (+6.03%)            avg   1.98ms (+1.44%)            stddev   144us (-58.33%)      

I am still failing to see why this is a massive problem for non-mixed or non-string columns. It does not seem like it is to me, but I must be wrong. Because you are both complaining about this :-)

I agree that Mixed and Strings are going to be impacted, though. Especially Mixed, because a mixed can hold anything in it…

Copy link
Member Author

@nicola-cab nicola-cab Jul 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, I tried BenchmarkSort already. And it is slower. But at the same time it is not the best benchmark either IMO.

Req runs: 5 Sort (MemOnly, EncryptionOff): * min 207.91ms (+72.16%) max 214.60ms (+48.24%) * med 210.27ms (+62.98%) * avg 211.07ms (+60.69%) stddev 2.99ms (-65.96%)

Mainly for these 3 reasons:

  1. The internal DS that the string interner is keeping is not lock free, we are constantly grabbing a lock. Especially for sorting stuff, it is a real problem (since we may be calling the sorting operator a lot) . Jira to cover this: (https://jira.mongodb.org/browse/RCORE-2214)

  2. We are constantly loading a lot of data when we need to access the compressed string leaves in the Trie we built, we need to improve the internal DS: (https://jira.mongodb.org/browse/RCORE-2125)

  3. BenchmarkSort is writing 200k randomly generated strings which represent some number. This is very much the worst-case scenario for interning stuff. Also, in the feature branch we are not limiting compression of strings like we do for integers (where we check if there is a sizeable gain before compressing). We just compress everything (I suspect we will introduce something).

I've added a benchmark for testing the sorting of strings that are in compressed format (as you suggested). Passing 2 parameters to the test:

  • N length of the strings to store
  • M the number of duplicated strings

You can see that already, even with all these limitations, there is a point after which it pays off to sort using the compressed string ID.

Req runs:    5  SortCompressedSmall(10,500) (MemOnly, EncryptionOff):   * min 124.38ms (+30.21%)           max 143.33ms (+17.40%)         * med 138.98ms (+27.34%)         * avg 136.07ms (+24.38%)           stddev  7.89ms (-27.27%)      

Req runs:    5  SortCompressedMedium(50,500) (MemOnly, EncryptionOff):   * min 125.46ms (+11.46%)           max 131.65ms (+9.01%)          * med 128.17ms (+12.81%)         * avg 128.03ms (+11.23%)           stddev  2.47ms (-25.63%)      

Req runs:    5  SortCompressedLarge(100,500) (MemOnly, EncryptionOff):   * min 128.91ms (+16.33%)           max 145.32ms (+26.75%)         * med 131.26ms (+17.72%)         * avg 134.01ms (+19.24%)           stddev  6.69ms (+302.40%)     

Req runs:    5  SortCompressedLarge(1000,5000) (MemOnly, EncryptionOff):     min 196.71ms (-30.88%)           max 208.25ms (-36.51%)           med 202.42ms (-29.76%)           avg 202.50ms (-31.61%)           stddev  4.09ms (-77.60%)      

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the non-string benchmarks, we appear to be less than 10% worse which I think seems reasonable. Ideally, that code would not be affected at all, and there may be a way to do this by templating out the non-string sorting, but I'm not sure it is worth the duplication that will create.

For the BenchmarkSort case, I don't think we should ship this feature until we get that regression down by quite a bit. Can you create a ticket for the third item you suggested so that we don't forget about it? As long as we are tracking those 3 approaches and get them and recheck the benchmarks later, it shouldn't block this PR now.

Copy link
Member Author

@nicola-cab nicola-cab Aug 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep I agree.

  1. There is a bit of tech debt in the sorting logic. I tried to make the code templated based on column type, but I quit, because the code was basically duplicated, also we need to extract the column type in at least 2 cases. First one when we set the index pair and the other one when we are sorting the index pairs. I concluded it was too much hassle. But maybe there is a better way...

  2. Yes I agree with you for string benchmarks, in fact all the code is not going into next major. We do have already a jira for this (https://jira.mongodb.org/browse/RCORE-2125). I will drop a comment in order to track this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in fact all the code is not going into next major

What do you mean by this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I meant: that the code is not finished, we need to work on the optimizations I mentioned. The feature branch is just our base. Once we complete all the features, it will of course go into next major, like we did for integers.

if (cache_i.cached_string_id) {
cache_i.value = {};
}
else {
cache_i.value = col_key.get_value(obj);
cache_i.cached_string_id = {};
}
}
Mixed val_i = cache_i.value;

if (cache_j.key != key_j) {
const auto& obj = m_columns[t].table->get_object(key_j);
const auto& col_key = m_columns[t].col_key;

cache_j.value = col_key.get_value(obj);
// store stringID instead of the actual string if possible
cache_j.key = key_j;
cache_j.cached_string_id = col_key.get_string_id(obj);
if (cache_j.cached_string_id) {
cache_j.value = {};
}
else {
cache_j.value = col_key.get_value(obj);
cache_j.cached_string_id = {};
}
}

c = val_i.compare(cache_j.value);
c = compare(cache_i, cache_j, m_columns[t]);
}
// if c is negative i comes before j
if (c) {
Expand Down Expand Up @@ -478,7 +544,14 @@ void BaseDescriptor::Sorter::cache_first_column(IndexPairs& v)
}

const auto obj = col.table->get_object(key);
index.cached_value = ck.get_value(obj);
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
index.cached_string_id = ck.get_string_id(obj);
if (index.cached_string_id) {
index.cached_value = Mixed();
}
else {
index.cached_value = ck.get_value(obj);
// index.cached_string_id = {};
}
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down
21 changes: 21 additions & 0 deletions src/realm/sort_descriptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,20 @@ class BaseDescriptor {
{
return index_in_view < other.index_in_view;
}
ObjKey get_key() const
{
return key_for_object;
}
Mixed get_value() const
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
{
return cached_value;
}
ObjKey key_for_object;
size_t index_in_view;
Mixed cached_value;
// if the value is a string or mixed of string, we may want to store
// the compressed string id, instead of the whole string.
std::optional<StringID> cached_string_id = {};
};
class IndexPairs : public std::vector<BaseDescriptor::IndexPair> {
public:
Expand Down Expand Up @@ -115,6 +126,16 @@ class BaseDescriptor {
struct ObjCache {
ObjKey key;
Mixed value;
std::optional<StringID> cached_string_id;
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved

ObjKey get_key() const
{
return key;
}
Mixed get_value() const
{
return value;
}
};
using TableCache = std::vector<ObjCache>;
mutable std::vector<TableCache> m_cache;
Expand Down
5 changes: 5 additions & 0 deletions src/realm/string_data.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@

namespace realm {

// Compressed strings have unique IDs, this defines a global alias
// for this. A StringID is an entry inside an array of N compressed strings.
// 0 means null, all the other ids [1, N-1] represent a valid string.
using StringID = size_t;

/// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms.
/// This is what libc++ does, and it is a good general choice for a
/// non-cryptographic hash function (suitable for std::unordered_map etc.).
Expand Down
2 changes: 0 additions & 2 deletions src/realm/string_interner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ struct CompressedStringView;

namespace realm {

using StringID = size_t;

class StringCompressor;

struct CachedString {
Expand Down
1 change: 1 addition & 0 deletions src/realm/table_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ void TableView::apply_descriptors(const DescriptorOrdering& ordering)
// (handling detached refs is not required in linkviews)
for (size_t t = 0; t < sz; t++) {
ObjKey key = get_key(t);
// TODO: add the string id here???
if (m_table->is_valid(key)) {
index_pairs.emplace_back(key, t);
}
Expand Down
40 changes: 40 additions & 0 deletions test/test_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4107,6 +4107,46 @@ TEST(Query_LinkChainSortErrors)
CHECK_LOGIC_ERROR(t1->get_sorted_view(SortDescriptor({{t1_linklist_col}})), ErrorCodes::InvalidSortDescriptor);
}

TEST_TYPES(Query_SortingCompressedStrings, std::true_type, std::false_type)
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test is covering very little of the newly added code. It needs to check sorting mixed with all of compressed strings, non-compressed strings, and non-strings, along with sorting over links.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep you are right, but we do have many of these tests already, I will add more tests like this and be sure we are covering all, and not solely rely on our existing tests (if they commit the strings before)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have updated the tests in order to cover what you asked @tgoyne .

{
using type = typename TEST_TYPE::type;

SHARED_GROUP_TEST_PATH(path);
std::unique_ptr<Replication> hist(make_in_realm_history());
DBRef sg = DB::create(*hist, path, DBOptions(crypt_key()));

TransactionRef rt = sg->start_read();
CHECK_EQUAL(0, rt->size());
{
WriteTransaction wt(sg);
TableRef t = wt.add_table("t");
auto t_string_col = t->add_column(type_String, "t_string");
t->create_object().set(t_string_col, "Z");
t->create_object().set(t_string_col, "B");
t->create_object().set(t_string_col, "A");
t->create_object().set(t_string_col, "C");
wt.commit();
}
rt->advance_read();
// the strings are now in compressed format (after a write transaction)
std::vector<std::string_view> results = {"A", "B", "C", "Z"};
TableRef t = rt->get_table("t");
const auto t_string_col = t->get_column_key("t_string");
TableView tv = t->where().find_all();
bool ascending;
if constexpr (type::value) {
ascending = true;
}
else {
ascending = false;
std::reverse(results.begin(), results.end());
}
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
tv.sort(SortDescriptor({{t_string_col}}, {ascending}));
for (size_t i = 0; i < results.size(); ++i) {
CHECK_EQUAL(tv[i].get<StringData>(t_string_col), results[i]);
}
}


TEST(Query_EmptyDescriptors)
{
Expand Down
Loading