realm · nicola-cab · Aug 1, 2024 · Jul 15, 2024 · Jul 16, 2024 · Jul 18, 2024
diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp
@@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx)
     }
 }
 
+std::optional<StringID> ArrayMixed::get_string_id(size_t ndx) const
+{
+    // Yields an id only when the entry at ndx is non-zero and typed as a string.
+    const int64_t composite = m_composite.get(ndx);
+    if (composite == 0)
+        return {};
+    const DataType stored_type((composite & s_data_type_mask) - 1);
+    if (!(stored_type == type_String))
+        return {};
+    // The bits above s_data_shift give the position in the string payload array.
+    const size_t payload_ndx = static_cast<size_t>(composite >> s_data_shift);
+    ensure_string_array();
+    REALM_ASSERT(payload_ndx < m_strings.size());
+    return m_strings.get_string_id(payload_ndx);
+}
+
 Mixed ArrayMixed::get(size_t ndx) const
 {
     int64_t val = m_composite.get(ndx);

diff --git a/src/realm/array_mixed.hpp b/src/realm/array_mixed.hpp
@@ -97,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array {
     {
         return m_composite.get(ndx) == 0;
     }
+    std::optional<StringID> get_string_id(size_t ndx) const;
 
     void clear();
     void erase(size_t ndx);

diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp
@@ -20,6 +20,7 @@
 #include <realm/impl/array_writer.hpp>
 #include <realm/table.hpp>
 #include <realm/string_interner.hpp>
+#include <realm/string_compressor.hpp>
 #include <realm/mixed.hpp>
 
 using namespace realm;
@@ -192,7 +193,7 @@ StringData ArrayString::get(size_t ndx) const
     return {};
 }
 
-std::optional<StringID> realm::ArrayString::get_string_id(size_t ndx) const
+std::optional<StringID> ArrayString::get_string_id(size_t ndx) const
 {
     if (m_type == Type::interned_strings) {
         return StringID(static_cast<Array*>(m_arr)->get(ndx));

diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp
@@ -629,6 +629,36 @@ BinaryData Obj::_get<BinaryData>(ColKey::Idx col_ndx) const
     return ArrayBinary::get(alloc.translate(ref), m_row_ndx, alloc);
 }
 
+std::optional<StringID> Obj::get_string_id(ColKey col_key) const
+{
+    // Only String and Mixed columns can have an interner, so any other column
+    // type yields no id. For those two, an id comes back only if the value got
+    // compressed.
+    const auto col_type = col_key.get_type();
+    const bool is_mixed = (col_type == col_type_Mixed);
+    if (!is_mixed && col_type != col_type_String)
+        return {};
+
+    m_table->check_column(col_key);
+    _update_if_needed();
+
+    const auto ndx = col_key.get_index();
+    const auto string_interner = m_table->get_string_interner(ndx);
+    ref_type values_ref = to_ref(Array::get(m_mem.get_addr(), ndx.val + 1));
+
+    if (!is_mixed) {
+        ArrayString strings(get_alloc());
+        strings.set_string_interner(string_interner);
+        strings.init_from_ref(values_ref);
+        return strings.get_string_id(m_row_ndx);
+    }
+    // Mixed column: only string values stored in the Mixed may have an id.
+    ArrayMixed mixed_values(get_alloc());
+    mixed_values.set_string_interner(string_interner);
+    mixed_values.init_from_ref(values_ref);
+    return mixed_values.get_string_id(m_row_ndx);
+}
+
 Mixed Obj::get_any(ColKey col_key) const
 {
     m_table->check_column(col_key);

diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp
@@ -117,6 +117,11 @@ class Obj {
     template <typename U>
     U get(ColKey col_key) const;
 
+    std::optional<StringID> get_string_id(ColKey) const;
+    std::optional<StringID> get_string_id(StringData col_name) const
+    {
+        return get_string_id(get_column_key(col_name));
+    }
     Mixed get_any(ColKey col_key) const;
     Mixed get_any(StringData col_name) const
     {

diff --git a/src/realm/path.hpp b/src/realm/path.hpp
@@ -256,6 +256,10 @@ class ExtendedColumnKey {
     ObjKey get_link_target(const Obj& obj) const;
     Mixed get_value(const Obj& obj) const;
 
+    // Get the StringID of obj's value for this column key. Calling this only makes sense when the col_key
+    // type is either Mixed or String.
+    std::optional<StringID> get_string_id(const Obj& obj) const;
+
 private:
     ColKey m_colkey;
     PathElement m_index;

diff --git a/src/realm/sort_descriptor.cpp b/src/realm/sort_descriptor.cpp
@@ -23,6 +23,7 @@
 #include <realm/util/assert.hpp>
 #include <realm/list.hpp>
 #include <realm/dictionary.hpp>
+#include <realm/string_compressor.hpp>
 
 using namespace realm;
 
@@ -85,6 +86,16 @@ Mixed ExtendedColumnKey::get_value(const Obj& obj) const
     return {};
 }
 
+std::optional<StringID> ExtendedColumnKey::get_string_id(const Obj& obj) const
+{
+    // An id is looked up only for String or Mixed columns when has_index() is false.
+    const auto col_type = m_colkey.get_type();
+    const bool may_be_interned = (col_type == col_type_String || col_type == col_type_Mixed);
+    if (may_be_interned && !has_index())
+        return obj.get_string_id(m_colkey);
+    return {};
+}
+
 LinkPathPart::LinkPathPart(ColKey col_key, ConstTableRef source)
     : column_key(col_key)
     , from(source->get_key())
@@ -391,6 +402,47 @@ void FilterDescriptor::execute(const Table& table, KeyValues& key_values, const
     key_values = std::move(filtered);
 }
 
+// Three-way comparison of two sort entries for the column described by col;
+// the result is negative when i orders before j.
+//  - i, j: entries providing cached_string_id, get_value() and get_key().
+//  - col: provides the `table` and `col_key` the entries were read from.
+// Entries carrying a cached string id are compared via the column's string
+// interner when possible, so the string itself need not be materialised.
+template <typename T, typename U>
+inline int compare(const T& i, const T& j, const U& col)
+{
+    const auto id_i = i.cached_string_id;
+    const auto id_j = j.cached_string_id;
+
+    // Neither side is an interned string: plain value comparison.
+    if (!id_i && !id_j)
+        return i.get_value().compare(j.get_value());
+
+    const auto interner = col.table->get_string_interner(ColKey{col.col_key});
+    // Two interned strings.
+    if (id_i && id_j)
+        return interner->compare(*id_i, *id_j);
+
+    // Exactly one side is interned; the other may be an uncompressed string.
+    // Keep that value in a named local: get_value() returns by value, and the
+    // result of get_if() may point into the Mixed it was called on.
+    const auto& interned_id = id_i ? id_i : id_j;
+    const auto plain_value = (id_i ? j : i).get_value();
+    if (const auto str = plain_value.template get_if<StringData>()) {
+        // The plain string is the left operand here, so when i is the interned
+        // side the result is "j vs i" and has to be negated.
+        const auto ret = interner->compare(*str, *interned_id);
+        return id_i ? -ret : ret;
+    }
+
+    // Interned string vs. a Mixed holding some other value: fetch the real
+    // value of the interned side and compare as Mixed.
+    const auto& interned_entry = id_i ? i : j;
+    const auto obj = col.table->get_object(interned_entry.get_key());
+    const auto val = col.col_key.get_value(obj);
+    return id_i ? val.compare(j.get_value()) : i.get_value().compare(val);
+}
+
 // This function must conform to 'is less' predicate - that is:
 // return true if i is strictly smaller than j
 bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ordering) const
@@ -419,9 +471,8 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord
         }
 
         int c;
-
         if (t == 0) {
-            c = i.cached_value.compare(j.cached_value);
+            c = compare(i, j, m_columns[t]);
         }
         else {
             if (m_cache[t - 1].empty()) {
@@ -434,20 +485,35 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord
                 const auto& obj = m_columns[t].table->get_object(key_i);
                 const auto& col_key = m_columns[t].col_key;
 
-                cache_i.value = col_key.get_value(obj);
+                // store stringID instead of the actual string if possible
                 cache_i.key = key_i;
+                cache_i.cached_string_id = col_key.get_string_id(obj);
+                if (cache_i.cached_string_id) {
+                    cache_i.value = {};
+                }
+                else {
+                    cache_i.value = col_key.get_value(obj);
+                    cache_i.cached_string_id = {};
+                }
             }
-            Mixed val_i = cache_i.value;
 
             if (cache_j.key != key_j) {
                 const auto& obj = m_columns[t].table->get_object(key_j);
                 const auto& col_key = m_columns[t].col_key;
 
-                cache_j.value = col_key.get_value(obj);
+                // store stringID instead of the actual string if possible
                 cache_j.key = key_j;
+                cache_j.cached_string_id = col_key.get_string_id(obj);
+                if (cache_j.cached_string_id) {
+                    cache_j.value = {};
+                }
+                else {
+                    cache_j.value = col_key.get_value(obj);
+                    cache_j.cached_string_id = {};
+                }
             }
 
-            c = val_i.compare(cache_j.value);
+            c = compare(cache_i, cache_j, m_columns[t]);
         }
         // if c is negative i comes before j
         if (c) {
@@ -478,7 +544,14 @@ void BaseDescriptor::Sorter::cache_first_column(IndexPairs& v)
         }
 
         const auto obj = col.table->get_object(key);
-        index.cached_value = ck.get_value(obj);
+        index.cached_string_id = ck.get_string_id(obj);
+        if (index.cached_string_id) {
+            index.cached_value = Mixed();
+        }
+        else {
+            index.cached_value = ck.get_value(obj);
+            // cached_string_id is already empty on this path, so there is nothing to reset.
+        }
     }
 }
 

diff --git a/src/realm/sort_descriptor.hpp b/src/realm/sort_descriptor.hpp
@@ -66,9 +66,20 @@ class BaseDescriptor {
         {
             return index_in_view < other.index_in_view;
         }
+        ObjKey get_key() const
+        {
+            return key_for_object;
+        }
+        Mixed get_value() const
+        {
+            return cached_value;
+        }
         ObjKey key_for_object;
         size_t index_in_view;
         Mixed cached_value;
+        // if the value is a string or mixed of string, we may want to store
+        // the compressed string id, instead of the whole string.
+        std::optional<StringID> cached_string_id = {};
     };
     class IndexPairs : public std::vector<BaseDescriptor::IndexPair> {
     public:
@@ -115,6 +126,16 @@ class BaseDescriptor {
         struct ObjCache {
             ObjKey key;
             Mixed value;
+            std::optional<StringID> cached_string_id;
+
+            ObjKey get_key() const
+            {
+                return key;
+            }
+            Mixed get_value() const
+            {
+                return value;
+            }
         };
         using TableCache = std::vector<ObjCache>;
         mutable std::vector<TableCache> m_cache;

diff --git a/src/realm/string_data.hpp b/src/realm/string_data.hpp
@@ -34,6 +34,11 @@
 
 namespace realm {
 
+// Compressed (interned) strings are identified by unique IDs; StringID is the
+// global alias for that ID type. A StringID is an entry inside an array of N
+// compressed strings: 0 means null, all the other ids [1, N-1] represent a valid string.
+using StringID = size_t;
+
 /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms.
 /// This is what libc++ does, and it is a good general choice for a
 /// non-cryptographic hash function (suitable for std::unordered_map etc.).

diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp
@@ -34,8 +34,6 @@ struct CompressedStringView;
 
 namespace realm {
 
-using StringID = size_t;
-
 class StringCompressor;
 
 struct CachedString {

diff --git a/src/realm/table_view.cpp b/src/realm/table_view.cpp
@@ -519,6 +519,7 @@ void TableView::apply_descriptors(const DescriptorOrdering& ordering)
         // (handling detached refs is not required in linkviews)
         for (size_t t = 0; t < sz; t++) {
             ObjKey key = get_key(t);
+            // TODO: add the string id here?
             if (m_table->is_valid(key)) {
                 index_pairs.emplace_back(key, t);
             }

diff --git a/test/test_query.cpp b/test/test_query.cpp
@@ -4107,6 +4107,46 @@ TEST(Query_LinkChainSortErrors)
     CHECK_LOGIC_ERROR(t1->get_sorted_view(SortDescriptor({{t1_linklist_col}})), ErrorCodes::InvalidSortDescriptor);
 }
 
+TEST_TYPES(Query_SortingCompressedStrings, std::true_type, std::false_type)
+{
+    // Runs once per sort direction: std::true_type sorts ascending,
+    // std::false_type sorts descending.
+    constexpr bool ascending = TEST_TYPE::type::value;
+
+    SHARED_GROUP_TEST_PATH(path);
+    std::unique_ptr<Replication> hist(make_in_realm_history());
+    DBRef sg = DB::create(*hist, path, DBOptions(crypt_key()));
+
+    TransactionRef rt = sg->start_read();
+    CHECK_EQUAL(0, rt->size());
+    {
+        // Populate the column in unsorted order.
+        WriteTransaction wt(sg);
+        TableRef t = wt.add_table("t");
+        auto t_string_col = t->add_column(type_String, "t_string");
+        for (const char* str : {"Z", "B", "A", "C"}) {
+            t->create_object().set(t_string_col, str);
+        }
+        wt.commit();
+    }
+    rt->advance_read();
+
+    // the strings are now in compressed format (after a write transaction)
+    TableRef t = rt->get_table("t");
+    const auto t_string_col = t->get_column_key("t_string");
+    TableView tv = t->where().find_all();
+    tv.sort(SortDescriptor({{t_string_col}}, {ascending}));
+
+    // The view must now list the rows in the expected order.
+    std::vector<std::string_view> expected = {"A", "B", "C", "Z"};
+    if constexpr (!ascending) {
+        std::reverse(expected.begin(), expected.end());
+    }
+    for (size_t i = 0; i < expected.size(); ++i) {
+        CHECK_EQUAL(tv[i].get<StringData>(t_string_col), expected[i]);
+    }
+}
+
 
 TEST(Query_EmptyDescriptors)
 {