From f03e652781c4aa270ace7d965fb6cc5d07c91482 Mon Sep 17 00:00:00 2001 From: Matthew Fioravante Date: Sun, 16 Aug 2020 19:35:17 -0400 Subject: [PATCH] DBString: Dedup strings --- src/dbstring.cpp | 148 ++++++++++++++++++++++++++++++++++----------- src/lcf/dbstring.h | 41 +++++++------ 2 files changed, 136 insertions(+), 53 deletions(-) diff --git a/src/dbstring.cpp b/src/dbstring.cpp index 04152b647..c67270ca6 100644 --- a/src/dbstring.cpp +++ b/src/dbstring.cpp @@ -1,63 +1,141 @@ #include "lcf/dbstring.h" +#include +#include +#include namespace lcf { constexpr DBString::size_type DBString::npos; +alignas(DBString::size_type) constexpr char DBString::_empty_str[sizeof(size_type)]; -static constexpr size_t AllocSize(size_t len) { - return sizeof(DBString::size_type) + len + 1; -} +struct DBStringData { + using size_type = DBString::size_type; -static char* Alloc(size_t len) { - return reinterpret_cast(::operator new(AllocSize(len))); -} + size_type refcnt; + size_type size; -static char* Dup(const char* other, size_t size) { - if (size > 0) { - auto* s = Alloc(size); - std::memcpy(s, other, AllocSize(size)); - return s; + const char* data() const { + return reinterpret_cast(this + 1); } - return nullptr; -} - -static void Free(void* p) { - ::operator delete(p); -} - -DBString::DBString(const char* s, size_t len) -{ - if (len > 0) { - auto* ptr = Alloc(len); - _storage = ptr; - *reinterpret_cast(ptr) = len; - ptr += sizeof(size_type); + char* data() { + return reinterpret_cast(this + 1); + } - std::memcpy(ptr, s, len); - ptr += len; + static size_type alloc_size(StringView str) { + return sizeof(DBStringData) + str.size() + 1; + } - *ptr = '\0'; + static DBStringData* from_data(char* s) { + return reinterpret_cast(s) - 1; } +}; + +struct DBStringDataDeleter { + void operator()(DBStringData* p); +}; + +using DBStringDataPtr = std::unique_ptr; + +class DBStringAllocator { + public: + using size_type = DBString::size_type; + + static DBStringDataPtr Alloc(StringView 
str) { + auto* raw = ::operator new(DBStringData::alloc_size(str)); + auto data = DBStringDataPtr(new (raw) DBStringData()); + data->refcnt = 1; + data->size = str.size(); + std::memcpy(data->data(), str.data(), data->size); + data->data()[data->size] = '\0'; + + return data; + } + + static void Free(DBStringData* data) { + data->~DBStringData(); + ::operator delete(data); + } + + const char* Acquire(StringView str) { + if (str.empty()) { + return DBString::empty_str(); + } + + auto iter = _map.find(str); + if (iter != _map.end()) { + iter->second->refcnt += 1; + } else { + auto ptr = Alloc(str); + auto sv = StringView(ptr->data(), ptr->size); + // FIXME: Double hash lookup because the key changes.. + iter = _map.insert({ sv, std::move(ptr) }).first; + } + return iter->second->data(); + } + + const char* Dup(const char* s) { + if (s != DBString::empty_str()) { + auto* data = DBStringData::from_data(const_cast(s)); + data->refcnt += 1; + } + return s; + } + + void Release(StringView str) { + if (str.empty()) { + // This is needed, due to global DBStrings which are initialized to null. + // They may be destroyed *after* DBStringAllocator is destroyed! + // FIXME: To fix this, use a hash table with constexpr default constructor. 
+ return; + } + auto iter = _map.find(str); + if (iter != _map.end()) { + auto& data = iter->second; + data->refcnt -= 1; + assert(data->refcnt >= 0); + if (data->refcnt == 0) { + _map.erase(iter); + } + } + } + + static DBStringAllocator& instance() { + static DBStringAllocator alloc; + return alloc; + } + private: + DBStringAllocator() = default; + private: + std::unordered_map _map; +}; + +void DBStringDataDeleter::operator()(DBStringData* p) { + DBStringAllocator::Free(p); +} + +DBString::DBString(StringView s) + : _storage(DBStringAllocator::instance().Acquire(s)) +{ } DBString::DBString(const DBString& o) - : _storage(Dup(o._storage, o.size())) -{ } + : _storage(DBStringAllocator::instance().Dup(o._storage)) +{ +} DBString& DBString::operator=(const DBString& o) { if (this != &o) { + // What if the strings are the same, skip double lookup? _reset(); - _storage = Dup(o._storage, o.size()); + _storage = DBStringAllocator::instance().Dup(o._storage); } return *this; } - - void DBString::_reset() noexcept { - Free(_storage); - _storage = nullptr; + assert(_storage != nullptr); + DBStringAllocator::instance().Release(*this); } } // namespace lcf diff --git a/src/lcf/dbstring.h b/src/lcf/dbstring.h index 9a6eb886b..1bc81fb80 100644 --- a/src/lcf/dbstring.h +++ b/src/lcf/dbstring.h @@ -20,7 +20,6 @@ #include "lcf/string_view.h" - namespace lcf { // A read-only string class optimized for database storage. 
@@ -36,22 +35,27 @@ class DBString { static constexpr size_type npos = size_type(-1); - constexpr DBString() = default; - explicit DBString(const std::string& s) : DBString(s.c_str(), s.size()) {} - explicit DBString(StringView s) : DBString(s.data(), s.size()) {} + // FIXME: When the allocator constructor is constexpr, this can be constexpr as well + DBString() : DBString(StringView()) {} + explicit DBString(StringView s); + explicit DBString(const std::string& s) : DBString(StringView(s)) {} // Explicit construct for general const char* - explicit DBString(const char* s) : DBString(s, std::strlen(s)) {} + explicit DBString(const char* s) : DBString(StringView(s)) {} // Implicit constructor to capture string literals template DBString(const char(&literal)[N]) : DBString(StringView(literal)) {} - DBString(const char* s, size_t len); + DBString(const char* s, size_t len) : DBString(StringView(s, len)) {} - DBString(const DBString&); + DBString(const DBString& o); DBString& operator=(const DBString&); DBString(DBString&&) noexcept; DBString& operator=(DBString&&) noexcept; + void swap(DBString& o) noexcept { + std::swap(_storage, o._storage); + } + ~DBString() { _reset(); } explicit operator std::string() const { return std::string(c_str(), size()); } @@ -60,7 +64,7 @@ class DBString { char operator[](size_t i) const; char front() const { return (*this)[0]; } char back() const { return (*this)[size()-1]; } - const char* data() const; + const char* data() const { return _storage; } const char* c_str() const { return data(); } iterator begin() const { return data(); } @@ -71,10 +75,15 @@ class DBString { bool empty() const { return size() == 0; } size_type size() const; + + static constexpr const char* empty_str() { + return _empty_str + sizeof(size_type); + } private: void _reset() noexcept; private: - char* _storage = nullptr; + alignas(size_type) static constexpr char _empty_str[sizeof(size_type)] = {}; + const char* _storage = empty_str(); }; // This should be used over the 
conversion operator, so we can track all dbstr -> str instances @@ -116,28 +125,24 @@ template <> struct hash { namespace lcf { inline DBString::DBString(DBString&& o) noexcept - : _storage(o._storage) { - o._storage = nullptr; + std::swap(_storage, o._storage); } inline DBString& DBString::operator=(DBString&& o) noexcept { + return operator=(o); if (this != &o) { - if (_storage) { + if (!empty()) { _reset(); } _storage = o._storage; - o._storage = nullptr; + o._storage = empty_str(); } return *this; } -inline const char* DBString::data() const { - return _storage ? _storage + sizeof(size_type) : nullptr; -} - inline DBString::size_type DBString::size() const { - return _storage ? *reinterpret_cast(_storage) : 0; + return *(reinterpret_cast(_storage) - 1); } } // namespace lcf