Skip to content

Commit

Permalink
Port the toolchain to use the new Carbon hashtable (#4097)
Browse files Browse the repository at this point in the history
This works to leverage the capabilities of the hashtable as much as
possible, for example using the key context in the value stores.
However, there may still be opportunities to refactor more deeply and
use the functionality even better. Hopefully this is at least
a reasonable start and gets us a clean baseline.

On an Arm M1, this is a 15% improvement on my large lexing stress test,
but ends up a wash on my x86-64 server. This is a smaller benefit than
I expected, and it's because we're using a set-of-IDs and looking up
values with a key context for things like identifiers. This pattern has
a surprising tradeoff. The new hashtable uses significantly less memory,
a 10% peak RSS reduction just from the hashtable change. But indirecting
through the vector of values makes growing the hashtable dramatically
less cache-friendly: it causes growth to randomly access every key when
rehashing. On x86, everything gained by the faster hashtable is lost in
even slower growth. And even on Arm, this eats into the benefits.

But I have a plan to tweak how identifiers specifically work to avoid
most of the growth, and so I suspect this is the right tradeoff on the
whole. It gives us significant working set size reduction and we can
likely avoid the regressed operation (growth with rehash) in most cases
by clever reserving and if necessary by adding a hash caching layer to
the table infrastructure.

---------

Co-authored-by: Jon Ross-Perkins <[email protected]>
  • Loading branch information
chandlerc and jonmeow authored Jul 3, 2024
1 parent f5f8342 commit 8992d22
Show file tree
Hide file tree
Showing 27 changed files with 292 additions and 353 deletions.
1 change: 1 addition & 0 deletions toolchain/base/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ cc_library(
"//common:check",
"//common:hashing",
"//common:ostream",
"//common:set",
"@llvm-project//llvm:Support",
],
)
Expand Down
26 changes: 0 additions & 26 deletions toolchain/base/index_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <concepts>

#include "common/ostream.h"
#include "llvm/ADT/DenseMapInfo.h"

namespace Carbon {

Expand Down Expand Up @@ -75,31 +74,6 @@ auto operator<=>(IndexType lhs, IndexType rhs) -> std::strong_ordering {
return lhs.index <=> rhs.index;
}

// Provides base support for use of IdBase types as DenseMap/DenseSet keys.
//
// Usage (in global namespace):
// template <>
// struct llvm::DenseMapInfo<Carbon::MyType>
// : public Carbon::IndexMapInfo<Carbon::MyType> {};
template <typename Index>
struct IndexMapInfo {
static inline auto getEmptyKey() -> Index {
return Index(llvm::DenseMapInfo<int32_t>::getEmptyKey());
}

static inline auto getTombstoneKey() -> Index {
return Index(llvm::DenseMapInfo<int32_t>::getTombstoneKey());
}

static auto getHashValue(const Index& val) -> unsigned {
return llvm::DenseMapInfo<int32_t>::getHashValue(val.index);
}

static auto isEqual(const Index& lhs, const Index& rhs) -> bool {
return lhs == rhs;
}
};

} // namespace Carbon

#endif // CARBON_TOOLCHAIN_BASE_INDEX_BASE_H_
141 changes: 73 additions & 68 deletions toolchain/base/value_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,10 @@
#include <type_traits>

#include "common/check.h"
#include "common/hashing.h"
#include "common/ostream.h"
#include "common/set.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
Expand Down Expand Up @@ -77,26 +76,6 @@ struct FloatId : public IdBase, public Printable<FloatId> {
};
constexpr FloatId FloatId::Invalid(FloatId::InvalidIndex);

// DenseMapInfo for llvm::APFloat, for use in the canonical float value store.
// LLVM has an implementation of this but it strangely lives in the non-public
// header lib/IR/LLVMContextImpl.h, so we use our own.
// TODO: Remove this once our new hash table is available.
struct APFloatDenseMapInfo {
static auto getEmptyKey() -> llvm::APFloat {
return llvm::APFloat(llvm::APFloat::Bogus(), 1);
}
static auto getTombstoneKey() -> llvm::APFloat {
return llvm::APFloat(llvm::APFloat::Bogus(), 2);
}
static auto getHashValue(const llvm::APFloat& val) -> unsigned {
return hash_value(val);
}
static auto isEqual(const llvm::APFloat& lhs, const llvm::APFloat& rhs)
-> bool {
return lhs.bitwiseIsEqual(rhs);
}
};

// Corresponds to a Real value.
struct RealId : public IdBase, public Printable<RealId> {
using ValueType = Real;
Expand Down Expand Up @@ -220,28 +199,25 @@ class ValueStore
// to later retrieve the value.
//
// IdT::ValueType must represent the type being indexed.
template <typename IdT,
typename DenseMapInfoT = llvm::DenseMapInfo<typename IdT::ValueType>>
template <typename IdT>
class CanonicalValueStore {
public:
using ValueType = typename IdT::ValueType;

// Stores a canonical copy of the value and returns an ID to reference it.
auto Add(ValueType value) -> IdT {
auto [it, added] = map_.insert({value, IdT::Invalid});
if (added) {
it->second = values_.Add(value);
}
return it->second;
}
auto Add(ValueType value) -> IdT;

// Returns the value for an ID.
auto Get(IdT id) const -> const ValueType& { return values_.Get(id); }

// Reserves space.
auto Reserve(size_t size) -> void {
// Compute the resulting new insert count using the size of values -- the
// set doesn't have a fast to compute current size.
if (size > values_.size()) {
set_.GrowForInsertCount(size - values_.size(), KeyContext(values_));
}
values_.Reserve(size);
map_.reserve(size);
}

// These are to support printable structures, and are not guaranteed.
Expand All @@ -255,18 +231,34 @@ class CanonicalValueStore {
auto size() const -> size_t { return values_.size(); }

private:
// We store two copies of each value: one in the ValueStore for fast mapping
// from IdT to value, and another in the DenseMap for the reverse mapping.
//
// We could avoid this by storing the map instead as a DenseSet<IdT>, but
// there's no way to provide a DenseMapInfo that looks the IDs up in the
// vector.
//
// TODO: Switch to a better representation that avoids the duplication.
class KeyContext;

ValueStore<IdT> values_;
llvm::DenseMap<ValueType, IdT, DenseMapInfoT> map_;
Set<IdT, /*SmallSize=*/0, KeyContext> set_;
};

template <typename IdT>
class CanonicalValueStore<IdT>::KeyContext
: public TranslatingKeyContext<KeyContext> {
public:
explicit KeyContext(llvm::ArrayRef<ValueType> values) : values_(values) {}

// Note that it is safe to return a `const` reference here as the underlying
// object's lifetime is provided by the `store_`.
auto TranslateKey(IdT id) const -> const ValueType& {
return values_[id.index];
}

private:
llvm::ArrayRef<ValueType> values_;
};

template <typename IdT>
auto CanonicalValueStore<IdT>::Add(ValueType value) -> IdT {
auto make_key = [&] { return IdT(values_.Add(std::move(value))); };
return set_.Insert(value, make_key, KeyContext(values_.array_ref())).key();
}

// Storage for StringRefs. The caller is responsible for ensuring storage is
// allocated.
template <>
Expand All @@ -275,14 +267,7 @@ class CanonicalValueStore<StringId>
public:
// Returns an ID to reference the value. May return an existing ID if the
// string was previously added.
auto Add(llvm::StringRef value) -> StringId {
auto [it, inserted] = map_.insert({value, StringId(values_.size())});
if (inserted) {
CARBON_CHECK(it->second.index >= 0) << "Too many unique strings";
values_.push_back(value);
}
return it->second;
}
auto Add(llvm::StringRef value) -> StringId;

// Returns the value for an ID.
auto Get(StringId id) const -> llvm::StringRef {
Expand All @@ -291,12 +276,7 @@ class CanonicalValueStore<StringId>
}

// Returns an ID for the value, or Invalid if not found.
auto Lookup(llvm::StringRef value) const -> StringId {
if (auto it = map_.find(value); it != map_.end()) {
return it->second;
}
return StringId::Invalid;
}
auto Lookup(llvm::StringRef value) const -> StringId;

auto OutputYaml() const -> Yaml::OutputMapping {
return Yaml::OutputMapping([&](Yaml::OutputMapping::Map map) {
Expand All @@ -309,12 +289,47 @@ class CanonicalValueStore<StringId>
auto size() const -> size_t { return values_.size(); }

private:
llvm::DenseMap<llvm::StringRef, StringId> map_;
// Set inline size to 0 because these will typically be too large for the
class KeyContext;

// Set inline sizes to 0 because these will typically be too large for the
// stack, while this does make File smaller.
Set<StringId, /*SmallSize=*/0, KeyContext> set_;
llvm::SmallVector<llvm::StringRef, 0> values_;
};

class CanonicalValueStore<StringId>::KeyContext
: public TranslatingKeyContext<KeyContext> {
public:
explicit KeyContext(llvm::ArrayRef<llvm::StringRef> values)
: values_(values) {}

auto TranslateKey(StringId id) const -> llvm::StringRef {
return values_[id.index];
}

private:
llvm::ArrayRef<llvm::StringRef> values_;
};

inline auto CanonicalValueStore<StringId>::Add(llvm::StringRef value)
-> StringId {
auto make_key = [&] {
auto id = static_cast<StringId>(values_.size());
CARBON_CHECK(id.index >= 0) << "Too many unique strings";
values_.push_back(value);
return id;
};
return set_.Insert(value, make_key, KeyContext(values_)).key();
}

inline auto CanonicalValueStore<StringId>::Lookup(llvm::StringRef value) const
-> StringId {
if (auto result = set_.Lookup(value, KeyContext(values_))) {
return result.key();
}
return StringId::Invalid;
}

// A thin wrapper around a `ValueStore<StringId>` that provides a different IdT,
// while using a unified storage for values. This avoids potentially
// duplicative string hash maps, which are expensive.
Expand Down Expand Up @@ -344,7 +359,7 @@ class StringStoreWrapper : public Printable<StringStoreWrapper<IdT>> {
CanonicalValueStore<StringId>* values_;
};

using FloatValueStore = CanonicalValueStore<FloatId, APFloatDenseMapInfo>;
using FloatValueStore = CanonicalValueStore<FloatId>;

// Stores that will be used across compiler phases for a given compilation unit.
// This is provided mainly so that they don't need to be passed separately.
Expand Down Expand Up @@ -404,14 +419,4 @@ class SharedValueStores : public Yaml::Printable<SharedValueStores> {

} // namespace Carbon

// Support use of IdentifierId as DenseMap/DenseSet keys.
// TODO: Remove once NameId is used in checking.
template <>
struct llvm::DenseMapInfo<Carbon::IdentifierId>
: public Carbon::IndexMapInfo<Carbon::IdentifierId> {};
// Support use of StringId as DenseMap/DenseSet keys.
template <>
struct llvm::DenseMapInfo<Carbon::StringId>
: public Carbon::IndexMapInfo<Carbon::StringId> {};

#endif // CARBON_TOOLCHAIN_BASE_VALUE_STORE_H_
4 changes: 4 additions & 0 deletions toolchain/check/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ cc_library(
deps = [
":node_stack",
"//common:check",
"//common:map",
"//common:vlog",
"//toolchain/base:index_base",
"//toolchain/base:kind_switch",
Expand Down Expand Up @@ -95,6 +96,7 @@ cc_library(
":sem_ir_diagnostic_converter",
"//common:check",
"//common:error",
"//common:map",
"//common:ostream",
"//common:variant_helpers",
"//common:vlog",
Expand Down Expand Up @@ -183,6 +185,7 @@ cc_library(
deps = [
":context",
"//common:check",
"//common:map",
"//toolchain/base:kind_switch",
"//toolchain/parse:node_kind",
"//toolchain/sem_ir:file",
Expand Down Expand Up @@ -281,6 +284,7 @@ cc_library(
deps = [
"//common:check",
"//common:ostream",
"//common:set",
"//common:vlog",
"//toolchain/base:index_base",
"//toolchain/sem_ir:file",
Expand Down
Loading

0 comments on commit 8992d22

Please sign in to comment.