i#4014 drcachesim phys: Improve v2p cache and hash performance
Switches from the drcontainers hashtable to the new open-address
hashtable provided by DR.  This is 2x to 3x faster due to the reduced
dereferences from the inlined data.
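
For context on where the speedup comes from, here is a minimal, self-contained sketch (illustrative only; open_table_t and its members are invented names, not DR's dr_hashtable API) of how an open-addressed table with inlined key/value slots avoids the pointer chasing of a chained table like the drcontainers one:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

class open_table_t {
public:
    explicit open_table_t(size_t bits)
        : slots_(size_t(1) << bits)
    {
    }
    void
    add(uintptr_t key, uintptr_t value)
    {
        const size_t mask = slots_.size() - 1;
        for (size_t i = hash(key) & mask;; i = (i + 1) & mask) {
            if (slots_[i].key == 0 || slots_[i].key == key) {
                slots_[i].key = key;
                slots_[i].value = value;
                return;
            }
        }
    }
    uintptr_t
    lookup(uintptr_t key) const
    {
        const size_t mask = slots_.size() - 1;
        // Linear probing over inline slots: the key and its payload sit in
        // the same cache line, so a hit is typically a single memory access.
        // A chained table must instead follow bucket -> node -> next pointers,
        // each a separate (often cache-missing) load.
        for (size_t i = hash(key) & mask;; i = (i + 1) & mask) {
            if (slots_[i].key == key)
                return slots_[i].value;
            if (slots_[i].key == 0)
                return 0; // Absent (mirrors the nullptr-means-missing convention).
        }
    }

private:
    static size_t
    hash(uintptr_t key)
    {
        // Simple multiplicative hash so page-aligned keys spread out;
        // DR's real table uses its own hash function.
        return static_cast<size_t>(key * 0x9E3779B97F4A7C15ULL);
    }
    struct slot_t {
        uintptr_t key = 0;   // 0 means empty.
        uintptr_t value = 0; // Payload stored inline, next to the key.
    };
    std::vector<slot_t> slots_;
};

int
main()
{
    open_table_t t(/*bits=*/9);
    t.add(0x7f0000001000, 0x12345000);
    assert(t.lookup(0x7f0000001000) == 0x12345000);
    assert(t.lookup(0x7f0000002000) == 0);
    return 0;
}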

Increases the last-value cache from one entry to an array of 8
entries.  This was found to provide improved performance on small
benchmarks.
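
As a sketch of the shape of that cache (for exposition only; the real fields live in physaddr_t as shown in the diff below): an 8-entry array scanned linearly on lookup, with FIFO replacement via a rotating index.

#include <cstdint>
#include <cstring>

typedef uintptr_t addr_t;
static const addr_t PAGE_INVALID = (addr_t)-1;

struct v2p_cache_t {
    static constexpr int NUM_CACHE = 8;
    addr_t last_vpage[NUM_CACHE];
    addr_t last_ppage[NUM_CACHE];
    int cache_idx; // FIFO replacement pointer.

    v2p_cache_t()
        : cache_idx(0)
    {
        // 0xff-fill so every entry starts out as PAGE_INVALID.
        memset(last_vpage, static_cast<char>(PAGE_INVALID), sizeof(last_vpage));
        memset(last_ppage, static_cast<char>(PAGE_INVALID), sizeof(last_ppage));
    }
    bool
    lookup(addr_t vpage, addr_t *ppage_out) const
    {
        // A linear scan of 8 entries stays in L1 and is cheaper than a
        // hashtable probe when the same few pages repeat back-to-back.
        for (int i = 0; i < NUM_CACHE; ++i) {
            if (last_vpage[i] == vpage) {
                *ppage_out = last_ppage[i];
                return true;
            }
        }
        return false;
    }
    void
    insert(addr_t vpage, addr_t ppage)
    {
        // FIFO replacement: overwrite the oldest slot.
        last_vpage[cache_idx] = vpage;
        last_ppage[cache_idx] = ppage;
        cache_idx = (cache_idx + 1) % NUM_CACHE;
    }
};

int
main()
{
    v2p_cache_t cache;
    cache.insert(0x7f0000001000, 0x12345000);
    addr_t ppage;
    return cache.lookup(0x7f0000001000, &ppage) && ppage == 0x12345000 ? 0 : 1;
}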

Measured on local and SPEC2006 bzip2 runs, as they are short enough to
allow interactive experimentation.  -use_physical still incurs a ~2.5x
slowdown, but it was 9x before these changes.  The bottleneck is no
longer the hashtable but is now spread across all the address
iteration and querying code.

Issue: #4014
derekbruening committed Jun 7, 2022
1 parent a3544c2 commit 778c6e0
Showing 3 changed files with 58 additions and 30 deletions.
68 changes: 43 additions & 25 deletions clients/drcachesim/tracer/physaddr.cpp
@@ -60,14 +60,15 @@ static const addr_t PAGE_INVALID = (addr_t)-1;
physaddr_t::physaddr_t()
#ifdef LINUX
: page_size_(dr_page_size())
, last_vpage_(PAGE_INVALID)
, last_ppage_(PAGE_INVALID)
, cache_idx_(0)
, fd_(-1)
, v2p_(nullptr)
, count_(0)
#endif
{
#ifdef LINUX
v2p_.table = nullptr;
memset(last_vpage_, static_cast<char>(PAGE_INVALID), sizeof(last_vpage_));
memset(last_ppage_, static_cast<char>(PAGE_INVALID), sizeof(last_ppage_));

page_bits_ = 0;
size_t temp = page_size_;
@@ -82,8 +83,12 @@ physaddr_t::physaddr_t()
physaddr_t::~physaddr_t()
{
#ifdef LINUX
if (v2p_.table != nullptr)
hashtable_delete(&v2p_);
NOTIFY(1,
"physaddr: hit cache: " UINT64_FORMAT_STRING
", hit table " UINT64_FORMAT_STRING ", miss " UINT64_FORMAT_STRING "\n",
num_hit_cache_, num_hit_table_, num_miss_);
if (v2p_ != nullptr)
dr_hashtable_destroy(dr_get_current_drcontext(), v2p_);
#endif
}

@@ -93,8 +98,12 @@ physaddr_t::init()
#ifdef LINUX
// Some threads may not do much, so start out small.
constexpr int V2P_INITIAL_BITS = 9;
hashtable_init_ex(&v2p_, V2P_INITIAL_BITS, HASH_INTPTR, /*strdup=*/false,
/*synch=*/false, nullptr, nullptr, nullptr);
// The hashtable lookup performance is important.
// A non-open-address hashtable is about 3x slower due to the extra
// loads, and higher resize thresholds are also slower.
// With the setup here, the hashtable lookup is no longer the bottleneck.
v2p_ = dr_hashtable_create(dr_get_current_drcontext(), V2P_INITIAL_BITS, 20,
/*synch=*/false, nullptr);

// We avoid std::ostringstream to avoid malloc use for static linking.
constexpr int MAX_PAGEMAP_FNAME = 64;
@@ -116,7 +125,8 @@ physaddr_t::init()
}

bool
physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache)
physaddr_t::virtual2physical(void *drcontext, addr_t virt, OUT addr_t *phys,
OUT bool *from_cache)
{
#ifdef LINUX
if (phys == nullptr)
@@ -130,35 +140,41 @@ physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache
// XXX i#4014: Provide a similar option that doesn't flush and just checks
// whether mappings have changed?
use_cache = false;
last_vpage_ = PAGE_INVALID;
hashtable_clear(&v2p_);
memset(last_vpage_, static_cast<char>(PAGE_INVALID), sizeof(last_vpage_));
dr_hashtable_clear(drcontext, v2p_);
count_ = 0;
}
if (use_cache) {
// Use cached values on the assumption that the kernel hasn't re-mapped
// this virtual page.
if (vpage == last_vpage_) {
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_ + page_offs(virt);
return true;
for (int i = 0; i < NUM_CACHE; ++i) {
if (vpage == last_vpage_[i]) {
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_[i] + page_offs(virt);
++num_hit_cache_;
return true;
}
}
// XXX i#1703: add (debug-build-only) internal stats here and
// on cache_t::request() fastpath.
void *lookup = hashtable_lookup(&v2p_, reinterpret_cast<void *>(vpage));
void *lookup = dr_hashtable_lookup(drcontext, v2p_, vpage);
if (lookup != nullptr) {
addr_t ppage = reinterpret_cast<addr_t>(lookup);
// Restore a 0 payload.
if (ppage == ZERO_ADDR_PAYLOAD)
ppage = 0;
last_vpage_ = vpage;
last_ppage_ = ppage;
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_ + page_offs(virt);
*phys = ppage + page_offs(virt);
last_vpage_[cache_idx_] = vpage;
last_ppage_[cache_idx_] = ppage;
cache_idx_ = (cache_idx_ + 1) % NUM_CACHE;
++num_hit_table_;
return true;
}
}
++num_miss_;
// Not cached, or forced to re-sync, so we have to read from the file.
if (fd_ == -1) {
NOTIFY(1, "v2p failure: file descriptor is invalid\n");
@@ -190,12 +206,14 @@ physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache
// a 0 PFN. Since that could be valid, we need to check our capabilities
// to decide. Until then, we're currently returning 0 for every unprivileged query.
// Store 0 as a sentinel since 0 means no entry.
last_ppage_ = (addr_t)((entry & PAGEMAP_PFN) << page_bits_);
hashtable_add(
&v2p_, reinterpret_cast<void *>(vpage),
reinterpret_cast<void *>(last_ppage_ == 0 ? ZERO_ADDR_PAYLOAD : last_ppage_));
last_vpage_ = vpage;
*phys = last_ppage_ + page_offs(virt);
addr_t ppage = (addr_t)((entry & PAGEMAP_PFN) << page_bits_);
// Store 0 as a sentinel since 0 means no entry.
dr_hashtable_add(drcontext, v2p_, vpage,
reinterpret_cast<void *>(ppage == 0 ? ZERO_ADDR_PAYLOAD : ppage));
*phys = ppage + page_offs(virt);
last_ppage_[cache_idx_] = ppage;
last_vpage_[cache_idx_] = vpage;
cache_idx_ = (cache_idx_ + 1) % NUM_CACHE;
NOTIFY(2, "virtual %p => physical %p\n", virt, *phys);
return true;
#else
17 changes: 13 additions & 4 deletions clients/drcachesim/tracer/physaddr.h
@@ -58,7 +58,8 @@ class physaddr_t {
// Returns in "from_cache" whether the physical address had been queried before
// and was available in a local cache (which is cleared at -virt2phys_freq).
bool
virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache = nullptr);
virtual2physical(void *drcontext, addr_t virt, OUT addr_t *phys,
OUT bool *from_cache = nullptr);

private:
#ifdef LINUX
@@ -75,8 +76,11 @@

size_t page_size_;
int page_bits_;
addr_t last_vpage_;
addr_t last_ppage_;
static constexpr int NUM_CACHE = 8;
addr_t last_vpage_[NUM_CACHE];
addr_t last_ppage_[NUM_CACHE];
// FIFO replacement.
int cache_idx_;
// TODO i#4014: An app with thousands of threads might hit open file limits,
// and even a hundred threads will use up DR's private FD limit and push
// other files into potential app conflicts.
@@ -86,11 +90,16 @@ class physaddr_t {
int fd_;
// We would use std::unordered_map, but that is not compatible with
// statically linking drmemtrace into an app.
hashtable_t v2p_;
// The drcontainers hashtable is too slow due to the extra dereferences:
// we need an open-addressed table.
void *v2p_;
// With hashtable_t nullptr is how non-existence is shown, so we store
// an actual 0 address (can happen for physical) as this sentinel.
static constexpr addr_t ZERO_ADDR_PAYLOAD = 1;
unsigned int count_;
uint64_t num_hit_cache_;
uint64_t num_hit_table_;
uint64_t num_miss_;
#endif
};

3 changes: 2 additions & 1 deletion clients/drcachesim/tracer/tracer.cpp
@@ -901,7 +901,8 @@ process_buffer_for_physaddr(void *drcontext, per_thread_t *data, size_t header_s
addr_t virt = instru->get_entry_addr(drcontext, mem_ref);
bool from_cache = false;
addr_t phys = 0;
bool success = data->physaddr.virtual2physical(virt, &phys, &from_cache);
bool success =
data->physaddr.virtual2physical(drcontext, virt, &phys, &from_cache);
NOTIFY(4, "%s: type=%2d virt=%p phys=%p\n", __FUNCTION__, type, virt, phys);
if (!success) {
// XXX i#1735: Unfortunately this happens; currently we use the virtual
