i#4014 drcachesim phys: Improve v2p cache and hash performance
Switches from the drcontainers hashtable to the new open-address
hashtable provided by DR.  This is 2x to 3x faster due to the reduced
dereferences from the inlined data.
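
For context on where the speedup comes from, here is a minimal, self-contained sketch (illustrative only; open_table_t and its members are invented names, not DR's dr_hashtable API) of how an open-addressed table with inlined key/value slots avoids the pointer chasing of a chained table like the drcontainers one:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

class open_table_t {
public:
    explicit open_table_t(size_t bits)
        : slots_(size_t(1) << bits)
    {
    }
    void
    add(uintptr_t key, uintptr_t value)
    {
        const size_t mask = slots_.size() - 1;
        for (size_t i = hash(key) & mask;; i = (i + 1) & mask) {
            if (slots_[i].key == 0 || slots_[i].key == key) {
                slots_[i].key = key;
                slots_[i].value = value;
                return;
            }
        }
    }
    uintptr_t
    lookup(uintptr_t key) const
    {
        const size_t mask = slots_.size() - 1;
        // Linear probing over inline slots: the key and its payload sit in
        // the same cache line, so a hit is typically a single memory access.
        // A chained table must instead follow bucket -> node -> next pointers,
        // each a separate (often cache-missing) load.
        for (size_t i = hash(key) & mask;; i = (i + 1) & mask) {
            if (slots_[i].key == key)
                return slots_[i].value;
            if (slots_[i].key == 0)
                return 0; // Absent (mirrors the nullptr-means-missing convention).
        }
    }

private:
    static size_t
    hash(uintptr_t key)
    {
        // Simple multiplicative hash so page-aligned keys spread out;
        // DR's real table uses its own hash function.
        return static_cast<size_t>(key * 0x9E3779B97F4A7C15ULL);
    }
    struct slot_t {
        uintptr_t key = 0;   // 0 means empty.
        uintptr_t value = 0; // Payload stored inline, next to the key.
    };
    std::vector<slot_t> slots_;
};

int
main()
{
    open_table_t t(/*bits=*/9);
    t.add(0x7f0000001000, 0x12345000);
    assert(t.lookup(0x7f0000001000) == 0x12345000);
    assert(t.lookup(0x7f0000002000) == 0);
    return 0;
}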

Increases the last-value cache from one entry to an array of 8
entries.  This was found to provide improved performance on small
benchmarks.
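
As a sketch of the shape of that cache (for exposition only; the real fields live in physaddr_t as shown in the diff below): an 8-entry array scanned linearly on lookup, with FIFO replacement via a rotating index.

#include <cstdint>
#include <cstring>

typedef uintptr_t addr_t;
static const addr_t PAGE_INVALID = (addr_t)-1;

struct v2p_cache_t {
    static constexpr int NUM_CACHE = 8;
    addr_t last_vpage[NUM_CACHE];
    addr_t last_ppage[NUM_CACHE];
    int cache_idx; // FIFO replacement pointer.

    v2p_cache_t()
        : cache_idx(0)
    {
        // 0xff-fill so every entry starts out as PAGE_INVALID.
        memset(last_vpage, static_cast<char>(PAGE_INVALID), sizeof(last_vpage));
        memset(last_ppage, static_cast<char>(PAGE_INVALID), sizeof(last_ppage));
    }
    bool
    lookup(addr_t vpage, addr_t *ppage_out) const
    {
        // A linear scan of 8 entries stays in L1 and is cheaper than a
        // hashtable probe when the same few pages repeat back-to-back.
        for (int i = 0; i < NUM_CACHE; ++i) {
            if (last_vpage[i] == vpage) {
                *ppage_out = last_ppage[i];
                return true;
            }
        }
        return false;
    }
    void
    insert(addr_t vpage, addr_t ppage)
    {
        // FIFO replacement: overwrite the oldest slot.
        last_vpage[cache_idx] = vpage;
        last_ppage[cache_idx] = ppage;
        cache_idx = (cache_idx + 1) % NUM_CACHE;
    }
};

int
main()
{
    v2p_cache_t cache;
    cache.insert(0x7f0000001000, 0x12345000);
    addr_t ppage;
    return cache.lookup(0x7f0000001000, &ppage) && ppage == 0x12345000 ? 0 : 1;
}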

Measured on local and SPEC2006 bzip2 runs, as they are short enough to
allow interactive experimentation.  -use_physical still incurs a ~2.5x
slowdown, but it was 9x before these changes.  The bottleneck is no
longer the hashtable but is now spread across all the address
iteration and querying code.

Issue: #4014
derekbruening committed Jun 7, 2022
1 parent a3544c2 commit 778c6e0
Showing 3 changed files with 58 additions and 30 deletions.
68 changes: 43 additions & 25 deletions clients/drcachesim/tracer/physaddr.cpp
@@ -60,14 +60,15 @@ static const addr_t PAGE_INVALID = (addr_t)-1;
physaddr_t::physaddr_t()
#ifdef LINUX
: page_size_(dr_page_size())
, last_vpage_(PAGE_INVALID)
, last_ppage_(PAGE_INVALID)
, cache_idx_(0)
, fd_(-1)
, v2p_(nullptr)
, count_(0)
#endif
{
#ifdef LINUX
v2p_.table = nullptr;
memset(last_vpage_, static_cast<char>(PAGE_INVALID), sizeof(last_vpage_));
memset(last_ppage_, static_cast<char>(PAGE_INVALID), sizeof(last_ppage_));

page_bits_ = 0;
size_t temp = page_size_;
@@ -82,8 +83,12 @@ physaddr_t::physaddr_t()
physaddr_t::~physaddr_t()
{
#ifdef LINUX
if (v2p_.table != nullptr)
hashtable_delete(&v2p_);
NOTIFY(1,
"physaddr: hit cache: " UINT64_FORMAT_STRING
", hit table " UINT64_FORMAT_STRING ", miss " UINT64_FORMAT_STRING "\n",
num_hit_cache_, num_hit_table_, num_miss_);
if (v2p_ != nullptr)
dr_hashtable_destroy(dr_get_current_drcontext(), v2p_);
#endif
}

@@ -93,8 +98,12 @@ physaddr_t::init()
#ifdef LINUX
// Some threads may not do much, so start out small.
constexpr int V2P_INITIAL_BITS = 9;
hashtable_init_ex(&v2p_, V2P_INITIAL_BITS, HASH_INTPTR, /*strdup=*/false,
/*synch=*/false, nullptr, nullptr, nullptr);
// The hashtable lookup performance is important.
// A non-open-address hashtable is about 3x slower due to the extra
// loads, and higher resize thresholds are also slower.
// With the setup here, the hashtable lookup is no longer the bottleneck.
v2p_ = dr_hashtable_create(dr_get_current_drcontext(), V2P_INITIAL_BITS, 20,
/*synch=*/false, nullptr);

// We avoid std::ostringstream to avoid malloc use for static linking.
constexpr int MAX_PAGEMAP_FNAME = 64;
@@ -116,7 +125,8 @@ physaddr_t::init()
}

bool
physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache)
physaddr_t::virtual2physical(void *drcontext, addr_t virt, OUT addr_t *phys,
OUT bool *from_cache)
{
#ifdef LINUX
if (phys == nullptr)
@@ -130,35 +140,41 @@ physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache
// XXX i#4014: Provide a similar option that doesn't flush and just checks
// whether mappings have changed?
use_cache = false;
last_vpage_ = PAGE_INVALID;
hashtable_clear(&v2p_);
memset(last_vpage_, static_cast<char>(PAGE_INVALID), sizeof(last_vpage_));
dr_hashtable_clear(drcontext, v2p_);
count_ = 0;
}
if (use_cache) {
// Use cached values on the assumption that the kernel hasn't re-mapped
// this virtual page.
if (vpage == last_vpage_) {
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_ + page_offs(virt);
return true;
for (int i = 0; i < NUM_CACHE; ++i) {
if (vpage == last_vpage_[i]) {
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_[i] + page_offs(virt);
++num_hit_cache_;
return true;
}
}
// XXX i#1703: add (debug-build-only) internal stats here and
// on cache_t::request() fastpath.
void *lookup = hashtable_lookup(&v2p_, reinterpret_cast<void *>(vpage));
void *lookup = dr_hashtable_lookup(drcontext, v2p_, vpage);
if (lookup != nullptr) {
addr_t ppage = reinterpret_cast<addr_t>(lookup);
// Restore a 0 payload.
if (ppage == ZERO_ADDR_PAYLOAD)
ppage = 0;
last_vpage_ = vpage;
last_ppage_ = ppage;
if (from_cache != nullptr)
*from_cache = true;
*phys = last_ppage_ + page_offs(virt);
*phys = ppage + page_offs(virt);
last_vpage_[cache_idx_] = vpage;
last_ppage_[cache_idx_] = ppage;
cache_idx_ = (cache_idx_ + 1) % NUM_CACHE;
++num_hit_table_;
return true;
}
}
++num_miss_;
// Not cached, or forced to re-sync, so we have to read from the file.
if (fd_ == -1) {
NOTIFY(1, "v2p failure: file descriptor is invalid\n");
@@ -190,12 +206,14 @@ physaddr_t::virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache
// a 0 PFN. Since that could be valid, we need to check our capabilities
// to decide. Until then, we're currently returning 0 for every unprivileged query.
// Store 0 as a sentinel since 0 means no entry.
last_ppage_ = (addr_t)((entry & PAGEMAP_PFN) << page_bits_);
hashtable_add(
&v2p_, reinterpret_cast<void *>(vpage),
reinterpret_cast<void *>(last_ppage_ == 0 ? ZERO_ADDR_PAYLOAD : last_ppage_));
last_vpage_ = vpage;
*phys = last_ppage_ + page_offs(virt);
addr_t ppage = (addr_t)((entry & PAGEMAP_PFN) << page_bits_);
// Store 0 as a sentinel since 0 means no entry.
dr_hashtable_add(drcontext, v2p_, vpage,
reinterpret_cast<void *>(ppage == 0 ? ZERO_ADDR_PAYLOAD : ppage));
*phys = ppage + page_offs(virt);
last_ppage_[cache_idx_] = ppage;
last_vpage_[cache_idx_] = vpage;
cache_idx_ = (cache_idx_ + 1) % NUM_CACHE;
NOTIFY(2, "virtual %p => physical %p\n", virt, *phys);
return true;
#else
17 changes: 13 additions & 4 deletions clients/drcachesim/tracer/physaddr.h
@@ -58,7 +58,8 @@ class physaddr_t {
// Returns in "from_cache" whether the physical address had been queried before
// and was available in a local cache (which is cleared at -virt2phys_freq).
bool
virtual2physical(addr_t virt, OUT addr_t *phys, OUT bool *from_cache = nullptr);
virtual2physical(void *drcontext, addr_t virt, OUT addr_t *phys,
OUT bool *from_cache = nullptr);

private:
#ifdef LINUX
@@ -75,8 +76,11 @@

size_t page_size_;
int page_bits_;
addr_t last_vpage_;
addr_t last_ppage_;
static constexpr int NUM_CACHE = 8;
addr_t last_vpage_[NUM_CACHE];
addr_t last_ppage_[NUM_CACHE];
// FIFO replacement.
int cache_idx_;
// TODO i#4014: An app with thousands of threads might hit open file limits,
// and even a hundred threads will use up DR's private FD limit and push
// other files into potential app conflicts.
@@ -86,11 +90,16 @@ class physaddr_t {
int fd_;
// We would use std::unordered_map, but that is not compatible with
// statically linking drmemtrace into an app.
hashtable_t v2p_;
// The drcontainers hashtable is too slow due to the extra dereferences:
// we need an open-addressed table.
void *v2p_;
// With hashtable_t nullptr is how non-existence is shown, so we store
// an actual 0 address (can happen for physical) as this sentinel.
static constexpr addr_t ZERO_ADDR_PAYLOAD = 1;
unsigned int count_;
uint64_t num_hit_cache_;
uint64_t num_hit_table_;
uint64_t num_miss_;
#endif
};

3 changes: 2 additions & 1 deletion clients/drcachesim/tracer/tracer.cpp
@@ -901,7 +901,8 @@ process_buffer_for_physaddr(void *drcontext, per_thread_t *data, size_t header_s
addr_t virt = instru->get_entry_addr(drcontext, mem_ref);
bool from_cache = false;
addr_t phys = 0;
bool success = data->physaddr.virtual2physical(virt, &phys, &from_cache);
bool success =
data->physaddr.virtual2physical(drcontext, virt, &phys, &from_cache);
NOTIFY(4, "%s: type=%2d virt=%p phys=%p\n", __FUNCTION__, type, virt, phys);
if (!success) {
// XXX i#1735: Unfortunately this happens; currently we use the virtual
