Skip to content

Commit

Permalink
Introduce per-DSO cache for symbol lookup via dwfl_module_addrinfo
Browse files Browse the repository at this point in the history
The symbol table isn't necessarily sorted, and thus repeated lookups
in there can be expensive when a DSO has many entries in its symtab.

For example, the librustc_driver from rustc 1.40.0 has about 202594
symbols. A single call to dwfl_module_addrinfo can take milliseconds
on my laptop. Every time we get a sample at a so far unknown address,
we have to find the corresponding symbol. So we called this function
a lot, which can add up to a significant amount of time. Now, we
cache the symbol name and its offset and size information in a sorted
list and try to look up the symbol there quickly. The impact of this
patch on the overall time required to analyze a ~1GB perf.data file
for a `cargo build` process (and its child processes) is huge:

before:
```
        447.681,66 msec task-clock:u              #    0,989 CPUs utilized
                 0      context-switches:u        #    0,000 K/sec
                 0      cpu-migrations:u          #    0,000 K/sec
            45.214      page-faults:u             #    0,101 K/sec
 1.272.289.956.854      cycles:u                  #    2,842 GHz
 3.497.255.264.964      instructions:u            #    2,75  insn per cycle
   863.671.557.196      branches:u                # 1929,209 M/sec
     2.666.320.642      branch-misses:u           #    0,31% of all branches

     452,806895428 seconds time elapsed

     441,996666000 seconds user
       2,557237000 seconds sys
```

after:

```
         63.770,08 msec task-clock:u              #    0,995 CPUs utilized
                 0      context-switches:u        #    0,000 K/sec
                 0      cpu-migrations:u          #    0,000 K/sec
            35.102      page-faults:u             #    0,550 K/sec
   191.267.750.628      cycles:u                  #    2,999 GHz
   501.316.536.714      instructions:u            #    2,62  insn per cycle
   122.234.405.333      branches:u                # 1916,799 M/sec
       443.671.470      branch-misses:u           #    0,36% of all branches

      64,063443896 seconds time elapsed

      62,188041000 seconds user
       1,136533000 seconds sys
```

That means we are now roughly 7x faster than before.

Fixes: KDAB/hotspot#225
Change-Id: Ib7dbc800c9372044a847de68a8459dd7f7b0d3da
Reviewed-by: Ulf Hermann <[email protected]>
  • Loading branch information
milianw committed Jan 9, 2020
1 parent 807cccb commit 69e2662
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 6 deletions.
35 changes: 35 additions & 0 deletions app/perfaddresscache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,38 @@ void PerfAddressCache::cache(const PerfElfMap::ElfInfo& elf, quint64 addr,
else
(*invalidAddressCache)[addr] = entry;
}

// Heterogeneous ordering between a cached symbol and a plain address, keyed on
// the symbol's start offset. This is what lets std::lower_bound search a
// SymbolCache directly with a quint64 instead of a temporary entry.
static bool operator<(const PerfAddressCache::SymbolCacheEntry &lhs, quint64 addr)
{
    return addr > lhs.offset;
}

// Looks up the cached symbol that covers `addr` inside the given ELF.
//
// The per-file symbol list is kept ordered by offset, so a binary search finds
// the first entry starting at or after the (file-relative) address. Either that
// entry starts exactly at the address, or the entry right before it may span
// the address. A default-constructed entry (isValid() == false) is returned
// when nothing in the cache covers the address.
PerfAddressCache::SymbolCacheEntry PerfAddressCache::findSymbol(const PerfElfMap::ElfInfo& elf,
                                                                quint64 addr) const
{
    Q_ASSERT(elf.isValid());

    const auto &entries = m_symbolCache.value(elf.originalPath);
    const auto target = relativeAddress(elf, addr);

    const auto first = entries.begin();
    const auto last = entries.end();
    const auto candidate = std::lower_bound(first, last, target);

    // Exact hit on the start of a symbol.
    if (candidate != last && candidate->offset == target)
        return *candidate;

    // Nothing starts before the address, so nothing can contain it.
    if (candidate == first)
        return {};

    // Otherwise only the closest preceding symbol can contain the address.
    const auto &preceding = *(candidate - 1);
    const bool containsTarget = preceding.offset <= target
        && preceding.offset + preceding.size > target;
    return containsTarget ? preceding : SymbolCacheEntry{};
}

// Remembers a symbol (start address, size and name) for the given ELF so that
// later findSymbol() calls can resolve addresses without consulting the symtab.
//
// `startAddr` is converted to a file-relative offset and the per-file list is
// kept ordered by that offset, which is what findSymbol()'s binary search
// relies on.
//
// A given offset is stored at most once: if it is already present, the entry is
// refreshed in place instead of inserting a duplicate. Re-caching does happen in
// practice, because zero-sized symbols never yield a valid cache hit
// (SymbolCacheEntry::isValid() requires size != 0), so a caller that caches on
// every miss will offer the same symbol again. Without this check the list
// would accumulate duplicates, and findSymbol() would pick different duplicates
// for exact and for interior matches.
void PerfAddressCache::cacheSymbol(const PerfElfMap::ElfInfo& elf, quint64 startAddr, quint64 size,
                                   const QByteArray& symname)
{
    Q_ASSERT(elf.isValid());
    auto &symbols = m_symbolCache[elf.originalPath];
    const auto offset = relativeAddress(elf, startAddr);
    auto it = std::lower_bound(symbols.begin(), symbols.end(), offset);

    if (it != symbols.end() && it->offset == offset) {
        // Already known: keep a single entry, let the most recent data win.
        it->size = size;
        it->symname = symname;
        return;
    }

    symbols.insert(it, {offset, size, symname});
}
24 changes: 24 additions & 0 deletions app/perfaddresscache.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
#define PERFADDRESSCACHE_H

#include <QHash>
#include <QVector>

#include "perfelfmap.h"

class PerfAddressCache
Expand All @@ -38,12 +40,34 @@ class PerfAddressCache
};
using OffsetAddressCache = QHash<quint64, AddressCacheEntry>;

// One cached symbol of a DSO: where it starts (relative to the ELF), how many
// bytes it spans and its name.
struct SymbolCacheEntry
{
    // All arguments are defaulted so that `{}` yields an empty, invalid entry
    // and `{offset, size, symname}` can be used for brace-initialization.
    SymbolCacheEntry(quint64 offset = 0, quint64 size = 0, const QByteArray &symname = {})
        : offset(offset), size(size), symname(symname)
    {
    }

    // An entry only counts as valid when it spans at least one byte; this
    // means default-constructed and zero-sized entries are both invalid.
    bool isValid() const
    {
        return size > 0;
    }

    quint64 offset;     // start of the symbol, relative to the ELF
    quint64 size;       // length of the symbol in bytes
    QByteArray symname; // symbol name as stored by cacheSymbol()
};
using SymbolCache = QVector<SymbolCacheEntry>;

AddressCacheEntry find(const PerfElfMap::ElfInfo& elf, quint64 addr,
OffsetAddressCache *invalidAddressCache) const;
void cache(const PerfElfMap::ElfInfo& elf, quint64 addr,
const AddressCacheEntry& entry, OffsetAddressCache *invalidAddressCache);

// Returns the cached symbol of `elf` whose range contains `addr`, or an entry
// for which isValid() is false when no such symbol has been cached.
// `elf` must be valid (asserted).
SymbolCacheEntry findSymbol(const PerfElfMap::ElfInfo &elf, quint64 addr) const;
// Caches a symbol of `elf` starting at `startAddr`, spanning `size` bytes and
// named `symname`. The per-ELF symbol list is kept ordered by start offset.
// `elf` must be valid (asserted).
void cacheSymbol(const PerfElfMap::ElfInfo &elf, quint64 startAddr, quint64 size,
const QByteArray &symname);
private:
QHash<QByteArray, OffsetAddressCache> m_cache;
QHash<QByteArray, SymbolCache> m_symbolCache;
};
Q_DECLARE_TYPEINFO(PerfAddressCache::SymbolCacheEntry, Q_MOVABLE_TYPE);

#endif
22 changes: 16 additions & 6 deletions app/perfsymboltable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -894,8 +894,10 @@ static QByteArray fakeSymbolFromSection(Dwfl_Module *mod, Dwarf_Addr addr)
int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
bool *isInterworking)
{
auto addressCache = m_unwind->addressCache();

const auto& elf = findElf(ip);
auto cached = m_unwind->addressCache()->find(elf, ip, &m_invalidAddressCache);
auto cached = addressCache->find(elf, ip, &m_invalidAddressCache);
if (cached.isValid()) {
*isInterworking = cached.isInterworking;
return cached.locationId;
Expand All @@ -918,13 +920,21 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,
PerfUnwind::Location functionLocation(addressLocation);

QByteArray symname;
GElf_Sym sym;
GElf_Off off = 0;

if (mod) {
// For addrinfo we need the raw pointer into symtab, so we need to adjust ourselves.
symname = dwfl_module_addrinfo(mod, addressLocation.address, &off, &sym, nullptr, nullptr,
nullptr);
auto cachedAddrInfo = addressCache->findSymbol(elf, addressLocation.address);
if (cachedAddrInfo.isValid()) {
off = addressLocation.address - elf.addr - cachedAddrInfo.offset;
symname = cachedAddrInfo.symname;
} else {
GElf_Sym sym;
// For addrinfo we need the raw pointer into symtab, so we need to adjust ourselves.
symname = dwfl_module_addrinfo(mod, addressLocation.address, &off, &sym, nullptr, nullptr,
nullptr);
if (off != addressLocation.address)
addressCache->cacheSymbol(elf, addressLocation.address - off, sym.st_size, symname);
}

if (off == addressLocation.address) {// no symbol found
symname = fakeSymbolFromSection(mod, addressLocation.address);
Expand Down Expand Up @@ -1023,7 +1033,7 @@ int PerfSymbolTable::lookupFrame(Dwarf_Addr ip, bool isKernel,

int locationId = m_unwind->resolveLocation(addressLocation);
*isInterworking = (symname == "$a" || symname == "$t");
m_unwind->addressCache()->cache(elf, ip, {locationId, *isInterworking}, &m_invalidAddressCache);
addressCache->cache(elf, ip, {locationId, *isInterworking}, &m_invalidAddressCache);
return locationId;
}

Expand Down
27 changes: 27 additions & 0 deletions tests/auto/addresscache/tst_addresscache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,33 @@ private slots:
PerfAddressCache::OffsetAddressCache invalidAddressCache;
QCOMPARE(cache.find(PerfElfMap::ElfInfo{}, 0x123, &invalidAddressCache).locationId, -1);
}

void testSymbolCache()
{
PerfElfMap::ElfInfo info_a{{}, 0x100, 100, 0,
QByteArrayLiteral("libfoo_a.so"),
QByteArrayLiteral("/usr/lib/libfoo_a.so")};
PerfElfMap::ElfInfo info_b{{}, 0x200, 100, 0,
QByteArrayLiteral("libfoo_b.so"),
QByteArrayLiteral("/usr/lib/libfoo_b.so")};

PerfAddressCache cache;

QVERIFY(!cache.findSymbol(info_a, 0x100).isValid());
QVERIFY(!cache.findSymbol(info_b, 0x100).isValid());

cache.cacheSymbol(info_a, 0x100, 10, "Foo");
for (auto addr : {0x100, 0x100 + 9}) {
const auto cached = cache.findSymbol(info_a, addr);
QVERIFY(cached.isValid());
QCOMPARE(cached.offset, 0);
QCOMPARE(cached.size, 10);
QCOMPARE(cached.symname, "Foo");
}
QVERIFY(!cache.findSymbol(info_a, 0x100 + 10).isValid());
QVERIFY(!cache.findSymbol(info_b, 0x100).isValid());
QVERIFY(!cache.findSymbol(info_b, 0x100 + 9).isValid());
}
};

QTEST_GUILESS_MAIN(TestAddressCache)
Expand Down

0 comments on commit 69e2662

Please sign in to comment.