Skip to content

Commit

Permalink
[HWASan] allow symbolizer script to index binaries by build id.
Browse files Browse the repository at this point in the history
Tested on an example callstack with misplaced binaries from Android.
Tested Regex against callstack without Build ID to confirm it still works.

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D123437
  • Loading branch information
fmayer committed Apr 11, 2022
1 parent 06285fc commit a0570e7
Showing 1 changed file with 90 additions and 5 deletions.
95 changes: 90 additions & 5 deletions compiler-rt/lib/hwasan/scripts/hwasan_symbolize
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import sys
import string
import subprocess
import argparse
import mmap
import struct
import os

if sys.version_info.major < 3:
# Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
Expand All @@ -31,6 +34,71 @@ if sys.version_info.major < 3:
last_access_address = None
last_access_tag = None

# Below, a parser for a subset of ELF. It only supports 64 bit, little-endian,
# and only parses what is necessary to find the build ids. It uses a memoryview
# into an mmap to avoid copying.
#
# Every helper is bounds-checked: the parser is run over arbitrary files found
# while walking the symbol directories, so truncated or corrupt inputs must
# yield "no build id" (None) instead of raising.

# Elf64_Ehdr: total size and the byte offsets of the fields that are read.
Ehdr_size = 64
e_shnum_offset = 60
e_shoff_offset = 40

# Elf64_Shdr: total size and the byte offsets of the fields that are read.
Shdr_size = 64
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
SHT_NOTE = 7

# Elf64_Nhdr: three 32-bit words (n_namesz, n_descsz, n_type).
Nhdr_size = 12
NT_GNU_BUILD_ID = 3


def align_up(size, alignment):
    """Round size up to the next multiple of alignment.

    alignment must be a power of two; the result is unspecified otherwise.
    """
    return (size + alignment - 1) & ~(alignment - 1)


def handle_Nhdr(mv, sh_size):
    """Search the contents of one SHT_NOTE section for the GNU build id.

    mv is a bytes-like view of the section contents and sh_size is the size
    the section header declares for it. Returns the build id as a lowercase
    hex string, or None if the section holds no complete GNU build-id note.
    """
    # A truncated file yields a view shorter than the declared size; never
    # read past the bytes that are really there.
    end = min(sh_size, len(mv))
    offset = 0
    while offset + Nhdr_size <= end:
        n_namesz, n_descsz, n_type = struct.unpack_from('<III', mv, offset)
        name_start = offset + Nhdr_size
        if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
                mv[name_start:name_start + 4] == b"GNU\x00"):
            # The name is exactly four bytes, so the descriptor follows it
            # without padding.
            desc_start = name_start + 4
            desc_end = desc_start + n_descsz
            if desc_end > end:
                # Descriptor is cut off: a partial id would be misleading.
                return None
            return mv[desc_start:desc_end].hex()
        # Name and descriptor are each padded to a 4-byte boundary.
        offset = name_start + align_up(n_namesz, 4) + align_up(n_descsz, 4)
    return None


def handle_Shdr(mv):
    """Decode one section header.

    mv is a bytes-like view of the header. Returns (sh_offset, sh_size) when
    the section is of type SHT_NOTE, and (None, None) for every other section
    type or when mv is too short to hold a full header.
    """
    if len(mv) < Shdr_size:
        return None, None
    sh_type, = struct.unpack_from('<I', mv, sh_type_offset)
    if sh_type != SHT_NOTE:
        return None, None
    sh_offset, = struct.unpack_from('<Q', mv, sh_offset_offset)
    sh_size, = struct.unpack_from('<Q', mv, sh_size_offset)
    return sh_offset, sh_size


def handle_elf(mv):
    """Return the GNU build id of the ELF image in mv, or None.

    None is returned when mv is not a 64-bit little-endian ELF image, when it
    is too short to contain an ELF header, or when no note section carries a
    build id.
    """
    # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
    # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
    # have to extend the parsing code.
    if len(mv) < Ehdr_size or mv[:6] != b'\x7fELF\x02\x01':
        return None
    e_shnum, = struct.unpack_from('<H', mv, e_shnum_offset)
    e_shoff, = struct.unpack_from('<Q', mv, e_shoff_offset)
    total = len(mv)
    for i in range(e_shnum):
        start = e_shoff + i * Shdr_size
        if start + Shdr_size > total:
            # The section header table runs past the end of the file; all
            # remaining entries would be out of range as well.
            break
        sh_offset, sh_size = handle_Shdr(mv[start:start + Shdr_size])
        if sh_offset is None:
            continue
        # Slicing clamps to the buffer, so a bogus offset or size simply
        # produces a short (possibly empty) view.
        result = handle_Nhdr(mv[sh_offset:sh_offset + sh_size], sh_size)
        if result is not None:
            return result
    return None


def get_buildid(filename):
    """Return the GNU build id of the file at filename as a hex string.

    Returns None if the file cannot be opened or mapped, is smaller than an
    ELF header, is not a 64-bit little-endian ELF file, or has no build id.
    """
    try:
        # Binary mode: the file is only mapped, never decoded as text.
        with open(filename, "rb") as fd:
            if os.fstat(fd.fileno()).st_size < Ehdr_size:
                return None
            with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
                with memoryview(m) as mv:
                    return handle_elf(mv)
    except (OSError, ValueError):
        # Unreadable entries (broken symlinks, missing permissions, files
        # that cannot be mapped) are skipped so that one bad file does not
        # abort a whole directory scan.
        return None

class Symbolizer:
def __init__(self, path, binary_prefixes, paths_to_cut):
self.__pipe = None
Expand All @@ -39,6 +107,7 @@ class Symbolizer:
self.__paths_to_cut = paths_to_cut
self.__log = False
self.__warnings = set()
self.__index = {}

def enable_logging(self, enable):
self.__log = enable
Expand Down Expand Up @@ -77,9 +146,12 @@ class Symbolizer:
file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
return file_name

def __process_binary_name(self, name):
def __process_binary_name(self, name, buildid=None):
if name.startswith('/'):
name = name[1:]
if buildid is not None and buildid in self.__index:
return self.__index[buildid]

for p in self.__binary_prefixes:
full_path = os.path.join(p, name)
if os.path.exists(full_path):
Expand Down Expand Up @@ -121,10 +193,10 @@ class Symbolizer:
except Symbolizer.__EOF:
pass

def iter_call_stack(self, binary, addr):
def iter_call_stack(self, binary, buildid, addr):
self.__open_pipe()
p = self.__pipe
binary = self.__process_binary_name(binary)
binary = self.__process_binary_name(binary, buildid)
if not binary:
return
self.__write("CODE %s %s" % (binary, addr))
Expand All @@ -137,15 +209,25 @@ class Symbolizer:
except Symbolizer.__EOF:
pass

def build_index(self):
    """Fill the build-id index from the files under every binary prefix.

    Each binary prefix is walked recursively and get_buildid is called on
    every file found. Files that yield a build id are recorded as
    build id -> path; if several files share an id, the one visited last
    is kept.
    """
    for prefix in self.__binary_prefixes:
        for root, _subdirs, files in os.walk(prefix):
            for candidate in (os.path.join(root, entry) for entry in files):
                build_id = get_buildid(candidate)
                if build_id is None:
                    # No build id could be read from this file.
                    continue
                self.__index[build_id] = candidate

def symbolize_line(line, symbolizer_path):
#0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)
match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)', line, re.UNICODE)
match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
if match:
frameno = match.group(2)
binary = match.group(5)
addr = int(match.group(6), 16)
buildid = match.group(7)

frames = list(symbolizer.iter_call_stack(binary, addr))
frames = list(symbolizer.iter_call_stack(binary, buildid, addr))

if len(frames) > 0:
print("%s#%s%s%s in %s" % (match.group(1), match.group(2),
Expand Down Expand Up @@ -210,6 +292,7 @@ parser.add_argument('-v', action='store_true')
parser.add_argument('--ignore-tags', action='store_true')
parser.add_argument('--symbols', action='append')
parser.add_argument('--source', action='append')
parser.add_argument('--index', action='store_true')
parser.add_argument('--symbolizer')
parser.add_argument('args', nargs=argparse.REMAINDER)
args = parser.parse_args()
Expand Down Expand Up @@ -297,6 +380,8 @@ if args.v:

symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
symbolizer.enable_logging(args.d)
if args.index:
symbolizer.build_index()

for line in sys.stdin:
if sys.version_info.major < 3:
Expand Down

0 comments on commit a0570e7

Please sign in to comment.