[HWASan] allow symbolizer script to index binaries by build id.

Tested on an example callstack with misplaced binaries from Android.
Tested Regex against callstack without Build ID to confirm it still works.

Reviewed By: eugenis

Differential Revision: https://reviews.llvm.org/D123437
This commit is contained in:
Florian Mayer 2022-04-08 20:01:53 -07:00
parent 06285fc9fd
commit a0570e7750
1 changed files with 90 additions and 5 deletions

View File

@ -21,6 +21,9 @@ import sys
import string
import subprocess
import argparse
import mmap
import struct
import os
if sys.version_info.major < 3:
# Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
@ -31,6 +34,71 @@ if sys.version_info.major < 3:
last_access_address = None
last_access_tag = None
# Minimal ELF reader used only to locate build ids. It understands 64-bit,
# little-endian images and nothing more, and works on a memoryview over an
# mmap so that file contents are never copied.

# ELF file header (Ehdr): total size and the byte offsets of the fields read.
Ehdr_size = 64
e_shoff_offset = 40   # file offset of the section header table
e_shnum_offset = 60   # number of entries in the section header table

# Section header (Shdr): entry size and the byte offsets of the fields read.
Shdr_size = 64
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
SHT_NOTE = 7          # sh_type value identifying a note section

# Note header (Nhdr): three 32-bit words (namesz, descsz, type).
Nhdr_size = 12
NT_GNU_BUILD_ID = 3   # note type carrying the GNU build id
def align_up(size, alignment):
    """Round ``size`` up to a multiple of ``alignment``.

    The result is computed with a bit mask, so it is only a true multiple
    when ``alignment`` is a power of two.
    """
    mask = alignment - 1
    return (size + mask) & ~mask
def handle_Nhdr(mv, sh_size):
    """Scan the notes of one note section for a GNU build id.

    Args:
      mv: memoryview positioned at the first byte of the note section.
      sh_size: size of the section in bytes, as declared by its section
        header.

    Returns:
      The build id as a lowercase hex string, or None when the section holds
      no GNU build-id note or the note is truncated.
    """
    # The declared size may exceed the bytes actually present (truncated or
    # corrupt file); never read beyond what the view really holds.
    end = min(sh_size, len(mv))
    offset = 0
    # Require room for a complete note header so that trailing padding or a
    # cut-off note ends the scan instead of raising struct.error.
    while offset + Nhdr_size <= end:
        n_namesz, n_descsz, n_type = struct.unpack_from('<III', mv, offset)
        name_start = offset + Nhdr_size
        # Name and descriptor are each padded to a 4-byte boundary.
        desc_start = name_start + align_up(n_namesz, 4)
        if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
                mv[name_start:name_start + 4] == b"GNU\x00"):
            desc_end = desc_start + n_descsz
            if desc_end > end:
                # Descriptor is cut off; a partial id would never match.
                return None
            return mv[desc_start:desc_end].hex()
        offset = desc_start + align_up(n_descsz, 4)
    return None
def handle_Shdr(mv):
    """Inspect a single section header.

    Args:
      mv: memoryview covering one section header entry.

    Returns:
      ``(sh_offset, sh_size)`` when the header describes a note section,
      otherwise ``(None, None)``.
    """
    section_type = struct.unpack_from('<I', mv, sh_type_offset)[0]
    if section_type == SHT_NOTE:
        file_offset = struct.unpack_from('<Q', mv, sh_offset_offset)[0]
        byte_count = struct.unpack_from('<Q', mv, sh_size_offset)[0]
        return file_offset, byte_count
    return None, None
def handle_elf(mv):
    """Return the GNU build id stored in an ELF image.

    Args:
      mv: memoryview over the complete file contents.

    Returns:
      The build id as a lowercase hex string, or None when the buffer is not
      a 64-bit little-endian ELF image, is too short to hold the structures
      being read, or contains no GNU build-id note.
    """
    # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
    # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we
    # will have to extend the parsing code.
    if len(mv) < Ehdr_size or mv[:6] != b'\x7fELF\x02\x01':
        return None
    e_shnum, = struct.unpack_from('<H', mv, e_shnum_offset)
    e_shoff, = struct.unpack_from('<Q', mv, e_shoff_offset)
    for i in range(e_shnum):
        start = e_shoff + i * Shdr_size
        if start + Shdr_size > len(mv):
            # The section header table runs past the end of the data
            # (truncated or corrupt image); nothing further can be parsed.
            break
        sh_offset, sh_size = handle_Shdr(mv[start:start + Shdr_size])
        if sh_offset is None:
            continue
        note_hdr = mv[sh_offset:sh_offset + sh_size]
        # Slicing clamps to the available data, so pass the real length
        # rather than the possibly larger declared section size.
        result = handle_Nhdr(note_hdr, len(note_hdr))
        if result is not None:
            return result
    return None
def get_buildid(filename):
    """Read the GNU build id of the file at ``filename``.

    Args:
      filename: path of the file to inspect.

    Returns:
      Whatever handle_elf() returns for the mapped file contents, or None
      when the file is smaller than an ELF header.

    Raises:
      OSError: if the file cannot be opened or mapped.
    """
    # ELF images are binary data: open in binary mode so no text decoding
    # layer is put in front of the descriptor that gets mapped.
    with open(filename, "rb") as fd:
        # Also keeps empty files away from mmap, which refuses to map them.
        if os.fstat(fd.fileno()).st_size < Ehdr_size:
            return None
        # Length 0 maps the whole file; the memoryview avoids copying it.
        with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m, \
                memoryview(m) as mv:
            return handle_elf(mv)
class Symbolizer:
def __init__(self, path, binary_prefixes, paths_to_cut):
self.__pipe = None
@ -39,6 +107,7 @@ class Symbolizer:
self.__paths_to_cut = paths_to_cut
self.__log = False
self.__warnings = set()
self.__index = {}
def enable_logging(self, enable):
self.__log = enable
@ -77,9 +146,12 @@ class Symbolizer:
file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
return file_name
def __process_binary_name(self, name):
def __process_binary_name(self, name, buildid=None):
if name.startswith('/'):
name = name[1:]
if buildid is not None and buildid in self.__index:
return self.__index[buildid]
for p in self.__binary_prefixes:
full_path = os.path.join(p, name)
if os.path.exists(full_path):
@ -121,10 +193,10 @@ class Symbolizer:
except Symbolizer.__EOF:
pass
def iter_call_stack(self, binary, addr):
def iter_call_stack(self, binary, buildid, addr):
self.__open_pipe()
p = self.__pipe
binary = self.__process_binary_name(binary)
binary = self.__process_binary_name(binary, buildid)
if not binary:
return
self.__write("CODE %s %s" % (binary, addr))
@ -137,15 +209,25 @@ class Symbolizer:
except Symbolizer.__EOF:
pass
def build_index(self):
    """Populate the build-id index from every file under the binary prefixes.

    Walks each directory in the configured binary prefixes recursively and
    records, for every file that yields a build id, a mapping from that id
    to the file's path. If several files share a build id, the one visited
    last is kept. Files that cannot be read or parsed are reported on
    stderr and skipped so that a single bad file does not abort indexing.
    """
    for p in self.__binary_prefixes:
        for dname, _, fnames in os.walk(p):
            for fn in fnames:
                filename = os.path.join(dname, fn)
                try:
                    bid = get_buildid(filename)
                except (IOError, OSError, ValueError, BufferError,
                        struct.error) as e:
                    # Unreadable files, dangling symlinks and malformed
                    # images end up here. BufferError is included because
                    # closing the mapping while a parse error unwinds can
                    # replace the original exception.
                    sys.stderr.write(
                        "Warning: could not read build id from %s: %s\n"
                        % (filename, e))
                    continue
                if bid is not None:
                    self.__index[bid] = filename
def symbolize_line(line, symbolizer_path):
#0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)
match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)', line, re.UNICODE)
match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
if match:
frameno = match.group(2)
binary = match.group(5)
addr = int(match.group(6), 16)
buildid = match.group(7)
frames = list(symbolizer.iter_call_stack(binary, addr))
frames = list(symbolizer.iter_call_stack(binary, buildid, addr))
if len(frames) > 0:
print("%s#%s%s%s in %s" % (match.group(1), match.group(2),
@ -210,6 +292,7 @@ parser.add_argument('-v', action='store_true')
parser.add_argument('--ignore-tags', action='store_true')
parser.add_argument('--symbols', action='append')
parser.add_argument('--source', action='append')
parser.add_argument('--index', action='store_true')
parser.add_argument('--symbolizer')
parser.add_argument('args', nargs=argparse.REMAINDER)
args = parser.parse_args()
@ -297,6 +380,8 @@ if args.v:
symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
symbolizer.enable_logging(args.d)
if args.index:
symbolizer.build_index()
for line in sys.stdin:
if sys.version_info.major < 3: