IMPALA-12643 (part 2): Fallback to safe libraries on error in resolve_minidumps.py

Since resolve_minidumps.py's call to minidump_stackwalk can go haywire
due to bad symbols in shared libraries, this adds a fallback mechanism
where it tries again with a "safe" list of shared libraries. These are
limited to the ones that make the most difference in resolving minidumps
(libc, libstdc++, and libjvm). The list of safe libraries can be
customized via the --safe_library_list.

Testing:
 - Verified that this uses the fallback on Centos 7 and resolves
   the minidumps successfully.

Change-Id: I6bb4c9f65f9c27bb3b86c7ff2f3a6a48e258ef01
Reviewed-on: http://gerrit.cloudera.org:8080/20863
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
This commit is contained in:
Joe McDonnell
2023-12-17 13:59:35 -08:00
parent c0a015fdac
commit dac7f409ba

View File

@@ -43,6 +43,7 @@ import shutil
import subprocess
import sys
import tempfile
import traceback
from argparse import ArgumentParser
@@ -132,6 +133,26 @@ def read_module_info(minidump_dump_contents):
return modules
def filter_shared_library_modules(module_list, lib_allow_list):
"""Filter the list of modules by eliminating any shared libaries that do not match
one of the prefixes in the allow list. This keeps all non-shared libaries
(such as the main binary).
"""
filtered_module_list = []
for module in module_list:
code_file_basename = os.path.basename(module.code_file)
# Keep anything that is not a shared library (e.g. the main binary)
if ".so" not in code_file_basename:
filtered_module_list.append(module)
continue
# Only keep shared libraries that match an entry on the allow list.
for allow_lib in lib_allow_list:
if code_file_basename.startswith(allow_lib):
filtered_module_list.append(module)
break
return filtered_module_list
def find_breakpad_home():
"""Locate the Breakpad home directory.
@@ -331,10 +352,39 @@ def parse_args():
parser.add_argument('--minidump_file', required=True)
parser.add_argument('--output_file', required=True)
parser.add_argument('-v', '--verbose', action='store_true')
parser.add_argument('--safe_library_list',
default="libstdc++.so,libc.so,libjvm.so",
help="Comma-separate list of prefixes for allowed system libraries")
args = parser.parse_args()
return args
def dump_syms_and_resolve_stack(modules, minidump_file, output_file, verbose):
"""Dump the symbols for the listed modules and use them to resolve the minidump."""
# Create a temporary directory to store the symbols
# This automatically gets cleaned up
with tempfile.TemporaryDirectory() as tmp_dir:
# Dump symbols for all the modules into this temporary directory.
# Need both dump_syms and objcopy
dump_syms_bin = find_breakpad_binary("dump_syms")
if not dump_syms_bin:
logging.error("Could not find Breakpad dump_syms binary")
sys.exit(1)
objcopy_bin = find_objcopy_binary()
if not objcopy_bin:
logging.error("Could not find Binutils objcopy binary")
sys.exit(1)
dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)
# Resolve the minidump with the temporary symbol directory
minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
if not minidump_stackwalk_bin:
logging.error("Could not find Breakpad minidump_stackwalk binary")
sys.exit(1)
resolve_minidump(find_breakpad_binary("minidump_stackwalk"), minidump_file,
tmp_dir, verbose, output_file)
def main():
args = parse_args()
@@ -361,28 +411,30 @@ def main():
logging.error("Failed to read modules for {0}".format(args.minidump_file))
sys.exit(1)
# Create a temporary directory to store the symbols.
# This automatically gets cleaned up.
with tempfile.TemporaryDirectory() as tmp_dir:
# Step 3: Dump symbols for all the modules into this temporary directory.
# Need both dump_syms and objcopy
dump_syms_bin = find_breakpad_binary("dump_syms")
if not dump_syms_bin:
logging.error("Could not find Breakpad dump_syms binary")
sys.exit(1)
objcopy_bin = find_objcopy_binary()
if not objcopy_bin:
logging.error("Could not find Binutils objcopy binary")
sys.exit(1)
dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)
# Step 3: Dump the symbols and use them to resolve the minidump
# Sometimes there are libraries with corrupt/problematic symbols
# that can cause minidump_stackwalk to go haywire and use excessive
# memory. First, we try using symbols from all of the shared libraries.
# If that fails, we fallback to using a "safe" list of shared libraries.
try:
# Dump the symbols and use them to resolve the minidump
dump_syms_and_resolve_stack(modules, args.minidump_file, args.output_file,
args.verbose)
return
except Exception:
logging.warning("Encountered error: {0}".format(traceback.format_exc()))
logging.warning("Falling back to resolution using the safe library list")
logging.warning("Safe library list: {0}".format(args.safe_library_list))
# Step 4: Resolve the minidump with the temporary symbol directory
minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
if not minidump_stackwalk_bin:
logging.error("Could not find Breakpad minidump_stackwalk binary")
sys.exit(1)
resolve_minidump(find_breakpad_binary("minidump_stackwalk"), args.minidump_file,
tmp_dir, args.verbose, args.output_file)
# Limit the shared libraries to the "safe" list of shared libraries and
# try again.
if len(args.safe_library_list) == 0:
safe_library_list = []
else:
safe_library_list = args.safe_library_list.split(",")
safe_modules = filter_shared_library_modules(modules, safe_library_list)
dump_syms_and_resolve_stack(safe_modules, args.minidump_file, args.output_file,
args.verbose)
if __name__ == "__main__":