From dac7f409ba0619180835ea908349ac5108a58c3a Mon Sep 17 00:00:00 2001 From: Joe McDonnell Date: Sun, 17 Dec 2023 13:59:35 -0800 Subject: [PATCH] IMPALA-12643 (part 2): Fallback to safe libraries on error in resolve_minidumps.py Since resolve_minidumps.py's call to minidump_stackwalk can go haywire due to bad symbols in shared libraries, this adds a fallback mechanism where it tries again with a "safe" list of shared libraries. These are limited to the ones that make the most difference in resolving minidumps (libc, libstdc++, and libjvm). The list of safe libraries can be customized via the --safe_library_list option. Testing: - Verified that this uses the fallback on CentOS 7 and resolves the minidumps successfully. Change-Id: I6bb4c9f65f9c27bb3b86c7ff2f3a6a48e258ef01 Reviewed-on: http://gerrit.cloudera.org:8080/20863 Reviewed-by: Michael Smith Tested-by: Joe McDonnell --- bin/resolve_minidumps.py | 94 +++++++++++++++++++++++++++++++--------- 1 file changed, 73 insertions(+), 21 deletions(-) diff --git a/bin/resolve_minidumps.py b/bin/resolve_minidumps.py index f410fabcf..3c64d9d55 100755 --- a/bin/resolve_minidumps.py +++ b/bin/resolve_minidumps.py @@ -43,6 +43,7 @@ import shutil import subprocess import sys import tempfile +import traceback from argparse import ArgumentParser @@ -132,6 +133,26 @@ def read_module_info(minidump_dump_contents): return modules +def filter_shared_library_modules(module_list, lib_allow_list): + """Filter the list of modules by eliminating any shared libraries that do not match + one of the prefixes in the allow list. This keeps all non-shared libraries + (such as the main binary). + """ + filtered_module_list = [] + for module in module_list: + code_file_basename = os.path.basename(module.code_file) + # Keep anything that is not a shared library (e.g. the main binary) + if ".so" not in code_file_basename: + filtered_module_list.append(module) + continue + # Only keep shared libraries that match an entry on the allow list. 
+ for allow_lib in lib_allow_list: + if code_file_basename.startswith(allow_lib): + filtered_module_list.append(module) + break + return filtered_module_list + + def find_breakpad_home(): """Locate the Breakpad home directory. @@ -331,10 +352,39 @@ def parse_args(): parser.add_argument('--minidump_file', required=True) parser.add_argument('--output_file', required=True) parser.add_argument('-v', '--verbose', action='store_true') + parser.add_argument('--safe_library_list', + default="libstdc++.so,libc.so,libjvm.so", + help="Comma-separate list of prefixes for allowed system libraries") args = parser.parse_args() return args +def dump_syms_and_resolve_stack(modules, minidump_file, output_file, verbose): + """Dump the symbols for the listed modules and use them to resolve the minidump.""" + # Create a temporary directory to store the symbols + # This automatically gets cleaned up + with tempfile.TemporaryDirectory() as tmp_dir: + # Dump symbols for all the modules into this temporary directory. + # Need both dump_syms and objcopy + dump_syms_bin = find_breakpad_binary("dump_syms") + if not dump_syms_bin: + logging.error("Could not find Breakpad dump_syms binary") + sys.exit(1) + objcopy_bin = find_objcopy_binary() + if not objcopy_bin: + logging.error("Could not find Binutils objcopy binary") + sys.exit(1) + dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir) + + # Resolve the minidump with the temporary symbol directory + minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk") + if not minidump_stackwalk_bin: + logging.error("Could not find Breakpad minidump_stackwalk binary") + sys.exit(1) + resolve_minidump(find_breakpad_binary("minidump_stackwalk"), minidump_file, + tmp_dir, verbose, output_file) + + def main(): args = parse_args() @@ -361,28 +411,30 @@ def main(): logging.error("Failed to read modules for {0}".format(args.minidump_file)) sys.exit(1) - # Create a temporary directory to store the symbols. 
- # This automatically gets cleaned up. - with tempfile.TemporaryDirectory() as tmp_dir: - # Step 3: Dump symbols for all the modules into this temporary directory. - # Need both dump_syms and objcopy - dump_syms_bin = find_breakpad_binary("dump_syms") - if not dump_syms_bin: - logging.error("Could not find Breakpad dump_syms binary") - sys.exit(1) - objcopy_bin = find_objcopy_binary() - if not objcopy_bin: - logging.error("Could not find Binutils objcopy binary") - sys.exit(1) - dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir) + # Step 3: Dump the symbols and use them to resolve the minidump + # Sometimes there are libraries with corrupt/problematic symbols + # that can cause minidump_stackwalk to go haywire and use excessive + # memory. First, we try using symbols from all of the shared libraries. + # If that fails, we fallback to using a "safe" list of shared libraries. + try: + # Dump the symbols and use them to resolve the minidump + dump_syms_and_resolve_stack(modules, args.minidump_file, args.output_file, + args.verbose) + return + except Exception: + logging.warning("Encountered error: {0}".format(traceback.format_exc())) + logging.warning("Falling back to resolution using the safe library list") + logging.warning("Safe library list: {0}".format(args.safe_library_list)) - # Step 4: Resolve the minidump with the temporary symbol directory - minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk") - if not minidump_stackwalk_bin: - logging.error("Could not find Breakpad minidump_stackwalk binary") - sys.exit(1) - resolve_minidump(find_breakpad_binary("minidump_stackwalk"), args.minidump_file, - tmp_dir, args.verbose, args.output_file) + # Limit the shared libraries to the "safe" list of shared libraries and + # try again. 
+ if len(args.safe_library_list) == 0: + safe_library_list = [] + else: + safe_library_list = args.safe_library_list.split(",") + safe_modules = filter_shared_library_modules(modules, safe_library_list) + dump_syms_and_resolve_stack(safe_modules, args.minidump_file, args.output_file, + args.verbose) if __name__ == "__main__":