mirror of
https://github.com/apache/impala.git
synced 2025-12-19 09:58:28 -05:00
Since resolve_minidumps.py's call to minidump_stackwalk can go haywire due to bad symbols in shared libraries, this adds a fallback mechanism where it tries again with a "safe" list of shared libraries. These are limited to the ones that make the most difference in resolving minidumps (libc, libstdc++, and libjvm). The list of safe libraries can be customized via the --safe_library_list option. Testing: - Verified that this uses the fallback on CentOS 7 and resolves the minidumps successfully. Change-Id: I6bb4c9f65f9c27bb3b86c7ff2f3a6a48e258ef01 Reviewed-on: http://gerrit.cloudera.org:8080/20863 Reviewed-by: Michael Smith <michael.smith@cloudera.com> Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
442 lines
17 KiB
Python
Executable File
442 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# This script automates symbol resolution for Breakpad minidumps
|
|
# under ideal circumstances. Specifically, it expects all the
|
|
# binaries to be in the same locations as when the minidump
|
|
# was taken. This is often true for minidumps on a developer
|
|
# workstation or at the end of an Impala test job. It finds Breakpad
|
|
# using environment variables from the Impala dev environment,
|
|
# so it must run inside the Impala dev environment.
|
|
# TODO: It may be possible to extend this to Docker images.
|
|
#
|
|
# Within this simple context, this script aims for complete
|
|
# symbol resolution. It uses Breakpad's minidump_dump utility
|
|
# to dump the minidump, then it parses the list of libraries
|
|
# that were used by the binary. It gets the symbols for all
|
|
# those libraries and resolves the minidump.
|
|
#
|
|
# Usage: resolve_minidumps.py --minidump_file [file] --output_file [file]
|
|
# (optional -v or --verbose for more output)
|
|
|
|
import errno
|
|
import logging
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import traceback
|
|
|
|
from argparse import ArgumentParser
|
|
|
|
|
|
class ModuleInfo:
    """Identifying details for one module listed in a minidump.

    Stores the code file and code identifier together with the debug file and
    debug identifier that were parsed for the module.
    """

    def __init__(self, code_file, code_id, debug_file, debug_id):
        # Code-side pair: the file that was loaded and its identifier.
        self.code_file, self.code_id = code_file, code_id
        # Debug-side pair: the debug file name and its identifier.
        self.debug_file, self.debug_id = debug_file, debug_id
|
|
|
|
|
|
def _parse_dump_field(line):
    """Return the value of a 'name = "value"' line, or None if it has no '='.

    Only the first '=' separates the name from the value, so values that
    themselves contain '=' (e.g. unusual file paths) are preserved intact.
    Surrounding whitespace and double quotes are stripped from the value.
    """
    _, separator, value = line.partition("=")
    if not separator:
        return None
    return value.strip().strip('"')


def read_module_info(minidump_dump_contents):
    """Read the module information out of the minidump_dump raw contents.

    Args:
      minidump_dump_contents: the minidump_dump output for the minidump, split
        by newlines (a list of strings).

    Returns:
      A list of ModuleInfo objects, one per module that has usable code and
      debug identifiers. Returns None (after logging an error) if the
      module_count line is missing or not an integer, if the number of
      MDRawModule sections does not match module_count, or if a module section
      is missing one of the expected fields.
    """
    # Find the module_count. It may be absent entirely if minidump_dump failed
    # to produce usable output, so track that explicitly.
    module_count = None
    for line in minidump_dump_contents:
        if line.strip().startswith("module_count"):
            try:
                module_count = int(_parse_dump_field(line))
            except (TypeError, ValueError):
                logging.error("Failed to parse modules, invalid module_count line "
                              "'{0}'".format(line.strip()))
                return None
            break

    if module_count is None:
        logging.error("Failed to parse modules, could not find module_count")
        return None

    # The minidump has a MDRawModule per module and it will have
    # the same number of MDRawModule dumps as module_count.
    module_boundaries = [idx for idx, line in enumerate(minidump_dump_contents)
                         if line.startswith("MDRawModule")]

    if len(module_boundaries) != module_count:
        logging.error("Failed to parse modules, mismatch in module count "
                      "({0} != {1})".format(len(module_boundaries), module_count))
        return None

    # Add one more entry to module_boundaries that is the end of the file.
    # That makes this more of a list of boundaries than the list of
    # start locations.
    module_boundaries.append(len(minidump_dump_contents))

    # Checked in this order for each line; the first name found in the line wins.
    field_names = ("code_file", "code_identifier", "debug_file", "debug_identifier")

    modules = []
    for module_idx in range(module_count):
        module_start = module_boundaries[module_idx]
        module_end = module_boundaries[module_idx + 1]

        fields = {name: None for name in field_names}
        for line in minidump_dump_contents[module_start:module_end]:
            for name in field_names:
                if name in line:
                    value = _parse_dump_field(line)
                    if value is not None:
                        fields[name] = value
                    break

        # Important: it is ok for the fields to be the zero-length string.
        # We just care that they are non-None (i.e. the loop above encountered
        # them and parsed a value).
        if any(fields[name] is None for name in field_names):
            logging.error("Failed to parse dump output, missing fields for MDRawModule "
                          "{0}".format(module_idx))
            return None

        code_file = fields["code_file"]
        code_identifier = fields["code_identifier"]
        debug_file = fields["debug_file"]
        debug_identifier = fields["debug_identifier"]

        # Jars and other files show up in this list, but they have
        # code identifiers or debug identifiers as all zeros. Skip those,
        # as there are no symbols to find.
        if re.fullmatch("[0]+", code_identifier) or \
                re.fullmatch("[0]+", debug_identifier):
            continue

        # Skip cases where the code identifier or debug identifier are null
        if len(code_identifier) == 0 or len(debug_identifier) == 0:
            continue

        # linux-gate.so is a special case, and it is not an actual file on disk.
        if code_file.startswith("linux-gate.so"):
            continue

        modules.append(ModuleInfo(code_file, code_identifier, debug_file,
                                  debug_identifier))

    return modules
|
|
|
|
|
|
def filter_shared_library_modules(module_list, lib_allow_list):
    """Return the modules that survive the shared library allow list.

    A module whose file basename does not contain ".so" (such as the main
    binary) is always kept. A shared library is kept only if its basename
    starts with one of the prefixes in 'lib_allow_list'. The relative order of
    'module_list' is preserved.
    """
    def should_keep(module):
        basename = os.path.basename(module.code_file)
        if ".so" not in basename:
            # Not a shared library, so the allow list does not apply.
            return True
        return any(basename.startswith(prefix) for prefix in lib_allow_list)

    return [module for module in module_list if should_keep(module)]
|
|
|
|
|
|
def find_breakpad_home():
    """Return the Breakpad directory inside the Impala toolchain, or None.

    The location is built from the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BREAKPAD_VERSION environment variables. An error is logged and None
    is returned if either variable is unset or empty, or if the toolchain or
    Breakpad directory does not exist.
    """
    packages_root = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not packages_root:
        logging.error("IMPALA_TOOLCHAIN_PACKAGES_HOME is not set")
        return None
    if not os.path.isdir(packages_root):
        logging.error("Could not find toolchain packages directory")
        return None

    version = os.environ.get('IMPALA_BREAKPAD_VERSION')
    if not version:
        logging.error("Could not determine breakpad version from toolchain")
        return None

    candidate = '{0}/breakpad-{1}'.format(packages_root, version)
    if os.path.isdir(candidate):
        return candidate

    logging.error("Could not find breakpad directory")
    return None
|
|
|
|
|
|
def find_breakpad_binary(binary_name):
    """Return the path of the named Breakpad executable, or None.

    The executable is expected at <breakpad home>/bin/<binary_name>. None is
    returned if the Breakpad home cannot be located or if the file does not
    exist there (the latter case is logged as an error here).
    """
    home = find_breakpad_home()
    if home:
        candidate = os.path.join(home, 'bin', binary_name)
        if os.path.isfile(candidate):
            return candidate
        logging.error(
            "Could not find {0} executable at {1}".format(binary_name, candidate))
    return None
|
|
|
|
|
|
def find_objcopy_binary():
    """Return the path of the Binutils 'objcopy' executable, or None.

    The path is built from the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BINUTILS_VERSION environment variables as
    <packages home>/binutils-<version>/bin/objcopy. An error is logged and None
    is returned if either variable is unset or empty, if the toolchain
    directory does not exist, or if the executable is not a file.

    TODO: Fall back to finding objcopy in the system path.
    """
    packages_root = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not packages_root:
        logging.error("IMPALA_TOOLCHAIN_PACKAGES_HOME is not set")
        return None
    if not os.path.isdir(packages_root):
        logging.error("Could not find toolchain packages directory")
        return None

    version = os.environ.get('IMPALA_BINUTILS_VERSION')
    if not version:
        logging.error("Could not determine binutils version from toolchain")
        return None

    candidate = os.path.join(
        packages_root, "binutils-{0}".format(version), 'bin', 'objcopy')
    if os.path.isfile(candidate):
        return candidate

    logging.error("Could not find objcopy executable at {0}".format(candidate))
    return None
|
|
|
|
|
|
def ensure_dir_exists(path):
    """Create the directory 'path' (and parents), tolerating prior existence.

    The directory is created unconditionally and an "already exists" error is
    ignored when 'path' is a directory, so there is no check-then-create race.
    Any other OSError, or 'path' existing as a non-directory, is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_directory = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_directory:
            raise
|
|
|
|
|
|
def dump_symbols_for_binary(dump_syms, objcopy, binary, out_dir):
    """Dump symbols of a single binary file and move the result into place.

    Symbols are extracted to a temporary file in 'out_dir' and then moved to
    <out_dir>/<module name>/<module id>/<module name>.sym, where the module name
    and id are read from the MODULE header line emitted by dump_syms. Required
    directories are created if necessary.

    Args:
      dump_syms: path to the Breakpad dump_syms executable.
      objcopy: path to the Binutils objcopy executable.
      binary: path to the binary or shared library to process.
      out_dir: root directory of the symbol tree.

    Returns:
      True if the symbols were dumped and moved into place, False if dump_syms
      exited with a non-zero return code.

    Raises:
      Any exception raised by the underlying file or subprocess operations
      (for example ValueError if the MODULE header is malformed). The temporary
      symbol file is removed before the exception propagates.
    """
    logging.info("Processing binary file: {0}".format(binary))
    ensure_dir_exists(out_dir)
    tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym')
    try:
        # Both context managers are released as soon as dump_syms finishes:
        # the symbol file is explicitly closed (and flushed) before it is
        # re-read below, and the scratch directory holding the decompressed
        # binary is removed even if an error occurs.
        with os.fdopen(tmp_fd, 'wb') as sym_out, \
                tempfile.TemporaryDirectory() as tempdir:
            # Binaries can contain compressed debug symbols. Breakpad currently
            # does not support dumping symbols for binaries with compressed
            # debug symbols.
            #
            # As a workaround, this uses objcopy to create a copy of the binary
            # with the debug symbols decompressed. If the debug symbols are not
            # compressed in the original binary, objcopy simply makes a copy of
            # the binary. Breakpad is able to read symbols from the
            # decompressed binary, and those symbols work correctly in
            # resolving a minidump from the original compressed binary.
            # TODO: In theory, this could work with the binary.debug_path.
            decompressed_binary = os.path.join(tempdir, os.path.basename(binary))
            objcopy_retcode = subprocess.call(
                [objcopy, "--decompress-debug-sections", binary, decompressed_binary])

            # If objcopy failed for some reason, fall back to running dump_syms
            # directly on the original binary. This is unlikely to work, but it
            # is a way of guaranteeing that objcopy is not the problem.
            args = [dump_syms, decompressed_binary]
            if objcopy_retcode != 0:
                sys.stderr.write('objcopy failed. Trying to run dump_sym directly.\n')
                args = [dump_syms, binary]

            # Run dump_syms on the binary.
            proc = subprocess.Popen(args, stdout=sym_out, stderr=subprocess.PIPE)
            _, stderr = proc.communicate()

        if proc.returncode != 0:
            sys.stderr.write('Failed to dump symbols from %s, return code %s\n' %
                             (binary, proc.returncode))
            # The tool's stderr is only echoed for diagnostics, so never let an
            # undecodable byte turn this into an exception.
            sys.stderr.write(stderr.decode('utf-8', errors='replace'))
            os.remove(tmp_file)
            return False

        # Parse the temporary file to determine the full target path.
        with open(tmp_file, 'r') as f:
            header = f.readline().strip()
        # Format of header is: MODULE os arch binary_id binary
        # The module name is the last field and may itself contain spaces, so
        # split off only the first four fields.
        _, _, _, module_id, module_name = header.split(' ', 4)
        out_path = os.path.join(out_dir, module_name, module_id)
        ensure_dir_exists(out_path)
        # Move the temporary file to its final destination.
        shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % module_name))
    except Exception:
        # Only need to clean up in case of errors.
        try:
            os.remove(tmp_file)
        except EnvironmentError:
            pass
        raise
    return True
|
|
|
|
|
|
def dump_symbols_for_all_modules(dump_syms, objcopy, module_list, out_dir):
    """Dump symbols into 'out_dir' for every module in 'module_list'.

    'module_list' is a list of ModuleInfo objects; each module's code_file is
    processed in turn. A module whose symbol dump reports failure is logged as
    a warning and the remaining modules are still processed.
    """
    for entry in module_list:
        binary_path = entry.code_file
        if dump_symbols_for_binary(dump_syms, objcopy, binary_path, out_dir):
            continue
        logging.warning("Failed to dump symbols for {0}".format(binary_path))
|
|
|
|
|
|
def _prlimit_available():
    """Return True if the prlimit utility can be run successfully."""
    try:
        check = subprocess.run(["prlimit", "-V"], stdout=subprocess.DEVNULL,
                               stderr=subprocess.DEVNULL)
    except OSError:
        # subprocess raises (rather than returning a non-zero exit code) when
        # the executable is missing or cannot be executed.
        return False
    return check.returncode == 0


def resolve_minidump(minidump_stackwalk, minidump_path, symbol_dir, verbose, out_file):
    """Run minidump_stackwalk on a minidump and write its stdout to 'out_file'.

    Args:
      minidump_stackwalk: path to the Breakpad minidump_stackwalk executable.
      minidump_path: path to the minidump to resolve.
      symbol_dir: directory containing the dumped symbols.
      verbose: if true, the tool's stderr is passed through; otherwise it is
        discarded.
      out_file: path of the file to (over)write with the resolved output.

    Raises:
      subprocess.CalledProcessError: if the command exits with a non-zero code.
    """
    minidump_stackwalk_cmd = [minidump_stackwalk, minidump_path, symbol_dir]
    # There are circumstances where the minidump_stackwalk can go wrong and become
    # a runaway process capable of using all system memory. If the prlimit utility
    # is present, we use it to apply a limit on the memory consumption.
    if _prlimit_available():
        # The prlimit utility is available, so wrap the minidump_stackwalk command
        # to apply a 4GB limit on virtual memory. In normal operations, 4G is plenty.
        prlimit_wrapper = ["prlimit", "--as={0}".format(4 * 1024 * 1024 * 1024)]
        minidump_stackwalk_cmd = prlimit_wrapper + minidump_stackwalk_cmd
    with open(out_file, "w") as out_f:
        stderr_output = None if verbose else subprocess.DEVNULL
        subprocess.run(minidump_stackwalk_cmd, stdout=out_f,
                       stderr=stderr_output, check=True)
|
|
|
|
|
|
def raw_dump_for_minidump(minidump_dump, minidump_path):
    """Run minidump_dump on 'minidump_path' and return its stdout as lines.

    The exit code is deliberately ignored: minidump_dump sometimes returns an
    error code even though it produced usable output, so validation is left to
    read_module_info(). The captured stdout is decoded as text and split on
    newline characters; stderr is captured and discarded.
    """
    # universal_newlines=True is the Python 3.6 compatible spelling of
    # text=True, and explicit PIPEs stand in for capture_output=True.
    with subprocess.Popen([minidump_dump, minidump_path],
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE,
                          universal_newlines=True) as proc:
        stdout_text, _ = proc.communicate()
    return stdout_text.split('\n')
|
|
|
|
|
|
def parse_args():
    """Parse the command line arguments from sys.argv.

    --minidump_file and --output_file are required. -v/--verbose is an optional
    flag, and --safe_library_list is an optional comma-separated string that
    defaults to the libstdc++, libc and libjvm prefixes.

    Returns the parsed argparse namespace.
    """
    # TODO:
    # - Add ability to specify Breakpad home
    # - Add ability to specify the symbol directory location (for reuse)
    # - Add ability to specify Binutils home
    parser = ArgumentParser()
    for required_option in ('--minidump_file', '--output_file'):
        parser.add_argument(required_option, required=True)
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument(
        '--safe_library_list',
        default="libstdc++.so,libc.so,libjvm.so",
        help="Comma-separate list of prefixes for allowed system libraries")
    return parser.parse_args()
|
|
|
|
|
|
def dump_syms_and_resolve_stack(modules, minidump_file, output_file, verbose):
    """Dump the symbols for the listed modules and use them to resolve the minidump.

    Args:
      modules: list of ModuleInfo objects whose symbols should be dumped.
      minidump_file: path to the minidump to resolve.
      output_file: path of the file that receives the resolved output.
      verbose: passed through to resolve_minidump().

    Exits the process with status 1 if dump_syms, objcopy or minidump_stackwalk
    cannot be located. Exceptions from dumping symbols or resolving the
    minidump propagate to the caller.
    """
    # Locate every required tool up front, so that a missing tool is reported
    # before any time is spent dumping symbols.
    dump_syms_bin = find_breakpad_binary("dump_syms")
    if not dump_syms_bin:
        logging.error("Could not find Breakpad dump_syms binary")
        sys.exit(1)
    objcopy_bin = find_objcopy_binary()
    if not objcopy_bin:
        logging.error("Could not find Binutils objcopy binary")
        sys.exit(1)
    minidump_stackwalk_bin = find_breakpad_binary("minidump_stackwalk")
    if not minidump_stackwalk_bin:
        logging.error("Could not find Breakpad minidump_stackwalk binary")
        sys.exit(1)

    # Create a temporary directory to store the symbols.
    # This automatically gets cleaned up.
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Dump symbols for all the modules into this temporary directory.
        dump_symbols_for_all_modules(dump_syms_bin, objcopy_bin, modules, tmp_dir)

        # Resolve the minidump with the temporary symbol directory, reusing the
        # path that was already validated above.
        resolve_minidump(minidump_stackwalk_bin, minidump_file, tmp_dir, verbose,
                         output_file)
|
|
|
|
|
|
def main():
    """Resolve the minidump named on the command line into the output file.

    Steps:
      1. Run minidump_dump to get the raw dump of the minidump.
      2. Parse the raw dump to get the list of code modules.
      3. Dump symbols for the modules and resolve the minidump. If that raises
         an exception, retry with the shared libraries limited to the prefixes
         in --safe_library_list.

    Exits with status 1 if a required tool is missing, if minidump_dump
    produced no output, or if no modules could be read.
    """
    args = parse_args()

    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING)

    # Step 1: Get the raw dump for the specified minidump
    minidump_dump_bin = find_breakpad_binary("minidump_dump")
    if not minidump_dump_bin:
        logging.error("Could not find Breakpad minidump_dump binary")
        sys.exit(1)
    contents = raw_dump_for_minidump(minidump_dump_bin, args.minidump_file)
    # The contents are the tool's output split on newlines, so empty output is
    # a list holding a single empty string rather than an empty list. Check
    # for any non-blank line instead of relying on the list's truthiness.
    if not contents or not any(line.strip() for line in contents):
        logging.error(
            "minidump_dump could not get the contents of {0}".format(args.minidump_file))
        sys.exit(1)

    # Step 2: Parse the raw dump to get the list of code modules
    # This is the list of things that have symbols we need to dump.
    modules = read_module_info(contents)
    if not modules:
        logging.error("Failed to read modules for {0}".format(args.minidump_file))
        sys.exit(1)

    # Step 3: Dump the symbols and use them to resolve the minidump
    # Sometimes there are libraries with corrupt/problematic symbols
    # that can cause minidump_stackwalk to go haywire and use excessive
    # memory. First, we try using symbols from all of the shared libraries.
    # If that fails, we fallback to using a "safe" list of shared libraries.
    try:
        dump_syms_and_resolve_stack(modules, args.minidump_file, args.output_file,
                                    args.verbose)
        return
    except Exception:
        logging.warning("Encountered error: {0}".format(traceback.format_exc()))
        logging.warning("Falling back to resolution using the safe library list")
        logging.warning("Safe library list: {0}".format(args.safe_library_list))

    # Limit the shared libraries to the "safe" list of shared libraries and
    # try again. Surrounding whitespace is stripped and empty entries (e.g.
    # from a trailing comma) are dropped: an empty prefix would match every
    # shared library and defeat the purpose of the fallback.
    safe_library_list = [prefix.strip()
                         for prefix in args.safe_library_list.split(",")
                         if prefix.strip()]
    safe_modules = filter_shared_library_modules(modules, safe_library_list)
    dump_syms_and_resolve_stack(safe_modules, args.minidump_file, args.output_file,
                                args.verbose)
|
|
|
|
|
|
# Run the resolver only when this file is executed as a script.
if __name__ == "__main__":
    main()
|