mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
When using bin/dump_breakpad_symbols.py to dump symbols for RPM/DEB packages, the script extracts the packages to a temporary directory and relies on keeping that directory around until the processing is finished. The parallel processing added in IMPALA-11511 breaks the logic that keeps the temporary directory around, so the script generates errors like: Found debugging info in /tmp/tmpqfZ9MZ/usr/lib/debug/usr/lib/impala/sbin-retail/impalad.debug Failed to open ELF file '/tmp/tmpqfZ9MZ/usr/lib/debug/usr/lib/impala/sbin-retail/impalad.debug': No such file or directory Failed to write symbol file. This turns off parallelism for bin/dump_breakpad_symbols.py when processing RPM/DEB packages (i.e. -r/--pkg). This also avoids using a ThreadPool when num_processes <= 1. Testing: - Hand tested with Redhat 7 RPMs Change-Id: If2885a9cfb36a4f616b539599e7f744bd23552c3 Reviewed-on: http://gerrit.cloudera.org:8080/20943 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
385 lines
15 KiB
Python
Executable File
385 lines
15 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# This script can be used to dump symbols using the 'dump_syms' binary, which is contained
|
|
# in Google Breakpad. It supports collecting binary files from different sources:
|
|
#
|
|
# - Scan an Impala build dir for ELF files
|
|
# - Read files from stdin
|
|
# - Process a list of one or multiple explicitly specified files
|
|
# - Extract an Impala rpm/deb and corresponding debuginfo rpm/deb file, scan for ELF
|
|
# files, and process them together with their respective .debug file.
|
|
#
|
|
# Dependencies:
|
|
# - dpkg (sudo apt-get -y install dpkg)
|
|
# - rpm2cpio (sudo apt-get -y install rpm2cpio)
|
|
# - cpio (sudo apt-get -y install cpio)
|
|
# - Google Breakpad, either installed via the Impala toolchain or separately
|
|
#
|
|
# Usage: dump_breakpad_symbols.py -h
|
|
#
|
|
# Typical usage patterns:
|
|
# -----------------------
|
|
#
|
|
# * Extract symbols from an rpm file and its debuginfo counterpart:
|
|
# ./dump_breakpad_symbols.py -d /tmp/syms \
|
|
# -r tmp/impala-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm \
|
|
# -s tmp/impala-debuginfo-2.5.0+cdh5.7.0+0-1.cdh5.7.0.p0.147.el6.x86_64.rpm
|
|
#
|
|
# Note that this will process all ELF binaries in the rpm, including both debug and
|
|
# release builds. Files are identified by hashes, so you don't need to worry about
|
|
# collisions and you can expect it to 'just work'.
|
|
#
|
|
# * Scan an impalad build directory and extract Breakpad symbols from all binaries:
|
|
# ./dump_breakpad_symbols.py -d /tmp/syms -b be/build/debug
|
|
#
|
|
# * Use the 'minidump_stackwalk' tool after symbol extraction to process a minidump file:
|
|
# $IMPALA_TOOLCHAIN_PACKAGES_HOME/breakpad-*/bin/minidump_stackwalk \
|
|
# /tmp/impala-minidumps/impalad/03c0ee26-bfd1-cf3e-43fa49ca-1a6aae25.dmp /tmp/syms
|
|
|
|
from __future__ import absolute_import, division, print_function
|
|
import errno
|
|
import logging
|
|
import magic
|
|
import multiprocessing
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
|
|
from argparse import ArgumentParser
|
|
from collections import namedtuple
|
|
from multiprocessing.pool import ThreadPool
|
|
|
|
# Pairs the path of a binary ('path') with the location of its separate debug
# information ('debug_path'); 'debug_path' is None when no symbol package is used.
BinarySymbolInfo = namedtuple('BinarySymbolInfo', ['path', 'debug_path'])
|
|
|
|
|
|
def die(msg=''):
    """Log 'msg' as an error, then terminate the process with exit status 1.

    The message is optional; it is always prefixed with 'ERROR: ' when logged.
    """
    text = 'ERROR: %s\n' % msg
    logging.error(text)
    sys.exit(1)
|
|
|
|
|
|
def find_dump_syms_binary():
    """Return the path of Breakpad's 'dump_syms' binary inside the Impala toolchain.

    The lookup is driven by the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BREAKPAD_VERSION environment variables. An empty string is returned
    when IMPALA_TOOLCHAIN_PACKAGES_HOME is unset or empty. If it is set but the
    directory, the version variable, or the binary itself is missing, the
    process is terminated via die().

    TODO: Lookup the binary in the system path. Not urgent, since the user can
    specify the path as a command line switch.
    """
    packages_home = os.environ.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not packages_home:
        return ''

    if not os.path.isdir(packages_home):
        die('Could not find toolchain packages directory')

    version = os.environ.get('IMPALA_BREAKPAD_VERSION')
    if not version:
        die('Could not determine breakpad version from toolchain')

    candidate = os.path.join(
        packages_home, 'breakpad-%s' % version, 'bin', 'dump_syms')
    if not os.path.isfile(candidate):
        die('Could not find dump_syms executable at %s' % candidate)
    return candidate
|
|
|
|
|
|
def find_objcopy_binary():
    """Return the path of Binutils' 'objcopy' binary inside the Impala toolchain.

    The lookup is driven by the IMPALA_TOOLCHAIN_PACKAGES_HOME and
    IMPALA_BINUTILS_VERSION environment variables. An empty string is returned
    when IMPALA_TOOLCHAIN_PACKAGES_HOME is unset or empty. If it is set but the
    directory, the version variable, or the binary itself is missing, the
    process is terminated via die().

    TODO: Fall back to finding objcopy in the system path.
    """
    env = os.environ
    toolchain_root = env.get('IMPALA_TOOLCHAIN_PACKAGES_HOME')
    if not toolchain_root:
        return ''

    if not os.path.isdir(toolchain_root):
        die('Could not find toolchain packages directory')

    version = env.get('IMPALA_BINUTILS_VERSION')
    if not version:
        die('Could not determine binutils version from toolchain')

    package_dir = 'binutils-%s' % version
    objcopy_path = os.path.join(toolchain_root, package_dir, 'bin', 'objcopy')
    if not os.path.isfile(objcopy_path):
        die('Could not find objcopy executable at %s' % objcopy_path)
    return objcopy_path
|
|
|
|
|
|
def parse_args():
    """Build the command line parser, parse sys.argv and validate the result.

    Returns the argparse namespace. The process is terminated via die() when
    -r and -s are not given together (unless --no_symbol_pkg is set), or when
    the number of selected input methods (-b / -f / -i / -r) is not exactly one.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '-d', '--dest_dir', required=True,
        help='The target directory,\nbelow which to place extracted symbol files')
    parser.add_argument(
        '--dump_syms', help='Path to the dump_syms binary from Breakpad')

    # Options controlling how to find input files.
    parser.add_argument(
        '-b', '--build_dir',
        help='Path to a directory containing results\n'
             'from an Impala build, e.g. be/build/debug')
    parser.add_argument(
        '-f', '--binary_files', nargs='+', metavar="FILE",
        help='List of binary files to process')
    parser.add_argument(
        '-i', '--stdin_files', action='store_true',
        help='Read the list\nof files to process from stdin')
    parser.add_argument(
        '-r', '--pkg', '--rpm',
        help='RPM/DEB file containing the binaries\nto process, use with -s')
    parser.add_argument(
        '-s', '--symbol_pkg', '--debuginfo_rpm',
        help='RPM/DEB file\n'
             'containing the debug symbols matching the binaries in -r')
    parser.add_argument(
        '--no_symbol_pkg', '--no_debuginfo_rpm', action='store_true',
        help="Don't require a symbol pkg when processing a RPM/DEB package with -r")
    parser.add_argument(
        '--objcopy', help='Path to the objcopy binary from Binutils')
    parser.add_argument(
        '--num_processes', type=int, default=multiprocessing.cpu_count(),
        help="Number of parallel processes to use.")
    args = parser.parse_args()

    # Post processing checks
    # Either both the pkg and the symbol pkg are specified, or neither is.
    have_pkg = bool(args.pkg)
    have_symbol_pkg = bool(args.symbol_pkg)
    if have_pkg != have_symbol_pkg and not args.no_symbol_pkg:
        parser.print_usage()
        die("The -r option requires a corresponding -s unless --no_symbol_pkg is specified")

    # Exactly one of the input sources has to be selected.
    sources = (args.build_dir, args.binary_files, args.stdin_files, args.pkg)
    selected = [source for source in sources if source]
    if len(selected) != 1:
        die('You need to specify exactly one way to locate input files (-b/-f/-i/-r,-s)')

    return args
|
|
|
|
|
|
def ensure_dir_exists(path):
    """Create the directory 'path' (and missing parents) unless it already exists.

    Creation is attempted first and an "already exists" error is tolerated, so
    concurrent callers racing to create the same directory do not fail. Any
    other OSError, or an existing non-directory at 'path', is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as err:
        already_a_dir = err.errno == errno.EEXIST and os.path.isdir(path)
        if not already_a_dir:
            raise err
|
|
|
|
|
|
def walk_path(path):
    """Lazily yield the joined path of every file found below 'path'."""
    for root, _subdirs, files in os.walk(path):
        for filename in files:
            full_path = os.path.join(root, filename)
            yield full_path
|
|
|
|
|
|
def is_regular_file(path):
    """Return True only if 'path' is an existing file that is not a symlink."""
    if not os.path.isfile(path):
        return False
    return not os.path.islink(path)
|
|
|
|
|
|
def is_elf_file(path):
    """Return True if 'path' is a regular file whose magic description contains 'ELF'."""
    if not is_regular_file(path):
        return False
    description = magic.from_file(path)
    return 'ELF' in description
|
|
|
|
|
|
def find_elf_files(path):
    """Return a lazy generator over the ELF files found below 'path'."""
    candidates = walk_path(path)
    return (candidate for candidate in candidates if is_elf_file(candidate))
|
|
|
|
|
|
def extract_rpm(rpm, out_dir):
    """Extract 'rpm' into 'out_dir' by piping 'rpm2cpio' into 'cpio -id'.

    Both tools run with 'out_dir' as their working directory, so a relative
    'rpm' path is resolved against 'out_dir'. The pipeline is built from two
    processes with explicit argument lists instead of a shell command line, so
    package paths containing spaces or shell metacharacters are passed through
    unchanged.

    Raises:
      AssertionError: if 'out_dir' is not an existing directory.
      subprocess.CalledProcessError: if either rpm2cpio or cpio exits non-zero.
      OSError: if one of the tools cannot be started.
    """
    assert os.path.isdir(out_dir)
    rpm2cpio_cmd = ['rpm2cpio', rpm]
    cpio_cmd = ['cpio', '-id']

    rpm2cpio = subprocess.Popen(rpm2cpio_cmd, stdout=subprocess.PIPE, cwd=out_dir)
    try:
        cpio = subprocess.Popen(cpio_cmd, stdin=rpm2cpio.stdout, cwd=out_dir)
    except Exception:
        # cpio could not be started: close the pipe so rpm2cpio stops writing,
        # then reap it before propagating the error.
        rpm2cpio.stdout.close()
        rpm2cpio.wait()
        raise
    # Drop our copy of the read end so that cpio is the pipe's only reader.
    rpm2cpio.stdout.close()

    cpio_status = cpio.wait()
    rpm2cpio_status = rpm2cpio.wait()
    # A shell pipeline only reports the status of its last command; check the
    # producer as well so that a failed rpm2cpio is not silently ignored.
    if rpm2cpio_status != 0:
        raise subprocess.CalledProcessError(rpm2cpio_status, rpm2cpio_cmd)
    if cpio_status != 0:
        raise subprocess.CalledProcessError(cpio_status, cpio_cmd)
|
|
|
|
|
|
def extract_deb(deb, out_dir):
    """Extract 'deb' into 'out_dir' by running 'dpkg -x'.

    The command is run from an explicit argument list rather than a shell
    command line, so paths containing spaces or shell metacharacters are passed
    to dpkg unchanged.

    Raises:
      AssertionError: if 'out_dir' is not an existing directory.
      subprocess.CalledProcessError: if dpkg exits with a non-zero status.
      OSError: if dpkg cannot be started.
    """
    assert os.path.isdir(out_dir)
    subprocess.check_call(['dpkg', '-x', deb, out_dir])
|
|
|
|
|
|
def extract_pkg(pkg, out_dir):
    """Extract 'pkg' into 'out_dir', choosing the extractor from the file's magic.

    A magic description containing 'RPM' selects extract_rpm(), one containing
    'Debian' selects extract_deb(). Any other description terminates the
    process via die().
    """
    description = magic.from_file(pkg)
    if 'RPM' in description:
        return extract_rpm(pkg, out_dir)
    if 'Debian' in description:
        return extract_deb(pkg, out_dir)
    die('Unsupported package type: %s' % description)
|
|
|
|
|
|
def assert_file_exists(path):
    """Terminate the process via die() unless 'path' is an existing file."""
    if not os.path.isfile(path):
        die('File does not exist: %s' % path)
|
|
|
|
|
|
def enumerate_pkg_files(pkg, symbol_pkg):
    """Yield a BinarySymbolInfo tuple for every ELF file shipped in 'pkg'.

    Both packages are extracted into one temporary directory, and the binary
    package's usr/lib/impala tree is scanned for ELF files. Without a symbol
    package each tuple carries None as its debug path. With one, the debug path
    is the directory below usr/lib/debug/usr/lib/impala that mirrors the
    binary's own directory.

    This is deliberately a generator: the temporary directory is only removed
    in the 'finally' clause, i.e. once the consumer has finished iterating (or
    the generator is closed), so the yielded paths stay valid while in use.
    """
    binary_subdir = os.path.join('usr', 'lib', 'impala')
    symbol_subdir = os.path.join('usr', 'lib', 'debug', binary_subdir)

    assert_file_exists(pkg)
    if symbol_pkg:
        assert_file_exists(symbol_pkg)

    scratch_dir = tempfile.mkdtemp()
    try:
        # Unpack the binary package.
        logging.info('Extracting to %s: %s' % (scratch_dir, pkg))
        extract_pkg(os.path.abspath(pkg), scratch_dir)
        binary_root = os.path.join(scratch_dir, binary_subdir)

        symbol_root = None
        if symbol_pkg:
            # Unpack the symbol package next to it.
            logging.info('Extracting to %s: %s' % (scratch_dir, symbol_pkg))
            extract_pkg(os.path.abspath(symbol_pkg), scratch_dir)
            symbol_root = os.path.join(scratch_dir, symbol_subdir)

        # Pair every ELF file with the matching directory in the symbol tree.
        for elf_path in find_elf_files(binary_root):
            debug_dir = None
            if symbol_pkg:
                relative_dir = os.path.relpath(os.path.dirname(elf_path), binary_root)
                debug_dir = os.path.join(symbol_root, relative_dir)
            yield BinarySymbolInfo(elf_path, debug_dir)
    finally:
        shutil.rmtree(scratch_dir)
|
|
|
|
|
|
def enumerate_binaries(args):
    """Return an iterable of the BinarySymbolInfo tuples selected by 'args'.

    The input source is chosen in this order: explicit file list, file names
    read from stdin (one per line), RPM/DEB package, build directory. Every
    source yields a generator. Only the package source supplies debug paths;
    all others use None. The process is terminated via die() when no source is
    set.
    """
    if args.binary_files:
        return (BinarySymbolInfo(name, None) for name in args.binary_files)

    if args.stdin_files:
        return (BinarySymbolInfo(line, None) for line in sys.stdin.read().splitlines())

    if args.pkg:
        return enumerate_pkg_files(args.pkg, args.symbol_pkg)

    if args.build_dir:
        return (BinarySymbolInfo(elf, None) for elf in find_elf_files(args.build_dir))

    die('No input method provided')
|
|
|
|
|
|
def process_binary(dump_syms, objcopy, binary, out_dir):
    """Dump symbols of a single binary file and move the result into place.

    Symbols are written to a temporary file inside 'out_dir' and then moved to
    <out_dir>/<module name>/<module id>/<module name>.sym, where name and id
    are taken from the MODULE header line emitted by dump_syms. Required
    directories are created if necessary.

    Args:
      dump_syms: path of the Breakpad dump_syms executable.
      objcopy: path of the Binutils objcopy executable.
      binary: object with 'path' and 'debug_path' attributes (BinarySymbolInfo).
      out_dir: directory below which the symbol file is placed.

    Returns:
      True on success, False if dump_syms exited with a non-zero status.

    Raises:
      Any exception raised while running the tools or moving the result; the
      temporary symbol file is removed before the exception propagates.
    """
    logging.info('Processing binary file: %s' % binary.path)
    ensure_dir_exists(out_dir)
    tmp_fd, tmp_file = tempfile.mkstemp(dir=out_dir, suffix='.sym')
    # Wrap the descriptor right away so that it is closed on every exit path
    # instead of depending on garbage collection of a temporary file object.
    sym_out = os.fdopen(tmp_fd, 'wb')
    # Bound before the try block so the cleanup below is safe even if creating
    # the temporary directory fails.
    tempdir = None
    try:
        # Create a temporary directory used for decompressing debug info
        tempdir = tempfile.mkdtemp()

        # Binaries can contain compressed debug symbols. Breakpad currently
        # does not support dumping symbols for binaries with compressed debug
        # symbols.
        #
        # As a workaround, this uses objcopy to create a copy of the binary with
        # the debug symbols decompressed. If the debug symbols are not compressed
        # in the original binary, objcopy simply makes a copy of the binary.
        # Breakpad is able to read symbols from the decompressed binary, and
        # those symbols work correctly in resolving a minidump from the original
        # compressed binary.
        # TODO: In theory, this could work with the binary.debug_path.
        decompressed_binary = os.path.join(tempdir, os.path.basename(binary.path))
        objcopy_retcode = subprocess.call(
            [objcopy, "--decompress-debug-sections", binary.path, decompressed_binary])

        # Run dump_syms on the binary.
        # If objcopy failed for some reason, fall back to running dump_syms
        # directly on the original binary. This is unlikely to work, but it is a
        # way of guaranteeing that objcopy is not the problem.
        args = [dump_syms, decompressed_binary]
        if objcopy_retcode != 0:
            sys.stderr.write('objcopy failed. Trying to run dump_sym directly.\n')
            args = [dump_syms, binary.path]

        if binary.debug_path:
            args.append(binary.debug_path)
        proc = subprocess.Popen(args, stdout=sym_out, stderr=subprocess.PIPE)
        _, stderr = proc.communicate()
        sym_out.close()
        if proc.returncode != 0:
            sys.stderr.write('dump_syms: Failed to dump symbols from %s, return code %s\n' %
                             (binary.path, proc.returncode))
            # communicate() returns bytes on Python 3, which a text stream such
            # as sys.stderr does not accept; decode before writing.
            if not isinstance(stderr, str):
                stderr = stderr.decode('utf-8', 'replace')
            sys.stderr.write(stderr)
            os.remove(tmp_file)
            return False

        # Parse the temporary file to determine the full target path.
        with open(tmp_file, 'r') as f:
            header = f.readline().strip()
        # Format of header is: MODULE os arch binary_id binary
        # The module name is the last field; limit the split so a name that
        # contains spaces is kept in one piece.
        _, _, _, binary_id, binary_name = header.split(' ', 4)
        out_path = os.path.join(out_dir, binary_name, binary_id)
        ensure_dir_exists(out_path)
        # Move the temporary file to its final destination.
        shutil.move(tmp_file, os.path.join(out_path, '%s.sym' % binary_name))
    except Exception:
        # Only need to clean up in case of errors.
        try:
            os.remove(tmp_file)
        except EnvironmentError:
            pass
        raise
    finally:
        # Closing an already closed file object is a no-op.
        sym_out.close()
        # Cleanup temporary directory
        if tempdir is not None:
            shutil.rmtree(tempdir)
    return True
|
|
|
|
|
|
def main():
    """Script entry point: dump symbols for all selected binaries, then exit.

    The exit status is 0 if every binary was processed successfully and 1 as
    soon as one of them fails; remaining binaries are then skipped. The process
    is also terminated via die() when the dump_syms or objcopy executable can
    neither be taken from the command line nor located in the toolchain.
    """
    logging.basicConfig(level=logging.INFO)
    args = parse_args()
    # Report missing tools with an explicit message; an 'assert' would give no
    # explanation and is skipped entirely when Python runs with -O.
    dump_syms = args.dump_syms or find_dump_syms_binary()
    if not dump_syms:
        die('Could not locate the dump_syms binary, specify it with --dump_syms')
    objcopy = args.objcopy or find_objcopy_binary()
    if not objcopy:
        die('Could not locate the objcopy binary, specify it with --objcopy')
    status = 0
    ensure_dir_exists(args.dest_dir)
    # The logic for handling DEB/RPM packages does not currently work with
    # parallelism, so disable parallelism if using the -r/--pkg option.
    if args.num_processes > 1 and not args.pkg:
        # Use a thread pool to go parallel
        thread_pool = ThreadPool(processes=args.num_processes)

        def processing_fn(binary):
            return process_binary(dump_syms, objcopy, binary, args.dest_dir)

        finished_cleanly = False
        try:
            results = thread_pool.imap_unordered(processing_fn, enumerate_binaries(args))
            for result in results:
                if not result:
                    status = 1
                    break
            else:
                finished_cleanly = True
        finally:
            # Stop handing out work after a failure or an exception; otherwise
            # just mark the pool as complete. Either way wait for the workers.
            if finished_cleanly:
                thread_pool.close()
            else:
                thread_pool.terminate()
            thread_pool.join()
    else:
        # For serial cases, simply avoid the ThreadPool altogether, as that makes it
        # easy to reason about.
        for binary in enumerate_binaries(args):
            if not process_binary(dump_syms, objcopy, binary, args.dest_dir):
                status = 1
                break
    sys.exit(status)
|
|
|
|
|
|
# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
|