Files
impala/tests/custom_cluster/test_breakpad.py
Michael Smith f4b2ef5a00 IMPALA-11275: log thread info during minidump
Writes ThreadDebugInfo to stdout/stderr when a minidump is generated to
capture thread and query details related to the dump. Example message:
> Minidump in thread [1790536]async-exec-thread running query
  1a47cc1e2df94cb4:88dfa08200000000, fragment instance
  0000000000000000:0000000000000000

Refactors DumpCallback so that repeated writes to STDOUT/STDERR are less
redundant.

Adds unit tests to run with ThreadDebugInfo. Removes the 'static' prefix
from DumpCallback so it can be invoked from unit tests, but doesn't add
it to the header as it's intended to be for internal use.

Testing:
- Added crash to Coordinator::Exec and manually tested dump handling.
- Added a new unit test for DumpCallback.
- Ran tests/custom_cluster/test_breakpad.py to verify nothing broke in
  refactor. Those tests don't have ThreadDebugInfo available.

Change-Id: Iea2bdf10db29a0f8ccbe5e767b708781d42a9b8a
Reviewed-on: http://gerrit.cloudera.org:8080/18508
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2022-05-13 21:09:57 +00:00

497 lines
21 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import glob
import os
import psutil
import pytest
import shutil
import tempfile
import time
from resource import setrlimit, RLIMIT_CORE, RLIM_INFINITY
from signal import SIGSEGV, SIGKILL, SIGUSR1, SIGTERM
from subprocess import CalledProcessError
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIfBuildType
DAEMONS = ['impalad', 'statestored', 'catalogd']
DAEMON_ARGS = ['impalad_args', 'state_store_args', 'catalogd_args']
class TestBreakpadBase(CustomClusterTestSuite):
"""Base class with utility methods for all breakpad tests."""
@classmethod
def get_workload(cls):
return 'functional-query'
def setup_method(self, method):
# Override parent
# The temporary directory gets removed in teardown_method() after each test.
self.tmp_dir = tempfile.mkdtemp()
def teardown_method(self, method):
# Override parent
# Stop the cluster to prevent future accesses to self.tmp_dir.
self.kill_cluster(SIGKILL)
assert self.tmp_dir
shutil.rmtree(self.tmp_dir)
@classmethod
def setup_class(cls):
super(TestBreakpadBase, cls).setup_class()
# Disable core dumps for this test
setrlimit(RLIMIT_CORE, (0, RLIM_INFINITY))
@classmethod
def teardown_class(cls):
# Re-enable core dumps
setrlimit(RLIMIT_CORE, (RLIM_INFINITY, RLIM_INFINITY))
def start_cluster_with_args(self, **kwargs):
cluster_options = []
for daemon_arg in DAEMON_ARGS:
daemon_options = " ".join("-{0}={1}".format(k, v) for k, v in kwargs.iteritems())
cluster_options.append("--{0}={1}".format(daemon_arg, daemon_options))
self._start_impala_cluster(cluster_options)
def start_cluster(self):
self.start_cluster_with_args(minidump_path=self.tmp_dir)
def kill_cluster(self, signal):
self.cluster.refresh()
processes = self.cluster.impalads + [self.cluster.catalogd, self.cluster.statestored]
processes = filter(None, processes)
self.kill_processes(processes, signal)
signal is SIGUSR1 or self.assert_all_processes_killed()
def kill_processes(self, processes, signal):
for process in processes:
process.kill(signal)
signal is SIGUSR1 or self.wait_for_all_processes_dead(processes)
def wait_for_all_processes_dead(self, processes, timeout=300):
for process in processes:
try:
# For every process in the list we might see the original Impala process plus a
# forked off child that is writing the minidump. We need to catch both.
for pid in process.get_pids():
print "Checking pid %s" % pid
psutil_process = psutil.Process(pid)
psutil_process.wait(timeout)
except psutil.NoSuchProcess:
# Process has exited in the meantime
pass
except psutil.TimeoutExpired:
raise RuntimeError("Unable to kill %s (pid %d) after %d seconds." %
(psutil_process.name, psutil_process.pid, timeout))
def get_num_processes(self, daemon):
self.cluster.refresh()
if daemon == 'impalad':
return len(self.cluster.impalads)
elif daemon == 'catalogd':
return self.cluster.catalogd and 1 or 0
elif daemon == 'statestored':
return self.cluster.statestored and 1 or 0
raise RuntimeError("Unknown daemon name: %s" % daemon)
def wait_for_num_processes(self, daemon, num_expected, timeout=30):
end = time.time() + timeout
self.cluster.refresh()
num_processes = self.get_num_processes(daemon)
while num_processes != num_expected and time.time() <= end:
time.sleep(1)
num_processes = self.get_num_processes(daemon)
return num_processes
def assert_all_processes_killed(self):
self.cluster.refresh()
assert not self.cluster.impalads
assert not self.cluster.statestored
assert not self.cluster.catalogd
def count_minidumps(self, daemon, base_dir=None):
base_dir = base_dir or self.tmp_dir
path = os.path.join(base_dir, daemon)
return len(glob.glob("%s/*.dmp" % path))
def count_all_minidumps(self, base_dir=None):
return sum((self.count_minidumps(daemon, base_dir) for daemon in DAEMONS))
def assert_num_minidumps_for_all_daemons(self, cluster_size, base_dir=None):
self.assert_num_logfile_entries(1)
assert self.count_minidumps('impalad', base_dir) == cluster_size
assert self.count_minidumps('statestored', base_dir) == 1
assert self.count_minidumps('catalogd', base_dir) == 1
def assert_num_logfile_entries(self, expected_count):
self.assert_impalad_log_contains('INFO', 'Wrote minidump to ',
expected_count=expected_count)
self.assert_impalad_log_contains('ERROR', 'Wrote minidump to ',
expected_count=expected_count)
self.assert_impalad_log_contains('INFO', 'Minidump with no thread info available.',
expected_count=expected_count)
self.assert_impalad_log_contains('ERROR', 'Minidump with no thread info available.',
expected_count=expected_count)
class TestBreakpadCore(TestBreakpadBase):
"""Core tests to check that the breakpad integration into the daemons works as
expected. This includes writing minidump when the daemons call abort(). Add tests here
that depend on functionality of Impala other than the breakpad integration itself.
"""
@pytest.mark.execute_serially
def test_abort_writes_minidump(self):
"""Check that abort() (e.g. hitting a DCHECK macro) writes a minidump."""
assert self.count_all_minidumps() == 0
failed_to_start = False
try:
# Calling with an unresolvable hostname will abort.
self.start_cluster_with_args(minidump_path=self.tmp_dir,
hostname="jhzvlthd")
except CalledProcessError:
failed_to_start = True
assert failed_to_start
# Don't check for minidumps until all processes have gone away so that
# the state of the cluster is not in flux.
self.wait_for_num_processes('impalad', 0)
assert self.count_minidumps('impalad') > 0
class TestBreakpadExhaustive(TestBreakpadBase):
"""Exhaustive tests to check that the breakpad integration into the daemons works as
expected. This includes writing minidump files on unhandled signals and rotating old
minidumps on startup.
"""
@classmethod
def setup_class(cls):
if cls.exploration_strategy() != 'exhaustive':
pytest.skip('These breakpad tests only run in exhaustive')
super(TestBreakpadExhaustive, cls).setup_class()
@pytest.mark.execute_serially
def test_minidump_creation(self):
"""Check that when a daemon crashes, it writes a minidump file."""
assert self.count_all_minidumps() == 0
self.start_cluster()
assert self.count_all_minidumps() == 0
cluster_size = self.get_num_processes('impalad')
self.kill_cluster(SIGSEGV)
self.assert_num_minidumps_for_all_daemons(cluster_size)
@pytest.mark.execute_serially
def test_sigusr1_writes_minidump(self):
"""Check that when a daemon receives SIGUSR1, it writes a minidump file."""
assert self.count_all_minidumps() == 0
self.start_cluster()
assert self.count_all_minidumps() == 0
cluster_size = self.get_num_processes('impalad')
self.kill_cluster(SIGUSR1)
# Breakpad forks to write its minidump files, wait for all the clones to terminate.
assert self.wait_for_num_processes('impalad', cluster_size) == cluster_size
assert self.wait_for_num_processes('catalogd', 1) == 1
assert self.wait_for_num_processes('statestored', 1) == 1
# Make sure impalad still answers queries.
client = self.create_impala_client()
self.execute_query_expect_success(client, "SELECT COUNT(*) FROM functional.alltypes")
# Kill the cluster. Sending SIGKILL will not trigger minidumps to be written.
self.kill_cluster(SIGKILL)
self.assert_num_minidumps_for_all_daemons(cluster_size)
@pytest.mark.execute_serially
def test_sigusr1_doesnt_kill(self):
"""Check that when minidumps are disabled and a daemon receives SIGUSR1, it does not
die.
"""
assert self.count_all_minidumps() == 0
self.start_cluster_with_args(enable_minidumps=False)
cluster_size = self.get_num_processes('impalad')
self.kill_cluster(SIGUSR1)
# Check that no minidumps have been written.
self.assert_num_logfile_entries(0)
assert self.count_all_minidumps() == 0
# Check that all daemons are still alive.
assert self.get_num_processes('impalad') == cluster_size
assert self.get_num_processes('catalogd') == 1
assert self.get_num_processes('statestored') == 1
@pytest.mark.execute_serially
def test_sigterm_no_minidumps(self):
"""Check that when a SIGTERM is caught, no minidump file is written.
After receiving SIGTERM there should be no impalad/catalogd/statestored
running.
"""
assert self.count_all_minidumps() == 0
self.start_cluster()
cluster_size = self.get_num_processes('impalad')
assert self.count_all_minidumps() == 0
# impalad/catalogd/statestored should be running.
assert cluster_size > 0
assert self.get_num_processes('catalogd') == 1
assert self.get_num_processes('statestored') == 1
# There should be no SIGTERM message in the log
# when the system starts.
self.assert_impalad_log_contains('INFO', 'Caught signal: SIGTERM. Daemon will exit',
expected_count=0)
self.kill_cluster(SIGTERM)
# There should be no impalad/catalogd/statestored running.
# There should be no minidump generated.
assert self.get_num_processes('impalad') == 0
assert self.get_num_processes('catalogd') == 0
assert self.get_num_processes('statestored') == 0
assert self.count_all_minidumps() == 0
uid = os.getuid()
# There should be a SIGTERM message in the log now
# since we raised one above.
log_str = 'Caught signal: SIGTERM. Daemon will exit.'
self.assert_impalad_log_contains('INFO', log_str, expected_count=1)
@pytest.mark.execute_serially
def test_minidump_relative_path(self):
"""Check that setting 'minidump_path' to a relative value results in minidump files
written to 'log_dir'.
"""
minidump_base_dir = os.path.join(os.environ.get('LOG_DIR', '/tmp'), 'minidumps')
shutil.rmtree(minidump_base_dir, ignore_errors=True)
# Omitting minidump_path as a parameter to the cluster will choose the default
# configuration, which is a FLAGS_log_dir/minidumps.
self.start_cluster_with_args()
assert self.count_all_minidumps(minidump_base_dir) == 0
cluster_size = self.get_num_processes('impalad')
self.kill_cluster(SIGSEGV)
self.assert_num_minidumps_for_all_daemons(cluster_size, minidump_base_dir)
shutil.rmtree(minidump_base_dir)
@pytest.mark.execute_serially
def test_minidump_cleanup(self):
"""Check that a limited number of minidumps is preserved during startup."""
assert self.count_all_minidumps() == 0
self.start_cluster()
cluster_size = self.get_num_processes('impalad')
self.kill_cluster(SIGSEGV)
self.assert_num_logfile_entries(1)
# Maximum number of minidumps that the impalads should keep for this test.
max_minidumps = 2
self.start_cluster_with_args(minidump_path=self.tmp_dir,
max_minidumps=max_minidumps,
logbufsecs=1)
# Wait for log maintenance thread to clean up minidumps asynchronously.
start = time.time()
expected_impalad_minidumps = min(cluster_size, max_minidumps)
while (self.count_minidumps('impalad') != expected_impalad_minidumps
and time.time() - start < 10):
time.sleep(0.1)
assert self.count_minidumps('impalad') == expected_impalad_minidumps
assert self.count_minidumps('statestored') == 1
assert self.count_minidumps('catalogd') == 1
@pytest.mark.execute_serially
def test_minidump_cleanup_thread(self):
"""Check that periodic rotation preserves a limited number of minidumps."""
assert self.count_all_minidumps() == 0
# Maximum number of minidumps that the impalads should keep for this test.
max_minidumps = 2
# Sleep interval for the log rotation thread.
rotation_interval = 1
self.start_cluster_with_args(minidump_path=self.tmp_dir,
max_minidumps=max_minidumps,
logbufsecs=rotation_interval)
cluster_size = self.get_num_processes('impalad')
# We trigger several rounds of minidump creation to make sure that all daemons wrote
# enough files to trigger rotation.
for i in xrange(max_minidumps + 1):
self.kill_cluster(SIGUSR1)
# Breakpad forks to write its minidump files, sleep briefly to allow the forked
# processes to start.
time.sleep(1)
# Wait for all the clones to terminate.
assert self.wait_for_num_processes('impalad', cluster_size) == cluster_size
assert self.wait_for_num_processes('catalogd', 1) == 1
assert self.wait_for_num_processes('statestored', 1) == 1
self.assert_num_logfile_entries(i + 1)
# Sleep long enough for log cleaning to take effect.
time.sleep(rotation_interval + 1)
assert self.count_minidumps('impalad') == min(cluster_size, max_minidumps)
assert self.count_minidumps('statestored') == max_minidumps
assert self.count_minidumps('catalogd') == max_minidumps
@pytest.mark.execute_serially
def test_disable_minidumps(self):
"""Check that setting enable_minidumps to false disables minidump creation."""
assert self.count_all_minidumps() == 0
self.start_cluster_with_args(enable_minidumps=False)
self.kill_cluster(SIGSEGV)
self.assert_num_logfile_entries(0)
@pytest.mark.execute_serially
def test_empty_minidump_path_disables_breakpad(self):
"""Check that setting the minidump_path to an empty value disables minidump creation.
"""
assert self.count_all_minidumps() == 0
self.start_cluster_with_args(minidump_path='')
self.kill_cluster(SIGSEGV)
self.assert_num_logfile_entries(0)
def trigger_single_minidump_and_get_size(self):
"""Kill a single impalad with SIGSEGV to make it write a minidump. Kill the rest of
the cluster. Clean up the single minidump file and return its size.
"""
self.cluster.refresh()
assert self.get_num_processes('impalad') > 0
# Make one impalad write a minidump.
self.kill_processes(self.cluster.impalads[:1], SIGSEGV)
# Kill the rest of the cluster.
self.kill_cluster(SIGKILL)
assert self.count_minidumps('impalad') == 1
# Get file size of that miniump.
path = os.path.join(self.tmp_dir, 'impalad')
minidump_file = glob.glob("%s/*.dmp" % path)[0]
minidump_size = os.path.getsize(minidump_file)
os.remove(minidump_file)
assert self.count_all_minidumps() == 0
return minidump_size
@pytest.mark.execute_serially
def test_limit_minidump_size(self):
"""Check that setting the 'minidump_size_limit_hint_kb' to a small value will reduce
the minidump file size.
"""
assert self.count_all_minidumps() == 0
# Generate minidump with default settings.
self.start_cluster()
full_minidump_size = self.trigger_single_minidump_and_get_size()
# Start cluster with limited minidump file size, we use a very small value, to ensure
# the resulting minidump will be as small as possible.
self.start_cluster_with_args(minidump_path=self.tmp_dir,
minidump_size_limit_hint_kb=1)
reduced_minidump_size = self.trigger_single_minidump_and_get_size()
# Check that the minidump file size has been reduced.
assert reduced_minidump_size < full_minidump_size
class TestLogging(TestBreakpadBase):
"""Exhaustive tests to check that impala log is rolled periodically, obeying
max_log_size and max_log_files, even in the presence of heavy stderr writing.
"""
@classmethod
def setup_class(cls):
if cls.exploration_strategy() != 'exhaustive':
pytest.skip('These logging tests only run in exhaustive')
super(TestLogging, cls).setup_class()
def start_cluster_with_args(self, cluster_size, log_dir, **kwargs):
cluster_options = []
for daemon_arg in DAEMON_ARGS:
daemon_options = " ".join("-{0}={1}".format(k, v) for k, v in kwargs.iteritems())
cluster_options.append("--{0}={1}".format(daemon_arg, daemon_options))
self._start_impala_cluster(cluster_options, cluster_size=cluster_size,
expected_num_impalads=cluster_size, impala_log_dir=log_dir)
def assert_logs(self, daemon, max_count, max_bytes):
"""Assert that there are at most 'max_count' of INFO + ERROR log files for the
specified daemon and the individual file size does not exceed 'max_bytes'.
Also assert that stdout/stderr are redirected to correct file on each rotation."""
log_dir = self.tmp_dir
log_paths = glob.glob("%s/%s*log.ERROR.*" % (log_dir, daemon)) \
+ glob.glob("%s/%s*log.INFO.*" % (log_dir, daemon))
assert len(log_paths) <= max_count
# group log_paths by pid and kind
log_group = {}
for path in sorted(log_paths):
tok = path.split('.')
key = tok[-1] + '.' + tok[-3] # pid + kind
if key in log_group:
log_group[key].append(path)
else:
log_group[key] = [path]
for key, paths in log_group.items():
for i in range(0, len(paths)):
try:
curr_path = paths[i]
# check log size
log_size = os.path.getsize(curr_path)
assert log_size <= max_bytes, "{} exceed {} bytes".format(curr_path, max_bytes)
if i < len(paths) - 1:
# check that we print the next_path in last line of this log file
next_path = paths[i + 1]
with open(curr_path, 'rb') as f:
f.seek(-2, os.SEEK_END)
while f.read(1) != b'\n':
f.seek(-2, os.SEEK_CUR)
last_line = f.readline().decode()
assert next_path in last_line
except OSError:
# The daemon might delete the log in the middle of assertion.
# In that case, do nothing and move on.
pass
def silent_remove(self, filename):
try:
os.remove(filename)
except OSError:
pass
def start_excessive_cerr_cluster(self, test_cluster_size=1, remove_symlink=False):
"""Check that impalad log is kept being rotated when most writing activity is coming
from stderr stream.
Along with LogFaultInjectionThread in init.cc, this test will fill impalad error logs
with approximately 128kb error messages per second."""
test_logbufsecs = 3
test_max_log_files = 2
test_max_log_size = 1 # 1 MB
test_error_msg = ('123456789abcde_' * 64) # 1 KB error message
test_debug_actions = 'LOG_MAINTENANCE_STDERR:FAIL@1.0@' + test_error_msg
daemon = 'impalad'
os.chmod(self.tmp_dir, 0744)
expected_log_max_bytes = int(1.2 * 1024**2) # 1.2 MB
self.assert_logs(daemon, 0, expected_log_max_bytes)
self.start_cluster_with_args(test_cluster_size, self.tmp_dir,
logbufsecs=test_logbufsecs,
max_log_files=test_max_log_files,
max_log_size=test_max_log_size,
debug_actions=test_debug_actions)
self.wait_for_num_processes(daemon, test_cluster_size, 30)
# Count both INFO and ERROR logs
expected_log_max_count = test_max_log_files * test_cluster_size * 2
# Wait for log maintenance thread to flush and rotate the logs asynchronously.
start = time.time()
while (time.time() - start < 40):
time.sleep(1)
self.assert_logs(daemon, expected_log_max_count, expected_log_max_bytes)
if (remove_symlink):
pattern = self.tmp_dir + '/' + daemon + '*'
symlinks = glob.glob(pattern + '.INFO') + glob.glob(pattern + '.ERROR')
for symlink in symlinks:
self.silent_remove(symlink)
@pytest.mark.execute_serially
def test_excessive_cerr(self):
"""Test excessive cerr activity with single node cluster."""
self.start_excessive_cerr_cluster()
@pytest.mark.execute_serially
def test_excessive_cerr_no_symlink(self):
"""Test excessive cerr activity with two node cluster and missing log symlinks."""
self.start_excessive_cerr_cluster(2, True)