mirror of
https://github.com/apache/impala.git
synced 2025-12-26 14:02:53 -05:00
Recently, we have seen many timeout failures of test_concurrent_ddls.py in S3 builds, e.g. IMPALA-10280, IMPALA-10301, IMPALA-10363. It would be helpful to dump the server stacktraces so we can understand why some RPCs are slow/stuck. This patch extracts the logic of dumping stacktraces in script-timeout-check.sh into a separate script, dump-stacktraces.sh. The script also dumps jstacks of HMS and NameNode. Dumping all these stacktraces is time-consuming, so we do them in parallel, which also helps to get consistent snapshots of all servers. When any test in test_concurrent_ddls.py times out, we use dump-stacktraces.sh to dump the stacktraces before exiting. Previously, some tests depended on pytest.mark.timeout for detecting timeouts, which makes it hard to add a customized callback for dumping server stacktraces. So this patch refactors test_concurrent_ddls.py to only use the timeout of multiprocessing. Tests: - Tested the scripts locally. - Verified the error handling of the timeout logic in Jenkins jobs. Change-Id: I514cf2d0ff842805c0abf7211f2a395151174173 Reviewed-on: http://gerrit.cloudera.org:8080/16800 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
80 lines
2.5 KiB
Bash
Executable File
80 lines
2.5 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# Helper script that dumps the stacktraces of catalogd, statestored, all impalads
# (if running), the Hive Metastore server and the NameNode. Result files are put in
# $IMPALA_TIMEOUT_LOGS_DIR.
|
|
#
|
|
|
|
# Dumps the backtraces of all threads of a running process using gdb.
# Globals:   IMPALA_TIMEOUT_LOGS_DIR (read) - directory that receives the result file
# Arguments: $1 - short name of the process, used in the result file name
#            $2 - id of the process to attach to
# Outputs:   a progress message on stdout; gdb's stdout is written to
#            ${IMPALA_TIMEOUT_LOGS_DIR}/<name>_<pid>_<timestamp>.txt
# Returns:   the exit status of the gdb command
function collect_gdb_backtraces() {
  # Keep the working variables local so that a foreground call cannot clobber
  # variables of the caller (e.g. a 'pid' loop variable).
  local name=$1
  local pid=$2
  local result
  result="${IMPALA_TIMEOUT_LOGS_DIR}/${name}_${pid}_$(date +%Y%m%d-%H%M%S).txt"
  echo "**** Generating backtrace of $name with process id: $pid to $result ****"
  gdb -ex "thread apply all bt" --batch -p "$pid" >"$result"
}
|
|
|
|
# Dumps the Java thread stacks of a running process using jstack.
# Globals:   IMPALA_TIMEOUT_LOGS_DIR (read) - directory that receives the result file
#            JAVA_HOME (read) - JDK installation that provides bin/jstack
# Arguments: $1 - short name of the process, used in the result file name
#            $2 - id of the process to inspect
# Outputs:   a progress message on stdout; jstack's stdout is written to
#            ${IMPALA_TIMEOUT_LOGS_DIR}/<name>_<pid>_jstack_<timestamp>.txt
# Returns:   the exit status of the jstack command
function collect_jstacks() {
  # Keep the working variables local so that a foreground call cannot clobber
  # variables of the caller (e.g. a 'pid' loop variable).
  local name=$1
  local pid=$2
  local result
  result="${IMPALA_TIMEOUT_LOGS_DIR}/${name}_${pid}_jstack_$(date +%Y%m%d-%H%M%S).txt"
  echo "**** Generating jstack of $name with process id: $pid to $result ****"
  # The command path is quoted so that a JAVA_HOME containing whitespace still works.
  "$JAVA_HOME/bin/jstack" -F "$pid" >"$result"
}
|
|
|
|
# Take stacktraces in parallel to get consistent snapshots. Every server is handled by
# its own background worker; the worker pids are recorded in WORKER_PIDS so that the
# script can wait for all of them before it exits.
WORKER_PIDS=()
mkdir -p "$IMPALA_TIMEOUT_LOGS_DIR"

# pgrep prints one pid per line; the command substitution is deliberately left
# unquoted so that it is split into one loop iteration per impalad.
for pid in $(pgrep impalad); do
  # Within a worker gdb runs first and jstack second; jstack is skipped when gdb fails.
  collect_gdb_backtraces impalad "$pid" && collect_jstacks impalad "$pid" &
  WORKER_PIDS+=("$!")
done

# Catalogd's process name may change. Use 'ps' directly to search the binary name.
# The grep pattern is quoted so that the shell cannot glob-expand it against a file
# named 'catalogd' in the current directory; the [c] bracket keeps grep from matching
# its own command line. Only the first matching process is dumped.
CATALOGD_PID=$(ps aux | grep '[c]atalogd' | awk 'NR == 1 {print $2}')
if [[ -n "$CATALOGD_PID" ]]; then
  collect_gdb_backtraces catalogd "$CATALOGD_PID" && \
    collect_jstacks catalogd "$CATALOGD_PID" &
  WORKER_PIDS+=("$!")
fi

# Only gdb backtraces are collected for statestored. If pgrep reports several pids,
# the first one listed is used.
STATESTORED_PID=$(pgrep statestored | awk 'NR == 1')
if [[ -n "$STATESTORED_PID" ]]; then
  collect_gdb_backtraces statestored "$STATESTORED_PID" &
  WORKER_PIDS+=("$!")
fi

# Only jstacks are collected for the Hive Metastore server. Pattern quoted for the
# same reason as for catalogd; only the first matching process is dumped.
HMS_PID=$(ps aux | grep '[H]iveMetaStore' | awk 'NR == 1 {print $2}')
if [[ -n "$HMS_PID" ]]; then
  collect_jstacks hms "$HMS_PID" &
  WORKER_PIDS+=("$!")
fi

# Only jstacks are collected for the NameNode. Pattern quoted for the same reason as
# for catalogd; only the first matching process is dumped.
NAMENODE_PID=$(ps aux | grep '[N]ameNode' | awk 'NR == 1 {print $2}')
if [[ -n "$NAMENODE_PID" ]]; then
  collect_jstacks namenode "$NAMENODE_PID" &
  WORKER_PIDS+=("$!")
fi

# Block until every background worker has finished writing its result file.
for pid in "${WORKER_PIDS[@]}"; do
  wait "$pid"
done
|
|
|