mirror of
https://github.com/apache/impala.git
synced 2026-01-05 12:01:11 -05:00
This is for review purposes only. This patch will be merged with David's big merge patch. Changes: 1) Make Kudu compilation dependent on the OS since not all OSs support Kudu. 2) Only run Kudu related tests when Kudu is supported (see #1). 3) Look for Kudu locally, but in a different location. To use a local build of Kudu, set KUDU_BUILD_DIR to the path Kudu was built in and set KUDU_CLIENT_DIR to the path Kudu was installed in. Example: git clone https://github.com/cloudera/kudu.git ...build 3rd party etc... mkdir -p $KUDU_BUILD_DIR cd $KUDU_BUILD_DIR cmake <path to Kudu source dir> make DESTDIR=$KUDU_CLIENT_DIR make install 4) Look for Kudu in the toolchain if not using a local Kudu build. 5) Add Kudu service startup scripts. The Kudu in the toolchain is actually a parcel that has been renamed (the contents were not modified in any way), which means the Kudu service binaries are there. Those binaries are now used to run the Kudu service. Change-Id: I3db88cbd27f2ea2394f011bc8d1face37411ed58
441 lines
14 KiB
Bash
Executable File
441 lines
14 KiB
Bash
Executable File
#!/bin/bash
|
|
|
|
# Copyright 2014 Cloudera Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# This will create/control/destroy a local hdfs+yarn+llama cluster (plus kms and, where
# supported, kudu).
|
|
#
|
|
# The original idea was to run each node on a different loopback address but
|
|
# https://jira.cloudera.com/browse/CDH-16602 makes that impossible for now. So all roles
|
|
# run on 127.0.0.1, just like the standard mini cluster included with hadoop. The
|
|
# difference is with this cluster, each role runs in its own process and has its own
|
|
# configs. For each node, the layout of the configs, logs, start/stop scripts, etc, is
|
|
# kept as close as possible to a real cluster. For example, the first node will live in
|
|
# the dir "cdh-<version>/node-1" and its logs would be at "cdh-<version>/node-1/var/log".
|
|
|
|
# Strict mode: abort on an unhandled command failure (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# On an unhandled failure, report the script name, the line number and the text of the
# offending source line (looked up with awk). Everything is quoted so the reported line
# is neither word-split nor glob-expanded.
trap 'echo "Error in $0 at line $LINENO: $(awk "NR == $LINENO" "$0")"' ERR

# Default IMPALA_KERBERIZE to empty so later "-z" tests are safe under "set -u". A value
# inherited from the environment is kept.
: "${IMPALA_KERBERIZE=}"

# Options: -v traces every command (set -x), -k requests a kerberized cluster. Any other
# option prints the usage line and exits with status 1.
while getopts vk OPT; do
  case "$OPT" in
    v) set -x ;;
    k) export IMPALA_KERBERIZE=1 ;;
    \?)
      echo "Usage: $0 [-v (verbose) -k (kerberize)] ACTION (see source...)"
      exit 1
      ;;
  esac
done
# Drop the parsed options so that $1 is the ACTION.
shift $((OPTIND - 1))
|
|
|
|
# Layout of the cluster on disk. Everything lives next to this script, in a directory
# named after the CDH major version.
DIR=$(dirname "$0")
NODES_DIR="$DIR/cdh$CDH_MAJOR_VERSION"
NODE_COUNT=3
NODE_PREFIX=node-
COMMON_NODE_TEMPLATE="$DIR/node_templates/common"
NODE_TEMPLATE="$DIR/node_templates/cdh$CDH_MAJOR_VERSION"
TEMPLATE_SUFFIX=".tmpl"

# Each process should be marked with this so a "pkill -f" can be done to nuke everything.
export KILL_CLUSTER_MARKER=IBelongToTheMiniCluster

# Services are started in this order and stopped in the reverse order.
SUPPORTED_SERVICES=(hdfs yarn)
if [[ "$CDH_MAJOR_VERSION" -ge 5 ]]; then
  SUPPORTED_SERVICES+=(llama kms)
fi
# KUDU_IS_SUPPORTED is run as a command, so it is expected to hold "true" or "false".
if $KUDU_IS_SUPPORTED; then
  SUPPORTED_SERVICES+=(kudu)
fi

# All DataNodes and NodeManagers need a unique but fixed address. The IP is fixed at
# 127.0.0.1, so the only difference is the port. The address must be fixed because it is
# used as an identifier. If a node were restarted with a different address it would be
# considered a new node. The values below are arbitrary and may conflict with existing
# services. Fixed ports were preferred to dynamically chosen free ports for consistency.
DATANODE_FREE_PORT_START=31000
NODEMANAGER_FREE_PORT_START=31100
KUDU_TS_RPC_FREE_PORT_START=31200

# Used to populate config templates later. Any changes made here have no effect on an
# existing cluster.
export HDFS_WEBUI_PORT=5070 # changed from 50070 so it is not ephemeral
export YARN_WEBUI_PORT=8088 # same as default
export LLAMA_WEBUI_PORT=1501 # same as default
export KMS_WEBUI_PORT=16000 # same as default
export KUDU_WEBUI_PORT=8051 # same as default

# Empty dirs that should be included in the templates. Since git ignores empty dirs it is
# easier to maintain them here. The value is a space-separated list of paths relative to
# a node's directory.
EMPTY_NODE_DIRS=$(echo data/dfs/{dn,nn} var/{run,lib/hadoop-hdfs,log} \
    var/{log,run}/kudu/{master,ts} var/lib/kudu/{master,ts}/{wal,data})
if [[ "$CDH_MAJOR_VERSION" -ge 5 ]]; then
  # Relative like every other entry; the entry is always appended to a node dir.
  EMPTY_NODE_DIRS+=" var/log/llama"
fi

EASY_ACCESS_LOG_DIR="$IMPALA_HOME/cluster_logs"
MINIKDC_INIT=${IMPALA_HOME}/testdata/bin/minikdc.sh

# find(1) predicate selecting executable files. The value is expanded unquoted by its
# users because the OS X form consists of two words.
if $IS_OSX; then
  FIND_EXECUTABLE_FILTER="-perm +0111"
else
  FIND_EXECUTABLE_FILTER="-executable"
fi
|
|
|
|
#
|
|
# Various initialization routines for a Kerberized cluster setup. We
# do this if either
# 1) IMPALA_KERBERIZE is set
# 2) The config files exist and use kerberos
# If either are true, then we set IMPALA_KERBERIZE ourselves, ensure the
# minikdc is started, source the appropriate env vars from it, and get
# ourselves a fresh TGT (Ticket-Granting-Ticket).
#
# Returns immediately (success) when kerberos is not in use. Exits the script with
# status 1 when kinit or the SASL GSSAPI libraries are missing, or when kinit fails.
#
function kerberize_setup {
  if is_kerberized; then
    export IMPALA_KERBERIZE=1
  fi

  # No kerberos? We're done.
  if [[ -z "${IMPALA_KERBERIZE}" ]]; then
    return
  fi

  # We must have the 'kinit' binary installed so that the minikdc can
  # grant the user a ticket-granting-ticket.
  if ! type -f kinit > /dev/null 2>&1; then
    echo "Unable to locate 'kinit' on this machine, it must be installed"
    echo "in order for you to actually communicate with the kerberized"
    echo "components of the cluster."
    echo "Try this (on ubuntu):"
    echo " --> $ sudo apt-get install krb5-user"
    echo
    exit 1
  fi

  # While we're checking things, make sure the SASL GSSAPI libraries
  # are installed. These aren't required until we go to start up
  # the impala daemons, but we might as well fail early...
  local sasl_dirs=(/usr/lib/sasl2 /usr/lib64/sasl2 /usr/local/lib/sasl2
      /usr/lib/x86_64-linux-gnu/sasl2)
  local sasl_dir
  local gssapi_found=false
  for sasl_dir in "${sasl_dirs[@]}"; do
    # compgen -G succeeds only if the glob matches at least one path.
    if compgen -G "$sasl_dir/libgssapi*" > /dev/null; then
      gssapi_found=true
      break
    fi
  done
  if ! $gssapi_found; then
    echo "Unable to locate the SASL GSSAPI libraries in the following path:"
    echo "${sasl_dirs[*]}"
    echo "This is required for Kerberos communication using SASL, which"
    echo "the Impala daemons depend on. To install (on Ubuntu), try:"
    echo " --> $ sudo apt-get install libsasl2-modules-gssapi-mit"
    echo
    exit 1
  fi

  # Starting it has no effect if it's already started...
  "${MINIKDC_INIT}" start

  # Source the appropriate minikdc environment variables
  . "${MINIKDC_ENV}"

  # Give ourselves a fresh TGT. Debate here whether that should be
  # ${USER} or 'impala'. Each has issues; see comments in buildall.sh.
  # Stick with ${USER} for now.
  # Tested directly in the "if" so the failure is reported whether or not errexit is
  # active in the calling context.
  if ! kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_USER}"; then
    echo "kinit failure; aborting"
    exit 1
  fi
}
|
|
|
|
# Return success if the cluster is kerberized, i.e. the hadoop client core-site.xml
# exists and contains the text "Kerberos settings" (case-insensitive). Returns 1
# otherwise.
function is_kerberized {
  local core_site
  core_site="$(get_hadoop_client_conf_dir)/core-site.xml"
  # If the config exists and has kerberos things in it, treat as kerberized
  if [[ -f "$core_site" ]] && grep -qi "Kerberos settings" "$core_site"; then
    return 0
  fi
  return 1
}
|
|
|
|
# Returns success if the cluster has been created on disk, failure otherwise.
function cluster_exists {
  # Just use the first node as an indicator...
  [[ -e "$NODES_DIR/${NODE_PREFIX}1" ]]
}
|
|
|
|
# Creates the on-disk layout of a new cluster under $NODES_DIR.
#
# For each of the $NODE_COUNT nodes this copies the common and version-specific node
# templates, removes the master role init scripts from every node but the first,
# creates the empty directories listed in $EMPTY_NODE_DIRS, links the node's log dir
# into $EASY_ACCESS_LOG_DIR, assigns the node its fixed ports and finally renders every
# "*$TEMPLATE_SUFFIX" file by substituting ${VAR} references from the environment.
#
# Exits the script with status 1 if a rendered file still contains an unresolved
# ${UPPER_CASE} reference.
function create_cluster {
  mkdir -p "$NODES_DIR"

  # Used to populate config templates later
  GROUP=$(id -gn)
  export GROUP

  # Blow away existing config files (so we don't pick up kerberos settings)
  rm -f "$(get_hadoop_client_conf_dir)"/*

  if [[ -n "${IMPALA_KERBERIZE}" ]]; then
    kerberize_setup
    echo "Creating Kerberized cluster."
  else
    # Stop the minikdc in case it was running
    . "${MINIKDC_ENV}"
    "${MINIKDC_INIT}" stop
  fi

  # Llama needs a mapping of DataNodes to NodeManagers and for that we'll need to know
  # the hostname hadoop has chosen. It should be the first entry in /etc/hosts for
  # 127.0.0.1.
  #
  # It is possible this hostname must also match the impala hostname, which will
  # never be "localhost".
  if $IS_OSX; then
    HADOOP_HOSTNAME=$(dscacheutil -q host -a ip_address 127.0.0.1 | head -n 1 \
        | awk '{print $2}')
  else
    if getent hosts 127.0.0.1 1>/dev/null; then
      HADOOP_HOSTNAME=$(getent hosts 127.0.0.1 | awk '{print $2}')
    else
      # This may be an error case...
      HADOOP_HOSTNAME=127.0.0.1
    fi
  fi

  # Start from an empty mapping so that a value inherited from the environment is never
  # carried into the generated configs.
  LLAMA_PORT_MAPPINGS=""

  local template_paths template_path actual_path

  # For consistency, the first node will host all the master roles, including llama. Llama
  # needs to know the ports of the datanodes and nodemanagers. The ports are generated
  # below as part of setting up each node. The slave nodes are created first so the ports
  # will be known when it is time to configure llama.
  for ((NODE_IDX = NODE_COUNT; NODE_IDX >= 1; NODE_IDX--)); do
    NODE=${NODE_PREFIX}$NODE_IDX
    NODE_DIR=$(get_node_dir "$NODE")

    echo "Creating $NODE at $NODE_DIR"

    mkdir -p "$NODE_DIR"
    cp -a "$COMMON_NODE_TEMPLATE"/* "$NODE_DIR"
    if [[ -e "$NODE_TEMPLATE" ]]; then
      cp -a "$NODE_TEMPLATE"/* "$NODE_DIR"
    fi
    if [[ $NODE_IDX -gt 1 ]]; then
      # Remove master role scripts from slave nodes
      rm -f "$NODE_DIR/etc/init.d/"{hdfs-namenode,yarn-resourcemanager} \
          "$NODE_DIR/etc/init.d/"{llama-application,kms,kudu-master}
    fi
    # EMPTY_NODE_DIRS is a space-separated list, so it is split on purpose.
    for EMPTY_NODE_DIR in $EMPTY_NODE_DIRS; do
      mkdir -p "$NODE_DIR/$EMPTY_NODE_DIR"
    done

    # Add some easy access links closer to IMPALA_HOME
    EASY_ACCESS_LOG_LINK="$EASY_ACCESS_LOG_DIR/cdh$CDH_MAJOR_VERSION-$NODE"
    if [[ ! -e "$EASY_ACCESS_LOG_LINK" ]]; then
      mkdir -p "$EASY_ACCESS_LOG_DIR"
      # The link is created under the name "log" and then moved into place; the move
      # also replaces a dangling link left behind by a deleted cluster.
      ln -s "$NODE_DIR/var/log" "$EASY_ACCESS_LOG_DIR"
      mv "$IMPALA_HOME/cluster_logs/log" "$EASY_ACCESS_LOG_LINK"
    fi

    # Template population
    DATANODE_PORT=$((DATANODE_FREE_PORT_START++))
    NODEMANAGER_PORT=$((NODEMANAGER_FREE_PORT_START++))
    KUDU_TS_RPC_PORT=$((KUDU_TS_RPC_FREE_PORT_START++))
    echo "$NODE will use ports DATANODE_PORT=$DATANODE_PORT," \
        "NODEMANAGER_PORT=$NODEMANAGER_PORT, and KUDU_TS_RPC_PORT=$KUDU_TS_RPC_PORT"

    # Escape the first : to workaround https://jira.cloudera.com/browse/CDH-16840
    LLAMA_PORT_MAPPINGS+="$HADOOP_HOSTNAME\\:$DATANODE_PORT="
    # One mapping per line.
    LLAMA_PORT_MAPPINGS+="$HADOOP_HOSTNAME:$NODEMANAGER_PORT"$'\n'
    export NODE NODE_DIR DATANODE_PORT NODEMANAGER_PORT LLAMA_PORT_MAPPINGS
    export KUDU_TS_RPC_PORT

    # Collect the template paths up front (NUL-delimited, so paths containing
    # whitespace survive) because the loop below adds and removes files in the tree.
    template_paths=()
    while IFS= read -r -d '' template_path; do
      template_paths+=("$template_path")
    done < <(find "$NODE_DIR" -name "*$TEMPLATE_SUFFIX" -print0)

    # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on old bash.
    for template_path in ${template_paths[@]+"${template_paths[@]}"}; do
      actual_path="${template_path%"$TEMPLATE_SUFFIX"}"

      # Search for strings like ${FOO}, if FOO is defined in the environment then replace
      # "${FOO}" with the environment value.
      perl -wpl -e 's/\$\{([^}]+)\}/defined $ENV{$1} ? $ENV{$1} : $&/eg' \
          "$template_path" > "$actual_path.1"

      # Chop out everything between the BEGIN/END Kerberos comments if
      # not kerberized
      if [[ -z "${IMPALA_KERBERIZE}" ]]; then
        sed '/<!-- BEGIN Kerberos/,/END Kerberos settings -->/d' \
            "$actual_path.1" > "$actual_path"
      else
        cp "$actual_path.1" "$actual_path"
      fi

      # Assumes that environment variables will be ALL CAPS...
      # grep is deliberately not silenced: the offending lines are shown to the user.
      if grep '\${[A-Z_]*}' "$actual_path"; then
        echo "Found undefined variables in $actual_path, aborting"
        exit 1
      fi

      # Carry the template's executable bit over to the rendered file.
      if [[ -x "$template_path" ]]; then
        chmod u+x "$actual_path"
      fi
      rm "$template_path" "$actual_path.1"
    done
  done
}
|
|
|
|
# Starts every supported service, in the order given by SUPPORTED_SERVICES, then waits
# ten seconds and reports the cluster status.
#
# Returns 1 if the cluster has not been created, otherwise the status of
# check_cluster_status. Exits the script with status 1 if a kerberized start is
# requested against a cluster whose configs are not kerberized.
function start_cluster {
  if ! cluster_exists; then
    echo "The cluster must be created first"
    return 1
  fi

  if [[ -n "${IMPALA_KERBERIZE}" ]] && ! is_kerberized; then
    echo "Kerberized start requested, but the config files aren't set up"
    echo "for kerberos. Destroy the cluster and rebuild it:"
    echo " --> $ ./testdata/cluster/admin delete_cluster"
    echo " --> $ IMPALA_KERBERIZE=1 ./testdata/cluster/admin create_cluster"
    exit 1
  fi

  kerberize_setup
  if is_kerberized; then
    echo "Starting Kerberized cluster."
  fi

  local service webui_port_var
  for service in "${SUPPORTED_SERVICES[@]}"; do
    # Name of the variable holding this service's web UI port, e.g. HDFS_WEBUI_PORT;
    # it is read through indirect expansion below.
    webui_port_var="$(echo "$service" | awk '{print toupper($0)}')_WEBUI_PORT"
    echo "Starting ${service} (Web UI - http://localhost:${!webui_port_var})"
    # The pattern is passed quoted; it is matched by find, not expanded by the shell.
    exec_init_script "$service*" start
    if [[ "$service" == "hdfs" ]]; then
      chmod_hdfs_root
    fi
  done

  # Check if the cluster is still alive...
  sleep 10
  # Last command, so its status is the function's return status.
  check_cluster_status
}
|
|
|
|
# Opens up the permissions of the HDFS root directory on a kerberized cluster. Does
# nothing when the cluster is not kerberized.
function chmod_hdfs_root {
  # When created, the cluster only contains the root directory with
  # 0755 permissions and owned by hdfs. In a kerberized
  # environment, this means that no one but hdfs can create
  # anything! Chmod / to 01777 so that other users can create
  # things. Insecure, but easy. While we're here, make other sane
  # and permissive initial directories
  if ! is_kerberized; then
    return 0
  fi

  # Remember who we are: third field of klist's "Default principal: ..." line.
  local previous_principal
  previous_principal=$(klist | awk '/^Default/ {print $3}')

  # Become hdfs:
  kinit -k -t "${KRB5_KTNAME}" "${MINIKDC_PRINC_HDFS}"
  # Do the chmod, etc
  hadoop fs -chmod 1777 /
  # A missing /tmp is taken to mean the initial directories were never created.
  if ! hadoop fs -ls /tmp > /dev/null 2>&1; then
    local hdfs_dir
    for hdfs_dir in /tmp /home /user; do
      hadoop fs -mkdir "$hdfs_dir"
      hadoop fs -chmod 1777 "$hdfs_dir"
    done
  fi
  # Become ourselves again.
  kinit -k -t "${KRB5_KTNAME}" "${previous_principal}"
}
|
|
|
|
# Runs an init script action on every node, in parallel.
#
# Arguments:
#   $1    - name (or find -path glob, e.g. "hdfs*") of the init script(s) to run
#   $2... - action and any extra arguments handed to each script
# "start" is only issued to scripts whose "status" reports not running and "stop" only
# to those that report running; any other action is passed through unconditionally.
# Returns 1 if any of the invoked scripts failed, 0 otherwise.
function exec_init_script {
  local script_name="$1"
  shift

  # Collect the matching scripts first (NUL-delimited so paths with whitespace
  # survive). FIND_EXECUTABLE_FILTER is split on purpose: it may hold two words.
  local scripts=()
  local script
  while IFS= read -r -d '' script; do
    scripts+=("$script")
  done < <(find "$NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$script_name" \
      $FIND_EXECUTABLE_FILTER -type f -print0)

  local pids=()
  local running
  # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on old bash.
  for script in ${scripts[@]+"${scripts[@]}"}; do
    if "$script" status &>/dev/null; then
      running=true
    else
      running=false
    fi
    case $1 in
      start) if ! $running; then "$script" start; fi ;;
      stop) if $running; then "$script" stop; fi ;;
      *) "$script" "$@" ;;
    esac &
    pids+=("$!")
  done

  # Always wait for every job so their output is complete before returning, but
  # remember whether any of them failed. A bare "wait" would discard the statuses.
  local pid
  local status=0
  for pid in ${pids[@]+"${pids[@]}"}; do
    if ! wait "$pid"; then
      status=1
    fi
  done
  return $status
}
|
|
|
|
# Reports whether the roles of the cluster are running by asking every executable init
# script of every supported service, on every node, for its "status".
#
# Prints "The cluster is running" and returns 0 when no role reports a failure.
# Otherwise returns 1 after printing "The cluster is not running" (every role is down)
# or one line per role that is down. Also returns 1 if the cluster does not exist.
function check_cluster_status {
  if ! cluster_exists; then
    echo "The cluster does not exist"
    return 1
  fi

  local role_count=0
  local not_running=()
  local node_dir service script
  local scripts
  for node_dir in "$NODES_DIR/$NODE_PREFIX"*; do
    for service in "${SUPPORTED_SERVICES[@]}"; do
      # Collect the scripts first (NUL-delimited so paths with whitespace survive).
      # FIND_EXECUTABLE_FILTER is split on purpose: it may hold two words.
      scripts=()
      while IFS= read -r -d '' script; do
        scripts+=("$script")
      done < <(find "$node_dir" -path "*/etc/init.d/$service*" \
          $FIND_EXECUTABLE_FILTER -type f -print0)

      # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on old bash.
      for script in ${scripts[@]+"${scripts[@]}"}; do
        role_count=$((role_count + 1))
        if ! "$script" status &>/dev/null; then
          # The leading "\n" is expanded by "echo -e" below.
          not_running+=("\n$(basename "$script") is not running on $(basename "$node_dir")")
        fi
      done
    done
  done

  case ${#not_running[@]} in
    0)
      echo "The cluster is running"
      return
      ;;
    "$role_count")
      echo "The cluster is not running"
      return 1
      ;;
    *)
      echo -e "${not_running[@]}"
      return 1
      ;;
  esac
}
|
|
|
|
# Stops every supported service (if the cluster exists) and then kills any process
# still carrying the cluster marker. Always returns success.
function stop_cluster {
  if cluster_exists; then
    # Stop services in reverse order to give them a chance to shutdown cleanly
    local service_idx service
    for ((service_idx = ${#SUPPORTED_SERVICES[@]} - 1; service_idx >= 0; service_idx--)); do
      service="${SUPPORTED_SERVICES[$service_idx]}"
      echo "Stopping ${service}"
      # The pattern is passed quoted; it is matched by find, not expanded by the shell.
      exec_init_script "$service*" stop
    done
  fi
  # Kill everything anyways just in case a git clean -xdf was done
  # (pkill returns non-zero when nothing matched, which is not an error here.)
  pkill -u "$USER" -f "$KILL_CLUSTER_MARKER" || true
}
|
|
|
|
# Removes the contents of every node's NameNode and DataNode data directories. The
# directories themselves, and everything else in the cluster, are kept.
function delete_data {
  # ${NODES_DIR:?} aborts instead of expanding to a path rooted at "/" should
  # NODES_DIR ever be empty.
  rm -rf "${NODES_DIR:?}/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
}
|
|
|
|
# Kills every process carrying the cluster marker and removes the whole cluster
# directory.
function delete_cluster {
  # pkill returns non-zero when nothing matched, which is not an error here.
  pkill -u "$USER" -f "$KILL_CLUSTER_MARKER" || true
  # ${NODES_DIR:?} aborts rather than running "rm -rf" on an empty path.
  rm -rf "${NODES_DIR:?}"
}
|
|
|
|
# Prints the canonical (symlink-free, absolute) path of a node's directory.
#
# Arguments:
#   $1 - node name, e.g. "node-1"
function get_node_dir {
  # "readlink -f" is run under the name greadlink on OS X.
  local readlink_cmd=readlink
  if $IS_OSX; then
    readlink_cmd=greadlink
  fi
  "$readlink_cmd" -f "$NODES_DIR/$1"
}
|
|
|
|
# Prints the path of the hadoop client config dir, which lives on the first node.
function get_hadoop_client_conf_dir {
  printf '%s\n' "$NODES_DIR/${NODE_PREFIX}1/etc/hadoop/conf"
}
|
|
|
|
# Dispatch: the first remaining argument names the function to run and the rest are
# passed to it as arguments.
if [[ $# -eq 0 ]]; then
  # Without this check "set -u" would abort on $1 with an unbound-variable error.
  echo "Usage: $0 [-v (verbose) -k (kerberize)] ACTION (see source...)"
  exit 1
fi
COMMAND=$1
shift
# Use an "if" to avoid triggering the ERR trap.
# NOTE: because the action runs as the condition of an "if", bash also ignores
# "set -e" for every command executed inside the called function, so functions must
# check the commands they care about explicitly.
if ! "$COMMAND" "$@"; then
  exit 1
fi
|