Files
impala/bin/start-impala-cluster.py
Henry Robinson 8e5848eaf8 RM fixes to get tests passing
* One last NotifyThreadUsageChange() mismatched pair
* Don't set resource in plan fragment params if there isn't a resource
  available. This fixes the problem where if no fragment with resources
  was assigned to the same node as the coordinator, the coordinator
  would have a dummy resource allocation which didn't work with
  expansion.
* Substitute #ID in all impalad arguments to start-impala-cluster.py
  with the 0-indexed ID of the impalad being started. This is required
  to have different Impala processes use different cgroups.

Change-Id: If8c8fd8bef0809bdaf16115a45a9695fc2bf3e1b
(cherry picked from commit c71ce45e97570b8c09900eb5ae2e26984d3306a4)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2060
Tested-by: jenkins
Reviewed-by: Henry Robinson <henry@cloudera.com>
2014-03-24 15:07:45 -07:00

287 lines
13 KiB
Python
Executable File

#!/usr/bin/env python
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Starts up an Impala cluster (ImpalaD + State Store) with the specified number of
# ImpalaD instances. Each ImpalaD runs on a different port allowing this to be run
# on a single machine.
import os
import sys
from time import sleep, time
from optparse import OptionParser
# Options
parser = OptionParser()
parser.add_option("-s", "--cluster_size", type="int", dest="cluster_size", default=3,
help="Size of the cluster (number of impalad instances to start).")
parser.add_option("--build_type", dest="build_type", default= 'debug',
help="Build type to use - debug / release")
parser.add_option("--impalad_args", dest="impalad_args", default="",
help="Additional arguments to pass to each Impalad during startup")
parser.add_option("--state_store_args", dest="state_store_args", default="",
help="Additional arguments to pass to State Store during startup")
parser.add_option("--catalogd_args", dest="catalogd_args", default="",
help="Additional arguments to pass to the Catalog Service at startup")
parser.add_option("--kill", "--kill_only", dest="kill_only", action="store_true",
default=False, help="Instead of starting the cluster, just kill all"\
" the running impalads and the statestored.")
parser.add_option("--force_kill", dest="force_kill", action="store_true", default=False,
help="Force kill impalad and statestore processes.")
parser.add_option("-r", "--restart_impalad_only", dest="restart_impalad_only",
action="store_true", default=False,
help="Restarts only the impalad processes")
parser.add_option("--in-process", dest="inprocess", action="store_true", default=False,
help="Start all Impala backends and state store in a single process.")
parser.add_option("--log_dir", dest="log_dir", default="/tmp",
help="Directory to store output logs to.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default=False,
help="Prints all output to stderr/stdout.")
parser.add_option("--wait_for_cluster", dest="wait_for_cluster", action="store_true",
default=False, help="Wait until the cluster is ready to accept "\
"queries before returning.")
parser.add_option("--log_level", type="int", dest="log_level", default=1,
help="Set the impalad backend logging level")
parser.add_option("--jvm_args", dest="jvm_args", default="",
help="Additional arguments to pass to the JVM(s) during startup.")
options, args = parser.parse_args()
IMPALA_HOME = os.environ['IMPALA_HOME']
KNOWN_BUILD_TYPES = ['debug', 'release']
IMPALAD_PATH = os.path.join(IMPALA_HOME,
'bin/start-impalad.sh -build_type=%s' % options.build_type)
STATE_STORE_PATH = os.path.join(IMPALA_HOME, 'be/build',
options.build_type, 'statestore/statestored')
CATALOGD_PATH = os.path.join(IMPALA_HOME,
'bin/start-catalogd.sh -build_type=%s' % options.build_type)
MINI_IMPALA_CLUSTER_PATH = IMPALAD_PATH + " -in-process"
IMPALA_SHELL = os.path.join(IMPALA_HOME, 'bin/impala-shell.sh')
IMPALAD_PORTS = ("-beeswax_port=%d -hs2_port=%d -be_port=%d "
"-state_store_subscriber_port=%d -webserver_port=%d "
"-llama_callback_port=%d")
JVM_ARGS = "-jvm_debug_port=%s -jvm_args=%s"
BE_LOGGING_ARGS = "-log_filename=%s -log_dir=%s -v=%s -logbufsecs=5"
CLUSTER_WAIT_TIMEOUT_IN_SECONDS = 240
def exec_impala_process(cmd, args, stderr_log_file_path):
redirect_output = str()
if options.verbose:
args += ' -logtostderr=1'
else:
redirect_output = "1>%s" % stderr_log_file_path
cmd = '%s %s %s 2>&1 &' % (cmd, args, redirect_output)
os.system(cmd)
def kill_cluster_processes(force=False):
kill_matching_processes('catalogd')
kill_matching_processes('impalad')
kill_matching_processes('statestored')
kill_matching_processes('mini-impala-cluster')
def kill_matching_processes(binary_name, force=False):
"""Kills all processes with the given binary name"""
# -w = Wait for processes to die.
kill_cmd = "killall -w"
if force: kill_cmd += " -9"
os.system("%s %s" % (kill_cmd, binary_name))
def start_statestore():
print "Starting State Store logging to %s/statestored.INFO" % options.log_dir
stderr_log_file_path = os.path.join(options.log_dir, "statestore-error.log")
args = "%s %s" % (build_impalad_logging_args(0, "statestored"),
options.state_store_args)
exec_impala_process(STATE_STORE_PATH, args, stderr_log_file_path)
def start_catalogd():
print "Starting Catalog Service logging to %s/catalogd.INFO" % options.log_dir
stderr_log_file_path = os.path.join(options.log_dir, "catalogd-error.log")
args = "%s %s %s" % (build_impalad_logging_args(0, "catalogd"),
options.catalogd_args, build_jvm_args(options.cluster_size))
exec_impala_process(CATALOGD_PATH, args, stderr_log_file_path)
def start_mini_impala_cluster(cluster_size):
print ("Starting in-process Impala Cluster logging "
"to %s/mini-impala-cluster.INFO" % options.log_dir)
args = "-num_backends=%s %s" %\
(cluster_size, build_impalad_logging_args(0, 'mini-impala-cluster'))
stderr_log_file_path = os.path.join(options.log_dir, 'mini-impala-cluster-error.log')
exec_impala_process(MINI_IMPALA_CLUSTER_PATH, args, stderr_log_file_path)
def build_impalad_port_args(instance_num):
BASE_BEESWAX_PORT = 21000
BASE_HS2_PORT = 21050
BASE_BE_PORT = 22000
BASE_STATE_STORE_SUBSCRIBER_PORT = 23000
BASE_WEBSERVER_PORT = 25000
BASE_LLAMA_CALLBACK_PORT = 28000
return IMPALAD_PORTS % (BASE_BEESWAX_PORT + instance_num, BASE_HS2_PORT + instance_num,
BASE_BE_PORT + instance_num,
BASE_STATE_STORE_SUBSCRIBER_PORT + instance_num,
BASE_WEBSERVER_PORT + instance_num,
BASE_LLAMA_CALLBACK_PORT + instance_num)
def build_impalad_logging_args(instance_num, service_name):
log_file_path = os.path.join(options.log_dir, "%s.INFO" % service_name)
return BE_LOGGING_ARGS % (service_name, options.log_dir, options.log_level)
def build_jvm_args(instance_num):
BASE_JVM_DEBUG_PORT = 30000
return JVM_ARGS % (BASE_JVM_DEBUG_PORT + instance_num, options.jvm_args)
def start_impalad_instances(cluster_size):
# Start each impalad instance and optionally redirect the output to a log file.
for i in range(options.cluster_size):
if i == 0:
# The first impalad always logs to impalad.INFO
service_name = "impalad"
else:
service_name = "impalad_node%s" % i
args = "%s %s %s %s" %\
(build_impalad_logging_args(i, service_name), build_jvm_args(i),
build_impalad_port_args(i), options.impalad_args.replace("#ID", str(i)))
stderr_log_file_path = os.path.join(options.log_dir, '%s-error.log' % service_name)
exec_impala_process(IMPALAD_PATH, args, stderr_log_file_path)
def wait_for_impala_process_count(impala_cluster, retries=3):
"""Checks that the desired number of impalad/statestored processes are running.
Refresh until the number running impalad/statestored processes reaches the expected
number based on CLUSTER_SIZE, or the retry limit is hit. Failing this, raise a
RuntimeError.
"""
for i in range(retries):
if len(impala_cluster.impalads) < options.cluster_size or \
not impala_cluster.statestored or not impala_cluster.catalogd:
sleep(2)
impala_cluster.refresh()
msg = str()
if len(impala_cluster.impalads) < options.cluster_size:
impalads_found = len(impala_cluster.impalads)
msg += "Expected %d impalad(s), only %d found\n" %\
(options.cluster_size, impalads_found)
if not impala_cluster.statestored:
msg += "statestored failed to start.\n"
if not impala_cluster.catalogd:
msg += "catalogd failed to start.\n"
if msg:
raise RuntimeError, msg
def wait_for_cluster_web(timeout_in_seconds=CLUSTER_WAIT_TIMEOUT_IN_SECONDS):
"""Checks if the cluster is "ready"
A cluster is deemed "ready" if:
- All backends are registered with the statestore.
- Each impalad knows about all other impalads.
This information is retrieved by querying the statestore debug webpage
and each individual impalad's metrics webpage.
"""
impala_cluster = ImpalaCluster()
# impalad processes may take a while to come up.
wait_for_impala_process_count(impala_cluster)
for impalad in impala_cluster.impalads:
impalad.service.wait_for_num_known_live_backends(options.cluster_size,
timeout=CLUSTER_WAIT_TIMEOUT_IN_SECONDS, interval=2)
wait_for_catalog(impalad, timeout_in_seconds=CLUSTER_WAIT_TIMEOUT_IN_SECONDS)
def wait_for_catalog(impalad, timeout_in_seconds):
"""Waits for the impalad catalog to become ready"""
start_time = time()
catalog_ready = False
while (time() - start_time < timeout_in_seconds and not catalog_ready):
try:
num_dbs = impalad.service.get_metric_value('catalog.num-databases')
num_tbls = impalad.service.get_metric_value('catalog.num-tables')
catalog_ready = impalad.service.get_metric_value('catalog.ready')
print 'Waiting for Catalog... Status: %s DBs / %s tables (ready=%s)' %\
(num_dbs, num_tbls, catalog_ready)
except Exception, e:
print e
sleep(1)
if not catalog_ready:
raise RuntimeError, 'Catalog was not initialized in expected time period.'
def wait_for_cluster_cmdline(timeout_in_seconds=CLUSTER_WAIT_TIMEOUT_IN_SECONDS):
"""Checks if the cluster is "ready" by executing a simple query in a loop"""
start_time = time()
while os.system('%s -i localhost:21000 -q "%s"' % (IMPALA_SHELL, 'select 1')) != 0:
if time() - timeout_in_seconds > start_time:
raise RuntimeError, 'Cluster did not start within %d seconds' % timeout_in_seconds
print 'Cluster not yet available. Sleeping...'
sleep(2)
if __name__ == "__main__":
if options.kill_only:
kill_cluster_processes(force=options.force_kill)
sys.exit(0)
if options.build_type not in KNOWN_BUILD_TYPES:
print 'Invalid build type %s' % options.build_type
print 'Valid values: %s' % ', '.join(KNOWN_BUILD_TYPES)
sys.exit(1)
if options.cluster_size <= 0:
print 'Please specify a cluster size > 0'
sys.exit(1)
# Kill existing cluster processes based on the current configuration.
if options.restart_impalad_only:
if options.inprocess:
print 'Cannot perform individual component restarts using an in-process cluster'
sys.exit(1)
kill_matching_processes('impalad', force=options.force_kill)
else:
kill_cluster_processes(force=options.force_kill)
try:
import json
wait_for_cluster = wait_for_cluster_web
except ImportError:
print "json module not found, checking for cluster startup through the command-line"
wait_for_cluster = wait_for_cluster_cmdline
# If ImpalaCluster cannot be imported, fall back to the command-line to check
# whether impalads/statestore are up.
try:
from tests.common.impala_cluster import ImpalaCluster
if options.restart_impalad_only:
impala_cluster = ImpalaCluster()
if not impala_cluster.statestored or not impala_cluster.catalogd:
print 'No running statestored or catalogd detected. Restarting entire cluster.'
options.restart_impalad_only = False
except ImportError:
print 'ImpalaCluster module not found.'
# TODO: Update this code path to work similar to the ImpalaCluster code path when
# restarting only impalad processes. Specifically, we should do a full cluster
# restart if either the statestored or catalogd processes are down, even if
# restart_only_impalad=True.
wait_for_cluster = wait_for_cluster_cmdline
if options.inprocess:
# The statestore and the impalads start in the same process.
start_mini_impala_cluster(options.cluster_size)
wait_for_cluster_cmdline()
else:
try:
if not options.restart_impalad_only:
start_statestore()
start_catalogd()
start_impalad_instances(options.cluster_size)
wait_for_cluster()
except Exception, e:
print 'Error starting cluster: %s' % e
sys.exit(1)
print 'Impala Cluster Running with %d nodes.' % options.cluster_size