mirror of
https://github.com/apache/impala.git
synced 2026-01-04 09:00:56 -05:00
This is the second patch to address IMPALA-4684. The first patch exposed a transient Zookeeper connection error on RHEL7. This patch introduces a retry (up to 3 times), and somewhat better logging. Tested by running tests against an RHEL7 instance and confirming that all HBase nodes start up. Change-Id: I44b4eec342addcfe489f94c332bbe14225c9968c Reviewed-on: http://gerrit.cloudera.org:8080/5554 Reviewed-by: Alex Behm <alex.behm@cloudera.com> Tested-by: Internal Jenkins
194 lines
6.8 KiB
Python
Executable File
194 lines
6.8 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
"""Given a series of hosts and Zookeeper nodes, make sure that each node is accessible.
|
|
"""
|
|
|
|
import argparse
|
|
import hdfs
|
|
import logging
|
|
import pprint
|
|
import requests
|
|
import sys
|
|
import time
|
|
|
|
from contextlib import closing
|
|
from kazoo.client import KazooClient
|
|
from kazoo.exceptions import NoNodeError, ConnectionLoss
|
|
from kazoo.handlers.threading import KazooTimeoutError
|
|
|
|
# Script-wide logger: records at INFO and above are sent to a stream handler
# (logging.StreamHandler writes to stderr when no stream is given).
LOGGER = logging.getLogger('hbase_check')
LOGGER.setLevel(logging.INFO)
LOGGER.addHandler(logging.StreamHandler())

# Default values for the command line options defined in parse_args().
TIMEOUT_SECONDS = 30
HDFS_HOST = '127.0.0.1:5070'
ZK_HOSTS = '127.0.0.1:2181'
HBASE_NODES = ['/hbase/master', '/hbase/rs']
ADMIN_USER = 'admin'

# How many times a lost Zookeeper connection is retried before giving up.
MAX_ZOOKEEPER_CONNECTION_RETRIES = 3
|
|
|
|
|
|
def parse_args():
    """Parse and return command line args.

    Returns:
      argparse.Namespace with the attributes timeout, hdfs_host, admin_user,
      zookeeper_hosts and nodes. nodes is the list of znodes given on the command
      line, or a copy of HBASE_NODES when none were given.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--timeout', '-t', type=int, default=TIMEOUT_SECONDS,
                        help=('Number of seconds to try to get znode before giving up. '
                              'Default is {0} seconds.'.format(TIMEOUT_SECONDS)))

    parser.add_argument('--hdfs_host', '-s', default=HDFS_HOST,
                        help=('Host:port where the HDFS web host is running, '
                              'e.g, 0.0.0.0:5070. Default is {0}.'.format(HDFS_HOST)))

    parser.add_argument('--admin_user', '-u', default=ADMIN_USER,
                        help='Cluster admin username. Default is {0}.'.format(ADMIN_USER))

    parser.add_argument('--zookeeper_hosts', '-z', default=ZK_HOSTS,
                        help=('Comma-delineated string of hosts in host:PORT format, '
                              'e.g, 0.0.0.0:2181. Default is {0}.'.format(ZK_HOSTS)))

    # With action='append', argparse appends command line values to a non-empty
    # default instead of replacing it, so the default znodes could never be
    # overridden. Use None as the default and fill in the real defaults after
    # parsing. '--node' matches the other long options; the original '-node'
    # spelling is kept so existing invocations continue to work.
    parser.add_argument('--node', '-node', '-n', action='append', dest='nodes',
                        default=None,
                        help=('HBase znode to check. Can be specified multiple times. '
                              'Defaults are -n {0}.'.format(' -n '.join(HBASE_NODES))))

    args = parser.parse_args()
    if args.nodes is None:
        # Copy so that callers can never mutate the module-level default list.
        args.nodes = list(HBASE_NODES)
    return args
|
|
|
|
|
|
def connect_to_zookeeper(host_list, timeout_seconds):
    """Start a Zookeeper client, exiting the script if the connection times out.

    Args:
      host_list: Comma-separated string of hosts in host:port format
      timeout_seconds: Timeout, in seconds, passed to KazooClient.start()

    Returns:
      The started KazooClient instance. If start() raises KazooTimeoutError, the
      error is logged and the process exits with status 1 instead of returning.
    """
    client = KazooClient(hosts=host_list)
    LOGGER.info("Connecting to Zookeeper host(s).")

    try:
        client.start(timeout=timeout_seconds)
    except KazooTimeoutError as e:
        LOGGER.error("Could not connect to Zookeeper: " + str(e))
        sys.exit(1)

    LOGGER.info("Success: " + str(client))
    return client
|
|
|
|
|
|
def check_znode(node, zk_client, timeout_seconds):
    """Poll Zookeeper until the given znode exists or the timeout elapses.

    Args:
      node: name of a znode as a string, e.g., /hbase/rs
      zk_client: Zookeeper client object; only its get() method is used
      timeout_seconds: Number of seconds to keep retrying the lookup

    Returns:
      0 as soon as the znode could be read, 1 if it was still missing when the
      timeout ran out. Exceptions other than NoNodeError propagate to the caller.
    """
    began_at = time.time()

    while time.time() - began_at < timeout_seconds:
        LOGGER.info("Waiting for HBase node: " + node)
        try:
            znode_data = zk_client.get(node)
        except NoNodeError:
            # The znode has not been created yet; pause briefly, then look again.
            time.sleep(1)
            continue

        LOGGER.info("Success: " + node)
        LOGGER.debug(pprint.pformat(znode_data))
        return 0

    LOGGER.error("Failed while checking for HBase node: " + node)
    return 1
|
|
|
|
|
|
def check_znodes_list_for_errors(nodes, zookeeper_hosts, timeout):
    """Confirm that the given list of znodes are responsive.

    A new Zookeeper connection is opened for every attempt. If the connection is
    lost while checking, the whole list is checked again on a fresh connection, up
    to MAX_ZOOKEEPER_CONNECTION_RETRIES times.

    Args:
      nodes: list of znode names as strings, e.g., ['/hbase/master', '/hbase/rs']
      zookeeper_hosts: Comma-separated string of hosts in host:port format
      timeout: Number of seconds allowed for connecting, and for getting each node

    Returns:
      0 success, or else the number of errors

    Raises:
      ConnectionLoss: if the connection was lost more than
        MAX_ZOOKEEPER_CONNECTION_RETRIES times.
      Exception: any other error raised while checking is logged and re-raised.
    """
    connection_retries = 0

    while True:
        # closing() calls zk_client.close() when the block exits; by then the
        # finally clause below has already stopped the client.
        with closing(connect_to_zookeeper(zookeeper_hosts, timeout)) as zk_client:
            try:
                return sum(check_znode(node, zk_client, timeout) for node in nodes)
            except ConnectionLoss as e:
                connection_retries += 1
                if connection_retries > MAX_ZOOKEEPER_CONNECTION_RETRIES:
                    LOGGER.error("Max connection retries exceeded: {0}".format(str(e)))
                    raise
                err_msg = ("Zookeeper connection loss: retrying connection "
                           "({0} of {1} attempts)")
                # Logger.warn() is a deprecated alias; warning() is the supported name.
                LOGGER.warning(err_msg.format(connection_retries,
                                              MAX_ZOOKEEPER_CONNECTION_RETRIES))
                time.sleep(1)
            except Exception as e:
                LOGGER.error("Unexpected error checking HBase node: {0}".format(str(e)))
                raise
            finally:
                LOGGER.info("Stopping Zookeeper client")
                zk_client.stop()
|
|
|
|
|
|
def is_hdfs_running(host, admin_user):
    """Check that HDFS answers a listing request for its root directory.

    This guards against a pathological case: when HDFS is not available, the HBase
    master can start up briefly and then quit immediately, which can be long enough
    to give a false positive that the HBase master is running.

    Args:
      host: HDFS host:port
      admin_user: Admin username

    Returns:
      True if the root directory could be listed, False if the request failed with
      a connection error or an HdfsError.
    """
    url = 'http://' + host

    try:
        root_listing = hdfs.InsecureClient(url, user=admin_user).list('/')
    except (requests.exceptions.ConnectionError, hdfs.util.HdfsError) as e:
        LOGGER.error(
            'Could not confirm HDFS is running at http://{0} - {1}'.format(host, e))
        return False

    LOGGER.info("Contents of HDFS root: {0}".format(root_listing))
    return True
|
|
|
|
|
|
if __name__ == "__main__":
    args = parse_args()

    # HDFS is checked first: without it the znode check below could report a
    # false positive for the HBase master.
    if not is_hdfs_running(args.hdfs_host, args.admin_user):
        sys.exit(1)

    errors = check_znodes_list_for_errors(
        args.nodes, args.zookeeper_hosts, args.timeout)

    # The number of unreachable znodes doubles as the process exit status.
    if errors > 0:
        msg = "Could not get one or more nodes. Exiting with errors: {0}".format(errors)
        LOGGER.error(msg)
        sys.exit(errors)