Files
impala/tests/custom_cluster/test_process_failures.py
Sahil Takiar fc19e70cbc IMPALA-5534: Fix and enable experimental failure tests
Moves the test_catalog_hms_failures.py and test_process_failures.py from
the experimental tests to custom cluster tests.
catalog_service/test_hms_failure.py is combined with
custom_cluster/test_catalog_hms_failure.py as well in order to unify all
tests for HMS failures. Several modifications to the tests were
necessary to get them working again, but for the most part, the logic of
the tests remained the same. A few additional fault tolerance tests
(e.g. TestHiveMetaStoreFailure::test_hms_client_retries) were added as
well. The overall goal is to increase the process failure test coverage
for all components: impalads, statestore, catalogd, HMS, etc.

test_restart_catalogd in test_process_failures.py fails due to
IMPALA-9848, so it is skipped for now.

Testing:
* Ran new tests locally

Change-Id: I9dbb98017fb6c40cea349e7c63a35c325cbbc288
Reviewed-on: http://gerrit.cloudera.org:8080/16157
Reviewed-by: Sahil Takiar <stakiar@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-07-14 00:03:14 +00:00

199 lines
8.4 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from tests.common.custom_cluster_test_suite import (
DEFAULT_CLUSTER_SIZE,
CustomClusterTestSuite)
# Query run by every test in this file. The exact query doesn't matter much for these
# tests; it only needs to touch data on all nodes.
QUERY = "select count(l_comment) from tpch.lineitem"
# Expected number of live statestore subscribers when the whole cluster is up: the
# default cluster size (one subscriber per impalad) plus one for the catalogd.
DEFAULT_NUM_SUBSCRIBERS = DEFAULT_CLUSTER_SIZE + 1
class TestProcessFailures(CustomClusterTestSuite):
  """Validates that queries keep working when cluster processes are killed or restarted.

  The tests disrupt impalads (coordinator and workers), the statestored and the
  catalogd, run QUERY around the disruption, and wait on statestore subscriber counts
  and impalad metrics to decide when the cluster has converged again.
  """

  @classmethod
  def setup_class(cls):
    """Skips the whole class unless the exploration strategy is 'exhaustive'."""
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('These tests only run in exhaustive')
    super(TestProcessFailures, cls).setup_class()

  @pytest.mark.execute_serially
  def test_restart_coordinator(self):
    """Restarts the coordinator between queries."""
    impalad = self.cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    self.execute_query_expect_success(client, QUERY)

    statestored = self.cluster.statestored
    impalad.restart()
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

    # Reconnect: the previous client was opened before the restart.
    client = impalad.service.create_beeswax_client()
    # Expect True, matching every other 'catalog.ready' wait in this class.
    impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
    self.execute_query_expect_success(client, QUERY)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args='--use_local_catalog',
      catalogd_args='--catalog_topic_mode=minimal')
  def test_restart_statestore(self):
    """Tests the cluster still functions when the statestore dies."""
    impalad = self.cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    statestored = self.cluster.statestored
    statestored.kill()
    impalad.service.wait_for_metric_value(
        'statestore-subscriber.connected', False, timeout=60)

    # impalad should still see the same number of live backends
    assert impalad.service.get_num_known_live_backends() == DEFAULT_CLUSTER_SIZE
    self.execute_query_expect_success(client, QUERY)

    # Reconnect
    statestored.start()
    impalad.service.wait_for_metric_value(
        'statestore-subscriber.connected', True, timeout=60)
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

    # Wait for the number of live backends to reach the cluster size. Even though
    # all backends have subscribed to the statestore, this impalad may not have
    # received the update yet.
    impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE, timeout=60)
    self.execute_query_expect_success(client, QUERY)

  @pytest.mark.execute_serially
  def test_kill_restart_worker(self):
    """Kills a worker during a query, checks the reported failure, then restarts it."""
    impalad = self.cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    self.execute_query_expect_success(client, QUERY)

    # Select a different impalad; it is killed below and restarted at the end.
    worker_impalad = self.cluster.get_different_impalad(impalad)

    # Start executing a query. It will be cancelled due to a killed worker.
    handle = client.execute_async(QUERY)

    statestored = self.cluster.statestored
    worker_impalad.kill()

    # First wait until the statestore realizes the impalad has gone down.
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)
    # Wait until the impalad registers another instance went down.
    impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE - 1, timeout=60)

    # Wait until the in-flight query has been cancelled.
    # The in-flight query should have been cancelled, reporting a failed worker as the
    # cause. The query may have been cancelled because the state store detected a failed
    # node, or because a stream sender failed to establish a thrift connection. It is
    # non-deterministic which of those paths will initiate cancellation, but in either
    # case the query status should include the failed (or unreachable) worker.
    query_id = handle.get_handle().id
    error_state = "Failed due to unreachable impalad"
    assert impalad.service.wait_for_query_status(client, query_id, error_state)

    # Assert that the query status on the query profile web page contains the expected
    # failed hostport.
    failed_hostport = "%s:%s" % (worker_impalad.service.hostname,
                                 worker_impalad.service.be_port)
    query_profile_page = impalad.service.read_query_profile_page(query_id)
    assert failed_hostport in query_profile_page, (
        "Query status did not contain expected hostport %s\n\n%s"
        % (failed_hostport, query_profile_page))

    # Should work fine even if a worker is down.
    self.execute_query_expect_success(client, QUERY)

    # Bring the worker back online and validate queries still work.
    worker_impalad.start()
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
    worker_impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
    # Even though the worker has re-subscribed to the statestore, the coordinator may
    # not have received the membership update yet. Wait for it so the final query is
    # planned against the full cluster, restarted worker included.
    impalad.service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE, timeout=60)
    self.execute_query_expect_success(client, QUERY)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args='--use_local_catalog',
      catalogd_args='--catalog_topic_mode=minimal')
  @pytest.mark.xfail(run=False, reason="IMPALA-9848")
  def test_restart_catalogd(self):
    """Kills and restarts the catalogd, running a query while it is down and after.

    Currently not run (xfail with run=False) because of IMPALA-9848.
    """
    # Choose a random impalad and verify a query can run against it.
    impalad = self.cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    self.execute_query_expect_success(client, QUERY)

    # Kill the catalogd.
    catalogd = self.cluster.catalogd
    catalogd.kill()

    # The statestore should detect the catalog service has gone down.
    statestored = self.cluster.statestored
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS - 1, timeout=60)

    # We should still be able to execute queries using the impalad.
    self.execute_query_expect_success(client, QUERY)

    # Start the catalog service back up.
    catalogd.start()
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)

    # Execute a query against the catalog service.
    impalad.service.wait_for_metric_value('catalog.ready', True, timeout=60)
    self.execute_query_expect_success(client, QUERY)

  @pytest.mark.execute_serially
  def test_restart_all_impalad(self):
    """Restarts all the impalads and runs a query"""
    impalad = self.cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    self.execute_query_expect_success(client, QUERY)

    # Kill each impalad and wait for the statestore to register the failures.
    for impalad_proc in self.cluster.impalads:
      impalad_proc.kill()
    statestored = self.cluster.statestored

    # There should be 1 remaining subscriber, the catalogd
    statestored.service.wait_for_live_subscribers(1, timeout=60)

    # Start each impalad back up and wait for the statestore to see them.
    for impalad_proc in self.cluster.impalads:
      impalad_proc.start()

    # The impalads should re-register with the statestore on restart at which point they
    # can execute queries.
    statestored.service.wait_for_live_subscribers(DEFAULT_NUM_SUBSCRIBERS, timeout=60)
    # A distinct loop variable keeps the 'impalad' chosen at the top from being rebound.
    for impalad_proc in self.cluster.impalads:
      service = impalad_proc.service
      service.wait_for_num_known_live_backends(DEFAULT_CLUSTER_SIZE, timeout=60)
      service.wait_for_metric_value('catalog.ready', True, timeout=60)
      # Clients opened before the kill are not reused; connect to the new process.
      client = service.create_beeswax_client()
      self.execute_query_expect_success(client, QUERY)
      # Make sure the catalog service is actually back up by executing an operation
      # against it.
      self.execute_query_expect_success(client, QUERY)