Files
impala/tests/custom_cluster/test_blacklist.py
Thomas Tauber-Marshall 014f973e92 IMPALA-9243: Add info about blacklisting decisions to the webui
This patch adds information about blacklisting decisions to the
/backends webui endpoint.

For the JSON, it adds an 'is_blacklisted' field to all backends, and
for any backends where 'is_blacklisted' is true it adds a
'blacklist_cause' field indicating the error status that led to the
backend getting blacklisted and a 'blacklist_time_remaining' field
indicating how much longer the backend will remain on the blacklist.
It also adds counts for the number of blacklisted and quiescing
backends, if any, and the number of active (i.e. all other) backends.

For display, in order to prevent the table of backend information from
having too many columns (prior to this patch it already had 12), it
separates blacklisted, quiescing, and active backends into three
separate tables, with the blacklisted and quiescing tables only getting
displayed if there are any such backends.

Additionally, tooltips are added next to the headers for the
blacklisted and quiescing tables that provide a brief explanation of
what it means for a backend to appear on these lists.

Using separate tables also facilitates having state-specific columns -
the blacklisted table displays columns for the blacklist cause and
time remaining. Future work could consider adding columns to the
quiescing table, such as time until the grace period and deadline
expires.

Testing:
- Manually ran various quiescing/blacklisting scenarios and confirmed
  the /backends page displays as expected.
- Added cases to test_web_pages (to verify the new fields when nothing
  is blacklisted) and test_blacklist.

Change-Id: Ia0c309315b142a50be102dcb516b36ec6cb3cf47
Reviewed-on: http://gerrit.cloudera.org:8080/15178
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-02-08 05:22:48 +00:00

170 lines
7.6 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
import pytest
import re
from beeswaxd.BeeswaxService import QueryState
from tests.common.skip import SkipIfNotHdfsMinicluster
from time import sleep
# Tests that verify the behavior of the executor blacklist: the coordinator
# should blacklist an executor after an RPC to it fails, stop scheduling on it,
# and clear the blacklist entry once statestore membership catches up.
@SkipIfNotHdfsMinicluster.tuned_for_minicluster
class TestBlacklist(CustomClusterTestSuite):
  @classmethod
  def get_workload(cls):
    """Return the workload whose test vector configs apply to this suite."""
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    # These tests kill and restart cluster processes, so only run them in
    # exhaustive mode to keep core runs fast.
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestBlacklist, cls).setup_class()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
    statestored_args="-statestore_heartbeat_frequency_ms=1000")
  def test_kill_impalad(self, cursor):
    """Test that verifies that when an impalad is killed, it is properly blacklisted."""
    # Run a query and verify that no impalads are blacklisted yet.
    result = self.execute_query("select count(*) from tpch.lineitem")
    assert re.search("Blacklisted Executors: (.*)", result.runtime_profile) is None, \
        result.runtime_profile
    # Kill an impalad
    killed_impalad = self.cluster.impalads[2]
    killed_impalad.kill()
    # Run a query which should fail as the impalad hasn't been blacklisted yet.
    try:
      self.execute_query("select count(*) from tpch.lineitem")
      assert False, "Query was expected to fail"
    except Exception as e:
      assert "Exec() rpc failed" in str(e)
    # Run another query which should succeed and verify the impalad was blacklisted,
    # both in the runtime profile and on the /backends webui endpoint.
    result = self.execute_query("select count(*) from tpch.lineitem")
    backends_json = self.cluster.impalads[0].service.get_debug_webpage_json("/backends")
    match = re.search("Blacklisted Executors: (.*)", result.runtime_profile)
    assert match.group(1) == "%s:%s" % \
        (killed_impalad.hostname, killed_impalad.service.be_port), result.runtime_profile
    # The webui should report one blacklisted backend and two active ones, while
    # still listing all three backends.
    assert backends_json["num_blacklisted_backends"] == 1, backends_json
    assert backends_json["num_active_backends"] == 2, backends_json
    assert len(backends_json["backends"]) == 3, backends_json
    # Exactly the killed backend (identified by its krpc port) must have
    # 'is_blacklisted' set; all others must not.
    num_blacklisted = 0
    for backend_json in backends_json["backends"]:
      if str(killed_impalad.service.krpc_port) in backend_json["krpc_address"]:
        assert backend_json["is_blacklisted"], backend_json
        num_blacklisted += 1
      else:
        assert not backend_json["is_blacklisted"], backend_json
    assert num_blacklisted == 1, backends_json
    # Sleep for long enough for the statestore to remove the impalad from the cluster
    # membership, i.e. Statestore::FailedExecutorDetectionTime() + some padding
    sleep(12)
    # Run another query and verify nothing was blacklisted and only 2 backends were
    # scheduled on.
    result = self.execute_query("select count(*) from tpch.lineitem")
    assert re.search("Blacklisted Executors: (.*)", result.runtime_profile) is None, \
        result.runtime_profile
    assert re.search("NumBackends: 2", result.runtime_profile), result.runtime_profile

  @pytest.mark.execute_serially
  def test_restart_impalad(self, cursor):
    """Test that verifies the behavior when an impalad is killed, blacklisted, and then
    restarted."""
    # Run a query and verify that no impalads are blacklisted yet.
    result = self.execute_query("select count(*) from tpch.lineitem")
    assert re.search("Blacklisted Executors: (.*)", result.runtime_profile) is None, \
        result.runtime_profile
    # Kill an impalad
    killed_impalad = self.cluster.impalads[2]
    killed_impalad.kill()
    # Run a query which should fail as the impalad hasn't been blacklisted yet.
    try:
      self.execute_query("select count(*) from tpch.lineitem")
      assert False, "Query was expected to fail"
    except Exception as e:
      assert "Exec() rpc failed" in str(e)
    # Run another query which should succeed and verify the impalad was blacklisted.
    result = self.execute_query("select count(*) from tpch.lineitem")
    match = re.search("Blacklisted Executors: (.*)", result.runtime_profile)
    assert match.group(1) == "%s:%s" % \
        (killed_impalad.hostname, killed_impalad.service.be_port), result.runtime_profile
    # Restart the impalad.
    killed_impalad.start()
    # Sleep for long enough for the statestore to update the membership to include the
    # restarted impalad, ImpaladProcess.start() won't return until the Impalad says its
    # ready to accept connections, at which point it will have already registered with the
    # statestore, so we don't need to sleep very long.
    sleep(2)
    # Run another query and verify nothing was blacklisted and all 3 backends were
    # scheduled on.
    result = self.execute_query("select count(*) from tpch.lineitem")
    assert re.search("Blacklisted Executors: (.*)", result.runtime_profile) is None, \
        result.runtime_profile
    assert re.search("NumBackends: 3", result.runtime_profile), result.runtime_profile

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(num_exclusive_coordinators=1,
      statestored_args="-statestore_heartbeat_frequency_ms=1000")
  def test_kill_impalad_with_running_queries(self, cursor):
    """Verifies that when an Impala executor is killed while running a query, that the
    Coordinator blacklists the killed executor."""
    # Run a query asynchronously. With the debug actions, this query should take a few
    # minutes to complete.
    query = "select count(*) from tpch_parquet.lineitem t1, tpch_parquet.lineitem t2 \
        where t1.l_orderkey = t2.l_orderkey"
    handle = self.execute_query_async(query, query_options={
        'debug_action': '0:GETNEXT:DELAY|1:GETNEXT:DELAY'})
    # Wait for the query to start running
    self.wait_for_any_state(handle, [QueryState.RUNNING, QueryState.FINISHED], 10)
    # Kill one of the Impala executors
    killed_impalad = self.cluster.impalads[2]
    killed_impalad.kill()
    # Try to fetch results from the query. Fetch requests should fail because one of the
    # impalads running the query was killed. When the query fails, the Coordinator should
    # add the killed Impala executor to the blacklist (since a RPC to that node failed).
    try:
      self.client.fetch(query, handle)
      assert False, "Query was expected to fail"
    except Exception as e:
      # The query should fail due to an RPC error.
      assert "TransmitData() to " in str(e) or "EndDataStream() to " in str(e)
    # Run another query which should succeed and verify the impalad was blacklisted.
    self.client.clear_configuration()  # remove the debug actions
    result = self.execute_query("select count(*) from tpch.lineitem")
    match = re.search("Blacklisted Executors: (.*)", result.runtime_profile)
    assert match is not None and match.group(1) == "%s:%s" % \
        (killed_impalad.hostname, killed_impalad.service.be_port), \
        result.runtime_profile