Files
impala/tests/custom_cluster/test_auto_scaling.py
Joe McDonnell eb66d00f9f IMPALA-11974: Fix lazy list operators for Python 3 compatibility
Python 3 changes list operators such as range, map, and filter
to be lazy. Some code that expects the list operators to happen
immediately will fail. e.g.

Python 2:
range(0,5) == [0,1,2,3,4]
True

Python 3:
range(0,5) == [0,1,2,3,4]
False

The fix is to wrap locations with list(). i.e.

Python 3:
list(range(0,5)) == [0,1,2,3,4]
True

Since the base operators are now lazy, Python 3 also removes the
old lazy versions (e.g. xrange, ifilter, izip, etc). This uses
future's builtins package to convert the code to the Python 3
behavior (i.e. xrange -> future's builtins.range).

Most of the changes were done via these futurize fixes:
 - libfuturize.fixes.fix_xrange_with_import
 - lib2to3.fixes.fix_map
 - lib2to3.fixes.fix_filter

This eliminates the pylint warnings:
 - xrange-builtin
 - range-builtin-not-iterating
 - map-builtin-not-iterating
 - zip-builtin-not-iterating
 - filter-builtin-not-iterating
 - reduce-builtin
 - deprecated-itertools-function

Testing:
 - Ran core job

Change-Id: Ic7c082711f8eff451a1b5c085e97461c327edb5f
Reviewed-on: http://gerrit.cloudera.org:8080/19589
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
2023-03-09 17:17:57 +00:00

239 lines
9.4 KiB
Python

#!/usr/bin/env impala-python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function
from builtins import range
import logging
import pytest
from time import sleep, time
from tests.util.auto_scaler import AutoScaler
from tests.util.concurrent_workload import ConcurrentWorkload
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
# Module-level logger for this test file.
LOG = logging.getLogger("test_auto_scaling")

# Metric counting all live backends known to the cluster membership, i.e. the
# coordinator plus any executors (the tests add +1 for the coordinator when
# comparing against executor group sizes).
TOTAL_BACKENDS_METRIC_NAME = "cluster-membership.backends.total"
class TestAutoScaling(CustomClusterTestSuite):
  """This class contains tests that exercise the logic related to scaling clusters up and
  down by adding and removing groups of executors."""

  # Time to let the base cluster (coordinator, catalog, statestore) settle after the
  # auto scaler is started, before a workload is launched.
  INITIAL_STARTUP_TIME_S = 10
  # Upper bound on how long we wait for a cluster scale-up / scale-down event.
  STATE_CHANGE_TIMEOUT_S = 60
  # This query will scan two partitions (month = 1, 2) and thus will have 1 fragment
  # instance per executor on groups of size 2. Each partition has 2 rows, so it performs
  # two comparisons and should take around 2 seconds to complete.
  QUERY = """select * from functional_parquet.alltypestiny where month < 3
             and id + random() < sleep(1000)"""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    # These tests start real executor processes and sleep for long periods, so they are
    # restricted to the exhaustive exploration strategy.
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestAutoScaling, cls).setup_class()

  def _get_total_admitted_queries(self):
    """Returns the total number of queries that have been admitted to the default
    pool so far."""
    admitted_queries = self.impalad_test_service.get_total_admitted_queries(
        "default-pool")
    LOG.info("Current total admitted queries: %s", admitted_queries)
    return admitted_queries

  def _get_num_backends(self):
    """Returns the current value of the total-backends metric (coordinator plus
    executors)."""
    metric_val = self.impalad_test_service.get_metric_value(TOTAL_BACKENDS_METRIC_NAME)
    LOG.info("Getting metric %s : %s", TOTAL_BACKENDS_METRIC_NAME, metric_val)
    return metric_val

  def _get_num_running_queries(self):
    """Returns the number of queries currently running in the default pool."""
    running_queries = self.impalad_test_service.get_num_running_queries("default-pool")
    LOG.info("Current running queries: %s", running_queries)
    return running_queries

  def _wait_until(self, predicate, error_msg):
    """Polls 'predicate' once per second for up to STATE_CHANGE_TIMEOUT_S seconds.
    Returns as soon as it evaluates to true; fails the test with 'error_msg' if it
    never does."""
    for _ in range(self.STATE_CHANGE_TIMEOUT_S):
      if predicate():
        return
      sleep(1)
    assert False, error_msg

  def _wait_for_cluster_shutdown(self):
    """Waits until all executors have deregistered (only the coordinator remains) and
    asserts that no executor groups are left. Common tail of all auto-scaling tests."""
    self.impalad_test_service.wait_for_metric_value(
        TOTAL_BACKENDS_METRIC_NAME, 1,
        timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)
    assert self.impalad_test_service.get_metric_value(
        "cluster-membership.executor-groups.total") == 0

  def test_single_workload(self):
    """This test exercises the auto-scaling logic in the admission controller. It spins up
    a base cluster (coordinator, catalog, statestore), runs a workload to initiate a
    scaling up event as the queries start queuing, then stops the workload and observes
    that the cluster gets shutdown."""
    GROUP_SIZE = 2
    EXECUTOR_SLOTS = 3
    auto_scaler = AutoScaler(executor_slots=EXECUTOR_SLOTS, group_size=GROUP_SIZE)
    workload = None
    try:
      auto_scaler.start()
      sleep(self.INITIAL_STARTUP_TIME_S)

      workload = ConcurrentWorkload(self.QUERY, num_streams=5)
      LOG.info("Starting workload")
      workload.start()

      # Wait for workers to spin up
      cluster_size = GROUP_SIZE + 1  # +1 to include coordinator.
      self._wait_until(
          lambda: self._get_num_backends() >= cluster_size,
          "Number of backends did not increase within %s s" % self.STATE_CHANGE_TIMEOUT_S)
      assert self.impalad_test_service.get_metric_value(
          "cluster-membership.executor-groups.total-healthy") >= 1

      # Wait until we admitted at least 10 queries
      self._wait_until(
          lambda: self._get_total_admitted_queries() >= 10,
          "Did not admit enough queries within %s s" % self.STATE_CHANGE_TIMEOUT_S)

      # Wait for second executor group to start
      two_group_size = (2 * GROUP_SIZE) + 1
      self._wait_until(
          lambda: self._get_num_backends() >= two_group_size,
          "Number of backends did not reach %s within %s s" % (
              two_group_size, self.STATE_CHANGE_TIMEOUT_S))
      assert self.impalad_test_service.get_metric_value(
          "cluster-membership.executor-groups.total-healthy") >= 2

      LOG.info("Stopping workload")
      workload.stop()

      # Wait for workers to spin down
      self._wait_for_cluster_shutdown()
    finally:
      if workload:
        workload.stop()
      LOG.info("Stopping auto scaler")
      auto_scaler.stop()

  def test_single_group_maxed_out(self):
    """This test starts an auto scaler and limits it to a single executor group. It then
    makes sure that the query throughput does not exceed the expected limit."""
    GROUP_SIZE = 2
    EXECUTOR_SLOTS = 3
    auto_scaler = AutoScaler(executor_slots=EXECUTOR_SLOTS, group_size=GROUP_SIZE,
                             max_groups=1, coordinator_slots=EXECUTOR_SLOTS)
    workload = None
    try:
      auto_scaler.start()
      sleep(self.INITIAL_STARTUP_TIME_S)

      workload = ConcurrentWorkload(self.QUERY, num_streams=5)
      LOG.info("Starting workload")
      workload.start()

      # Wait for workers to spin up
      cluster_size = GROUP_SIZE + 1  # +1 to include coordinator.
      self.impalad_test_service.wait_for_metric_value(
          TOTAL_BACKENDS_METRIC_NAME, cluster_size,
          timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)

      # Wait until we admitted at least 10 queries
      self._wait_until(
          lambda: self._get_total_admitted_queries() >= 10,
          "Did not admit enough queries within %s s" % self.STATE_CHANGE_TIMEOUT_S)

      # Sample the number of running queries for a while
      SAMPLE_NUM_RUNNING_S = 30
      end_time = time() + SAMPLE_NUM_RUNNING_S
      num_running = []
      while time() < end_time:
        num_running.append(self._get_num_running_queries())
        sleep(1)

      # Must reach EXECUTOR_SLOTS but not exceed it
      assert max(num_running) == EXECUTOR_SLOTS, \
          "Unexpected number of running queries: %s" % num_running

      # Check that only a single group started
      assert self.impalad_test_service.get_metric_value(
          "cluster-membership.executor-groups.total-healthy") == 1

      LOG.info("Stopping workload")
      workload.stop()

      # Wait for workers to spin down
      self._wait_for_cluster_shutdown()
    finally:
      if workload:
        workload.stop()
      LOG.info("Stopping auto scaler")
      auto_scaler.stop()

  def test_sequential_startup(self):
    """This test starts an executor group sequentially and observes that no queries are
    admitted until the group has been fully started."""
    # Larger group size so it takes a while to start up
    GROUP_SIZE = 4
    EXECUTOR_SLOTS = 3
    auto_scaler = AutoScaler(executor_slots=EXECUTOR_SLOTS, group_size=GROUP_SIZE,
                             start_batch_size=1, max_groups=1)
    workload = None
    try:
      auto_scaler.start()
      sleep(self.INITIAL_STARTUP_TIME_S)

      workload = ConcurrentWorkload(self.QUERY, num_streams=5)
      LOG.info("Starting workload")
      workload.start()

      # Wait for first executor to start up
      self.impalad_test_service.wait_for_metric_value(
          "cluster-membership.executor-groups.total", 1,
          timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)

      # Wait for remaining executors to start up and make sure that no queries are
      # admitted during startup
      end_time = time() + self.STATE_CHANGE_TIMEOUT_S
      startup_complete = False
      cluster_size = GROUP_SIZE + 1  # +1 to include coordinator.
      while time() < end_time:
        num_admitted = self._get_total_admitted_queries()
        num_backends = self._get_num_backends()
        if num_backends < cluster_size:
          # No queries may be admitted while the group is still partially started.
          assert num_admitted == 0, "%s/%s backends started but %s queries have " \
              "already been admitted." % (num_backends, cluster_size, num_admitted)
        if num_admitted > 0:
          assert num_backends == cluster_size
          startup_complete = True
          break
        sleep(1)
      assert startup_complete, "Did not start up in %s s" % self.STATE_CHANGE_TIMEOUT_S

      LOG.info("Stopping workload")
      workload.stop()

      # Wait for workers to spin down
      self._wait_for_cluster_shutdown()
    finally:
      if workload:
        workload.stop()
      LOG.info("Stopping auto scaler")
      auto_scaler.stop()