Files
impala/tests/common/cluster_config.py
jasonmfehr b3b2dbaca3 IMPALA-13772: Fix Workload Management DMLs Timeouts
The insert DMLs executed by workload management to add rows to the
completed queries Iceberg table time out after 10 seconds because
that is the default FETCH_ROWS_TIMEOUT_MS value. If the DML queues up
in admission control, this timeout will quickly cause the DML to be
cancelled. The fix is to set the FETCH_ROWS_TIMEOUT_MS query option
to 0 for the workload management insert DMLs.

Even though the workload management DMLs do not retrieve any rows,
the FETCH_ROWS_TIMEOUT_MS value still applies because the internal
server functions call into the client request state's
ExecQueryOrDmlRequest() function which starts query execution and
immediately returns. Then, the BlockOnWait function in
impala-server.cc is called. This function times out based on the
FETCH_ROWS_TIMEOUT_MS value.

A new coordinator startup flag 'query_log_dml_exec_timeout_s' is
added to specify the EXEC_TIME_LIMIT_S query option on the workload
management insert DML statements. This flag ensures the DMLs will
time out if they do not complete in a reasonable timeframe.

While adding the new coordinator startup flag, a bug in the
internal-server code was discovered. This bug caused a return status
of 'ok' even when the query exec time limit was reached and the query
cancelled. This bug has also been fixed.

Testing:
  1. Added new custom cluster test that simulates a busy cluster where
       the workload management DML queues for longer than 10 seconds.
  2. Existing tests in test_query_log and test_admission_controller
       passed.
  3. One internal-server-test ctest was modified to assert for a
       returned status of error when a query is cancelled.
  4. Added a new custom cluster test that asserts the workload
       management DML is cancelled based on the value of the new
       coordinator startup flag.

Change-Id: I0cc7fbce40eadfb253d8cff5cbb83e2ad63a979f
Reviewed-on: http://gerrit.cloudera.org:8080/22511
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-02-26 03:12:31 +00:00

93 lines
4.2 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Common cluster configurations as decorators for custom cluster tests
from __future__ import absolute_import, division, print_function
import os
import shutil
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
# Same as in tests/authorization/test_ranger.py
ADMIN = "admin"
# Path to the resources directory which contains the admission control config files.
# NOTE: os.environ['IMPALA_HOME'] is read when this module is imported, so the import
# raises KeyError if IMPALA_HOME is not set in the environment.
RESOURCES_DIR = os.path.join(os.environ['IMPALA_HOME'], "fe", "src", "test", "resources")
# Cluster configuration that selects Ranger as the authorization provider for both the
# impalads and the catalogd. The flag values are the same as IMPALAD_ARGS and
# CATALOGD_ARGS in tests/authorization/test_ranger.py.
enable_authorization = CustomClusterTestSuite.with_args(
    impalad_args=("--server-name=server1 "
                  "--ranger_service_type=hive "
                  "--ranger_app_id=impala "
                  "--authorization_provider=ranger"),
    catalogd_args=("--server-name=server1 "
                   "--ranger_service_type=hive "
                   "--ranger_app_id=impala "
                   "--authorization_provider=ranger"),
)
def impalad_admission_ctrl_flags(max_requests, max_queued, pool_max_mem,
                                 proc_mem_limit=None, queue_wait_timeout_ms=None,
                                 admission_control_slots=None, executor_groups=None,
                                 codegen_cache_capacity=0):
  """Builds the impalad startup flag string that configures the default admission
  control pool.

  'max_requests', 'max_queued' and 'pool_max_mem' populate -default_pool_max_requests,
  -default_pool_max_queued and -default_pool_mem_limit respectively. Each of
  'proc_mem_limit', 'queue_wait_timeout_ms', 'admission_control_slots' and
  'executor_groups' adds its flag only when it is not None (so 0 is still emitted).
  -codegen_cache_capacity is always emitted. The returned string also always contains
  "-vmodule admission-controller=3"."""
  # (flag name, value) pairs in the order they must appear in the output.
  optional_flags = (
    ("mem_limit", proc_mem_limit),
    ("queue_wait_timeout_ms", queue_wait_timeout_ms),
    ("admission_control_slots", admission_control_slots),
    ("executor_groups", executor_groups),
  )
  pieces = [" -{0}={1}".format(flag, value)
            for flag, value in optional_flags if value is not None]
  pieces.append(" -codegen_cache_capacity={0}".format(codegen_cache_capacity))

  # Every piece starts with a space and the template puts another space before {3},
  # so the result has two spaces ahead of the first extra flag.
  template = ("-vmodule admission-controller=3 -default_pool_max_requests {0} "
              "-default_pool_max_queued {1} -default_pool_mem_limit {2} {3}")
  return template.format(max_requests, max_queued, pool_max_mem, "".join(pieces))
# Sets -default_pool_max_requests and -default_pool_max_queued to 1 and
# -default_pool_mem_limit to 0.
admit_one_query_at_a_time = CustomClusterTestSuite.with_args(
    impalad_args=impalad_admission_ctrl_flags(
        max_requests=1, max_queued=1, pool_max_mem=0))

# Sets -default_pool_max_requests, -default_pool_max_queued and
# -default_pool_mem_limit all to 0.
admit_no_query = CustomClusterTestSuite.with_args(
    impalad_args=impalad_admission_ctrl_flags(
        max_requests=0, max_queued=0, pool_max_mem=0))

# Requests num_exclusive_coordinators=1 from the custom cluster setup.
single_coordinator = CustomClusterTestSuite.with_args(num_exclusive_coordinators=1)
def impalad_admission_ctrl_config_args(fs_allocation_file, llama_site_file,
                                       additional_args="", make_copy=False):
  """Generates impalad startup flags that point -fair_scheduler_allocation_path and
  -llama_site_path at the given config files and set
  "-vmodule admission-controller=3" for admission control logging.

  Both file names are resolved relative to RESOURCES_DIR. When 'make_copy' is true,
  each file is first copied (shutil.copy2) to "copy-<file name>" in RESOURCES_DIR and
  the flags reference the copies; otherwise the flags reference the original files.
  'additional_args' is appended after the generated flags, separated by one space."""
  file_names = (fs_allocation_file, llama_site_file)
  config_paths = [os.path.join(RESOURCES_DIR, name) for name in file_names]

  if make_copy:
    # Compute every destination before copying anything, then copy the fair scheduler
    # file followed by the llama site file.
    copy_paths = [os.path.join(RESOURCES_DIR, "copy-" + name) for name in file_names]
    for source, destination in zip(config_paths, copy_paths):
      shutil.copy2(source, destination)
    config_paths = copy_paths

  fs_allocation_path, llama_site_path = config_paths
  flag_template = ("-vmodule admission-controller=3 -fair_scheduler_allocation_path %s "
                   "-llama_site_path %s %s")
  return flag_template % (fs_allocation_path, llama_site_path, additional_args)