Files
impala/tests/custom_cluster/test_codegen_cache.py
Riza Suminto 95f353ac4a IMPALA-13507: Allow disabling glog buffering via with_args fixture
We have plenty of custom_cluster tests that assert against content of
Impala daemon log files while the process is still running using
assert_log_contains() and its wrappers. The method specifically mentions
disabling glog buffering ('-logbuflevel=-1'), but not all
custom_cluster tests do that. This often results in flaky tests that are
hard to triage and are often neglected if they do not frequently run in
core exploration.

This patch adds boolean param 'disable_log_buffering' into
CustomClusterTestSuite.with_args for test to declare intention to
inspect log files in live minicluster. If it is True, start minicluster
with '-logbuflevel=-1' for all daemons. If it is False, log WARNING on
any calls to assert_log_contains().

There are several complex custom_cluster tests that are left unchanged and
print out such WARNING logs, such as:
- TestQueryLive
- TestQueryLogTableBeeswax
- TestQueryLogOtherTable
- TestQueryLogTableHS2
- TestQueryLogTableAll
- TestQueryLogTableBufferPool
- TestStatestoreRpcErrors
- TestWorkloadManagementInitWait
- TestWorkloadManagementSQLDetails

This patch also fixes some small flake8 issues on modified tests.

There is a sign of flakiness at test_query_live.py where a test query is
submitted to the coordinator and fails because the sys.impala_query_live
table does not exist yet from the coordinator's perspective. This patch
modifies test_query_live.py to wait for a few seconds until
sys.impala_query_live is queryable.

Testing:
- Pass custom_cluster tests in exhaustive exploration.

Change-Id: I56fb1746b8f3cea9f3db3514a86a526dffb44a61
Reviewed-on: http://gerrit.cloudera.org:8080/22015
Reviewed-by: Jason Fehr <jfehr@cloudera.com>
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2024-11-05 04:49:05 +00:00

412 lines
20 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function
from builtins import range
import pytest
from copy import copy
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster
from tests.common.test_result_verifier import assert_codegen_cache_hit
from tests.util.filesystem_utils import get_fs_path
@SkipIf.not_hdfs
@SkipIfNotHdfsMinicluster.scheduling
class TestCodegenCache(CustomClusterTestSuite):
  """This test enables the codegen cache and verifies that cache hit and miss counts
  in the runtime profile and metrics are as expected.
  """

  @classmethod
  def get_workload(cls):
    # First parameter renamed from 'self' to 'cls': this is a classmethod (flake8 N804).
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    # Every test restarts a dedicated minicluster, which is expensive, so this suite
    # only runs in exhaustive exploration.
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestCodegenCache, cls).setup_class()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache(self, vector):
    """Cache hit test for a subquery with a large limit."""
    self._test_codegen_cache(vector,
        ("select * from (select * from functional.alltypes "
            + "limit 1000000) t1 where int_col > 10 limit 10"))

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_int_col(self, vector):
    """Cache hit test for a predicate on an INT column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where int_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_tinyint_col(self, vector):
    """Cache hit test for a predicate on a TINYINT column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where tinyint_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_bool_col(self, vector):
    """Cache hit test for a predicate on a BOOLEAN column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where bool_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_bigint_col(self, vector):
    """Cache hit test for a predicate on a BIGINT column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where bigint_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_float_col(self, vector):
    """Cache hit test for a predicate on a FLOAT column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where float_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_double_col(self, vector):
    """Cache hit test for a predicate on a DOUBLE column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where double_col > 0")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_date_string_col(self, vector):
    """Cache hit test for a predicate on the partitioning date string column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where date_string_col != ''")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_string_col(self, vector):
    """Cache hit test for a predicate on a STRING column."""
    self._test_codegen_cache(vector,
        "select * from functional.alltypes where string_col != ''")

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_poly_func_string_col(self, vector):
    """Cache hit test for a polymorphic builtin applied to a STRING column."""
    self._test_codegen_cache(vector,
        ("select * from functional.alltypes where "
            + "CHAR_LENGTH(string_col) > 0"))

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_poly_func_date_string_col(self, vector):
    """Cache hit test for a polymorphic builtin applied to the date string column."""
    self._test_codegen_cache(vector,
        ("select * from functional.alltypes where "
            + "CHAR_LENGTH(date_string_col) > 0"))

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  # Test native uda is missed in the codegen cache, as it is disabled.
  def test_codegen_cache_uda_miss(self, vector):
    database = "test_codegen_cache_uda_miss"
    self._load_functions(database)
    self._test_codegen_cache(vector,
        "select test_count(int_col) from functional.alltypestiny", False)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  # Test native udf is missed in the codegen cache, as it is disabled.
  def test_codegen_cache_udf_miss(self, vector):
    database = "test_codegen_cache_udf_miss"
    self._load_functions(database)
    self._test_codegen_cache(vector,
        "select sum(identity(bigint_col)) from functional.alltypes", False)

  # Shared impalad startup flags for the IMPALA-12260 symbol emitter regression tests:
  # a single cache shard, an artificially inflated per-entry size charge, and a small
  # cache capacity so the entries of the first test query are evicted by the second.
  SYMBOL_EMITTER_TESTS_IMPALAD_ARGS = "--cache_force_single_shard=1 \
--codegen_symbol_emitter_log_successful_destruction_test_only=1 \
--codegen_cache_entry_bytes_charge_overhead=10000000 --codegen_cache_capacity=25MB "

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--asm_module_dir=/dev/null",
      disable_log_buffering=True)
  # Regression test for IMPALA-12260.
  def test_codegen_cache_with_asm_module_dir(self, vector):
    self._test_codegen_cache_with_symbol_emitter(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--perf_map",
      disable_log_buffering=True)
  # Regression test for IMPALA-12260.
  def test_codegen_cache_with_perf_map(self, vector):
    self._test_codegen_cache_with_symbol_emitter(vector)

  def _test_codegen_cache_with_symbol_emitter(self, vector):
    """Regression test for IMPALA-12260. In the test we run two queries. The first query
    produces two entries in the cache, and they both have a 'CodegenSymbolEmitter' as
    event listeners because of the '--asm_module_dir' or '--perf_map' startup flag. The
    second query inserts new entries in the cache - the size of the cache should be such
    that both entries from the first query fit in it but both are evicted during the
    second query.

    When an 'llvm::ExecutionEngine', which is part of the cache entry, is destroyed, it
    frees any remaining object files and notifies the listeners about this, so the
    listeners should be alive at this time. Prior to IMPALA-12260 the
    'CodegenSymbolEmitter's of the cached fragment instances were destroyed at the end of
    the first query, causing a use-after-free (sometimes leading to a crash) during the
    second one.

    The choice of the size of the cache is based on the following:
    - the first query imposes a lower bound on the cache size (both cache entries should
      fit in the cache) AND
    - the second query imposes an upper bound (the cache entries of the first query
      should be evicted during the second query).
    The acceptable values are in the intersection of these two intervals.
    However, code changes and the difference between debug and release builds
    can have a huge effect on the acceptable range. To get around this, we use
    the '--codegen_cache_entry_bytes_charge_overhead' startup flag to
    artificially assign a higher size to the cache entries, compared to which
    the real size, and therefore also changes in the real size, are
    insignificant.

    This test verifies that the use-after-free scenario doesn't happen. We can't rely on
    the crash to detect it because
      1) the crash is not guaranteed to happen, use-after-free is undefined behaviour
      2) the crash may happen well after the query has finished returning results.
    Therefore in 'CodegenSymbolEmitter' we count how many object files have been emitted
    and freed. If the difference is greater than zero at the time of the destruction of
    the 'CodegenSymbolEmitter', the LLVM execution engine to which the symbol emitter is
    subscribed is still alive and will attempt to notify the symbol emitter when it will
    have already been destroyed, leading to use-after-free.

    When the --codegen_symbol_emitter_log_successful_destruction_test_only flag is set to
    true, 'CodegenSymbolEmitter' will log a message when it is being destroyed correctly
    (i.e. when use-after-free will not happen). If we don't have the expected message in
    the logs (after some timeout), the test fails.

    After IMPALA-11805, codegen caching is no longer using the 'llvm::ExecutionEngine',
    instead we use 'CodeGenObjectCache'. While 'CodeGenObjectCache' doesn't impact the
    lifecycle of 'CodegenSymbolEmitter's, the testcase in this context still verifies
    the correct usage of 'CodegenSymbolEmitter's."""
    exec_options = copy(vector.get_value('exec_option'))
    exec_options['exec_single_node_rows_threshold'] = 0
    q1 = """select int_col from functional_parquet.alltypessmall
        order by int_col desc limit 20"""
    q2 = """select t1.bool_col, t1.year, t1.month
        from functional_parquet.alltypes t1
        inner join functional_parquet.alltypessmall t2 on t1.year = t2.year
        group by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
        t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
        t1.month
        order by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
        t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
        t1.month"""
    self._check_metric_expect_init()
    symbol_emitter_ok_msg = "Successful destruction of CodegenSymbolEmitter object."

    # ## First query
    self.execute_query_expect_success(self.client, q1, exec_options)
    cache_entries_in_use = self.get_metric('impala.codegen-cache.entries-in-use')
    cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
    # Query 1 contains 2 fragments.
    fragments_ran = 2
    assert cache_entries_in_use > 0
    assert self.get_metric('impala.codegen-cache.hits') == 0
    # Initialising the cross-compiled modules also consumes an LLVM executor engine.
    expected_num_msg = fragments_ran + 1
    self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

    # ## Second query
    self.execute_query_expect_success(self.client, q2, exec_options)
    assert self.get_metric('impala.codegen-cache.hits') == 0
    # Query 2 contains 4 fragments.
    fragments_ran = fragments_ran + 4
    cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
    assert cache_entries_evicted >= cache_entries_in_use
    # Initialising the cross-compiled modules also consumes an LLVM executor engine.
    expected_num_msg = fragments_ran + 1
    self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  # Regression test for IMPALA-12269. The first query uses one of the codegen'd functions
  # in two objects, so it is added to be jitted twice. For the second query it is added
  # only once. The hash of the function names should be the same in both cases.
  def test_codegen_cache_with_duplicate_fn_names(self, vector):
    exec_options = copy(vector.get_value('exec_option'))
    exec_options['exec_single_node_rows_threshold'] = 0
    q1 = """select int_col, tinyint_col from functional_parquet.alltypessmall
        order by int_col desc limit 20"""
    q2 = """select tinyint_col from functional_parquet.alltypessmall
        order by int_col desc limit 20"""
    self._check_metric_expect_init()
    self.execute_query_expect_success(self.client, q1, exec_options)
    assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
    self.execute_query_expect_success(self.client, q2, exec_options)
    # If the function name hashes of the first and the second query didn't match, there
    # would be no cache hit and the cache entry from the first query would be evicted
    # because the llvm modules of the two queries, hence the cache keys, are identical.
    assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
    assert self.get_metric('impala.codegen-cache.hits') == 1
    # Expect two misses for the two fragments of the first query and one for one of the
    # fragments of the second query.
    assert self.get_metric('impala.codegen-cache.misses') == 3

  def _check_metric_expect_init(self):
    # Verifies that the cache metrics are all zero, i.e. the cache starts out empty.
    assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
    assert self.get_metric('impala.codegen-cache.entries-in-use') == 0
    assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') == 0
    assert self.get_metric('impala.codegen-cache.hits') == 0
    assert self.get_metric('impala.codegen-cache.misses') == 0

  def _test_codegen_cache(self, vector, sql, expect_hit=True, expect_num_frag=2):
    """Runs 'sql' twice against a fresh codegen cache and verifies the cache metrics.

    The first run must populate the cache (misses only); the second run is expected to
    hit the cache for 'expect_num_frag' fragments if 'expect_hit' is True, or for one
    fragment fewer if it is False."""
    # Do not disable codegen.
    exec_options = copy(vector.get_value('exec_option'))
    exec_options['exec_single_node_rows_threshold'] = 0
    self._check_metric_expect_init()
    result = self.execute_query(sql, exec_options)
    assert_codegen_cache_hit(result.runtime_profile, False)
    # expect_num_cache_miss_fragment is 1 iff expect_hit is False, and expect only
    # one fragment codegen cache missing for the case if expect_hit is False.
    expect_num_cache_miss_fragment = 1
    if expect_hit:
      expect_num_cache_miss_fragment = 0
    expect_num_cache_hit = expect_num_frag - expect_num_cache_miss_fragment
    # Verifies that the cache misses > 0, because the look up fails in an empty
    # brandnew cache, then a new entry should be stored successfully, so the in-use
    # entry number and bytes should be larger than 0.
    assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
    assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
    assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
    assert self.get_metric('impala.codegen-cache.hits') == 0
    assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit
    result = self.execute_query(sql, exec_options)
    # Verify again, the expected cache hit should be reflected.
    if expect_hit:
      assert_codegen_cache_hit(result.runtime_profile, True)
    else:
      assert_codegen_cache_hit(result.runtime_profile, False)
    assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
    assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
    assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
    assert self.get_metric('impala.codegen-cache.hits') == expect_num_cache_hit
    assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit

  def _load_functions(self, database):
    """(Re)creates 'database' and registers the native test UDA/UDFs in it."""
    create_func_template = """
    use default;
    drop database if exists {database} CASCADE;
    create database {database};
    create aggregate function {database}.test_count(int) returns bigint
    location '{location_uda}' update_fn='CountUpdate';
    create function {database}.identity(boolean) returns boolean
    location '{location_udf}' symbol='Identity';
    create function {database}.identity(bigint) returns bigint
    location '{location_udf}' symbol='Identity';
    use {database};
    """
    location_uda = get_fs_path('/test-warehouse/libudasample.so')
    location_udf = get_fs_path('/test-warehouse/libTestUdfs.so')
    queries = create_func_template.format(database=database,
        location_uda=location_uda, location_udf=location_udf)
    # Splitting on ';' leaves blank fragments; the comprehension drops them.
    queries = [q for q in queries.split(';') if q.strip()]
    for query in queries:
      result = self.execute_query_expect_success(self.client, query)
      assert result is not None

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=3,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_udf_crash(self, vector):
    # The testcase would crash if we don't disable the native udf for codegen cache.
    database = "test_codegen_cache_udf_crash"
    self._load_functions(database)
    self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)
    # Even the udf is disabled and the queries are using the udf, there could be
    # other fragments stored to the codegen cache, so we check whether the codegen
    # cache is enabled to other cases.
    assert self.get_metric('impala.codegen-cache.entries-in-use') > 0
    assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
    # Run multiple times, recreate the udfs, would crash if the udf is reused from
    # the codegen cache.
    for _ in range(3):
      # Make the database different
      database = database + "diff"
      self._load_functions(database)
      self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)

  def _test_codegen_cache_timezone_crash_helper(self, database):
    """(Re)creates 'database' with a copy of functional.alltimezones, then runs a
    from_utc_timestamp query over it (one iteration of the timezone crash test)."""
    create_db_template = """
    use default;
    drop database if exists {database} CASCADE;
    create database {database};
    create table {database}.alltimezones as select * from functional.alltimezones;
    use {database};
    """
    queries = create_db_template.format(database=database)
    # Splitting on ';' leaves blank fragments; the comprehension drops them.
    queries = [q for q in queries.split(';') if q.strip()]
    query = "select timezone, utctime, localtime,\
        from_utc_timestamp(utctime,timezone) as\
        impalaresult from alltimezones where\
        localtime != from_utc_timestamp(utctime,timezone)"
    queries.append(query)
    for query in queries:
      result = self.execute_query_expect_success(self.client, query)
      assert result is not None

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(cluster_size=1,
      impalad_args="--codegen_cache_capacity=1GB")
  def test_codegen_cache_timezone_crash(self, vector):
    # The testcase tests whether it would crash using the broken builtin function
    # from_utc_timestamp from the codegen cache.
    database = "test_codegen_cache_timezone_crash"
    # Run multiple times, recreate the database each time. Except for the first run,
    # other runs should all hit the cache.
    # Expect won't crash.
    for i in range(5):
      # Make the database different
      self._test_codegen_cache_timezone_crash_helper(database + str(i))
      # During the table creation, there will be one fragment involved, for the
      # query we are going to test, will be two fragments, so totally three
      # fragments involved, should all be cached.
      assert self.get_metric('impala.codegen-cache.entries-in-use') == 3
      assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
      assert self.get_metric('impala.codegen-cache.hits') == i * 3
      assert self.get_metric('impala.codegen-cache.misses') == 3