IMPALA-13188: Add test that compute stats does not result in a different tuple cache key

The patch introduces a new test, TestTupleCacheComputeStats, to
verify that compute stats does not change the tuple cache key.
The test creates a simple table with one row, runs an explain
on a basic query, then inserts more rows, computes the stats,
and reruns the same explain query. It compares the two results
to ensure that the cache keys are identical in the planning
phase.

Tests:
Ran the new test (TestTupleCacheComputeStats); it passes.

Change-Id: I918232f0af3a6ab8c32823da4dba8f8cd31369d0
Reviewed-on: http://gerrit.cloudera.org:8080/21917
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Yida Wu
2024-10-09 16:00:13 -07:00
committed by Impala Public Jenkins
parent 3cf05fe21a
commit 47b638e667
2 changed files with 73 additions and 0 deletions

View File

@@ -25,6 +25,8 @@ import string
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.test_dimensions import (
add_exec_option_dimension, add_mandatory_exec_option)
from tests.util.parse_util import (
match_memory_estimate, parse_mem_to_mb, match_cache_key)
TABLE_LAYOUT = 'name STRING, age INT, address STRING'
CACHE_START_ARGS = "--tuple_cache_dir=/tmp --log_level=2"
@@ -703,3 +705,48 @@ class TestTupleCacheCountStar(TestTupleCacheBase):
result1 = self.execute_query(query)
result2 = self.execute_query(query)
assert result1.success and result2.success
class TestTupleCacheComputeStats(TestTupleCacheBase):
  """Checks that computing table statistics does not perturb the tuple cache key."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestTupleCacheComputeStats, cls).add_test_dimensions()
    # Exercise both single-threaded and multi-threaded plans.
    add_exec_option_dimension(cls, 'mt_dop', [0, 2])

  @CustomClusterTestSuite.with_args(
    start_args=CACHE_START_ARGS, cluster_size=1)
  @pytest.mark.execute_serially
  def test_tuple_cache_key_with_stats(self, vector, unique_database):
    """
    Verifies that running COMPUTE STATS leaves the tuple cache key unchanged
    even though the memory estimates in the plan change.
    """
    self.client.set_configuration(vector.get_value('exec_option'))
    fq_table = "{0}.tuple_cache_stats_test".format(unique_database)
    # Create a table.
    self.create_table(fq_table, scale=1)
    # Capture the plan for a simple scan before the stats change.
    query = "explain select * from {0}".format(fq_table)
    explain_before = self.execute_query(query)
    # Grow the table so the computed stats will differ.
    for row_idx in range(10):
      self.execute_query("INSERT INTO {0} VALUES ({1})".format(
        fq_table, table_value(row_idx)))
    # Refresh the stats, then re-plan the identical query.
    self.client.execute("COMPUTE STATS {0}".format(fq_table))
    explain_after = self.execute_query(query)
    assert explain_before.success
    assert explain_after.success
    # The memory estimates should have moved with the new stats...
    mem_before, units_before = match_memory_estimate(explain_before.data)
    mem_after, units_after = match_memory_estimate(explain_after.data)
    assert parse_mem_to_mb(mem_before, units_before) != \
        parse_mem_to_mb(mem_after, units_after)
    # ...while the tuple cache key stays identical.
    key_before = match_cache_key(explain_before.data)
    key_after = match_cache_key(explain_after.data)
    assert key_before is not None and key_before == key_after

View File

@@ -34,6 +34,7 @@ EXPECTED_TPCH_STRESS_QUERIES_COUNT = EXPECTED_TPCH_QUERIES_COUNT + 3
MEM_ESTIMATE_PATTERN = re.compile(
r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B")
NEW_GLOG_ENTRY_PATTERN = re.compile(r"[IWEF](?P<Time>\d{4} \d{2}:\d{2}:\d{2}\.\d{6}).*")
CACHE_KEY_PATTERN = re.compile(r"cache key: ([a-f0-9]+)")
def parse_glog(text, start_time=None):
@@ -155,6 +156,31 @@ def match_memory_estimate(explain_lines):
return mem_limit, units
def match_cache_key(explain_lines):
  """Return the first tuple cache key found in EXPLAIN output.

  Params:
    explain_lines: list of str
  Returns:
    str - The cache key if found
  Raises:
    Exception if no cache key is found
  """
  # Return as soon as any line matches; only the first key matters.
  for explain_line in explain_lines:
    found = CACHE_KEY_PATTERN.search(explain_line)
    if found:
      return found.group(1)
  raise Exception(
    'could not find cache key in explain string:\n' + '\n'.join(explain_lines))
def get_bytes_summary_stats_counter(counter_name, runtime_profile):
"""Extracts a list of TSummaryStatsCounters from a given runtime profile where the units
are in bytes. Each entry in the returned list corresponds to a single occurrence of