IMPALA-13188: Add test that compute stats does not result in a different tuple cache key

The patch introduces a new test, TestTupleCacheComputeStats, to
verify that compute stats does not change the tuple cache key.
The test creates a simple table with one row, runs an explain
on a basic query, then inserts more rows, computes the stats,
and reruns the same explain query. It compares the two results
to ensure that the cache keys are identical in the planning
phase.

Tests:
Ran the new test (TestTupleCacheComputeStats); it passes.

Change-Id: I918232f0af3a6ab8c32823da4dba8f8cd31369d0
Reviewed-on: http://gerrit.cloudera.org:8080/21917
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Yida Wu
2024-10-09 16:00:13 -07:00
committed by Impala Public Jenkins
parent 3cf05fe21a
commit 47b638e667
2 changed files with 73 additions and 0 deletions

View File

@@ -25,6 +25,8 @@ import string
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.test_dimensions import (
add_exec_option_dimension, add_mandatory_exec_option)
from tests.util.parse_util import (
match_memory_estimate, parse_mem_to_mb, match_cache_key)
TABLE_LAYOUT = 'name STRING, age INT, address STRING'
CACHE_START_ARGS = "--tuple_cache_dir=/tmp --log_level=2"
@@ -703,3 +705,48 @@ class TestTupleCacheCountStar(TestTupleCacheBase):
result1 = self.execute_query(query)
result2 = self.execute_query(query)
assert result1.success and result2.success
class TestTupleCacheComputeStats(TestTupleCacheBase):
  """Checks that computing table statistics does not perturb the tuple cache key."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestTupleCacheComputeStats, cls).add_test_dimensions()
    # Exercise both single-threaded and multi-threaded plans.
    add_exec_option_dimension(cls, 'mt_dop', [0, 2])

  @CustomClusterTestSuite.with_args(
    start_args=CACHE_START_ARGS, cluster_size=1)
  @pytest.mark.execute_serially
  def test_tuple_cache_key_with_stats(self, vector, unique_database):
    """
    Verifies that running COMPUTE STATS leaves the tuple cache key unchanged
    even though the memory estimates in the plan change.
    """
    self.client.set_configuration(vector.get_value('exec_option'))
    fq_table = "{0}.tuple_cache_stats_test".format(unique_database)
    # Create a table.
    self.create_table(fq_table, scale=1)
    # Capture the plan for a simple scan before the stats change.
    query = "explain select * from {0}".format(fq_table)
    explain_before = self.execute_query(query)
    # Grow the table so the computed stats will differ.
    for row_idx in range(10):
      self.execute_query("INSERT INTO {0} VALUES ({1})".format(
        fq_table, table_value(row_idx)))
    # Refresh the stats, then re-plan the identical query.
    self.client.execute("COMPUTE STATS {0}".format(fq_table))
    explain_after = self.execute_query(query)
    assert explain_before.success
    assert explain_after.success
    # The memory estimates should have moved with the new stats...
    mem_before, units_before = match_memory_estimate(explain_before.data)
    mem_after, units_after = match_memory_estimate(explain_after.data)
    assert parse_mem_to_mb(mem_before, units_before) != \
        parse_mem_to_mb(mem_after, units_after)
    # ...while the tuple cache key stays identical.
    key_before = match_cache_key(explain_before.data)
    key_after = match_cache_key(explain_after.data)
    assert key_before is not None and key_before == key_after

View File

@@ -34,6 +34,7 @@ EXPECTED_TPCH_STRESS_QUERIES_COUNT = EXPECTED_TPCH_QUERIES_COUNT + 3
MEM_ESTIMATE_PATTERN = re.compile(
r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B")
NEW_GLOG_ENTRY_PATTERN = re.compile(r"[IWEF](?P<Time>\d{4} \d{2}:\d{2}:\d{2}\.\d{6}).*")
CACHE_KEY_PATTERN = re.compile(r"cache key: ([a-f0-9]+)")
def parse_glog(text, start_time=None):
@@ -155,6 +156,31 @@ def match_memory_estimate(explain_lines):
return mem_limit, units
def match_cache_key(explain_lines):
  """Return the first tuple cache key found in EXPLAIN output.

  Params:
    explain_lines: list of str
  Returns:
    str - The cache key if found
  Raises:
    Exception if no cache key is found
  """
  # Return as soon as any line matches; only the first key matters.
  for explain_line in explain_lines:
    found = CACHE_KEY_PATTERN.search(explain_line)
    if found:
      return found.group(1)
  raise Exception(
    'could not find cache key in explain string:\n' + '\n'.join(explain_lines))
def get_bytes_summary_stats_counter(counter_name, runtime_profile):
"""Extracts a list of TSummaryStatsCounters from a given runtime profile where the units
are in bytes. Each entry in the returned list corresponds to a single occurrence of