IMPALA-13887: Incorporate column/field information into cache key

The correctness verification for the tuple cache found an issue
with TestParquet::test_resolution_by_name(). The test creates a
table, selects, alters the table to change a column name, and
selects again. With parquet_fallback_schema_resolution=NAME, the
column names determine behavior. The tuple cache key did not
include the column names, so the tuple cache was producing an
incorrect result for the select issued after the column was renamed.

This change adds information about the column / field name to the
TSlotDescriptor so that it is incorporated into the tuple cache key.
This is only needed when producing the tuple cache key, so it is
omitted for other cases.

Testing:
 - Ran TestParquet::test_resolution_by_name() with correctness
   verification
 - Added custom cluster test that runs the test_resolution_by_name()
   test case with tuple caching. This fails without this change.

Change-Id: Iebfa777452daf66851b86383651d35e1b0a5f262
Reviewed-on: http://gerrit.cloudera.org:8080/23073
Reviewed-by: Yida Wu <wydbaggio000@gmail.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Joe McDonnell
2025-06-15 10:05:56 -07:00
parent 50f01352aa
commit 7b25a7b070
4 changed files with 37 additions and 0 deletions

View File

@@ -52,6 +52,12 @@ struct TSlotDescriptor {
9: required i32 slotIdx
10: required CatalogObjects.TVirtualColumnType virtual_col_type =
CatalogObjects.TVirtualColumnType.NONE
// The path includes column / field names materialized by a scan. This is set for
// producing the tuple cache key, because the names of columns / fields determine
// behavior when resolving Parquet columns/fields by name. This information is
// provided by other structures for the executor, so it only needs to be set for
// the tuple cache.
11: optional string path
}
struct TColumnDescriptor {

View File

@@ -478,6 +478,19 @@ public class SlotDescriptor {
serialCtx.translateTupleId(parent_.getId()).asInt(), type_.toThrift(),
materializedPath, byteOffset_, nullIndicatorByte_, nullIndicatorBit_,
slotIdx_, getVirtualColumnType());
// The path contains information about the column/field materialized by the scan.
// This information is needed for the tuple caching key, because the names of
// the columns / fields determine the runtime behavior when resolving Parquet
// columns by name (i.e. a table with column X is different from a table with column
// Y even if it points to the same files). This information is provided to the
// executor in other ways, so this is only necessary when constructing the tuple
// caching key.
if (serialCtx.isTupleCache() && path_ != null) {
// When path_ is non-null, label_ is a representation of the path as a single
// string. label_ can still be set when the path_ is null, but that content is
// not interesting to the tuple cache.
result.setPath(label_);
}
if (itemTupleDesc_ != null) {
// Check for recursive or otherwise invalid item tuple descriptors. Since we assign
// tuple ids globally in increasing order, the id of an item tuple descriptor must

View File

@@ -36,6 +36,19 @@ string
'NULL'
====
---- QUERY
# This test case is added specifically for tuple caching. This query is symmetric
# to the "select a, b" query above, so it tests that the tuple cache key contains
# enough information to distinguish the different columns.
set parquet_fallback_schema_resolution="NAME";
select new_a, b from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'NULL','bbbbbbb'
'NULL','dddd'
'NULL','f'
====
---- QUERY
# Can still resolve by ordinal
set parquet_fallback_schema_resolution="POSITION";
select b, new_a from resolution_by_name_test;

View File

@@ -399,6 +399,11 @@ class TestTupleCacheSingle(TestTupleCacheBase):
assert result_agg.success
assertCounterOrder(result_agg.runtime_profile, NUM_HITS, [1, 0])
def test_parquet_resolution_by_name(self, vector, unique_database):
  """Check that tuple caching works with parquet_fallback_schema_resolution=NAME.

  Runs the 'QueryTest/parquet-resolution-by-name' test file with unique_database
  as the database in use.
  """
  test_file = 'QueryTest/parquet-resolution-by-name'
  self.run_test_case(test_file, vector, use_db=unique_database)
@CustomClusterTestSuite.with_args(start_args=CACHE_START_ARGS)
class TestTupleCacheCluster(TestTupleCacheBase):