Files
impala/testdata/workloads/functional-query/queries/QueryTest/parquet-resolution-by-name.test
Joe McDonnell 7b25a7b070 IMPALA-13887: Incorporate column/field information into cache key
The correctness verification for the tuple cache found an issue
with TestParquet::test_resolution_by_name(). The test creates a
table, selects, alters the table to change a column name, and
selects again. With parquet_fallback_schema_resolution=NAME, the
column names determine behavior. The tuple cache key did not
include the column names, so it was producing an incorrect result
after changing the column name.

This change adds information about the column / field name to the
TSlotDescriptor so that it is incorporated into the tuple cache key.
This is only needed when producing the tuple cache key, so it is
omitted for other cases.

Testing:
 - Ran TestParquet::test_resolution_by_name() with correctness
   verification
 - Added custom cluster test that runs the test_resolution_by_name()
   test case with tuple caching. This fails without this change.

Change-Id: Iebfa777452daf66851b86383651d35e1b0a5f262
Reviewed-on: http://gerrit.cloudera.org:8080/23073
Reviewed-by: Yida Wu <wydbaggio000@gmail.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-06-27 18:13:19 +00:00

239 lines
5.9 KiB
Plaintext

====
---- QUERY
# Create a table and populate with data file
create table resolution_by_name_test stored as parquet
as select * from functional_parquet.tinytable;
select a, b from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'aaaaaaa','bbbbbbb'
'ccccc','dddd'
'eeeeeeee','f'
====
---- QUERY
# Rearrange the columns and make sure we can still resolve by name
alter table resolution_by_name_test replace columns (b string, a string);
set parquet_fallback_schema_resolution="NAME";
select a, b from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'aaaaaaa','bbbbbbb'
'ccccc','dddd'
'eeeeeeee','f'
====
---- QUERY
# Renaming a column will cause the column to not be resolved
set parquet_fallback_schema_resolution="NAME";
alter table resolution_by_name_test change a new_a string;
select new_a from resolution_by_name_test;
---- TYPES
string
---- RESULTS
'NULL'
'NULL'
'NULL'
====
---- QUERY
# This test case is added specifically for tuple caching. This query is symmetric
# to the "select a, b" query above, so it tests that the tuple cache key contains
# enough information to distinguish the different columns.
set parquet_fallback_schema_resolution="NAME";
select new_a, b from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'NULL','bbbbbbb'
'NULL','dddd'
'NULL','f'
====
---- QUERY
# Can still resolve by ordinal
set parquet_fallback_schema_resolution="POSITION";
select b, new_a from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'aaaaaaa','bbbbbbb'
'ccccc','dddd'
'eeeeeeee','f'
====
---- QUERY
# Check that we can parse the integer enum value as well
set parquet_fallback_schema_resolution=1;
select new_a from resolution_by_name_test;
---- TYPES
string
---- RESULTS
'NULL'
'NULL'
'NULL'
====
---- QUERY
# Integer enum value 0 corresponds to POSITION, so the columns resolve by ordinal
# again and the renamed column returns data instead of NULL.
set parquet_fallback_schema_resolution=0;
select b, new_a from resolution_by_name_test;
---- TYPES
string,string
---- RESULTS
'aaaaaaa','bbbbbbb'
'ccccc','dddd'
'eeeeeeee','f'
====
---- QUERY
# Test nested types resolution
create table nested_resolution_by_name_test like functional_parquet.complextypestbl;
====
---- SHELL
hdfs dfs -cp -d -f $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nullable.parq \
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/
hdfs dfs -cp -d -f $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nonnullable.parq \
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/
====
---- QUERY
# Baseline: read a scalar member of nested_struct and the items of its array member
# before the struct's type is altered by the test cases that follow.
select id, nested_struct.a, b.item
from nested_resolution_by_name_test t, t.nested_struct.b
---- TYPES
bigint,int,int
---- RESULTS
1,1,1
2,NULL,NULL
7,7,2
7,7,3
7,7,NULL
8,-1,-1
====
---- QUERY
# Can safely ignore extra fields in nested_struct
alter table nested_resolution_by_name_test change nested_struct nested_struct
struct<a:int, b: array<int>>;
select id, nested_struct.a, b.item
from nested_resolution_by_name_test t, t.nested_struct.b
---- TYPES
bigint,int,int
---- RESULTS
1,1,1
2,NULL,NULL
7,7,2
7,7,3
7,7,NULL
8,-1,-1
====
---- QUERY
# Rearrange nested_struct's fields and make sure we can still resolve by name
alter table nested_resolution_by_name_test change nested_struct nested_struct
struct<b: array<int>, a: int>;
set parquet_fallback_schema_resolution="name";
select id, nested_struct.a, b.item
from nested_resolution_by_name_test t, t.nested_struct.b
---- TYPES
bigint,int,int
---- RESULTS
1,1,1
2,NULL,NULL
7,7,2
7,7,3
7,7,NULL
8,-1,-1
====
---- QUERY
# Can add back a single field
alter table nested_resolution_by_name_test change nested_struct nested_struct
struct<b: array<int>, a: int, g: map<string, struct<h: struct<i: array<float>>>>>;
set parquet_fallback_schema_resolution="name";
select id, g.key
from nested_resolution_by_name_test t, t.nested_struct.g
---- TYPES
bigint,string
---- RESULTS
1,'foo'
2,'g1'
2,'g2'
2,'g3'
2,'g4'
2,'g5'
5,'foo'
====
---- QUERY
# Add back one more nested field, 'c' (and remove the 'g' field)
alter table nested_resolution_by_name_test change nested_struct nested_struct
struct<b: array<int>, a: int, c: struct<d: array<array<struct<f: string>>>>>;
set parquet_fallback_schema_resolution="name";
select tmp.f from nested_resolution_by_name_test.nested_struct.c.d.item tmp;
---- TYPES
string
---- RESULTS
'aaa'
'bbb'
'c'
'NULL'
'aaa'
'NULL'
'bbb'
'NULL'
'c'
'NULL'
'NULL'
'nonnullable'
====
---- QUERY
# A renamed nested field can no longer be resolved by name, so it reads as NULL
alter table nested_resolution_by_name_test change nested_struct nested_struct
struct<b: array<int>, a: int, c: struct<d: array<array<struct<renamed: string>>>>>;
set parquet_fallback_schema_resolution="name";
select tmp.renamed from nested_resolution_by_name_test.nested_struct.c.d.item tmp;
---- TYPES
string
---- RESULTS
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
'NULL'
====
---- QUERY
# Test switched key/value map fields
create table switched_map_fields_resolution_test (int_map map<string,int>)
stored as parquet;
====
---- SHELL
hdfs dfs -copyFromLocal -d -f \
$IMPALA_HOME/testdata/parquet_schema_resolution/switched_map.parq \
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/
====
---- QUERY
# Switched map fields should be resolvable by name.
set parquet_fallback_schema_resolution="name";
select key, value from switched_map_fields_resolution_test.int_map
---- TYPES
string,int
---- RESULTS
'a',1
'b',2
'c',3
====
---- QUERY
# Can't resolve switched map fields by position since types are switched.
set parquet_fallback_schema_resolution="position";
select key, value from switched_map_fields_resolution_test.int_map
---- CATCH
File '$NAMENODE/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/
switched_map.parq' has an incompatible Parquet schema for column
'$DATABASE.switched_map_fields_resolution_test.int_map.key'.
Column type: STRING, Parquet schema:
required int32 value [i:0 d:1 r:1]
====
---- QUERY
# Check that we handle bad options gracefully
set parquet_fallback_schema_resolution="FOO"
---- CATCH
Invalid parquet fallback schema resolution: 'FOO'. Valid values are POSITION(0), NAME(1), FIELD_ID(2).
====