mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
The correctness verification for the tuple cache found an issue with TestParquet::test_resolution_by_name(). The test creates a table, selects, alters the table to change a column name, and selects again. With parquet_fallback_schema_resolution=NAME, the column names determine behavior. The tuple cache key did not include the column names, so it was producing an incorrect result after changing the column name. This change adds information about the column / field name to the TSlotDescriptor so that it is incorporated into the tuple cache key. This is only needed when producing the tuple cache key, so it is omitted for other cases. Testing: - Ran TestParquet::test_resolution_by_name() with correctness verification - Added custom cluster test that runs the test_resolution_by_name() test case with tuple caching. This fails without this change. Change-Id: Iebfa777452daf66851b86383651d35e1b0a5f262 Reviewed-on: http://gerrit.cloudera.org:8080/23073 Reviewed-by: Yida Wu <wydbaggio000@gmail.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
239 lines
5.9 KiB
Plaintext
239 lines
5.9 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Create a Parquet table populated via CTAS from functional_parquet.tinytable and
# read columns a and b back. Later test cases in this file alter this table's schema
# and re-read the same data.
|
|
create table resolution_by_name_test stored as parquet
|
|
as select * from functional_parquet.tinytable;
|
|
select a, b from resolution_by_name_test;
|
|
---- TYPES
|
|
string,string
|
|
---- RESULTS
|
|
'aaaaaaa','bbbbbbb'
|
|
'ccccc','dddd'
|
|
'eeeeeeee','f'
|
|
====
|
|
---- QUERY
|
|
# Rearrange the columns and make sure we can still resolve by name.
# The declared column order becomes (b, a); with NAME resolution the expected output
# of "select a, b" is the same as before the alter.
|
|
alter table resolution_by_name_test replace columns (b string, a string);
|
|
set parquet_fallback_schema_resolution="NAME";
|
|
select a, b from resolution_by_name_test;
|
|
---- TYPES
|
|
string,string
|
|
---- RESULTS
|
|
'aaaaaaa','bbbbbbb'
|
|
'ccccc','dddd'
|
|
'eeeeeeee','f'
|
|
====
|
|
---- QUERY
|
|
# Renaming a column will cause the column to not be resolved: with NAME resolution
# the expected value of new_a is NULL in every row.
|
|
set parquet_fallback_schema_resolution="NAME";
|
|
alter table resolution_by_name_test change a new_a string;
|
|
select new_a from resolution_by_name_test;
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
====
|
|
---- QUERY
|
|
# This test case is added specifically for tuple caching. This query is symmetric
|
|
# to the "select a, b" query above, so it tests that the tuple cache key contains
|
|
# enough information to distinguish the different columns.
# Expected: the renamed column new_a is unresolved (NULL) while b still resolves
# by name.
|
|
set parquet_fallback_schema_resolution="NAME";
|
|
select new_a, b from resolution_by_name_test;
|
|
---- TYPES
|
|
string,string
|
|
---- RESULTS
|
|
'NULL','bbbbbbb'
|
|
'NULL','dddd'
|
|
'NULL','f'
|
|
====
|
|
---- QUERY
|
|
# Can still resolve by ordinal: with POSITION resolution both columns return data
# again. Note that b now yields the values that column a returned under NAME
# resolution, because the table's column order was swapped earlier in this file.
|
|
set parquet_fallback_schema_resolution="POSITION";
|
|
select b, new_a from resolution_by_name_test;
|
|
---- TYPES
|
|
string,string
|
|
---- RESULTS
|
|
'aaaaaaa','bbbbbbb'
|
|
'ccccc','dddd'
|
|
'eeeeeeee','f'
|
|
====
|
|
---- QUERY
|
|
# Check that we can parse the integer enum value as well: 1 is expected to behave
# like NAME, so the renamed column new_a is unresolved (NULL).
|
|
set parquet_fallback_schema_resolution=1;
|
|
select new_a from resolution_by_name_test;
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
====
|
|
---- QUERY
|
|
# Integer enum value 0 is expected to behave like POSITION, so both columns return
# data, matching the result of the "POSITION" string form.
set parquet_fallback_schema_resolution=0;
|
|
select b, new_a from resolution_by_name_test;
|
|
---- TYPES
|
|
string,string
|
|
---- RESULTS
|
|
'aaaaaaa','bbbbbbb'
|
|
'ccccc','dddd'
|
|
'eeeeeeee','f'
|
|
====
|
|
---- QUERY
|
|
# Test nested types resolution. Create a table with the same schema as
# functional_parquet.complextypestbl; its data files are copied in by the SHELL
# section that follows.
|
|
create table nested_resolution_by_name_test like functional_parquet.complextypestbl;
|
|
====
|
|
---- SHELL
|
|
# Copy the two complextypestbl Parquet data files (nullable.parq and
# nonnullable.parq) into the nested_resolution_by_name_test directory.
hdfs dfs -cp -d -f $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nullable.parq \
|
|
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/
|
|
hdfs dfs -cp -d -f $FILESYSTEM_PREFIX/test-warehouse/complextypestbl_parquet/nonnullable.parq \
|
|
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/nested_resolution_by_name_test/
|
|
====
|
|
---- QUERY
|
|
# Query the nested table before any schema change, selecting id, nested_struct.a and
# the items of the nested_struct.b array. No resolution option is set in this section.
select id, nested_struct.a, b.item
|
|
from nested_resolution_by_name_test t, t.nested_struct.b
|
|
---- TYPES
|
|
bigint,int,int
|
|
---- RESULTS
|
|
1,1,1
|
|
2,NULL,NULL
|
|
7,7,2
|
|
7,7,3
|
|
7,7,NULL
|
|
8,-1,-1
|
|
====
|
|
---- QUERY
|
|
# Can safely ignore extra fields in nested_struct: the struct is redeclared with only
# fields a and b, and the query result is expected to be unchanged.
|
|
alter table nested_resolution_by_name_test change nested_struct nested_struct
|
|
struct<a:int, b: array<int>>;
|
|
select id, nested_struct.a, b.item
|
|
from nested_resolution_by_name_test t, t.nested_struct.b
|
|
---- TYPES
|
|
bigint,int,int
|
|
---- RESULTS
|
|
1,1,1
|
|
2,NULL,NULL
|
|
7,7,2
|
|
7,7,3
|
|
7,7,NULL
|
|
8,-1,-1
|
|
====
|
|
---- QUERY
|
|
# Rearrange nested_struct's fields and make sure we can still resolve by name.
# The struct is redeclared as (b, a) and the option value is given in lower case;
# the query result is expected to be unchanged.
|
|
alter table nested_resolution_by_name_test change nested_struct nested_struct
|
|
struct<b: array<int>, a: int>;
|
|
set parquet_fallback_schema_resolution="name";
|
|
select id, nested_struct.a, b.item
|
|
from nested_resolution_by_name_test t, t.nested_struct.b
|
|
---- TYPES
|
|
bigint,int,int
|
|
---- RESULTS
|
|
1,1,1
|
|
2,NULL,NULL
|
|
7,7,2
|
|
7,7,3
|
|
7,7,NULL
|
|
8,-1,-1
|
|
====
|
|
---- QUERY
|
|
# Can add back a single field: redeclare nested_struct with the map field g appended
# after b and a, then read the map keys with name-based resolution.
|
|
alter table nested_resolution_by_name_test change nested_struct nested_struct
|
|
struct<b: array<int>, a: int, g: map<string, struct<h: struct<i: array<float>>>>>;
|
|
set parquet_fallback_schema_resolution="name";
|
|
select id, g.key
|
|
from nested_resolution_by_name_test t, t.nested_struct.g
|
|
---- TYPES
|
|
bigint,string
|
|
---- RESULTS
|
|
1,'foo'
|
|
2,'g1'
|
|
2,'g2'
|
|
2,'g3'
|
|
2,'g4'
|
|
2,'g5'
|
|
5,'foo'
|
|
====
|
|
---- QUERY
|
|
# Add back a single, more deeply nested field c (and remove the 'g' field), then
# read string field f from the structs inside the nested arrays of c.d, resolving
# by name.
|
|
alter table nested_resolution_by_name_test change nested_struct nested_struct
|
|
struct<b: array<int>, a: int, c: struct<d: array<array<struct<f: string>>>>>;
|
|
set parquet_fallback_schema_resolution="name";
|
|
select tmp.f from nested_resolution_by_name_test.nested_struct.c.d.item tmp;
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'aaa'
|
|
'bbb'
|
|
'c'
|
|
'NULL'
|
|
'aaa'
|
|
'NULL'
|
|
'bbb'
|
|
'NULL'
|
|
'c'
|
|
'NULL'
|
|
'NULL'
|
|
'nonnullable'
|
|
====
|
|
---- QUERY
|
|
# Can't rename nested field: with innermost field f redeclared as 'renamed',
# name-based resolution is expected to return NULL for every row.
|
|
alter table nested_resolution_by_name_test change nested_struct nested_struct
|
|
struct<b: array<int>, a: int, c: struct<d: array<array<struct<renamed: string>>>>>;
|
|
set parquet_fallback_schema_resolution="name";
|
|
select tmp.renamed from nested_resolution_by_name_test.nested_struct.c.d.item tmp;
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
'NULL'
|
|
====
|
|
---- QUERY
|
|
# Test switched key/value map fields. Create a Parquet table with a single
# map<string,int> column; its data file is copied in by the SHELL section that
# follows.
|
|
create table switched_map_fields_resolution_test (int_map map<string,int>)
|
|
stored as parquet;
|
|
====
|
|
---- SHELL
|
|
# Copy the local test file switched_map.parq into the
# switched_map_fields_resolution_test directory.
hdfs dfs -copyFromLocal -d -f \
|
|
$IMPALA_HOME/testdata/parquet_schema_resolution/switched_map.parq \
|
|
$FILESYSTEM_PREFIX/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/
|
|
====
|
|
---- QUERY
|
|
# Switched map fields should be resolvable by name: key is expected to come back as
# a string and value as an int.
|
|
set parquet_fallback_schema_resolution="name";
|
|
select key, value from switched_map_fields_resolution_test.int_map
|
|
---- TYPES
|
|
string,int
|
|
---- RESULTS
|
|
'a',1
|
|
'b',2
|
|
'c',3
|
|
====
|
|
---- QUERY
|
|
# Can't resolve switched map fields by position since types are switched.
# The expected error reports the STRING column int_map.key being matched against
# the file's int32 'value' field.
|
|
set parquet_fallback_schema_resolution="position";
|
|
select key, value from switched_map_fields_resolution_test.int_map
|
|
---- CATCH
|
|
File '$NAMENODE/test-warehouse/$DATABASE.db/switched_map_fields_resolution_test/
|
|
switched_map.parq' has an incompatible Parquet schema for column
|
|
'$DATABASE.switched_map_fields_resolution_test.int_map.key'.
|
|
Column type: STRING, Parquet schema:
|
|
required int32 value [i:0 d:1 r:1]
|
|
====
|
|
---- QUERY
|
|
# Check that we handle bad options gracefully: an unknown value is expected to be
# rejected with an error that lists the valid values.
|
|
set parquet_fallback_schema_resolution="FOO"
|
|
---- CATCH
|
|
Invalid parquet fallback schema resolution: 'FOO'. Valid values are POSITION(0), NAME(1), FIELD_ID(2).
|
|
====
|