IMPALA-9499: Display support for all complex types in a SELECT * query

This change adds EXPAND_COMPLEX_TYPES query option to support the
display of complex types in SELECT statements where star (*) expression
is in the select list. By default, the query option is disabled. When
it's enabled, it changes the behaviour of star expansion to list all
top-level column fields including ones with complex types, instead of
listing the scalar column fields only. Nested complex type expansion is
also supported, eg.: struct.* will enumerate the members of the struct.
Array, map and struct types are supported.

Testing:
 - Analyzer tests check select statements when the query option is
   enabled or disabled.
 - EE tests check the proper complex type deserialization when the query
   option is enabled, and the original behaviour when the option is
   disabled.

Change-Id: I84b5e5703f9e0ce0f4f8bff83941677dd7489974
Reviewed-on: http://gerrit.cloudera.org:8080/18863
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Peter Rozsa
2022-08-11 09:24:09 +02:00
committed by Impala Public Jenkins
parent 0b251cc6bc
commit 04b5319e6e
10 changed files with 255 additions and 11 deletions

View File

@@ -1229,6 +1229,10 @@ Status impala::SetQueryOption(const string& key, const string& value,
query_options->__set_orc_schema_resolution(enum_type);
break;
}
case TImpalaQueryOptions::EXPAND_COMPLEX_TYPES: {
query_options->__set_expand_complex_types(IsTrue(value));
break;
}
default:
if (IsRemovedQueryOption(key)) {
LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'";

View File

@@ -50,7 +50,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
// time we add or remove a query option to/from the enum TImpalaQueryOptions.
#define QUERY_OPTS_TABLE \
DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(), \
TImpalaQueryOptions::ORC_SCHEMA_RESOLUTION + 1); \
TImpalaQueryOptions::EXPAND_COMPLEX_TYPES + 1); \
REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR) \
REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS) \
@@ -274,7 +274,8 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
QUERY_OPT_FN(enable_replan, ENABLE_REPLAN, TQueryOptionLevel::ADVANCED) \
QUERY_OPT_FN(test_replan, TEST_REPLAN, TQueryOptionLevel::ADVANCED) \
QUERY_OPT_FN(lock_max_wait_time_s, LOCK_MAX_WAIT_TIME_S, TQueryOptionLevel::REGULAR) \
QUERY_OPT_FN(orc_schema_resolution, ORC_SCHEMA_RESOLUTION, TQueryOptionLevel::REGULAR);
QUERY_OPT_FN(orc_schema_resolution, ORC_SCHEMA_RESOLUTION, TQueryOptionLevel::REGULAR) \
QUERY_OPT_FN(expand_complex_types, EXPAND_COMPLEX_TYPES, TQueryOptionLevel::REGULAR);
/// Enforce practical limits on some query options to avoid undesired query state.
static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB

View File

@@ -736,6 +736,10 @@ enum TImpalaQueryOptions {
// Determines how to resolve ORC files' schemas. Valid values are "position" and "name".
ORC_SCHEMA_RESOLUTION = 146;
// Expands complex types in star queries
EXPAND_COMPLEX_TYPES = 147
}
// The summary of a DML statement.

View File

@@ -594,6 +594,9 @@ struct TQueryOptions {
// See comment in ImpalaService.thrift
147: optional TSchemaResolutionStrategy orc_schema_resolution = 0;
// See comment in ImpalaService.thrift
148: optional bool expand_complex_types = false;
}
// Impala currently has three types of sessions: Beeswax, HiveServer2 and external

View File

@@ -455,7 +455,9 @@ public class InlineViewRef extends TableRef {
int numColLabels = getColLabels().size();
Preconditions.checkState(numColLabels > 0);
Set<String> uniqueColAliases = Sets.newHashSetWithExpectedSize(numColLabels);
List<StructField> fields = Lists.newArrayListWithCapacity(numColLabels);
// Using linked set to preserve order and uniqueness of fields. If using a
// list, star-expanded complex columns would be enumerated multiple times.
Set<StructField> fields = Sets.newLinkedHashSetWithExpectedSize(numColLabels);
for (int i = 0; i < numColLabels; ++i) {
// inline view select statement has been analyzed. Col label should be filled.
Expr selectItemExpr = queryStmt_.getResultExprs().get(i);
@@ -512,7 +514,7 @@ public class InlineViewRef extends TableRef {
result.setMaskedTable(baseTbl);
}
}
result.setType(new StructType(fields));
result.setType(new StructType(Lists.newArrayList(fields)));
return result;
}

View File

@@ -793,24 +793,27 @@ public class SelectStmt extends QueryStmt {
/**
* Helper function used during star expansion to add a single result expr
* based on a given raw path to be resolved relative to an existing path.
* Ignores paths with a complex-typed destination because they are currently
* illegal in any select list (even for inline views, etc.)
*/
private void addStarResultExpr(Path resolvedPath,
String... relRawPath) throws AnalysisException {
Path p = Path.createRelPath(resolvedPath, relRawPath);
Preconditions.checkState(p.resolve());
if (p.destType().isComplexType()) return;
SlotDescriptor slotDesc = analyzer_.registerSlotRef(p);
if (p.destType().isComplexType() &&
!analyzer_.getQueryCtx().client_request.query_options.expand_complex_types) {
return;
}
SlotDescriptor slotDesc = analyzer_.registerSlotRef(p, false);
SlotRef slotRef = new SlotRef(slotDesc);
Preconditions.checkState(slotRef.isAnalyzed(),
"Analysis should be done in constructor");
if(slotRef.getType().isStructType()){
slotRef.reExpandStruct(analyzer_);
}
// Empty matched types means this is expanded from star of a catalog table.
// For star of complex types, e.g. my_struct.*, my_array.*, my_map.*, the matched
// types will have the complex type so it's not empty.
if (resolvedPath.getMatchedTypes().isEmpty()) {
Preconditions.checkState(!slotDesc.getType().isComplexType(),
"Star expansion should only introduce scalar columns");
analyzer_.registerColumnForMasking(slotDesc);
}
resultExprs_.add(slotRef);

View File

@@ -23,6 +23,8 @@ import org.apache.impala.thrift.TColumnType;
import org.apache.impala.thrift.TStructField;
import org.apache.impala.thrift.TTypeNode;
import java.util.Objects;
/**
* TODO: Support comments for struct fields. The Metastore does not properly store
* comments of struct fields. We set comment_ to null to avoid compatibility issues.
@@ -99,4 +101,8 @@ public class StructField {
StructField otherStructField = (StructField) other;
return otherStructField.name_.equals(name_) && otherStructField.type_.equals(type_);
}
@Override
public int hashCode() {
return Objects.hash(name_, type_);
}
}

View File

@@ -1071,6 +1071,33 @@ public class AnalyzeStmtsTest extends AnalyzerTest {
"Unable to INSERT into target table (default.new_tbl) because the column " +
"'tiny_struct' has a complex type 'STRUCT<b:BOOLEAN>' and Impala doesn't " +
"support inserting into tables containing complex type columns");
//Make complex types available in star queries
ctx.getQueryOptions().setExpand_complex_types(true);
//TODO: Once IMPALA-10851 is resolved it can be removed
ctx.getQueryOptions().setDisable_codegen(true);
AnalyzesOk("select * from functional_parquet.complextypes_structs",ctx);
AnalyzesOk("select * from functional_parquet.complextypes_nested_structs",ctx);
AnalyzesOk("select * from functional_parquet.complextypes_maps_view",ctx);
AnalyzesOk("select outer_struct.str, outer_struct.* from " +
"functional_parquet.complextypes_nested_structs",ctx);
AnalyzesOk("select * from (select * from " +
"functional_parquet.complextypes_nested_structs) v",ctx);
AnalyzesOk("select * from (select int_map, int_map_array from " +
"functional_parquet.complextypestbl) v",ctx);
ctx.getQueryOptions().setDisable_codegen(false);
AnalyzesOk("select * from functional_parquet.complextypes_arrays",ctx);
AnalyzesOk("select * from " +
"functional_parquet.complextypes_arrays_only_view",ctx);
AnalyzesOk("select v.id, v.* from " +
"(select * from functional_parquet.complextypes_arrays) v",ctx);
AnalysisError("select * from functional.allcomplextypes",
ctx,"STRUCT type inside collection types is not supported.");
}
@Test
@@ -4634,7 +4661,6 @@ public class AnalyzeStmtsTest extends AnalyzerTest {
testNumberOfMembers(BaseTableRef.class, 0);
testNumberOfMembers(InlineViewRef.class, 10);
}
@SuppressWarnings("rawtypes")
private void testNumberOfMembers(Class cl, int expectedNumMembers) {
int actualNumMembers = 0;

View File

@@ -0,0 +1,164 @@
====
---- QUERY
# Expanding star on table with array type
set EXPAND_COMPLEX_TYPES=true;
select * from complextypes_arrays where id = 1
---- RESULTS
1,'[1,2,3,4,5]','["one","two","three","four","five"]'
---- TYPES
INT, STRING, STRING
====
---- QUERY
# Expanding star on table with array type, and an additional scalar field, without EXPAND_COMPLEX_TYPES
select id, * from complextypes_arrays where id = 2
---- RESULTS
2,2
---- TYPES
INT, INT
====
---- QUERY
# Expanding star on table with array type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select id, * from complextypes_arrays where id = 3
---- RESULTS
3,3,'[10,9,8]','["ten"]'
---- TYPES
INT, INT, STRING, STRING
====
---- QUERY
# Expanding star on view with array type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select id, * from complextypes_arrays_only_view where id = 4
---- RESULTS
4,4,'NULL','[]'
---- TYPES
BIGINT, BIGINT, STRING, STRING
====
---- QUERY
# Expanding star on view with map type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select id, * from complextypes_maps_view where id = 5
---- RESULTS
5,5,'{}','NULL'
---- TYPES
BIGINT, BIGINT, STRING, STRING
====
---- QUERY
# Expanding star on views with array type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select v1.id, * from complextypes_arrays_only_view v1, complextypes_arrays_only_view v2
where v1.id = v2.id and v1.id = 6
---- RESULTS
6,6,'NULL','NULL',6,'NULL','NULL'
---- TYPES
BIGINT, BIGINT, STRING, STRING, BIGINT, STRING, STRING
====
---- QUERY
# Expanding star on views with array type, and an additional scalar field, without EXPAND_COMPLEX_TYPES
select v1.id, *
from complextypes_arrays_only_view v1, complextypes_arrays_only_view v2
where v1.id = v2.id and v1.id = 7
---- RESULTS
7,7,7
---- TYPES
BIGINT, BIGINT, BIGINT
====
---- QUERY
# Expanding stars on view aliases with array type
set EXPAND_COMPLEX_TYPES=true;
select v1.*, v2.* from complextypes_arrays_only_view v1, complextypes_arrays_only_view v2
where v1.id = v2.id and v1.id = 1
---- RESULTS
1,'[1,2,3]','[[1,2],[3,4]]',1,'[1,2,3]','[[1,2],[3,4]]'
---- TYPES
BIGINT, STRING, STRING, BIGINT, STRING, STRING
====
---- QUERY
# Expanding star on table with struct type
set EXPAND_COMPLEX_TYPES=true;
select * from complextypes_structs where id = 2
---- RESULTS
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}','{"b":false}','{"i":19191,"s":"small_struct_str"}'
---- TYPES
INT, STRING, STRING, STRING, STRING
====
---- QUERY
# Expanding star on table with struct type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select id, * from complextypes_structs where id = 3
---- RESULTS
3,3,'third item','{"ti":null,"si":null,"i":null,"bi":null,"b":null,"f":null,"do":null,"da":null,"ts":null,"s1":null,"s2":null,"c1":null,"c2":null,"vc":null,"de1":null,"de2":null}','{"b":true}','{"i":98765,"s":null}'
---- TYPES
INT, INT, STRING, STRING, STRING, STRING
====
---- QUERY
# Expanding star on tables with struct type, and an additional scalar field
set EXPAND_COMPLEX_TYPES=true;
select t1.id, *
from complextypes_structs t1, complextypes_structs t2
where t1.id = t2.id and t1.id = 4
---- RESULTS
4,4,'fourth item','{"ti":90,"si":30482,"i":1664336,"bi":23567459873,"b":true,"f":0.5600000023841858,"do":NaN,"da":"2000-12-31","ts":"2024-01-01 00:00:00.123400000","s1":"random string","s2":"","c1":"c","c2":"d ","vc":"addsdrr","de1":33357,"de2":null}','{"b":null}','{"i":null,"s":"str"}',4,'fourth item','{"ti":90,"si":30482,"i":1664336,"bi":23567459873,"b":true,"f":0.5600000023841858,"do":NaN,"da":"2000-12-31","ts":"2024-01-01 00:00:00.123400000","s1":"random string","s2":"","c1":"c","c2":"d ","vc":"addsdrr","de1":33357,"de2":null}','{"b":null}','{"i":null,"s":"str"}'
---- TYPES
INT, INT, STRING, STRING, STRING, STRING, INT, STRING, STRING, STRING, STRING
====
---- QUERY
# Expanding inner struct
set EXPAND_COMPLEX_TYPES=true;
select alltypes.* from complextypes_structs where alltypes.ti = 100
---- RESULTS
100,12348,156789012,163234345342,True,1234.56005859375,65323423.33,2021-05-30,2021-06-01 10:19:04,'some string','another str','x','xyz','somevarcha',12345,NULL
---- TYPES
TINYINT, SMALLINT, INT, BIGINT, BOOLEAN, FLOAT, DOUBLE, DATE, TIMESTAMP, STRING, STRING, CHAR, CHAR, VARCHAR, DECIMAL, DECIMAL
====
---- QUERY
# Expanding struct and array
set EXPAND_COMPLEX_TYPES=true;
select * from complextypes_structs s, complextypes_arrays a
where s.id = a.id and s.id = 5
---- RESULTS
5,'fifth item','NULL','{"b":false}','{"i":98765,"s":"abcde f"}',5,'[10,NULL,12]','["ten","eleven","twelve","thirteen"]'
---- TYPES
INT, STRING, STRING, STRING, STRING, INT, STRING, STRING
====
---- QUERY
# Expanding star on tables with array and struct type, without EXPAND_COMPLEX_TYPES
select a.*, s.*
from complextypes_arrays a, complextypes_structs s
where a.id = s.id and a.id = 6
---- RESULTS
6,6,'sixth item'
---- TYPES
INT, INT, STRING
====
---- QUERY
# Expanding struct and array as separate aliases
set EXPAND_COMPLEX_TYPES=true;
select s.*, a.* from complextypes_structs s, complextypes_arrays a
where s.id = a.id and s.id = 1
---- RESULTS
1,'first item','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}','{"b":true}','NULL',1,'[1,2,3,4,5]','["one","two","three","four","five"]'
---- TYPES
INT, STRING, STRING, STRING, STRING, INT, STRING, STRING
====
---- QUERY
# Expanding struct, map and array as separate aliases
set EXPAND_COMPLEX_TYPES=true;
select s.*, a.*, m.*
from complextypes_structs s, complextypes_arrays a, complextypes_maps_view m
where s.id = a.id and s.id = m.id and s.id = 2
---- RESULTS
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}','{"b":false}','{"i":19191,"s":"small_struct_str"}',2,'[1,NULL,3,4,5]','["one","two","three",NULL,"five"]',2,'{"k1":2,"k2":NULL}','[{"k3":NULL,"k1":1},NULL,{}]'
---- TYPES
INT, STRING, STRING, STRING, STRING, INT, STRING, STRING, BIGINT, STRING, STRING
====
---- QUERY
# Expanding struct, map and array as separate aliases, without EXPAND_COMPLEX_TYPES
select s.*, a.*, m.*
from complextypes_structs s, complextypes_arrays a, complextypes_maps_view m
where s.id = a.id and s.id = m.id and s.id = 3
---- RESULTS
3,'third item',3,3
---- TYPES
INT, STRING, INT, BIGINT
====

View File

@@ -877,3 +877,34 @@ class TestMaxNestingDepth(ImpalaTestSuite):
assert False, "Expected table loading to fail."
except ImpalaBeeswaxException, e:
assert "Type exceeds the maximum nesting depth" in str(e)
class TestNestedTypesStarExpansion(ImpalaTestSuite):
"""Functional tests for nested types when star expansion query
option (EXPAND_COMPLEX_TYPES) is enabled/disabled, run for all file formats that
support nested types."""
@classmethod
def get_workload(self):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestNestedTypesStarExpansion, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ['parquet', 'orc'])
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('protocol') == 'hs2')
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension_from_dict({
'disable_codegen': ['True']}))
cls.ImpalaTestMatrix.add_mandatory_exec_option(
'convert_legacy_hive_parquet_utc_timestamps', 'true')
cls.ImpalaTestMatrix.add_mandatory_exec_option('TIMEZONE', '"Europe/Budapest"')
def test_star_expansion(self, vector):
# Queries with star (*) expression on tables with array, map
# and struct complex type, through views, with query option
# EXPAND_COMPLEX_TYPES enabled and disabled.
self.run_test_case('QueryTest/nested-types-star-expansion', vector)