mirror of
https://github.com/apache/impala.git
synced 2026-01-06 15:01:43 -05:00
IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files
This commit unblocks queries materializing only scalar typed columns on tables backed by RC/sequence files containing complex typed columns. This worked prior to the 2.3.0 release. Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe Reviewed-on: http://gerrit.cloudera.org:8080/2580 Reviewed-by: Alex Behm <alex.behm@cloudera.com> Tested-by: Internal Jenkins
This commit is contained in:
committed by
Internal Jenkins
parent
2809746b2c
commit
5cd7ada727
@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
|
||||
RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
|
||||
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
|
||||
false),
|
||||
false, true),
|
||||
TEXT("org.apache.hadoop.mapred.TextInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
|
||||
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
|
||||
false),
|
||||
false, false),
|
||||
LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
|
||||
"",
|
||||
false),
|
||||
"", false, false),
|
||||
SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
|
||||
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
|
||||
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
|
||||
true),
|
||||
AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
|
||||
"org.apache.hadoop.hive.serde2.avro.AvroSerDe",
|
||||
false),
|
||||
false, false),
|
||||
PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
|
||||
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
|
||||
true);
|
||||
true, true);
|
||||
|
||||
private final String inputFormat_;
|
||||
private final String outputFormat_;
|
||||
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
|
||||
// Indicates whether we support scanning complex types for this file format.
|
||||
private final boolean isComplexTypesSupported_;
|
||||
|
||||
// Indicates whether the file format can skip complex columns in scans and just
|
||||
// materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
|
||||
// TODO: Remove this once we support complex types for all file formats.
|
||||
private final boolean canSkipColumnTypes_;
|
||||
|
||||
HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
|
||||
boolean isComplexTypesSupported) {
|
||||
boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
|
||||
inputFormat_ = inputFormat;
|
||||
outputFormat_ = outputFormat;
|
||||
serializationLib_ = serializationLib;
|
||||
isComplexTypesSupported_ = isComplexTypesSupported;
|
||||
canSkipColumnTypes_ = canSkipColumnTypes;
|
||||
}
|
||||
|
||||
public String inputFormat() { return inputFormat_; }
|
||||
@@ -234,6 +240,12 @@ public enum HdfsFileFormat {
|
||||
*/
|
||||
public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
|
||||
|
||||
/**
|
||||
* Returns true if this file format can skip complex typed columns and materialize
|
||||
* only scalar typed columns.
|
||||
*/
|
||||
public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
|
||||
|
||||
/**
|
||||
* Returns a list with all formats for which isComplexTypesSupported() is true.
|
||||
*/
|
||||
|
||||
@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
|
||||
}
|
||||
if (firstComplexTypedCol == null) return;
|
||||
|
||||
boolean hasMaterializedSlots = false;
|
||||
boolean referencesComplexTypedCol = false;
|
||||
for (SlotDescriptor slotDesc: desc_.getSlots()) {
|
||||
if (slotDesc.isMaterialized()) {
|
||||
hasMaterializedSlots = true;
|
||||
if (!slotDesc.isMaterialized()) continue;
|
||||
if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
|
||||
referencesComplexTypedCol = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
|
||||
for (HdfsPartition part: partitions_) {
|
||||
HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
|
||||
if (format.isComplexTypesSupported()) continue;
|
||||
// Allow count(*) and similar queries on RC_FILE with complex types.
|
||||
if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
|
||||
// If the file format allows querying just scalar typed columns and the query
|
||||
// doesn't materialize any complex typed columns, it is allowed.
|
||||
if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
|
||||
continue;
|
||||
}
|
||||
String errSuffix = String.format(
|
||||
"Complex types are supported for these file formats: %s",
|
||||
Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));
|
||||
|
||||
@@ -38,12 +38,17 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
|
||||
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
|
||||
Complex types are supported for these file formats: PARQUET.
|
||||
====
|
||||
# Complex types are not supported on RC files, even if no complex-typed
|
||||
# columns are selected.
|
||||
select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
|
||||
---- PLAN
|
||||
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
|
||||
Complex types are supported for these file formats: PARQUET.
|
||||
====
|
||||
# Complex types are not supported on RC files; however, queries materializing
|
||||
# only scalar typed columns are allowed.
|
||||
select id from functional_rc_snap.complextypes_fileformat
|
||||
---- PLAN
|
||||
not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
|
||||
Complex types are supported for these file formats: PARQUET.
|
||||
00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
|
||||
partitions=1/1 files=1 size=56B
|
||||
====
|
||||
# Complex types are not supported on RC files but count(*) and similar
|
||||
# queries should work.
|
||||
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
|
||||
not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
|
||||
Complex types are supported for these file formats: PARQUET.
|
||||
====
|
||||
# Complex types are not supported on sequence files, even if no complex-typed
|
||||
# columns are selected.
|
||||
select 1 from functional_seq_snap.complextypes_fileformat
|
||||
# Queries referencing only scalar typed columns on sequence files
|
||||
# are allowed.
|
||||
select id from functional_seq_snap.complextypes_fileformat
|
||||
---- PLAN
|
||||
not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
|
||||
Complex types are supported for these file formats: PARQUET.
|
||||
00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
|
||||
partitions=1/1 files=1 size=87B
|
||||
====
|
||||
# Scanning all partitions fails because there are partitions with a file format for which
|
||||
# complex types are not supported. The error message is abbreviated because it is
|
||||
|
||||
Reference in New Issue
Block a user