IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files

This commit unblocks queries materializing only scalar typed
columns on tables backed by RC/sequence files containing complex
typed columns. This worked prior to the 2.3.0 release.

Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe
Reviewed-on: http://gerrit.cloudera.org:8080/2580
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
Bharath Vissapragada
2016-03-13 06:17:06 -07:00
committed by Internal Jenkins
parent 2809746b2c
commit 5cd7ada727
3 changed files with 43 additions and 22 deletions

View File

@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
false),
false, true),
TEXT("org.apache.hadoop.mapred.TextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
false),
false, false),
LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"",
false),
"", false, false),
SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
true),
AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
"org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
"org.apache.hadoop.hive.serde2.avro.AvroSerDe",
false),
false, false),
PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
true);
true, true);
private final String inputFormat_;
private final String outputFormat_;
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
// Indicates whether we support scanning complex types for this file format.
private final boolean isComplexTypesSupported_;
// Indicates whether the file format can skip complex columns in scans and just
// materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
// TODO: Remove this once we support complex types for all file formats.
private final boolean canSkipColumnTypes_;
HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
boolean isComplexTypesSupported) {
boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
inputFormat_ = inputFormat;
outputFormat_ = outputFormat;
serializationLib_ = serializationLib;
isComplexTypesSupported_ = isComplexTypesSupported;
canSkipColumnTypes_ = canSkipColumnTypes;
}
public String inputFormat() { return inputFormat_; }
@@ -234,6 +240,12 @@ public enum HdfsFileFormat {
*/
public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
/**
* Returns true if this file format can skip complex typed columns and materialize
* only scalar typed columns.
*/
public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
/**
* Returns a list with all formats for which isComplexTypesSupported() is true.
*/

View File

@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
}
if (firstComplexTypedCol == null) return;
boolean hasMaterializedSlots = false;
boolean referencesComplexTypedCol = false;
for (SlotDescriptor slotDesc: desc_.getSlots()) {
if (slotDesc.isMaterialized()) {
hasMaterializedSlots = true;
if (!slotDesc.isMaterialized()) continue;
if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
referencesComplexTypedCol = true;
break;
}
}
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
for (HdfsPartition part: partitions_) {
HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
if (format.isComplexTypesSupported()) continue;
// Allow count(*) and similar queries on RC_FILE with complex types.
if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
// If the file format allows querying just scalar typed columns and the query
// doesn't materialize any complex typed columns, it is allowed.
if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
continue;
}
String errSuffix = String.format(
"Complex types are supported for these file formats: %s",
Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));

View File

@@ -38,12 +38,17 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on RC files, even if no complex-typed
# columns are selected.
select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
---- PLAN
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on RC files; however, queries materializing
# only scalar type columns are allowed.
select id from functional_rc_snap.complextypes_fileformat
---- PLAN
not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
partitions=1/1 files=1 size=56B
====
# Complex types are not supported on RC files but count(*) and similar
# queries should work.
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on sequence files, even if no complex-typed
# columns are selected.
select 1 from functional_seq_snap.complextypes_fileformat
# Queries referencing only scalar typed columns on sequence files
# are allowed.
select id from functional_seq_snap.complextypes_fileformat
---- PLAN
not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
partitions=1/1 files=1 size=87B
====
# Scanning all partitions fails because there are partitions with a file format for which
# complex types are not supported. The error message is abbreviated because it is