From 5cd7ada727d04fe56d62ced2e8bfa56f4448ea57 Mon Sep 17 00:00:00 2001 From: Bharath Vissapragada Date: Sun, 13 Mar 2016 06:17:06 -0700 Subject: [PATCH] IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files This commit unblocks queries materializing only scalar typed columns on tables backed by RC/sequence files containing complex typed columns. This worked prior to 2.3.0 release. Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe Reviewed-on: http://gerrit.cloudera.org:8080/2580 Reviewed-by: Alex Behm Tested-by: Internal Jenkins --- .../impala/catalog/HdfsFileFormat.java | 28 +++++++++++++------ .../cloudera/impala/planner/HdfsScanNode.java | 14 ++++++---- .../complex-types-file-formats.test | 23 +++++++++------ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java index 9c883fc64..3670aa5fb 100644 --- a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java +++ b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java @@ -35,26 +35,26 @@ public enum HdfsFileFormat { RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat", "org.apache.hadoop.hive.ql.io.RCFileOutputFormat", "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe", - false), + false, true), TEXT("org.apache.hadoop.mapred.TextInputFormat", "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - false), + false, false), LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat", "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", - "", - false), + "", false, false), SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat", "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat", - "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false), + "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false, + true), 
AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat", "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat", "org.apache.hadoop.hive.serde2.avro.AvroSerDe", - false), + false, false), PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", - true); + true, true); private final String inputFormat_; private final String outputFormat_; @@ -63,12 +63,18 @@ public enum HdfsFileFormat { // Indicates whether we support scanning complex types for this file format. private final boolean isComplexTypesSupported_; + // Indicates whether the file format can skip complex columns in scans and just + // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true. + // TODO: Remove this once we support complex types for all file formats. + private final boolean canSkipComplexTypes_; + HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib, - boolean isComplexTypesSupported) { + boolean isComplexTypesSupported, boolean canSkipComplexTypes) { inputFormat_ = inputFormat; outputFormat_ = outputFormat; serializationLib_ = serializationLib; isComplexTypesSupported_ = isComplexTypesSupported; + canSkipComplexTypes_ = canSkipComplexTypes; } public String inputFormat() { return inputFormat_; } @@ -234,6 +240,12 @@ public enum HdfsFileFormat { */ public boolean isComplexTypesSupported() { return isComplexTypesSupported_; } + /** + * Returns true if this file format can skip complex typed columns and materialize + * only scalar typed columns. + */ + public boolean canSkipComplexTypes() { return canSkipComplexTypes_; } + /** * Returns a list with all formats for which isComplexTypesSupported() is true. 
*/ diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java index c6f772298..5edc0dcb1 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java @@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode { } if (firstComplexTypedCol == null) return; - boolean hasMaterializedSlots = false; + boolean referencesComplexTypedCol = false; for (SlotDescriptor slotDesc: desc_.getSlots()) { - if (slotDesc.isMaterialized()) { - hasMaterializedSlots = true; + if (!slotDesc.isMaterialized()) continue; + if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) { + referencesComplexTypedCol = true; break; } } @@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode { for (HdfsPartition part: partitions_) { HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat(); if (format.isComplexTypesSupported()) continue; - // Allow count(*) and similar queries on RC_FILE with complex types. - if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue; + // If the file format allows querying just scalar typed columns and the query + // doesn't materialize any complex typed columns, it is allowed. 
+ if (format.canSkipComplexTypes() && !referencesComplexTypedCol) { + continue; + } String errSuffix = String.format( "Complex types are supported for these file formats: %s", Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats())); diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test index 487bb3b5b..f0431a25d 100644 --- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test +++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test @@ -38,12 +38,17 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT'. Complex types are supported for these file formats: PARQUET. ==== -# Complex types are not supported on RC files, even if no complex-typed -# columns are selected. +select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m +---- PLAN +not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT'. +Complex types are supported for these file formats: PARQUET. +==== +# Complex types are not supported on RC files, however queries materializing +# only scalar type columns are allowed. select id from functional_rc_snap.complextypes_fileformat ---- PLAN -not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT'. -Complex types are supported for these file formats: PARQUET. +00:SCAN HDFS [functional_rc_snap.complextypes_fileformat] + partitions=1/1 files=1 size=56B ==== # Complex types are not supported on RC files but count(*) and similar # queries should work. 
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT'. Complex types are supported for these file formats: PARQUET. ==== -# Complex types are not supported on sequence files, even if no complex-typed -# columns are selected. -select 1 from functional_seq_snap.complextypes_fileformat +# Queries referencing only scalar typed columns on sequence files +# are allowed. +select id from functional_seq_snap.complextypes_fileformat ---- PLAN -not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT'. -Complex types are supported for these file formats: PARQUET. +00:SCAN HDFS [functional_seq_snap.complextypes_fileformat] + partitions=1/1 files=1 size=87B ==== # Scanning all partitions fails because there are partitions with a file format for which # complex types are not supported. The error message is abbreviated because it is