From 5cd7ada727d04fe56d62ced2e8bfa56f4448ea57 Mon Sep 17 00:00:00 2001
From: Bharath Vissapragada <bharathv@cloudera.com>
Date: Sun, 13 Mar 2016 06:17:06 -0700
Subject: [PATCH] IMPALA-3194: Allow queries materializing scalar type columns
 in RC/sequence files

This commit unblocks queries that materialize only scalar typed
columns on tables backed by RC/sequence files containing complex
typed columns. Such queries worked prior to the 2.3.0 release.

Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe
Reviewed-on: http://gerrit.cloudera.org:8080/2580
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
---
 .../impala/catalog/HdfsFileFormat.java        | 28 +++++++++++++------
 .../cloudera/impala/planner/HdfsScanNode.java | 14 ++++++----
 .../complex-types-file-formats.test           | 23 +++++++++------
 3 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
index 9c883fc64..3670aa5fb 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/HdfsFileFormat.java
@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
   RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
       "org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
       "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
-      false),
+      false, true),
   TEXT("org.apache.hadoop.mapred.TextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
       "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
-      false),
+      false, false),
   LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
-      "",
-      false),
+      "", false, false),
   SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
       "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
-      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
+      "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
+      true),
   AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
       "org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
       "org.apache.hadoop.hive.serde2.avro.AvroSerDe",
-      false),
+      false, false),
   PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
       "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
-      true);
+      true, true);
 
   private final String inputFormat_;
   private final String outputFormat_;
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
   // Indicates whether we support scanning complex types for this file format.
   private final boolean isComplexTypesSupported_;
 
+  // Indicates whether the file format can skip complex columns in scans and just
+  // materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
+  // TODO: Remove this once we support complex types for all file formats.
+  private final boolean canSkipColumnTypes_;
+
   HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
-      boolean isComplexTypesSupported) {
+      boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
     inputFormat_ = inputFormat;
     outputFormat_ = outputFormat;
     serializationLib_ = serializationLib;
     isComplexTypesSupported_ = isComplexTypesSupported;
+    canSkipColumnTypes_ = canSkipColumnTypes;
   }
 
   public String inputFormat() { return inputFormat_; }
@@ -234,6 +240,12 @@ public enum HdfsFileFormat {
    */
   public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
 
+  /**
+   * Returns true if this file format can skip complex typed columns and materialize
+   * only scalar typed columns.
+   */
+  public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
+
   /**
    * Returns a list with all formats for which isComplexTypesSupported() is true.
    */
diff --git a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
index c6f772298..5edc0dcb1 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/HdfsScanNode.java
@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
     }
     if (firstComplexTypedCol == null) return;
 
-    boolean hasMaterializedSlots = false;
+    boolean referencesComplexTypedCol = false;
     for (SlotDescriptor slotDesc: desc_.getSlots()) {
-      if (slotDesc.isMaterialized()) {
-        hasMaterializedSlots = true;
+      if (!slotDesc.isMaterialized()) continue;
+      if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
+        referencesComplexTypedCol = true;
         break;
       }
     }
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
     for (HdfsPartition part: partitions_) {
       HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
       if (format.isComplexTypesSupported()) continue;
-      // Allow count(*) and similar queries on RC_FILE with complex types.
-      if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
+      // If the file format allows querying just scalar typed columns and the query
+      // doesn't materialize any complex typed columns, the scan is allowed.
+      if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
+        continue;
+      }
       String errSuffix = String.format(
           "Complex types are supported for these file formats: %s",
           Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));
diff --git a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
index 487bb3b5b..f0431a25d 100644
--- a/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
+++ b/testdata/workloads/functional-planner/queries/PlannerTest/complex-types-file-formats.test
@@ -38,12 +38,17 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
 not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
 Complex types are supported for these file formats: PARQUET.
 ====
-# Complex types are not supported on RC files, even if no complex-typed
-# columns are selected.
+select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
+---- PLAN
+not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
+Complex types are supported for these file formats: PARQUET.
+====
+# Complex types are not supported on RC files; however, queries materializing
+# only scalar typed columns are allowed.
 select id from functional_rc_snap.complextypes_fileformat
 ---- PLAN
-not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
-Complex types are supported for these file formats: PARQUET.
+00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
+   partitions=1/1 files=1 size=56B
 ====
 # Complex types are not supported on RC files but count(*) and similar
 # queries should work.
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
 not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
 Complex types are supported for these file formats: PARQUET.
 ====
-# Complex types are not supported on sequence files, even if no complex-typed
-# columns are selected.
-select 1 from functional_seq_snap.complextypes_fileformat
+# Queries referencing only scalar typed columns on sequence files
+# are allowed.
+select id from functional_seq_snap.complextypes_fileformat
 ---- PLAN
-not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
-Complex types are supported for these file formats: PARQUET.
+00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
+   partitions=1/1 files=1 size=87B
 ====
 # Scanning all partitions fails because there are partitions with a file format for which
 # complex types are not supported. The error message is abbreviated because it is