IMPALA-3194: Allow queries materializing scalar type columns in RC/sequence files

This commit unblocks queries materializing only scalar typed
columns on tables backed by RC/sequence files containing complex
typed columns. This worked prior to the 2.3.0 release.

Change-Id: I3a89b211bdc01f7e07497e293fafd75ccf0500fe
Reviewed-on: http://gerrit.cloudera.org:8080/2580
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
Bharath Vissapragada
2016-03-13 06:17:06 -07:00
committed by Internal Jenkins
parent 2809746b2c
commit 5cd7ada727
3 changed files with 43 additions and 22 deletions

View File

@@ -35,26 +35,26 @@ public enum HdfsFileFormat {
RC_FILE("org.apache.hadoop.hive.ql.io.RCFileInputFormat",
"org.apache.hadoop.hive.ql.io.RCFileOutputFormat",
"org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe",
false),
false, true),
TEXT("org.apache.hadoop.mapred.TextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
false),
false, false),
LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"",
false),
"", false, false),
SEQUENCE_FILE("org.apache.hadoop.mapred.SequenceFileInputFormat",
"org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false),
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", false,
true),
AVRO("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat",
"org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat",
"org.apache.hadoop.hive.serde2.avro.AvroSerDe",
false),
false, false),
PARQUET("org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
true);
true, true);
private final String inputFormat_;
private final String outputFormat_;
@@ -63,12 +63,18 @@ public enum HdfsFileFormat {
// Indicates whether we support scanning complex types for this file format.
private final boolean isComplexTypesSupported_;
// Indicates whether the file format can skip complex columns in scans and just
// materialize scalar typed columns. Ignored if isComplexTypesSupported_ is true.
// TODO: Remove this once we support complex types for all file formats.
private final boolean canSkipColumnTypes_;
HdfsFileFormat(String inputFormat, String outputFormat, String serializationLib,
boolean isComplexTypesSupported) {
boolean isComplexTypesSupported, boolean canSkipColumnTypes) {
inputFormat_ = inputFormat;
outputFormat_ = outputFormat;
serializationLib_ = serializationLib;
isComplexTypesSupported_ = isComplexTypesSupported;
canSkipColumnTypes_ = canSkipColumnTypes;
}
public String inputFormat() { return inputFormat_; }
@@ -234,6 +240,12 @@ public enum HdfsFileFormat {
*/
public boolean isComplexTypesSupported() { return isComplexTypesSupported_; }
/**
* Returns true if this file format can skip complex typed columns and materialize
* only scalar typed columns.
*/
public boolean canSkipComplexTypes() { return canSkipColumnTypes_; }
/**
* Returns a list with all formats for which isComplexTypesSupported() is true.
*/

View File

@@ -185,10 +185,11 @@ public class HdfsScanNode extends ScanNode {
}
if (firstComplexTypedCol == null) return;
boolean hasMaterializedSlots = false;
boolean referencesComplexTypedCol = false;
for (SlotDescriptor slotDesc: desc_.getSlots()) {
if (slotDesc.isMaterialized()) {
hasMaterializedSlots = true;
if (!slotDesc.isMaterialized()) continue;
if (slotDesc.getType().isComplexType() || slotDesc.getColumn() == null) {
referencesComplexTypedCol = true;
break;
}
}
@@ -196,8 +197,11 @@ public class HdfsScanNode extends ScanNode {
for (HdfsPartition part: partitions_) {
HdfsFileFormat format = part.getInputFormatDescriptor().getFileFormat();
if (format.isComplexTypesSupported()) continue;
// Allow count(*) and similar queries on RC_FILE with complex types.
if (format == HdfsFileFormat.RC_FILE && !hasMaterializedSlots) continue;
// If the file format allows querying just scalar typed columns and the query
// doesn't materialize any complex typed columns, it is allowed.
if (format.canSkipComplexTypes() && !referencesComplexTypedCol) {
continue;
}
String errSuffix = String.format(
"Complex types are supported for these file formats: %s",
Joiner.on(", ").join(HdfsFileFormat.complexTypesFormats()));

View File

@@ -38,12 +38,17 @@ select 1 from functional_rc_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on RC files, even if no complex-typed
# columns are selected.
select s.f1 from functional_rc_snap.complextypes_fileformat t, t.m
---- PLAN
not implemented: Scan of table 't' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on RC files; however, queries materializing
# only scalar type columns are allowed.
select id from functional_rc_snap.complextypes_fileformat
---- PLAN
not implemented: Scan of table 'functional_rc_snap.complextypes_fileformat' in format 'RC_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
00:SCAN HDFS [functional_rc_snap.complextypes_fileformat]
partitions=1/1 files=1 size=56B
====
# Complex types are not supported on RC files but count(*) and similar
# queries should work.
@@ -61,12 +66,12 @@ select s.f1 from functional_seq_snap.complextypes_fileformat t, t.a
not implemented: Scan of table 't' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
====
# Complex types are not supported on sequence files, even if no complex-typed
# columns are selected.
select 1 from functional_seq_snap.complextypes_fileformat
# Queries referencing only scalar typed columns on sequence files
# are allowed.
select id from functional_seq_snap.complextypes_fileformat
---- PLAN
not implemented: Scan of table 'functional_seq_snap.complextypes_fileformat' in format 'SEQUENCE_FILE' is not supported because the table has a column 's' with a complex type 'STRUCT<f1:STRING,f2:INT>'.
Complex types are supported for these file formats: PARQUET.
00:SCAN HDFS [functional_seq_snap.complextypes_fileformat]
partitions=1/1 files=1 size=87B
====
# Scanning all partitions fails because there are partitions with a file format for which
# complex types are not supported. The error message is abbreviated because it is