IMPALA-1149: read bytes fields as strings in HdfsAvroScanner::MaterializeTuple()

Hive converts "bytes"-type fields to an array<tinyint> column, for which we can't even load the metadata. However, if a bytes field appears in a file schema but not in the table schema, this change allows us to read (but not materialize) the field; without this change, we can't read such a file at all. This change also adds a "bytes"-type field to one of the files in functional_avro_snap.schema_resolution_test. Change-Id: I25953ee049e174fc4dbff5d68520a6f87e545339 Reviewed-on: http://gerrit.sjc.cloudera.com:8080/3823 Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com> Tested-by: jenkins (cherry picked from commit 0e2e7c1ac0f63623b7ec3724920e9927cd782508) Reviewed-on: http://gerrit.sjc.cloudera.com:8080/3895
2025-12-19 18:12:08 -05:00 · 2014-08-18 14:32:19 -07:00
parent 91d262a1d7
commit 8e44347831
4 changed files with 12 additions and 6 deletions
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -411,8 +411,7 @@ HdfsAvroScanner::SchemaElement HdfsAvroScanner::ConvertSchema(
        // not a complex type nor a [<primitive type>, "null"] union itself), we treat
        // this node as the same type as child except with null_union_position set
        // appropriately.
-        if (is_avro_primitive(child.schema.get()) &&
-            child.null_union_position == -1) {
+        if (is_avro_primitive(child.schema.get()) && child.null_union_position == -1) {
          element = child;
          element.null_union_position = null_position;
        }
@@ -620,6 +619,7 @@ void HdfsAvroScanner::MaterializeTuple(MemPool* pool, uint8_t** data, Tuple* tup
        ReadAvroDouble(slot_type, data, write_slot, slot, pool);
        break;
      case AVRO_STRING:
+      case AVRO_BYTES:
        ReadAvroString(slot_type, data, write_slot, slot, pool);
        break;
      case AVRO_DECIMAL: {
@@ -808,6 +808,8 @@ Function* HdfsAvroScanner::CodegenMaterializeTuple(HdfsScanNode* node,
        break;
      default:
        // Unsupported type, can't codegen
+        VLOG(1) << "Failed to codegen MaterializeTuple() due to unsupported type: "
+                << element.schema->type;
        fn->eraseFromParent();
        return NULL;
    }
@@ -858,5 +860,7 @@ Function* HdfsAvroScanner::CodegenDecodeAvroData(RuntimeState* state,
  DCHECK_EQ(replaced, 1);
  decode_avro_data_fn->setName("DecodeAvroData");

-  return codegen->OptimizeFunctionWithExprs(decode_avro_data_fn);
+  decode_avro_data_fn = codegen->OptimizeFunctionWithExprs(decode_avro_data_fn);
+  DCHECK(decode_avro_data_fn != NULL);
+  return decode_avro_data_fn;
 }
--- a/testdata/avro_schema_resolution/file_schema1.avsc
+++ b/testdata/avro_schema_resolution/file_schema1.avsc
@@ -1,7 +1,9 @@
 {"name": "a",
 "type": "record",
- "comment": "Contains a field not in the table and none of the table fields",
+ "comment": "Contains fields not in the table and none of the table fields",
 "fields": [
-    {"name":"boolean2", "type": {"type": "boolean", "note": "alternate primitive syntax"}}
+    {"name":"boolean2", "type": {"type": "boolean", "note": "alternate primitive syntax"}},
+    {"name":"bytes", "type": "bytes",
+     "note": "bytes is an invalid column type but can be read as a non-materialized field"}
 ]
 }
--- a/testdata/avro_schema_resolution/records1.avro
+++ b/testdata/avro_schema_resolution/records1.avro
--- a/testdata/avro_schema_resolution/records1.json
+++ b/testdata/avro_schema_resolution/records1.json
@@ -1 +1 @@
-{"boolean2": false}
+{"boolean2": false, "bytes": "\u00FF"}