Fix null string parsing.

2025-12-19 18:12:08 -05:00 · 2012-09-08 15:59:04 -07:00
parent 25aa8cba0d
commit 4d0319d32b
7 changed files with 163 additions and 67 deletions
--- a/be/src/exec/hdfs-text-table-writer.cc
+++ b/be/src/exec/hdfs-text-table-writer.cc
@@ -55,9 +55,11 @@ Status HdfsTextTableWriter::AppendRowBatch(RowBatch* batch,
    // the first num_non_partition_cols values.
    for (int j = 0; j < num_non_partition_cols; ++j) {
      void* value = output_exprs_[j]->GetValue(current_row);
-      // NULL values become empty strings
      if (value != NULL) {
        output_exprs_[j]->PrintValue(value, &row_stringstream);
+      } else {
+        // NULL values in Hive are encoded as '\N'
+        row_stringstream << "\\N";
      }
      // Append field delimiter.
      if (j + 1 < num_non_partition_cols) {
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -15,77 +15,82 @@ namespace impala {

 inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tuple,
    const char* data, int len, bool copy_string, bool need_escape, MemPool* pool) {
-  if (len == 0) {
+  if (len == 0 && slot_desc->type() != TYPE_STRING) {
    tuple->SetNull(slot_desc->null_indicator_offset());
-  } else {
-    StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
-    void* slot = tuple->GetSlot(slot_desc->tuple_offset());
+    return true;
+  } else if (len == 2 && data[0] == '\\' && data[1] == 'N') {
+    // Hive encodes NULLs as '\N'
+    tuple->SetNull(slot_desc->null_indicator_offset());
+    return true;
+  } 
+    
+  StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
+  void* slot = tuple->GetSlot(slot_desc->tuple_offset());

-    // Parse the raw-text data. Translate the text string to internal format.
-    switch (slot_desc->type()) {
-      case TYPE_STRING: {
-        StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
-        str_slot->ptr = const_cast<char*>(data);
-        str_slot->len = len;
-        if (copy_string || need_escape) {
-          DCHECK(pool != NULL);
-          char* slot_data = reinterpret_cast<char*>(pool->Allocate(len));
-          if (need_escape) {
-            UnescapeString(data, slot_data, &str_slot->len);
-          } else {
-            memcpy(slot_data, data, str_slot->len);
-          }
-          str_slot->ptr = slot_data;
+  // Parse the raw-text data. Translate the text string to internal format.
+  switch (slot_desc->type()) {
+    case TYPE_STRING: {
+      StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
+      str_slot->ptr = const_cast<char*>(data);
+      str_slot->len = len;
+      if (len != 0 && (copy_string || need_escape)) {
+        DCHECK(pool != NULL);
+        char* slot_data = reinterpret_cast<char*>(pool->Allocate(len));
+        if (need_escape) {
+          UnescapeString(data, slot_data, &str_slot->len);
+        } else {
+          memcpy(slot_data, data, str_slot->len);
        }
-        break;
+        str_slot->ptr = slot_data;
      }
-      case TYPE_BOOLEAN:
-        *reinterpret_cast<bool*>(slot) =
-          StringParser::StringToBool(data, len, &parse_result);
-        break;
-      case TYPE_TINYINT:
-        *reinterpret_cast<int8_t*>(slot) =
-          StringParser::StringToInt<int8_t>(data, len, &parse_result);
-        break;
-      case TYPE_SMALLINT:
-        *reinterpret_cast<int16_t*>(slot) =
-          StringParser::StringToInt<int16_t>(data, len, &parse_result);
-        break;
-      case TYPE_INT:
-        *reinterpret_cast<int32_t*>(slot) =
-          StringParser::StringToInt<int32_t>(data, len, &parse_result);
-        break;
-      case TYPE_BIGINT:
-        *reinterpret_cast<int64_t*>(slot) =
-          StringParser::StringToInt<int64_t>(data, len, &parse_result);
-        break;
-      case TYPE_FLOAT:
-        *reinterpret_cast<float*>(slot) =
-          StringParser::StringToFloat<float>(data, len, &parse_result);
-        break;
-      case TYPE_DOUBLE:
-        *reinterpret_cast<double*>(slot) =
-          StringParser::StringToFloat<double>(data, len, &parse_result);
-        break;
-      case TYPE_TIMESTAMP: {
-        std::string strbuf(data, len);
-        TimestampValue* ts_slot = reinterpret_cast<TimestampValue*>(slot);
-        *ts_slot = TimestampValue(strbuf);
-        if (ts_slot->NotADateTime()) {
-          parse_result = StringParser::PARSE_FAILURE;
-        }
-        break;
-      }
-      default:
-        DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
-        break;
+      break;
    }
+    case TYPE_BOOLEAN:
+      *reinterpret_cast<bool*>(slot) =
+        StringParser::StringToBool(data, len, &parse_result);
+      break;
+    case TYPE_TINYINT:
+      *reinterpret_cast<int8_t*>(slot) =
+        StringParser::StringToInt<int8_t>(data, len, &parse_result);
+      break;
+    case TYPE_SMALLINT:
+      *reinterpret_cast<int16_t*>(slot) =
+        StringParser::StringToInt<int16_t>(data, len, &parse_result);
+      break;
+    case TYPE_INT:
+      *reinterpret_cast<int32_t*>(slot) =
+        StringParser::StringToInt<int32_t>(data, len, &parse_result);
+      break;
+    case TYPE_BIGINT:
+      *reinterpret_cast<int64_t*>(slot) =
+        StringParser::StringToInt<int64_t>(data, len, &parse_result);
+      break;
+    case TYPE_FLOAT:
+      *reinterpret_cast<float*>(slot) =
+        StringParser::StringToFloat<float>(data, len, &parse_result);
+      break;
+    case TYPE_DOUBLE:
+      *reinterpret_cast<double*>(slot) =
+        StringParser::StringToFloat<double>(data, len, &parse_result);
+      break;
+    case TYPE_TIMESTAMP: {
+      std::string strbuf(data, len);
+      TimestampValue* ts_slot = reinterpret_cast<TimestampValue*>(slot);
+      *ts_slot = TimestampValue(strbuf);
+      if (ts_slot->NotADateTime()) {
+        parse_result = StringParser::PARSE_FAILURE;
+      }
+      break;
+    }
+    default:
+      DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
+      break;
+  }

-    // TODO: add warning for overflow case
-    if (parse_result == StringParser::PARSE_FAILURE) {
-      tuple->SetNull(slot_desc->null_indicator_offset());
-      return false;
-    }
+  // TODO: add warning for overflow case
+  if (parse_result == StringParser::PARSE_FAILURE) {
+    tuple->SetNull(slot_desc->null_indicator_offset());
+    return false;
  }

  return true;
--- a/fe/src/test/java/com/cloudera/impala/service/MiscQueryTest.java
+++ b/fe/src/test/java/com/cloudera/impala/service/MiscQueryTest.java
@@ -0,0 +1,11 @@
+// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
+
+package com.cloudera.impala.service;
+import org.junit.Test;
+
+public class MiscQueryTest extends BaseQueryTest {
+  @Test
+  public void TestMisc() {
+    runTestInExecutionMode(EXECUTION_MODE, "misc", false, 1000);
+  }
+}
--- a/testdata/LikeTbl/data.csv
+++ b/testdata/LikeTbl/data.csv
@@ -12,3 +12,4 @@ beginning of line,begin%,not begin%,^begin.*,^not begin.*
 end of line,%line,%line end,.*line$,.*line end$
 middle of line,%of%,%of,^.*of.*$,.*of$
 ,%,,.*,X
+\N,%,\N,.*,X
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1039,3 +1039,49 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/TinyTable/data.csv' OVERWRIT
 ----
 ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;
 ====
+functional
+----
+nulltable
+----
+CREATE TABLE %(table_name)s (
+  a string,
+  b string,
+  c string,
+  d int,
+  e double)
+row format delimited fields terminated by ','
+stored as %(file_format)s;
+INSERT OVERWRITE TABLE nulltable select 'a', '', NULL, NULL, NULL from alltypes limit 1;
+----
+FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
+----
+${IMPALA_HOME}/bin/run-query.sh --query=" \
+  INSERT OVERWRITE TABLE %(table_name)s \
+  select * FROM %(base_table_name)s"
+----
+----
+ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;
+====
+====
+functional
+----
+nullescapedtable
+----
+CREATE TABLE %(table_name)s (
+  a string,
+  b string,
+  c string,
+  d int,
+  e double)
+row format delimited fields terminated by ',' escaped by '\\'
+stored as %(file_format)s;
+INSERT OVERWRITE TABLE nullescapedtable select 'a', '', NULL, NULL, NULL from alltypes limit 1;
+----
+FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
+----
+${IMPALA_HOME}/bin/run-query.sh --query=" \
+  INSERT OVERWRITE TABLE %(table_name)s \
+  select * FROM %(base_table_name)s"
+----
+----
+ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;
--- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test
@@ -305,6 +305,7 @@ select * from LikeTbl
 string, string, string, string, string
 ---- RESULTS
 'NULL','%','NULL','.*','X'
+'','%','','.*','X'
 'beginning of line','begin%','not begin%','^begin.*','^not begin.*'
 'eight','%eight%','n%eight%','.*eight.*','n.*eight.*'
 'end of line','%line','%line end','.*line$','.*line end$'
@@ -337,6 +338,7 @@ string, string
 'ten','%ten%'
 'three','%three%'
 'two','%two%'
+'','%'
 ====
 select str_col, match_like_col from LikeTbl
 where str_col NOT LIKE match_like_col
@@ -349,6 +351,7 @@ where str_col LIKE no_match_like_col
 ---- TYPES
 string, string
 ---- RESULTS
+'','%'
 ====
 select str_col, no_match_like_col from LikeTbl
 where str_col NOT LIKE no_match_like_col
@@ -387,6 +390,7 @@ string, string
 'ten','.*ten.*'
 'three','.*three.*'
 'two','.*two.*'
+'','.*'
 ====
 select str_col, no_match_regex_col from LikeTbl
 where str_col REGEXP no_match_regex_col
@@ -418,6 +422,7 @@ string, string
 'ten','n.*ten.*'
 'three','n.*three.*'
 'two','n.*two.*'
+'','X'
 ====
 select 1+2
 ---- TYPES
@@ -1100,4 +1105,4 @@ where (cast('2012-01-01 09:10:11' as timestamp) + interval tinyint_col seconds)
 bigint
 ---- RESULTS
 1000
-====
+====
--- a/testdata/workloads/functional-query/queries/QueryTest/misc.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/misc.test
@@ -0,0 +1,26 @@
+# Test for selecting from table with null and empty strings.
+select * from nulltable$TABLE
+---- TYPES
+string, string, string, int, double
+---- RESULTS
+'a','','NULL',NULL,NULL
+====
+select count(*),count(a),count(b),count(c),count(d),count(e) from nulltable$TABLE
+---- TYPES
+bigint, bigint, bigint, bigint, bigint, bigint
+---- RESULTS
+1,1,1,0,0,0
+====
+# Test for selecting null and empty strings from a table that uses '\' as its escape character.
+select * from nullescapedtable$TABLE
+---- TYPES
+string, string, string, int, double
+---- RESULTS
+'a','','NULL',NULL,NULL
+====
+select count(*),count(a),count(b),count(c), count(d), count(e) from nullescapedtable$TABLE
+---- TYPES
+bigint, bigint, bigint, bigint, bigint, bigint
+---- RESULTS
+1,1,1,0,0,0
+====