Fix null string parsing.

This commit is contained in:
Nong Li
2012-09-08 15:59:04 -07:00
committed by Henry Robinson
parent 25aa8cba0d
commit 4d0319d32b
7 changed files with 163 additions and 67 deletions

View File

@@ -55,9 +55,11 @@ Status HdfsTextTableWriter::AppendRowBatch(RowBatch* batch,
// the first num_non_partition_cols values.
for (int j = 0; j < num_non_partition_cols; ++j) {
void* value = output_exprs_[j]->GetValue(current_row);
// NULL values become empty strings
if (value != NULL) {
output_exprs_[j]->PrintValue(value, &row_stringstream);
} else {
// NULL values in hive are encoded as '\N'
row_stringstream << "\\N";
}
// Append field delimiter.
if (j + 1 < num_non_partition_cols) {

View File

@@ -15,77 +15,82 @@ namespace impala {
inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tuple,
const char* data, int len, bool copy_string, bool need_escape, MemPool* pool) {
if (len == 0) {
if (len == 0 && slot_desc->type() != TYPE_STRING) {
tuple->SetNull(slot_desc->null_indicator_offset());
} else {
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
return true;
} else if (len == 2 && data[0] == '\\' && data[1] == 'N') {
// Hive encodes NULLs as '\N'
tuple->SetNull(slot_desc->null_indicator_offset());
return true;
}
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
// Parse the raw-text data. Translate the text string to internal format.
switch (slot_desc->type()) {
case TYPE_STRING: {
StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
str_slot->ptr = const_cast<char*>(data);
str_slot->len = len;
if (copy_string || need_escape) {
DCHECK(pool != NULL);
char* slot_data = reinterpret_cast<char*>(pool->Allocate(len));
if (need_escape) {
UnescapeString(data, slot_data, &str_slot->len);
} else {
memcpy(slot_data, data, str_slot->len);
}
str_slot->ptr = slot_data;
// Parse the raw-text data. Translate the text string to internal format.
switch (slot_desc->type()) {
case TYPE_STRING: {
StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
str_slot->ptr = const_cast<char*>(data);
str_slot->len = len;
if (len != 0 && (copy_string || need_escape)) {
DCHECK(pool != NULL);
char* slot_data = reinterpret_cast<char*>(pool->Allocate(len));
if (need_escape) {
UnescapeString(data, slot_data, &str_slot->len);
} else {
memcpy(slot_data, data, str_slot->len);
}
break;
str_slot->ptr = slot_data;
}
case TYPE_BOOLEAN:
*reinterpret_cast<bool*>(slot) =
StringParser::StringToBool(data, len, &parse_result);
break;
case TYPE_TINYINT:
*reinterpret_cast<int8_t*>(slot) =
StringParser::StringToInt<int8_t>(data, len, &parse_result);
break;
case TYPE_SMALLINT:
*reinterpret_cast<int16_t*>(slot) =
StringParser::StringToInt<int16_t>(data, len, &parse_result);
break;
case TYPE_INT:
*reinterpret_cast<int32_t*>(slot) =
StringParser::StringToInt<int32_t>(data, len, &parse_result);
break;
case TYPE_BIGINT:
*reinterpret_cast<int64_t*>(slot) =
StringParser::StringToInt<int64_t>(data, len, &parse_result);
break;
case TYPE_FLOAT:
*reinterpret_cast<float*>(slot) =
StringParser::StringToFloat<float>(data, len, &parse_result);
break;
case TYPE_DOUBLE:
*reinterpret_cast<double*>(slot) =
StringParser::StringToFloat<double>(data, len, &parse_result);
break;
case TYPE_TIMESTAMP: {
std::string strbuf(data, len);
TimestampValue* ts_slot = reinterpret_cast<TimestampValue*>(slot);
*ts_slot = TimestampValue(strbuf);
if (ts_slot->NotADateTime()) {
parse_result = StringParser::PARSE_FAILURE;
}
break;
}
default:
DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
break;
break;
}
case TYPE_BOOLEAN:
*reinterpret_cast<bool*>(slot) =
StringParser::StringToBool(data, len, &parse_result);
break;
case TYPE_TINYINT:
*reinterpret_cast<int8_t*>(slot) =
StringParser::StringToInt<int8_t>(data, len, &parse_result);
break;
case TYPE_SMALLINT:
*reinterpret_cast<int16_t*>(slot) =
StringParser::StringToInt<int16_t>(data, len, &parse_result);
break;
case TYPE_INT:
*reinterpret_cast<int32_t*>(slot) =
StringParser::StringToInt<int32_t>(data, len, &parse_result);
break;
case TYPE_BIGINT:
*reinterpret_cast<int64_t*>(slot) =
StringParser::StringToInt<int64_t>(data, len, &parse_result);
break;
case TYPE_FLOAT:
*reinterpret_cast<float*>(slot) =
StringParser::StringToFloat<float>(data, len, &parse_result);
break;
case TYPE_DOUBLE:
*reinterpret_cast<double*>(slot) =
StringParser::StringToFloat<double>(data, len, &parse_result);
break;
case TYPE_TIMESTAMP: {
std::string strbuf(data, len);
TimestampValue* ts_slot = reinterpret_cast<TimestampValue*>(slot);
*ts_slot = TimestampValue(strbuf);
if (ts_slot->NotADateTime()) {
parse_result = StringParser::PARSE_FAILURE;
}
break;
}
default:
DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
break;
}
// TODO: add warning for overflow case
if (parse_result == StringParser::PARSE_FAILURE) {
tuple->SetNull(slot_desc->null_indicator_offset());
return false;
}
// TODO: add warning for overflow case
if (parse_result == StringParser::PARSE_FAILURE) {
tuple->SetNull(slot_desc->null_indicator_offset());
return false;
}
return true;

View File

@@ -0,0 +1,11 @@
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
package com.cloudera.impala.service;
import org.junit.Test;
/**
 * JUnit query test that runs the "misc" test workload through the
 * execution-mode harness this class gets from {@link BaseQueryTest}.
 *
 * NOTE(review): presumably "misc" names the query-test file containing the
 * null / empty-string table checks added alongside this class -- confirm
 * against the test-file layout.
 */
public class MiscQueryTest extends BaseQueryTest {
// Single entry point: hands the "misc" workload to the shared harness.
@Test
public void TestMisc() {
// EXECUTION_MODE is not defined in this class; it is resolved from the
// superclass hierarchy. The meaning of the trailing arguments (false, 1000)
// is not visible here -- TODO confirm against
// BaseQueryTest.runTestInExecutionMode before changing them.
runTestInExecutionMode(EXECUTION_MODE, "misc", false, 1000);
}
}

View File

@@ -12,3 +12,4 @@ beginning of line,begin%,not begin%,^begin.*,^not begin.*
end of line,%line,%line end,.*line$,.*line end$
middle of line,%of%,%of,^.*of.*$,.*of$
,%,,.*,X
\N,%,\N,.*,X
1 one %one% n%one% .*one.* n.*one.*
12 end of line %line %line end .*line$ .*line end$
13 middle of line %of% %of ^.*of.*$ .*of$
14 % .* X
15 \N % \N .* X

View File

@@ -1039,3 +1039,49 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/TinyTable/data.csv' OVERWRIT
----
ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;
====
functional
----
nulltable
----
CREATE TABLE %(table_name)s (
a string,
b string,
c string,
d int,
e double)
row format delimited fields terminated by ','
stored as %(file_format)s;
INSERT OVERWRITE TABLE nulltable select 'a', '', NULL, NULL, NULL from alltypes limit 1;
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
----
ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;
====
====
functional
----
nullescapedtable
----
CREATE TABLE %(table_name)s (
a string,
b string,
c string,
d int,
e double)
row format delimited fields terminated by ',' escaped by '\\'
stored as %(file_format)s;
INSERT OVERWRITE TABLE nullescapedtable select 'a', '', NULL, NULL, NULL from alltypes limit 1;
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
----
ANALYZE TABLE %(table_name)s COMPUTE STATISTICS;

View File

@@ -305,6 +305,7 @@ select * from LikeTbl
string, string, string, string, string
---- RESULTS
'NULL','%','NULL','.*','X'
'','%','','.*','X'
'beginning of line','begin%','not begin%','^begin.*','^not begin.*'
'eight','%eight%','n%eight%','.*eight.*','n.*eight.*'
'end of line','%line','%line end','.*line$','.*line end$'
@@ -337,6 +338,7 @@ string, string
'ten','%ten%'
'three','%three%'
'two','%two%'
'','%'
====
select str_col, match_like_col from LikeTbl
where str_col NOT LIKE match_like_col
@@ -349,6 +351,7 @@ where str_col LIKE no_match_like_col
---- TYPES
string, string
---- RESULTS
'','%'
====
select str_col, no_match_like_col from LikeTbl
where str_col NOT LIKE no_match_like_col
@@ -387,6 +390,7 @@ string, string
'ten','.*ten.*'
'three','.*three.*'
'two','.*two.*'
'','.*'
====
select str_col, no_match_regex_col from LikeTbl
where str_col REGEXP no_match_regex_col
@@ -418,6 +422,7 @@ string, string
'ten','n.*ten.*'
'three','n.*three.*'
'two','n.*two.*'
'','X'
====
select 1+2
---- TYPES
@@ -1100,4 +1105,4 @@ where (cast('2012-01-01 09:10:11' as timestamp) + interval tinyint_col seconds)
bigint
---- RESULTS
1000
====
====

View File

@@ -0,0 +1,26 @@
# Test for selecting from table with null and empty strings.
select * from nulltable$TABLE
---- TYPES
string, string, string, int, double
---- RESULTS
'a','','NULL',NULL,NULL
====
select count(*),count(a),count(b),count(c),count(d),count(e) from nulltable$TABLE
---- TYPES
bigint, bigint, bigint, bigint, bigint, bigint
---- RESULTS
1,1,1,0,0,0
====
# Test for selecting from table with '\' escape character with null and empty strings.
select * from nullescapedtable$TABLE
---- TYPES
string, string, string, int, double
---- RESULTS
'a','','NULL',NULL,NULL
====
select count(*),count(a),count(b),count(c), count(d), count(e) from nullescapedtable$TABLE
---- TYPES
bigint, bigint, bigint, bigint, bigint, bigint
---- RESULTS
1,1,1,0,0,0
====