diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc index af6ddf72d..6e9eb614c 100644 --- a/be/src/exec/hdfs-scanner.cc +++ b/be/src/exec/hdfs-scanner.cc @@ -354,7 +354,7 @@ Function* HdfsScanner::CodegenWriteCompleteTuple( SlotDescriptor* slot_desc = node->materialized_slots()[i]; Function* fn = TextConverter::CodegenWriteSlot(codegen, tuple_desc, slot_desc, node->hdfs_table()->null_column_value().data(), - node->hdfs_table()->null_column_value().size(), true); + node->hdfs_table()->null_column_value().size(), true, state->strict_mode()); if (fn == NULL) return NULL; slot_fns.push_back(fn); } diff --git a/be/src/exec/hdfs-text-scanner.cc b/be/src/exec/hdfs-text-scanner.cc index dd0d524f1..f3b669dd0 100644 --- a/be/src/exec/hdfs-text-scanner.cc +++ b/be/src/exec/hdfs-text-scanner.cc @@ -215,7 +215,8 @@ Status HdfsTextScanner::InitNewRange() { scan_node_->is_materialized_col(), hdfs_partition->line_delim(), field_delim, collection_delim, hdfs_partition->escape_char())); text_converter_.reset(new TextConverter(hdfs_partition->escape_char(), - scan_node_->hdfs_table()->null_column_value())); + scan_node_->hdfs_table()->null_column_value(), true, + state_->strict_mode())); RETURN_IF_ERROR(ResetScanner()); return Status::OK(); diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc index 03cfc07de..8e71ef4db 100644 --- a/be/src/exec/text-converter.cc +++ b/be/src/exec/text-converter.cc @@ -30,10 +30,11 @@ using namespace impala; using namespace llvm; TextConverter::TextConverter(char escape_char, const string& null_col_val, - bool check_null) + bool check_null, bool strict_mode) : escape_char_(escape_char), null_col_val_(null_col_val), - check_null_(check_null) { + check_null_(check_null), + strict_mode_(strict_mode) { } void TextConverter::UnescapeString(const char* src, char* dest, int* len, @@ -90,9 +91,19 @@ void TextConverter::UnescapeString(const char* src, char* dest, int* len, // call void @SetNull({ i8, i32 }* %tuple_arg) // ret i1 false // } +// +// If strict_mode = true, then 'parse_slot' also treats overflows errors, e.g.: +// parse_slot: ; preds = %check_zero +// %slot = getelementptr inbounds { i8, i32 }* %tuple_arg, i32 0, i32 1 +// %1 = call i32 @IrStringToInt32(i8* %data, i32 %len, i32* %parse_result) +// %parse_result1 = load i32, i32* %parse_result +// %failed = icmp eq i32 %parse_result1, 1 +// %overflowed = icmp eq i32 %parse_result1, 2 +// %failed_or = or i1 %failed, %overflowed +// br i1 %failed_or, label %parse_fail, label %parse_success Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen, TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc, - const char* null_col_val, int len, bool check_null) { + const char* null_col_val, int len, bool check_null, bool strict_mode) { if (slot_desc->type().type == TYPE_CHAR) { LOG(INFO) << "Char isn't supported for CodegenWriteSlot"; return NULL; @@ -226,15 +237,23 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen, &parse_success_block, &parse_failed_block); LlvmCodeGen::NamedVariable parse_result("parse_result", codegen->GetType(TYPE_INT)); Value* parse_result_ptr = codegen->CreateEntryBlockAlloca(fn, parse_result); - Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE); // Call Impala's StringTo* function Value* result = builder.CreateCall(parse_fn, ArrayRef({args[1], args[2], parse_result_ptr})); Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result"); + Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE); - // Check for parse error. TODO: handle overflow + // Check for parse error. Value* parse_failed = builder.CreateICmpEQ(parse_result_val, failed_value, "failed"); + if (strict_mode) { + // In strict_mode, also check if parse_result is PARSE_OVERFLOW. + Value* overflow_value = codegen->GetIntConstant(TYPE_INT, + StringParser::PARSE_OVERFLOW); + Value* parse_overflow = builder.CreateICmpEQ(parse_result_val, overflow_value, + "overflowed"); + parse_failed = builder.CreateOr(parse_failed, parse_overflow, "failed_or"); + } builder.CreateCondBr(parse_failed, parse_failed_block, parse_success_block); // Parse succeeded diff --git a/be/src/exec/text-converter.h b/be/src/exec/text-converter.h index 365114634..9f533d583 100644 --- a/be/src/exec/text-converter.h +++ b/be/src/exec/text-converter.h @@ -42,8 +42,10 @@ class TextConverter { /// null_col_val: Special string to indicate NULL column values. /// check_null: If set, then the WriteSlot() functions set the target slot to NULL /// if their input string matches null_vol_val. + /// strict_mode: If set, numerical overflow/underflow are considered to be parse + /// errors. TextConverter(char escape_char, const std::string& null_col_val, - bool check_null = true); + bool check_null = true, bool strict_mode = false); /// Converts slot data, of length 'len', into type of slot_desc, /// and writes the result into the tuples's slot. @@ -74,9 +76,11 @@ class TextConverter { /// if its input string matches null_vol_val. /// The codegenerated function does not support escape characters and should not /// be used for partitions that contain escapes. + /// strict_mode: If set, numerical overflow/underflow are considered to be parse + /// errors. static llvm::Function* CodegenWriteSlot(LlvmCodeGen* codegen, TupleDescriptor* tuple_desc, SlotDescriptor* slot_desc, - const char* null_col_val, int len, bool check_null); + const char* null_col_val, int len, bool check_null, bool strict_mode = false); private: char escape_char_; @@ -84,6 +88,9 @@ class TextConverter { std::string null_col_val_; /// Indicates whether we should check for null_col_val_ and set slots to NULL. bool check_null_; + /// Indicates whether numerical overflow/underflow are considered to be parse + /// errors. + bool strict_mode_; }; } diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h index d50b66f9a..052f1b331 100644 --- a/be/src/exec/text-converter.inline.h +++ b/be/src/exec/text-converter.inline.h @@ -167,10 +167,12 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup break; } - // TODO: add warning for overflow case - if (parse_result == StringParser::PARSE_FAILURE) { - tuple->SetNull(slot_desc->null_indicator_offset()); - return false; + if (UNLIKELY(parse_result != StringParser::PARSE_SUCCESS)) { + if (parse_result == StringParser::PARSE_FAILURE || + (strict_mode_ && parse_result == StringParser::PARSE_OVERFLOW)) { + tuple->SetNull(slot_desc->null_indicator_offset()); + return false; + } } return true; diff --git a/be/src/runtime/runtime-state.h b/be/src/runtime/runtime-state.h index f7c024863..860692fc6 100644 --- a/be/src/runtime/runtime-state.h +++ b/be/src/runtime/runtime-state.h @@ -100,6 +100,9 @@ class RuntimeState { bool abort_on_error() const { return query_ctx().request.query_options.abort_on_error; } + bool strict_mode() const { + return query_ctx().request.query_options.strict_mode; + } bool abort_on_default_limit_exceeded() const { return query_ctx().request.query_options.abort_on_default_limit_exceeded; } diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index 83e35482c..b82b5c3cd 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -401,6 +401,11 @@ Status impala::SetQueryOption(const string& key, const string& value, } break; } + case TImpalaQueryOptions::STRICT_MODE: { + query_options->__set_strict_mode( + iequals(value, "true") || iequals(value, "1")); + break; + } default: // We hit this DCHECK(false) if we forgot to add the corresponding entry here // when we add a new query option. diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index c86bc6ba5..4104f2b50 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -32,7 +32,7 @@ class TQueryOptions; // the DCHECK. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::PREFETCH_MODE + 1);\ + TImpalaQueryOptions::STRICT_MODE + 1);\ QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR)\ QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -79,7 +79,9 @@ class TQueryOptions; QUERY_OPT_FN(s3_skip_insert_staging, S3_SKIP_INSERT_STAGING)\ QUERY_OPT_FN(runtime_filter_min_size, RUNTIME_FILTER_MIN_SIZE)\ QUERY_OPT_FN(runtime_filter_max_size, RUNTIME_FILTER_MAX_SIZE)\ - QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE); + QUERY_OPT_FN(prefetch_mode, PREFETCH_MODE)\ + QUERY_OPT_FN(strict_mode, STRICT_MODE); + /// Converts a TQueryOptions struct into a map of key, value pairs. void TQueryOptionsToMap(const TQueryOptions& query_options, diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index ada1a20b0..55dd83864 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -196,6 +196,9 @@ struct TQueryOptions { // Prefetching behavior during hash tables' building and probing. 48: optional Types.TPrefetchMode prefetch_mode = Types.TPrefetchMode.HT_BUCKET + + // Additional strict handling of invalid data parsing and type conversions. + 49: optional bool strict_mode = false } // Impala currently has two types of sessions: Beeswax and HiveServer2 diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index 29ae4ccbf..0777c3238 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -222,7 +222,10 @@ enum TImpalaQueryOptions { RUNTIME_FILTER_MIN_SIZE, // Prefetching behavior during hash tables' building and probing. - PREFETCH_MODE + PREFETCH_MODE, + + // Additional strict handling of invalid data parsing and type conversions. + STRICT_MODE } // The summary of an insert. diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index 4a54d5f68..894fa8b4b 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -927,8 +927,6 @@ float_col float double_col double ---- ROW_FORMAT delimited fields terminated by ',' escaped by '\\' ----- DEPENDENT_LOAD -INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} SELECT * FROM {db_name}.{table_name}; ---- LOAD LOAD DATA LOCAL INPATH '{impala_home}/testdata/data/overflow.txt' OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name}; ==== diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index fb787ac36..d95f3c447 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -102,9 +102,10 @@ table_name:tblwithraggedcolumns, constraint:exclude, table_format:hbase/none/non table_name:greptiny, constraint:exclude, table_format:hbase/none/none table_name:tinyinttable, constraint:exclude, table_format:hbase/none/none -# overflow has a bigint that's too big. hbase may lose precision, hence this -# table cannot be loaded. -table_name:overflow, constraint:exclude, table_format:hbase/none/none +# overflow uses a manually constructed text file which doesn't make sense to write to +# other table formats since the values that would be written are different (e.g. already +# truncated.) +table_name:overflow, constraint:restrict_to, table_format:text/none/none # widerow has a single column with a single row containing a 10MB string. hbase doesn't # seem to like this. diff --git a/testdata/workloads/functional-query/queries/QueryTest/show.test b/testdata/workloads/functional-query/queries/QueryTest/show.test index 6579f9bad..af54c7471 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/show.test +++ b/testdata/workloads/functional-query/queries/QueryTest/show.test @@ -77,7 +77,6 @@ show tables '*' 'nullinsert' 'nullinsert_alt' 'nulltable' -'overflow' 'rankingssmall' 'stringpartitionkey' 'tblwithraggedcolumns' diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test new file mode 100644 index 000000000..8dccc078c --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode-abort.test @@ -0,0 +1,31 @@ +==== +---- QUERY +select tinyint_col from overflow +---- CATCH +Error converting column: 0 TO TINYINT (Data is: 1000) +==== +---- QUERY +select smallint_col from overflow +---- CATCH +Error converting column: 1 TO SMALLINT (Data is: 100000) +==== +---- QUERY +select int_col from overflow +---- CATCH +Error converting column: 2 TO INT (Data is: 10000000000000000) +==== +---- QUERY +select bigint_col from overflow +---- CATCH +Error converting column: 3 TO BIGINT (Data is: 10000000000000000000) +==== +---- QUERY +select float_col from overflow +---- CATCH +Error converting column: 4 TO FLOAT (Data is: 1e1000000) +==== +---- QUERY +select double_col from overflow +---- CATCH +Error converting column: 5 TO DOUBLE (Data is: 1e10000) +==== diff --git a/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test new file mode 100644 index 000000000..2d85a74fa --- /dev/null +++ b/testdata/workloads/functional-query/queries/QueryTest/strict-mode.test @@ -0,0 +1,28 @@ +==== +---- QUERY +select * from overflow +---- ERRORS +Error converting column: 0 TO TINYINT (Data is: 1000) +Error converting column: 1 TO SMALLINT (Data is: 100000) +Error converting column: 2 TO INT (Data is: 10000000000000000) +Error converting column: 3 TO BIGINT (Data is: 10000000000000000000) +Error converting column: 4 TO FLOAT (Data is: 1e1000000) +Error converting column: 5 TO DOUBLE (Data is: 1e10000) +file: hdfs://regex:.$ +record: 1000,100000,10000000000000000,10000000000000000000,1e1000000,1e10000 +Error converting column: 0 TO TINYINT (Data is: -1000) +Error converting column: 1 TO SMALLINT (Data is: -100000) +Error converting column: 2 TO INT (Data is: -10000000000000000) +Error converting column: 3 TO BIGINT (Data is: -10000000000000000000) +Error converting column: 4 TO FLOAT (Data is: -1e1000000) +Error converting column: 5 TO DOUBLE (Data is: -1e10000) +file: hdfs://regex:.$ +record: -1000,-100000,-10000000000000000,-10000000000000000000,-1e1000000,-1e10000 + +---- RESULTS +1,2,3,4,5.5,6.6 +NULL,NULL,NULL,NULL,NULL,NULL +NULL,NULL,NULL,NULL,NULL,NULL +---- TYPES +tinyint, smallint, int, bigint, float, double +==== diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py index 71fd21c44..f69c9b7ab 100644 --- a/tests/query_test/test_queries.py +++ b/tests/query_test/test_queries.py @@ -134,6 +134,14 @@ class TestQueriesTextTables(ImpalaTestSuite): def test_overflow(self, vector): self.run_test_case('QueryTest/overflow', vector) + def test_strict_mode(self, vector): + vector.get_value('exec_option')['strict_mode'] = 1 + vector.get_value('exec_option')['abort_on_error'] = 0 + self.run_test_case('QueryTest/strict-mode', vector) + + vector.get_value('exec_option')['abort_on_error'] = 1 + self.run_test_case('QueryTest/strict-mode-abort', vector) + def test_data_source_tables(self, vector): self.run_test_case('QueryTest/data-source-tables', vector)