From 9939c9d00928eb315ba53379f205d2ea299e2fc0 Mon Sep 17 00:00:00 2001 From: Victor Bittorf Date: Wed, 17 Sep 2014 16:03:49 -0700 Subject: [PATCH] Bugfix and tests for CHAR(N) and VARCHAR(N) Fixed a bug when setting the length in reading/write text files for CHAR(N). Also added chars_tiny table for testing CHAR(N) and VARCHAR(N). Change-Id: If5d5db30afa4b00cf03c68c6a845f182970329f4 Reviewed-on: http://gerrit.sjc.cloudera.com:8080/4415 Reviewed-by: Victor Bittorf Tested-by: jenkins --- be/src/exec/hash-table.cc | 15 ++--- be/src/exec/hdfs-text-table-writer.cc | 9 +-- be/src/exec/text-converter.inline.h | 1 + be/src/exprs/expr.cc | 2 +- be/src/exprs/scalar-fn-call.cc | 6 ++ be/src/runtime/descriptors.cc | 1 + .../cloudera/impala/analysis/CastExpr.java | 9 +++ .../cloudera/impala/catalog/ScalarType.java | 1 + .../cloudera/impala/planner/HashJoinNode.java | 16 +++-- testdata/data/chars-tiny.txt | 9 +++ .../functional/functional_schema_template.sql | 17 +++++ .../functional/schema_constraints.csv | 3 + .../queries/QueryTest/chars.test | 66 +++++++++++++++++++ 13 files changed, 136 insertions(+), 19 deletions(-) create mode 100644 testdata/data/chars-tiny.txt diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc index f87fba5e1..2ade936cc 100644 --- a/be/src/exec/hash-table.cc +++ b/be/src/exec/hash-table.cc @@ -483,10 +483,9 @@ Function* HashTableCtx::CodegenEvalRow(RuntimeState* state, bool build) { Function* expr_fn; Status status = ctxs[i]->root()->GetCodegendComputeFn(state, &expr_fn); if (!status.ok()) { - stringstream ss; - ss << "Problem with codegen: " << status.GetErrorMsg(); - state->LogError(ss.str()); - fn->eraseFromParent(); // deletes function + // TODO disabled because CHAR codegen can fail + //stringstream ss; + //ss << "Problem with codegen: " << status.GetErrorMsg(); return NULL; } @@ -752,7 +751,6 @@ Function* HashTableCtx::CodegenEquals(RuntimeState* state) { Value* row = args[1]; BasicBlock* false_block = BasicBlock::Create(context, "false_block", fn); - for (int i = 0; i < build_expr_ctxs_.size(); ++i) { BasicBlock* null_block = BasicBlock::Create(context, "null", fn); BasicBlock* not_null_block = BasicBlock::Create(context, "not_null", fn); @@ -762,10 +760,9 @@ Function* HashTableCtx::CodegenEquals(RuntimeState* state) { Function* expr_fn; Status status = build_expr_ctxs_[i]->root()->GetCodegendComputeFn(state, &expr_fn); if (!status.ok()) { - stringstream ss; - ss << "Problem with codegen: " << status.GetErrorMsg(); - state->LogError(ss.str()); - fn->eraseFromParent(); // deletes function + // TODO disabled because CHAR codegen can fail + //stringstream ss; + //ss << "Problem with codegen: " << status.GetErrorMsg(); return NULL; } diff --git a/be/src/exec/hdfs-text-table-writer.cc b/be/src/exec/hdfs-text-table-writer.cc index a8b3559a0..f02ad0ef0 100644 --- a/be/src/exec/hdfs-text-table-writer.cc +++ b/be/src/exec/hdfs-text-table-writer.cc @@ -125,11 +125,12 @@ Status HdfsTextTableWriter::AppendRowBatch(RowBatch* batch, void* value = output_expr_ctxs_[j]->GetValue(current_row); if (value != NULL) { const ColumnType& type = output_expr_ctxs_[j]->root()->type(); - if (type.IsVarLen()) { - PrintEscaped(reinterpret_cast(value)); - } else if (type.type == TYPE_CHAR) { - StringValue sv(StringValue::CharSlotToPtr(value, type), type.len); + if (type.type == TYPE_CHAR) { + char* val_ptr = StringValue::CharSlotToPtr(value, type); + StringValue sv(val_ptr, StringValue::UnpaddedCharLength(val_ptr, type.len)); PrintEscaped(&sv); + } else if (type.IsVarLen()) { + PrintEscaped(reinterpret_cast(value)); } else { output_expr_ctxs_[j]->PrintValue(value, &rowbatch_stringstream_); } diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h index 75d546c12..be4a34301 100644 --- a/be/src/exec/text-converter.inline.h +++ b/be/src/exec/text-converter.inline.h @@ -77,6 +77,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup if (type.type == TYPE_CHAR) { StringValue::PadWithSpaces(str.ptr, buffer_len, str.len); + str.len = type.len; } // write back to the slot, if !IsVarLen() we already wrote to the slot if (type.IsVarLen()) { diff --git a/be/src/exprs/expr.cc b/be/src/exprs/expr.cc index c5a1172b8..aed028d27 100644 --- a/be/src/exprs/expr.cc +++ b/be/src/exprs/expr.cc @@ -280,7 +280,7 @@ int Expr::ComputeResultsLayout(const vector& exprs, vector* offsets, // Collect all the byte sizes and sort them for (int i = 0; i < exprs.size(); ++i) { data[i].expr_idx = i; - if (exprs[i]->type().type == TYPE_STRING || exprs[i]->type().type == TYPE_VARCHAR) { + if (exprs[i]->type().IsVarLen()) { data[i].byte_size = 16; data[i].variable_length = true; } else { diff --git a/be/src/exprs/scalar-fn-call.cc b/be/src/exprs/scalar-fn-call.cc index 10e26290e..1a5710951 100644 --- a/be/src/exprs/scalar-fn-call.cc +++ b/be/src/exprs/scalar-fn-call.cc @@ -227,6 +227,12 @@ Status ScalarFnCall::GetCodegendComputeFn(RuntimeState* state, llvm::Function** *fn = ir_compute_fn_; return Status::OK; } + for (int i = 0; i < GetNumChildren(); ++i) { + if (children_[i]->type().type == TYPE_CHAR) { + *fn = NULL; + return Status("ScalarFnCall Codegen not supported for CHAR"); + } + } LlvmCodeGen* codegen = state->codegen(); DCHECK(codegen != NULL); diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc index c6ec8d4d0..78e82ee59 100644 --- a/be/src/runtime/descriptors.cc +++ b/be/src/runtime/descriptors.cc @@ -527,6 +527,7 @@ StructType* TupleDescriptor::GenerateLlvmStruct(LlvmCodeGen* codegen) { // Add the slot types to the struct description. for (int i = 0; i < slots().size(); ++i) { SlotDescriptor* slot_desc = slots()[i]; + if (slot_desc->type().type == TYPE_CHAR) return NULL; if (slot_desc->is_materialized()) { slot_desc->field_idx_ = slot_desc->slot_idx_ + num_null_bytes_; DCHECK_LT(slot_desc->field_idx(), struct_fields.size()); diff --git a/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java b/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java index 80bbe3891..ecf63c694 100644 --- a/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java +++ b/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java @@ -104,6 +104,15 @@ public class CastExpr extends Expr { beSymbol, null, null, true)); continue; } + if (fromType.getPrimitiveType() == PrimitiveType.CHAR + && toType.getPrimitiveType() == PrimitiveType.CHAR) { + // Allow casting from CHAR(N) to Char(N) + String beSymbol = "impala::CastFunctions::CastToChar"; + db.addBuiltin(ScalarFunction.createBuiltin(getFnName(ScalarType.CHAR), + Lists.newArrayList((Type) ScalarType.createCharType(-1)), false, + ScalarType.CHAR, beSymbol, null, null, true)); + continue; + } if (fromType.getPrimitiveType() == PrimitiveType.VARCHAR && toType.getPrimitiveType() == PrimitiveType.VARCHAR) { // Allow casting from VARCHAR(N) to VARCHAR(M) diff --git a/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java b/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java index 368cc112f..302f295ed 100644 --- a/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java +++ b/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java @@ -170,6 +170,7 @@ public class ScalarType extends Type { if (isWildcardChar()) return "CHAR(*)"; return "CHAR(" + len_ + ")"; } else if (type_ == PrimitiveType.DECIMAL) { + if (isWildcardDecimal()) return "DECIMAL(*,*)"; return "DECIMAL(" + precision_ + "," + scale_ + ")"; } else if (type_ == PrimitiveType.VARCHAR) { if (isWildcardVarchar()) return "VARCHAR(*)"; diff --git a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java index be2679e83..2d7a641ff 100644 --- a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java +++ b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java @@ -147,12 +147,18 @@ public class HashJoinNode extends PlanNode { Type t0 = eqPred.getChild(0).getType(); Type t1 = eqPred.getChild(1).getType(); if (!t0.matchesType(t1)) { - // With decimal types, the child types do not have to match because the equality - // builtin handles it. However, they will not hash correctly so insert a cast. - Preconditions.checkState(t0.isDecimal()); - Preconditions.checkState(t1.isDecimal()); + // With decimal and char types, the child types do not have to match because + // the equality builtin handles it. However, they will not hash correctly so + // insert a cast. + boolean bothDecimal = t0.isDecimal() && t1.isDecimal(); + boolean bothString = t0.isStringType() && t1.isStringType(); + if (!bothDecimal && !bothString) { + throw new InternalException("Cannot compare " + + t0.toSql() + " to " + t1.toSql() + " in join predicate."); + } Type compatibleType = Type.getAssignmentCompatibleType(t0, t1); - Preconditions.checkState(compatibleType.isDecimal()); + Preconditions.checkState(compatibleType.isDecimal() || + compatibleType.isStringType()); try { if (!t0.equals(compatibleType)) { eqPred.setChild(0, eqPred.getChild(0).castTo(compatibleType)); diff --git a/testdata/data/chars-tiny.txt b/testdata/data/chars-tiny.txt new file mode 100644 index 000000000..100c66df3 --- /dev/null +++ b/testdata/data/chars-tiny.txt @@ -0,0 +1,9 @@ +1aaaa,1bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,1cccc +2aaaaaa,2bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,2cccccc +3aaa,3bbbbb,3ccc +4aa,4bbbb,4cc +5a,5bbb,5c +\N,\N,\N +6a,6b,6c +6a,6b,6c +a,b,c diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql index fa407ff15..57daf986a 100644 --- a/testdata/datasets/functional/functional_schema_template.sql +++ b/testdata/datasets/functional/functional_schema_template.sql @@ -1332,6 +1332,23 @@ select * from functional.{table_name}; ---- DATASET functional ---- BASE_TABLE_NAME +chars_tiny +---- COLUMNS +cs CHAR(5) +cl CHAR(140) +vc VARCHAR(32) +---- ROW_FORMAT +delimited fields terminated by ',' +---- LOAD +`hadoop fs -mkdir -p /test-warehouse/chars_tiny && hadoop fs -put -f \ +${IMPALA_HOME}/testdata/data/chars-tiny.txt /test-warehouse/chars_tiny/ +---- DEPENDENT_LOAD +INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name} +select * from functional.{table_name}; +==== +---- DATASET +functional +---- BASE_TABLE_NAME widetable_250_cols ---- COLUMNS `${IMPALA_HOME}/testdata/common/widetable.py --get_columns -n 250 diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv index 53a3343dd..257716604 100644 --- a/testdata/datasets/functional/schema_constraints.csv +++ b/testdata/datasets/functional/schema_constraints.csv @@ -104,6 +104,9 @@ table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block +# TODO first set of tests are for text/none/none +table_name:chars_tiny, constraint:restrict_to, table_format:text/none/none + # hbase doesn't seem to like very wide tables table_name:widetable_250_cols, constraint:exclude, table_format:hbase/none/none table_name:widetable_500_cols, constraint:exclude, table_format:hbase/none/none diff --git a/testdata/workloads/functional-query/queries/QueryTest/chars.test b/testdata/workloads/functional-query/queries/QueryTest/chars.test index a021cf73e..9ecaab453 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/chars.test +++ b/testdata/workloads/functional-query/queries/QueryTest/chars.test @@ -106,4 +106,70 @@ select cshort, clong, vc from allchars_par char,char,string ---- RESULTS '12345','123456 ','12345' +======= +---- QUERY +select count(*), count(cs), count(cl), count(vc) from chars_tiny +---- TYPES +bigint,bigint,bigint,bigint +---- RESULTS +9,8,8,8 +==== +---- QUERY +select * from chars_tiny where cs = cast('6a' as CHAR(2)) +---- TYPES +char,char,string +---- RESULTS +'6a ','6b ','6c' +'6a ','6b ','6c' +==== +---- QUERY +select count(*) from chars_tiny where vc != cast('5c' as varchar(3)) +---- TYPES +bigint +---- RESULTS +7 +==== +---- QUERY +select count(*) from chars_tiny where cs != cast('a' as char(3)) +---- TYPES +bigint +---- RESULTS +7 +==== +---- QUERY +select count(DISTINCT cs) from chars_tiny where vc = cast('5c' as varchar(10)) +---- TYPES +bigint +---- RESULTS +1 +==== +---- QUERY +select count(DISTINCT cs) from chars_tiny where cs = cast('5a' as char(10)) +---- TYPES +bigint +---- RESULTS +1 +==== +---- QUERY +select cs, count(cl) from functional.chars_tiny group by cs having count(vc) > 1 +---- TYPES +char, bigint +---- RESULTS +'6a ',2 +==== +---- QUERY +select A.cs from functional.chars_tiny as A, functional.chars_tiny as B where +cast(A.cs as char(1)) = cast(B.cl as char(1)) order by A.cs +---- TYPES +char +---- RESULTS +'1aaaa' +'2aaaa' +'3aaa ' +'4aa ' +'5a ' +'6a ' +'6a ' +'6a ' +'6a ' ====