From 9939c9d00928eb315ba53379f205d2ea299e2fc0 Mon Sep 17 00:00:00 2001
From: Victor Bittorf <victor.bittorf@cloudera.com>
Date: Wed, 17 Sep 2014 16:03:49 -0700
Subject: [PATCH] Bugfix and tests for CHAR(N) and VARCHAR(N)

Fixed a bug when setting the length in reading/write text files for CHAR(N).
Also added chars_tiny table for testing CHAR(N) and VARCHAR(N).

Change-Id: If5d5db30afa4b00cf03c68c6a845f182970329f4
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/4415
Reviewed-by: Victor Bittorf <victor.bittorf@cloudera.com>
Tested-by: jenkins
---
 be/src/exec/hash-table.cc                     | 15 ++---
 be/src/exec/hdfs-text-table-writer.cc         |  9 +--
 be/src/exec/text-converter.inline.h           |  1 +
 be/src/exprs/expr.cc                          |  2 +-
 be/src/exprs/scalar-fn-call.cc                |  6 ++
 be/src/runtime/descriptors.cc                 |  1 +
 .../cloudera/impala/analysis/CastExpr.java    |  9 +++
 .../cloudera/impala/catalog/ScalarType.java   |  1 +
 .../cloudera/impala/planner/HashJoinNode.java | 16 +++--
 testdata/data/chars-tiny.txt                  |  9 +++
 .../functional/functional_schema_template.sql | 17 +++++
 .../functional/schema_constraints.csv         |  3 +
 .../queries/QueryTest/chars.test              | 66 +++++++++++++++++++
 13 files changed, 136 insertions(+), 19 deletions(-)
 create mode 100644 testdata/data/chars-tiny.txt
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index f87fba5e1..2ade936cc 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -483,10 +483,9 @@ Function* HashTableCtx::CodegenEvalRow(RuntimeState* state, bool build) {
     Function* expr_fn;
     Status status = ctxs[i]->root()->GetCodegendComputeFn(state, &expr_fn);
     if (!status.ok()) {
-      stringstream ss;
-      ss << "Problem with codegen: " << status.GetErrorMsg();
-      state->LogError(ss.str());
-      fn->eraseFromParent(); // deletes function
+      // TODO disabled because CHAR codegen can fail
+      //stringstream ss;
+      //ss << "Problem with codegen: " << status.GetErrorMsg();
       return NULL;
     }
 
@@ -752,7 +751,6 @@ Function* HashTableCtx::CodegenEquals(RuntimeState* state) {
   Value* row = args[1];
 
   BasicBlock* false_block = BasicBlock::Create(context, "false_block", fn);
-
   for (int i = 0; i < build_expr_ctxs_.size(); ++i) {
     BasicBlock* null_block = BasicBlock::Create(context, "null", fn);
     BasicBlock* not_null_block = BasicBlock::Create(context, "not_null", fn);
@@ -762,10 +760,9 @@ Function* HashTableCtx::CodegenEquals(RuntimeState* state) {
     Function* expr_fn;
     Status status = build_expr_ctxs_[i]->root()->GetCodegendComputeFn(state, &expr_fn);
     if (!status.ok()) {
-      stringstream ss;
-      ss << "Problem with codegen: " << status.GetErrorMsg();
-      state->LogError(ss.str());
-      fn->eraseFromParent(); // deletes function
+      // TODO disabled because CHAR codegen can fail
+      //stringstream ss;
+      //ss << "Problem with codegen: " << status.GetErrorMsg();
       return NULL;
     }
 
diff --git a/be/src/exec/hdfs-text-table-writer.cc b/be/src/exec/hdfs-text-table-writer.cc
index a8b3559a0..f02ad0ef0 100644
--- a/be/src/exec/hdfs-text-table-writer.cc
+++ b/be/src/exec/hdfs-text-table-writer.cc
@@ -125,11 +125,12 @@ Status HdfsTextTableWriter::AppendRowBatch(RowBatch* batch,
         void* value = output_expr_ctxs_[j]->GetValue(current_row);
         if (value != NULL) {
           const ColumnType& type = output_expr_ctxs_[j]->root()->type();
-          if (type.IsVarLen()) {
-            PrintEscaped(reinterpret_cast<const StringValue*>(value));
-          } else if (type.type == TYPE_CHAR) {
-            StringValue sv(StringValue::CharSlotToPtr(value, type), type.len);
+          if (type.type == TYPE_CHAR) {
+            char* val_ptr = StringValue::CharSlotToPtr(value, type);
+            StringValue sv(val_ptr, StringValue::UnpaddedCharLength(val_ptr, type.len));
             PrintEscaped(&sv);
+          } else if (type.IsVarLen()) {
+            PrintEscaped(reinterpret_cast<const StringValue*>(value));
           } else {
             output_expr_ctxs_[j]->PrintValue(value, &rowbatch_stringstream_);
           }
diff --git a/be/src/exec/text-converter.inline.h b/be/src/exec/text-converter.inline.h
index 75d546c12..be4a34301 100644
--- a/be/src/exec/text-converter.inline.h
+++ b/be/src/exec/text-converter.inline.h
@@ -77,6 +77,7 @@ inline bool TextConverter::WriteSlot(const SlotDescriptor* slot_desc, Tuple* tup
 
       if (type.type == TYPE_CHAR) {
         StringValue::PadWithSpaces(str.ptr, buffer_len, str.len);
+        str.len = type.len;
       }
       // write back to the slot, if !IsVarLen() we already wrote to the slot
       if (type.IsVarLen()) {
diff --git a/be/src/exprs/expr.cc b/be/src/exprs/expr.cc
index c5a1172b8..aed028d27 100644
--- a/be/src/exprs/expr.cc
+++ b/be/src/exprs/expr.cc
@@ -280,7 +280,7 @@ int Expr::ComputeResultsLayout(const vector<Expr*>& exprs, vector<int>* offsets,
   // Collect all the byte sizes and sort them
   for (int i = 0; i < exprs.size(); ++i) {
     data[i].expr_idx = i;
-    if (exprs[i]->type().type == TYPE_STRING || exprs[i]->type().type == TYPE_VARCHAR) {
+    if (exprs[i]->type().IsVarLen()) {
       data[i].byte_size = 16;
       data[i].variable_length = true;
     } else {
diff --git a/be/src/exprs/scalar-fn-call.cc b/be/src/exprs/scalar-fn-call.cc
index 10e26290e..1a5710951 100644
--- a/be/src/exprs/scalar-fn-call.cc
+++ b/be/src/exprs/scalar-fn-call.cc
@@ -227,6 +227,12 @@ Status ScalarFnCall::GetCodegendComputeFn(RuntimeState* state, llvm::Function**
     *fn = ir_compute_fn_;
     return Status::OK;
   }
+  for (int i = 0; i < GetNumChildren(); ++i) {
+    if (children_[i]->type().type == TYPE_CHAR) {
+      *fn = NULL;
+      return Status("ScalarFnCall Codegen not supported for CHAR");
+    }
+  }
 
   LlvmCodeGen* codegen = state->codegen();
   DCHECK(codegen != NULL);
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index c6ec8d4d0..78e82ee59 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -527,6 +527,7 @@ StructType* TupleDescriptor::GenerateLlvmStruct(LlvmCodeGen* codegen) {
   // Add the slot types to the struct description.
   for (int i = 0; i < slots().size(); ++i) {
     SlotDescriptor* slot_desc = slots()[i];
+    if (slot_desc->type().type == TYPE_CHAR) return NULL;
     if (slot_desc->is_materialized()) {
       slot_desc->field_idx_ = slot_desc->slot_idx_ + num_null_bytes_;
       DCHECK_LT(slot_desc->field_idx(), struct_fields.size());
diff --git a/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java b/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java
index 80bbe3891..ecf63c694 100644
--- a/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java
+++ b/fe/src/main/java/com/cloudera/impala/analysis/CastExpr.java
@@ -104,6 +104,15 @@ public class CastExpr extends Expr {
               beSymbol, null, null, true));
           continue;
         }
+        if (fromType.getPrimitiveType() == PrimitiveType.CHAR
+            && toType.getPrimitiveType() == PrimitiveType.CHAR) {
+          // Allow casting from CHAR(N) to Char(N)
+          String beSymbol = "impala::CastFunctions::CastToChar";
+          db.addBuiltin(ScalarFunction.createBuiltin(getFnName(ScalarType.CHAR),
+              Lists.newArrayList((Type) ScalarType.createCharType(-1)), false,
+              ScalarType.CHAR, beSymbol, null, null, true));
+          continue;
+        }
         if (fromType.getPrimitiveType() == PrimitiveType.VARCHAR
             && toType.getPrimitiveType() == PrimitiveType.VARCHAR) {
           // Allow casting from VARCHAR(N) to VARCHAR(M)
diff --git a/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java b/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java
index 368cc112f..302f295ed 100644
--- a/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java
+++ b/fe/src/main/java/com/cloudera/impala/catalog/ScalarType.java
@@ -170,6 +170,7 @@ public class ScalarType extends Type {
       if (isWildcardChar()) return "CHAR(*)";
       return "CHAR(" + len_ + ")";
     } else  if (type_ == PrimitiveType.DECIMAL) {
+      if (isWildcardDecimal()) return "DECIMAL(*,*)";
       return "DECIMAL(" + precision_ + "," + scale_ + ")";
     } else if (type_ == PrimitiveType.VARCHAR) {
       if (isWildcardVarchar()) return "VARCHAR(*)";
diff --git a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java
index be2679e83..2d7a641ff 100644
--- a/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java
+++ b/fe/src/main/java/com/cloudera/impala/planner/HashJoinNode.java
@@ -147,12 +147,18 @@ public class HashJoinNode extends PlanNode {
       Type t0 = eqPred.getChild(0).getType();
       Type t1 = eqPred.getChild(1).getType();
       if (!t0.matchesType(t1)) {
-        // With decimal types, the child types do not have to match because the equality
-        // builtin handles it. However, they will not hash correctly so insert a cast.
-        Preconditions.checkState(t0.isDecimal());
-        Preconditions.checkState(t1.isDecimal());
+        // With decimal and char types, the child types do not have to match because
+        // the equality builtin handles it. However, they will not hash correctly so
+        // insert a cast.
+        boolean bothDecimal = t0.isDecimal() && t1.isDecimal();
+        boolean bothString = t0.isStringType() && t1.isStringType();
+        if (!bothDecimal && !bothString) {
+          throw new InternalException("Cannot compare " +
+              t0.toSql() + " to " + t1.toSql() + " in join predicate.");
+        }
         Type compatibleType = Type.getAssignmentCompatibleType(t0, t1);
-        Preconditions.checkState(compatibleType.isDecimal());
+        Preconditions.checkState(compatibleType.isDecimal() ||
+            compatibleType.isStringType());
         try {
           if (!t0.equals(compatibleType)) {
             eqPred.setChild(0, eqPred.getChild(0).castTo(compatibleType));
diff --git a/testdata/data/chars-tiny.txt b/testdata/data/chars-tiny.txt
new file mode 100644
index 000000000..100c66df3
--- /dev/null
+++ b/testdata/data/chars-tiny.txt
@@ -0,0 +1,9 @@
+1aaaa,1bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,1cccc
+2aaaaaa,2bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,2cccccc
+3aaa,3bbbbb,3ccc
+4aa,4bbbb,4cc
+5a,5bbb,5c
+\N,\N,\N
+6a,6b,6c
+6a,6b,6c
+a,b,c
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index fa407ff15..57daf986a 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -1332,6 +1332,23 @@ select * from functional.{table_name};
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
+chars_tiny
+---- COLUMNS
+cs CHAR(5)
+cl CHAR(140)
+vc VARCHAR(32)
+---- ROW_FORMAT
+delimited fields terminated by ','
+---- LOAD
+`hadoop fs -mkdir -p /test-warehouse/chars_tiny && hadoop fs -put -f \
+${IMPALA_HOME}/testdata/data/chars-tiny.txt /test-warehouse/chars_tiny/
+---- DEPENDENT_LOAD
+INSERT OVERWRITE TABLE {db_name}{db_suffix}.{table_name}
+select * from functional.{table_name};
+====
+---- DATASET
+functional
+---- BASE_TABLE_NAME
 widetable_250_cols
 ---- COLUMNS
 `${IMPALA_HOME}/testdata/common/widetable.py --get_columns -n 250
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index 53a3343dd..257716604 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -104,6 +104,9 @@ table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none
 
 table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
 
+# TODO first set of tests are for text/none/none
+table_name:chars_tiny, constraint:restrict_to, table_format:text/none/none
+
 # hbase doesn't seem to like very wide tables
 table_name:widetable_250_cols, constraint:exclude, table_format:hbase/none/none
 table_name:widetable_500_cols, constraint:exclude, table_format:hbase/none/none
diff --git a/testdata/workloads/functional-query/queries/QueryTest/chars.test b/testdata/workloads/functional-query/queries/QueryTest/chars.test
index a021cf73e..9ecaab453 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/chars.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/chars.test
@@ -106,4 +106,70 @@ select cshort, clong, vc from allchars_par
 char,char,string
 ---- RESULTS
 '12345','123456                                                                                                                                      ','12345'
+=======
+---- QUERY
+select count(*), count(cs), count(cl), count(vc) from chars_tiny
+---- TYPES
+bigint,bigint,bigint,bigint
+---- RESULTS
+9,8,8,8
+====
+---- QUERY
+select * from chars_tiny where cs = cast('6a' as CHAR(2))
+---- TYPES
+char,char,string
+---- RESULTS
+'6a   ','6b                                                                                                                                          ','6c'
+'6a   ','6b                                                                                                                                          ','6c'
+====
+---- QUERY
+select count(*) from chars_tiny where vc != cast('5c' as varchar(3))
+---- TYPES
+bigint
+---- RESULTS
+7
+====
+---- QUERY
+select count(*) from chars_tiny where cs != cast('a' as char(3))
+---- TYPES
+bigint
+---- RESULTS
+7
+====
+---- QUERY
+select count(DISTINCT cs) from chars_tiny where vc = cast('5c' as varchar(10))
+---- TYPES
+bigint
+---- RESULTS
+1
+====
+---- QUERY
+select count(DISTINCT cs) from chars_tiny where cs = cast('5a' as char(10))
+---- TYPES
+bigint
+---- RESULTS
+1
+====
+---- QUERY
+select cs, count(cl) from functional.chars_tiny group by cs having count(vc) > 1
+---- TYPES
+char, bigint
+---- RESULTS
+'6a   ',2
+====
+---- QUERY
+select A.cs from functional.chars_tiny as A, functional.chars_tiny as B where 
+cast(A.cs as char(1)) = cast(B.cl as char(1)) order by A.cs
+---- TYPES
+char
+---- RESULTS
+'1aaaa'
+'2aaaa'
+'3aaa '
+'4aa  '
+'5a   '
+'6a   '
+'6a   '
+'6a   '
+'6a   '
 ====