From d7246d64c777384f29fe0f824ee0036b58e8aa2d Mon Sep 17 00:00:00 2001
From: Tim Armstrong <tarmstrong@cloudera.com>
Date: Wed, 28 Sep 2016 13:04:07 -0700
Subject: [PATCH] IMPALA-1430,IMPALA-4108: codegen all builtin aggregate
 functions

This change enables codegen for all builtin aggregate functions,
e.g. timestamp functions and group_concat.

There are several parts to the change:
* Adding support for generic UDAs. Previous the codegen code did not
  handle multiple input arguments or NULL return values.
* Defaulting to using the UDA interface when there is not a special
  codegen path (we have implementations of all builtin aggregate
  functions for the interpreted path).
* Remove all the logic to disable codegen for the special cases that now
  are supported.

Also fix the generation of code to get/set NULL bits since I needed
to add functionality there anyway.

Testing:
Add tests that check that codegen was enabled for builtin aggregate
functions. Also fix some gaps in the preexisting tests.

Also add tests for UDAs that check input/output nulls are handled
correctly, in anticipation of enabling codegen for arbitrary UDAs.
The tests are run with both codegen enabled and disabled. To avoid
flaky tests, we switch the UDF tests to use "unique_database".

Perf:
Ran local TPC-H and targeted perf. Spent a lot of time on TPC-H Q1,
since my original approach regressed it ~5%. In the end the problem was
to do with the ordering of loads/stores to the slot and null bit in the
generated code: the previous version of the code exploited some
properties of the particular aggregate function. I ended up replicating
this behaviour to avoid regressing perf.

Change-Id: Id9dc21d1d676505d3617e1e4f37557397c4fb260
Reviewed-on: http://gerrit.cloudera.org:8080/4655
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins
---
 be/src/benchmarks/hash-benchmark.cc           |   2 +-
 be/src/codegen/codegen-anyval.cc              |  63 +-
 be/src/codegen/codegen-anyval.h               |  55 +-
 be/src/codegen/gen_ir_descriptions.py         |   4 +-
 be/src/codegen/llvm-codegen-test.cc           |  16 +-
 be/src/codegen/llvm-codegen.cc                |  19 +-
 be/src/codegen/llvm-codegen.h                 |  29 +-
 be/src/exec/aggregation-node.cc               |  43 +-
 be/src/exec/exec-node.cc                      |   2 +-
 be/src/exec/hash-join-node.cc                 |  12 +-
 be/src/exec/hash-table.cc                     |  42 +-
 be/src/exec/hdfs-avro-scanner.cc              |  23 +-
 be/src/exec/hdfs-scanner.cc                   |  15 +-
 be/src/exec/old-hash-table.cc                 |  10 +-
 .../exec/partitioned-aggregation-node-ir.cc   |   2 +-
 be/src/exec/partitioned-aggregation-node.cc   | 611 ++++++++++--------
 be/src/exec/partitioned-aggregation-node.h    |  25 +-
 be/src/exec/partitioned-hash-join-node.cc     |  12 +-
 be/src/exec/text-converter.cc                 |  12 +-
 be/src/exprs/aggregate-functions-ir.cc        |  24 +-
 be/src/exprs/case-expr.cc                     |   2 +-
 be/src/exprs/compound-predicates.cc           |   7 +-
 be/src/exprs/expr-codegen-test.cc             |   2 +-
 be/src/exprs/expr.cc                          |   4 +-
 be/src/exprs/literal.cc                       |   4 +-
 be/src/exprs/null-literal.cc                  |   2 +-
 be/src/exprs/scalar-fn-call.cc                |  12 +-
 be/src/exprs/slot-ref.cc                      |  48 +-
 be/src/runtime/descriptors.cc                 | 120 ++--
 be/src/runtime/descriptors.h                  |  40 +-
 be/src/runtime/tuple.cc                       |   5 +-
 be/src/runtime/types.h                        |  19 +-
 be/src/testutil/test-udas.cc                  |  36 ++
 be/src/testutil/test-udfs.cc                  |   6 +-
 be/src/util/tuple-row-compare.cc              |   2 +-
 .../queries/QueryTest/java-udf.test           | 104 +--
 .../QueryTest/libs_with_same_filenames.test   |  11 +-
 .../queries/QueryTest/load-java-udfs.test     |  79 +--
 .../queries/QueryTest/uda-mem-limit.test      |   3 -
 .../queries/QueryTest/uda.test                |  28 +
 .../QueryTest/udf-codegen-required.test       |  10 +
 .../queries/QueryTest/udf-errors.test         |  37 +-
 .../queries/QueryTest/udf-mem-limit.test      |   5 -
 .../queries/QueryTest/udf.test                |   7 -
 tests/common/test_result_verifier.py          |  22 +
 tests/query_test/test_aggregation.py          | 122 +++-
 tests/query_test/test_udfs.py                 |  83 ++-
 47 files changed, 1039 insertions(+), 802 deletions(-)
 create mode 100644 testdata/workloads/functional-query/queries/QueryTest/udf-codegen-required.test

diff --git a/be/src/benchmarks/hash-benchmark.cc b/be/src/benchmarks/hash-benchmark.cc
index 5ab7f5c39..4f6d7f1dc 100644
--- a/be/src/benchmarks/hash-benchmark.cc
+++ b/be/src/benchmarks/hash-benchmark.cc
@@ -343,7 +343,7 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) {
   prototype.AddArgument(
       LlvmCodeGen::NamedVariable("results", codegen->GetPtrType(TYPE_INT)));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[3];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
 
diff --git a/be/src/codegen/codegen-anyval.cc b/be/src/codegen/codegen-anyval.cc
index a1ca6d984..bd2740984 100644
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -120,8 +120,7 @@ Type* CodegenAnyVal::GetUnloweredPtrType(LlvmCodeGen* cg, const ColumnType& type
   return GetUnloweredType(cg, type)->getPointerTo();
 }
 
-Value* CodegenAnyVal::CreateCall(
-    LlvmCodeGen* cg, LlvmCodeGen::LlvmBuilder* builder, Function* fn,
+Value* CodegenAnyVal::CreateCall(LlvmCodeGen* cg, LlvmBuilder* builder, Function* fn,
     ArrayRef<Value*> args, const char* name, Value* result_ptr) {
   if (fn->getReturnType()->isVoidTy()) {
     // Void return type indicates that this function returns a DecimalVal via the first
@@ -152,20 +151,15 @@ Value* CodegenAnyVal::CreateCall(
   }
 }
 
-CodegenAnyVal CodegenAnyVal::CreateCallWrapped(
-    LlvmCodeGen* cg, LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type,
-    Function* fn, ArrayRef<Value*> args, const char* name) {
+CodegenAnyVal CodegenAnyVal::CreateCallWrapped(LlvmCodeGen* cg, LlvmBuilder* builder,
+    const ColumnType& type, Function* fn, ArrayRef<Value*> args, const char* name) {
   Value* v = CreateCall(cg, builder, fn, args, name);
   return CodegenAnyVal(cg, builder, type, v, name);
 }
 
-CodegenAnyVal::CodegenAnyVal(LlvmCodeGen* codegen, LlvmCodeGen::LlvmBuilder* builder,
-                             const ColumnType& type, Value* value, const char* name)
-  : type_(type),
-    value_(value),
-    name_(name),
-    codegen_(codegen),
-    builder_(builder) {
+CodegenAnyVal::CodegenAnyVal(LlvmCodeGen* codegen, LlvmBuilder* builder,
+    const ColumnType& type, Value* value, const char* name)
+  : type_(type), value_(value), name_(name), codegen_(codegen), builder_(builder) {
   Type* value_type = GetLoweredType(codegen, type);
   if (value_ == NULL) {
     // No Value* was specified, so allocate one on the stack and load it.
@@ -175,7 +169,7 @@ CodegenAnyVal::CodegenAnyVal(LlvmCodeGen* codegen, LlvmCodeGen::LlvmBuilder* bui
   DCHECK_EQ(value_->getType(), value_type);
 }
 
-Value* CodegenAnyVal::GetIsNull(const char* name) {
+Value* CodegenAnyVal::GetIsNull(const char* name) const {
   switch (type_.type) {
     case TYPE_BIGINT:
     case TYPE_DOUBLE: {
@@ -448,23 +442,28 @@ void CodegenAnyVal::SetDate(Value* date) {
   value_ = builder_->CreateInsertValue(value_, v, 0, name_);
 }
 
-Value* CodegenAnyVal::GetUnloweredPtr() {
-  Value* value_ptr = codegen_->CreateEntryBlockAlloca(*builder_, value_->getType());
-  builder_->CreateStore(value_, value_ptr);
-  return builder_->CreateBitCast(value_ptr, GetUnloweredPtrType(codegen_, type_));
+Value* CodegenAnyVal::GetLoweredPtr(const string& name) const {
+  Value* lowered_ptr =
+      codegen_->CreateEntryBlockAlloca(*builder_, value_->getType(), name.c_str());
+  builder_->CreateStore(GetLoweredValue(), lowered_ptr);
+  return lowered_ptr;
 }
 
-void CodegenAnyVal::SetFromRawPtr(Value* raw_ptr) {
-  Value* val_ptr =
-      builder_->CreateBitCast(raw_ptr, codegen_->GetPtrType(type_), "val_ptr");
-  Value* val = builder_->CreateLoad(val_ptr);
-  SetFromRawValue(val);
+Value* CodegenAnyVal::GetUnloweredPtr(const string& name) const {
+  // Get an unlowered pointer by creating a lowered pointer then bitcasting it.
+  // TODO: if the original value was unlowered, this generates roundabout code that
+  // lowers the value and casts it back. Generally LLVM's optimiser can reason
+  // about what's going on and undo our shenanigans to generate sane code, but it
+  // would be nice to just emit reasonable code in the first place.
+  return builder_->CreateBitCast(
+      GetLoweredPtr(), GetUnloweredPtrType(codegen_, type_), name);
 }
 
 void CodegenAnyVal::SetFromRawValue(Value* raw_val) {
   DCHECK_EQ(raw_val->getType(), codegen_->GetType(type_))
-      << endl << LlvmCodeGen::Print(raw_val)
-      << endl << type_ << " => " << LlvmCodeGen::Print(codegen_->GetType(type_));
+      << endl
+      << LlvmCodeGen::Print(raw_val) << endl
+      << type_ << " => " << LlvmCodeGen::Print(codegen_->GetType(type_));
   switch (type_.type) {
     case TYPE_STRING:
     case TYPE_VARCHAR: {
@@ -614,9 +613,7 @@ void CodegenAnyVal::WriteToSlot(const SlotDescriptor& slot_desc, Value* tuple,
 
   // Null block: set null bit
   builder_->SetInsertPoint(null_block);
-  Function* set_null_fn = slot_desc.GetUpdateNullFn(codegen_, true);
-  DCHECK(set_null_fn != NULL);
-  builder_->CreateCall(set_null_fn, tuple);
+  slot_desc.CodegenSetNullIndicator(codegen_, builder_, tuple, codegen_->true_value());
   builder_->CreateBr(insert_before);
 
   // Leave builder_ after conditional blocks
@@ -704,6 +701,14 @@ Value* CodegenAnyVal::Compare(CodegenAnyVal* other, const char* name) {
   return builder_->CreateCall(compare_fn, args, name);
 }
 
+void CodegenAnyVal::CodegenBranchIfNull(LlvmBuilder* builder, BasicBlock* null_block) {
+  Value* is_null = GetIsNull();
+  BasicBlock* not_null_block = BasicBlock::Create(
+      codegen_->context(), "not_null", builder->GetInsertBlock()->getParent());
+  builder->CreateCondBr(is_null, null_block, not_null_block);
+  builder->SetInsertPoint(not_null_block);
+}
+
 Value* CodegenAnyVal::GetHighBits(int num_bits, Value* v, const char* name) {
   DCHECK_EQ(v->getType()->getIntegerBitWidth(), num_bits * 2);
   Value* shifted = builder_->CreateAShr(v, num_bits);
@@ -762,8 +767,8 @@ Value* CodegenAnyVal::GetNullVal(LlvmCodeGen* codegen, Type* val_type) {
   return ConstantInt::get(val_type, 1);
 }
 
-CodegenAnyVal CodegenAnyVal::GetNonNullVal(LlvmCodeGen* codegen,
-    LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type, const char* name) {
+CodegenAnyVal CodegenAnyVal::GetNonNullVal(LlvmCodeGen* codegen, LlvmBuilder* builder,
+    const ColumnType& type, const char* name) {
   Type* val_type = GetLoweredType(codegen, type);
   // All zeros => 'is_null' = false
   Value* value = Constant::getNullValue(val_type);
diff --git a/be/src/codegen/codegen-anyval.h b/be/src/codegen/codegen-anyval.h
index 290ca0365..c07f3eb16 100644
--- a/be/src/codegen/codegen-anyval.h
+++ b/be/src/codegen/codegen-anyval.h
@@ -51,6 +51,8 @@ namespace impala {
 /// TYPE_DOUBLE/DoubleVal: { i8, double }
 /// TYPE_STRING/StringVal: { i64, i8* }
 /// TYPE_TIMESTAMP/TimestampVal: { i64, i64 }
+/// TYPE_DECIMAL/DecimalVal (isn't lowered):
+/// %"struct.impala_udf::DecimalVal" { {i8}, [15 x i8], {i128} }
 //
 /// TODO:
 /// - unit tests
@@ -78,14 +80,14 @@ class CodegenAnyVal {
   /// result will not be a pointer in this case).
   //
   /// 'name' optionally specifies the name of the returned value.
-  static llvm::Value* CreateCall(LlvmCodeGen* cg, LlvmCodeGen::LlvmBuilder* builder,
+  static llvm::Value* CreateCall(LlvmCodeGen* cg, LlvmBuilder* builder,
       llvm::Function* fn, llvm::ArrayRef<llvm::Value*> args, const char* name = "",
       llvm::Value* result_ptr = NULL);
 
   /// Same as above but wraps the result in a CodegenAnyVal.
-  static CodegenAnyVal CreateCallWrapped(LlvmCodeGen* cg,
-      LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type, llvm::Function* fn,
-      llvm::ArrayRef<llvm::Value*> args, const char* name = "");
+  static CodegenAnyVal CreateCallWrapped(LlvmCodeGen* cg, LlvmBuilder* builder,
+      const ColumnType& type, llvm::Function* fn, llvm::ArrayRef<llvm::Value*> args,
+      const char* name = "");
 
   /// Returns the lowered AnyVal type associated with 'type'.
   /// E.g.: TYPE_BOOLEAN (which corresponds to a BooleanVal) => i16
@@ -116,8 +118,8 @@ class CodegenAnyVal {
   /// E.g.: TYPE_DOUBLE (lowered DoubleVal: { i8, double }) => { 0, 0 }
   /// This returns a CodegenAnyVal, rather than the unwrapped Value*, because the actual
   /// value still needs to be set.
-  static CodegenAnyVal GetNonNullVal(LlvmCodeGen* codegen,
-      LlvmCodeGen::LlvmBuilder* builder, const ColumnType& type, const char* name = "");
+  static CodegenAnyVal GetNonNullVal(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      const ColumnType& type, const char* name = "");
 
   /// Creates a wrapper around a lowered *Val value.
   //
@@ -132,14 +134,14 @@ class CodegenAnyVal {
   /// must be of the correct lowered type.
   //
   /// If 'name' is specified, it will be used when generated instructions that set value_.
-  CodegenAnyVal(LlvmCodeGen* codegen, LlvmCodeGen::LlvmBuilder* builder,
-                const ColumnType& type, llvm::Value* value = NULL, const char* name = "");
+  CodegenAnyVal(LlvmCodeGen* codegen, LlvmBuilder* builder, const ColumnType& type,
+      llvm::Value* value = NULL, const char* name = "");
 
   /// Returns the current type-lowered value.
-  llvm::Value* value() { return value_; }
+  llvm::Value* GetLoweredValue() const { return value_; }
 
   /// Gets the 'is_null' field of the *Val.
-  llvm::Value* GetIsNull(const char* name = "is_null");
+  llvm::Value* GetIsNull(const char* name = "is_null") const;
 
   /// Get the 'val' field of the *Val. Do not call if this represents a StringVal or
   /// TimestampVal. If this represents a DecimalVal, returns 'val4', 'val8', or 'val16'
@@ -180,19 +182,18 @@ class CodegenAnyVal {
   void SetDate(llvm::Value* date);
   void SetTimeOfDay(llvm::Value* time_of_day);
 
-  /// Allocas and stores this value in an unlowered pointer, and returns the pointer. This
-  /// *Val should be non-null.
-  llvm::Value* GetUnloweredPtr();
+  /// Stores this value in an alloca allocation, and returns the pointer, which has the
+  /// lowered type. This *Val should be non-null. The output variable is called 'name'.
+  llvm::Value* GetLoweredPtr(const std::string& name = "") const;
+
+  /// Stores this value in an alloca allocation, and returns the pointer, which has the
+  /// unlowered type. This *Val should be non-null. The output variable is called 'name'.
+  llvm::Value* GetUnloweredPtr(const std::string& name = "") const;
 
   /// Set this *Val's value based on 'raw_val'. 'raw_val' should be a native type,
   /// StringValue, or TimestampValue.
   void SetFromRawValue(llvm::Value* raw_val);
 
-  /// Set this *Val's value based on void* 'raw_ptr'. 'raw_ptr' should be a pointer to a
-  /// native type, StringValue, or TimestampValue (i.e. the value returned by an
-  /// interpreted compute fn).
-  void SetFromRawPtr(llvm::Value* raw_ptr);
-
   /// Converts this *Val's value to a native type, StringValue, TimestampValue, etc.
   /// This should only be used if this *Val is not null.
   ///
@@ -235,9 +236,23 @@ class CodegenAnyVal {
   /// this < 'other', 0 if this == 'other', > 0 if this > 'other'.
   llvm::Value* Compare(CodegenAnyVal* other, const char* name = "result");
 
+  /// Generate code to branch to 'null_block' if this value is NULL. The branch terminates
+  /// the current BasicBlock, so a new BasicBlock for the non-NULL case is also created,
+  /// and builder's insert position is set to the start of the non-NULL block.
+  ///
+  /// This corresponds to the C++ code:
+  /// if (val.is_null) goto null_block;
+  ///
+  /// non_null_block:
+  ///   <-- Builder insert position after this function returns.
+  /// ...
+  /// null_block:
+  /// ...
+  void CodegenBranchIfNull(LlvmBuilder* builder, llvm::BasicBlock* null_block);
+
   /// Ctor for created an uninitialized CodegenAnYVal that can be assigned to later.
   CodegenAnyVal()
-    : type_(INVALID_TYPE), value_(NULL), name_(NULL), codegen_(NULL), builder_(NULL) { }
+    : type_(INVALID_TYPE), value_(NULL), name_(NULL), codegen_(NULL), builder_(NULL) {}
 
  private:
   ColumnType type_;
@@ -245,7 +260,7 @@ class CodegenAnyVal {
   const char* name_;
 
   LlvmCodeGen* codegen_;
-  LlvmCodeGen::LlvmBuilder* builder_;
+  LlvmBuilder* builder_;
 
   /// Helper function for getting the top (most significant) half of 'v'.
   /// 'v' should have width = 'num_bits' * 2 and be an integer type.
diff --git a/be/src/codegen/gen_ir_descriptions.py b/be/src/codegen/gen_ir_descriptions.py
index ddcd1dbe8..09f3a2014 100755
--- a/be/src/codegen/gen_ir_descriptions.py
+++ b/be/src/codegen/gen_ir_descriptions.py
@@ -59,8 +59,8 @@ ir_functions = [
    "_ZN6impala26PartitionedAggregationNode22ProcessBatchNoGroupingEPNS_8RowBatchE"],
   ["PART_AGG_NODE_PROCESS_BATCH_STREAMING",
    "_ZN6impala26PartitionedAggregationNode21ProcessBatchStreamingEbNS_13TPrefetchMode4typeEPNS_8RowBatchES4_PNS_12HashTableCtxEPi"],
-  ["PART_AGG_NODE_GET_EXPR_CTX",
-   "_ZNK6impala26PartitionedAggregationNode17GetAggExprContextEi"],
+  ["PART_AGG_NODE_GET_EXPR_CTXS",
+   "_ZNK6impala26PartitionedAggregationNode18GetAggExprContextsEi"],
   ["AVG_UPDATE_BIGINT",
    "_ZN6impala18AggregateFunctions9AvgUpdateIN10impala_udf9BigIntValEEEvPNS2_15FunctionContextERKT_PNS2_9StringValE"],
   ["AVG_UPDATE_DOUBLE",
diff --git a/be/src/codegen/llvm-codegen-test.cc b/be/src/codegen/llvm-codegen-test.cc
index e6f2f7993..178baa2ef 100644
--- a/be/src/codegen/llvm-codegen-test.cc
+++ b/be/src/codegen/llvm-codegen-test.cc
@@ -124,7 +124,7 @@ TEST_F(LlvmCodeGenTest, BadIRFile) {
 // The random int in there is the address of jitted_counter
 Function* CodegenInnerLoop(LlvmCodeGen* codegen, int64_t* jitted_counter, int delta) {
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
 
   LlvmCodeGen::FnPrototype fn_prototype(codegen, "JittedInnerLoop", codegen->void_type());
   Function* jitted_loop_call = fn_prototype.GeneratePrototype();
@@ -173,10 +173,10 @@ TEST_F(LlvmCodeGenTest, ReplaceFnCall) {
   ASSERT_OK(LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), &codegen));
   EXPECT_TRUE(codegen.get() != NULL);
 
-  Function* loop_call = codegen->GetFunction(loop_call_name);
+  Function* loop_call = codegen->GetFunction(loop_call_name, false);
   EXPECT_TRUE(loop_call != NULL);
   EXPECT_TRUE(loop_call->arg_empty());
-  Function* loop = codegen->GetFunction(loop_name);
+  Function* loop = codegen->GetFunction(loop_name, false);
   EXPECT_TRUE(loop != NULL);
   EXPECT_EQ(loop->arg_size(), 1);
 
@@ -261,7 +261,7 @@ Function* CodegenStringTest(LlvmCodeGen* codegen) {
 
   LlvmCodeGen::FnPrototype prototype(codegen, "StringTest", codegen->GetType(TYPE_INT));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("str", string_val_ptr_type));
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
 
   Value* str;
   Function* interop_fn = prototype.GeneratePrototype(&builder, &str);
@@ -340,7 +340,7 @@ TEST_F(LlvmCodeGenTest, MemcpyTest) {
   prototype.AddArgument(LlvmCodeGen::NamedVariable("src", codegen->ptr_type()));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("n", codegen->GetType(TYPE_INT)));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
 
   char src[] = "abcd";
   char dst[] = "aaaa";
@@ -396,9 +396,9 @@ TEST_F(LlvmCodeGenTest, HashTest) {
 
     // Create a codegen'd function that hashes all the types and returns the results.
     // The tuple/values to hash are baked into the codegen for simplicity.
-    LlvmCodeGen::FnPrototype prototype(codegen.get(), "HashTest",
-        codegen->GetType(TYPE_INT));
-    LlvmCodeGen::LlvmBuilder builder(codegen->context());
+    LlvmCodeGen::FnPrototype prototype(
+        codegen.get(), "HashTest", codegen->GetType(TYPE_INT));
+    LlvmBuilder builder(codegen->context());
 
     // Test both byte-size specific hash functions and the generic loop hash function
     Function* fn_fixed = prototype.GeneratePrototype(&builder, NULL);
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index f629d3505..f6e7e972a 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -671,7 +671,7 @@ Status LlvmCodeGen::MaterializeFunction(Function *fn) {
   return MaterializeFunctionHelper(fn);
 }
 
-Function* LlvmCodeGen::GetFunction(const string& symbol) {
+Function* LlvmCodeGen::GetFunction(const string& symbol, bool clone) {
   Function* fn = module_->getFunction(symbol.c_str());
   if (fn == NULL) {
     LOG(ERROR) << "Unable to locate function " << symbol;
@@ -679,6 +679,7 @@ Function* LlvmCodeGen::GetFunction(const string& symbol) {
   }
   Status status = MaterializeFunction(fn);
   if (UNLIKELY(!status.ok())) return NULL;
+  if (clone) return CloneFunction(fn);
   return fn;
 }
 
@@ -1335,15 +1336,15 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
       Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_BIGINT));
       int i = 0;
       while (num_bytes >= 8) {
-        Value* index[] = { GetIntConstant(TYPE_INT, i++) };
-        Value* d = builder.CreateLoad(builder.CreateGEP(ptr, index));
+        Value* index[] = {GetIntConstant(TYPE_INT, i++)};
+        Value* d = builder.CreateLoad(builder.CreateInBoundsGEP(ptr, index));
         result_64 = builder.CreateCall(crc64_fn, ArrayRef<Value*>({result_64, d}));
         num_bytes -= 8;
       }
       result = builder.CreateTrunc(result_64, GetType(TYPE_INT));
-      Value* index[] = { GetIntConstant(TYPE_INT, i * 8) };
+      Value* index[] = {GetIntConstant(TYPE_INT, i * 8)};
       // Update data to past the 8-byte chunks
-      data = builder.CreateGEP(data, index);
+      data = builder.CreateInBoundsGEP(data, index);
     }
 
     if (num_bytes >= 4) {
@@ -1351,8 +1352,8 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
       Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_INT));
       Value* d = builder.CreateLoad(ptr);
       result = builder.CreateCall(crc32_fn, ArrayRef<Value*>({result, d}));
-      Value* index[] = { GetIntConstant(TYPE_INT, 4) };
-      data = builder.CreateGEP(data, index);
+      Value* index[] = {GetIntConstant(TYPE_INT, 4)};
+      data = builder.CreateInBoundsGEP(data, index);
       num_bytes -= 4;
     }
 
@@ -1361,8 +1362,8 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
       Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_SMALLINT));
       Value* d = builder.CreateLoad(ptr);
       result = builder.CreateCall(crc16_fn, ArrayRef<Value*>({result, d}));
-      Value* index[] = { GetIntConstant(TYPE_INT, 2) };
-      data = builder.CreateGEP(data, index);
+      Value* index[] = {GetIntConstant(TYPE_INT, 2)};
+      data = builder.CreateInBoundsGEP(data, index);
       num_bytes -= 2;
     }
 
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index 095f939e0..21f9f7188 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -71,11 +71,15 @@ namespace llvm {
 
 namespace impala {
 
-class CodegenAnyVal;
 class CodegenSymbolEmitter;
 class SubExprElimination;
 class TupleDescriptor;
 
+/// Define builder subclass in case we want to change the template arguments later
+class LlvmBuilder : public llvm::IRBuilder<> {
+  using llvm::IRBuilder<>::IRBuilder;
+};
+
 /// LLVM code generator.  This is the top level object to generate jitted code.
 //
 /// LLVM provides a c++ IR builder interface so IR does not need to be written
@@ -157,9 +161,6 @@ class LlvmCodeGen {
   /// If false, only output IR for functions which were generated.
   std::string GetIR(bool full_module) const;
 
-  /// Typedef builder in case we want to change the template arguments later
-  typedef llvm::IRBuilder<> LlvmBuilder;
-
   /// Utility struct that wraps a variable name and llvm type.
   struct NamedVariable {
     std::string name;
@@ -330,18 +331,22 @@ class LlvmCodeGen {
   /// recursively. Returns NULL if there is any error.
   ///
   /// If 'clone' is true, a clone of the function will be returned. Clones should be used
-  /// iff the caller will modify the returned function. This avoids clobbering the
-  /// function in case other users need it, but we don't clone if we can avoid it to
-  /// reduce compilation time.
+  /// iff the caller will modify the returned function so that the original unmodified
+  /// function remains available. Avoid cloning if possible to reduce compilation time.
   ///
   /// TODO: Return Status instead.
   llvm::Function* GetFunction(IRFunction::Type ir_type, bool clone);
 
   /// Return the function with the symbol name 'symbol' from the module. The returned
-  /// function and its callee will be recursively materialized. The returned function
-  /// isn't cloned. Returns NULL if there is any error.
+  /// function and its callee will be recursively materialized. Returns NULL if there is
+  /// any error.
+  ///
+  /// If 'clone' is true, a clone of the function will be returned. Clones should be used
+  /// iff the caller will modify the returned function so that the original unmodified
+  /// function remains available. Avoid cloning if possible to reduce compilation time.
+  ///
   /// TODO: Return Status instead.
-  llvm::Function* GetFunction(const string& symbol);
+  llvm::Function* GetFunction(const string& symbol, bool clone);
 
   /// Returns the hash function with signature:
   ///   int32_t Hash(int8_t* data, int len, int32_t seed);
@@ -442,8 +447,8 @@ class LlvmCodeGen {
   /// Codegens IR to load array[idx] and returns the loaded value. 'array' should be a
   /// C-style array (e.g. i32*) or an IR array (e.g. [10 x i32]). This function does not
   /// do bounds checking.
-  llvm::Value* CodegenArrayAt(LlvmBuilder*, llvm::Value* array, int idx,
-      const char* name = "");
+  llvm::Value* CodegenArrayAt(
+      LlvmBuilder*, llvm::Value* array, int idx, const char* name = "");
 
   /// Loads a module at 'file' and links it to the module associated with
   /// this LlvmCodeGen object. The module must be on the local filesystem.
diff --git a/be/src/exec/aggregation-node.cc b/be/src/exec/aggregation-node.cc
index 67928ca5e..0607aa307 100644
--- a/be/src/exec/aggregation-node.cc
+++ b/be/src/exec/aggregation-node.cc
@@ -550,6 +550,10 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
       codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME);
   PointerType* expr_ctx_ptr_type = codegen->GetPtrType(ExprContext::LLVM_CLASS_NAME);
   StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen);
+  if (tuple_struct == NULL) {
+    VLOG_QUERY << "Could not codegen UpdateSlot(): could not generate tuple struct.";
+    return NULL;
+  }
   PointerType* tuple_ptr_type = codegen->GetPtrType(tuple_struct);
   PointerType* tuple_row_ptr_type = codegen->GetPtrType(TupleRow::LLVM_CLASS_NAME);
 
@@ -560,7 +564,7 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_tuple", tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[4];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* fn_ctx_arg = args[0];
@@ -588,8 +592,8 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
 
   if (slot_desc->is_nullable()) {
     // Dst is NULL, just update dst slot to src slot and clear null bit
-    Function* clear_null_fn = slot_desc->GetUpdateNullFn(codegen, false);
-    builder.CreateCall(clear_null_fn, ArrayRef<Value*>({agg_tuple_arg}));
+    slot_desc->CodegenSetNullIndicator(
+        codegen, &builder, agg_tuple_arg, codegen->false_value());
   }
 
   // Update the slot
@@ -630,32 +634,23 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
 
       // Create pointer to src_anyval to pass to HllUpdate() function. We must use the
       // unlowered type.
-      Value* src_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          fn, LlvmCodeGen::NamedVariable("src_lowered_ptr", src.value()->getType()));
-      builder.CreateStore(src.value(), src_lowered_ptr);
-      Type* unlowered_ptr_type =
-          CodegenAnyVal::GetUnloweredType(codegen, input_expr->type())->getPointerTo();
-      Value* src_unlowered_ptr =
-          builder.CreateBitCast(src_lowered_ptr, unlowered_ptr_type, "src_unlowered_ptr");
+      Value* src_unlowered_ptr = src.GetUnloweredPtr("src_unlowered_ptr");
 
       // Create StringVal* intermediate argument from dst_value
-      CodegenAnyVal dst_stringval = CodegenAnyVal::GetNonNullVal(
-          codegen, &builder, TYPE_STRING, "dst_stringval");
+      CodegenAnyVal dst_stringval =
+          CodegenAnyVal::GetNonNullVal(codegen, &builder, TYPE_STRING, "dst_stringval");
       dst_stringval.SetFromRawValue(dst_value);
       // Create pointer to dst_stringval to pass to HllUpdate() function. We must use
       // the unlowered type.
-      Value* dst_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          fn, LlvmCodeGen::NamedVariable("dst_lowered_ptr",
-                                         dst_stringval.value()->getType()));
-      builder.CreateStore(dst_stringval.value(), dst_lowered_ptr);
-      unlowered_ptr_type =
+      Value* dst_lowered_ptr = dst_stringval.GetLoweredPtr("dst_lowered_ptr");
+      Type* dst_unlowered_ptr_type =
           codegen->GetPtrType(CodegenAnyVal::GetUnloweredType(codegen, TYPE_STRING));
-      Value* dst_unlowered_ptr =
-          builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr");
+      Value* dst_unlowered_ptr = builder.CreateBitCast(
+          dst_lowered_ptr, dst_unlowered_ptr_type, "dst_unlowered_ptr");
 
       // Call 'hll_fn'
-      builder.CreateCall(hll_fn,
-          ArrayRef<Value*>({fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr}));
+      builder.CreateCall(
+          hll_fn, ArrayRef<Value*>({fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr}));
 
       // Convert StringVal intermediate 'dst_arg' back to StringValue
       Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result");
@@ -765,13 +760,17 @@ Function* AggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen) {
   //     ExprContext** expr_ctx, Tuple* tuple, TupleRow* row)
   // This signature needs to match the non-codegen'd signature exactly.
   StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen);
+  if (tuple_struct == NULL) {
+    VLOG_QUERY << "Could not codegen UpdateSlot(): could not generate tuple struct.";
+    return NULL;
+  }
   PointerType* tuple_ptr = PointerType::get(tuple_struct, 0);
   LlvmCodeGen::FnPrototype prototype(codegen, "UpdateTuple", codegen->void_type());
   prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", agg_node_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_tuple", agg_tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_row", tuple_row_ptr_type));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[3];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
 
diff --git a/be/src/exec/exec-node.cc b/be/src/exec/exec-node.cc
index 510a7d986..255217f87 100644
--- a/be/src/exec/exec-node.cc
+++ b/be/src/exec/exec-node.cc
@@ -519,7 +519,7 @@ Status ExecNode::CodegenEvalConjuncts(LlvmCodeGen* codegen,
       LlvmCodeGen::NamedVariable("num_ctxs", codegen->GetType(TYPE_INT)));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[3];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* ctxs_arg = args[0];
diff --git a/be/src/exec/hash-join-node.cc b/be/src/exec/hash-join-node.cc
index f2ac9283a..7c335bef3 100644
--- a/be/src/exec/hash-join-node.cc
+++ b/be/src/exec/hash-join-node.cc
@@ -548,7 +548,7 @@ Function* HashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen) {
   prototype.AddArgument(LlvmCodeGen::NamedVariable("build_arg", tuple_row_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   Function* fn = prototype.GeneratePrototype(&builder, args);
   Value* out_row_arg = builder.CreateBitCast(args[1], tuple_row_working_type, "out");
@@ -560,8 +560,9 @@ Function* HashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen) {
 
   // Copy probe row
   codegen->CodegenMemcpy(&builder, out_row_arg, probe_row_arg, probe_tuple_row_size_);
-  Value* build_row_idx[] = { codegen->GetIntConstant(TYPE_INT, num_probe_tuples) };
-  Value* build_row_dst = builder.CreateGEP(out_row_arg, build_row_idx, "build_dst_ptr");
+  Value* build_row_idx[] = {codegen->GetIntConstant(TYPE_INT, num_probe_tuples)};
+  Value* build_row_dst =
+      builder.CreateInBoundsGEP(out_row_arg, build_row_idx, "build_dst_ptr");
 
   // Copy build row.
   BasicBlock* build_not_null_block = BasicBlock::Create(context, "build_not_null", fn);
@@ -578,9 +579,8 @@ Function* HashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen) {
     // to work.
     builder.SetInsertPoint(build_null_block);
     for (int i = 0; i < num_build_tuples; ++i) {
-      Value* array_idx[] =
-          { codegen->GetIntConstant(TYPE_INT, i + num_probe_tuples) };
-      Value* dst = builder.CreateGEP(out_row_arg, array_idx, "dst_tuple_ptr");
+      Value* array_idx[] = {codegen->GetIntConstant(TYPE_INT, i + num_probe_tuples)};
+      Value* dst = builder.CreateInBoundsGEP(out_row_arg, array_idx, "dst_tuple_ptr");
       builder.CreateStore(codegen->null_ptr_value(), dst);
     }
     builder.CreateRetVoid();
diff --git a/be/src/exec/hash-table.cc b/be/src/exec/hash-table.cc
index 74fc53434..8d45ec7bb 100644
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -567,8 +567,8 @@ string HashTable::PrintStats() const {
 // Helper function to store a value into the results buffer if the expr
 // evaluated to NULL.  We don't want (NULL, 1) to hash to the same as (0,1) so
 // we'll pick a more random value.
-static void CodegenAssignNullValue(LlvmCodeGen* codegen,
-    LlvmCodeGen::LlvmBuilder* builder, Value* dst, const ColumnType& type) {
+static void CodegenAssignNullValue(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* dst, const ColumnType& type) {
   uint64_t fnv_seed = HashUtil::FNV_SEED;
 
   if (type.type == TYPE_STRING || type.type == TYPE_VARCHAR) {
@@ -715,11 +715,11 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function**
   prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", this_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("expr_values", codegen->ptr_type()));
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("expr_values_null",
-      codegen->ptr_type()));
+  prototype.AddArgument(
+      LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type()));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* this_ptr = args[0];
@@ -740,10 +740,10 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function**
     // vector of exprs
     // Convert result buffer to llvm ptr type
     int offset = expr_values_cache_.expr_values_offsets(i);
-    Value* loc = builder.CreateGEP(
+    Value* loc = builder.CreateInBoundsGEP(
         NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc_addr");
-    Value* llvm_loc = builder.CreatePointerCast(loc,
-        codegen->GetPtrType(ctxs[i]->root()->type()), "loc");
+    Value* llvm_loc = builder.CreatePointerCast(
+        loc, codegen->GetPtrType(ctxs[i]->root()->type()), "loc");
 
     BasicBlock* null_block = BasicBlock::Create(context, "null", *fn);
     BasicBlock* not_null_block = BasicBlock::Create(context, "not_null", *fn);
@@ -768,7 +768,7 @@ Status HashTableCtx::CodegenEvalRow(LlvmCodeGen* codegen, bool build, Function**
 
     // Set null-byte result
     Value* null_byte = builder.CreateZExt(is_null, codegen->GetType(TYPE_TINYINT));
-    Value* llvm_null_byte_loc = builder.CreateGEP(
+    Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(
         NULL, expr_values_null, codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc");
     builder.CreateStore(null_byte, llvm_null_byte_loc);
     builder.CreateCondBr(is_null, null_block, not_null_block);
@@ -863,7 +863,7 @@ Status HashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, Funct
       LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type()));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[3];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* this_arg = args[0];
@@ -910,7 +910,7 @@ Status HashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, Funct
       Value* str_null_result = NULL;
 
       int offset = expr_values_cache_.expr_values_offsets(i);
-      Value* llvm_loc = builder.CreateGEP(
+      Value* llvm_loc = builder.CreateInBoundsGEP(
           NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc_addr");
 
       // If the hash table stores nulls, we need to check if the stringval
@@ -920,11 +920,11 @@ Status HashTableCtx::CodegenHashRow(LlvmCodeGen* codegen, bool use_murmur, Funct
         not_null_block = BasicBlock::Create(context, "not_null", *fn);
         continue_block = BasicBlock::Create(context, "continue", *fn);
 
-        Value* llvm_null_byte_loc = builder.CreateGEP(NULL, expr_values_null,
+        Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(NULL, expr_values_null,
             codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc");
         Value* null_byte = builder.CreateLoad(llvm_null_byte_loc, "null_byte");
-        Value* is_null = builder.CreateICmpNE(null_byte,
-            codegen->GetIntConstant(TYPE_TINYINT, 0), "is_null");
+        Value* is_null = builder.CreateICmpNE(
+            null_byte, codegen->GetIntConstant(TYPE_TINYINT, 0), "is_null");
         builder.CreateCondBr(is_null, null_block, not_null_block);
 
         // For null, we just want to call the hash function on the portion of
@@ -1071,7 +1071,7 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* codegen, bool force_null_equalit
       LlvmCodeGen::NamedVariable("expr_values_null", codegen->ptr_type()));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* this_ptr = args[0];
@@ -1116,19 +1116,19 @@ Status HashTableCtx::CodegenEquals(LlvmCodeGen* codegen, bool force_null_equalit
     // We consider null values equal if we are comparing build rows or if the join
     // predicate is <=>
     if (force_null_equality || finds_nulls_[i]) {
-      Value* llvm_null_byte_loc = builder.CreateGEP(
+      Value* llvm_null_byte_loc = builder.CreateInBoundsGEP(
           NULL, expr_values_null, codegen->GetIntConstant(TYPE_INT, i), "null_byte_loc");
       Value* null_byte = builder.CreateLoad(llvm_null_byte_loc);
-      row_is_null = builder.CreateICmpNE(null_byte,
-          codegen->GetIntConstant(TYPE_TINYINT, 0));
+      row_is_null =
+          builder.CreateICmpNE(null_byte, codegen->GetIntConstant(TYPE_TINYINT, 0));
     }
 
     // Get llvm value for row_val from 'expr_values'
     int offset = expr_values_cache_.expr_values_offsets(i);
-    Value* loc = builder.CreateGEP(
+    Value* loc = builder.CreateInBoundsGEP(
         NULL, expr_values, codegen->GetIntConstant(TYPE_INT, offset), "loc");
-    Value* row_val = builder.CreatePointerCast(loc,
-        codegen->GetPtrType(build_expr_ctxs_[i]->root()->type()), "row_val");
+    Value* row_val = builder.CreatePointerCast(
+        loc, codegen->GetPtrType(build_expr_ctxs_[i]->root()->type()), "row_val");
 
     // Branch for GetValue() returning NULL
     builder.CreateCondBr(is_null, null_block, not_null_block);
diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc
index a073cef84..c217b651a 100644
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -776,10 +776,10 @@ void HdfsAvroScanner::SetStatusValueOverflow(TErrorCode::type error_code, int64_
 // bail_out:           ; preds = %read_field11, %end_field3, %read_field2, %end_field,
 //   ret i1 false      ;         %read_field, %entry
 // }
-Status HdfsAvroScanner::CodegenMaterializeTuple(HdfsScanNodeBase* node,
-    LlvmCodeGen* codegen, Function** materialize_tuple_fn) {
+Status HdfsAvroScanner::CodegenMaterializeTuple(
+    HdfsScanNodeBase* node, LlvmCodeGen* codegen, Function** materialize_tuple_fn) {
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
 
   Type* this_type = codegen->GetType(HdfsAvroScanner::LLVM_CLASS_NAME);
   DCHECK(this_type != NULL);
@@ -853,8 +853,7 @@ Status HdfsAvroScanner::CodegenReadRecord(
   }
   DCHECK_EQ(record.schema->type, AVRO_RECORD);
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder* builder =
-      reinterpret_cast<LlvmCodeGen::LlvmBuilder*>(void_builder);
+  LlvmBuilder* builder = reinterpret_cast<LlvmBuilder*>(void_builder);
 
   // Codegen logic for parsing each field and, if necessary, populating a slot with the
   // result.
@@ -911,9 +910,8 @@ Status HdfsAvroScanner::CodegenReadRecord(
       // Write null field IR
       builder->SetInsertPoint(null_block);
       if (slot_idx != HdfsScanNode::SKIP_COLUMN) {
-        Function* set_null_fn = slot_desc->GetUpdateNullFn(codegen, true);
-        DCHECK(set_null_fn != NULL);
-        builder->CreateCall(set_null_fn, ArrayRef<Value*>({tuple_val}));
+        slot_desc->CodegenSetNullIndicator(
+            codegen, builder, tuple_val, codegen->true_value());
       }
       // LLVM requires all basic blocks to end with a terminating instruction
       builder->CreateBr(end_field_block);
@@ -944,11 +942,10 @@ Status HdfsAvroScanner::CodegenReadRecord(
 }
 
 Status HdfsAvroScanner::CodegenReadScalar(const AvroSchemaElement& element,
-    SlotDescriptor* slot_desc, LlvmCodeGen* codegen, void* void_builder,
-    Value* this_val, Value* pool_val, Value* tuple_val, Value* data_val,
-    Value* data_end_val, Value** ret_val) {
-  LlvmCodeGen::LlvmBuilder* builder =
-      reinterpret_cast<LlvmCodeGen::LlvmBuilder*>(void_builder);
+    SlotDescriptor* slot_desc, LlvmCodeGen* codegen, void* void_builder, Value* this_val,
+    Value* pool_val, Value* tuple_val, Value* data_val, Value* data_end_val,
+    Value** ret_val) {
+  LlvmBuilder* builder = reinterpret_cast<LlvmBuilder*>(void_builder);
   Function* read_field_fn;
   switch (element.schema->type) {
     case AVRO_BOOLEAN:
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index 388552292..390085fc2 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -380,7 +380,7 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("error_in_row", uint8_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[8];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
 
@@ -411,8 +411,8 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
   // Put tuple in tuple_row
   Value* tuple_row_typed =
       builder.CreateBitCast(tuple_row_arg, PointerType::get(tuple_ptr_type, 0));
-  Value* tuple_row_idxs[] = { codegen->GetIntConstant(TYPE_INT, node->tuple_idx()) };
-  Value* tuple_in_row_addr = builder.CreateGEP(tuple_row_typed, tuple_row_idxs);
+  Value* tuple_row_idxs[] = {codegen->GetIntConstant(TYPE_INT, node->tuple_idx())};
+  Value* tuple_in_row_addr = builder.CreateInBoundsGEP(tuple_row_typed, tuple_row_idxs);
   builder.CreateStore(tuple_arg, tuple_in_row_addr);
   builder.CreateBr(parse_block);
 
@@ -447,11 +447,12 @@ Status HdfsScanner::CodegenWriteCompleteTuple(HdfsScanNodeBase* node,
         codegen->GetIntConstant(TYPE_INT, 1),
       };
       Value* error_idxs[] = {
-        codegen->GetIntConstant(TYPE_INT, slot_idx),
+          codegen->GetIntConstant(TYPE_INT, slot_idx),
       };
-      Value* data_ptr = builder.CreateGEP(fields_arg, data_idxs, "data_ptr");
-      Value* len_ptr = builder.CreateGEP(fields_arg, len_idxs, "len_ptr");
-      Value* error_ptr = builder.CreateGEP(errors_arg, error_idxs, "slot_error_ptr");
+      Value* data_ptr = builder.CreateInBoundsGEP(fields_arg, data_idxs, "data_ptr");
+      Value* len_ptr = builder.CreateInBoundsGEP(fields_arg, len_idxs, "len_ptr");
+      Value* error_ptr =
+          builder.CreateInBoundsGEP(errors_arg, error_idxs, "slot_error_ptr");
       Value* data = builder.CreateLoad(data_ptr, "data");
       Value* len = builder.CreateLoad(len_ptr, "len");
 
diff --git a/be/src/exec/old-hash-table.cc b/be/src/exec/old-hash-table.cc
index f51bc7483..44bf7e4d0 100644
--- a/be/src/exec/old-hash-table.cc
+++ b/be/src/exec/old-hash-table.cc
@@ -182,8 +182,8 @@ int OldHashTable::AddBloomFilters() {
 // Helper function to store a value into the results buffer if the expr
 // evaluated to NULL.  We don't want (NULL, 1) to hash to the same as (0,1) so
 // we'll pick a more random value.
-static void CodegenAssignNullValue(LlvmCodeGen* codegen,
-    LlvmCodeGen::LlvmBuilder* builder, Value* dst, const ColumnType& type) {
+static void CodegenAssignNullValue(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* dst, const ColumnType& type) {
   uint64_t fnv_seed = HashUtil::FNV_SEED;
 
   if (type.type == TYPE_STRING || type.type == TYPE_VARCHAR) {
@@ -288,7 +288,7 @@ Function* OldHashTable::CodegenEvalTupleRow(LlvmCodeGen* codegen, bool build) {
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[2];
   Function* fn = prototype.GeneratePrototype(&builder, args);
 
@@ -429,7 +429,7 @@ Function* OldHashTable::CodegenHashCurrentRow(LlvmCodeGen* codegen) {
   prototype.AddArgument(LlvmCodeGen::NamedVariable("this_ptr", this_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* this_arg;
   Function* fn = prototype.GeneratePrototype(&builder, &this_arg);
 
@@ -612,7 +612,7 @@ Function* OldHashTable::CodegenEquals(LlvmCodeGen* codegen) {
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[2];
   Function* fn = prototype.GeneratePrototype(&builder, args);
   Value* row = args[1];
diff --git a/be/src/exec/partitioned-aggregation-node-ir.cc b/be/src/exec/partitioned-aggregation-node-ir.cc
index 83362e773..b69613170 100644
--- a/be/src/exec/partitioned-aggregation-node-ir.cc
+++ b/be/src/exec/partitioned-aggregation-node-ir.cc
@@ -26,7 +26,7 @@
 
 using namespace impala;
 
-ExprContext* PartitionedAggregationNode::GetAggExprContext(int i) const {
+ExprContext* const* PartitionedAggregationNode::GetAggExprContexts(int i) const {
   return agg_expr_ctxs_[i];
 }
 
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index 17fbb6c7b..56db26fcc 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -17,21 +17,20 @@
 
 #include "exec/partitioned-aggregation-node.h"
 
-#include <algorithm>
 #include <math.h>
+#include <algorithm>
 #include <set>
 #include <sstream>
-#include <gutil/strings/substitute.h>
-#include <thrift/protocol/TDebugProtocol.h>
 
 #include "codegen/codegen-anyval.h"
 #include "codegen/llvm-codegen.h"
 #include "exec/hash-table.inline.h"
 #include "exprs/agg-fn-evaluator.h"
 #include "exprs/anyval-util.h"
-#include "exprs/expr.h"
 #include "exprs/expr-context.h"
+#include "exprs/expr.h"
 #include "exprs/slot-ref.h"
+#include "gutil/strings/substitute.h"
 #include "runtime/buffered-tuple-stream.inline.h"
 #include "runtime/descriptors.h"
 #include "runtime/mem-pool.h"
@@ -40,8 +39,8 @@
 #include "runtime/row-batch.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
-#include "runtime/tuple.h"
 #include "runtime/tuple-row.h"
+#include "runtime/tuple.h"
 #include "udf/udf-internal.h"
 #include "util/debug-util.h"
 #include "util/runtime-profile-counters.h"
@@ -150,21 +149,20 @@ Status PartitionedAggregationNode::Init(const TPlanNode& tnode, RuntimeState* st
       Expr::CreateExprTrees(pool_, tnode.agg_node.grouping_exprs, &grouping_expr_ctxs_));
   for (int i = 0; i < tnode.agg_node.aggregate_functions.size(); ++i) {
     AggFnEvaluator* evaluator;
-    RETURN_IF_ERROR(AggFnEvaluator::Create(
-        pool_, tnode.agg_node.aggregate_functions[i], &evaluator));
+    RETURN_IF_ERROR(
+        AggFnEvaluator::Create(pool_, tnode.agg_node.aggregate_functions[i], &evaluator));
     aggregate_evaluators_.push_back(evaluator);
-    ExprContext* agg_expr_ctx;
-    if (evaluator->input_expr_ctxs().size() == 1) {
-      agg_expr_ctx = evaluator->input_expr_ctxs()[0];
+    ExprContext* const* agg_expr_ctxs;
+    if (evaluator->input_expr_ctxs().size() > 0) {
+      agg_expr_ctxs = evaluator->input_expr_ctxs().data();
     } else {
-      // CodegenUpdateSlot() can only support aggregate operator with only one ExprContext
-      // so it doesn't support operator such as group_concat. There are also aggregate
-      // operators with no ExprContext (e.g. count(*)). In cases above, 'agg_expr_ctxs_'
-      // will contain NULL for that entry.
+      // Some aggregate functions have no input expressions and therefore no ExprContext
+      // (e.g. count(*)). In those cases, 'agg_expr_ctxs_' will contain NULL for that
+      // entry.
       DCHECK(evaluator->agg_op() == AggFnEvaluator::OTHER || evaluator->is_count_star());
-      agg_expr_ctx = NULL;
+      agg_expr_ctxs = NULL;
     }
-    agg_expr_ctxs_.push_back(agg_expr_ctx);
+    agg_expr_ctxs_.push_back(agg_expr_ctxs);
   }
   return Status::OK();
 }
@@ -1059,32 +1057,31 @@ void PartitionedAggregationNode::InitAggSlots(
       intermediate_tuple_desc_->slots().begin() + grouping_expr_ctxs_.size();
   for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++slot_desc) {
     AggFnEvaluator* evaluator = aggregate_evaluators_[i];
+    // To minimize branching on the UpdateTuple path, initialize the result value so that
+    // the Add() UDA function can ignore the NULL bit of its destination value. E.g. for
+    // SUM(), if we initialize the destination value to 0 (with the NULL bit set), we can
+    // just start adding to the destination value (rather than repeatedly checking the
+    // destination NULL bit. The codegen'd version of UpdateSlot() exploits this to
+    // eliminate a branch per value.
+    //
+    // For boolean and numeric types, the default values are false/0, so the nullable
+    // aggregate functions SUM() and AVG() produce the correct result. For MIN()/MAX(),
+    // initialize the value to max/min possible value for the same effect.
     evaluator->Init(agg_fn_ctxs[i], intermediate_tuple);
-    // Codegen specific path for min/max.
-    // To minimize branching on the UpdateTuple path, initialize the result value
-    // so that UpdateTuple doesn't have to check if the aggregation
-    // dst slot is null.
-    // TODO: remove when we don't use the irbuilder for codegen here.  This optimization
-    // will no longer be necessary when all aggregates are implemented with the UDA
-    // interface.
-    if ((*slot_desc)->type().type != TYPE_STRING &&
-        (*slot_desc)->type().type != TYPE_VARCHAR &&
-        (*slot_desc)->type().type != TYPE_TIMESTAMP &&
-        (*slot_desc)->type().type != TYPE_CHAR) {
+
+    AggFnEvaluator::AggregationOp agg_op = evaluator->agg_op();
+    if ((agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX)
+        && !evaluator->intermediate_type().IsStringType()
+        && !evaluator->intermediate_type().IsTimestampType()) {
       ExprValue default_value;
       void* default_value_ptr = NULL;
-      switch (evaluator->agg_op()) {
-        case AggFnEvaluator::MIN:
-          default_value_ptr = default_value.SetToMax((*slot_desc)->type());
-          RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, NULL);
-          break;
-        case AggFnEvaluator::MAX:
-          default_value_ptr = default_value.SetToMin((*slot_desc)->type());
-          RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, NULL);
-          break;
-        default:
-          break;
+      if (evaluator->agg_op() == AggFnEvaluator::MIN) {
+        default_value_ptr = default_value.SetToMax((*slot_desc)->type());
+      } else {
+        DCHECK_EQ(evaluator->agg_op(), AggFnEvaluator::MAX);
+        default_value_ptr = default_value.SetToMin((*slot_desc)->type());
       }
+      RawValue::Write(default_value_ptr, intermediate_tuple, *slot_desc, NULL);
     }
   }
 }
@@ -1450,111 +1447,123 @@ Status PartitionedAggregationNode::QueryMaintenance(RuntimeState* state) {
 // void UpdateSlot(FunctionContext* agg_fn_ctx, ExprContext* agg_expr_ctx,
 //     AggTuple* agg_tuple, char** row)
 //
-// The IR for sum(double_col) is:
+// The IR for sum(double_col), which is constructed directly with the IRBuilder, is:
 //
 // define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %agg_expr_ctx,
-//                         { i8, [7 x i8], double }* %agg_tuple,
-//                         %"class.impala::TupleRow"* %row) #34 {
-//
+//    %"class.impala::ExprContext"** %agg_expr_ctxs,
+//    { i8, [7 x i8], double }* %agg_tuple, %"class.impala::TupleRow"* %row) #34 {
 // entry:
-//   %src = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* %agg_expr_ctx,
-//       %"class.impala::TupleRow"* %row)
-//   %0 = extractvalue { i8, double } %src, 0
-//   %is_null = trunc i8 %0 to i1
-//   br i1 %is_null, label %ret, label %src_not_null
-//
-// src_not_null:                                     ; preds = %entry
+//   %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*,
+//      %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0
+//   %expr_ctx = load %"class.impala::ExprContext"*,
+//      %"class.impala::ExprContext"** %expr_ctx_ptr
+//   %input0 = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* %expr_ctx,
+//      %"class.impala::TupleRow"* %row)
 //   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], double },
 //       { i8, [7 x i8], double }* %agg_tuple, i32 0, i32 2
-//   call void @SetNotNull({ i8, [7 x i8], double }* %agg_tuple)
 //   %dst_val = load double, double* %dst_slot_ptr
-//   %val = extractvalue { i8, double } %src, 1
+//   %0 = extractvalue { i8, double } %input0, 0
+//   %is_null = trunc i8 %0 to i1
+//   br i1 %is_null, label %ret, label %not_null
+//
+// ret:                                              ; preds = %not_null, %entry
+//   ret void
+//
+// not_null:                                         ; preds = %entry
+//   %val = extractvalue { i8, double } %input0, 1
 //   %1 = fadd double %dst_val, %val
+//   %2 = bitcast { i8, [7 x i8], double }* %agg_tuple to i8*
+//   %null_byte_ptr = getelementptr i8, i8* %2, i32 0
+//   %null_byte = load i8, i8* %null_byte_ptr
+//   %null_bit_cleared = and i8 %null_byte, -2
+//   store i8 %null_bit_cleared, i8* %null_byte_ptr
 //   store double %1, double* %dst_slot_ptr
 //   br label %ret
-//
-// ret:                                              ; preds = %src_not_null, %entry
-//   ret void
 // }
 //
-// The IR for ndv(double_col) is:
+// The IR for min(timestamp_col), which uses the UDA interface, is:
 //
 // define void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %agg_expr_ctx,
-//                         { i8, [7 x i8], %"struct.impala::StringValue" }* %agg_tuple,
-//                         %"class.impala::TupleRow"* %row) #34 {
+//      %"class.impala::ExprContext"** %agg_expr_ctxs,
+//      { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple,
+//      %"class.impala::TupleRow"* %row) #34 {
 // entry:
-//   %dst_lowered_ptr = alloca { i64, i8* }
-//   %src_lowered_ptr = alloca { i8, double }
-//   %src = call { i8, double } @GetSlotRef(%"class.impala::ExprContext"* %agg_expr_ctx,
-//       %"class.impala::TupleRow"* %row)
-//   %0 = extractvalue { i8, double } %src, 0
-//   %is_null = trunc i8 %0 to i1
-//   br i1 %is_null, label %ret, label %src_not_null
-//
-// src_not_null:                                     ; preds = %entry
-//   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8], %"struct.impala::StringValue" },
-//       { i8, [7 x i8], %"struct.impala::StringValue" }* %agg_tuple, i32 0, i32 2
-//   call void @SetNotNull({ i8, [7 x i8], %"struct.impala::StringValue" }* %agg_tuple)
-//   %dst_val =
-//       load %"struct.impala::StringValue", %"struct.impala::StringValue"* %dst_slot_ptr
-//   store { i8, double } %src, { i8, double }* %src_lowered_ptr
-//   %src_unlowered_ptr =
-//       bitcast { i8, double }* %src_lowered_ptr to %"struct.impala_udf::DoubleVal"*
-//   %ptr = extractvalue %"struct.impala::StringValue" %dst_val, 0
-//   %dst = insertvalue { i64, i8* } zeroinitializer, i8* %ptr, 1
-//   %len = extractvalue %"struct.impala::StringValue" %dst_val, 1
-//   %1 = extractvalue { i64, i8* } %dst, 0
-//   %2 = zext i32 %len to i64
-//   %3 = shl i64 %2, 32
-//   %4 = and i64 %1, 4294967295
-//   %5 = or i64 %4, %3
-//   %dst1 = insertvalue { i64, i8* } %dst, i64 %5, 0
-//   store { i64, i8* } %dst1, { i64, i8* }* %dst_lowered_ptr
-//   %dst_unlowered_ptr =
-//       bitcast { i64, i8* }* %dst_lowered_ptr to %"struct.impala_udf::StringVal"*
-//   call void @HllUpdate(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                        %"struct.impala_udf::DoubleVal"* %src_unlowered_ptr,
-//                        %"struct.impala_udf::StringVal"* %dst_unlowered_ptr)
-//   %anyval_result = load { i64, i8* }, { i64, i8* }* %dst_lowered_ptr
-//   %6 = extractvalue { i64, i8* } %anyval_result, 0
-//   %7 = ashr i64 %6, 32
-//   %8 = trunc i64 %7 to i32
-//   %9 = insertvalue %"struct.impala::StringValue" zeroinitializer, i32 %8, 1
-//   %10 = extractvalue { i64, i8* } %anyval_result, 1
-//   %11 = insertvalue %"struct.impala::StringValue" %9, i8* %10, 0
-//   store %"struct.impala::StringValue" %11, %"struct.impala::StringValue"* %dst_slot_ptr
+//   %dst_lowered_ptr = alloca { i64, i64 }
+//   %input_lowered_ptr = alloca { i64, i64 }
+//   %expr_ctx_ptr = getelementptr %"class.impala::ExprContext"*,
+//        %"class.impala::ExprContext"** %agg_expr_ctxs, i32 0
+//   %expr_ctx = load %"class.impala::ExprContext"*,
+//        %"class.impala::ExprContext"** %expr_ctx_ptr
+//   %input0 = call { i64, i64 } @GetSlotRef(%"class.impala::ExprContext"* %expr_ctx,
+//        %"class.impala::TupleRow"* %row)
+//   %dst_slot_ptr = getelementptr inbounds { i8, [7 x i8],
+//        %"class.impala::TimestampValue" }, { i8, [7 x i8],
+//        %"class.impala::TimestampValue" }* %agg_tuple, i32 0, i32 2
+//   %dst_val = load %"class.impala::TimestampValue",
+//        %"class.impala::TimestampValue"* %dst_slot_ptr
+//   %0 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8*
+//   %null_byte_ptr = getelementptr i8, i8* %0, i32 0
+//   %null_byte = load i8, i8* %null_byte_ptr
+//   %null_mask = and i8 %null_byte, 1
+//   %is_null = icmp ne i8 %null_mask, 0
+//   %is_null_ext = zext i1 %is_null to i64
+//   %1 = or i64 0, %is_null_ext
+//   %dst = insertvalue { i64, i64 } zeroinitializer, i64 %1, 0
+//   %time_of_day = extractvalue %"class.impala::TimestampValue" %dst_val, 0, 0, 0, 0
+//   %dst1 = insertvalue { i64, i64 } %dst, i64 %time_of_day, 1
+//   %date = extractvalue %"class.impala::TimestampValue" %dst_val, 1, 0, 0
+//   %2 = extractvalue { i64, i64 } %dst1, 0
+//   %3 = zext i32 %date to i64
+//   %4 = shl i64 %3, 32
+//   %5 = and i64 %2, 4294967295
+//   %6 = or i64 %5, %4
+//   %dst2 = insertvalue { i64, i64 } %dst1, i64 %6, 0
+//   store { i64, i64 } %input0, { i64, i64 }* %input_lowered_ptr
+//   %input_unlowered_ptr = bitcast { i64, i64 }* %input_lowered_ptr
+//        to %"struct.impala_udf::TimestampVal"*
+//   store { i64, i64 } %dst2, { i64, i64 }* %dst_lowered_ptr
+//   %dst_unlowered_ptr = bitcast { i64, i64 }* %dst_lowered_ptr
+//        to %"struct.impala_udf::TimestampVal"*
+//   call void
+//        @_ZN6impala18AggregateFunctions3MinIN10impala_udf12TimestampValEEEvPNS2_15FunctionContextERKT_PS6_.2(
+//        %"class.impala_udf::FunctionContext"* %agg_fn_ctx,
+//        %"struct.impala_udf::TimestampVal"* %input_unlowered_ptr,
+//        %"struct.impala_udf::TimestampVal"* %dst_unlowered_ptr)
+//   %anyval_result = load { i64, i64 }, { i64, i64 }* %dst_lowered_ptr
+//   %7 = extractvalue { i64, i64 } %anyval_result, 1
+//   %8 = insertvalue %"class.impala::TimestampValue" zeroinitializer, i64 %7, 0, 0, 0, 0
+//   %9 = extractvalue { i64, i64 } %anyval_result, 0
+//   %10 = ashr i64 %9, 32
+//   %11 = trunc i64 %10 to i32
+//   %12 = insertvalue %"class.impala::TimestampValue" %8, i32 %11, 1, 0, 0
+//   %13 = extractvalue { i64, i64 } %anyval_result, 0
+//   %result_is_null = trunc i64 %13 to i1
+//   %14 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8*
+//   %null_byte_ptr3 = getelementptr i8, i8* %14, i32 0
+//   %null_byte4 = load i8, i8* %null_byte_ptr3
+//   %null_bit_cleared = and i8 %null_byte4, -2
+//   %15 = sext i1 %result_is_null to i8
+//   %null_bit = and i8 %15, 1
+//   %null_bit_set = or i8 %null_bit_cleared, %null_bit
+//   store i8 %null_bit_set, i8* %null_byte_ptr3
+//   store %"class.impala::TimestampValue" %12,
+//        %"class.impala::TimestampValue"* %dst_slot_ptr
 //   br label %ret
 //
-// ret:                                              ; preds = %src_not_null, %entry
+// ret:                                              ; preds = %entry
 //   ret void
 // }
+//
 Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
     AggFnEvaluator* evaluator, SlotDescriptor* slot_desc, Function** fn) {
-  // TODO: Fix this DCHECK and Init() once CodegenUpdateSlot() can handle AggFnEvaluator
-  // with multiple input expressions (e.g. group_concat).
-  DCHECK_EQ(evaluator->input_expr_ctxs().size(), 1);
-  ExprContext* agg_expr_ctx = evaluator->input_expr_ctxs()[0];
-  Expr* agg_expr = agg_expr_ctx->root();
-
-  // TODO: implement timestamp
-  if (agg_expr->type().type == TYPE_TIMESTAMP &&
-      evaluator->agg_op() != AggFnEvaluator::AVG) {
-    return Status("PartitionedAggregationNode::CodegenUpdateSlot(): timestamp input type "
-        "NYI");
-  }
-
-  Function* agg_expr_fn;
-  RETURN_IF_ERROR(agg_expr->GetCodegendComputeFn(codegen, &agg_expr_fn));
-
   PointerType* fn_ctx_type =
       codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME);
-  PointerType* expr_ctx_type = codegen->GetPtrType(ExprContext::LLVM_CLASS_NAME);
+  PointerType* expr_ctxs_type =
+      codegen->GetPtrPtrType(codegen->GetType(ExprContext::LLVM_CLASS_NAME));
   StructType* tuple_struct = intermediate_tuple_desc_->GetLlvmStruct(codegen);
   if (tuple_struct == NULL) {
     return Status("PartitionedAggregationNode::CodegenUpdateSlot(): failed to generate "
-        "intermediate tuple desc");
+                  "intermediate tuple desc");
   }
   PointerType* tuple_ptr_type = codegen->GetPtrType(tuple_struct);
   PointerType* tuple_row_ptr_type = codegen->GetPtrType(TupleRow::LLVM_CLASS_NAME);
@@ -1562,129 +1571,129 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   // Create UpdateSlot prototype
   LlvmCodeGen::FnPrototype prototype(codegen, "UpdateSlot", codegen->void_type());
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_fn_ctx", fn_ctx_type));
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_expr_ctx", expr_ctx_type));
+  prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_expr_ctxs", expr_ctxs_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("agg_tuple", tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* agg_fn_ctx_arg = args[0];
-  Value* agg_expr_ctx_arg = args[1];
+  Value* agg_expr_ctxs_arg = args[1];
   Value* agg_tuple_arg = args[2];
   Value* row_arg = args[3];
 
-  BasicBlock* src_not_null_block =
-      BasicBlock::Create(codegen->context(), "src_not_null", *fn);
-  BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", *fn);
+  DCHECK_GE(evaluator->input_expr_ctxs().size(), 1);
+  vector<CodegenAnyVal> input_vals;
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    ExprContext* agg_expr_ctx = evaluator->input_expr_ctxs()[i];
+    Expr* agg_expr = agg_expr_ctx->root();
+    Function* agg_expr_fn;
+    RETURN_IF_ERROR(agg_expr->GetCodegendComputeFn(codegen, &agg_expr_fn));
+    DCHECK(agg_expr_fn != NULL);
 
-  // Call expr function to get src slot value
-  Value* agg_expr_fn_args[] = { agg_expr_ctx_arg, row_arg };
-  CodegenAnyVal src = CodegenAnyVal::CreateCallWrapped(
-      codegen, &builder, agg_expr->type(), agg_expr_fn, agg_expr_fn_args, "src");
-
-  Value* src_is_null = src.GetIsNull();
-  builder.CreateCondBr(src_is_null, ret_block, src_not_null_block);
-
-  // Src slot is not null, update dst_slot
-  builder.SetInsertPoint(src_not_null_block);
-  Value* dst_ptr = builder.CreateStructGEP(NULL, agg_tuple_arg, slot_desc->llvm_field_idx(),
-      "dst_slot_ptr");
-  Value* result = NULL;
-
-  if (slot_desc->is_nullable()) {
-    // Dst is NULL, just update dst slot to src slot and clear null bit
-    Function* clear_null_fn = slot_desc->GetUpdateNullFn(codegen, false);
-    builder.CreateCall(clear_null_fn, ArrayRef<Value*>({agg_tuple_arg}));
+    // Call expr function with the matching expr context to get src slot value.
+    Value* expr_ctx_ptr = builder.CreateInBoundsGEP(
+        agg_expr_ctxs_arg, codegen->GetIntConstant(TYPE_INT, i), "expr_ctx_ptr");
+    Value* expr_ctx = builder.CreateLoad(expr_ctx_ptr, "expr_ctx");
+    string input_name = Substitute("input$0", i);
+    input_vals.push_back(
+        CodegenAnyVal::CreateCallWrapped(codegen, &builder, agg_expr->type(), agg_expr_fn,
+            ArrayRef<Value*>({expr_ctx, row_arg}), input_name.c_str()));
   }
 
-  // Update the slot
-  Value* dst_value = builder.CreateLoad(dst_ptr, "dst_val");
-  switch (evaluator->agg_op()) {
-    case AggFnEvaluator::COUNT:
-      if (evaluator->is_merge()) {
-        result = builder.CreateAdd(dst_value, src.GetVal(), "count_sum");
+  AggFnEvaluator::AggregationOp agg_op = evaluator->agg_op();
+  const ColumnType& dst_type = evaluator->intermediate_type();
+  bool dst_is_int_or_float_or_bool = dst_type.IsIntegerType()
+      || dst_type.IsFloatingPointType() || dst_type.IsBooleanType();
+  bool dst_is_numeric_or_bool = dst_is_int_or_float_or_bool || dst_type.IsDecimalType();
+
+  BasicBlock* ret_block = BasicBlock::Create(codegen->context(), "ret", *fn);
+
+  // Emit the code to compute 'result' and set the NULL indicator if needed. First check
+  // for special cases where we can emit a very simple instruction sequence, then fall
+  // back to the general-purpose approach of calling the cross-compiled builtin UDA.
+  CodegenAnyVal& src = input_vals[0];
+  // 'dst_slot_ptr' points to the slot in the aggregate tuple to update.
+  Value* dst_slot_ptr = builder.CreateStructGEP(
+      NULL, agg_tuple_arg, slot_desc->llvm_field_idx(), "dst_slot_ptr");
+  Value* result = NULL;
+  Value* dst_value = builder.CreateLoad(dst_slot_ptr, "dst_val");
+  if (agg_op == AggFnEvaluator::COUNT) {
+    src.CodegenBranchIfNull(&builder, ret_block);
+    if (evaluator->is_merge()) {
+      result = builder.CreateAdd(dst_value, src.GetVal(), "count_sum");
+    } else {
+      result = builder.CreateAdd(
+          dst_value, codegen->GetIntConstant(TYPE_BIGINT, 1), "count_inc");
+    }
+    DCHECK(!slot_desc->is_nullable());
+  } else if ((agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX)
+      && dst_is_numeric_or_bool) {
+    bool is_min = agg_op == AggFnEvaluator::MIN;
+    src.CodegenBranchIfNull(&builder, ret_block);
+    Function* min_max_fn = codegen->CodegenMinMax(slot_desc->type(), is_min);
+    Value* min_max_args[] = {dst_value, src.GetVal()};
+    result =
+        builder.CreateCall(min_max_fn, min_max_args, is_min ? "min_value" : "max_value");
+    // Dst may have been NULL, make sure to unset the NULL bit.
+    DCHECK(slot_desc->is_nullable());
+    slot_desc->CodegenSetNullIndicator(
+        codegen, &builder, agg_tuple_arg, codegen->false_value());
+  } else if (agg_op == AggFnEvaluator::SUM && dst_is_int_or_float_or_bool) {
+    src.CodegenBranchIfNull(&builder, ret_block);
+    if (dst_type.IsFloatingPointType()) {
+      result = builder.CreateFAdd(dst_value, src.GetVal());
+    } else {
+      result = builder.CreateAdd(dst_value, src.GetVal());
+    }
+    // Dst may have been NULL, make sure to unset the NULL bit.
+    DCHECK(slot_desc->is_nullable());
+    slot_desc->CodegenSetNullIndicator(
+        codegen, &builder, agg_tuple_arg, codegen->false_value());
+  } else {
+    // The remaining cases are implemented using the UDA interface.
+    // Create intermediate argument 'dst' from 'dst_value'
+    CodegenAnyVal dst = CodegenAnyVal::GetNonNullVal(codegen, &builder, dst_type, "dst");
+
+    // For a subset of builtins we generate a different code sequence that exploits two
+    // properties of the builtins. First, NULL input values can be skipped. Second, the
+    // value of the slot was initialized in the right way in InitAggSlots() (e.g. 0 for
+    // SUM) that we get the right result if UpdateSlot() pretends that the NULL bit of
+    // 'dst' is unset. Empirically this optimisation makes TPC-H Q1 5-10% faster.
+    bool special_null_handling = !evaluator->intermediate_type().IsStringType()
+        && !evaluator->intermediate_type().IsTimestampType()
+        && (agg_op == AggFnEvaluator::MIN || agg_op == AggFnEvaluator::MAX
+               || agg_op == AggFnEvaluator::SUM || agg_op == AggFnEvaluator::AVG
+               || agg_op == AggFnEvaluator::NDV);
+    if (slot_desc->is_nullable()) {
+      if (special_null_handling) {
+        src.CodegenBranchIfNull(&builder, ret_block);
+        slot_desc->CodegenSetNullIndicator(
+            codegen, &builder, agg_tuple_arg, codegen->false_value());
       } else {
-        result = builder.CreateAdd(dst_value,
-            codegen->GetIntConstant(TYPE_BIGINT, 1), "count_inc");
+        dst.SetIsNull(slot_desc->CodegenIsNull(codegen, &builder, agg_tuple_arg));
       }
-      break;
-    case AggFnEvaluator::MIN: {
-      Function* min_fn = codegen->CodegenMinMax(slot_desc->type(), true);
-      Value* min_args[] = { dst_value, src.GetVal() };
-      result = builder.CreateCall(min_fn, min_args, "min_value");
-      break;
     }
-    case AggFnEvaluator::MAX: {
-      Function* max_fn = codegen->CodegenMinMax(slot_desc->type(), false);
-      Value* max_args[] = { dst_value, src.GetVal() };
-      result = builder.CreateCall(max_fn, max_args, "max_value");
-      break;
+    dst.SetFromRawValue(dst_value);
+
+    // Call the UDA to update/merge 'src' into 'dst', with the result stored in
+    // 'updated_dst_val'.
+    CodegenAnyVal updated_dst_val;
+    RETURN_IF_ERROR(CodegenCallUda(
+        codegen, &builder, evaluator, agg_fn_ctx_arg, input_vals, dst, &updated_dst_val));
+    result = updated_dst_val.ToNativeValue();
+
+    if (slot_desc->is_nullable() && !special_null_handling) {
+      // Set NULL bit in the slot based on the return value.
+      Value* result_is_null = updated_dst_val.GetIsNull("result_is_null");
+      slot_desc->CodegenSetNullIndicator(
+          codegen, &builder, agg_tuple_arg, result_is_null);
     }
-    case AggFnEvaluator::SUM:
-      if (slot_desc->type().type != TYPE_DECIMAL) {
-        if (slot_desc->type().type == TYPE_FLOAT ||
-            slot_desc->type().type == TYPE_DOUBLE) {
-          result = builder.CreateFAdd(dst_value, src.GetVal());
-        } else {
-          result = builder.CreateAdd(dst_value, src.GetVal());
-        }
-        break;
-      }
-      DCHECK_EQ(slot_desc->type().type, TYPE_DECIMAL);
-      // Fall through to xcompiled case
-    case AggFnEvaluator::AVG:
-    case AggFnEvaluator::NDV: {
-      // Get xcompiled update/merge function from IR module
-      const string& symbol = evaluator->is_merge() ?
-                             evaluator->merge_symbol() : evaluator->update_symbol();
-      const ColumnType& dst_type = evaluator->intermediate_type();
-      Function* ir_fn = codegen->GetFunction(symbol);
-      DCHECK(ir_fn != NULL);
-
-      // Clone and replace constants.
-      ir_fn = codegen->CloneFunction(ir_fn);
-      vector<FunctionContext::TypeDesc> arg_types;
-      arg_types.push_back(AnyValUtil::ColumnTypeToTypeDesc(agg_expr->type()));
-      Expr::InlineConstants(AnyValUtil::ColumnTypeToTypeDesc(dst_type), arg_types,
-          codegen, ir_fn);
-
-      // Create pointer to src to pass to ir_fn. We must use the unlowered type.
-      Value* src_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          *fn, LlvmCodeGen::NamedVariable("src_lowered_ptr", src.value()->getType()));
-      builder.CreateStore(src.value(), src_lowered_ptr);
-      Type* unlowered_ptr_type =
-          CodegenAnyVal::GetUnloweredPtrType(codegen, agg_expr->type());
-      Value* src_unlowered_ptr =
-          builder.CreateBitCast(src_lowered_ptr, unlowered_ptr_type, "src_unlowered_ptr");
-
-      // Create intermediate argument 'dst' from 'dst_value'
-      CodegenAnyVal dst = CodegenAnyVal::GetNonNullVal(
-          codegen, &builder, dst_type, "dst");
-      dst.SetFromRawValue(dst_value);
-      // Create pointer to dst to pass to ir_fn. We must use the unlowered type.
-      Value* dst_lowered_ptr = codegen->CreateEntryBlockAlloca(
-          *fn, LlvmCodeGen::NamedVariable("dst_lowered_ptr", dst.value()->getType()));
-      builder.CreateStore(dst.value(), dst_lowered_ptr);
-      unlowered_ptr_type = CodegenAnyVal::GetUnloweredPtrType(codegen, dst_type);
-      Value* dst_unlowered_ptr =
-          builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr");
-
-      // Call 'ir_fn'
-      builder.CreateCall(ir_fn,
-          ArrayRef<Value*>({agg_fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr}));
-
-      // Convert StringVal intermediate 'dst_arg' back to StringValue
-      Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result");
-      result = CodegenAnyVal(codegen, &builder, dst_type, anyval_result).ToNativeValue();
-      break;
-    }
-    default:
-      DCHECK(false) << "bad aggregate operator: " << evaluator->agg_op();
   }
 
   // TODO: Store to register in the loop and store once to memory at the end of the loop.
-  builder.CreateStore(result, dst_ptr);
+  builder.CreateStore(result, dst_slot_ptr);
   builder.CreateBr(ret_block);
 
   builder.SetInsertPoint(ret_block);
@@ -1698,6 +1707,58 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
   return Status::OK();
 }
 
+Status PartitionedAggregationNode::CodegenCallUda(LlvmCodeGen* codegen,
+    LlvmBuilder* builder, AggFnEvaluator* evaluator, Value* agg_fn_ctx,
+    const vector<CodegenAnyVal>& input_vals, const CodegenAnyVal& dst,
+    CodegenAnyVal* updated_dst_val) {
+  DCHECK_EQ(evaluator->input_expr_ctxs().size(), input_vals.size());
+  const string& symbol =
+      evaluator->is_merge() ? evaluator->merge_symbol() : evaluator->update_symbol();
+  const ColumnType& dst_type = evaluator->intermediate_type();
+
+  // TODO: to support actual UDAs, not just builtin functions using the UDA interface,
+  // we need to load the function at this point.
+  Function* uda_fn = codegen->GetFunction(symbol, true);
+  DCHECK(uda_fn != NULL);
+
+  vector<FunctionContext::TypeDesc> arg_types;
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    arg_types.push_back(AnyValUtil::ColumnTypeToTypeDesc(
+        evaluator->input_expr_ctxs()[i]->root()->type()));
+  }
+  Expr::InlineConstants(
+      AnyValUtil::ColumnTypeToTypeDesc(dst_type), arg_types, codegen, uda_fn);
+
+  // Set up arguments for call to UDA, which are the FunctionContext*, followed by
+  // pointers to all input values, followed by a pointer to the destination value.
+  vector<Value*> uda_fn_args;
+  uda_fn_args.push_back(agg_fn_ctx);
+
+  // Create pointers to input args to pass to uda_fn. We must use the unlowered type,
+  // e.g. IntVal, because the UDA interface expects the values to be passed as const
+  // references to the classes.
+  for (int i = 0; i < evaluator->input_expr_ctxs().size(); ++i) {
+    uda_fn_args.push_back(input_vals[i].GetUnloweredPtr("input_unlowered_ptr"));
+  }
+
+  // Create pointer to dst to pass to uda_fn. We must use the unlowered type for the
+  // same reason as above.
+  Value* dst_lowered_ptr = dst.GetLoweredPtr("dst_lowered_ptr");
+  Type* dst_unlowered_ptr_type = CodegenAnyVal::GetUnloweredPtrType(codegen, dst_type);
+  Value* dst_unlowered_ptr = builder->CreateBitCast(
+      dst_lowered_ptr, dst_unlowered_ptr_type, "dst_unlowered_ptr");
+  uda_fn_args.push_back(dst_unlowered_ptr);
+
+  // Call 'uda_fn'
+  builder->CreateCall(uda_fn, uda_fn_args);
+
+  // Convert intermediate 'dst_arg' back to the native type.
+  Value* anyval_result = builder->CreateLoad(dst_lowered_ptr, "anyval_result");
+
+  *updated_dst_val = CodegenAnyVal(codegen, builder, dst_type, anyval_result);
+  return Status::OK();
+}
+
 // IR codegen for the UpdateTuple loop.  This loop is query specific and based on the
 // aggregate functions.  The function signature must match the non- codegen'd UpdateTuple
 // exactly.
@@ -1706,78 +1767,60 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(LlvmCodeGen* codegen,
 //
 // ; Function Attrs: alwaysinline
 // define void @UpdateTuple(%"class.impala::PartitionedAggregationNode"* %this_ptr,
-//                          %"class.impala_udf::FunctionContext"** %agg_fn_ctxs,
-//                          %"class.impala::Tuple"* %tuple,
-//                          %"class.impala::TupleRow"* %row,
-//                          i1 %is_merge) #34 {
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, %"class.impala::Tuple"*
+//      %tuple,
+//      %"class.impala::TupleRow"* %row, i1 %is_merge) #34 {
 // entry:
 //   %tuple1 =
-//       bitcast %"class.impala::Tuple"* %tuple to { i8, [7 x i8], i64, i64, double }*
+//      bitcast %"class.impala::Tuple"* %tuple to { i8, [7 x i8], i64, i64, double }*
 //   %src_slot = getelementptr inbounds { i8, [7 x i8], i64, i64, double },
-//       { i8, [7 x i8], i64, i64, double }* %tuple1, i32 0, i32 2
+//      { i8, [7 x i8], i64, i64, double }* %tuple1, i32 0, i32 2
 //   %count_star_val = load i64, i64* %src_slot
 //   %count_star_inc = add i64 %count_star_val, 1
 //   store i64 %count_star_inc, i64* %src_slot
 //   %0 = getelementptr %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 1
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 1
 //   %agg_fn_ctx = load %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %0
-//   %1 = call %"class.impala::ExprContext"*
-//       @_ZNK6impala26PartitionedAggregationNode17GetAggExprContextEi(
-//           %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 1)
+//      %"class.impala_udf::FunctionContext"** %0
+//   %1 = call %"class.impala::ExprContext"**
+//      @_ZNK6impala26PartitionedAggregationNode18GetAggExprContextsEi(
+//      %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 1)
 //   call void @UpdateSlot(%"class.impala_udf::FunctionContext"* %agg_fn_ctx,
-//                         %"class.impala::ExprContext"* %1,
-//                         { i8, [7 x i8], i64, i64, double }* %tuple1,
-//                         %"class.impala::TupleRow"* %row)
+//      %"class.impala::ExprContext"** %1, { i8, [7 x i8], i64, i64, double }* %tuple1,
+//      %"class.impala::TupleRow"* %row)
 //   %2 = getelementptr %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 2
+//      %"class.impala_udf::FunctionContext"** %agg_fn_ctxs, i32 2
 //   %agg_fn_ctx2 = load %"class.impala_udf::FunctionContext"*,
-//       %"class.impala_udf::FunctionContext"** %2
-//   %3 = call %"class.impala::ExprContext"*
-//       @_ZNK6impala26PartitionedAggregationNode17GetAggExprContextEi(
-//           %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 2)
-//   call void @UpdateSlot.3(%"class.impala_udf::FunctionContext"* %agg_fn_ctx2,
-//                          %"class.impala::ExprContext"* %3,
-//                          { i8, [7 x i8], i64, i64, double }* %tuple1,
-//                          %"class.impala::TupleRow"* %row)
+//      %"class.impala_udf::FunctionContext"** %2
+//   %3 = call %"class.impala::ExprContext"**
+//      @_ZNK6impala26PartitionedAggregationNode18GetAggExprContextsEi(
+//      %"class.impala::PartitionedAggregationNode"* %this_ptr, i32 2)
+//   call void @UpdateSlot.4(%"class.impala_udf::FunctionContext"* %agg_fn_ctx2,
+//      %"class.impala::ExprContext"** %3, { i8, [7 x i8], i64, i64, double }* %tuple1,
+//      %"class.impala::TupleRow"* %row)
 //   ret void
 // }
-Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
-    Function** fn) {
+Status PartitionedAggregationNode::CodegenUpdateTuple(
+    LlvmCodeGen* codegen, Function** fn) {
   SCOPED_TIMER(codegen->codegen_timer());
 
-  int j = grouping_expr_ctxs_.size();
-  for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++j) {
-    SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[j];
-    AggFnEvaluator* evaluator = aggregate_evaluators_[i];
-
-    // Don't codegen things that aren't builtins (for now)
-    if (!evaluator->is_builtin()) {
-      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): UDA codegen NYI");
-    }
-
-    bool supported = true;
-    AggFnEvaluator::AggregationOp op = evaluator->agg_op();
-    PrimitiveType type = slot_desc->type().type;
-    // Char and timestamp intermediates aren't supported
-    if (type == TYPE_TIMESTAMP || type == TYPE_CHAR) supported = false;
-    // Only AVG and NDV support string intermediates
-    if ((type == TYPE_STRING || type == TYPE_VARCHAR) &&
-        !(op == AggFnEvaluator::AVG || op == AggFnEvaluator::NDV)) {
-      supported = false;
-    }
-    if (!supported) {
-      stringstream ss;
-      ss << "Could not codegen PartitionedAggregationNode::UpdateTuple because "
-         << "intermediate type " << slot_desc->type() << " is not yet supported for "
-         << "aggregate function \"" << evaluator->fn_name() << "()\"";
-      return Status(ss.str());
+  for (const SlotDescriptor* slot_desc : intermediate_tuple_desc_->slots()) {
+    if (slot_desc->type().type == TYPE_CHAR) {
+      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): cannot codegen"
+                    "CHAR in aggregations");
     }
   }
 
   if (intermediate_tuple_desc_->GetLlvmStruct(codegen) == NULL) {
     return Status("PartitionedAggregationNode::CodegenUpdateTuple(): failed to generate "
-        "intermediate tuple desc");
+                  "intermediate tuple desc");
+  }
+
+  for (AggFnEvaluator* evaluator : aggregate_evaluators_) {
+    // Don't codegen things that aren't builtins (for now)
+    if (!evaluator->is_builtin()) {
+      return Status("PartitionedAggregationNode::CodegenUpdateTuple(): UDA codegen NYI");
+    }
   }
 
   // Get the types to match the UpdateTuple signature
@@ -1800,7 +1843,7 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("row", tuple_row_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("is_merge", codegen->boolean_type()));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[5];
   *fn = prototype.GeneratePrototype(&builder, &args[0]);
   Value* this_arg = args[0];
@@ -1812,13 +1855,13 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   // TODO: get rid of this by using right type in function signature
   tuple_arg = builder.CreateBitCast(tuple_arg, tuple_ptr, "tuple");
 
-  Function* get_expr_ctx_fn =
-      codegen->GetFunction(IRFunction::PART_AGG_NODE_GET_EXPR_CTX, false);
-  DCHECK(get_expr_ctx_fn != NULL);
+  Function* get_expr_ctxs_fn =
+      codegen->GetFunction(IRFunction::PART_AGG_NODE_GET_EXPR_CTXS, false);
+  DCHECK(get_expr_ctxs_fn != NULL);
 
   // Loop over each expr and generate the IR for that slot.  If the expr is not
   // count(*), generate a helper IR function to update the slot and call that.
-  j = grouping_expr_ctxs_.size();
+  int j = grouping_expr_ctxs_.size();
   for (int i = 0; i < aggregate_evaluators_.size(); ++i, ++j) {
     SlotDescriptor* slot_desc = intermediate_tuple_desc_->slots()[j];
     AggFnEvaluator* evaluator = aggregate_evaluators_[i];
@@ -1838,9 +1881,9 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
       Value* agg_fn_ctx = builder.CreateLoad(agg_fn_ctx_ptr, "agg_fn_ctx");
       // Call GetExprCtx() to get the expression context.
       DCHECK(agg_expr_ctxs_[i] != NULL);
-      Value* get_expr_ctx_args[] = { this_arg, codegen->GetIntConstant(TYPE_INT, i) };
-      Value* agg_expr_ctx = builder.CreateCall(get_expr_ctx_fn, get_expr_ctx_args);
-      Value* update_slot_args[] = { agg_fn_ctx, agg_expr_ctx, tuple_arg, row_arg };
+      Value* get_expr_ctxs_args[] = {this_arg, codegen->GetIntConstant(TYPE_INT, i)};
+      Value* agg_expr_ctxs = builder.CreateCall(get_expr_ctxs_fn, get_expr_ctxs_args);
+      Value* update_slot_args[] = {agg_fn_ctx, agg_expr_ctxs, tuple_arg, row_arg};
       builder.CreateCall(update_slot_fn, update_slot_args);
     }
   }
@@ -1849,8 +1892,8 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(LlvmCodeGen* codegen,
   // CodegenProcessBatch() does the final optimizations.
   *fn = codegen->FinalizeFunction(*fn);
   if (*fn == NULL) {
-    return Status("PartitionedAggregationNode::CodegeUpdateTuple(): codegen'd "
-        "UpdateTuple() function failed verification, see log");
+    return Status("PartitionedAggregationNode::CodegenUpdateTuple(): codegen'd "
+                  "UpdateTuple() function failed verification, see log");
   }
   return Status::OK();
 }
diff --git a/be/src/exec/partitioned-aggregation-node.h b/be/src/exec/partitioned-aggregation-node.h
index 9e14e6657..4287ceaf0 100644
--- a/be/src/exec/partitioned-aggregation-node.h
+++ b/be/src/exec/partitioned-aggregation-node.h
@@ -30,13 +30,17 @@
 #include "runtime/string-value.h"
 
 namespace llvm {
-  class Function;
+class BasicBlock;
+class Function;
+class Value;
 }
 
 namespace impala {
 
 class AggFnEvaluator;
+class CodegenAnyVal;
 class LlvmCodeGen;
+class LlvmBuilder;
 class RowBatch;
 class RuntimeState;
 struct StringValue;
@@ -203,7 +207,7 @@ class PartitionedAggregationNode : public ExecNode {
   /// version of UpdateTuple() to avoid loading aggregate_evaluators_[i] at runtime.
   /// An entry is NULL if the aggregate evaluator is not codegen'ed or there is no Expr
   /// in the aggregate evaluator (e.g. count(*)).
-  std::vector<ExprContext*> agg_expr_ctxs_;
+  std::vector<ExprContext* const*> agg_expr_ctxs_;
 
   /// FunctionContext for each aggregate function and backing MemPool. String data
   /// returned by the aggregate functions is allocated via these contexts.
@@ -520,12 +524,13 @@ class PartitionedAggregationNode : public ExecNode {
   /// This function processes each individual row in ProcessBatch(). Must be inlined into
   /// ProcessBatch for codegen to substitute function calls with codegen'd versions.
   /// May spill partitions if not enough memory is available.
-  template<bool AGGREGATED_ROWS>
+  template <bool AGGREGATED_ROWS>
   Status IR_ALWAYS_INLINE ProcessRow(TupleRow* row, HashTableCtx* ht_ctx);
 
-  /// Accessor for the expression context of an AggFnEvaluator. Used only in codegen'ed
+  /// Accessor for the expression contexts of an AggFnEvaluator. Returns an array of
+  /// pointers the the AggFnEvaluator's expression contexts. Used only in codegen'ed
   /// version of UpdateTuple().
-  ExprContext* IR_ALWAYS_INLINE GetAggExprContext(int i) const;
+  ExprContext* const* IR_ALWAYS_INLINE GetAggExprContexts(int i) const;
 
   /// Create a new intermediate tuple in partition, initialized with row. ht_ctx is
   /// the context for the partition's hash table and hash is the precomputed hash of
@@ -640,6 +645,16 @@ class PartitionedAggregationNode : public ExecNode {
   Status CodegenUpdateSlot(LlvmCodeGen* codegen, AggFnEvaluator* evaluator,
       SlotDescriptor* slot_desc, llvm::Function** fn);
 
+  /// Codegen a call to a function implementing the UDA interface with input values
+  /// from 'input_vals'. 'dst_val' should contain the previous value of the aggregate
+  /// function, and 'updated_dst_val' is set to the new value after the Update or Merge
+  /// operation is applied. The instruction sequence for the UDA call is inserted at
+  /// the insert position of 'builder'.
+  Status CodegenCallUda(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      AggFnEvaluator* evaluator, llvm::Value* agg_fn_ctx,
+      const std::vector<CodegenAnyVal>& input_vals, const CodegenAnyVal& dst_val,
+      CodegenAnyVal* updated_dst_val);
+
   /// Codegen UpdateTuple(). Returns non-OK status if codegen is unsuccessful.
   Status CodegenUpdateTuple(LlvmCodeGen* codegen, llvm::Function** fn);
 
diff --git a/be/src/exec/partitioned-hash-join-node.cc b/be/src/exec/partitioned-hash-join-node.cc
index 116ac02a9..c3211af14 100644
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -1144,7 +1144,7 @@ Status PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
   prototype.AddArgument(LlvmCodeGen::NamedVariable("build_arg", tuple_row_ptr_type));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* out_row_arg = builder.CreateBitCast(args[1], tuple_row_working_type, "out");
@@ -1156,8 +1156,9 @@ Status PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
 
   // Copy probe row
   codegen->CodegenMemcpy(&builder, out_row_arg, probe_row_arg, probe_tuple_row_size_);
-  Value* build_row_idx[] = { codegen->GetIntConstant(TYPE_INT, num_probe_tuples) };
-  Value* build_row_dst = builder.CreateGEP(out_row_arg, build_row_idx, "build_dst_ptr");
+  Value* build_row_idx[] = {codegen->GetIntConstant(TYPE_INT, num_probe_tuples)};
+  Value* build_row_dst =
+      builder.CreateInBoundsGEP(out_row_arg, build_row_idx, "build_dst_ptr");
 
   // Copy build row.
   BasicBlock* build_not_null_block = BasicBlock::Create(context, "build_not_null", *fn);
@@ -1176,9 +1177,8 @@ Status PartitionedHashJoinNode::CodegenCreateOutputRow(LlvmCodeGen* codegen,
     // to work.
     builder.SetInsertPoint(build_null_block);
     for (int i = 0; i < num_build_tuples; ++i) {
-      Value* array_idx[] =
-          { codegen->GetIntConstant(TYPE_INT, i + num_probe_tuples) };
-      Value* dst = builder.CreateGEP(out_row_arg, array_idx, "dst_tuple_ptr");
+      Value* array_idx[] = {codegen->GetIntConstant(TYPE_INT, i + num_probe_tuples)};
+      Value* dst = builder.CreateInBoundsGEP(out_row_arg, array_idx, "dst_tuple_ptr");
       builder.CreateStore(codegen->null_ptr_value(), dst);
     }
     builder.CreateRetVoid();
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index d38d662ae..cb5d70cb7 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -128,19 +128,13 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
   if (tuple_type == NULL) return NULL;
   PointerType* tuple_ptr_type = tuple_type->getPointerTo();
 
-  Function* set_null_fn = slot_desc->GetUpdateNullFn(codegen, true);
-  if (set_null_fn == NULL) {
-    LOG(ERROR) << "Could not codegen WriteSlot because slot update codegen failed.";
-    return NULL;
-  }
-
   LlvmCodeGen::FnPrototype prototype(
       codegen, "WriteSlot", codegen->GetType(TYPE_BOOLEAN));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple_arg", tuple_ptr_type));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("data", codegen->ptr_type()));
   prototype.AddArgument(LlvmCodeGen::NamedVariable("len", codegen->GetType(TYPE_INT)));
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
+  LlvmBuilder builder(codegen->context());
   Value* args[3];
   Function* fn = prototype.GeneratePrototype(&builder, &args[0]);
 
@@ -267,13 +261,13 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
 
     // Parse failed, set slot to null and return false
     builder.SetInsertPoint(parse_failed_block);
-    builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
+    slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], codegen->true_value());
     builder.CreateRet(codegen->false_value());
   }
 
   // Case where data is \N or len == 0 and it is not a string col
   builder.SetInsertPoint(set_null_block);
-  builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
+  slot_desc->CodegenSetNullIndicator(codegen, &builder, args[0], codegen->true_value());
   builder.CreateRet(codegen->true_value());
 
   return codegen->FinalizeFunction(fn);
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index c07d1db7d..7ed638634 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -164,12 +164,10 @@ StringVal ToStringVal(FunctionContext* context, T val) {
   stringstream ss;
   ss << val;
   const string &str = ss.str();
-  return StringVal::CopyFrom(context, reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
+  return StringVal::CopyFrom(
+      context, reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
 }
 
-// Delimiter to use if the separator is NULL.
-static const StringVal DEFAULT_STRING_CONCAT_DELIM((uint8_t*)", ", 2);
-
 constexpr int AggregateFunctions::HLL_PRECISION;
 constexpr int AggregateFunctions::HLL_LEN;
 
@@ -627,15 +625,21 @@ void AggregateFunctions::Max(FunctionContext*,
 // StringConcatUpdate().
 typedef int StringConcatHeader;
 
-void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx,
-    const StringVal& src, StringVal* result) {
-  StringConcatUpdate(ctx, src, DEFAULT_STRING_CONCAT_DELIM, result);
+// Delimiter to use if the separator is not provided.
+static inline StringVal ALWAYS_INLINE DefaultStringConcatDelim() {
+  return StringVal(reinterpret_cast<uint8_t*>(const_cast<char*>(", ")), 2);
 }
 
-void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx,
-    const StringVal& src, const StringVal& separator, StringVal* result) {
+void AggregateFunctions::StringConcatUpdate(
+    FunctionContext* ctx, const StringVal& src, StringVal* result) {
+  StringConcatUpdate(ctx, src, DefaultStringConcatDelim(), result);
+}
+
+void AggregateFunctions::StringConcatUpdate(FunctionContext* ctx, const StringVal& src,
+    const StringVal& separator, StringVal* result) {
   if (src.is_null) return;
-  const StringVal* sep = separator.is_null ? &DEFAULT_STRING_CONCAT_DELIM : &separator;
+  const StringVal default_delim = DefaultStringConcatDelim();
+  const StringVal* sep = separator.is_null ? &default_delim : &separator;
   if (result->is_null) {
     // Header of the intermediate state holds the length of the first separator.
     const int header_len = sizeof(StringConcatHeader);
diff --git a/be/src/exprs/case-expr.cc b/be/src/exprs/case-expr.cc
index f139d64b2..3c0db796f 100644
--- a/be/src/exprs/case-expr.cc
+++ b/be/src/exprs/case-expr.cc
@@ -196,7 +196,7 @@ Status CaseExpr::GetCodegendComputeFn(LlvmCodeGen* codegen, Function** fn) {
   }
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
 
   Value* args[2];
   Function* function = CreateIrFunctionPrototype(codegen, "CaseExpr", &args);
diff --git a/be/src/exprs/compound-predicates.cc b/be/src/exprs/compound-predicates.cc
index 531a70c79..d367e5a95 100644
--- a/be/src/exprs/compound-predicates.cc
+++ b/be/src/exprs/compound-predicates.cc
@@ -131,7 +131,7 @@ Status CompoundPredicate::CodegenComputeFn(
   RETURN_IF_ERROR(children()[1]->GetCodegendComputeFn(codegen, &rhs_function));
 
   LLVMContext& context = codegen->context();
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[2];
   Function* function = CreateIrFunctionPrototype(codegen, "CompoundPredicate", &args);
 
@@ -140,8 +140,7 @@ Status CompoundPredicate::CodegenComputeFn(
 
   // Control blocks for aggregating results
   BasicBlock* lhs_null_block = BasicBlock::Create(context, "lhs_null", function);
-  BasicBlock* lhs_not_null_block =
-      BasicBlock::Create(context, "lhs_not_null", function);
+  BasicBlock* lhs_not_null_block = BasicBlock::Create(context, "lhs_not_null", function);
   BasicBlock* lhs_null_rhs_not_null_block =
       BasicBlock::Create(context, "lhs_null_rhs_not_null", function);
   BasicBlock* lhs_not_null_rhs_null_block =
@@ -232,7 +231,7 @@ Status CompoundPredicate::CodegenComputeFn(
   CodegenAnyVal ret(codegen, &builder, TYPE_BOOLEAN, NULL, "ret");
   ret.SetIsNull(is_null_phi);
   ret.SetVal(val_phi);
-  builder.CreateRet(ret.value());
+  builder.CreateRet(ret.GetLoweredValue());
 
   *fn = codegen->FinalizeFunction(function);
   DCHECK(*fn != NULL);
diff --git a/be/src/exprs/expr-codegen-test.cc b/be/src/exprs/expr-codegen-test.cc
index 800a440c6..6ccbbf43a 100644
--- a/be/src/exprs/expr-codegen-test.cc
+++ b/be/src/exprs/expr-codegen-test.cc
@@ -252,7 +252,7 @@ TEST_F(ExprCodegenTest, TestInlineConstants) {
   test_udf_file << getenv("IMPALA_HOME") << "/be/build/latest/exprs/expr-codegen-test.ll";
   scoped_ptr<LlvmCodeGen> codegen;
   ASSERT_OK(LlvmCodeGen::CreateFromFile(&pool, test_udf_file.str(), "test", &codegen));
-  Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL);
+  Function* fn = codegen->GetFunction(TEST_GET_CONSTANT_SYMBOL, false);
   ASSERT_TRUE(fn != NULL);
 
   // Function verification should fail because we haven't inlined GetConstant() calls
diff --git a/be/src/exprs/expr.cc b/be/src/exprs/expr.cc
index 0141bae7c..045c030c0 100644
--- a/be/src/exprs/expr.cc
+++ b/be/src/exprs/expr.cc
@@ -684,10 +684,10 @@ Status Expr::GetCodegendComputeFnWrapper(LlvmCodeGen* codegen, Function** fn) {
   ir_compute_fn_ = CreateIrFunctionPrototype(codegen, "CodegenComputeFnWrapper", &args);
   BasicBlock* entry_block =
       BasicBlock::Create(codegen->context(), "entry", ir_compute_fn_);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
   Value* this_ptr =
       codegen->CastPtrToLlvmPtr(codegen->GetPtrType(Expr::LLVM_CLASS_NAME), this);
-  Value* compute_fn_args[] = { this_ptr, args[0], args[1] };
+  Value* compute_fn_args[] = {this_ptr, args[0], args[1]};
   Value* ret = CodegenAnyVal::CreateCall(
       codegen, &builder, static_getval_fn, compute_fn_args, "ret");
   builder.CreateRet(ret);
diff --git a/be/src/exprs/literal.cc b/be/src/exprs/literal.cc
index 03bdd507b..5b075f602 100644
--- a/be/src/exprs/literal.cc
+++ b/be/src/exprs/literal.cc
@@ -385,7 +385,7 @@ Status Literal::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
   Value* args[2];
   *fn = CreateIrFunctionPrototype(codegen, "Literal", &args);
   BasicBlock* entry_block = BasicBlock::Create(codegen->context(), "entry", *fn);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
 
   CodegenAnyVal v = CodegenAnyVal::GetNonNullVal(codegen, &builder, type_);
   switch (type_.type) {
@@ -439,7 +439,7 @@ Status Literal::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
       return Status(ss.str());
   }
 
-  builder.CreateRet(v.value());
+  builder.CreateRet(v.GetLoweredValue());
   *fn = codegen->FinalizeFunction(*fn);
   ir_compute_fn_ = *fn;
   return Status::OK();
diff --git a/be/src/exprs/null-literal.cc b/be/src/exprs/null-literal.cc
index a8f4241b6..6985c5ba4 100644
--- a/be/src/exprs/null-literal.cc
+++ b/be/src/exprs/null-literal.cc
@@ -101,7 +101,7 @@ Status NullLiteral::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function**
   Value* args[2];
   *fn = CreateIrFunctionPrototype(codegen, "NullLiteral", &args);
   BasicBlock* entry_block = BasicBlock::Create(codegen->context(), "entry", *fn);
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
 
   Value* v = CodegenAnyVal::GetNullVal(codegen, type());
   builder.CreateRet(v);
diff --git a/be/src/exprs/scalar-fn-call.cc b/be/src/exprs/scalar-fn-call.cc
index 3a7ffb1ba..5457cede8 100644
--- a/be/src/exprs/scalar-fn-call.cc
+++ b/be/src/exprs/scalar-fn-call.cc
@@ -319,7 +319,7 @@ Status ScalarFnCall::GetCodegendComputeFn(LlvmCodeGen* codegen, Function** fn) {
   Value* expr_ctx = args[0];
   Value* row = args[1];
   BasicBlock* block = BasicBlock::Create(codegen->context(), "entry", *fn);
-  LlvmCodeGen::LlvmBuilder builder(block);
+  LlvmBuilder builder(block);
 
   // Populate UDF arguments
   vector<Value*> udf_args;
@@ -482,7 +482,7 @@ Status ScalarFnCall::GetUdf(LlvmCodeGen* codegen, Function** udf) {
   } else if (fn_.binary_type == TFunctionBinaryType::BUILTIN) {
     // In this path, we're running a builtin with the UDF interface. The IR is
     // in the llvm module.
-    *udf = codegen->GetFunction(fn_.scalar_fn.symbol);
+    *udf = codegen->GetFunction(fn_.scalar_fn.symbol, false);
     if (*udf == NULL) {
       // Builtins symbols should exist unless there is a version mismatch.
       stringstream ss;
@@ -502,11 +502,11 @@ Status ScalarFnCall::GetUdf(LlvmCodeGen* codegen, Function** udf) {
   } else {
     // We're running an IR UDF.
     DCHECK_EQ(fn_.binary_type, TFunctionBinaryType::IR);
-    *udf = codegen->GetFunction(fn_.scalar_fn.symbol);
+    *udf = codegen->GetFunction(fn_.scalar_fn.symbol, false);
     if (*udf == NULL) {
       stringstream ss;
-      ss << "Unable to locate function " << fn_.scalar_fn.symbol
-         << " from LLVM module " << fn_.hdfs_location;
+      ss << "Unable to locate function " << fn_.scalar_fn.symbol << " from LLVM module "
+         << fn_.hdfs_location;
       return Status(ss.str());
     }
     *udf = codegen->FinalizeFunction(*udf);
@@ -527,7 +527,7 @@ Status ScalarFnCall::GetFunction(RuntimeState* state, const string& symbol, void
     DCHECK_EQ(fn_.binary_type, TFunctionBinaryType::IR);
     LlvmCodeGen* codegen = state->codegen();
     DCHECK(codegen != NULL);
-    Function* ir_fn = codegen->GetFunction(symbol);
+    Function* ir_fn = codegen->GetFunction(symbol, false);
     if (ir_fn == NULL) {
       stringstream ss;
       ss << "Unable to locate function " << symbol << " from LLVM module "
diff --git a/be/src/exprs/slot-ref.cc b/be/src/exprs/slot-ref.cc
index eeba42f98..3a70b6c72 100644
--- a/be/src/exprs/slot-ref.cc
+++ b/be/src/exprs/slot-ref.cc
@@ -134,11 +134,11 @@ string SlotRef::DebugString() const {
 //   br label %check_slot_null
 //
 // check_slot_null:                                  ; preds = %entry
-//   %null_ptr = getelementptr i8* %tuple_ptr, i32 0
+//   %null_byte_ptr = getelementptr i8* %tuple_ptr, i32 0
 //   %null_byte = load i8* %null_ptr
 //   %null_byte_set = and i8 %null_byte, 2
-//   %slot_is_null = icmp ne i8 %null_byte_set, 0
-//   br i1 %slot_is_null, label %ret, label %get_slot
+//   %is_null = icmp ne i8 %null_byte_set, 0
+//   br i1 %is_null, label %ret, label %get_slot
 //
 // get_slot:                                         ; preds = %check_slot_null
 //   %slot_addr = getelementptr i8* %tuple_ptr, i32 8
@@ -189,28 +189,24 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
   Value* row_ptr = args[1];
 
   Value* tuple_offset = ConstantInt::get(codegen->int_type(), tuple_idx_);
-  Value* null_byte_offset =
-    ConstantInt::get(codegen->int_type(), null_indicator_offset_.byte_offset);
   Value* slot_offset = ConstantInt::get(codegen->int_type(), slot_offset_);
-  Value* null_mask =
-      ConstantInt::get(codegen->tinyint_type(), null_indicator_offset_.bit_mask);
   Value* zero = ConstantInt::get(codegen->GetType(TYPE_TINYINT), 0);
   Value* one = ConstantInt::get(codegen->GetType(TYPE_TINYINT), 1);
 
   BasicBlock* entry_block = BasicBlock::Create(context, "entry", *fn);
+  bool slot_is_nullable = null_indicator_offset_.bit_mask != 0;
   BasicBlock* check_slot_null_indicator_block = NULL;
-  if (null_indicator_offset_.bit_mask != 0) {
-    check_slot_null_indicator_block =
-        BasicBlock::Create(context, "check_slot_null", *fn);
+  if (slot_is_nullable) {
+    check_slot_null_indicator_block = BasicBlock::Create(context, "check_slot_null", *fn);
   }
   BasicBlock* get_slot_block = BasicBlock::Create(context, "get_slot", *fn);
   BasicBlock* ret_block = BasicBlock::Create(context, "ret", *fn);
 
-  LlvmCodeGen::LlvmBuilder builder(entry_block);
+  LlvmBuilder builder(entry_block);
   // Get the tuple offset addr from the row
   Value* cast_row_ptr = builder.CreateBitCast(
       row_ptr, PointerType::get(codegen->ptr_type(), 0), "cast_row_ptr");
-  Value* tuple_addr = builder.CreateGEP(cast_row_ptr, tuple_offset, "tuple_addr");
+  Value* tuple_addr = builder.CreateInBoundsGEP(cast_row_ptr, tuple_offset, "tuple_addr");
   // Load the tuple*
   Value* tuple_ptr = builder.CreateLoad(tuple_addr, "tuple_ptr");
 
@@ -218,32 +214,30 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
   if (tuple_is_nullable_) {
     Value* tuple_is_null = builder.CreateIsNull(tuple_ptr, "tuple_is_null");
     // Check slot is null only if the null indicator bit is set
-    if (null_indicator_offset_.bit_mask == 0) {
-      builder.CreateCondBr(tuple_is_null, ret_block, get_slot_block);
-    } else {
+    if (slot_is_nullable) {
       builder.CreateCondBr(tuple_is_null, ret_block, check_slot_null_indicator_block);
+    } else {
+      builder.CreateCondBr(tuple_is_null, ret_block, get_slot_block);
     }
   } else {
-    if (null_indicator_offset_.bit_mask == 0) {
-      builder.CreateBr(get_slot_block);
-    } else {
+    if (slot_is_nullable) {
       builder.CreateBr(check_slot_null_indicator_block);
+    } else {
+      builder.CreateBr(get_slot_block);
     }
   }
 
   // Branch for tuple* != NULL.  Need to check if null-indicator is set
-  if (check_slot_null_indicator_block != NULL) {
+  if (slot_is_nullable) {
     builder.SetInsertPoint(check_slot_null_indicator_block);
-    Value* null_addr = builder.CreateGEP(tuple_ptr, null_byte_offset, "null_ptr");
-    Value* null_val = builder.CreateLoad(null_addr, "null_byte");
-    Value* slot_null_mask = builder.CreateAnd(null_val, null_mask, "null_byte_set");
-    Value* is_slot_null = builder.CreateICmpNE(slot_null_mask, zero, "slot_is_null");
+    Value* is_slot_null = SlotDescriptor::CodegenIsNull(
+        codegen, &builder, null_indicator_offset_, tuple_ptr);
     builder.CreateCondBr(is_slot_null, ret_block, get_slot_block);
   }
 
   // Branch for slot != NULL
   builder.SetInsertPoint(get_slot_block);
-  Value* slot_ptr = builder.CreateGEP(tuple_ptr, slot_offset, "slot_addr");
+  Value* slot_ptr = builder.CreateInBoundsGEP(tuple_ptr, slot_offset, "slot_addr");
   Value* val_ptr = builder.CreateBitCast(slot_ptr, codegen->GetPtrType(type_), "val_ptr");
   // Depending on the type, load the values we need
   Value* val = NULL;
@@ -313,7 +307,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
     result.SetIsNull(is_null_phi);
     result.SetPtr(ptr_phi);
     result.SetLen(len_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   } else if (type() == TYPE_TIMESTAMP) {
     DCHECK(time_of_day != NULL);
     DCHECK(date != NULL);
@@ -343,7 +337,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
     result.SetIsNull(is_null_phi);
     result.SetTimeOfDay(time_of_day_phi);
     result.SetDate(date_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   } else {
     DCHECK(val != NULL);
     PHINode* val_phi = builder.CreatePHI(val->getType(), 2, "val_phi");
@@ -360,7 +354,7 @@ Status SlotRef::GetCodegendComputeFn(LlvmCodeGen* codegen, llvm::Function** fn)
         CodegenAnyVal::GetNonNullVal(codegen, &builder, type(), "result");
     result.SetIsNull(is_null_phi);
     result.SetVal(val_phi);
-    builder.CreateRet(result.value());
+    builder.CreateRet(result.GetLoweredValue());
   }
 
   *fn = codegen->FinalizeFunction(*fn);
diff --git a/be/src/runtime/descriptors.cc b/be/src/runtime/descriptors.cc
index 41485f937..f8ec79cf0 100644
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -77,9 +77,8 @@ ostream& operator<<(ostream& os, const NullIndicatorOffset& null_indicator) {
   return os;
 }
 
-SlotDescriptor::SlotDescriptor(
-    const TSlotDescriptor& tdesc, const TupleDescriptor* parent,
-    const TupleDescriptor* collection_item_descriptor)
+SlotDescriptor::SlotDescriptor(const TSlotDescriptor& tdesc,
+    const TupleDescriptor* parent, const TupleDescriptor* collection_item_descriptor)
   : id_(tdesc.id),
     type_(ColumnType::FromThrift(tdesc.slotType)),
     parent_(parent),
@@ -89,9 +88,7 @@ SlotDescriptor::SlotDescriptor(
     null_indicator_offset_(tdesc.nullIndicatorByte, tdesc.nullIndicatorBit),
     slot_idx_(tdesc.slotIdx),
     slot_size_(type_.GetSlotSize()),
-    llvm_field_idx_(-1),
-    set_not_null_fn_(NULL),
-    set_null_fn_(NULL) {
+    llvm_field_idx_(-1) {
   DCHECK_NE(type_.type, TYPE_STRUCT);
   DCHECK(parent_ != NULL) << tdesc.parent;
   if (type_.IsCollectionType()) {
@@ -569,60 +566,79 @@ void DescriptorTbl::GetTupleDescs(vector<TupleDescriptor*>* descs) const {
   }
 }
 
-// Generate function to set a slot to be null or not-null.  The resulting IR
-// for SetNotNull looks like:
-// (in this case the tuple contains only a nullable double)
-// define void @SetNotNull({ i8, double }* %tuple) {
-// entry:
-//   %null_byte_ptr = getelementptr inbounds { i8, double }* %tuple, i32 0, i32 0
-//   %null_byte = load i8* %null_byte_ptr
-//   %0 = and i8 %null_byte, -2
-//   store i8 %0, i8* %null_byte_ptr
-//   ret void
-// }
-Function* SlotDescriptor::GetUpdateNullFn(LlvmCodeGen* codegen, bool set_null) const {
-  if (set_null && set_null_fn_ != NULL) return set_null_fn_;
-  if (!set_null && set_not_null_fn_ != NULL) return set_not_null_fn_;
+Value* SlotDescriptor::CodegenIsNull(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* tuple) const {
+  return CodegenIsNull(codegen, builder, null_indicator_offset_, tuple);
+}
 
-  StructType* tuple_type = parent()->GetLlvmStruct(codegen);
-  PointerType* tuple_ptr_type = tuple_type->getPointerTo();
-  LlvmCodeGen::FnPrototype prototype(codegen, (set_null) ? "SetNull" :"SetNotNull",
-      codegen->void_type());
-  prototype.AddArgument(LlvmCodeGen::NamedVariable("tuple", tuple_ptr_type));
+// Example IR for getting the first null bit:
+//  %0 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8*
+//  %null_byte_ptr = getelementptr i8, i8* %0, i32 0
+//  %null_byte = load i8, i8* %null_byte_ptr
+//  %null_mask = and i8 %null_byte, 1
+//  %is_null = icmp ne i8 %null_mask, 0
+Value* SlotDescriptor::CodegenIsNull(LlvmCodeGen* codegen, LlvmBuilder* builder,
+    const NullIndicatorOffset& null_indicator_offset, Value* tuple) {
+  Value* null_byte =
+      CodegenGetNullByte(codegen, builder, null_indicator_offset, tuple, NULL);
+  Constant* mask =
+      ConstantInt::get(codegen->tinyint_type(), null_indicator_offset.bit_mask);
+  Value* null_mask = builder->CreateAnd(null_byte, mask, "null_mask");
+  Constant* zero = ConstantInt::get(codegen->tinyint_type(), 0);
+  return builder->CreateICmpNE(null_mask, zero, "is_null");
+}
 
-  LlvmCodeGen::LlvmBuilder builder(codegen->context());
-  Value* tuple_arg;
-  Function* fn = prototype.GeneratePrototype(&builder, &tuple_arg);
-
-  Value* tuple_int8_ptr =
-      builder.CreateBitCast(tuple_arg, codegen->ptr_type(), "tuple_int8_ptr");
-  Value* null_byte_offset =
-      ConstantInt::get(codegen->int_type(), null_indicator_offset_.byte_offset);
-  Value* null_byte_ptr =
-      builder.CreateInBoundsGEP(tuple_int8_ptr, null_byte_offset, "null_byte_ptr");
-  Value* null_byte = builder.CreateLoad(null_byte_ptr, "null_byte");
+// Example IR for setting the first null bit to a non-constant 'is_null' value:
+//  %14 = bitcast { i8, [7 x i8], %"class.impala::TimestampValue" }* %agg_tuple to i8*
+//  %null_byte_ptr3 = getelementptr i8, i8* %14, i32 0
+//  %null_byte4 = load i8, i8* %null_byte_ptr3
+//  %null_bit_cleared = and i8 %null_byte4, -2
+//  %15 = sext i1 %result_is_null to i8
+//  %null_bit = and i8 %15, 1
+//  %null_bit_set = or i8 %null_bit_cleared, %null_bit
+//  store i8 %null_bit_set, i8* %null_byte_ptr3
+void SlotDescriptor::CodegenSetNullIndicator(
+    LlvmCodeGen* codegen, LlvmBuilder* builder, Value* tuple, Value* is_null) const {
+  DCHECK_EQ(is_null->getType(), codegen->boolean_type());
+  Value* null_byte_ptr;
+  Value* null_byte =
+      CodegenGetNullByte(codegen, builder, null_indicator_offset_, tuple, &null_byte_ptr);
+  Constant* mask =
+      ConstantInt::get(codegen->tinyint_type(), null_indicator_offset_.bit_mask);
+  Constant* not_mask =
+      ConstantInt::get(codegen->tinyint_type(), ~null_indicator_offset_.bit_mask);
 
+  ConstantInt* constant_is_null = dyn_cast<ConstantInt>(is_null);
   Value* result = NULL;
-  if (set_null) {
-    Value* null_set = codegen->GetIntConstant(
-        TYPE_TINYINT, null_indicator_offset_.bit_mask);
-    result = builder.CreateOr(null_byte, null_set);
+  if (constant_is_null != NULL) {
+    if (constant_is_null->isOne()) {
+      result = builder->CreateOr(null_byte, mask, "null_bit_set");
+    } else {
+      DCHECK(constant_is_null->isZero());
+      result = builder->CreateAnd(null_byte, not_mask, "null_bit_cleared");
+    }
   } else {
-    Value* null_clear_val =
-        codegen->GetIntConstant(TYPE_TINYINT, ~null_indicator_offset_.bit_mask);
-    result = builder.CreateAnd(null_byte, null_clear_val);
+    // Avoid branching by computing the new byte as:
+    // (null_byte & ~mask) | (-null & mask);
+    Value* byte_with_cleared_bit =
+        builder->CreateAnd(null_byte, not_mask, "null_bit_cleared");
+    Value* sign_extended_null = builder->CreateSExt(is_null, codegen->tinyint_type());
+    Value* bit_only = builder->CreateAnd(sign_extended_null, mask, "null_bit");
+    result = builder->CreateOr(byte_with_cleared_bit, bit_only, "null_bit_set");
   }
 
-  builder.CreateStore(result, null_byte_ptr);
-  builder.CreateRetVoid();
+  builder->CreateStore(result, null_byte_ptr);
+}
 
-  fn = codegen->FinalizeFunction(fn);
-  if (set_null) {
-    set_null_fn_ = fn;
-  } else {
-    set_not_null_fn_ = fn;
-  }
-  return fn;
+Value* SlotDescriptor::CodegenGetNullByte(LlvmCodeGen* codegen, LlvmBuilder* builder,
+    const NullIndicatorOffset& null_indicator_offset, Value* tuple,
+    Value** null_byte_ptr) {
+  Constant* byte_offset =
+      ConstantInt::get(codegen->int_type(), null_indicator_offset.byte_offset);
+  Value* tuple_bytes = builder->CreateBitCast(tuple, codegen->ptr_type());
+  Value* byte_ptr = builder->CreateInBoundsGEP(tuple_bytes, byte_offset, "null_byte_ptr");
+  if (null_byte_ptr != NULL) *null_byte_ptr = byte_ptr;
+  return builder->CreateLoad(byte_ptr, "null_byte");
 }
 
 StructType* TupleDescriptor::GetLlvmStruct(LlvmCodeGen* codegen) const {
diff --git a/be/src/runtime/descriptors.h b/be/src/runtime/descriptors.h
index 81226531c..4a508001c 100644
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -35,19 +35,21 @@ namespace llvm {
   class Function;
   class PointerType;
   class StructType;
+  class Value;
 };
 
 namespace impala {
 
+class Expr;
+class ExprContext;
+class LlvmBuilder;
 class LlvmCodeGen;
 class ObjectPool;
+class RuntimeState;
 class TDescriptorTable;
 class TSlotDescriptor;
 class TTable;
 class TTupleDescriptor;
-class Expr;
-class ExprContext;
-class RuntimeState;
 
 /// A path into a table schema (e.g. a vector of ColumnTypes) pointing to a particular
 /// column/field. The i-th element of the path is the ordinal position of the column/field
@@ -133,9 +135,22 @@ class SlotDescriptor {
 
   std::string DebugString() const;
 
-  /// Codegen for: void SetNull(Tuple* tuple) / void SetNotNull(Tuple* tuple)
-  /// The codegen'd IR function is cached.
-  llvm::Function* GetUpdateNullFn(LlvmCodeGen*, bool set_null) const;
+  /// Generate LLVM code at the insert position of 'builder' that returns a boolean value
+  /// represented as a LLVM i1 indicating whether this slot is null in 'tuple'.
+  llvm::Value* CodegenIsNull(
+      LlvmCodeGen* codegen, LlvmBuilder* builder, llvm::Value* tuple) const;
+
+  /// Generate LLVM code at the insert position of 'builder' that returns a boolean value
+  /// represented as a LLVM i1 with value of the NULL bit at 'null_indicator_offset' in
+  /// 'tuple'.
+  static llvm::Value* CodegenIsNull(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      const NullIndicatorOffset& null_indicator_offset, llvm::Value* tuple);
+
+  /// Generate LLVM code at the insert position of 'builder' to set this slot's
+  /// NULL bit in the given 'tuple' to the value 'is_null'. 'tuple' is a pointer
+  /// to the tuple, and 'is_null' is an boolean value represented as a LLVM i1.
+  void CodegenSetNullIndicator(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      llvm::Value* tuple, llvm::Value* is_null) const;
 
  private:
   friend class DescriptorTbl;
@@ -163,13 +178,16 @@ class SlotDescriptor {
   /// any padding bytes.
   int llvm_field_idx_;
 
-  /// Cached codegen'd functions
-  mutable llvm::Function* set_not_null_fn_;
-  mutable llvm::Function* set_null_fn_;
-
   /// collection_item_descriptor should be non-NULL iff this is a collection slot
   SlotDescriptor(const TSlotDescriptor& tdesc, const TupleDescriptor* parent,
-                 const TupleDescriptor* collection_item_descriptor);
+      const TupleDescriptor* collection_item_descriptor);
+
+  /// Generate LLVM code at the insert position of 'builder' to get the i8 value of the
+  /// the byte containing 'null_indicator_offset' in 'tuple'. If 'null_byte_ptr' is
+  /// non-NULL, sets that to a pointer to the null byte.
+  static llvm::Value* CodegenGetNullByte(LlvmCodeGen* codegen, LlvmBuilder* builder,
+      const NullIndicatorOffset& null_indicator_offset, llvm::Value* tuple,
+      llvm::Value** null_byte_ptr);
 };
 
 class ColumnDescriptor {
diff --git a/be/src/runtime/tuple.cc b/be/src/runtime/tuple.cc
index 7fa8b2b3d..dc8392203 100644
--- a/be/src/runtime/tuple.cc
+++ b/be/src/runtime/tuple.cc
@@ -349,7 +349,7 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, bool collect_string_
   prototype.AddArgument("total_string_lengths", int_ptr_type);
   prototype.AddArgument("num_non_null_string_values", int_ptr_type);
 
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[8];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* opaque_tuple_arg = args[0];
@@ -363,6 +363,9 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, bool collect_string_
 
   // Cast the opaque Tuple* argument to the generated struct type
   Type* tuple_struct_type = desc.GetLlvmStruct(codegen);
+  if (tuple_struct_type == NULL) {
+    return Status("CodegenMaterializeExprs(): failed to generate tuple desc");
+  }
   PointerType* tuple_type = codegen->GetPtrType(tuple_struct_type);
   Value* tuple = builder.CreateBitCast(opaque_tuple_arg, tuple_type, "tuple");
 
diff --git a/be/src/runtime/types.h b/be/src/runtime/types.h
index 955879247..b0835bfcc 100644
--- a/be/src/runtime/types.h
+++ b/be/src/runtime/types.h
@@ -163,13 +163,28 @@ struct ColumnType {
     return thrift_type;
   }
 
+  inline bool IsBooleanType() const { return type == TYPE_BOOLEAN; }
+
+  inline bool IsIntegerType() const {
+    return type == TYPE_TINYINT || type == TYPE_SMALLINT || type == TYPE_INT
+        || type == TYPE_BIGINT;
+  }
+
+  inline bool IsFloatingPointType() const {
+    return type == TYPE_FLOAT || type == TYPE_DOUBLE;
+  }
+
+  inline bool IsDecimalType() const { return type == TYPE_DECIMAL; }
+
   inline bool IsStringType() const {
     return type == TYPE_STRING || type == TYPE_VARCHAR || type == TYPE_CHAR;
   }
 
+  inline bool IsTimestampType() const { return type == TYPE_TIMESTAMP; }
+
   inline bool IsVarLenStringType() const {
-    return type == TYPE_STRING || type == TYPE_VARCHAR ||
-        (type == TYPE_CHAR && len > MAX_CHAR_INLINE_LENGTH);
+    return type == TYPE_STRING || type == TYPE_VARCHAR
+        || (type == TYPE_CHAR && len > MAX_CHAR_INLINE_LENGTH);
   }
 
   inline bool IsComplexType() const {
diff --git a/be/src/testutil/test-udas.cc b/be/src/testutil/test-udas.cc
index 569853eec..009750081 100644
--- a/be/src/testutil/test-udas.cc
+++ b/be/src/testutil/test-udas.cc
@@ -130,3 +130,39 @@ void ArgIsConstUpdate(FunctionContext* context, const IntVal& val,
 void ArgIsConstMerge(FunctionContext* context, const BooleanVal& src, BooleanVal* dst) {
   dst->val |= src.val;
 }
+
+// Defines aggregate function for testing NULL handling. Returns NULL if an even number
+// of non-NULL inputs are consumed or 1 if an odd number of non-NULL inputs are consumed.
+void ToggleNullInit(FunctionContext* context, IntVal* total) {
+  *total = IntVal::null();
+}
+
+void ToggleNullUpdate(FunctionContext* context, const IntVal& val, IntVal* total) {
+  if (total->is_null) {
+    *total = IntVal(1);
+  } else {
+    *total = IntVal::null();
+  }
+}
+
+void ToggleNullMerge(FunctionContext* context, const IntVal& src, IntVal* dst) {
+  if (src.is_null != dst->is_null) {
+    *dst = IntVal(1);
+  } else {
+    *dst = IntVal::null();
+  }
+}
+
+// Defines aggregate function for testing input NULL handling. Returns the number of NULL
+// input values.
+void CountNullsInit(FunctionContext* context, BigIntVal* total) {
+  *total = BigIntVal(0);
+}
+
+void CountNullsUpdate(FunctionContext* context, const BigIntVal& val, BigIntVal* total) {
+  if (val.is_null) ++total->val;
+}
+
+void CountNullsMerge(FunctionContext* context, const BigIntVal& src, BigIntVal* dst) {
+  dst->val += src.val;
+}
diff --git a/be/src/testutil/test-udfs.cc b/be/src/testutil/test-udfs.cc
index efdfa0a84..56d4524ce 100644
--- a/be/src/testutil/test-udfs.cc
+++ b/be/src/testutil/test-udfs.cc
@@ -56,11 +56,7 @@ IntVal AllTypes(
 StringVal NoArgs(FunctionContext* context) {
   const char* result = "string";
   StringVal ret(context, strlen(result));
-  // TODO: llvm 3.3 seems to have a bug if we use memcpy here making
-  // the IR udf crash. This is fixed in 3.3.1. Fix this when we upgrade.
-  //memcpy(ret.ptr, result, strlen(result));
-  // IMPALA-775
-  for (int i = 0; i < strlen(result); ++i) ret.ptr[i] = result[i];
+  memcpy(ret.ptr, result, strlen(result));
   return ret;
 }
 
diff --git a/be/src/util/tuple-row-compare.cc b/be/src/util/tuple-row-compare.cc
index 69892ae27..c15985307 100644
--- a/be/src/util/tuple-row-compare.cc
+++ b/be/src/util/tuple-row-compare.cc
@@ -205,7 +205,7 @@ Status TupleRowComparator::CodegenCompare(LlvmCodeGen* codegen, Function** fn) {
   prototype.AddArgument("lhs", tuple_row_type);
   prototype.AddArgument("rhs", tuple_row_type);
 
-  LlvmCodeGen::LlvmBuilder builder(context);
+  LlvmBuilder builder(context);
   Value* args[4];
   *fn = prototype.GeneratePrototype(&builder, args);
   Value* lhs_ctxs_arg = args[0];
diff --git a/testdata/workloads/functional-query/queries/QueryTest/java-udf.test b/testdata/workloads/functional-query/queries/QueryTest/java-udf.test
index c473fdfbd..c4e2f1c0b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/java-udf.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/java-udf.test
@@ -1,20 +1,20 @@
 ====
 ---- QUERY
-select udf_test.hive_pi()
+select hive_pi()
 ---- RESULTS
 3.141592653589793
 ---- TYPES
 DOUBLE
 ====
 ---- QUERY
-select udf_test.hive_bin(100)
+select hive_bin(100)
 ---- RESULTS
 '1100100'
 ---- TYPES
 STRING
 ====
 ---- QUERY
-select min(udf_test.hive_pi()) from functional.alltypesagg
+select min(hive_pi()) from functional.alltypesagg
 ---- RESULTS
 3.141592653589793
 ---- TYPES
@@ -22,49 +22,49 @@ DOUBLE
 ====
 ---- QUERY
 # Test identity functions
-select udf_test.identity(true);
+select identity(true);
 ---- TYPES
 boolean
 ---- RESULTS
 true
 ====
 ---- QUERY
-select udf_test.identity(cast(10 as tinyint));
+select identity(cast(10 as tinyint));
 ---- TYPES
 tinyint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select udf_test.identity(cast(10 as smallint));
+select identity(cast(10 as smallint));
 ---- TYPES
 smallint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select udf_test.identity(cast(10 as int));
+select identity(cast(10 as int));
 ---- TYPES
 int
 ---- RESULTS
 10
 ====
 ---- QUERY
-select udf_test.identity(cast(10 as bigint));
+select identity(cast(10 as bigint));
 ---- TYPES
 bigint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select udf_test.identity(cast(10.0 as float));
+select identity(cast(10.0 as float));
 ---- TYPES
 float
 ---- RESULTS
 10
 ====
 ---- QUERY
-select udf_test.identity(cast(10.0 as double));
+select identity(cast(10.0 as double));
 ---- TYPES
 double
 ---- RESULTS
@@ -73,16 +73,16 @@ double
 ---- QUERY
 # IMPALA-1456. Each "identity" call below tests a different type (BytesWritable, Text,
 # and String).
-select udf_test.identity("why hello there"),
-       udf_test.identity("why", " hello there"),
-       udf_test.identity("why", " hello", " there");
+select identity("why hello there"),
+       identity("why", " hello there"),
+       identity("why", " hello", " there");
 ---- TYPES
 string, string, string
 ---- RESULTS
 'why hello there','why hello there','why hello there'
 ====
 ---- QUERY
-select udf_test.identity(NULL);
+select identity(NULL);
 ---- TYPES
 boolean
 ---- RESULTS
@@ -91,9 +91,9 @@ NULL
 ---- QUERY
 # IMPALA-1134. Each "identity" call below tests a different type (BytesWritable, Text,
 # and String). The different types are handled slightly differently.
-select length(udf_test.identity("0123456789")),
-       length(udf_test.identity("0123456789", "0123456789")),
-       length(udf_test.identity("0123456789", "0123456789", "0123456789"));
+select length(identity("0123456789")),
+       length(identity("0123456789", "0123456789")),
+       length(identity("0123456789", "0123456789", "0123456789"));
 ---- TYPES
 int, int, int
 ---- RESULTS
@@ -101,14 +101,14 @@ int, int, int
 ====
 ---- QUERY
 # IMPALA-1392: Hive UDFs that throw exceptions should return NULL
-select udf_test.throws_exception();
+select throws_exception();
 ---- TYPES
 boolean
 ---- RESULTS
 NULL
 ====
 ---- QUERY
-select udf_test.throws_exception() from functional.alltypestiny;
+select throws_exception() from functional.alltypestiny;
 ---- TYPES
 boolean
 ---- RESULTS
@@ -122,49 +122,49 @@ NULL
 NULL
 ====
 ---- QUERY
-select udf_test.hive_add(cast(1 as int), cast(2 as int));
+select hive_add(cast(1 as int), cast(2 as int));
 ---- TYPES
 int
 ---- RESULTS
 3
 ====
 ---- QUERY
-select udf_test.hive_add(udf_test.hive_add(cast(1 as int), cast(2 as int)), cast(2 as int));
+select hive_add(hive_add(cast(1 as int), cast(2 as int)), cast(2 as int));
 ---- TYPES
 int
 ---- RESULTS
 5
 ====
 ---- QUERY
-select udf_test.hive_add(cast(udf_test.hive_add(cast(1 as int), cast(2 as int)) - udf_test.hive_add(cast(2 as int), cast(1 as int)) as int), cast(2 as int));
+select hive_add(cast(hive_add(cast(1 as int), cast(2 as int)) - hive_add(cast(2 as int), cast(1 as int)) as int), cast(2 as int));
 ---- TYPES
 int
 ---- RESULTS
 2
 ====
 ---- QUERY
-select udf_test.hive_add(cast(1 as smallint), cast(2 as smallint));
+select hive_add(cast(1 as smallint), cast(2 as smallint));
 ---- TYPES
 smallint
 ---- RESULTS
 3
 ====
 ---- QUERY
-select udf_test.hive_add(cast(1.0 as float), cast(2.0 as float));
+select hive_add(cast(1.0 as float), cast(2.0 as float));
 ---- TYPES
 float
 ---- RESULTS
 3.0
 ====
 ---- QUERY
-select udf_test.hive_add(cast(1.0 as double), cast(2.0 as double));
+select hive_add(cast(1.0 as double), cast(2.0 as double));
 ---- TYPES
 double
 ---- RESULTS
 3.0
 ====
 ---- QUERY
-select udf_test.hive_add(cast(1 as boolean), cast(0 as boolean));
+select hive_add(cast(1 as boolean), cast(0 as boolean));
 ---- TYPES
 boolean
 ---- RESULTS
@@ -172,63 +172,63 @@ false
 ====
 ---- QUERY
 # Testing whether all of persistent Java udfs are accessible.
-select java_udfs_test.identity(true);
+select identity_anytype(true);
 ---- TYPES
 boolean
 ---- RESULTS
 true
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10 as tinyint));
+select identity_anytype(cast(10 as tinyint));
 ---- TYPES
 tinyint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10 as smallint));
+select identity_anytype(cast(10 as smallint));
 ---- TYPES
 smallint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10 as int));
+select identity_anytype(cast(10 as int));
 ---- TYPES
 int
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10 as bigint));
+select identity_anytype(cast(10 as bigint));
 ---- TYPES
 bigint
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10.0 as float));
+select identity_anytype(cast(10.0 as float));
 ---- TYPES
 float
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity(cast(10.0 as double));
+select identity_anytype(cast(10.0 as double));
 ---- TYPES
 double
 ---- RESULTS
 10
 ====
 ---- QUERY
-select java_udfs_test.identity("a", "b");
+select identity_anytype("a", "b");
 ---- TYPES
 string
 ---- RESULTS
 'ab'
 ====
 ---- QUERY
-select java_udfs_test.identity("a", "b", "c");
+select identity_anytype("a", "b", "c");
 ---- TYPES
 string
 ---- RESULTS
@@ -238,37 +238,37 @@ string
 # IMPALA-3378: test many Java UDFs being opened and run concurrently
 select * from
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(bool_col) union all
+ where identity(bool_col) union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(tinyint_col) > 1 union all
+ where identity(tinyint_col) > 1 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(smallint_col) > 1 union all
+ where identity(smallint_col) > 1 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(int_col) > 1 union all
+ where identity(int_col) > 1 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(bigint_col) > 1 union all
+ where identity(bigint_col) > 1 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(float_col) > 1.0 union all
+ where identity(float_col) > 1.0 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(double_col) > 1.0 union all
+ where identity(double_col) > 1.0 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(string_col) > '1' union all
+ where identity(string_col) > '1' union all
 (select max(int_col) from functional.alltypesagg
- where not udf_test.identity(bool_col) union all
+ where not identity(bool_col) union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(tinyint_col) > 2 union all
+ where identity(tinyint_col) > 2 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(smallint_col) > 2 union all
+ where identity(smallint_col) > 2 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(int_col) > 2 union all
+ where identity(int_col) > 2 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(bigint_col) > 2 union all
+ where identity(bigint_col) > 2 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(float_col) > 2.0 union all
+ where identity(float_col) > 2.0 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(double_col) > 2.0 union all
+ where identity(double_col) > 2.0 union all
 (select max(int_col) from functional.alltypesagg
- where udf_test.identity(string_col) > '2'
+ where identity(string_col) > '2'
 )))))))))))))))) v
 ---- TYPES
 INT
@@ -301,7 +301,7 @@ values('toast'), ('scone'), ('stuff'), ('sssss'), ('yes'), ('scone'), ('stuff');
 # Regression test for IMPALA-4266: memory management bugs with output strings from
 # Java UDFS, exposed by using the UDF as a grouping key in an aggregation.
 # The UDF replaces "s" with "ss" in the strings.
-select distinct udf_test.replace_string(_c0) as es
+select distinct replace_string(_c0) as es
 from replace_string_input
 order by 1;
 ---- TYPES
diff --git a/testdata/workloads/functional-query/queries/QueryTest/libs_with_same_filenames.test b/testdata/workloads/functional-query/queries/QueryTest/libs_with_same_filenames.test
index cb46d9cb8..64fdced57 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/libs_with_same_filenames.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/libs_with_same_filenames.test
@@ -1,24 +1,21 @@
 ====
 ---- QUERY
-drop function if exists same_lib_filename_udf_test.no_args();
-drop function if exists same_lib_filename_udf_test.no_args2();
-
-create function same_lib_filename_udf_test.no_args() returns string
+create function no_args() returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/libTestUdfs.so' symbol='NoArgs';
 
-create function same_lib_filename_udf_test.no_args2() returns string
+create function no_args2() returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/udf_test/libTestUdfs.so' symbol='NoArgs';
 ---- RESULTS
 ====
 ---- QUERY
-select same_lib_filename_udf_test.no_args();
+select no_args();
 ---- TYPES
 string
 ---- RESULTS
 'string'
 ====
 ---- QUERY
-select same_lib_filename_udf_test.no_args2();
+select no_args2();
 ---- TYPES
 string
 ---- RESULTS
diff --git a/testdata/workloads/functional-query/queries/QueryTest/load-java-udfs.test b/testdata/workloads/functional-query/queries/QueryTest/load-java-udfs.test
index ab0c8bbd8..5ff44884a 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/load-java-udfs.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/load-java-udfs.test
@@ -1,130 +1,99 @@
 ====
 ---- QUERY
-drop function if exists udf_test.hive_pi();
-drop function if exists udf_test.hive_round(double);
-drop function if exists udf_test.hive_floor(double);
-drop function if exists udf_test.hive_mod(int, int);
-drop function if exists udf_test.hive_bin(bigint);
-drop function if exists udf_test.hive_lower(string);
-
-drop function if exists udf_test.identity(boolean);
-drop function if exists udf_test.identity(tinyint);
-drop function if exists udf_test.identity(smallint);
-drop function if exists udf_test.identity(int);
-drop function if exists udf_test.identity(bigint);
-drop function if exists udf_test.identity(float);
-drop function if exists udf_test.identity(double);
-drop function if exists udf_test.identity(string);
-drop function if exists udf_test.identity(string, string);
-drop function if exists udf_test.identity(string, string, string);
-drop function if exists udf_test.identity(timestamp);
-
-drop function if exists udf_test.hive_add(int, int);
-drop function if exists udf_test.hive_add(float, float);
-drop function if exists udf_test.hive_add(double, double);
-drop function if exists udf_test.hive_add(smallint, smallint);
-drop function if exists udf_test.hive_add(boolean, boolean);
-
-drop function if exists udf_test.throws_exception();
-
-drop function if exists java_udfs_test.identity;
-
-drop function if exists udf_test.replace_string(string);
-
-create function udf_test.hive_pi() returns double
+create function hive_pi() returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFPI';
 
-create function udf_test.hive_round(double) returns double
+create function hive_round(double) returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFRound';
 
-create function udf_test.hive_floor(double) returns bigint
+create function hive_floor(double) returns bigint
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFFloor';
 
-create function udf_test.hive_mod(int, int) returns int
+create function hive_mod(int, int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFPosMod';
 
-create function udf_test.hive_bin(bigint) returns string
+create function hive_bin(bigint) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFBin';
 
-create function udf_test.hive_lower(string) returns string
+create function hive_lower(string) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFLower';
 
 # Used to test persistent java functions
-create function java_udfs_test.identity
+create function identity_anytype
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(boolean) returns boolean
+create function identity(boolean) returns boolean
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(tinyint) returns tinyint
+create function identity(tinyint) returns tinyint
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(smallint) returns smallint
+create function identity(smallint) returns smallint
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(int) returns int
+create function identity(int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(bigint) returns bigint
+create function identity(bigint) returns bigint
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(float) returns float
+create function identity(float) returns float
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(double) returns double
+create function identity(double) returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(string) returns string
+create function identity(string) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(string, string) returns string
+create function identity(string, string) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.identity(string, string, string) returns string
+create function identity(string, string, string) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.hive_add(int, int) returns int
+create function hive_add(int, int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.hive_add(smallint, smallint) returns smallint
+create function hive_add(smallint, smallint) returns smallint
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.hive_add(float, float) returns float
+create function hive_add(float, float) returns float
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.hive_add(double, double) returns double
+create function hive_add(double, double) returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.hive_add(boolean, boolean) returns boolean
+create function hive_add(boolean, boolean) returns boolean
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdf';
 
-create function udf_test.throws_exception() returns boolean
+create function throws_exception() returns boolean
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.TestUdfException';
 
-create function udf_test.replace_string(string) returns string
+create function replace_string(string) returns string
 location '$FILESYSTEM_PREFIX/test-warehouse/impala-hive-udfs.jar'
 symbol='org.apache.impala.ReplaceStringUdf';
 ====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/uda-mem-limit.test b/testdata/workloads/functional-query/queries/QueryTest/uda-mem-limit.test
index 3a2127486..b7c2f5faf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/uda-mem-limit.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/uda-mem-limit.test
@@ -1,8 +1,5 @@
 ====
 ---- QUERY
-create database if not exists native_function_test;
-use native_function_test;
-
 drop function if exists agg_memtest(bigint);
 
 create aggregate function agg_memtest(bigint) returns bigint
diff --git a/testdata/workloads/functional-query/queries/QueryTest/uda.test b/testdata/workloads/functional-query/queries/QueryTest/uda.test
index 21877cf2b..05e24e71b 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/uda.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/uda.test
@@ -41,3 +41,31 @@ true
 ---- TYPES
 boolean
 ====
+---- QUERY
+# Test with even number of input rows.
+select toggle_null(id), count(*)
+from functional_parquet.alltypesagg
+---- RESULTS
+NULL,11000
+---- TYPES
+int,bigint
+====
+---- QUERY
+# Test with odd number of input rows.
+select toggle_null(id), count(*)
+from functional_parquet.alltypesagg
+where id <= 9998
+---- RESULTS
+1,10999
+---- TYPES
+int,bigint
+====
+---- QUERY
+# Test that input NULLs are passed to aggregate functions ok.
+select count_nulls(tinyint_col), count(*)
+from functional.alltypesagg
+---- RESULTS
+2000,11000
+---- TYPES
+bigint,bigint
+====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/udf-codegen-required.test b/testdata/workloads/functional-query/queries/QueryTest/udf-codegen-required.test
new file mode 100644
index 000000000..07c93c329
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/udf-codegen-required.test
@@ -0,0 +1,10 @@
+====
+---- QUERY
+# Codegen is required for > 20 args.
+select twenty_one_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21);
+---- TYPES
+INT
+---- RESULTS
+231
+====
+
diff --git a/testdata/workloads/functional-query/queries/QueryTest/udf-errors.test b/testdata/workloads/functional-query/queries/QueryTest/udf-errors.test
index 948d5849e..73aee7e01 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/udf-errors.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/udf-errors.test
@@ -1,23 +1,19 @@
 ====
 ---- QUERY
-create database if not exists udf_test_errors;
----- RESULTS
-====
----- QUERY
-create function if not exists udf_test_errors.hive_pi() returns double
+create function if not exists hive_pi() returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/hive-exec.jar'
 symbol='org.apache.hadoop.hive.ql.udf.UDFPI';
 ---- RESULTS
 ====
 ---- QUERY
-create function if not exists udf_test_errors.foo() returns double
+create function if not exists foo() returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/not-a-real-file.so'
 symbol='FnDoesNotExist';
 ---- CATCH
 Could not load binary: $FILESYSTEM_PREFIX/test-warehouse/not-a-real-file.so
 ====
 ---- QUERY
-create function if not exists udf_test_errors.foo() returns double
+create function if not exists foo() returns double
 location '$FILESYSTEM_PREFIX/test-warehouse/not-a-real-file.so'
 symbol='FnDoesNotExist';
 ---- CATCH
@@ -25,7 +21,7 @@ Could not load binary: $FILESYSTEM_PREFIX/test-warehouse/not-a-real-file.so
 ====
 ---- QUERY
 # This test is run with codegen disabled. Interpretation only handles up to 20 arguments.
-create function if not exists udf_test_errors.twenty_args(int, int, int, int, int, int,
+create function if not exists twenty_args(int, int, int, int, int, int,
     int, int, int, int, int, int, int, int, int, int, int, int, int, int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/libTestUdfs.so'
 symbol='TwentyArgs';
@@ -33,7 +29,7 @@ symbol='TwentyArgs';
 ====
 ---- QUERY
 # Verifies that interpretation can support up to 20 arguments
-select udf_test_errors.twenty_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20);
+select twenty_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20);
 ---- TYPES
 INT
 ---- RESULTS
@@ -41,7 +37,7 @@ INT
 ====
 ---- QUERY
 # This test is run with codegen disabled. Interpretation only handles up to 20 arguments.
-create function if not exists udf_test_errors.twenty_one_args(int, int, int, int, int, int,
+create function if not exists twenty_one_args(int, int, int, int, int, int,
     int, int, int, int, int, int, int, int, int, int, int, int, int, int, int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/libTestUdfs.so'
 symbol='TwentyOneArgs';
@@ -49,35 +45,36 @@ symbol='TwentyOneArgs';
 ====
 ---- QUERY
 # Verifies that interpretation fails with more than 20 arguments.
-select udf_test_errors.twenty_one_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21);
+select twenty_one_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21);
 ---- CATCH
 Cannot interpret native UDF 'twenty_one_args': number of arguments is more than 20. Codegen is needed. Please set DISABLE_CODEGEN to false.
 ====
 ---- QUERY
 # This test is run with codegen disabled. IR UDF will fail.
-create function if not exists udf_test_errors.nine_args_ir(int, int, int, int, int, int,
+create function if not exists nine_args_ir(int, int, int, int, int, int,
     int, int, int) returns int
 location '$FILESYSTEM_PREFIX/test-warehouse/test-udfs.ll'
 symbol='NineArgs';
 ---- RESULTS
 ====
 ---- QUERY
-select udf_test_errors.nine_args_ir(1,2,3,4,5,6,7,8,9);
+select nine_args_ir(1,2,3,4,5,6,7,8,9);
 ---- CATCH
 Cannot interpret LLVM IR UDF 'nine_args_ir': Codegen is needed. Please set DISABLE_CODEGEN to false.
 ====
 ---- QUERY
-drop database udf_test_errors;
+use default;
+drop database $DATABASE;
 ---- CATCH
-Cannot drop non-empty database: udf_test_errors
+Cannot drop non-empty database:
 ====
 ---- QUERY
-drop function udf_test_errors.hive_pi();
-drop function udf_test_errors.twenty_args(int, int, int, int, int, int, int, int,
+use $DATABASE;
+drop function hive_pi();
+drop function twenty_args(int, int, int, int, int, int, int, int,
     int, int, int, int, int, int, int, int, int, int, int, int);
-drop function udf_test_errors.twenty_one_args(int, int, int, int, int, int, int, int,
+drop function twenty_one_args(int, int, int, int, int, int, int, int,
     int, int, int, int, int, int, int, int, int, int, int, int, int);
-drop function udf_test_errors.nine_args_ir(int, int, int, int, int, int, int, int, int);
-drop database udf_test_errors;
+drop function nine_args_ir(int, int, int, int, int, int, int, int, int);
 ---- RESULTS
 ====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/udf-mem-limit.test b/testdata/workloads/functional-query/queries/QueryTest/udf-mem-limit.test
index f9b8d4b9d..3f23aebc6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/udf-mem-limit.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/udf-mem-limit.test
@@ -1,10 +1,5 @@
 ====
 ---- QUERY
-create database if not exists native_function_test;
-use native_function_test;
-
-drop function if exists memtest(bigint);
-
 create function memtest(bigint) returns bigint
 location '$FILESYSTEM_PREFIX/test-warehouse/libTestUdfs.so' symbol='MemTest'
 prepare_fn='MemTestPrepare' close_fn='MemTestClose';
diff --git a/testdata/workloads/functional-query/queries/QueryTest/udf.test b/testdata/workloads/functional-query/queries/QueryTest/udf.test
index 5cbbecb33..ef777a1e6 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/udf.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/udf.test
@@ -536,10 +536,3 @@ INT
 ---- RESULTS
 210
 ====
----- QUERY
-select twenty_one_args(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21);
----- TYPES
-INT
----- RESULTS
-231
-====
diff --git a/tests/common/test_result_verifier.py b/tests/common/test_result_verifier.py
index 6bf991e58..80daf09bb 100644
--- a/tests/common/test_result_verifier.py
+++ b/tests/common/test_result_verifier.py
@@ -482,3 +482,25 @@ def verify_runtime_profile(expected, actual):
   assert len(unmatched_lines) == 0, ("Did not find matches for lines in runtime profile:"
       "\nEXPECTED LINES:\n%s\n\nACTUAL PROFILE:\n%s" % ('\n'.join(unmatched_lines),
         actual))
+
+def get_node_exec_options(profile_string, exec_node_id):
+  """ Return a list with all of the ExecOption strings for the given exec node id. """
+  results = []
+  matched_node = False
+  id_string = "(id={0})".format(exec_node_id)
+  for line in profile_string.splitlines():
+    if matched_node and line.strip().startswith("ExecOption:"):
+      results.append(line.strip())
+    matched_node = False
+    if id_string in line:
+      # Check for the ExecOption string on the next line.
+      matched_node = True
+  return results
+
+def assert_codegen_enabled(profile_string, exec_node_ids):
+  """ Check that codegen is enabled for the given exec node ids by parsing the text
+  runtime profile in 'profile_string'"""
+  for exec_node_id in exec_node_ids:
+    for exec_options in get_node_exec_options(profile_string, exec_node_id):
+      assert 'Codegen Enabled' in exec_options
+      assert not 'Codegen Disabled' in exec_options
diff --git a/tests/query_test/test_aggregation.py b/tests/query_test/test_aggregation.py
index 82d50c8d9..c44c78481 100644
--- a/tests/query_test/test_aggregation.py
+++ b/tests/query_test/test_aggregation.py
@@ -19,28 +19,64 @@
 #
 import pytest
 
+from tests.common.environ import USING_OLD_AGGS_JOINS
 from tests.common.impala_test_suite import ImpalaTestSuite
+from tests.common.skip import SkipIfOldAggsJoins
 from tests.common.test_dimensions import (
     create_exec_option_dimension,
     create_uncompressed_text_dimension)
+from tests.common.test_result_verifier import assert_codegen_enabled
 from tests.common.test_vector import TestDimension
-from tests.common.skip import SkipIfOldAggsJoins
 
-agg_functions = ['sum', 'count', 'min', 'max', 'avg']
-
-data_types = ['int', 'bool', 'double', 'bigint', 'tinyint',
-              'smallint', 'float', 'timestamp']
+# Test dimensions for TestAggregation.
+AGG_FUNCTIONS = ['sum', 'count', 'min', 'max', 'avg', 'ndv']
+DATA_TYPES = ['int', 'bool', 'double', 'bigint', 'tinyint',
+              'smallint', 'float', 'timestamp', 'string']
 
+# Lookup table for TestAggregation results.
 result_lut = {
-  # TODO: Add verification for other types
   'sum-tinyint': 45000, 'avg-tinyint': 5, 'count-tinyint': 9000,
-      'min-tinyint': 1, 'max-tinyint': 9,
+      'min-tinyint': 1, 'max-tinyint': 9, 'ndv-tinyint': 9,
   'sum-smallint': 495000, 'avg-smallint': 50, 'count-smallint': 9900,
-      'min-smallint': 1, 'max-smallint': 99,
+      'min-smallint': 1, 'max-smallint': 99, 'ndv-smallint': 99,
   'sum-int': 4995000, 'avg-int': 500, 'count-int': 9990,
-      'min-int': 1, 'max-int': 999,
+      'min-int': 1, 'max-int': 999, 'ndv-int': 999,
   'sum-bigint': 49950000, 'avg-bigint': 5000, 'count-bigint': 9990,
-      'min-bigint': 10, 'max-bigint': 9990,
+      'min-bigint': 10, 'max-bigint' : 9990, 'ndv-bigint': 999,
+  'sum-bool': 5000, 'count-bool': 10000, 'min-bool': 'false',
+    'max-bool': 'true', 'avg-bool': 0.5, 'ndv-bool': 2,
+  'sum-double': 50449500.0, 'count-double': 9990, 'min-double': 10.1,
+      'max-double': 10089.9, 'avg-double': 5050.0, 'ndv-double': 999,
+  'sum-float': 5494500.0, 'count-float': 9990, 'min-float': 1.10,
+      'max-float': 1098.9, 'avg-float': 550.0, 'ndv-float': 999,
+  'count-timestamp': 10000, 'min-timestamp': '2010-01-01 00:00:00',
+      'max-timestamp': '2010-01-10 18:02:05.100000000',
+      'avg-timestamp': '2010-01-05 20:47:11.705080000', 'ndv-timestamp': 10000,
+  'count-string': 10000, 'min-string': '0', 'max-string': '999', 'ndv-string': 999,
+  'sum-distinct-tinyint': 45, 'count-distinct-tinyint': 9, 'min-distinct-tinyint': 1,
+      'max-distinct-tinyint': 9, 'avg-distinct-tinyint': 5, 'ndv-distinct-tinyint': 9,
+  'sum-distinct-smallint': 4950, 'count-distinct-smallint': 99,
+      'min-distinct-smallint': 1, 'max-distinct-smallint': 99,
+      'avg-distinct-smallint': 50, 'ndv-distinct-smallint': 99,
+  'sum-distinct-int': 499500, 'count-distinct-int': 999, 'min-distinct-int': 1,
+      'max-distinct-int': 999, 'avg-distinct-int': 500, 'ndv-distinct-int': 999,
+  'sum-distinct-bigint': 4995000, 'count-distinct-bigint': 999, 'min-distinct-bigint': 10,
+      'max-distinct-bigint': 9990, 'avg-distinct-bigint': 5000,
+      'ndv-distinct-bigint': 999,
+  'sum-distinct-bool': 1, 'count-distinct-bool': 2, 'min-distinct-bool': 'false',
+      'max-distinct-bool': 'true', 'avg-distinct-bool': 0.5, 'ndv-distinct-bool': 2,
+  'sum-distinct-double': 5044950.0, 'count-distinct-double': 999,
+      'min-distinct-double': 10.1, 'max-distinct-double': 10089.9,
+      'avg-distinct-double': 5050.0, 'ndv-distinct-double': 999,
+  'sum-distinct-float': 549450.0, 'count-distinct-float': 999, 'min-distinct-float': 1.1,
+      'max-distinct-float': 1098.9, 'avg-distinct-float': 550.0,
+      'ndv-distinct-float': 999,
+  'count-distinct-timestamp': 10000, 'min-distinct-timestamp': '2010-01-01 00:00:00',
+      'max-distinct-timestamp': '2010-01-10 18:02:05.100000000',
+      'avg-distinct-timestamp': '2010-01-05 20:47:11.705080000',
+      'ndv-distinct-timestamp': 10000,
+  'count-distinct-string': 1000, 'min-distinct-string': '0',
+      'max-distinct-string': '999', 'ndv-distinct-string': 999,
 }
 
 class TestAggregation(ImpalaTestSuite):
@@ -53,8 +89,8 @@ class TestAggregation(ImpalaTestSuite):
     super(TestAggregation, cls).add_test_dimensions()
 
     # Add two more dimensions
-    cls.TestMatrix.add_dimension(TestDimension('agg_func', *agg_functions))
-    cls.TestMatrix.add_dimension(TestDimension('data_type', *data_types))
+    cls.TestMatrix.add_dimension(TestDimension('agg_func', *AGG_FUNCTIONS))
+    cls.TestMatrix.add_dimension(TestDimension('data_type', *DATA_TYPES))
     cls.TestMatrix.add_constraint(lambda v: cls.is_valid_vector(v))
 
   @classmethod
@@ -68,30 +104,63 @@ class TestAggregation(ImpalaTestSuite):
       if vector.get_value('exec_option')['batch_size'] != 0: return False
 
     # Avro doesn't have timestamp type
+    non_numeric = data_type in ['bool', 'string']
     if file_format == 'avro' and data_type == 'timestamp':
       return False
-    elif agg_func not in ['min', 'max', 'count'] and data_type == 'bool':
+    elif non_numeric and agg_func not in ['min', 'max', 'count', 'ndv']:
       return False
     elif agg_func == 'sum' and data_type == 'timestamp':
       return False
     return True
 
   def test_aggregation(self, vector):
+    exec_option = vector.get_value('exec_option')
+    disable_codegen = exec_option['disable_codegen']
+    # The old aggregation node does not support codegen for all aggregate functions.
+    check_codegen_enabled = not disable_codegen and not USING_OLD_AGGS_JOINS
     data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))
+
     query = 'select %s(%s_col) from alltypesagg where day is not null' % (agg_func,
         data_type)
-    result = self.execute_scalar(query, vector.get_value('exec_option'),
-                                 table_format=vector.get_value('table_format'))
-    if 'int' in data_type:
-      assert result_lut['%s-%s' % (agg_func, data_type)] == int(result)
+    result = self.execute_query(query, exec_option,
+       table_format=vector.get_value('table_format'))
+    assert len(result.data) == 1
+    self.verify_agg_result(agg_func, data_type, False, result.data[0]);
+
+    if check_codegen_enabled:
+      # Verify codegen was enabled for both stages of the aggregation.
+      assert_codegen_enabled(result.runtime_profile, [1, 3])
 
-    # AVG
-    if vector.get_value('data_type') == 'timestamp' and\
-       vector.get_value('agg_func') == 'avg':
-      return
     query = 'select %s(DISTINCT(%s_col)) from alltypesagg where day is not null' % (
         agg_func, data_type)
-    result = self.execute_scalar(query, vector.get_value('exec_option'))
+    result = self.execute_query(query, vector.get_value('exec_option'))
+    assert len(result.data) == 1
+    self.verify_agg_result(agg_func, data_type, True, result.data[0]);
+
+    if check_codegen_enabled:
+      # Verify codegen was enabled for all stages of the aggregation.
+      assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
+
+  def verify_agg_result(self, agg_func, data_type, distinct, actual_string):
+    key = '%s-%s%s' % (agg_func, 'distinct-' if distinct else '', data_type)
+
+    if agg_func == 'ndv':
+      # NDV is inherently approximate. Compare with some tolerance.
+      err = abs(result_lut[key] - int(actual_string))
+      rel_err =  err / float(result_lut[key])
+      print key, result_lut[key], actual_string,abs(result_lut[key] - int(actual_string))
+      assert err <= 1 or rel_err < 0.05
+    elif data_type in ('float', 'double') and agg_func != 'count':
+      # Compare with a margin of error.
+      delta = 1e6 if data_type == 'double' else 1e3
+      assert abs(result_lut[key] - float(actual_string)) < delta
+    elif data_type == 'timestamp' and agg_func != 'count':
+      # Strip off everything past 10s of microseconds.
+      ignore_digits = 4
+      assert result_lut[key][:-ignore_digits] == actual_string[:-ignore_digits]
+    else:
+      assert str(result_lut[key]) == actual_string
+
 
 class TestAggregationQueries(ImpalaTestSuite):
   """Run the aggregation test suite, with codegen enabled and disabled, to exercise our
@@ -140,6 +209,7 @@ class TestAggregationQueries(ImpalaTestSuite):
        first phase is running on multiple nodes). Need to pull the result apart and
        compare the actual items)"""
     exec_option = vector.get_value('exec_option')
+    disable_codegen = exec_option['disable_codegen']
     table_format = vector.get_value('table_format')
     # Test group_concat distinct with other aggregate function and groupings.
     # expected result is the row: 2010,'1, 2, 3, 4','1-2-3-4','2|3|1|4',40,4
@@ -156,6 +226,10 @@ class TestAggregationQueries(ImpalaTestSuite):
       assert(set(row[i].split(delimiter[i-1])) == set(['1', '2', '3', '4']))
     assert(row[4] == '40')
     assert(row[5] == '4')
+    check_codegen_enabled = not disable_codegen and not USING_OLD_AGGS_JOINS
+    if check_codegen_enabled:
+      # Verify codegen was enabled for all three stages of the aggregation.
+      assert_codegen_enabled(result.runtime_profile, [1, 2, 4])
 
     # Test group_concat distinct with arrow delimiter, with multiple rows
     query = """select day, group_concat(distinct string_col, "->")
@@ -185,6 +259,10 @@ class TestAggregationQueries(ImpalaTestSuite):
     where int_col < 10"""
     result = self.execute_query(query, exec_option, table_format=table_format)
     assert(set((result.data)[0].split(" ")) == set(['1','2','3','4','5','6','7','8','9']))
+    if check_codegen_enabled:
+      # Verify codegen was enabled for all four stages of the aggregation.
+      assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
+
 
 class TestTPCHAggregationQueries(ImpalaTestSuite):
   # Uses the TPC-H dataset in order to have larger aggregations.
diff --git a/tests/query_test/test_udfs.py b/tests/query_test/test_udfs.py
index 0497cbd9d..5e9fdd934 100644
--- a/tests/query_test/test_udfs.py
+++ b/tests/query_test/test_udfs.py
@@ -24,7 +24,7 @@ from tests.common.impala_cluster import ImpalaCluster
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIfLocal
 from tests.common.test_dimensions import (
-    create_single_exec_option_dimension,
+    create_exec_option_dimension,
     create_uncompressed_text_dimension)
 from tests.util.calculation_util import get_random_id
 from tests.util.filesystem_utils import get_fs_path, IS_S3
@@ -37,48 +37,46 @@ class TestUdfs(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestUdfs, cls).add_test_dimensions()
-    # Without limiting the test suite to a single exec option, the tests will fail
-    # because the same test case may be executed in parallel with different exec option
-    # values leading to conflicting DDL ops.
-    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
+    cls.TestMatrix.add_dimension(
+      create_exec_option_dimension(disable_codegen_options=[False, True]))
 
     # There is no reason to run these tests using all dimensions.
     cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
 
-  def test_native_functions(self, vector):
-    database = 'native_function_test'
-
+  def test_native_functions(self, vector, unique_database):
     self.__load_functions(
-      self.create_udfs_template, vector, database,
+      self.create_udfs_template, vector, unique_database,
       get_fs_path('/test-warehouse/libTestUdfs.so'))
     self.__load_functions(
-      self.create_sample_udas_template, vector, database,
+      self.create_sample_udas_template, vector, unique_database,
       get_fs_path('/test-warehouse/libudasample.so'))
     self.__load_functions(
-      self.create_test_udas_template, vector, database,
+      self.create_test_udas_template, vector, unique_database,
       get_fs_path('/test-warehouse/libTestUdas.so'))
 
-    self.run_test_case('QueryTest/udf', vector, use_db=database)
-    if not IS_S3: # S3 doesn't support INSERT
-      self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
-    self.run_test_case('QueryTest/uda', vector, use_db=database)
+    self.run_test_case('QueryTest/udf', vector, use_db=unique_database)
+    if not vector.get_value('exec_option')['disable_codegen']:
+      self.run_test_case('QueryTest/udf-codegen-required', vector, use_db=unique_database)
+    self.run_test_case('QueryTest/udf-init-close', vector, use_db=unique_database)
+    self.run_test_case('QueryTest/uda', vector, use_db=unique_database)
 
-  def test_ir_functions(self, vector):
-    database = 'ir_function_test'
+  def test_ir_functions(self, vector, unique_database):
+    if vector.get_value('exec_option')['disable_codegen']:
+      # IR functions require codegen to be enabled.
+      return
     self.__load_functions(
-      self.create_udfs_template, vector, database,
+      self.create_udfs_template, vector, unique_database,
       get_fs_path('/test-warehouse/test-udfs.ll'))
-    self.run_test_case('QueryTest/udf', vector, use_db=database)
-    if not IS_S3: # S3 doesn't support INSERT
-      self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
+    self.run_test_case('QueryTest/udf', vector, use_db=unique_database)
+    self.run_test_case('QueryTest/udf-init-close', vector, use_db=unique_database)
 
-  def test_udf_errors(self, vector):
+  def test_udf_errors(self, vector, unique_database):
     # Disable codegen to force interpretation path to be taken.
     # Aim to exercise two failure cases:
     # 1. too many arguments
     # 2. IR UDF
     vector.get_value('exec_option')['disable_codegen'] = 1
-    self.run_test_case('QueryTest/udf-errors', vector)
+    self.run_test_case('QueryTest/udf-errors', vector, use_db=unique_database)
 
   def test_udf_invalid_symbol(self, vector):
     """ IMPALA-1642: Impala crashes if the symbol for a Hive UDF doesn't exist
@@ -98,17 +96,9 @@ class TestUdfs(ImpalaTestSuite):
     finally:
       self.client.execute(drop_fn_stmt)
 
-  def test_java_udfs(self, vector):
-    self.client.execute("create database if not exists java_udfs_test "
-        "location '%s'" % get_fs_path('/test-warehouse/java_udf_test.db'))
-    self.client.execute("create database if not exists udf_test "
-        "location '%s'" % get_fs_path('/test-warehouse/udf_test.db'))
-    try:
-      self.run_test_case('QueryTest/load-java-udfs', vector)
-      self.run_test_case('QueryTest/java-udf', vector)
-    finally:
-      self.client.execute("drop database if exists java_udfs_test cascade")
-      self.client.execute("drop database if exists udf_test cascade")
+  def test_java_udfs(self, vector, unique_database):
+    self.run_test_case('QueryTest/load-java-udfs', vector, use_db=unique_database)
+    self.run_test_case('QueryTest/java-udf', vector, use_db=unique_database)
 
   @SkipIfLocal.multiple_impalad
   def test_hive_udfs_missing_jar(self, vector, unique_database):
@@ -150,13 +140,8 @@ class TestUdfs(ImpalaTestSuite):
     except ImpalaBeeswaxException, e:
       assert "Failed to get file info" in str(e)
 
-  def test_libs_with_same_filenames(self, vector):
-    self.client.execute("create database if not exists same_lib_filename_udf_test "
-        "location '%s'" % get_fs_path('/test-warehouse/same_lib_filename_udf_test.db'))
-    try:
-      self.run_test_case('QueryTest/libs_with_same_filenames', vector)
-    finally:
-      self.client.execute("drop database if exists same_lib_filename_udf_test cascade")
+  def test_libs_with_same_filenames(self, vector, unique_database):
+    self.run_test_case('QueryTest/libs_with_same_filenames', vector, use_db=unique_database)
 
   def test_udf_update_via_drop(self, vector, unique_database):
     """Test updating the UDF binary without restarting Impala. Dropping
@@ -263,19 +248,19 @@ class TestUdfs(ImpalaTestSuite):
   # Run serially because this will blow the process limit, potentially causing other
   # queries to fail
   @pytest.mark.execute_serially
-  def test_mem_limits(self, vector):
+  def test_mem_limits(self, vector, unique_database):
     # Set the mem limit high enough that a simple scan can run
     mem_limit = 1024 * 1024
     vector.get_value('exec_option')['mem_limit'] = mem_limit
 
     try:
-      self.run_test_case('QueryTest/udf-mem-limit', vector)
+      self.run_test_case('QueryTest/udf-mem-limit', vector, use_db=unique_database)
       assert False, "Query was expected to fail"
     except ImpalaBeeswaxException, e:
       self.__check_exception(e)
 
     try:
-      self.run_test_case('QueryTest/uda-mem-limit', vector)
+      self.run_test_case('QueryTest/uda-mem-limit', vector, use_db=unique_database)
       assert False, "Query was expected to fail"
     except ImpalaBeeswaxException, e:
       self.__check_exception(e)
@@ -330,6 +315,8 @@ returns decimal(9,2) location '{location}' update_fn='SumSmallDecimalUpdate';
   create_test_udas_template = """
 drop function if exists {database}.trunc_sum(double);
 drop function if exists {database}.arg_is_const(int, int);
+drop function if exists {database}.toggle_null(int);
+drop function if exists {database}.count_nulls(bigint);
 
 create database if not exists {database};
 
@@ -341,6 +328,14 @@ serialize_fn='TruncSumSerialize' finalize_fn='TruncSumFinalize';
 create aggregate function {database}.arg_is_const(int, int)
 returns boolean location '{location}'
 init_fn='ArgIsConstInit' update_fn='ArgIsConstUpdate' merge_fn='ArgIsConstMerge';
+
+create aggregate function {database}.toggle_null(int)
+returns int location '{location}'
+update_fn='ToggleNullUpdate' merge_fn='ToggleNullMerge';
+
+create aggregate function {database}.count_nulls(bigint)
+returns bigint location '{location}'
+update_fn='CountNullsUpdate' merge_fn='CountNullsMerge';
 """
 
   # Create test UDF functions in {database} from library {location}