IMPALA-775,IMPALA-3374: Upgrade LLVM to 3.8.0

This requires various changes for Impala to be fully functional with the new version of LLVM. The original JIT was removed from LLVM, so we need to switch to the new MCJIT API and implementation. MCJIT only supports module-at-a-time compilation, so the module must be finalised before any compilation happens. We didn't depend on the old behaviour deeply, but various small fixes were required. MCJIT requires that every IR module has a name. We relied on the old JIT's workaround for the __dso_handle symbol, which we have to emulate for MCJIT with a custom memory manager until we can get rid of global initialisers in cross-compiled code. LLVM made a number of incompatible API changes and reorganised headers. Clang took over responsibility for padding structs by marking structs as packed and inserting bytes so that members are aligned correctly (previously it relied on LLVM aligning struct members based on the target's alignment rules). This means Impala also needs to manually pad its structs since clang-emitted structs look to LLVM like they have do not need to be inlined. Our inlining pass would require some modification to work and is redundant with LLVM's inlining pass, so it was removed along with the unused subexpr elimination pass. LLVM now depends on another system library, libtinfo, so we use llvm-config to get the required system libs directly. There were various issues with __builtin_add_overflow and __builtin_mul_overflow that are newly available in LLVM 3.8. First, LLVM emitted a call to a function in libclang_rt, which we don't link in and which has symbols that conflict with the gcc runtime library. Second, the performance actually regressed by using the builtins (I tested this manually by copying across the definition of the required function). Change-Id: I17d7afd05ad3b472a0bfe035bfc3daada5597b2d Reviewed-on: http://gerrit.cloudera.org:8080/2486 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Internal Jenkins
2025-12-23 21:08:39 -05:00 · 2016-02-15 19:25:52 -08:00
parent e54c99cd9a
commit be415f380f
33 changed files with 545 additions and 757 deletions
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -191,16 +191,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/generated-sources)
 set(CLANG_INCLUDE_FLAGS)

 if (IMPALA_TOOLCHAIN)
-  # When the Toolchain is used we use LLVM 3.3 that was built in a different path that it
-  # is invoked from, and a GCC that resides in a different location. LVVM 3.3 relies on
-  # hard-coded path information about where to find the system headers and does not support
-  # specifying the -gcc-toolchain flag to dynamically provide this information. Because of
-  # these reasons we need to manually add the system c++ headers to the path when we
-  # compile the IR code with clang.
-  set(CLANG_BASE_FLAGS
-    "-I${GCC_ROOT}/include/c++/$ENV{IMPALA_GCC_VERSION}"
-    "-I${GCC_ROOT}/include/c++/$ENV{IMPALA_GCC_VERSION}/backward"
-    "-I${GCC_ROOT}/include/c++/$ENV{IMPALA_GCC_VERSION}/x86_64-unknown-linux-gnu")
+  # Ensure that clang uses the gcc toolchain headers.
+  set(CLANG_BASE_FLAGS --gcc-toolchain=${GCC_ROOT})
  set(CLANG_INCLUDE_FLAGS ${CLANG_BASE_FLAGS})
 endif()

@@ -328,6 +320,8 @@ set (IMPALA_LINK_LIBS ${IMPALA_LINK_LIBS}
  ${JAVA_JVM_LIBRARY}
  kudu_client
  -lrt
+  -ltinfo # Needed for LLVM
+  -ldl # Needed for LLVM
 )

 # The above link list does not include tcmalloc. This is because the Impala JVM support
--- a/be/src/benchmarks/hash-benchmark.cc
+++ b/be/src/benchmarks/hash-benchmark.cc
@@ -381,7 +381,7 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) {
  Value* data = builder.CreateGEP(args[1], offset);

  Value* seed = codegen->GetIntConstant(TYPE_INT, HashUtil::FNV_SEED);
-  seed = builder.CreateCall3(fixed_fn, data, dummy_len, seed);
+  seed = builder.CreateCall(fixed_fn, ArrayRef<Value*>({data, dummy_len, seed}));

  // Get the string data
  if (mixed) {
@@ -389,11 +389,11 @@ Function* CodegenCrcHash(LlvmCodeGen* codegen, bool mixed) {
        data, codegen->GetIntConstant(TYPE_INT, fixed_byte_size));
    Value* string_val =
        builder.CreateBitCast(string_data, codegen->GetPtrType(TYPE_STRING));
-    Value* str_ptr = builder.CreateStructGEP(string_val, 0);
-    Value* str_len = builder.CreateStructGEP(string_val, 1);
+    Value* str_ptr = builder.CreateStructGEP(NULL, string_val, 0);
+    Value* str_len = builder.CreateStructGEP(NULL, string_val, 1);
    str_ptr = builder.CreateLoad(str_ptr);
    str_len = builder.CreateLoad(str_len);
-    seed = builder.CreateCall3(string_hash_fn, str_ptr, str_len, seed);
+    seed = builder.CreateCall(string_hash_fn, ArrayRef<Value*>({str_ptr, str_len, seed}));
  }

  Value* result = builder.CreateGEP(args[2], counter);
@@ -426,7 +426,7 @@ int main(int argc, char **argv) {

  Status status;
  scoped_ptr<LlvmCodeGen> codegen;
-  status = LlvmCodeGen::LoadImpalaIR(&obj_pool, "test", &codegen);
+  status = LlvmCodeGen::CreateImpalaCodegen(&obj_pool, "test", &codegen);
  if (!status.ok()) {
    cout << "Could not start codegen.";
    return -1;
--- a/be/src/benchmarks/overflow-benchmark.cc
+++ b/be/src/benchmarks/overflow-benchmark.cc
@@ -143,6 +143,7 @@ static bool AdjustToSameScaleLookupTbl(const Decimal16Value& x, int x_scale,
 }

 #if 5 <= __GNUC__ || __has_builtin(__builtin_add_overflow)
+#define HAVE_BUILTIN_ADD_OVERFLOW
 template<typename RESULT_T>
 DecimalValue<RESULT_T> BuiltinAdd(const Decimal16Value& val, int this_scale,
    const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
@@ -239,13 +240,16 @@ DecimalValue<RESULT_T> Add(const Decimal16Value& val, int this_scale,
  }

 TEST_ADD(TestAdd, Add, true);
-#if 5 <= __GNUC__ || __has_builtin (__builtin_add_overflow)
+#ifdef HAVE_BUILTIN_ADD_OVERFLOW
 TEST_ADD(TestBuiltinAddOverflow, BuiltinAdd, false);
 #endif
 TEST_ADD(TestAddOverflowLookupTbl, AddLookupTbl, false);
 TEST_ADD(TestAddOverflow, Add, false);

-#if 5 <= __GNUC__ || __has_builtin(__builtin_mul_overflow)
+// Disabled __builtin_mul_overflow since Clang emits a call to __muloti4, which is
+// not implemented in the GCC runtime library.
+#if 5 <= __GNUC__
+#define HAVE_BUILTIN_MUL_OVERFLOW
 template<typename RESULT_T>
 DecimalValue<RESULT_T> BuiltinMultiply(const Decimal16Value& val, int this_scale,
    const Decimal16Value& other, int other_scale, int result_precision, int result_scale,
@@ -369,7 +373,7 @@ DecimalValue<RESULT_T> Multiply(const Decimal16Value& val, int this_scale,
  }

 TEST_MUL(TestMul, Multiply, true);
-#if 5 <= __GNUC__ || __has_builtin (__builtin_mul_overflow)
+#ifdef HAVE_BUILTIN_MUL_OVERFLOW
 TEST_MUL(TestBuiltinMulOverflow, BuiltinMultiply, false);
 #endif
 TEST_MUL(TestMulOverflowCheckMSB, MultiplyCheckMSB, false);
@@ -438,7 +442,7 @@ int main(int argc, char** argv) {

  Benchmark add_overflow_suite("Decimal16 Add Overflow");
  add_overflow_suite.AddBenchmark("without_check_overflow", TestAdd, &data);
-#if 5 <= __GNUC__ || __has_builtin (__builtin_mul_overflow)
+#ifdef HAVE_BUILTIN_ADD_OVERFLOW
  add_overflow_suite.AddBenchmark("builtin_add_overflow", TestBuiltinAddOverflow, &data);
 #endif
  add_overflow_suite.AddBenchmark("add_overflow_lookup_table",
@@ -448,7 +452,7 @@ int main(int argc, char** argv) {

  Benchmark mul_overflow_suite("Decimal16 Mul Overflow");
  mul_overflow_suite.AddBenchmark("without_check_overflow", TestMul, &data);
-#if 5 <= __GNUC__ || __has_builtin (__builtin_mul_overflow)
+#ifdef HAVE_BUILTIN_MUL_OVERFLOW
  mul_overflow_suite.AddBenchmark("builtin_mul_overflow", TestBuiltinMulOverflow, &data);
 #endif
  mul_overflow_suite.AddBenchmark("mul_overflow_check_msb",
--- a/be/src/codegen/CMakeLists.txt
+++ b/be/src/codegen/CMakeLists.txt
@@ -26,7 +26,6 @@ set(IR_NO_SSE_C_FILE $ENV{IMPALA_HOME}/be/generated-sources/impala-ir/impala-no-
 add_library(CodeGen
  codegen-anyval.cc
  llvm-codegen.cc
-  subexpr-elimination.cc
  instruction-counter.cc
  ${IR_SSE_C_FILE}
  ${IR_NO_SSE_C_FILE}
--- a/be/src/codegen/codegen-anyval.cc
+++ b/be/src/codegen/codegen-anyval.cc
@@ -570,14 +570,14 @@ Value* CodegenAnyVal::Eq(CodegenAnyVal* other) {
    case TYPE_VARCHAR: {
      Function* eq_fn =
          codegen_->GetFunction(IRFunction::CODEGEN_ANYVAL_STRING_VAL_EQ, false);
-      return builder_->CreateCall2(
-          eq_fn, GetUnloweredPtr(), other->GetUnloweredPtr(), "eq");
+      return builder_->CreateCall(
+          eq_fn, ArrayRef<Value*>({GetUnloweredPtr(), other->GetUnloweredPtr()}), "eq");
    }
    case TYPE_TIMESTAMP: {
      Function* eq_fn =
          codegen_->GetFunction(IRFunction::CODEGEN_ANYVAL_TIMESTAMP_VAL_EQ, false);
-      return builder_->CreateCall2(
-          eq_fn, GetUnloweredPtr(), other->GetUnloweredPtr(), "eq");
+      return builder_->CreateCall(
+          eq_fn, ArrayRef<Value*>({GetUnloweredPtr(), other->GetUnloweredPtr()}), "eq");
    }
    default:
      DCHECK(false) << "NYI: " << type_.DebugString();
@@ -607,12 +607,14 @@ Value* CodegenAnyVal::EqToNativePtr(Value* native_ptr) {
    case TYPE_VARCHAR: {
      Function* eq_fn =
          codegen_->GetFunction(IRFunction::CODEGEN_ANYVAL_STRING_VALUE_EQ, false);
-      return builder_->CreateCall2(eq_fn, GetUnloweredPtr(), native_ptr, "cmp_raw");
+      return builder_->CreateCall(eq_fn,
+          ArrayRef<Value*>({GetUnloweredPtr(), native_ptr}), "cmp_raw");
    }
    case TYPE_TIMESTAMP: {
      Function* eq_fn =
          codegen_->GetFunction(IRFunction::CODEGEN_ANYVAL_TIMESTAMP_VALUE_EQ, false);
-      return builder_->CreateCall2(eq_fn, GetUnloweredPtr(), native_ptr, "cmp_raw");
+      return builder_->CreateCall(eq_fn,
+          ArrayRef<Value*>({GetUnloweredPtr(), native_ptr}), "cmp_raw");
    }
    default:
      DCHECK(false) << "NYI: " << type_.DebugString();
--- a/be/src/codegen/instruction-counter-test.cc
+++ b/be/src/codegen/instruction-counter-test.cc
@@ -18,10 +18,8 @@

 #include "llvm/IR/Module.h"
 #include "llvm/IR/Function.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/CallingConv.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Assembly/PrintModulePass.h"
 #include "llvm/IR/IRBuilder.h"

 #include "codegen/llvm-codegen.h"
@@ -52,11 +50,14 @@ Module* CodegenMulAdd(LLVMContext* context) {
  Function* mul_add = cast<Function>(c);
  mul_add->setCallingConv(CallingConv::C);
  Function::arg_iterator args = mul_add->arg_begin();
-  Value* x = args++;
+  Value* x = &*args;
+  ++args;
  x->setName("x");
-  Value* y = args++;
+  Value* y = &*args;
+  ++args;
  y->setName("y");
-  Value* z = args++;
+  Value* z = &*args;
+  ++args;
  z->setName("z");
  BasicBlock* block = BasicBlock::Create(*context, "entry", mul_add);
  IRBuilder<> builder(block);
@@ -113,9 +114,11 @@ Module* CodegenGcd(LLVMContext* context) {
      IntegerType::get(*context, 32), IntegerType::get(*context, 32), NULL);
  Function* gcd = cast<Function>(c);
  Function::arg_iterator args = gcd->arg_begin();
-  Value* x = args++;
+  Value* x = &*args;
+  ++args;
  x->setName("x");
-  Value* y = args++;
+  Value* y = &*args;
+  ++args;
  y->setName("y");
  BasicBlock* entry = BasicBlock::Create(*context, "entry", gcd);
  BasicBlock* ret = BasicBlock::Create(*context, "return", gcd);
--- a/be/src/codegen/llvm-codegen-test.cc
+++ b/be/src/codegen/llvm-codegen-test.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <memory>
 #include <string>
 #include <boost/thread/thread.hpp>

@@ -26,6 +27,7 @@
 #include "common/names.h"

 using namespace llvm;
+using std::unique_ptr;

 namespace impala {

@@ -39,22 +41,23 @@ class LlvmCodeGenTest : public testing:: Test {
      LlvmCodeGen object2(&pool, "Test");
      LlvmCodeGen object3(&pool, "Test");

-      ASSERT_OK(object1.Init());
-      ASSERT_OK(object2.Init());
-      ASSERT_OK(object3.Init());
+      ASSERT_OK(object1.Init(unique_ptr<Module>(new Module("Test", object1.context()))));
+      ASSERT_OK(object2.Init(unique_ptr<Module>(new Module("Test", object2.context()))));
+      ASSERT_OK(object3.Init(unique_ptr<Module>(new Module("Test", object3.context()))));
    }
  }

  // Wrapper to call private test-only methods on LlvmCodeGen object
-  static Status LoadFromFile(ObjectPool* pool, const string& filename,
+  static Status CreateFromFile(ObjectPool* pool, const string& filename,
      scoped_ptr<LlvmCodeGen>* codegen) {
-    return LlvmCodeGen::LoadFromFile(pool, filename, "test", codegen);
+    return LlvmCodeGen::CreateFromFile(pool, filename, "test", codegen);
  }

  static LlvmCodeGen* CreateCodegen(ObjectPool* pool) {
    LlvmCodeGen* codegen = pool->Add(new LlvmCodeGen(pool, "Test"));
    if (codegen != NULL) {
-      Status status = codegen->Init();
+      Status status = codegen->Init(
+          unique_ptr<Module>(new Module("Test", codegen->context())));
      if (!status.ok()) return NULL;
    }
    return codegen;
@@ -64,18 +67,20 @@ class LlvmCodeGenTest : public testing:: Test {
    codegen->ClearHashFns();
  }

-  static void* JitFunction(LlvmCodeGen* codegen, Function* function) {
-    return codegen->JitFunction(function);
+  static void AddFunctionToJit(LlvmCodeGen* codegen, llvm::Function* fn, void** fn_ptr) {
+    // Bypass Impala-specific logic in AddFunctionToJit() that assumes Impala's struct
+    // types are available in the module.
+    return codegen->AddFunctionToJitInternal(fn, fn_ptr);
  }

-  static void AddFunctionToJit(LlvmCodeGen* codegen, llvm::Function* fn, void** fn_ptr) {
-    // Bypass Impala-specific logic in AddFunctionToJit().
-    codegen->fns_to_jit_compile_.push_back(make_pair(fn, fn_ptr));
+  static bool VerifyFunction(LlvmCodeGen* codegen, llvm::Function* fn) {
+    return codegen->VerifyFunction(fn);
  }

  static Status FinalizeModule(LlvmCodeGen* codegen) {
    return codegen->FinalizeModule();
  }
+
 };

 // Simple test to just make and destroy llvmcodegen objects.  LLVM
@@ -99,7 +104,7 @@ TEST_F(LlvmCodeGenTest, BadIRFile) {
  ObjectPool pool;
  string module_file = "NonExistentFile.ir";
  scoped_ptr<LlvmCodeGen> codegen;
-  EXPECT_FALSE(LlvmCodeGenTest::LoadFromFile(&pool, module_file.c_str(), &codegen).ok());
+  EXPECT_FALSE(LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), &codegen).ok());
 }

 // IR for the generated linner loop
@@ -160,7 +165,7 @@ TEST_F(LlvmCodeGenTest, ReplaceFnCall) {

  // Part 1: Load the module and make sure everything is loaded correctly.
  scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGenTest::LoadFromFile(&pool, module_file.c_str(), &codegen));
+  ASSERT_OK(LlvmCodeGenTest::CreateFromFile(&pool, module_file.c_str(), &codegen));
  EXPECT_TRUE(codegen.get() != NULL);

  vector<Function*> functions;
@@ -192,35 +197,43 @@ TEST_F(LlvmCodeGenTest, ReplaceFnCall) {
  Function* jitted_loop = codegen->CloneFunction(loop);
  int num_replaced =
      codegen->ReplaceCallSites(jitted_loop, jitted_loop_call, loop_call_name);
-  EXPECT_EQ(num_replaced, 1);
-  EXPECT_TRUE(codegen->VerifyFunction(jitted_loop));
+  EXPECT_EQ(1, num_replaced);
+  EXPECT_TRUE(VerifyFunction(codegen.get(), jitted_loop));

-  // Part 4: Generate a new inner loop function and update 'loop'
+  // Part 4: Generate a new inner loop function and a new loop function
  Function* jitted_loop_call2 = CodegenInnerLoop(codegen.get(), &jitted_counter, -2);
-  num_replaced = codegen->ReplaceCallSites(loop, jitted_loop_call2, loop_call_name);
-  EXPECT_EQ(num_replaced, 1);
-  EXPECT_TRUE(codegen->VerifyFunction(loop));
+  Function* jitted_loop2 = codegen->CloneFunction(loop);
+  num_replaced = codegen->ReplaceCallSites(jitted_loop2, jitted_loop_call2, loop_call_name);
+  EXPECT_EQ(1, num_replaced);
+  EXPECT_TRUE(VerifyFunction(codegen.get(), jitted_loop2));

-  // Part 5: JIT and run both loops
+  void* original_loop = NULL;
+  AddFunctionToJit(codegen.get(), loop, &original_loop);
  void* new_loop = NULL;
  AddFunctionToJit(codegen.get(), jitted_loop, &new_loop);
  void* new_loop2 = NULL;
-  AddFunctionToJit(codegen.get(), loop, &new_loop2);
+  AddFunctionToJit(codegen.get(), jitted_loop2, &new_loop2);

+  // Part 5: compile all the functions (we can't add more functions after jitting with
+  // MCJIT) then run them.
  ASSERT_OK(LlvmCodeGenTest::FinalizeModule(codegen.get()));
+  ASSERT_TRUE(original_loop != NULL);
  ASSERT_TRUE(new_loop != NULL);
  ASSERT_TRUE(new_loop2 != NULL);

+  TestLoopFn original_loop_fn = reinterpret_cast<TestLoopFn>(original_loop);
+  original_loop_fn(5);
+  EXPECT_EQ(0, jitted_counter);
+
  TestLoopFn new_loop_fn = reinterpret_cast<TestLoopFn>(new_loop);
-  EXPECT_EQ(jitted_counter, 0);
  new_loop_fn(5);
-  EXPECT_EQ(jitted_counter, 5);
+  EXPECT_EQ(5, jitted_counter);
  new_loop_fn(5);
-  EXPECT_EQ(jitted_counter, 10);
+  EXPECT_EQ(10, jitted_counter);

  TestLoopFn new_loop_fn2 = reinterpret_cast<TestLoopFn>(new_loop2);
  new_loop_fn2(5);
-  EXPECT_EQ(jitted_counter, 0);
+  EXPECT_EQ(0, jitted_counter);
 }

 // Test function for c++/ir interop for strings.  Function will do:
@@ -254,14 +267,14 @@ Function* CodegenStringTest(LlvmCodeGen* codegen) {
  Function* interop_fn = prototype.GeneratePrototype(&builder, &str);

  // strval->ptr[0] = 'A'
-  Value* str_ptr = builder.CreateStructGEP(str, 0, "str_ptr");
+  Value* str_ptr = builder.CreateStructGEP(NULL, str, 0, "str_ptr");
  Value* ptr = builder.CreateLoad(str_ptr, "ptr");
  Value* first_char_offset[] = { codegen->GetIntConstant(TYPE_INT, 0) };
  Value* first_char_ptr = builder.CreateGEP(ptr, first_char_offset, "first_char_ptr");
  builder.CreateStore(codegen->GetIntConstant(TYPE_TINYINT, 'A'), first_char_ptr);

  // Update and return old len
-  Value* len_ptr = builder.CreateStructGEP(str, 1, "len_ptr");
+  Value* len_ptr = builder.CreateStructGEP(NULL, str, 1, "len_ptr");
  Value* len = builder.CreateLoad(len_ptr, "len");
  builder.CreateStore(codegen->GetIntConstant(TYPE_INT, 1), len_ptr);
  builder.CreateRet(len);
@@ -276,7 +289,7 @@ TEST_F(LlvmCodeGenTest, StringValue) {
  ObjectPool pool;

  scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::LoadImpalaIR(&pool, "test", &codegen));
+  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
  EXPECT_TRUE(codegen.get() != NULL);

  string str("Test");
@@ -289,11 +302,13 @@ TEST_F(LlvmCodeGenTest, StringValue) {

  Function* string_test_fn = CodegenStringTest(codegen.get());
  EXPECT_TRUE(string_test_fn != NULL);
-  EXPECT_TRUE(codegen->VerifyFunction(string_test_fn));
+  EXPECT_TRUE(VerifyFunction(codegen.get(), string_test_fn));

  // Jit compile function
-  void* jitted_fn = LlvmCodeGenTest::JitFunction(codegen.get(), string_test_fn);
-  EXPECT_TRUE(jitted_fn != NULL);
+  void* jitted_fn = NULL;
+  AddFunctionToJit(codegen.get(), string_test_fn, &jitted_fn);
+  ASSERT_OK(LlvmCodeGenTest::FinalizeModule(codegen.get()));
+  ASSERT_TRUE(jitted_fn != NULL);

  // Call IR function
  typedef int (*TestStringInteropFn)(StringValue*);
@@ -317,7 +332,7 @@ TEST_F(LlvmCodeGenTest, MemcpyTest) {
  ObjectPool pool;

  scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::LoadImpalaIR(&pool, "test", &codegen));
+  ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
  ASSERT_TRUE(codegen.get() != NULL);

  LlvmCodeGen::FnPrototype prototype(codegen.get(), "MemcpyTest", codegen->void_type());
@@ -338,7 +353,9 @@ TEST_F(LlvmCodeGenTest, MemcpyTest) {
  fn = codegen->FinalizeFunction(fn);
  ASSERT_TRUE(fn != NULL);

-  void* jitted_fn = LlvmCodeGenTest::JitFunction(codegen.get(), fn);
+  void* jitted_fn = NULL;
+  LlvmCodeGenTest::AddFunctionToJit(codegen.get(), fn, &jitted_fn);
+  ASSERT_OK(LlvmCodeGenTest::FinalizeModule(codegen.get()));
  ASSERT_TRUE(jitted_fn != NULL);

  typedef void (*TestMemcpyFn)(char*, char*, int64_t);
@@ -357,21 +374,21 @@ TEST_F(LlvmCodeGenTest, HashTest) {
  const char* data1 = "test string";
  const char* data2 = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";

-  scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::LoadImpalaIR(&pool, "test", &codegen));
-  ASSERT_TRUE(codegen.get() != NULL);
-
  bool restore_sse_support = false;

-  Value* llvm_data1 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
-      const_cast<char*>(data1));
-  Value* llvm_data2 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
-      const_cast<char*>(data2));
-  Value* llvm_len1 = codegen->GetIntConstant(TYPE_INT, strlen(data1));
-  Value* llvm_len2 = codegen->GetIntConstant(TYPE_INT, strlen(data2));
-
  // Loop to test both the sse4 on/off paths
  for (int i = 0; i < 2; ++i) {
+    scoped_ptr<LlvmCodeGen> codegen;
+    ASSERT_OK(LlvmCodeGen::CreateImpalaCodegen(&pool, "test", &codegen));
+    ASSERT_TRUE(codegen.get() != NULL);
+
+    Value* llvm_data1 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
+        const_cast<char*>(data1));
+    Value* llvm_data2 = codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
+        const_cast<char*>(data2));
+    Value* llvm_len1 = codegen->GetIntConstant(TYPE_INT, strlen(data1));
+    Value* llvm_len2 = codegen->GetIntConstant(TYPE_INT, strlen(data2));
+
    uint32_t expected_hash = 0;
    expected_hash = HashUtil::Hash(data1, strlen(data1), expected_hash);
    expected_hash = HashUtil::Hash(data2, strlen(data2), expected_hash);
@@ -394,15 +411,20 @@ TEST_F(LlvmCodeGenTest, HashTest) {
    ASSERT_TRUE(generic_hash_fn != NULL);

    Value* seed = codegen->GetIntConstant(TYPE_INT, 0);
-    seed = builder.CreateCall3(data1_hash_fn, llvm_data1, llvm_len1, seed);
-    seed = builder.CreateCall3(data2_hash_fn, llvm_data2, llvm_len2, seed);
-    seed = builder.CreateCall3(generic_hash_fn, llvm_data1, llvm_len1, seed);
+    seed = builder.CreateCall(data1_hash_fn,
+        ArrayRef<Value*>({llvm_data1, llvm_len1, seed}));
+    seed = builder.CreateCall(data2_hash_fn,
+        ArrayRef<Value*>({llvm_data2, llvm_len2, seed}));
+    seed = builder.CreateCall(generic_hash_fn,
+        ArrayRef<Value*>({llvm_data1, llvm_len1, seed}));
    builder.CreateRet(seed);

    fn_fixed = codegen->FinalizeFunction(fn_fixed);
    ASSERT_TRUE(fn_fixed != NULL);

-    void* jitted_fn = LlvmCodeGenTest::JitFunction(codegen.get(), fn_fixed);
+    void* jitted_fn = NULL;
+    LlvmCodeGenTest::AddFunctionToJit(codegen.get(), fn_fixed, &jitted_fn);
+    ASSERT_OK(LlvmCodeGenTest::FinalizeModule(codegen.get()));
    ASSERT_TRUE(jitted_fn != NULL);

    typedef uint32_t (*TestHashFn)();
@@ -430,8 +452,8 @@ TEST_F(LlvmCodeGenTest, HashTest) {
 }

 int main(int argc, char **argv) {
-  impala::InitCommonRuntime(argc, argv, false, impala::TestInfo::BE_TEST);
  ::testing::InitGoogleTest(&argc, argv);
+  impala::InitCommonRuntime(argc, argv, false, impala::TestInfo::BE_TEST);
  impala::LlvmCodeGen::InitializeLlvm();
  return RUN_ALL_TESTS();
 }
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -17,28 +17,30 @@
 #include <fstream>
 #include <iostream>
 #include <sstream>
+#include <boost/algorithm/string.hpp>
 #include <boost/thread/mutex.hpp>
 #include <gutil/strings/substitute.h>

 #include <llvm/ADT/Triple.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
 #include <llvm/Analysis/InstructionSimplify.h>
 #include <llvm/Analysis/Passes.h>
 #include <llvm/Bitcode/ReaderWriter.h>
 #include <llvm/ExecutionEngine/ExecutionEngine.h>
-#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/MCJIT.h>
 #include <llvm/IR/DataLayout.h>
-#include <llvm/Linker.h>
-#include <llvm/PassManager.h>
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/NoFolder.h>
+#include <llvm/IR/Verifier.h>
+#include <llvm/Linker/Linker.h>
 #include <llvm/Support/DynamicLibrary.h>
 #include <llvm/Support/ErrorHandling.h>
 #include <llvm/Support/Host.h>
-#include "llvm/Support/InstIterator.h"
-#include <llvm/Support/NoFolder.h>
 #include <llvm/Support/TargetRegistry.h>
 #include <llvm/Support/TargetSelect.h>
 #include <llvm/Support/raw_ostream.h>
-#include <llvm/Support/system_error.h>
-#include <llvm/Target/TargetLibraryInfo.h>
 #include <llvm/Transforms/IPO.h>
 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
 #include <llvm/Transforms/Scalar.h>
@@ -49,7 +51,7 @@
 #include "codegen/codegen-anyval.h"
 #include "codegen/impala-ir-data.h"
 #include "codegen/instruction-counter.h"
-#include "codegen/subexpr-elimination.h"
+#include "codegen/mcjit-mem-mgr.h"
 #include "impala-ir/impala-ir-names.h"
 #include "runtime/hdfs-fs-cache.h"
 #include "util/cpu-info.h"
@@ -62,6 +64,7 @@
 using namespace llvm;
 using namespace strings;
 using std::fstream;
+using std::unique_ptr;

 DEFINE_bool(print_llvm_ir_instruction_count, false,
    "if true, prints the instruction counts of all JIT'd functions");
@@ -77,8 +80,10 @@ DECLARE_string(local_library_dir);

 namespace impala {

-static mutex llvm_initialization_lock;
-static bool llvm_initialized = false;
+bool LlvmCodeGen::llvm_initialized_ = false;
+
+string LlvmCodeGen::cpu_name_;
+vector<string> LlvmCodeGen::cpu_attrs_;

 static void LlvmCodegenHandleError(void* user_data, const std::string& reason,
    bool gen_crash_diag) {
@@ -86,18 +91,15 @@ static void LlvmCodegenHandleError(void* user_data, const std::string& reason,
 }

 void LlvmCodeGen::InitializeLlvm(bool load_backend) {
-  mutex::scoped_lock initialization_lock(llvm_initialization_lock);
-  if (llvm_initialized) return;
+  DCHECK(!llvm_initialized_);
  llvm::remove_fatal_error_handler();
  llvm::install_fatal_error_handler(LlvmCodegenHandleError);
-  // This allocates a global llvm struct and enables multithreading.
-  // There is no real good time to clean this up but we only make it once.
-  bool result = llvm::llvm_start_multithreaded();
-  DCHECK(result);
-  // This can *only* be called once per process and is used to setup
-  // dynamically linking jitted code.
+  // These functions can *only* be called once per process and are used to set up
+  // LLVM subsystems for code generation targeting the machine we're running on.
  llvm::InitializeNativeTarget();
-  llvm_initialized = true;
+  llvm::InitializeNativeTargetAsmPrinter();
+  llvm::InitializeNativeTargetAsmParser();
+  llvm_initialized_ = true;

  if (load_backend) {
    string path;
@@ -106,6 +108,12 @@ void LlvmCodeGen::InitializeLlvm(bool load_backend) {
    bool failed = llvm::sys::DynamicLibrary::LoadLibraryPermanently(path.c_str());
    DCHECK_EQ(failed, 0);
  }
+
+  cpu_name_ = llvm::sys::getHostCPUName().str();
+  LOG(INFO) << "CPU class for runtime code generation: " << cpu_name_;
+  GetHostCPUAttrs(&cpu_attrs_);
+  LOG(INFO) << "CPU flags for runtime code generation: "
+            << boost::algorithm::join(cpu_attrs_, ",");
 }

 LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) :
@@ -115,10 +123,9 @@ LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) :
  is_corrupt_(false),
  is_compiled_(false),
  context_(new llvm::LLVMContext()),
-  module_(NULL),
-  execution_engine_(NULL) {
+  module_(NULL) {

-  DCHECK(llvm_initialized) << "Must call LlvmCodeGen::InitializeLlvm first.";
+  DCHECK(llvm_initialized_) << "Must call LlvmCodeGen::InitializeLlvm first.";

  load_module_timer_ = ADD_TIMER(&profile_, "LoadTime");
  prepare_module_timer_ = ADD_TIMER(&profile_, "PrepareTime");
@@ -132,60 +139,63 @@ LlvmCodeGen::LlvmCodeGen(ObjectPool* pool, const string& id) :
  loaded_functions_.resize(IRFunction::FN_END);
 }

-Status LlvmCodeGen::LoadFromFile(ObjectPool* pool,
+Status LlvmCodeGen::CreateFromFile(ObjectPool* pool,
    const string& file, const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
  codegen->reset(new LlvmCodeGen(pool, id));
  SCOPED_TIMER((*codegen)->profile_.total_time_counter());

-  Module* loaded_module;
-  RETURN_IF_ERROR(LoadModuleFromFile(codegen->get(), file, &loaded_module));
-  (*codegen)->module_ = loaded_module;
+  unique_ptr<Module> loaded_module;
+  RETURN_IF_ERROR((*codegen)->LoadModuleFromFile(file, &loaded_module));

-  return (*codegen)->Init();
+  return (*codegen)->Init(std::move(loaded_module));
 }

-Status LlvmCodeGen::LoadFromMemory(ObjectPool* pool, MemoryBuffer* module_ir,
+Status LlvmCodeGen::CreateFromMemory(ObjectPool* pool, MemoryBufferRef module_ir,
    const string& module_name, const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
  codegen->reset(new LlvmCodeGen(pool, id));
  SCOPED_TIMER((*codegen)->profile_.total_time_counter());

-  Module* loaded_module;
-  RETURN_IF_ERROR(LoadModuleFromMemory(codegen->get(), module_ir, module_name,
-      &loaded_module));
-  (*codegen)->module_ = loaded_module;
+  unique_ptr<Module> loaded_module;
+  RETURN_IF_ERROR(
+      (*codegen)->LoadModuleFromMemory(module_ir, module_name, &loaded_module));

-  return (*codegen)->Init();
+  return (*codegen)->Init(std::move(loaded_module));
 }

-Status LlvmCodeGen::LoadModuleFromFile(LlvmCodeGen* codegen, const string& file,
-      llvm::Module** module) {
-  OwningPtr<MemoryBuffer> file_buffer;
+Status LlvmCodeGen::LoadModuleFromFile(const string& file, unique_ptr<Module>* module) {
+  unique_ptr<MemoryBuffer> file_buffer;
  {
-    SCOPED_TIMER(codegen->load_module_timer_);
+    SCOPED_TIMER(load_module_timer_);

-    llvm::error_code err = MemoryBuffer::getFile(file, file_buffer);
-    if (err.value() != 0) {
+    ErrorOr<unique_ptr<MemoryBuffer>> tmp_file_buffer = MemoryBuffer::getFile(file);
+    if (!tmp_file_buffer) {
      stringstream ss;
-      ss << "Could not load module " << file << ": " << err.message();
+      ss << "Could not load module " << file << ": "
+         << tmp_file_buffer.getError().message();
      return Status(ss.str());
    }
+    file_buffer = std::move(tmp_file_buffer.get());
  }

-  COUNTER_ADD(codegen->module_bitcode_size_, file_buffer->getBufferSize());
-  return LoadModuleFromMemory(codegen, file_buffer.get(), file, module);
+  COUNTER_ADD(module_bitcode_size_, file_buffer->getBufferSize());
+  return LoadModuleFromMemory(file_buffer->getMemBufferRef(), file, module);
 }

-Status LlvmCodeGen::LoadModuleFromMemory(LlvmCodeGen* codegen, MemoryBuffer* module_ir,
-      std::string module_name, llvm::Module** module) {
-  SCOPED_TIMER(codegen->prepare_module_timer_);
-  string error;
-  *module = ParseBitcodeFile(module_ir, codegen->context(), &error);
-  if (*module == NULL) {
+Status LlvmCodeGen::LoadModuleFromMemory(MemoryBufferRef module_ir, string module_name,
+    unique_ptr<Module>* module) {
+  DCHECK(!module_name.empty());
+  SCOPED_TIMER(prepare_module_timer_);
+  ErrorOr<unique_ptr<Module>> tmp_module =
+      parseBitcodeFile(module_ir, context());
+  if (!tmp_module) {
    stringstream ss;
-    ss << "Could not parse module " << module_name << ": " << error;
+    ss << "Could not parse module " << module_name << ": " << tmp_module.getError();
    return Status(ss.str());
  }
-  COUNTER_ADD(codegen->module_bitcode_size_, module_ir->getBufferSize());
+
+  *module = std::move(tmp_module.get());
+  (*module)->setModuleIdentifier(module_name);
+  COUNTER_ADD(module_bitcode_size_, module_ir.getBufferSize());
  return Status::OK();
 }

@@ -194,21 +204,23 @@ Status LlvmCodeGen::LinkModule(const string& file) {
  if (linked_modules_.find(file) != linked_modules_.end()) return Status::OK();

  SCOPED_TIMER(profile_.total_time_counter());
-  Module* new_module;
-  RETURN_IF_ERROR(LoadModuleFromFile(this, file, &new_module));
-  string error_msg;
-  bool error =
-      Linker::LinkModules(module_, new_module, Linker::DestroySource, &error_msg);
+  unique_ptr<Module> new_module;
+  RETURN_IF_ERROR(LoadModuleFromFile(file, &new_module));
+
+  // The module data layout must match the one selected by the execution engine.
+  new_module->setDataLayout(execution_engine_->getDataLayout());
+
+  bool error = Linker::linkModules(*module_, std::move(new_module));
  if (error) {
    stringstream ss;
-    ss << "Problem linking " << file << " to main module: " << error_msg;
+    ss << "Problem linking " << file << " to main module.";
    return Status(ss.str());
  }
  linked_modules_.insert(file);
  return Status::OK();
 }

-Status LlvmCodeGen::LoadImpalaIR(
+Status LlvmCodeGen::CreateImpalaCodegen(
    ObjectPool* pool, const string& id, scoped_ptr<LlvmCodeGen>* codegen_ret) {
  // Select the appropriate IR version.  We cannot use LLVM IR with sse instructions on
  // a machine without sse support (loading the module will fail regardless of whether
@@ -224,9 +236,9 @@ Status LlvmCodeGen::LoadImpalaIR(
        impala_no_sse_llvm_ir_len);
    module_name = "Impala IR with no SSE support";
  }
-  scoped_ptr<MemoryBuffer> module_ir_buf(
+  unique_ptr<MemoryBuffer> module_ir_buf(
      MemoryBuffer::getMemBuffer(module_ir, "", false));
-  RETURN_IF_ERROR(LoadFromMemory(pool, module_ir_buf.get(), module_name, id,
+  RETURN_IF_ERROR(CreateFromMemory(pool, module_ir_buf->getMemBufferRef(), module_name, id,
      codegen_ret));
  LlvmCodeGen* codegen = codegen_ret->get();

@@ -241,9 +253,9 @@ Status LlvmCodeGen::LoadImpalaIR(
  codegen->timestamp_val_type_ = codegen->GetType(TimestampValue::LLVM_CLASS_NAME);

  // Verify size is correct
-  const DataLayout* data_layout = codegen->execution_engine()->getDataLayout();
+  const DataLayout& data_layout = codegen->execution_engine()->getDataLayout();
  const StructLayout* layout =
-      data_layout->getStructLayout(static_cast<StructType*>(codegen->string_val_type_));
+      data_layout.getStructLayout(static_cast<StructType*>(codegen->string_val_type_));
  if (layout->getSizeInBytes() != sizeof(StringValue)) {
    DCHECK_EQ(layout->getSizeInBytes(), sizeof(StringValue));
    return Status("Could not create llvm struct type for StringVal");
@@ -289,10 +301,9 @@ Status LlvmCodeGen::LoadImpalaIR(
  return Status::OK();
 }

-Status LlvmCodeGen::Init() {
-  if (module_ == NULL) {
-    module_ = new Module(id_, context());
-  }
+Status LlvmCodeGen::Init(unique_ptr<Module> module) {
+  DCHECK(module != NULL);
+
  llvm::CodeGenOpt::Level opt_level = CodeGenOpt::Aggressive;
 #ifndef NDEBUG
  // For debug builds, don't generate JIT compiled optimized assembly.
@@ -300,21 +311,27 @@ Status LlvmCodeGen::Init() {
  // blows up the fe tests (which take ~10-20 ms each).
  opt_level = CodeGenOpt::None;
 #endif
-  EngineBuilder builder = EngineBuilder(module_).setOptLevel(opt_level);
-  //TODO Uncomment the below line as soon as we upgrade to LLVM 3.5 to enable SSE, if
-  // available. In LLVM 3.3 this is done automatically and cannot be enabled because
-  // for some reason SSE4 intrinsics selection will not work.
-  //builder.setMCPU(llvm::sys::getHostCPUName());
+  module_ = module.get();
+  EngineBuilder builder(std::move(module));
+  builder.setEngineKind(EngineKind::JIT);
+  builder.setOptLevel(opt_level);
+  builder.setMCJITMemoryManager(
+      unique_ptr<ImpalaMCJITMemoryManager>(new ImpalaMCJITMemoryManager()));
+  builder.setMCPU(cpu_name_);
+  builder.setMAttrs(cpu_attrs_);
  builder.setErrorStr(&error_string_);
+
  execution_engine_.reset(builder.create());
  if (execution_engine_ == NULL) {
-    // execution_engine_ will take ownership of the module if it is created
-    delete module_;
+    module_ = NULL; // module_ was owned by builder.
    stringstream ss;
    ss << "Could not create ExecutionEngine: " << error_string_;
    return Status(ss.str());
  }

+  // The module data layout must match the one selected by the execution engine.
+  module_->setDataLayout(execution_engine_->getDataLayout());
+
  void_type_ = Type::getVoidTy(context());
  ptr_type_ = PointerType::get(GetType(TYPE_TINYINT), 0);
  true_value_ = ConstantInt::get(context(), APInt(1, true, true));
@@ -326,16 +343,23 @@ Status LlvmCodeGen::Init() {
 }

 LlvmCodeGen::~LlvmCodeGen() {
-  for (set<Function*>::iterator iter = jitted_functions_.begin();
-      iter != jitted_functions_.end(); ++iter) {
-    execution_engine_->freeMachineCodeForFunction(*iter);
-  }
 }

 void LlvmCodeGen::EnableOptimizations(bool enable) {
  optimizations_enabled_ = enable;
 }

+void LlvmCodeGen::GetHostCPUAttrs(vector<string>* attrs) {
+  // LLVM's ExecutionEngine expects features to be enabled or disabled with a list
+  // of strings like ["+feature1", "-feature2"].
+  StringMap<bool> cpu_features;
+  llvm::sys::getHostCPUFeatures(cpu_features);
+  for (const StringMapEntry<bool>& entry: cpu_features) {
+    attrs->emplace_back(
+        Substitute("$0$1", entry.second ? "+" : "-", entry.first().data()));
+  }
+}
+
 string LlvmCodeGen::GetIR(bool full_module) const {
  string str;
  raw_string_ostream stream(str);
@@ -343,7 +367,7 @@ string LlvmCodeGen::GetIR(bool full_module) const {
    module_->print(stream, NULL);
  } else {
    for (int i = 0; i < codegend_functions_.size(); ++i) {
-      codegend_functions_[i]->print(stream, NULL);
+      codegend_functions_[i]->print(stream, true);
    }
  }
  return str;
@@ -369,8 +393,12 @@ Type* LlvmCodeGen::GetType(const ColumnType& type) {
      return Type::getDoubleTy(context());
    case TYPE_STRING:
    case TYPE_VARCHAR:
-    case TYPE_CHAR:
      return string_val_type_;
+    case TYPE_CHAR:
+      // IMPALA-3207: Codegen for CHAR is not yet implemented; this should not
+      // be called for TYPE_CHAR.
+      DCHECK(false) << "NYI";
+      return NULL;
    case TYPE_TIMESTAMP:
      return timestamp_val_type_;
    case TYPE_DECIMAL:
@@ -387,7 +415,7 @@ PointerType* LlvmCodeGen::GetPtrType(const ColumnType& type) {

 Type* LlvmCodeGen::GetType(const string& name) {
  Type* type = module_->getTypeByName(name);
-  DCHECK(type != NULL);
+  DCHECK(type != NULL) << name;
  return type;
 }

@@ -503,7 +531,12 @@ bool LlvmCodeGen::VerifyFunction(Function* fn) {
    }
  }

-  if (!is_corrupt_) is_corrupt_ = llvm::verifyFunction(*fn, PrintMessageAction);
+  if (!is_corrupt_) {
+    string str;
+    raw_string_ostream stream(str);
+    is_corrupt_ = verifyFunction(*fn, &stream);
+    if (is_corrupt_) LOG(ERROR) << str;
+  }

  if (is_corrupt_) {
    string fn_name = fn->getName(); // llvm has some fancy operator overloading
@@ -537,7 +570,7 @@ Function* LlvmCodeGen::FnPrototype::GeneratePrototype(
  for (Function::arg_iterator iter = fn->arg_begin();
      iter != fn->arg_end(); ++iter, ++idx) {
    iter->setName(args_[idx].name);
-    if (params != NULL) params[idx] = iter;
+    if (params != NULL) params[idx] = &*iter;
  }

  if (builder != NULL) {
@@ -551,6 +584,7 @@ Function* LlvmCodeGen::FnPrototype::GeneratePrototype(

 int LlvmCodeGen::ReplaceCallSites(Function* caller, Function* new_fn,
    const string& replacee_name) {
+  DCHECK(!is_compiled_);
  DCHECK(caller->getParent() == module_);
  DCHECK(caller != NULL);
  DCHECK(new_fn != NULL);
@@ -573,6 +607,7 @@ int LlvmCodeGen::ReplaceCallSites(Function* caller, Function* new_fn,
 }

 Function* LlvmCodeGen::CloneFunction(Function* fn) {
+  DCHECK(!is_compiled_);
  ValueToValueMapTy dummy_vmap;
  // CloneFunction() automatically gives the new function a unique name
  Function* fn_clone = llvm::CloneFunction(fn, dummy_vmap, false);
@@ -581,64 +616,6 @@ Function* LlvmCodeGen::CloneFunction(Function* fn) {
  return fn_clone;
 }

-// TODO: revisit this. Inlining all call sites might not be the right call.  We
-// probably need to make this more complicated and somewhat cost based or write
-// our own optimization passes.
-int LlvmCodeGen::InlineCallSites(Function* fn, bool skip_registered_fns) {
-  int functions_inlined = 0;
-  // Collect all call sites
-  vector<CallInst*> call_sites;
-
-  // loop over all blocks
-  Function::iterator block_iter = fn->begin();
-  while (block_iter != fn->end()) {
-    BasicBlock* block = block_iter++;
-    // loop over instructions in the block
-    BasicBlock::iterator instr_iter = block->begin();
-    while (instr_iter != block->end()) {
-      Instruction* instr = instr_iter++;
-      // look for call instructions
-      if (CallInst::classof(instr)) {
-        CallInst* call_instr = reinterpret_cast<CallInst*>(instr);
-        Function* called_fn = call_instr->getCalledFunction();
-        // called_fn will be NULL if it's a virtual function call, etc.
-        if (called_fn == NULL || !called_fn->hasFnAttribute(Attribute::AlwaysInline)) {
-          continue;
-        }
-        if (skip_registered_fns) {
-          if (registered_exprs_.find(called_fn) != registered_exprs_.end()) {
-            continue;
-          }
-        }
-        call_sites.push_back(call_instr);
-      }
-    }
-  }
-
-  // Inline all call sites.  InlineFunction can still fail (function is recursive, etc)
-  // but that always leaves the original function in a consistent state
-  for (int i = 0; i < call_sites.size(); ++i) {
-    llvm::InlineFunctionInfo info;
-    if (llvm::InlineFunction(call_sites[i], info)) {
-      ++functions_inlined;
-    }
-  }
-  return functions_inlined;
-}
-
-Function* LlvmCodeGen::OptimizeFunctionWithExprs(Function* fn) {
-  int num_inlined;
-  do {
-    // This assumes that all redundant exprs have been registered.
-    num_inlined = InlineCallSites(fn, false);
-  } while (num_inlined > 0);
-
-  // TODO(skye): fix subexpression elimination
-  // SubExprElimination subexpr_elim(this);
-  // subexpr_elim.Run(fn);
-  return FinalizeFunction(fn);
-}
-
 Function* LlvmCodeGen::FinalizeFunction(Function* function) {
  function->addFnAttr(llvm::Attribute::AlwaysInline);

@@ -670,16 +647,9 @@ Status LlvmCodeGen::FinalizeModule() {

  // Don't waste time optimizing module if there are no functions to JIT. This can happen
  // if the codegen object is created but no functions are successfully codegen'd.
-  if (optimizations_enabled_ && !FLAGS_disable_optimization_passes &&
-      !fns_to_jit_compile_.empty()) {
-    OptimizeModule();
-  }
+  if (fns_to_jit_compile_.empty()) return Status::OK();

-  SCOPED_TIMER(compile_timer_);
-  // JIT compile all codegen'd functions
-  for (int i = 0; i < fns_to_jit_compile_.size(); ++i) {
-    *fns_to_jit_compile_[i].second = JitFunction(fns_to_jit_compile_[i].first);
-  }
+  if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) OptimizeModule();

  if (FLAGS_opt_module_dir.size() != 0) {
    string path = FLAGS_opt_module_dir + "/" + id_ + "_opt.ll";
@@ -692,6 +662,19 @@ Status LlvmCodeGen::FinalizeModule() {
    }
  }

+  {
+    SCOPED_TIMER(compile_timer_);
+    // Finalize module, which compiles all functions.
+    execution_engine_->finalizeObject();
+  }
+
+  // Get pointers to all codegen'd functions.
+  for (int i = 0; i < fns_to_jit_compile_.size(); ++i) {
+    Function* function = fns_to_jit_compile_[i].first;
+    void* jitted_function = execution_engine_->getPointerToFunction(function);
+    DCHECK(jitted_function != NULL) << "Failed to jit " << function->getName().data();
+    *fns_to_jit_compile_[i].second = jitted_function;
+  }
  return Status::OK();
 }

@@ -703,19 +686,19 @@ void LlvmCodeGen::OptimizeModule() {
  // TODO: we can likely muck with this to get better compile speeds or write
  // our own passes.  Our subexpression elimination optimization can be rolled into
  // a pass.
-  PassManagerBuilder pass_builder ;
+  PassManagerBuilder pass_builder;
  // 2 maps to -O2
  // TODO: should we switch to 3? (3 may not produce different IR than 2 while taking
  // longer, but we should check)
  pass_builder.OptLevel = 2;
  // Don't optimize for code size (this corresponds to -O2/-O3)
  pass_builder.SizeLevel = 0;
-  pass_builder.Inliner = createFunctionInliningPass() ;
+  pass_builder.Inliner = createFunctionInliningPass();

-  // Specifying the data layout is necessary for some optimizations (e.g. removing many
-  // of the loads/stores produced by structs).
-  const string& data_layout_str = module_->getDataLayout();
-  DCHECK(!data_layout_str.empty());
+  // The TargetIRAnalysis pass is required to provide information about the target
+  // machine to optimisation passes, e.g. the cost model.
+  TargetIRAnalysis target_analysis =
+      execution_engine_->getTargetMachine()->getTargetIRAnalysis();

  // Before running any other optimization passes, run the internalize pass, giving it
  // the names of all functions registered by AddFunctionToJit(), followed by the
@@ -726,8 +709,8 @@ void LlvmCodeGen::OptimizeModule() {
  for (int i = 0; i < fns_to_jit_compile_.size(); ++i) {
    exported_fn_names.push_back(fns_to_jit_compile_[i].first->getName().data());
  }
-  scoped_ptr<PassManager> module_pass_manager(new PassManager());
-  module_pass_manager->add(new DataLayout(data_layout_str));
+  unique_ptr<legacy::PassManager> module_pass_manager(new legacy::PassManager());
+  module_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
  module_pass_manager->add(createInternalizePass(exported_fn_names));
  module_pass_manager->add(createGlobalDCEPass());
  module_pass_manager->run(*module_);
@@ -740,8 +723,9 @@ void LlvmCodeGen::OptimizeModule() {
  COUNTER_SET(num_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS));

  // Create and run function pass manager
-  scoped_ptr<FunctionPassManager> fn_pass_manager(new FunctionPassManager(module_));
-  fn_pass_manager->add(new DataLayout(data_layout_str));
+  unique_ptr<legacy::FunctionPassManager> fn_pass_manager(
+      new legacy::FunctionPassManager(module_));
+  fn_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
  pass_builder.populateFunctionPassManager(*fn_pass_manager);
  fn_pass_manager->doInitialization();
  for (Module::iterator it = module_->begin(), end = module_->end(); it != end ; ++it) {
@@ -750,8 +734,8 @@ void LlvmCodeGen::OptimizeModule() {
  fn_pass_manager->doFinalization();

  // Create and run module pass manager
-  module_pass_manager.reset(new PassManager());
-  module_pass_manager->add(new DataLayout(data_layout_str));
+  module_pass_manager.reset(new legacy::PassManager());
+  module_pass_manager->add(createTargetTransformInfoWrapperPass(target_analysis));
  pass_builder.populateModulePassManager(*module_pass_manager);
  module_pass_manager->run(*module_);
  if (FLAGS_print_llvm_ir_instruction_count) {
@@ -789,25 +773,19 @@ void LlvmCodeGen::AddFunctionToJit(Function* fn, void** fn_ptr) {
    fn_wrapper->addAttribute(1, Attribute::StructRet);
    // Call 'fn' and store the result in the result argument
    Value* result =
-        builder.CreateCall(fn, ArrayRef<Value*>(&args[1], fn->arg_size()), "result");
+        builder.CreateCall(fn, ArrayRef<Value*>({&args[1], fn->arg_size()}), "result");
    builder.CreateStore(result, args[0]);
    builder.CreateRetVoid();
    fn = FinalizeFunction(fn_wrapper);
    DCHECK(fn != NULL);
  }
-  fns_to_jit_compile_.push_back(make_pair(fn, fn_ptr));
+
+  AddFunctionToJitInternal(fn, fn_ptr);
 }

-void* LlvmCodeGen::JitFunction(Function* function) {
-  if (is_corrupt_) return NULL;
-
-  // TODO: log a warning if the jitted function is too big (larger than I cache)
-  void* jitted_function = execution_engine_->getPointerToFunction(function);
-  boost::lock_guard<mutex> l(jitted_functions_lock_);
-  if (jitted_function != NULL) {
-    jitted_functions_.insert(function);
-  }
-  return jitted_function;
+void LlvmCodeGen::AddFunctionToJitInternal(Function* fn, void** fn_ptr) {
+  DCHECK(!is_compiled_);
+  fns_to_jit_compile_.push_back(make_pair(fn, fn_ptr));
 }

 void LlvmCodeGen::CodegenDebugTrace(LlvmBuilder* builder, const char* str,
@@ -832,18 +810,14 @@ void LlvmCodeGen::CodegenDebugTrace(LlvmBuilder* builder, const char* str,
 }

 void LlvmCodeGen::GetFunctions(vector<Function*>* functions) {
-  Module::iterator fn_iter = module_->begin();
-  while (fn_iter != module_->end()) {
-    Function* fn = fn_iter++;
-    if (!fn->empty()) functions->push_back(fn);
+  for (Function& fn: module_->functions()) {
+    if (!fn.empty()) functions->push_back(&fn);
  }
 }

 void LlvmCodeGen::GetSymbols(unordered_set<string>* symbols) {
-  Module::iterator fn_iter = module_->begin();
-  while (fn_iter != module_->end()) {
-    Function* fn = fn_iter++;
-    if (!fn->empty()) symbols->insert(fn->getName());
+  for (const Function& fn: module_->functions()) {
+    if (!fn.empty()) symbols->insert(fn.getName());
  }
 }

@@ -1065,7 +1039,7 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
      while (num_bytes >= 8) {
        Value* index[] = { GetIntConstant(TYPE_INT, i++) };
        Value* d = builder.CreateLoad(builder.CreateGEP(ptr, index));
-        result_64 = builder.CreateCall2(crc64_fn, result_64, d);
+        result_64 = builder.CreateCall(crc64_fn, ArrayRef<Value*>({result_64, d}));
        num_bytes -= 8;
      }
      result = builder.CreateTrunc(result_64, GetType(TYPE_INT));
@@ -1078,7 +1052,7 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
      DCHECK_LT(num_bytes, 8);
      Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_INT));
      Value* d = builder.CreateLoad(ptr);
-      result = builder.CreateCall2(crc32_fn, result, d);
+      result = builder.CreateCall(crc32_fn, ArrayRef<Value*>({result, d}));
      Value* index[] = { GetIntConstant(TYPE_INT, 4) };
      data = builder.CreateGEP(data, index);
      num_bytes -= 4;
@@ -1088,7 +1062,7 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
      DCHECK_LT(num_bytes, 4);
      Value* ptr = builder.CreateBitCast(data, GetPtrType(TYPE_SMALLINT));
      Value* d = builder.CreateLoad(ptr);
-      result = builder.CreateCall2(crc16_fn, result, d);
+      result = builder.CreateCall(crc16_fn, ArrayRef<Value*>({result, d}));
      Value* index[] = { GetIntConstant(TYPE_INT, 2) };
      data = builder.CreateGEP(data, index);
      num_bytes -= 2;
@@ -1097,7 +1071,7 @@ Function* LlvmCodeGen::GetHashFunction(int num_bytes) {
    if (num_bytes > 0) {
      DCHECK_EQ(num_bytes, 1);
      Value* d = builder.CreateLoad(data);
-      result = builder.CreateCall2(crc8_fn, result, d);
+      result = builder.CreateCall(crc8_fn, ArrayRef<Value*>({result, d}));
      --num_bytes;
    }
    DCHECK_EQ(num_bytes, 0);
@@ -1149,7 +1123,7 @@ Argument* LlvmCodeGen::GetArgument(Function* fn, int i) {
  DCHECK_LE(i, fn->arg_size());
  Function::arg_iterator iter = fn->arg_begin();
  for (int j = 0; j < i; ++j) ++iter;
-  return iter;
+  return &*iter;
 }

 Value* LlvmCodeGen::GetPtrTo(LlvmBuilder* builder, Value* v, const char* name) {
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -19,13 +19,13 @@
 #include "common/status.h"

 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 #include <boost/scoped_ptr.hpp>
 #include <boost/thread/mutex.hpp>
 #include <boost/unordered_set.hpp>

-#include <llvm/Analysis/Verifier.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/IRBuilder.h>
 #include <llvm/IR/Intrinsics.h>
@@ -46,16 +46,18 @@ namespace llvm {
  class ConstantFolder;
  class ExecutionEngine;
  class Function;
-  class FunctionPassManager;
  class LLVMContext;
  class Module;
  class NoFolder;
-  class PassManager;
  class PointerType;
  class StructType;
  class TargetData;
  class Type;
  class Value;
+  namespace legacy {
+    class FunctionPassManager;
+    class PassManager;
+  }

  template<bool B, typename T, typename I>
  class IRBuilder;
@@ -88,14 +90,13 @@ class SubExprElimination;
 /// be called directly.  The test interface can be used to load any precompiled
 /// module or none at all (but this class will not validate the module).
 //
-/// This class is mostly not threadsafe.  During the Prepare() phase of the fragment
-/// execution, nodes should codegen functions, and register those functions with
-/// AddFunctionToJit().
+/// This class is not threadsafe.  During the Prepare() phase of the fragment execution,
+/// nodes should codegen functions, and register those functions with AddFunctionToJit().
 /// Afterward, FinalizeModule() should be called at which point all codegened functions
-/// are optimized. After FinalizeModule() returns, all function pointers registered with
-/// AddFunctionToJit() will be pointing to the appropriate JIT'd function.
+/// are optimized and compiled. After FinalizeModule() returns, all function pointers
+/// registered with AddFunctionToJit() will be pointing to the appropriate JIT'd function.
 //
-/// Currently, each query will create and initialize one of these
+/// Currently, each fragment instance will create and initialize one of these
 /// objects.  This requires loading and parsing the cross compiled modules.
 /// TODO: we should be able to do this once per process and let llvm compile
 /// functions from across modules.
@@ -108,8 +109,8 @@ class SubExprElimination;
 class LlvmCodeGen {
 public:
  /// This function must be called once per process before any llvm API calls are
-  /// made.  LLVM needs to allocate data structures for multi-threading support and
-  /// to enable dynamic linking of jitted code.
+  /// made.  It is not valid to call it multiple times. LLVM needs to allocate data
+  /// structures for multi-threading support and to enable dynamic linking of jitted code.
  /// if 'load_backend', load the backend static object for llvm.  This is needed
  /// when libbackend.so is loaded from java.  llvm will be default only look in
  /// the current object and not be able to find the backend symbols
@@ -117,22 +118,15 @@ class LlvmCodeGen {
  /// side is not loading the be explicitly anymore.
  static void InitializeLlvm(bool load_backend = false);

-  /// Loads and parses the precompiled impala IR module
+  /// Creates a codegen instance for Impala initialized with the cross-compiled Impala IR.
  /// 'codegen' will contain the created object on success.
  /// 'id' is used for outputting the IR module for debugging.
-  static Status LoadImpalaIR(
+  static Status CreateImpalaCodegen(
      ObjectPool*, const std::string& id, boost::scoped_ptr<LlvmCodeGen>* codegen);

-  /// Load a pre-compiled IR module from 'file'.  This creates a top level
-  /// codegen object.
-  /// codegen will contain the created object on success.
-  static Status LoadFromFile(ObjectPool*, const std::string& file, const std::string& id,
-      boost::scoped_ptr<LlvmCodeGen>* codegen);
-
-  /// Load a pre-compiled IR module from module_ir.  This creates a top level codegen
-  /// object.  codegen will contain the created object on success.
-  static Status LoadFromMemory(ObjectPool*, llvm::MemoryBuffer* module_ir,
-      const std::string& module_name, const std::string& id,
+  /// Creates a LlvmCodeGen instance initialized with the module bitcode from 'file'.
+  /// 'codegen' will contain the created object on success.
+  static Status CreateFromFile(ObjectPool*, const std::string& file, const std::string& id,
      boost::scoped_ptr<LlvmCodeGen>* codegen);

  /// Removes all jit compiled dynamically linked functions from the process.
@@ -200,6 +194,9 @@ class LlvmCodeGen {
    std::vector<NamedVariable> args_;
  };

+  /// Get host cpu attributes in format expected by EngineBuilder.
+  static void GetHostCPUAttrs(std::vector<std::string>* attrs);
+
  /// Return a pointer type to 'type'
  llvm::PointerType* GetPtrType(llvm::Type* type);

@@ -278,48 +275,18 @@ class LlvmCodeGen {
  /// otherwise, it returns the function object.
  llvm::Function* FinalizeFunction(llvm::Function* function);

-  /// Inlines all function calls for 'fn' that are marked as always inline.
-  /// (We can't inline all call sites since pulling in boost/other libs could have
-  /// recursion.  Instead, we just inline our functions and rely on the llvm inliner to
-  /// pick the rest.)
-  /// 'fn' is modified in place. Returns the number of functions inlined.  This is *not*
-  /// called recursively (i.e. second level function calls are not inlined). This can be
-  /// called again to inline those until this returns 0.
-  int InlineCallSites(llvm::Function* fn, bool skip_registered_fns);
-
-  /// Optimizes the function in place.  This uses a combination of llvm optimization
-  /// passes as well as some custom heuristics.  This should be called for all
-  /// functions which call Exprs.  The exprs will be inlined as much as possible,
-  /// and will do basic sub expression elimination.
-  /// This should be called before FinalizeModule for functions that want to remove
-  /// redundant exprs.  This should be called at the highest level possible to
-  /// maximize the number of redundant exprs that can be found.
-  /// TODO: we need to spend more time to output better IR.  Asking llvm to
-  /// remove redundant codeblocks on its own is too difficult for it.
-  /// TODO: this should implement the llvm FunctionPass interface and integrated
-  /// with the llvm optimization passes.
-  llvm::Function* OptimizeFunctionWithExprs(llvm::Function* fn);
-
-  /// Adds the function to be automatically jit compiled after the module is optimized.
-  /// That is, after FinalizeModule(), this will do *result_fn_ptr = JitFunction(fn);
-  //
-  /// This is useful since it is not valid to call JitFunction() before every part of the
-  /// query has finished adding their IR and it's convenient to not have to rewalk the
-  /// objects. This provides the same behavior as walking each of those objects and calling
-  /// JitFunction().
-  //
-  /// In addition, any functions not registered with AddFunctionToJit() are marked as
-  /// internal in FinalizeModule() and may be removed as part of optimization.
-  //
+  /// Adds the function to be automatically jit compiled when the codegen object is
+  /// finalized. FinalizeModule() will set fn_ptr to point to the jitted function.
+  ///
+  /// Only functions registered with AddFunctionToJit() and their dependencies are
+  /// compiled by FinalizeModule(): other functions are considered dead code and will
+  /// be removed during optimization.
+  ///
  /// This will also wrap functions returning DecimalVals in an ABI-compliant wrapper (see
  /// the comment in the .cc file for details). This is so we don't accidentally try to
  /// call non-compliant code from native code.
  void AddFunctionToJit(llvm::Function* fn, void** fn_ptr);

-  /// Verfies the function, e.g., checks that the IR is well-formed.  Returns false if
-  /// function is invalid.
-  bool VerifyFunction(llvm::Function* function);
-
  /// This will generate a printf call instruction to output 'message' at the builder's
  /// insert point. If 'v1' is non-NULL, it will also be passed to the printf call. Only
  /// for debugging.
@@ -421,54 +388,56 @@ class LlvmCodeGen {
  /// No-op if size is zero.
  void CodegenMemcpy(LlvmBuilder*, llvm::Value* dst, llvm::Value* src, int size);

-  /// Loads an LLVM module. 'file' should be the local path to the LLVM bitcode
-  /// file. The caller is responsible for cleaning up module.
-  static Status LoadModuleFromFile(LlvmCodeGen* codegen, const string& file,
-      llvm::Module** module);
-
  /// Codegens IR to load array[idx] and returns the loaded value. 'array' should be a
  /// C-style array (e.g. i32*) or an IR array (e.g. [10 x i32]). This function does not
  /// do bounds checking.
  llvm::Value* CodegenArrayAt(LlvmBuilder*, llvm::Value* array, int idx,
      const char* name = "");

-  /// Loads an LLVM module. 'module_ir' should be a reference to a memory buffer containing
-  /// LLVM bitcode. module_name is the name of the module to use when reporting errors.
-  /// The caller is responsible for cleaning up module.
-  static Status LoadModuleFromMemory(LlvmCodeGen* codegen, llvm::MemoryBuffer* module_ir,
-      std::string module_name, llvm::Module** module);
-
  /// Loads a module at 'file' and links it to the module associated with
  /// this LlvmCodeGen object. The module must be on the local filesystem.
  Status LinkModule(const std::string& file);

-  // Used for testing.
-  void ResetVerification() { is_corrupt_ = false; }
-
 private:
+  friend class ExprCodegenTest;
  friend class LlvmCodeGenTest;
  friend class SubExprElimination;

  /// Top level codegen object.  'module_id' is used for debugging when outputting the IR.
  LlvmCodeGen(ObjectPool* pool, const std::string& module_id);

-  /// Initializes the jitter and execution engine.
-  Status Init();
+  /// Initializes the jitter and execution engine with the given module.
+  Status Init(std::unique_ptr<llvm::Module> module);
+
+  /// Creates a LlvmCodeGen instance initialized with the module bitcode from 'module_ir'.
+  /// 'codegen' will contain the created object on success.
+  static Status CreateFromMemory(ObjectPool* pool, llvm::MemoryBufferRef module_ir,
+      const std::string& module_name, const std::string& id,
+      boost::scoped_ptr<LlvmCodeGen>* codegen);
+
+  /// Loads an LLVM module. 'file' should be the local path to the LLVM bitcode
+  /// file. On success, ownership of the loaded module is transferred to 'module'.
+  Status LoadModuleFromFile(const string& file, std::unique_ptr<llvm::Module>* module);
+
+  /// Loads an LLVM module. 'module_ir' should be a reference to a memory buffer containing
+  /// LLVM bitcode. module_name is the name of the module to use when reporting errors.
+  /// On success, ownership of the loaded module is transferred to 'module'.
+  Status LoadModuleFromMemory(llvm::MemoryBufferRef module_ir, std::string module_name,
+      std::unique_ptr<llvm::Module>* module);

  /// Load the intrinsics impala needs.  This is a one time initialization.
  /// Values are stored in 'llvm_intrinsics_'
  Status LoadIntrinsics();

-  /// Get the function pointer to the JIT'd version of function.
-  /// The result is a function pointer that is dynamically linked into the process.
-  /// Returns NULL if the function is invalid.
-  /// Note that this will compile, but not optimize, function if necessary.
-  //
-  /// This function shouldn't be called after calling FinalizeModule(). Instead use
-  /// AddFunctionToJit() to register a function pointer. This is because FinalizeModule()
-  /// may remove any functions not registered in AddFunctionToJit(). As such, this
-  /// function is mostly useful for tests that do not call FinalizeModule() at all.
-  void* JitFunction(llvm::Function* function);
+  /// Internal function for unit tests: skips Impala-specific wrapper generation logic.
+  void AddFunctionToJitInternal(llvm::Function* fn, void** fn_ptr);
+
+  /// Verifies the function, e.g., checks that the IR is well-formed.  Returns false if
+  /// function is invalid.
+  bool VerifyFunction(llvm::Function* function);
+
+  // Used for testing.
+  void ResetVerification() { is_corrupt_ = false; }

  /// Optimizes the module. This includes pruning the module of any unused functions.
  void OptimizeModule();
@@ -476,6 +445,13 @@ class LlvmCodeGen {
  /// Clears generated hash fns.  This is only used for testing.
  void ClearHashFns();

+  /// Whether InitializeLlvm() has been called.
+  static bool llvm_initialized_;
+
+  /// Host CPU name and attributes, filled in by InitializeLlvm().
+  static std::string cpu_name_;
+  static std::vector<std::string> cpu_attrs_;
+
  /// ID used for debugging (can be e.g. the fragment instance ID)
  std::string id_;

@@ -523,21 +499,14 @@ class LlvmCodeGen {

  /// Top level llvm object.  Objects from different contexts do not share anything.
  /// We can have multiple instances of the LlvmCodeGen object in different threads
-  boost::scoped_ptr<llvm::LLVMContext> context_;
+  std::unique_ptr<llvm::LLVMContext> context_;

  /// Top level codegen object.  Contains everything to jit one 'unit' of code.
-  /// Owned by the execution_engine_.
+  /// module_ is set by Init(). module_ is owned by execution_engine_.
  llvm::Module* module_;

  /// Execution/Jitting engine.
-  boost::scoped_ptr<llvm::ExecutionEngine> execution_engine_;
-
-  /// Keeps track of all the functions that have been jit compiled and linked into
-  /// the process. Special care needs to be taken if we need to modify these functions.
-  std::set<llvm::Function*> jitted_functions_;
-
-  /// Lock protecting jitted_functions_
-  boost::mutex jitted_functions_lock_;
+  std::unique_ptr<llvm::ExecutionEngine> execution_engine_;

  /// Keeps track of the external functions that have been included in this module
  /// e.g libc functions or non-jitted impala functions.
--- a/be/src/codegen/mcjit-mem-mgr.h
+++ b/be/src/codegen/mcjit-mem-mgr.h
@@ -0,0 +1,40 @@
+// Copyright 2016 Cloudera Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#ifndef IMPALA_CODEGEN_MCJIT_MEM_MGR_H
+#define IMPALA_CODEGEN_MCJIT_MEM_MGR_H
+
+#include <llvm/ExecutionEngine/SectionMemoryManager.h>
+
+extern void *__dso_handle __attribute__ ((__visibility__ ("hidden")));
+
+namespace impala {
+
+/// Custom memory manager to resolve references to __dso_handle in cross-compiled IR,
+/// the same way the legacy LLVM JIT did. MCJIT doesn't do it for us: LLVM issue 18062.
+/// TODO: get rid of this by purging the cross-compiled IR of references to __dso_handle,
+/// which come from global variables with destructors.
+class ImpalaMCJITMemoryManager : public llvm::SectionMemoryManager {
+ public:
+  /// Returns &__dso_handle for "__dso_handle"; defers all other names to the base class.
+  uint64_t getSymbolAddress(const std::string& name) override {
+    if (name == "__dso_handle") return reinterpret_cast<uint64_t>(&__dso_handle);
+    return SectionMemoryManager::getSymbolAddress(name);
+  }
+};
+
+}
+
+#endif
--- a/be/src/codegen/subexpr-elimination.cc
+++ b/be/src/codegen/subexpr-elimination.cc
@@ -1,208 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "codegen/subexpr-elimination.h"
-
-#include <fstream>
-#include <iostream>
-#include <sstream>
-
-#include <llvm/Analysis/Dominators.h>
-#include <llvm/Analysis/InstructionSimplify.h>
-#include <llvm/Analysis/Passes.h>
-#include <llvm/Support/InstIterator.h>
-#include "llvm/Transforms/IPO.h"
-#include <llvm/Transforms/Scalar.h>
-#include <llvm/Transforms/Utils/SSAUpdater.h>
-#include <llvm/Transforms/Utils/BasicBlockUtils.h>
-
-#include "common/logging.h"
-#include "codegen/subexpr-elimination.h"
-#include "impala-ir/impala-ir-names.h"
-#include "util/cpu-info.h"
-#include "util/path-builder.h"
-
-#include "common/names.h"
-
-using namespace impala;
-using namespace llvm;
-
-SubExprElimination::SubExprElimination(LlvmCodeGen* codegen) :
-    codegen_(codegen) {
-}
-
-// Before running the standard llvm optimization passes, first remove redundant calls
-// to slotref expression.  SlotRefs are more heavyweight due to the null handling that
-// is required and after they are inlined, llvm is unable to eliminate the redundant
-// inlined code blocks.
-// For example:
-//   select colA + colA would generate an inner loop with 2 calls to the colA slot ref,
-// rather than doing subexpression elimination.  To handle this, we will:
-//   1. inline all call sites in the original function except calls to SlotRefs
-//   2. for all call sites to SlotRefs except the first to that SlotRef, replace the
-//      results from the secondary calls with the result from the first and remove
-//      the call instruction.
-//   3. Inline calls to the SlotRefs (there should only be one for each slot ref).
-//
-// In the above example, the input function would look something like:
-// int ArithmeticAdd(TupleRow* row, bool* is_null) {
-//   bool lhs_is_null, rhs_is_null;
-//   int lhs_value = SlotRef(row, &lhs_is_null);
-//   if (lhs_is_null) { *is_null = true; return 0; }
-//   int rhs_value = SlotRef(row, &rhs_is_null);
-//   if (rhs_is_null) { *is_null = true; return 0; }
-//   *is_null = false; return lhs_value + rhs_value;
-// }
-// During step 2, we'd substitute the second call to SlotRef with the results from
-// the first call.
-// int ArithmeticAdd(TupleRow* row, bool* is_null) {
-//   bool lhs_is_null, rhs_is_null;
-//   int lhs_value = SlotRef(row, &lhs_is_null);
-//   if (lhs_is_null) { *is_null = true; return 0; }
-//   int rhs_value = lhs_value;
-//   rhs_is_null = lhs_is_null;
-//   if (rhs_is_null) { *is_null = true; return 0; }
-//   *is_null = false; return lhs_value + rhs_value;
-// }
-// And then rely on llvm to finish the removing the redundant code, resulting in:
-// int ArithmeticAdd(TupleRow* row, bool* is_null) {
-//   bool lhs_is_null, rhs_is_null;
-//   int lhs_value = SlotRef(row, &lhs_is_null);
-//   if (lhs_is_null) { *is_null = true; return 0; }
-//   *is_null = false; return lhs_value + lhs_value;
-// }
-// Details on how to do this:
-// http://llvm.org/docs/ProgrammersManual.html#replacing-an-instruction-with-another-value
-
-// Step 2 requires more manipulation to ensure the resulting IR is still valid IR.
-// The call to the expr returns two things, both of which need to be replaced.
-// The value of the function as the return argument and whether or not the result was
-// null as a function output argument.
-//    1. The return value is trivial since with SSA, it is easy to identity all uses of
-//       We simply replace the subsequent call instructions with the value.
-//    2. For the is_null result ptr, we replace the call to the expr with a store
-//       instruction of the cached value.
-//       i.e:
-//           val1 = Call(is_null_ptr);
-//           is_null1 = *is_null_ptr
-//           ...
-//           val2 = Call(is_null_ptr);
-//           is_null2 = *is_null_ptr
-//       Becomes:
-//           val1 = Call(is_null_ptr);
-//           is_null1 = *is_null_ptr
-//           ...
-//           val2 = val1;
-//           *is_null_ptr = is_null1;
-//           is_null2 = *is_null_ptr
-//       We do this because the is_null ptr is not SSA form, making manipulating it
-//       complex. The above approach exactly preserves the Call function, including
-//       all writes to ptrs. We then rely on the llvm load/store removal pass which
-//       will remove the redundant loads (which is tricky since you have to track
-//       other instructions that wrote to the ptr, etc).
-// When doing the eliminations, we need to consider the call graph to make sure
-// the instruction we are replacing with dominates the instruction we are replacing;
-// that is, we need to guarantee the instruction we are replacing with always executes
-// before the replacee instruction in all code paths.
-// TODO: remove all this with expr refactoring. Everything will be SSA form then.
-struct CachedExprResult {
-  // First function call result. Subsequent calls will be replaced with this value
-  CallInst* result;
-  // First is null result. Subsequent calls will be replaced with this value.
-  Instruction* is_null_value;
-};
-
-bool SubExprElimination::Run(Function* fn) {
-  // Step 1:
-  int num_inlined;
-  do {
-    // This assumes that all redundant exprs have been registered.
-    num_inlined = codegen_->InlineCallSites(fn, true);
-  } while (num_inlined > 0);
-
-  // Mapping of (expr eval function, its 'row' arg) to cached result.  We want to remove
-  // redundant calls to the same function with the same argument.
-  map<pair<Function*, Value*>, CachedExprResult> cached_slot_ref_results;
-
-  // Step 2:
-  DominatorTree dom_tree;
-  dom_tree.runOnFunction(*fn);
-
-  inst_iterator fn_end = inst_end(fn);
-  inst_iterator instr_iter = inst_begin(fn);
-  // Loop over every instruction in the function.
-  while (instr_iter != fn_end) {
-    Instruction* instr = &*instr_iter;
-    ++instr_iter;
-    // Look for call instructions
-    if (!CallInst::classof(instr)) continue;
-
-    CallInst* call_instr = reinterpret_cast<CallInst*>(instr);
-    Function* called_fn = call_instr->getCalledFunction();
-    if (codegen_->registered_exprs_.find(called_fn) ==
-        codegen_->registered_exprs_.end()) {
-      continue;
-    }
-
-    // Found a registered expr function.  We generate the IR in a very specific way
-    // when calling the expr.  The call instruction is always followed by loading the
-    // resulting is_null result.  We need to update both.
-    // TODO: we need to update this to do more analysis since we are relying on a very
-    // specific code structure to do this.
-
-    // Arguments are (row, scratch_buffer, is_null);
-    DCHECK_EQ(call_instr->getNumArgOperands(), 3);
-    Value* row_arg = call_instr->getArgOperand(0);
-
-    DCHECK(BitCastInst::classof(row_arg));
-    BitCastInst* row_cast = reinterpret_cast<BitCastInst*>(row_arg);
-    // Get at the underlying row arg.  We need to differentiate between
-    // call Fn(row1) and call Fn(row2). (identical fns but different input).
-    row_arg = row_cast->getOperand(0);
-
-    instr = &*instr_iter;
-    ++instr_iter;
-
-    if (!LoadInst::classof(instr)) continue;
-    LoadInst* is_null_value = reinterpret_cast<LoadInst*>(instr);
-    Value* loaded_ptr = is_null_value->getPointerOperand();
-
-    // Subexpr elimination requires the IR to be a very specific form.
-    //   call SlotRef(row, NULL, is_null_ptr)
-    //   load is_null_ptr
-    // Since we generate this IR currently, we can enforce this logic in our exprs
-    // TODO: this should be removed/generalized with expr refactoring
-    DCHECK_EQ(loaded_ptr, call_instr->getArgOperand(2));
-
-    pair<Function*, Value*> call_desc = make_pair(called_fn, row_arg);
-    if (cached_slot_ref_results.find(call_desc) == cached_slot_ref_results.end()) {
-      CachedExprResult cache_entry;
-      cache_entry.result = call_instr;
-      cache_entry.is_null_value = is_null_value;
-      cached_slot_ref_results[call_desc] = cache_entry;
-    } else {
-      // Reuse the result.
-      CachedExprResult& cache_entry = cached_slot_ref_results[call_desc];
-      if (dom_tree.dominates(cache_entry.result, call_instr)) {
-        new StoreInst(cache_entry.is_null_value, loaded_ptr, call_instr);
-        call_instr->replaceAllUsesWith(cache_entry.result);
-        call_instr->eraseFromParent();
-      }
-    }
-  }
-
-  // Step 3:
-  codegen_->InlineCallSites(fn, false);
-  return true;
-}
--- a/be/src/codegen/subexpr-elimination.h
+++ b/be/src/codegen/subexpr-elimination.h
@@ -1,41 +0,0 @@
-// Copyright 2012 Cloudera Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-
-#ifndef IMPALA_CODEGEN_SUBEXPR_ELIMINATION_H
-#define IMPALA_CODEGEN_SUBEXPR_ELIMINATION_H
-
-#include "common/status.h"
-#include "codegen/llvm-codegen.h"
-
-namespace impala {
-
-/// Optimization pass to remove redundant exprs.
-/// TODO: make this into a llvm function pass (i.e. implement FunctionPass interface).
-class SubExprElimination {
- public:
-  SubExprElimination(LlvmCodeGen* codegen);
-
-  /// Perform subexpr elimination on function.
-  bool Run(llvm::Function* function);
-
- private:
-  /// Parent codegen object.
-  LlvmCodeGen* codegen_;
-};
-
-}
-
-#endif
-
--- a/be/src/exec/aggregation-node.cc
+++ b/be/src/exec/aggregation-node.cc
@@ -557,14 +557,14 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(

  // Src slot is not null, update dst_slot
  builder.SetInsertPoint(src_not_null_block);
-  Value* dst_ptr =
-      builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr");
+  Value* dst_ptr = builder.CreateStructGEP(NULL, agg_tuple_arg,
+      slot_desc->llvm_field_idx(), "dst_slot_ptr");
  Value* result = NULL;

  if (slot_desc->is_nullable()) {
    // Dst is NULL, just update dst slot to src slot and clear null bit
    Function* clear_null_fn = slot_desc->GetUpdateNullFn(codegen, false);
-    builder.CreateCall(clear_null_fn, agg_tuple_arg);
+    builder.CreateCall(clear_null_fn, ArrayRef<Value*>({agg_tuple_arg}));
  }

  // Update the slot
@@ -629,7 +629,8 @@ llvm::Function* AggregationNode::CodegenUpdateSlot(
          builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr");

      // Call 'hll_fn'
-      builder.CreateCall3(hll_fn, fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr);
+      builder.CreateCall(hll_fn,
+          ArrayRef<Value*>({fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr}));

      // Convert StringVal intermediate 'dst_arg' back to StringValue
      Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result");
@@ -748,9 +749,9 @@ Function* AggregationNode::CodegenUpdateTuple(RuntimeState* state) {
    if (evaluator->is_count_star()) {
      // TODO: we should be able to hoist this up to the loop over the batch and just
      // increment the slot by the number of rows in the batch.
-      int field_idx = slot_desc->field_idx();
+      int field_idx = slot_desc->llvm_field_idx();
      Value* const_one = codegen->GetIntConstant(TYPE_BIGINT, 1);
-      Value* slot_ptr = builder.CreateStructGEP(args[1], field_idx, "src_slot");
+      Value* slot_ptr = builder.CreateStructGEP(NULL, args[1], field_idx, "src_slot");
      Value* slot_loaded = builder.CreateLoad(slot_ptr, "count_star_val");
      Value* count_inc = builder.CreateAdd(slot_loaded, const_one, "count_star_inc");
      builder.CreateStore(count_inc, slot_ptr);
@@ -760,7 +761,8 @@ Function* AggregationNode::CodegenUpdateTuple(RuntimeState* state) {
      Value* fn_ctx_arg = codegen->CastPtrToLlvmPtr(
          codegen->GetPtrType(FunctionContextImpl::LLVM_FUNCTIONCONTEXT_NAME),
          agg_fn_ctxs_[i]);
-      builder.CreateCall3(update_slot_fn, fn_ctx_arg, args[1], args[2]);
+      builder.CreateCall(update_slot_fn,
+          ArrayRef<Value*>({fn_ctx_arg, args[1], args[2]}));
    }
  }
  builder.CreateRetVoid();
@@ -826,7 +828,7 @@ Function* AggregationNode::CodegenProcessRowBatch(
  replaced = codegen->ReplaceCallSites(process_batch_fn, update_tuple_fn, "UpdateTuple");
  DCHECK_EQ(replaced, 1);

-  return codegen->OptimizeFunctionWithExprs(process_batch_fn);
+  return codegen->FinalizeFunction(process_batch_fn);
 }

 }
--- a/be/src/exec/hash-join-node.cc
+++ b/be/src/exec/hash-join-node.cc
@@ -594,7 +594,7 @@ Function* HashJoinNode::CodegenProcessBuildBatch(RuntimeState* state,
  replaced = codegen->ReplaceCallSites(process_build_batch_fn, hash_fn, "HashCurrentRow");
  DCHECK_EQ(replaced, 1);

-  return codegen->OptimizeFunctionWithExprs(process_build_batch_fn);
+  return codegen->FinalizeFunction(process_build_batch_fn);
 }

 Function* HashJoinNode::CodegenProcessProbeBatch(RuntimeState* state, Function* hash_fn) {
@@ -653,5 +653,5 @@ Function* HashJoinNode::CodegenProcessProbeBatch(RuntimeState* state, Function*
  replaced = codegen->ReplaceCallSites(process_probe_batch_fn, equals_fn, "Equals");
  DCHECK_EQ(replaced, 2);

-  return codegen->OptimizeFunctionWithExprs(process_probe_batch_fn);
+  return codegen->FinalizeFunction(process_probe_batch_fn);
 }
--- a/be/src/exec/hash-table.cc
+++ b/be/src/exec/hash-table.cc
@@ -416,8 +416,8 @@ static void CodegenAssignNullValue(LlvmCodeGen* codegen,
  int64_t fvn_seed = HashUtil::FNV_SEED;

  if (type.type == TYPE_STRING || type.type == TYPE_VARCHAR) {
-    Value* dst_ptr = builder->CreateStructGEP(dst, 0, "string_ptr");
-    Value* dst_len = builder->CreateStructGEP(dst, 1, "string_len");
+    Value* dst_ptr = builder->CreateStructGEP(NULL, dst, 0, "string_ptr");
+    Value* dst_len = builder->CreateStructGEP(NULL, dst, 1, "string_len");
    Value* null_len = codegen->GetIntConstant(TYPE_INT, fvn_seed);
    Value* null_ptr = builder->CreateIntToPtr(null_len, codegen->ptr_type());
    builder->CreateStore(null_ptr, dst_ptr);
@@ -654,7 +654,8 @@ Status HashTableCtx::CodegenHashCurrentRow(RuntimeState* state, bool use_murmur,
  // Call GetHashSeed() to get seeds_[level_]
  Function* get_hash_seed_fn =
      codegen->GetFunction(IRFunction::HASH_TABLE_GET_HASH_SEED, false);
-  Value* seed = builder.CreateCall(get_hash_seed_fn, this_arg, "seed");
+  Value* seed = builder.CreateCall(get_hash_seed_fn, ArrayRef<Value*>({this_arg}),
+      "seed");

  Value* hash_result = seed;
  Value* data = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), expr_values_buffer_);
@@ -665,7 +666,8 @@ Status HashTableCtx::CodegenHashCurrentRow(RuntimeState* state, bool use_murmur,
                          codegen->GetMurmurHashFunction(results_buffer_size_) :
                          codegen->GetHashFunction(results_buffer_size_);
      Value* len = codegen->GetIntConstant(TYPE_INT, results_buffer_size_);
-      hash_result = builder.CreateCall3(hash_fn, data, len, hash_result, "hash");
+      hash_result = builder.CreateCall(hash_fn,
+          ArrayRef<Value*>({data, len, hash_result}), "hash");
    }
  } else {
    if (var_result_begin_ > 0) {
@@ -673,7 +675,8 @@ Status HashTableCtx::CodegenHashCurrentRow(RuntimeState* state, bool use_murmur,
                          codegen->GetMurmurHashFunction(var_result_begin_) :
                          codegen->GetHashFunction(var_result_begin_);
      Value* len = codegen->GetIntConstant(TYPE_INT, var_result_begin_);
-      hash_result = builder.CreateCall3(hash_fn, data, len, hash_result, "hash");
+      hash_result = builder.CreateCall(hash_fn,
+          ArrayRef<Value*>({data, len, hash_result}), "hash");
    }

    // Hash string slots
@@ -711,8 +714,8 @@ Status HashTableCtx::CodegenHashCurrentRow(RuntimeState* state, bool use_murmur,
                                 codegen->GetHashFunction(sizeof(StringValue));
        Value* llvm_loc = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), loc);
        Value* len = codegen->GetIntConstant(TYPE_INT, sizeof(StringValue));
-        str_null_result =
-            builder.CreateCall3(null_hash_fn, llvm_loc, len, hash_result, "str_null");
+        str_null_result = builder.CreateCall(null_hash_fn,
+            ArrayRef<Value*>({llvm_loc, len, hash_result}), "str_null");
        builder.CreateBr(continue_block);

        builder.SetInsertPoint(not_null_block);
@@ -721,16 +724,16 @@ Status HashTableCtx::CodegenHashCurrentRow(RuntimeState* state, bool use_murmur,
      // Convert expr_values_buffer_ loc to llvm value
      Value* str_val = codegen->CastPtrToLlvmPtr(codegen->GetPtrType(TYPE_STRING), loc);

-      Value* ptr = builder.CreateStructGEP(str_val, 0);
-      Value* len = builder.CreateStructGEP(str_val, 1);
+      Value* ptr = builder.CreateStructGEP(NULL, str_val, 0);
+      Value* len = builder.CreateStructGEP(NULL, str_val, 1);
      ptr = builder.CreateLoad(ptr, "ptr");
      len = builder.CreateLoad(len, "len");

      // Call hash(ptr, len, hash_result);
      Function* general_hash_fn = use_murmur ? codegen->GetMurmurHashFunction() :
                                  codegen->GetHashFunction();
-      Value* string_hash_result =
-          builder.CreateCall3(general_hash_fn, ptr, len, hash_result, "string_hash");
+      Value* string_hash_result = builder.CreateCall(general_hash_fn,
+          ArrayRef<Value*>({ptr, len, hash_result}), "string_hash");

      if (stores_nulls_) {
        builder.CreateBr(continue_block);
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -768,8 +768,8 @@ Status HdfsAvroScanner::CodegenReadRecord(
          codegen->GetFunction(IRFunction::READ_UNION_TYPE, false);
      Value* null_union_pos_val =
          codegen->GetIntConstant(TYPE_INT, field->null_union_position);
-      Value* is_not_null_val = builder->CreateCall3(
-          read_union_fn, this_val, null_union_pos_val, data_val, "is_not_null");
+      Value* is_not_null_val = builder->CreateCall(read_union_fn,
+          ArrayRef<Value*>({this_val, null_union_pos_val, data_val}), "is_not_null");
      builder->CreateCondBr(is_not_null_val, read_field_block, null_block);

      // Write null field IR
@@ -777,7 +777,7 @@ Status HdfsAvroScanner::CodegenReadRecord(
      if (slot_idx != HdfsScanNode::SKIP_COLUMN) {
        Function* set_null_fn = slot_desc->GetUpdateNullFn(codegen, true);
        DCHECK(set_null_fn != NULL);
-        builder->CreateCall(set_null_fn, tuple_val);
+        builder->CreateCall(set_null_fn, ArrayRef<Value*>({tuple_val}));
      }
      // LLVM requires all basic blocks to end with a terminating instruction
      builder->CreateBr(end_field_block);
@@ -858,8 +858,8 @@ Status HdfsAvroScanner::CodegenReadScalar(const AvroSchemaElement& element,
    } else {
      slot_type_val = builder->getInt32(slot_desc->type().type);
    }
-    Value* slot_val =
-        builder->CreateStructGEP(tuple_val, slot_desc->field_idx(), "slot");
+    Value* slot_val = builder->CreateStructGEP(NULL, tuple_val, slot_desc->llvm_field_idx(),
+        "slot");
    opaque_slot_val =
        builder->CreateBitCast(slot_val, codegen->ptr_type(), "opaque_slot");
  }
@@ -911,7 +911,7 @@ Function* HdfsAvroScanner::CodegenDecodeAvroData(RuntimeState* state,
  DCHECK_EQ(replaced, 1);

  decode_avro_data_fn->setName("DecodeAvroData");
-  decode_avro_data_fn = codegen->OptimizeFunctionWithExprs(decode_avro_data_fn);
+  decode_avro_data_fn = codegen->FinalizeFunction(decode_avro_data_fn);
  DCHECK(decode_avro_data_fn != NULL);
  return decode_avro_data_fn;
 }
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -426,7 +426,7 @@ Function* HdfsScanner::CodegenWriteCompleteTuple(
  if (node->num_materialized_partition_keys() == 0) {
    // No partition key slots, just zero the NULL bytes.
    for (int i = 0; i < tuple_desc->num_null_bytes(); ++i) {
-      Value* null_byte = builder.CreateStructGEP(tuple_arg, i, "null_byte");
+      Value* null_byte = builder.CreateStructGEP(NULL, tuple_arg, i, "null_byte");
      builder.CreateStore(codegen->GetIntConstant(TYPE_TINYINT, 0), null_byte);
    }
  } else {
@@ -484,7 +484,8 @@ Function* HdfsScanner::CodegenWriteCompleteTuple(

      // Call slot parse function
      Function* slot_fn = slot_fns[slot_idx];
-      Value* slot_parsed = builder.CreateCall3(slot_fn, tuple_arg, data, len);
+      Value* slot_parsed = builder.CreateCall(slot_fn,
+          ArrayRef<Value*>({tuple_arg, data, len}));
      Value* slot_error = builder.CreateNot(slot_parsed, "slot_parse_error");
      error_in_row = builder.CreateOr(error_in_row, slot_error, "error_in_row");
      slot_error = builder.CreateZExt(slot_error, codegen->GetType(TYPE_TINYINT));
@@ -515,8 +516,8 @@ Function* HdfsScanner::CodegenWriteCompleteTuple(

      Function* get_ctx_fn =
          codegen->GetFunction(IRFunction::HDFS_SCANNER_GET_CONJUNCT_CTX, false);
-      Value* ctx = builder.CreateCall2(
-          get_ctx_fn, this_arg, codegen->GetIntConstant(TYPE_INT, conjunct_idx));
+      Value* ctx = builder.CreateCall(get_ctx_fn,
+          ArrayRef<Value*>({this_arg, codegen->GetIntConstant(TYPE_INT, conjunct_idx)}));

      Value* conjunct_args[] = {ctx, tuple_row_arg};
      CodegenAnyVal result = CodegenAnyVal::CreateCallWrapped(
@@ -530,7 +531,7 @@ Function* HdfsScanner::CodegenWriteCompleteTuple(
  builder.SetInsertPoint(eval_fail_block);
  builder.CreateRet(codegen->false_value());

-  codegen->OptimizeFunctionWithExprs(fn);
+  // Finalize exactly once: the call below verifies 'fn' and its result is returned.
  return codegen->FinalizeFunction(fn);
 }

--- a/be/src/exec/old-hash-table.cc
+++ b/be/src/exec/old-hash-table.cc
@@ -173,8 +173,8 @@ static void CodegenAssignNullValue(LlvmCodeGen* codegen,
  int64_t fvn_seed = HashUtil::FNV_SEED;

  if (type.type == TYPE_STRING || type.type == TYPE_VARCHAR) {
-    Value* dst_ptr = builder->CreateStructGEP(dst, 0, "string_ptr");
-    Value* dst_len = builder->CreateStructGEP(dst, 1, "string_len");
+    Value* dst_ptr = builder->CreateStructGEP(NULL, dst, 0, "string_ptr");
+    Value* dst_len = builder->CreateStructGEP(NULL, dst, 1, "string_len");
    Value* null_len = codegen->GetIntConstant(TYPE_INT, fvn_seed);
    Value* null_ptr = builder->CreateIntToPtr(null_len, codegen->ptr_type());
    builder->CreateStore(null_ptr, dst_ptr);
@@ -426,13 +426,15 @@ Function* OldHashTable::CodegenHashCurrentRow(RuntimeState* state) {
    if (results_buffer_size_ > 0) {
      Function* hash_fn = codegen->GetHashFunction(results_buffer_size_);
      Value* len = codegen->GetIntConstant(TYPE_INT, results_buffer_size_);
-      hash_result = builder.CreateCall3(hash_fn, data, len, hash_result);
+      hash_result =
+          builder.CreateCall(hash_fn, ArrayRef<Value*>({data, len, hash_result}));
    }
  } else {
    if (var_result_begin_ > 0) {
      Function* hash_fn = codegen->GetHashFunction(var_result_begin_);
      Value* len = codegen->GetIntConstant(TYPE_INT, var_result_begin_);
-      hash_result = builder.CreateCall3(hash_fn, data, len, hash_result);
+      hash_result =
+          builder.CreateCall(hash_fn, ArrayRef<Value*>({data, len, hash_result}));
    }

    // Hash string slots
@@ -468,7 +470,8 @@ Function* OldHashTable::CodegenHashCurrentRow(RuntimeState* state) {
        Function* null_hash_fn = codegen->GetHashFunction(sizeof(StringValue));
        Value* llvm_loc = codegen->CastPtrToLlvmPtr(codegen->ptr_type(), loc);
        Value* len = codegen->GetIntConstant(TYPE_INT, sizeof(StringValue));
-        str_null_result = builder.CreateCall3(null_hash_fn, llvm_loc, len, hash_result);
+        str_null_result = builder.CreateCall(null_hash_fn,
+            ArrayRef<Value*>({llvm_loc, len, hash_result}));
        builder.CreateBr(continue_block);

        builder.SetInsertPoint(not_null_block);
@@ -477,15 +480,15 @@ Function* OldHashTable::CodegenHashCurrentRow(RuntimeState* state) {
      // Convert expr_values_buffer_ loc to llvm value
      Value* str_val = codegen->CastPtrToLlvmPtr(codegen->GetPtrType(TYPE_STRING), loc);

-      Value* ptr = builder.CreateStructGEP(str_val, 0, "ptr");
-      Value* len = builder.CreateStructGEP(str_val, 1, "len");
+      Value* ptr = builder.CreateStructGEP(NULL, str_val, 0, "ptr");
+      Value* len = builder.CreateStructGEP(NULL, str_val, 1, "len");
      ptr = builder.CreateLoad(ptr);
      len = builder.CreateLoad(len);

      // Call hash(ptr, len, hash_result);
      Function* general_hash_fn = codegen->GetHashFunction();
      Value* string_hash_result =
-          builder.CreateCall3(general_hash_fn, ptr, len, hash_result);
+          builder.CreateCall(general_hash_fn, ArrayRef<Value*>({ptr, len, hash_result}));

      if (stores_nulls_) {
        builder.CreateBr(continue_block);
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -1491,14 +1491,14 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(

  // Src slot is not null, update dst_slot
  builder.SetInsertPoint(src_not_null_block);
-  Value* dst_ptr =
-      builder.CreateStructGEP(agg_tuple_arg, slot_desc->field_idx(), "dst_slot_ptr");
+  Value* dst_ptr = builder.CreateStructGEP(NULL, agg_tuple_arg, slot_desc->llvm_field_idx(),
+      "dst_slot_ptr");
  Value* result = NULL;

  if (slot_desc->is_nullable()) {
    // Dst is NULL, just update dst slot to src slot and clear null bit
    Function* clear_null_fn = slot_desc->GetUpdateNullFn(codegen, false);
-    builder.CreateCall(clear_null_fn, agg_tuple_arg);
+    builder.CreateCall(clear_null_fn, ArrayRef<Value*>({agg_tuple_arg}));
  }

  // Update the slot
@@ -1574,7 +1574,8 @@ Status PartitionedAggregationNode::CodegenUpdateSlot(
          builder.CreateBitCast(dst_lowered_ptr, unlowered_ptr_type, "dst_unlowered_ptr");

      // Call 'ir_fn'
-      builder.CreateCall3(ir_fn, fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr);
+      builder.CreateCall(ir_fn,
+          ArrayRef<Value*>({fn_ctx_arg, src_unlowered_ptr, dst_unlowered_ptr}));

      // Convert StringVal intermediate 'dst_arg' back to StringValue
      Value* anyval_result = builder.CreateLoad(dst_lowered_ptr, "anyval_result");
@@ -1711,9 +1712,9 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(Function** fn) {
    if (evaluator->is_count_star()) {
      // TODO: we should be able to hoist this up to the loop over the batch and just
      // increment the slot by the number of rows in the batch.
-      int field_idx = slot_desc->field_idx();
+      int field_idx = slot_desc->llvm_field_idx();
      Value* const_one = codegen->GetIntConstant(TYPE_BIGINT, 1);
-      Value* slot_ptr = builder.CreateStructGEP(tuple_arg, field_idx, "src_slot");
+      Value* slot_ptr = builder.CreateStructGEP(NULL, tuple_arg, field_idx, "src_slot");
      Value* slot_loaded = builder.CreateLoad(slot_ptr, "count_star_val");
      Value* count_inc = builder.CreateAdd(slot_loaded, const_one, "count_star_inc");
      builder.CreateStore(count_inc, slot_ptr);
@@ -1722,7 +1723,7 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(Function** fn) {
      RETURN_IF_ERROR(CodegenUpdateSlot(evaluator, slot_desc, &update_slot_fn));
      Value* fn_ctx_ptr = builder.CreateConstGEP1_32(agg_fn_ctxs_arg, i);
      Value* fn_ctx = builder.CreateLoad(fn_ctx_ptr, "fn_ctx");
-      builder.CreateCall3(update_slot_fn, fn_ctx, tuple_arg, row_arg);
+      builder.CreateCall(update_slot_fn, ArrayRef<Value*>({fn_ctx, tuple_arg, row_arg}));
    }
  }
  builder.CreateRetVoid();
@@ -1782,7 +1783,7 @@ Status PartitionedAggregationNode::CodegenProcessBatch(Function** fn) {

  replaced = codegen->ReplaceCallSites(process_batch_fn, update_tuple_fn, "UpdateTuple");
  DCHECK_GE(replaced, 1);
-  *fn = codegen->OptimizeFunctionWithExprs(process_batch_fn);
+  *fn = codegen->FinalizeFunction(process_batch_fn);
  if (*fn == NULL) {
    return Status("PartitionedAggregationNode::CodegenProcessBatch(): codegen'd "
        "ProcessBatch() function failed verification, see log");
@@ -1837,7 +1838,7 @@ Status PartitionedAggregationNode::CodegenProcessBatchStreaming(Function** fn) {
  DCHECK_EQ(replaced, 1);

  DCHECK(process_batch_streaming_fn != NULL);
-  *fn = codegen->OptimizeFunctionWithExprs(process_batch_streaming_fn);
+  *fn = codegen->FinalizeFunction(process_batch_streaming_fn);
  if (*fn == NULL) {
    return Status("PartitionedAggregationNode::CodegenProcessBatchStreaming(): codegen'd "
        "ProcessBatchStreaming() function failed verification, see log");
--- a/be/src/exec/partitioned-hash-join-node.cc
+++ b/be/src/exec/partitioned-hash-join-node.cc
@@ -1607,13 +1607,13 @@ Status PartitionedHashJoinNode::CodegenProcessBuildBatch(
      ConstantInt::get(Type::getInt1Ty(codegen->context()), false));

  // Finalize ProcessBuildBatch functions
-  process_build_batch_fn = codegen->OptimizeFunctionWithExprs(process_build_batch_fn);
+  process_build_batch_fn = codegen->FinalizeFunction(process_build_batch_fn);
  if (process_build_batch_fn == NULL) {
    return Status("Codegen'd PartitionedHashJoinNode::ProcessBuildBatch() function "
        "failed verification, see log");
  }
  process_build_batch_fn_level0 =
-      codegen->OptimizeFunctionWithExprs(process_build_batch_fn_level0);
+      codegen->FinalizeFunction(process_build_batch_fn_level0);
-  if (process_build_batch_fn == NULL) {
+  if (process_build_batch_fn_level0 == NULL) {
    return Status("Codegen'd level-zero PartitionedHashJoinNode::ProcessBuildBatch() "
        "function failed verification, see log");
@@ -1765,13 +1765,13 @@ Status PartitionedHashJoinNode::CodegenProcessProbeBatch(
  DCHECK_EQ(replaced, 1);

  // Finalize ProcessProbeBatch functions
-  process_probe_batch_fn = codegen->OptimizeFunctionWithExprs(process_probe_batch_fn);
+  process_probe_batch_fn = codegen->FinalizeFunction(process_probe_batch_fn);
  if (process_probe_batch_fn == NULL) {
    return Status("PartitionedHashJoinNode::CodegenProcessProbeBatch(): codegen'd "
        "ProcessProbeBatch() function failed verification, see log");
  }
  process_probe_batch_fn_level0 =
-      codegen->OptimizeFunctionWithExprs(process_probe_batch_fn_level0);
+      codegen->FinalizeFunction(process_probe_batch_fn_level0);
  if (process_probe_batch_fn_level0 == NULL) {
    return Status("PartitionedHashJoinNode::CodegenProcessProbeBatch(): codegen'd "
        "level-zero ProcessProbeBatch() function failed verification, see log");
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -141,12 +141,12 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
  Value* is_null;
  if (check_null) {
    if (is_default_null) {
-      is_null = builder.CreateCall2(is_null_string_fn, args[1], args[2]);
+      is_null = builder.CreateCall(is_null_string_fn,
+          ArrayRef<Value*>({args[1], args[2]}));
    } else {
-      is_null = builder.CreateCall4(is_null_string_fn, args[1], args[2],
-          codegen->CastPtrToLlvmPtr(codegen->ptr_type(),
-              const_cast<char*>(null_col_val)),
-          codegen->GetIntConstant(TYPE_INT, len));
+      is_null = builder.CreateCall(is_null_string_fn, ArrayRef<Value*>({args[1], args[2],
+          codegen->CastPtrToLlvmPtr(codegen->ptr_type(), const_cast<char*>(null_col_val)),
+          codegen->GetIntConstant(TYPE_INT, len)}));
    }
  } else {
    // Constant FALSE as branch condition. We rely on later optimization passes
@@ -168,11 +168,11 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,

  // Codegen parse slot block
  builder.SetInsertPoint(parse_slot_block);
-  Value* slot = builder.CreateStructGEP(args[0], slot_desc->field_idx(), "slot");
+  Value* slot = builder.CreateStructGEP(NULL, args[0], slot_desc->llvm_field_idx(), "slot");

  if (slot_desc->type().IsVarLenStringType()) {
-    Value* ptr = builder.CreateStructGEP(slot, 0, "string_ptr");
-    Value* len = builder.CreateStructGEP(slot, 1, "string_len");
+    Value* ptr = builder.CreateStructGEP(NULL, slot, 0, "string_ptr");
+    Value* len = builder.CreateStructGEP(NULL, slot, 1, "string_len");

    builder.CreateStore(args[1], ptr);
    // TODO codegen memory allocation for CHAR
@@ -229,7 +229,8 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
    Value* failed_value = codegen->GetIntConstant(TYPE_INT, StringParser::PARSE_FAILURE);

    // Call Impala's StringTo* function
-    Value* result = builder.CreateCall3(parse_fn, args[1], args[2], parse_result_ptr);
+    Value* result = builder.CreateCall(parse_fn,
+        ArrayRef<Value*>({args[1], args[2], parse_result_ptr}));
    Value* parse_result_val = builder.CreateLoad(parse_result_ptr, "parse_result");

    // Check for parse error.  TODO: handle overflow
@@ -243,13 +244,13 @@ Function* TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,

    // Parse failed, set slot to null and return false
    builder.SetInsertPoint(parse_failed_block);
-    builder.CreateCall(set_null_fn, args[0]);
+    builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
    builder.CreateRet(codegen->false_value());
  }

  // Case where data is \N or len == 0 and it is not a string col
  builder.SetInsertPoint(set_null_block);
-  builder.CreateCall(set_null_fn, args[0]);
+  builder.CreateCall(set_null_fn, ArrayRef<Value*>({args[0]}));
  builder.CreateRet(codegen->true_value());

  return codegen->FinalizeFunction(fn);
--- a/be/src/exprs/expr-codegen-test.cc
+++ b/be/src/exprs/expr-codegen-test.cc
@@ -113,6 +113,14 @@ class ExprCodegenTest : public ::testing::Test {
    EXPECT_EQ(constants_.arg2_type_size, 0); // varlen
  }

+  static bool VerifyFunction(LlvmCodeGen* codegen, llvm::Function* fn) {
+    return codegen->VerifyFunction(fn);
+  }
+
+  static void ResetVerification(LlvmCodeGen* codegen) {
+    codegen->ResetVerification();
+  }
+
  FunctionContext* fn_ctx_;
  Constants constants_;
 };
@@ -239,19 +247,19 @@ TEST_F(ExprCodegenTest, TestInlineConstants) {
  stringstream test_udf_file;
  test_udf_file << getenv("IMPALA_HOME") << "/be/build/latest/exprs/expr-codegen-test.ll";
  scoped_ptr<LlvmCodeGen> codegen;
-  ASSERT_OK(LlvmCodeGen::LoadFromFile(&pool, test_udf_file.str(), "test", &codegen));
+  ASSERT_OK(LlvmCodeGen::CreateFromFile(&pool, test_udf_file.str(), "test", &codegen));
  Function* fn = codegen->module()->getFunction(TEST_GET_CONSTANT_SYMBOL);
  ASSERT_TRUE(fn != NULL);

  // Function verification should fail because we haven't inlined GetConstant() calls
-  bool verification_succeeded = codegen->VerifyFunction(fn);
+  bool verification_succeeded = VerifyFunction(codegen.get(), fn);
  EXPECT_FALSE(verification_succeeded);

  // Call InlineConstants() and rerun verification
  int replaced = InlineConstants(ctx->root(), codegen.get(), fn);
  EXPECT_EQ(replaced, 4);
-  codegen->ResetVerification();
-  verification_succeeded = codegen->VerifyFunction(fn);
+  ResetVerification(codegen.get());
+  verification_succeeded = VerifyFunction(codegen.get(), fn);
  EXPECT_TRUE(verification_succeeded) << LlvmCodeGen::Print(fn);

  // Compile module
--- a/be/src/exprs/expr.cc
+++ b/be/src/exprs/expr.cc
@@ -15,11 +15,11 @@
 #include <sstream>

 #include <llvm/ExecutionEngine/ExecutionEngine.h>
-#include <llvm/PassManager.h>
+#include <llvm/IR/InstIterator.h>
+#include <llvm/IR/LegacyPassManager.h>
 #include <llvm/Transforms/Scalar.h>
 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
 #include <llvm/Transforms/Utils/UnrollLoop.h>
-#include <llvm/Support/InstIterator.h>
 #include <thrift/protocol/TDebugProtocol.h>

 #include "codegen/codegen-anyval.h"
--- a/be/src/exprs/scalar-fn-call.cc
+++ b/be/src/exprs/scalar-fn-call.cc
@@ -291,7 +291,7 @@ Status ScalarFnCall::GetCodegendComputeFn(RuntimeState* state, llvm::Function**

  // First argument is always FunctionContext*.
  // Index into our registered offset in the ExprContext.
-  llvm::Value* expr_ctx_gep = builder.CreateStructGEP(expr_ctx, 1, "expr_ctx_gep");
+  llvm::Value* expr_ctx_gep = builder.CreateStructGEP(NULL, expr_ctx, 1, "expr_ctx_gep");
  llvm::Value* fn_ctxs_base = builder.CreateLoad(expr_ctx_gep, "fn_ctxs_base");
  // Use GEP to add our index to the base pointer
  llvm::Value* fn_ctx_ptr =
@@ -305,12 +305,12 @@ Status ScalarFnCall::GetCodegendComputeFn(RuntimeState* state, llvm::Function**
  if (vararg_start_idx_ != -1) {
    // FunctionContextImpl is first field of FunctionContext
    // fn_ctx_impl_ptr has type FunctionContextImpl**
-    llvm::Value* fn_ctx_impl_ptr = builder.CreateStructGEP(fn_ctx, 0, "fn_ctx_impl_ptr");
+    llvm::Value* fn_ctx_impl_ptr = builder.CreateStructGEP(NULL, fn_ctx, 0, "fn_ctx_impl_ptr");
    llvm::Value* fn_ctx_impl = builder.CreateLoad(fn_ctx_impl_ptr, "fn_ctx_impl");
    // varargs_buffer is first field of FunctionContextImpl
    // varargs_buffer_ptr has type i8**
-    llvm::Value* varargs_buffer_ptr =
-        builder.CreateStructGEP(fn_ctx_impl, 0, "varargs_buffer");
+    llvm::Value* varargs_buffer_ptr = builder.CreateStructGEP(NULL, fn_ctx_impl, 0,
+        "varargs_buffer");
    varargs_buffer = builder.CreateLoad(varargs_buffer_ptr);
  }
  // Tracks where to write the next vararg to
@@ -387,7 +387,10 @@ Status ScalarFnCall::GetCodegendComputeFn(RuntimeState* state, llvm::Function**
  builder.CreateRet(result_val);

  *fn = codegen->FinalizeFunction(*fn);
-  DCHECK(*fn != NULL);
+  if (*fn == NULL) {
+    return Status(
+        TErrorCode::UDF_VERIFY_FAILED, fn_.scalar_fn.symbol, fn_.hdfs_location);
+  }
  ir_compute_fn_ = *fn;
  return Status::OK();
 }
@@ -540,7 +543,7 @@ void ScalarFnCall::EvaluateChildren(ExprContext* context, TupleRow* row,

 template<typename RETURN_TYPE>
 RETURN_TYPE ScalarFnCall::InterpretEval(ExprContext* context, TupleRow* row) {
-  DCHECK(scalar_fn_ != NULL);
+  DCHECK(scalar_fn_ != NULL) << DebugString();
  FunctionContext* fn_ctx = context->fn_context(fn_context_index_);
  vector<AnyVal*>* input_vals = fn_ctx->impl()->staging_input_vals();
  EvaluateChildren(context, row, input_vals);
--- a/be/src/exprs/slot-ref.cc
+++ b/be/src/exprs/slot-ref.cc
@@ -247,17 +247,17 @@ Status SlotRef::GetCodegendComputeFn(RuntimeState* state, llvm::Function** fn) {
  Value* time_of_day = NULL;
  Value* date = NULL;
  if (type_.IsStringType()) {
-    Value* ptr_ptr = builder.CreateStructGEP(val_ptr, 0, "ptr_ptr");
+    Value* ptr_ptr = builder.CreateStructGEP(NULL, val_ptr, 0, "ptr_ptr");
    ptr = builder.CreateLoad(ptr_ptr, "ptr");
-    Value* len_ptr = builder.CreateStructGEP(val_ptr, 1, "len_ptr");
+    Value* len_ptr = builder.CreateStructGEP(NULL, val_ptr, 1, "len_ptr");
    len = builder.CreateLoad(len_ptr, "len");
  } else if (type() == TYPE_TIMESTAMP) {
-    Value* time_of_day_ptr = builder.CreateStructGEP(val_ptr, 0, "time_of_day_ptr");
+    Value* time_of_day_ptr = builder.CreateStructGEP(NULL, val_ptr, 0, "time_of_day_ptr");
    // Cast boost::posix_time::time_duration to i64
    Value* time_of_day_cast =
        builder.CreateBitCast(time_of_day_ptr, codegen->GetPtrType(TYPE_BIGINT));
    time_of_day = builder.CreateLoad(time_of_day_cast, "time_of_day");
-    Value* date_ptr = builder.CreateStructGEP(val_ptr, 1, "date_ptr");
+    Value* date_ptr = builder.CreateStructGEP(NULL, val_ptr, 1, "date_ptr");
    // Cast boost::gregorian::date to i32
    Value* date_cast = builder.CreateBitCast(date_ptr, codegen->GetPtrType(TYPE_INT));
    date = builder.CreateLoad(date_cast, "date");
--- a/be/src/runtime/decimal-value.inline.h
+++ b/be/src/runtime/decimal-value.inline.h
@@ -93,7 +93,9 @@ inline DecimalValue<T> DecimalValue<T>::ScaleTo(int src_scale, int dst_scale,
  return DecimalValue(result);
 }

-#if 5 <= __GNUC__ || __has_builtin(__builtin_add_overflow)
+// Use __builtin_add_overflow on GCC if available.
+// Avoid using on Clang: it regresses performance.
+#if 5 <= __GNUC__
 template<typename T>
 template<typename RESULT_T>
 inline DecimalValue<RESULT_T> DecimalValue<T>::Add(int this_scale,
@@ -143,7 +145,10 @@ inline DecimalValue<RESULT_T> DecimalValue<T>::Add(int this_scale,
 }
 #endif

-#if 5 <= __GNUC__ || __has_builtin(__builtin_mul_overflow)
+// Use __builtin_mul_overflow on GCC if available.
+// Avoid using on Clang: it requires a function __muloti4 present in the Clang runtime
+// library but not the GCC runtime library, and it regresses performance.
+#if 5 <= __GNUC__
 template<typename T>
 template<typename RESULT_T>
 inline DecimalValue<RESULT_T> DecimalValue<T>::Multiply(int this_scale,
--- a/be/src/runtime/descriptors.cc
+++ b/be/src/runtime/descriptors.cc
@@ -84,7 +84,7 @@ SlotDescriptor::SlotDescriptor(
    null_indicator_offset_(tdesc.nullIndicatorByte, tdesc.nullIndicatorBit),
    slot_idx_(tdesc.slotIdx),
    slot_size_(type_.GetSlotSize()),
-    field_idx_(-1),
+    llvm_field_idx_(-1),
    is_null_fn_(NULL),
    set_not_null_fn_(NULL),
    set_null_fn_(NULL) {
@@ -122,7 +122,7 @@ string SlotDescriptor::DebugString() const {
    out << " collection_item_tuple_id=" << collection_item_descriptor_->id();
  }
  out << " offset=" << tuple_offset_ << " null=" << null_indicator_offset_.DebugString()
-      << " slot_idx=" << slot_idx_ << " field_idx=" << field_idx_
+      << " slot_idx=" << slot_idx_ << " field_idx=" << llvm_field_idx_
      << ")";
  return out.str();
 }
@@ -601,7 +601,8 @@ Function* SlotDescriptor::GetIsNullFn(LlvmCodeGen* codegen) const {
  Value* tuple_ptr;
  Function* fn = prototype.GeneratePrototype(&builder, &tuple_ptr);

-  Value* null_byte_ptr = builder.CreateStructGEP(tuple_ptr, byte_offset, "null_byte_ptr");
+  Value* null_byte_ptr = builder.CreateStructGEP(NULL, tuple_ptr, byte_offset,
+      "null_byte_ptr");
  Value* null_byte = builder.CreateLoad(null_byte_ptr, "null_byte");
  Value* null_mask = builder.CreateAnd(null_byte, mask, "null_mask");
  Value* is_null = builder.CreateICmpNE(null_mask, zero, "is_null");
@@ -635,8 +636,7 @@ Function* SlotDescriptor::GetUpdateNullFn(LlvmCodeGen* codegen, bool set_null) c
  Value* tuple_ptr;
  Function* fn = prototype.GeneratePrototype(&builder, &tuple_ptr);

-  Value* null_byte_ptr =
-      builder.CreateStructGEP(
+  Value* null_byte_ptr = builder.CreateStructGEP(NULL,
          tuple_ptr, null_indicator_offset_.byte_offset, "null_byte_ptr");
  Value* null_byte = builder.CreateLoad(null_byte_ptr, "null_byte");
  Value* result = NULL;
@@ -663,53 +663,57 @@ Function* SlotDescriptor::GetUpdateNullFn(LlvmCodeGen* codegen, bool set_null) c
  return fn;
 }

-// The default llvm packing is identical to what we do in the FE.  Each field is aligned
-// to begin on the size for that type.
-// TODO: Understand llvm::SetTargetData which allows you to explicitly define the packing
-// rules.
 StructType* TupleDescriptor::GetLlvmStruct(LlvmCodeGen* codegen) const {
  // If we already generated the llvm type, just return it.
  if (llvm_struct_ != NULL) return llvm_struct_;

-  // For each null byte, add a byte to the struct
-  vector<Type*> struct_fields;
-  struct_fields.resize(num_null_bytes_ + slots_.size());
-  for (int i = 0; i < num_null_bytes_; ++i) {
-    struct_fields[i] = codegen->GetType(TYPE_TINYINT);
+  // Sort slots in the order they will appear in LLVM struct.
+  vector<SlotDescriptor*> sorted_slots(slots_.size());
+  for (SlotDescriptor* slot: slots_) {
+    sorted_slots[slot->slot_idx_] = slot;
  }

+  // For each null byte, add a byte to the struct
+  vector<Type*> struct_fields;
+  for (int i = 0; i < num_null_bytes_; ++i) {
+    struct_fields.push_back(codegen->GetType(TYPE_TINYINT));
+  }
+  int curr_struct_offset = num_null_bytes_;
+
  // Add the slot types to the struct description.
-  for (int i = 0; i < slots().size(); ++i) {
-    SlotDescriptor* slot_desc = slots()[i];
-    if (slot_desc->type().type == TYPE_CHAR) return NULL;
-    slot_desc->field_idx_ = slot_desc->slot_idx_ + num_null_bytes_;
-    DCHECK_LT(slot_desc->field_idx(), struct_fields.size());
-    struct_fields[slot_desc->field_idx()] = codegen->GetType(slot_desc->type());
+  for (SlotDescriptor* slot: sorted_slots) {
+    // IMPALA-3207: Codegen for CHAR is not yet implemented: bail out of codegen here.
+    if (slot->type().type == TYPE_CHAR) return NULL;
+    DCHECK_LE(curr_struct_offset, slot->tuple_offset());
+    if (curr_struct_offset < slot->tuple_offset()) {
+      // Need to add padding to ensure slots are aligned correctly. Clang likes to
+      // sometimes pad structs in its own way. When it does this, it sets the 'packed'
+      // flag, which means that at the LLVM level the struct type has no alignment
+      // requirements, even if it does at the C language level.
+      struct_fields.push_back(ArrayType::get(codegen->GetType(TYPE_TINYINT),
+          slot->tuple_offset() - curr_struct_offset));
+    }
+    slot->llvm_field_idx_ = struct_fields.size();
+    struct_fields.push_back(codegen->GetType(slot->type()));
+    curr_struct_offset = slot->tuple_offset() + slot->slot_size();
+  }
+  DCHECK_LE(curr_struct_offset, byte_size_);
+  if (curr_struct_offset < byte_size_) {
+    struct_fields.push_back(ArrayType::get(codegen->GetType(TYPE_TINYINT),
+        byte_size_ - curr_struct_offset));
  }

  // Construct the struct type.
+  // We don't mark the struct as packed but it shouldn't matter either way: LLVM should
+  // not insert any additional padding since the contents are already aligned.
  StructType* tuple_struct = StructType::get(codegen->context(),
      ArrayRef<Type*>(struct_fields));
-
-  // Verify the alignment is correct.  It is essential that the layout matches
-  // identically.  If the layout does not match, return NULL indicating the
-  // struct could not be codegen'd.  This will trigger codegen for anything using
-  // the tuple to be disabled.
-  const DataLayout* data_layout = codegen->execution_engine()->getDataLayout();
-  const StructLayout* layout = data_layout->getStructLayout(tuple_struct);
-  if (layout->getSizeInBytes() != byte_size()) {
-    DCHECK_EQ(layout->getSizeInBytes(), byte_size());
-    return NULL;
-  }
-  for (int i = 0; i < slots().size(); ++i) {
-    SlotDescriptor* slot_desc = slots()[i];
-    int field_idx = slot_desc->field_idx();
+  const DataLayout& data_layout = codegen->execution_engine()->getDataLayout();
+  const StructLayout* layout = data_layout.getStructLayout(tuple_struct);
+  for (SlotDescriptor* slot: slots()) {
    // Verify that the byte offset in the llvm struct matches the tuple offset
-    // computed in the FE
-    if (layout->getElementOffset(field_idx) != slot_desc->tuple_offset()) {
-      DCHECK_EQ(layout->getElementOffset(field_idx), slot_desc->tuple_offset());
-      return NULL;
-    }
+    // computed in the FE.
+    DCHECK_EQ(layout->getElementOffset(slot->llvm_field_idx()), slot->tuple_offset());
  }
  llvm_struct_ = tuple_struct;
  return tuple_struct;
--- a/be/src/runtime/descriptors.h
+++ b/be/src/runtime/descriptors.h
@@ -113,7 +113,7 @@ class SlotDescriptor {
  int col_pos() const { return col_path_[0]; }
  const SchemaPath& col_path() const { return col_path_; }
  /// Returns the field index in the generated llvm struct for this slot's tuple
-  int field_idx() const { return field_idx_; }
+  int llvm_field_idx() const { return llvm_field_idx_; }
  int tuple_offset() const { return tuple_offset_; }
  const NullIndicatorOffset& null_indicator_offset() const {
    return null_indicator_offset_;
@@ -160,10 +160,10 @@ class SlotDescriptor {
  /// the byte size of this slot.
  const int slot_size_;

-  /// the idx of the slot in the llvm codegen'd tuple struct
-  /// this is set by TupleDescriptor during codegen and takes into account
-  /// leading null bytes.
-  int field_idx_;
+  /// The index of the slot in the LLVM codegen'd tuple struct.
+  /// This is set by TupleDescriptor during codegen and takes into account
+  /// leading null bytes and any padding bytes.
+  int llvm_field_idx_;

  /// Cached codegen'd functions
  mutable llvm::Function* is_null_fn_;
--- a/be/src/runtime/lib-cache.cc
+++ b/be/src/runtime/lib-cache.cc
@@ -396,12 +396,12 @@ Status LibCache::GetCacheEntryInternal(const string& hdfs_lib_file, LibType type
    RETURN_IF_ERROR(
        DynamicOpen((*entry)->local_path.c_str(), &(*entry)->shared_object_handle));
  } else if (type == TYPE_IR) {
-    // Load the module and populate all symbols.
+    // Load the module temporarily and populate all symbols.
    ObjectPool pool;
    scoped_ptr<LlvmCodeGen> codegen;
    string module_id = filesystem::path((*entry)->local_path).stem().string();
-    RETURN_IF_ERROR(LlvmCodeGen::LoadFromFile(
-        &pool, (*entry)->local_path, module_id, &codegen));
+    RETURN_IF_ERROR(
+        LlvmCodeGen::CreateFromFile(&pool, (*entry)->local_path, module_id, &codegen));
    codegen->GetSymbols(&(*entry)->symbols);
  } else {
    DCHECK_EQ(type, TYPE_JAR);
--- a/be/src/runtime/runtime-state.cc
+++ b/be/src/runtime/runtime-state.cc
@@ -184,7 +184,7 @@ Status RuntimeState::CreateBlockMgr() {
 Status RuntimeState::CreateCodegen() {
  if (codegen_.get() != NULL) return Status::OK();
  // TODO: add the fragment ID to the codegen ID as well
-  RETURN_IF_ERROR(LlvmCodeGen::LoadImpalaIR(
+  RETURN_IF_ERROR(LlvmCodeGen::CreateImpalaCodegen(
      obj_pool_.get(), PrintId(fragment_instance_id()), &codegen_));
  codegen_->EnableOptimizations(true);
  profile_.AddChild(codegen_->runtime_profile());
--- a/bin/bootstrap_toolchain.py
+++ b/bin/bootstrap_toolchain.py
@@ -297,6 +297,6 @@ def unpack_name_and_version(package):

 if __name__ == "__main__":
  packages = ["avro", "binutils", "boost", "bzip2", "gcc", "gflags", "glog", "gperftools",
-      "gtest", "kudu", "llvm", ("llvm", "3.3-p1"), ("llvm", "3.7.0"), "lz4", "openldap",
+      "gtest", "kudu", "llvm", ("llvm", "3.8.0-asserts-p1"), "lz4", "openldap",
      "rapidjson", "re2", "snappy", "thrift", "zlib"]
  bootstrap(packages)
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -231,9 +231,9 @@ export IMPALA_GLOG_VERSION=0.3.2
 export IMPALA_GPERFTOOLS_VERSION=2.5
 export IMPALA_GTEST_VERSION=1.6.0
 export IMPALA_KUDU_VERSION=0.8.0-RC1
-export IMPALA_LLVM_VERSION=3.3
-export IMPALA_LLVM_DEBUG_VERSION=3.3
-export IMPALA_LLVM_ASAN_VERSION=3.7.0
+export IMPALA_LLVM_VERSION=3.8.0
+export IMPALA_LLVM_DEBUG_VERSION=3.8.0
+export IMPALA_LLVM_ASAN_VERSION=3.8.0
 export IMPALA_LZ4_VERSION=svn
 export IMPALA_MIN_BOOST_VERSION=1.46.0
 export IMPALA_OPENLDAP_VERSION=2.4.25
@@ -246,18 +246,18 @@ export IMPALA_THRIFT_VERSION=0.9.0
 export IMPALA_THRIFT_JAVA_VERSION=0.9.0
 export IMPALA_ZLIB_VERSION=1.2.8

-# Some of the variables need to be overwritten to explicitely mark the patch level
+# Some of the variables need to be overwritten to explicitly mark the patch level
 if [[ -n "$IMPALA_TOOLCHAIN" ]]; then
  IMPALA_AVRO_VERSION+=-p4
  IMPALA_BZIP2_VERSION+=-p1
  IMPALA_GLOG_VERSION+=-p1
  IMPALA_THRIFT_VERSION+=-p2
  IMPALA_RE2_VERSION+=-p1
-  IMPALA_LLVM_VERSION+=-no-asserts-p1
-  # Debug builds should use the default release-with-assertions build from the toolchain
-  # Note that the default toolchain build of 3.7 and trunk is release with no assertions,
-  # so this will need to be revisited when upgrading the LLVM version.
-  IMPALA_LLVM_DEBUG_VERSION+=-p1
+  IMPALA_LLVM_VERSION+=-p1
+  IMPALA_LLVM_ASAN_VERSION+=-p1
+  # Debug builds should use the release+asserts build to get additional coverage.
+  # Don't use the LLVM debug build because the binaries are too large to distribute.
+  IMPALA_LLVM_DEBUG_VERSION+=-asserts-p1
 fi

 export KUDU_MASTER=${KUDU_MASTER:-"127.0.0.1"}
@@ -270,7 +270,6 @@ export KUDU_JAVA_VERSION=0.6.0
 if [[ $OSTYPE == "darwin"* ]]; then
  IMPALA_CYRUS_SASL_VERSION=2.1.26
  IMPALA_GPERFTOOLS_VERSION=2.3
-  IMPALA_LLVM_VERSION=3.3-p1
  IMPALA_OPENSSL_VERSION=1.0.1p
  IMPALA_THRIFT_VERSION=0.9.2
  IMPALA_THRIFT_JAVA_VERSION=0.9.2
--- a/cmake_modules/FindLlvm.cmake
+++ b/cmake_modules/FindLlvm.cmake
@@ -66,7 +66,7 @@ execute_process(
 # Get the link libs we need.  llvm has many and we don't want to link all of the libs
 # if we don't need them.
 execute_process(
-  COMMAND ${LLVM_CONFIG_EXECUTABLE} --libnames core jit native ipo bitreader target linker
+  COMMAND ${LLVM_CONFIG_EXECUTABLE} --libnames core mcjit native ipo bitreader target linker analysis
  OUTPUT_VARIABLE LLVM_MODULE_LIBS
  OUTPUT_STRIP_TRAILING_WHITESPACE
 )
@@ -74,7 +74,7 @@ execute_process(
 # CMake really doesn't like adding link directories and wants absolute paths
 # Reconstruct it with LLVM_MODULE_LIBS and LLVM_LIBRARY_DIR
 string(REPLACE " " ";" LIBS_LIST ${LLVM_MODULE_LIBS})
-set (LLVM_MODULE_LIBS "-ldl")
+set (LLVM_MODULE_LIBS "")
 foreach (LIB ${LIBS_LIST})
  set(LLVM_MODULE_LIBS ${LLVM_MODULE_LIBS} "${LLVM_LIBRARY_DIR}/${LIB}")
 endforeach(LIB)