IMPALA-5081: Add codegen_opt_level query option

Adds the 'codegen_opt_level' query option to select LLVM optimization level for generated code. Retains the prior behavior - O2 - as default. If optimization level is changed for an entry already in cache, the cache entry will be used unless the new optimization level is higher than the cached level. Adds additional counters for NumOptimizedFunctions and NumOptimizedInstructions, which allow observing some impacts from codegen optimization. These additional counters, and tracking opt level for cached entries, increases the size of each cached entry. Adds unit tests for all optimizition levels checking - that small functions are inlined at higher levels (as a way to verify that optimization level has an effect) - codegen cache entries are updated when optimizing the same fragment at a higher level, and not updated the rest of the time Change-Id: I371f8758b6552263e91a1fbfd9a6e1c28e1fa2bd Reviewed-on: http://gerrit.cloudera.org:8080/20399 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2023-08-22 13:14:40 -07:00
parent 379038f763
commit 7230c57f67
12 changed files with 306 additions and 39 deletions
--- a/be/src/codegen/CMakeLists.txt
+++ b/be/src/codegen/CMakeLists.txt
@@ -112,9 +112,16 @@ add_custom_target(test-loop.bc
  SOURCES ${CMAKE_SOURCE_DIR}/testdata/llvm/test-loop.cc
 )

+# Provide "unoptimized" output. Clang O0 adds attributes to everything which prevents us
+# doing optimizations in the test, so use O1 instead.
+add_custom_target(test-opt.bc
+  COMMAND ${LLVM_CLANG_EXECUTABLE} ${CLANG_IR_CXX_FLAGS} -O1 ${CLANG_INCLUDE_FLAGS} ${CMAKE_SOURCE_DIR}/testdata/llvm/test-opt.cc -o ${CMAKE_SOURCE_DIR}/llvm-ir/test-opt.bc
+  SOURCES ${CMAKE_SOURCE_DIR}/testdata/llvm/test-opt.cc
+)
+
 # Exception to unified be tests: custom main initializes LLVM
 ADD_BE_LSAN_TEST(llvm-codegen-test)
-add_dependencies(llvm-codegen-test test-loop.bc)
+add_dependencies(llvm-codegen-test test-loop.bc test-opt.bc)
 ADD_BE_LSAN_TEST(llvm-codegen-cache-test LlvmCodegenCacheTest.*)

 ADD_UNIFIED_BE_LSAN_TEST(instruction-counter-test InstructionCounterTest.*)
--- a/be/src/codegen/llvm-codegen-cache-test.cc
+++ b/be/src/codegen/llvm-codegen-cache-test.cc
@@ -136,6 +136,7 @@ class LlvmCodeGenCacheTest : public testing::Test {
  scoped_ptr<CodeGenCache> codegen_cache_;
  shared_ptr<LlvmExecutionEngineWrapper> exec_engine_wrapper_;
  TQueryOptions query_options_;
+  TCodeGenOptLevel::type opt_level_ = TCodeGenOptLevel::O2;
 };

 void LlvmCodeGenCacheTest::AddLlvmCodegenEcho(LlvmCodeGen* codegen) {
@@ -214,7 +215,7 @@ void LlvmCodeGenCacheTest::TestBasicFunction(TCodeGenCacheMode::type mode) {
      GetMemCharge(codegen_double.get(), cache_key.data(), is_normal_mode);

  // Store and lookup the entry by the key.
-  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode));
+  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode, opt_level_));
  CheckInUseMetrics(
      codegen_cache_.get(), 1 /*num_entry_in_use*/, mem_charge /*bytes_in_use*/);
  EXPECT_OK(codegen_cache_->Lookup(cache_key, mode, &entry, &exec_engine_wrapper_));
@@ -225,7 +226,7 @@ void LlvmCodeGenCacheTest::TestBasicFunction(TCodeGenCacheMode::type mode) {
  CheckResult(entry);
  // Override the entry with a different function, should be able to find the new
  // function from the new entry.
-  EXPECT_OK(codegen_cache_->Store(cache_key, codegen_double.get(), mode));
+  EXPECT_OK(codegen_cache_->Store(cache_key, codegen_double.get(), mode, opt_level_));
  CheckInUseMetrics(
      codegen_cache_.get(), 1 /*num_entry_in_use*/, mem_charge_double /*bytes_in_use*/);
  EXPECT_OK(codegen_cache_->Lookup(cache_key, mode, &entry, &exec_engine_wrapper_));
@@ -269,12 +270,12 @@ int64_t LlvmCodeGenCacheTest::GetMemCharge(
 void LlvmCodeGenCacheTest::TestAtCapacity(TCodeGenCacheMode::type mode) {
  int64_t codegen_cache_capacity = 196; // 196B
  bool is_normal_mode = !CodeGenCacheModeAnalyzer::is_optimal(mode);
-  // 128B for optimal mode, or 140B on ARM systems
+  // 150B for optimal mode, or 164B on ARM systems
  if (!is_normal_mode) {
 #ifdef __aarch64__
-    codegen_cache_capacity = 140;
+    codegen_cache_capacity = 164;
 #else
-    codegen_cache_capacity = 128;
+    codegen_cache_capacity = 150;
 #endif
  }
  // Using single shard makes the logic of scenarios simple for capacity and
@@ -472,7 +473,7 @@ void LlvmCodeGenCacheTest::TestSwitchModeHelper(TCodeGenCacheMode::type mode, st
  CodeGenCacheKeyConstructor::construct(key, &cache_key);

  // Store and lookup the entry by the key.
-  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode));
+  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode, opt_level_));
  if (expect_entry_num != -1) {
    CheckInUseMetrics(codegen_cache_.get(), expect_entry_num /*num_entry_in_use*/);
  }
@@ -546,7 +547,7 @@ void LlvmCodeGenCacheTest::StoreHelper(TCodeGenCacheMode::type mode, string key)
  CodeGenCacheKey cache_key;
  CodeGenCacheEntry entry;
  CodeGenCacheKeyConstructor::construct(key, &cache_key);
-  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode));
+  EXPECT_OK(codegen_cache_->Store(cache_key, codegen.get(), mode, opt_level_));
  codegen->Close();
 }

--- a/be/src/codegen/llvm-codegen-cache.cc
+++ b/be/src/codegen/llvm-codegen-cache.cc
@@ -133,8 +133,9 @@ Status CodeGenCache::Lookup(const CodeGenCacheKey& cache_key,
    // to look for jitted functions.
    if (LookupEngine(cached_entry->engine_pointer, execution_engine)) {
      entry->Reset(cached_entry->engine_pointer, cached_entry->num_functions,
-          cached_entry->num_instructions, cached_entry->function_names_hashcode,
-          cached_entry->total_bytes_charge);
+          cached_entry->num_instructions, cached_entry->num_opt_functions,
+          cached_entry->num_opt_instructions, cached_entry->function_names_hashcode,
+          cached_entry->total_bytes_charge, cached_entry->opt_level);
      return Status::OK();
    }
  }
@@ -143,7 +144,8 @@ Status CodeGenCache::Lookup(const CodeGenCacheKey& cache_key,
 }

 Status CodeGenCache::StoreInternal(const CodeGenCacheKey& cache_key,
-    LlvmCodeGen* codegen, const TCodeGenCacheMode::type& mode) {
+    LlvmCodeGen* codegen, TCodeGenCacheMode::type mode,
+    TCodeGenOptLevel::type opt_level) {
  // In normal mode, we will store the whole key content to the cache.
  // Otherwise, in optimal mode, we will only store the hash code and length of the key.
  Slice key = cache_key.optimal_key_slice();
@@ -163,7 +165,9 @@ Status CodeGenCache::StoreInternal(const CodeGenCacheKey& cache_key,
  CodeGenCacheEntry* cache_entry =
      reinterpret_cast<CodeGenCacheEntry*>(cache_->MutableValue(&pending_handle));
  cache_entry->Reset(codegen->execution_engine(), codegen->num_functions_->value(),
-      codegen->num_instructions_->value(), codegen->function_names_hashcode_, mem_charge);
+      codegen->num_instructions_->value(), codegen->num_opt_functions_->value(),
+      codegen->num_opt_instructions_->value(), codegen->function_names_hashcode_,
+      mem_charge, opt_level);
  StoreEngine(codegen);
  /// It is thread-safe, but could override the existing entry with the same key.
  Cache::UniqueHandle cache_handle =
@@ -181,7 +185,7 @@ Status CodeGenCache::StoreInternal(const CodeGenCacheKey& cache_key,
 }

 Status CodeGenCache::Store(const CodeGenCacheKey& cache_key, LlvmCodeGen* codegen,
-    const TCodeGenCacheMode::type& mode) {
+    TCodeGenCacheMode::type mode, TCodeGenOptLevel::type opt_level) {
  DCHECK(!is_closed_);
  DCHECK(cache_ != nullptr);
  DCHECK(codegen != nullptr);
@@ -206,7 +210,7 @@ Status CodeGenCache::Store(const CodeGenCacheKey& cache_key, LlvmCodeGen* codege
    if (!key_to_insert_it.second) return status;
  }
  // Do store the cache entry to the cache.
-  status = StoreInternal(cache_key, codegen, mode);
+  status = StoreInternal(cache_key, codegen, mode, opt_level);
  // Remove the hash code of the key from the to_insert_keys set.
  lock_guard<mutex> lock(to_insert_set_lock_);
  keys_to_insert_.erase(key_to_insert_it.first);
--- a/be/src/codegen/llvm-codegen-cache.h
+++ b/be/src/codegen/llvm-codegen-cache.h
@@ -161,25 +161,37 @@ class CodeGenCacheModeAnalyzer {
 };

 struct CodeGenCacheEntry {
-  CodeGenCacheEntry() { Init(); }
+  CodeGenCacheEntry() : engine_pointer(nullptr) {}
+  // When Empty, no guarantees are made about the content of other fields.
  bool Empty() { return engine_pointer == nullptr; }
-  void Init() { memset((uint8_t*)this, 0, sizeof(CodeGenCacheEntry)); }
-  void Reset() { Reset(nullptr, 0, 0, 0, 0); }
+  void Reset() { engine_pointer = nullptr; }
  void Reset(llvm::ExecutionEngine* engine_ptr, int64_t num_funcs, int64_t num_instrucs,
-      uint64_t names_hashcode, int64_t charge) {
+      int64_t num_opt_funcs, int64_t num_opt_instrucs, uint64_t names_hashcode,
+      int64_t charge, TCodeGenOptLevel::type opt) {
    engine_pointer = engine_ptr;
    num_functions = num_funcs;
    num_instructions = num_instrucs;
+    num_opt_functions = num_opt_funcs;
+    num_opt_instructions = num_opt_instrucs;
    function_names_hashcode = names_hashcode;
    total_bytes_charge = charge;
+    opt_level = opt;
  }
  llvm::ExecutionEngine* engine_pointer;
+  /// Number of functions before optimization.
  int64_t num_functions;
+  /// Number of instructions before optimization.
  int64_t num_instructions;
+  /// Number of functions after optimization.
+  int64_t num_opt_functions;
+  /// Number of instructions after optimization.
+  int64_t num_opt_instructions;
  /// The hashcode of function names in the entry.
  uint64_t function_names_hashcode;
  /// Bytes charge including the key and the entry.
  int64_t total_bytes_charge;
+  /// CodeGen optimization level used to generate this entry.
+  TCodeGenOptLevel::type opt_level;
 };

 /// Each CodeGenCache is supposed to be a singleton in the daemon, manages the codegen
@@ -208,7 +220,7 @@ class CodeGenCache {

  /// Store the cache entry with the specific cache key.
  Status Store(const CodeGenCacheKey& key, LlvmCodeGen* codegen,
-      const TCodeGenCacheMode::type& mode);
+      TCodeGenCacheMode::type mode, TCodeGenOptLevel::type opt_level);

  /// Store the shared pointer of llvm execution engine to the cache to keep all the
  /// jitted functions in that engine alive.
@@ -257,7 +269,7 @@ class CodeGenCache {

  /// Helper function to store the entry to the cache.
  Status StoreInternal(const CodeGenCacheKey& key, LlvmCodeGen* codegen,
-      const TCodeGenCacheMode::type& mode);
+      TCodeGenCacheMode::type mode, TCodeGenOptLevel::type opt_level);

  /// Indicate if the cache is closed. If is closed, no call is allowed for any functions
  /// other than ReleaseResources().
--- a/be/src/codegen/llvm-codegen-test.cc
+++ b/be/src/codegen/llvm-codegen-test.cc
@@ -61,7 +61,7 @@ TEST_F(CodegenFnPtrTest, StoreNonVoidPtrAndLoad) {
  ASSERT_TRUE(loaded_fn_ptr == int_identity);
 }

-class LlvmCodeGenTest : public testing:: Test {
+class LlvmCodeGenTest : public testing::Test {
 protected:
  scoped_ptr<TestEnv> test_env_;
  FragmentState* fragment_state_;
@@ -112,10 +112,6 @@ class LlvmCodeGenTest : public testing:: Test {
    return (*codegen)->MaterializeModule();
  }

-  static void ClearHashFns(LlvmCodeGen* codegen) {
-    codegen->ClearHashFns();
-  }
-
  static void AddFunctionToJit(LlvmCodeGen* codegen, llvm::Function* fn,
      CodegenFnPtrBase* fn_ptr) {
    // Bypass Impala-specific logic in AddFunctionToJit() that assumes Impala's struct
@@ -565,6 +561,148 @@ TEST_F(LlvmCodeGenTest, CleanupNonFinalizedMethodsTest) {
  ASSERT_TRUE(ContainsHandcraftedFn(codegen.get(), complete_fn));
  ASSERT_OK(FinalizeModule(codegen.get()));
 }
+
+class LlvmOptTest :
+    public testing::TestWithParam<std::tuple<TCodeGenOptLevel::type, bool>> {
+ protected:
+  scoped_ptr<MetricGroup> metrics_;
+  scoped_ptr<TestEnv> test_env_;
+  FragmentState* fragment_state_;
+  TQueryOptions query_opts_;
+
+  virtual void SetUp() {
+    metrics_.reset(new MetricGroup("codegen-test"));
+    test_env_.reset(new TestEnv());
+    ASSERT_OK(test_env_->Init());
+    SetUpQuery(0, std::get<0>(GetParam()));
+  }
+
+  virtual void TearDown() {
+    fragment_state_->ReleaseResources();
+    fragment_state_ = nullptr;
+    test_env_.reset();
+    metrics_.reset();
+  }
+
+  void SetUpQuery(int64_t query_id, TCodeGenOptLevel::type level) {
+    query_opts_.__set_codegen_opt_level(level);
+    RuntimeState* runtime_state_;
+    ASSERT_OK(test_env_->CreateQueryState(query_id, &query_opts_, &runtime_state_));
+    QueryState* qs = runtime_state_->query_state();
+    TPlanFragment* fragment = qs->obj_pool()->Add(new TPlanFragment());
+    PlanFragmentCtxPB* fragment_ctx = qs->obj_pool()->Add(new PlanFragmentCtxPB());
+    fragment_state_ =
+        qs->obj_pool()->Add(new FragmentState(qs, *fragment, *fragment_ctx));
+  }
+
+  void EnableCodegenCache() {
+    test_env_->ResetCodegenCache(metrics_.get());
+    EXPECT_OK(test_env_->codegen_cache()->Init(1024*1024));
+  }
+
+  // Wrapper to call private test-only methods on LlvmCodeGen object
+  Status CreateFromFile(const string& filename, scoped_ptr<LlvmCodeGen>* codegen) {
+    RETURN_IF_ERROR(LlvmCodeGen::CreateFromFile(fragment_state_,
+        fragment_state_->obj_pool(), nullptr, filename, "test", codegen));
+    return (*codegen)->MaterializeModule();
+  }
+
+  static void AddFunctionToJit(LlvmCodeGen* codegen, llvm::Function* fn,
+      CodegenFnPtrBase* fn_ptr) {
+    // Bypass Impala-specific logic in AddFunctionToJit() that assumes Impala's struct
+    // types are available in the module.
+    return codegen->AddFunctionToJitInternal(fn, fn_ptr);
+  }
+
+  static Status FinalizeModule(LlvmCodeGen* codegen) { return codegen->FinalizeModule(); }
+
+  static void VerifyCounters(LlvmCodeGen* codegen, bool expect_unopt) {
+    ASSERT_GT(codegen->num_functions_->value(), 0);
+    ASSERT_GT(codegen->num_instructions_->value(), 0);
+    if (expect_unopt) {
+      ASSERT_EQ(codegen->num_opt_functions_->value(), codegen->num_functions_->value());
+      ASSERT_EQ(
+          codegen->num_opt_instructions_->value(), codegen->num_instructions_->value());
+    } else {
+      ASSERT_LT(codegen->num_opt_functions_->value(), codegen->num_functions_->value());
+      ASSERT_LT(
+          codegen->num_opt_instructions_->value(), codegen->num_instructions_->value());
+    }
+  }
+
+  void LoadTestOpt(scoped_ptr<LlvmCodeGen>& codegen) {
+    const string func("_Z9loop_nullPii");
+    typedef void (*LoopFn)(int*, int);
+
+    string module_file;
+    PathBuilder::GetFullPath("llvm-ir/test-opt.bc", &module_file);
+
+    ASSERT_OK(CreateFromFile(module_file.c_str(), &codegen));
+    EXPECT_TRUE(codegen.get() != nullptr);
+    codegen->EnableOptimizations(true);
+
+    llvm::Function* fn = codegen->GetFunction(func, false);
+    EXPECT_TRUE(fn != nullptr);
+    EXPECT_EQ(fn->arg_size(), 2);
+    CodegenFnPtr<LoopFn> fn_impl;
+    AddFunctionToJit(codegen.get(), fn, &fn_impl);
+
+    ASSERT_OK(FinalizeModule(codegen.get()));
+    ASSERT_TRUE(fn_impl.load() != nullptr);
+  }
+};
+
+TEST_P(LlvmOptTest, OptFunction) {
+  scoped_ptr<LlvmCodeGen> codegen;
+  LoadTestOpt(codegen);
+  VerifyCounters(codegen.get(), std::get<1>(GetParam()));
+  codegen->Close();
+}
+
+TEST_P(LlvmOptTest, CachedOptFunction) {
+  TCodeGenOptLevel::type opt_level = std::get<0>(GetParam());
+  bool expect_unoptimized = std::get<1>(GetParam());
+  EnableCodegenCache();
+
+  scoped_ptr<LlvmCodeGen> codegen;
+  LoadTestOpt(codegen);
+  VerifyCounters(codegen.get(), expect_unoptimized);
+  codegen->Close();
+
+  int64_t query_id = 0;
+  constexpr std::array<TCodeGenOptLevel::type, 5> opt_levels{{TCodeGenOptLevel::O0,
+      TCodeGenOptLevel::O1, TCodeGenOptLevel::Os, TCodeGenOptLevel::O2,
+      TCodeGenOptLevel::O3}};
+  for (TCodeGenOptLevel::type level : opt_levels) {
+    codegen.reset();
+    SetUpQuery(++query_id, level);
+    LoadTestOpt(codegen);
+    // Higher levels are by definition O1+, which expects optimized size.
+    // Lower or equal levels should use the cached value from before the loop.
+    VerifyCounters(codegen.get(), level > opt_level ? false : expect_unoptimized);
+    codegen->Close();
+  }
+
+  // Check for cache hits/misses. Levels greater than opt_level will be a hit, others
+  // will be misses.
+  int num_less = opt_level - TCodeGenOptLevel::O0;
+  IntCounter* cache_hits =
+      metrics_->FindMetricForTesting<IntCounter>("impala.codegen-cache.hits");
+  EXPECT_NE(cache_hits, nullptr);
+  EXPECT_EQ(cache_hits->GetValue(), 1 + num_less);
+  IntCounter* cache_misses =
+      metrics_->FindMetricForTesting<IntCounter>("impala.codegen-cache.misses");
+  EXPECT_NE(cache_misses, nullptr);
+  EXPECT_EQ(cache_misses->GetValue(), opt_levels.size() - num_less);
+}
+
+INSTANTIATE_TEST_CASE_P(OptLevels, LlvmOptTest, ::testing::Values(
+  //              Optimization level    Expect unoptimized
+  std::make_tuple(TCodeGenOptLevel::O0, true),
+  std::make_tuple(TCodeGenOptLevel::O1, false),
+  std::make_tuple(TCodeGenOptLevel::Os, false),
+  std::make_tuple(TCodeGenOptLevel::O2, false),
+  std::make_tuple(TCodeGenOptLevel::O3, false)));
 }

 int main(int argc, char **argv) {
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -242,6 +242,8 @@ LlvmCodeGen::LlvmCodeGen(FragmentState* state, ObjectPool* pool,
      ASYNC_CODEGEN_THREAD_COUNTERS_PREFIX);
  num_functions_ = ADD_COUNTER(profile_, "NumFunctions", TUnit::UNIT);
  num_instructions_ = ADD_COUNTER(profile_, "NumInstructions", TUnit::UNIT);
+  num_opt_functions_ = ADD_COUNTER(profile_, "NumOptimizedFunctions", TUnit::UNIT);
+  num_opt_instructions_ = ADD_COUNTER(profile_, "NumOptimizedInstructions", TUnit::UNIT);
  num_cached_functions_ = ADD_COUNTER(profile_, "NumCachedFunctions", TUnit::UNIT);
  llvm_thread_counters_ = ADD_THREAD_COUNTERS(profile_, "Codegen");
 }
@@ -1184,14 +1186,24 @@ bool LlvmCodeGen::LookupCache(CodeGenCacheKey& cache_key) {
      return false;
    }

+    if (entry.opt_level < state_->query_options().codegen_opt_level) {
+      // Requested optimization level is higher than cached entry, so treat as a miss.
+      VLOG(2) << "Overwriting codegen cache entry at " << entry.opt_level
+          << " with optimization level " << state_->query_options().codegen_opt_level;
+      cache->IncHitOrMissCount(/*hit*/ false);
+      return false;
+    }
+
    const bool fn_pointers_set_from_cache = SetFunctionPointers(cache, &cache_key);
    if (!fn_pointers_set_from_cache) return false;

    // Because we cache the entire execution engine, the cached number of functions should
-    // be the same as the total function number.
-    COUNTER_SET(num_cached_functions_, entry.num_functions);
+    // be the same as the total optimized function number.
+    COUNTER_SET(num_cached_functions_, entry.num_opt_functions);
    COUNTER_SET(num_functions_, entry.num_functions);
    COUNTER_SET(num_instructions_, entry.num_instructions);
+    COUNTER_SET(num_opt_functions_, entry.num_opt_functions);
+    COUNTER_SET(num_opt_instructions_, entry.num_opt_instructions);
  }
  cache->IncHitOrMissCount(/*hit*/ entry_exists);
  return entry_exists;
@@ -1219,7 +1231,8 @@ void LlvmCodeGen::GenerateFunctionNamesHashCode() {
 Status LlvmCodeGen::StoreCache(CodeGenCacheKey& cache_key) {
  DCHECK(!cache_key.empty());
  Status store_status = ExecEnv::GetInstance()->codegen_cache()->Store(
-      cache_key, this, state_->query_options().codegen_cache_mode);
+      cache_key, this, state_->query_options().codegen_cache_mode,
+      state_->query_options().codegen_opt_level);
  LOG(INFO) << DebugCacheEntryString(cache_key, false /*is_lookup*/,
      CodeGenCacheModeAnalyzer::is_debug(state_->query_options().codegen_cache_mode),
      store_status.ok());
@@ -1313,9 +1326,22 @@ Status LlvmCodeGen::FinalizeModule() {
    if (LookupCache(cache_key)) return Status::OK();
  }

+  // Update counters before final optimization, but after removing unused functions. This
+  // gives us a rough measure of how much work the optimization and compilation must do.
+  // If found in cache, counters will be restored from the cache entry.
+  InstructionCounter counter;
+  counter.visit(*module_);
+  COUNTER_SET(num_functions_, counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS));
+  COUNTER_SET(num_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS));
+
  if (optimizations_enabled_ && !FLAGS_disable_optimization_passes) {
    RETURN_IF_ERROR(OptimizeModule());
+    counter.ResetCount();
+    counter.visit(*module_);
  }
+  COUNTER_SET(num_opt_functions_,
+      counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS));
+  COUNTER_SET(num_opt_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS));

  if (FLAGS_opt_module_dir.size() != 0) {
    string path = FLAGS_opt_module_dir + "/" + id_ + "_opt.ll";
@@ -1404,18 +1430,30 @@ Status LlvmCodeGen::OptimizeModule() {
  pass_builder.registerLoopAnalyses(LAM);
  pass_builder.crossRegisterProxies(LAM, FAM, CGAM, MAM);

-  llvm::ModulePassManager pass_manager = pass_builder.buildPerModuleDefaultPipeline(
-      llvm::PassBuilder::OptimizationLevel::O2);
-
-  // Update counters before final optimization, but after removing unused functions. This
-  // gives us a rough measure of how much work the optimization and compilation must do.
-  InstructionCounter counter;
-  counter.visit(*module_);
-  COUNTER_SET(num_functions_, counter.GetCount(InstructionCounter::TOTAL_FUNCTIONS));
-  COUNTER_SET(num_instructions_, counter.GetCount(InstructionCounter::TOTAL_INSTS));
+  TCodeGenOptLevel::type opt_level = state_->query_options().codegen_opt_level;
+  llvm::PassBuilder::OptimizationLevel opt;
+  // GCC's -Werror=switch errors if a case is not covered.
+  switch (opt_level) {
+    case TCodeGenOptLevel::O0:
+      // Default optimization pipeline requires O1 or greater, so for O0 we skip.
+      return Status::OK();
+    case TCodeGenOptLevel::O1:
+      opt = llvm::PassBuilder::OptimizationLevel::O1;
+      break;
+    case TCodeGenOptLevel::Os:
+      opt = llvm::PassBuilder::OptimizationLevel::Os;
+      break;
+    case TCodeGenOptLevel::O2:
+      opt = llvm::PassBuilder::OptimizationLevel::O2;
+      break;
+    case TCodeGenOptLevel::O3:
+      opt = llvm::PassBuilder::OptimizationLevel::O3;
+      break;
+  }
+  llvm::ModulePassManager pass_manager = pass_builder.buildPerModuleDefaultPipeline(opt);

  int64_t estimated_memory = ESTIMATED_OPTIMIZER_BYTES_PER_INST
-      * counter.GetCount(InstructionCounter::TOTAL_INSTS);
+      * num_instructions_->value();
  if (!mem_tracker_->TryConsume(estimated_memory)) {
    const string& msg = Substitute(
        "Codegen failed to reserve '$0' bytes for optimization", estimated_memory);
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -647,6 +647,7 @@ class LlvmCodeGen {
  friend class LlvmCodeGenTest;
  friend class LlvmCodeGenTest_CpuAttrWhitelist_Test;
  friend class LlvmCodeGenTest_HashTest_Test;
+  friend class LlvmOptTest;
  friend class SubExprElimination;
  friend class CodeGenCache;
  friend class LlvmCodeGenCacheTest;
@@ -877,6 +878,10 @@ class LlvmCodeGen {
  RuntimeProfile::Counter* num_functions_;
  RuntimeProfile::Counter* num_instructions_;

+  /// Number of instructions after optimization.
+  RuntimeProfile::Counter* num_opt_functions_;
+  RuntimeProfile::Counter* num_opt_instructions_;
+
  /// Number of functions that are used and cached.
  RuntimeProfile::Counter* num_cached_functions_;

--- a/be/src/service/query-options.cc
+++ b/be/src/service/query-options.cc
@@ -1156,6 +1156,13 @@ Status impala::SetQueryOption(const string& key, const string& value,
        query_options->__set_hdfs_scanner_non_reserved_bytes(mem_spec_val.value);
        break;
      };
+      case TImpalaQueryOptions::CODEGEN_OPT_LEVEL: {
+        TCodeGenOptLevel::type enum_type;
+        RETURN_IF_ERROR(GetThriftEnum(
+            value, "CodeGen Opt Level", _TCodeGenOptLevel_VALUES_TO_NAMES, &enum_type));
+        query_options->__set_codegen_opt_level(enum_type);
+        break;
+      }
      default:
        if (IsRemovedQueryOption(key)) {
          LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'";
--- a/be/src/service/query-options.h
+++ b/be/src/service/query-options.h
@@ -50,7 +50,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
 // time we add or remove a query option to/from the enum TImpalaQueryOptions.
 #define QUERY_OPTS_TABLE                                                                 \
  DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),                                 \
-      TImpalaQueryOptions::HDFS_SCANNER_NON_RESERVED_BYTES + 1);                         \
+      TImpalaQueryOptions::CODEGEN_OPT_LEVEL + 1);                                       \
  REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED) \
  QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)               \
  REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)             \
@@ -310,6 +310,7 @@ typedef std::unordered_map<string, beeswax::TQueryOptionLevel::type>
      ICEBERG_PREDICATE_PUSHDOWN_SUBSETTING, TQueryOptionLevel::DEVELOPMENT)             \
  QUERY_OPT_FN(hdfs_scanner_non_reserved_bytes, HDFS_SCANNER_NON_RESERVED_BYTES,         \
      TQueryOptionLevel::ADVANCED)                                                       \
+  QUERY_OPT_FN(codegen_opt_level, CODEGEN_OPT_LEVEL, TQueryOptionLevel::ADVANCED)        \
  ;

 /// Enforce practical limits on some query options to avoid undesired query state.
--- a/common/thrift/ImpalaService.thrift
+++ b/common/thrift/ImpalaService.thrift
@@ -854,6 +854,10 @@ enum TImpalaQueryOptions {
  // value, the value of flag hdfs_scanner_thread_max_estimated_bytes will be used
  // (which defaults to 32MB). The default value of this option is -1 (not set).
  HDFS_SCANNER_NON_RESERVED_BYTES = 166
+
+  // Select codegen optimization level from O0, O1, Os, O2, or O3. Higher levels will
+  // overwrite existing codegen cache entries.
+  CODEGEN_OPT_LEVEL = 167
 }

 // The summary of a DML statement.
--- a/common/thrift/Query.thrift
+++ b/common/thrift/Query.thrift
@@ -108,6 +108,14 @@ enum TParquetBloomFilterWrite {
  ALWAYS
 }

+enum TCodeGenOptLevel {
+  O0,
+  O1,
+  Os,
+  O2,
+  O3
+}
+
 // constants for TQueryOptions.num_nodes
 const i32 NUM_NODES_ALL = 0
 const i32 NUM_NODES_ALL_RACKS = -1
@@ -666,6 +674,9 @@ struct TQueryOptions {

  // See comment in ImpalaService.thrift
  167: optional i64 hdfs_scanner_non_reserved_bytes = -1
+
+  // See comment in ImpalaService.thrift
+  168: optional TCodeGenOptLevel codegen_opt_level = TCodeGenOptLevel.O2
 }

 // Impala currently has three types of sessions: Beeswax, HiveServer2 and external
--- a/testdata/llvm/test-opt.cc
+++ b/testdata/llvm/test-opt.cc
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This is a test-only file.  It was used to generate test-opt.bc
+// which is used by the unit test to test optimization passes.
+
+static const int m = 1;
+
+static int transform(int d) {
+  return d + m;
+}
+
+int loop_null(int* data, int num) {
+  int res = 0;
+  for (int i = 0; i < num; i++) {
+    // Expected to be optimized out.
+    if (i % 2 == 0) {
+      res += transform(data[i]);
+    }
+    if (i % 2 == 0) {
+      res -= transform(data[i]);
+    }
+  }
+  return res;
+}