Cross-compiled UDF builtins.

When codegen is enabled, UDF builtins are loaded from the IR module rather than using the native functions. Since we cannot yet run UDFs without codegen, this means UDF builtins can currently only be run this way; once we add support for running UDFs without codegen, this will allow us to switch back to the native functions for development/debugging.

Change-Id: I948b113c61603801b84f80982384bbc07596f119
Reviewed-on: http://gerrit.ent.cloudera.com:8080/605
Tested-by: jenkins
Reviewed-by: Nong Li <nong@cloudera.com>
2025-12-25 02:03:09 -05:00 · 2013-10-04 12:08:49 -07:00
parent bf139d1eba
commit 656ae8b1c8
7 changed files with 39 additions and 10 deletions
--- a/be/src/codegen/CMakeLists.txt
+++ b/be/src/codegen/CMakeLists.txt
@@ -51,6 +51,7 @@ set(IR_DEPENDENT_FILES
  ../exec/hash-join-node-ir.cc
  ../exec/hdfs-scanner-ir.cc
  ../exprs/expr-ir.cc
+  ../exprs/udf-builtins.cc
  ../runtime/string-value-ir.cc
  ../util/hash-util-ir.cc
 )
--- a/be/src/codegen/impala-ir.cc
+++ b/be/src/codegen/impala-ir.cc
@@ -23,6 +23,7 @@
 #include "exec/hdfs-avro-scanner-ir.cc"
 #include "exec/hdfs-scanner-ir.cc"
 #include "exprs/expr-ir.cc"
+#include "exprs/udf-builtins.cc"
 #include "runtime/string-value-ir.cc"
 #include "util/hash-util-ir.cc"
 #else
--- a/be/src/exprs/native-udf-expr.cc
+++ b/be/src/exprs/native-udf-expr.cc
@@ -248,9 +248,10 @@ Status NativeUdfExpr::GetIrComputeFn(RuntimeState* state, llvm::Function** fn) {

 Status NativeUdfExpr::GetUdf(RuntimeState* state, llvm::Function** udf) {
  LlvmCodeGen* codegen = state->llvm_codegen();
+  bool codegen_disabled = (codegen == NULL);

  if (udf_type_ == TFunctionBinaryType::NATIVE ||
-      udf_type_ == TFunctionBinaryType::BUILTIN) {
+      (udf_type_ == TFunctionBinaryType::BUILTIN && codegen_disabled)) {
    void* udf_ptr;
    if (udf_type_ == TFunctionBinaryType::NATIVE) {
      RETURN_IF_ERROR(state->lib_cache()->GetFunctionPtr(
@@ -290,6 +291,14 @@ Status NativeUdfExpr::GetUdf(RuntimeState* state, llvm::Function** udf) {
    // defined. This tells LLVM where the compiled function definition is located in
    // memory.
    codegen->execution_engine()->addGlobalMapping(*udf, udf_ptr);
+  } else if (udf_type_ == TFunctionBinaryType::BUILTIN && !codegen_disabled) {
+    const string& symbol = OpcodeRegistry::Instance()->GetFunctionSymbol(opcode_);
+    *udf = codegen->module()->getFunction(symbol);
+    if (*udf == NULL) {
+      stringstream ss;
+      ss << "Could not load builtin " << opcode_ << " with symbol: " << symbol;
+      return Status(ss.str());
+    }
  } else {
    DCHECK_EQ(udf_type_, TFunctionBinaryType::IR);

--- a/be/src/exprs/native-udf-expr.h
+++ b/be/src/exprs/native-udf-expr.h
@@ -21,7 +21,7 @@
 #include "udf/udf.h"

 namespace impala_udf {
-  class AnyVal;
+  struct AnyVal;
 };

 namespace impala {
--- a/be/src/exprs/opcode-registry.h
+++ b/be/src/exprs/opcode-registry.h
@@ -38,6 +38,14 @@ class OpcodeRegistry {
    return functions_[index];
  }

+  // Returns the symbol stored by Add() for 'opcode' (used to look up IR functions).
+  const std::string& GetFunctionSymbol(TExprOpcode::type opcode) {
+    int index = static_cast<int>(opcode);
+    DCHECK_GE(index, 0);
+    DCHECK_LT(index, symbols_.size());
+    return symbols_[index];
+  }
+
  // Registry is a singleton
  static OpcodeRegistry* Instance() {
    if (instance_ == NULL) {
@@ -56,6 +64,7 @@ class OpcodeRegistry {
  OpcodeRegistry() {
    int num_opcodes = static_cast<int>(TExprOpcode::LAST_OPCODE);
    functions_.resize(num_opcodes);
+    symbols_.resize(num_opcodes);
    Init();
  }

@@ -63,16 +72,18 @@ class OpcodeRegistry {
  // opcode-registry-init.cc which is an auto-generated file
  void Init();

-  void Add(TExprOpcode::type opcode, void* fn) {
+  void Add(TExprOpcode::type opcode, void* fn, const char* symbol) {
    int index = static_cast<int>(opcode);
    DCHECK_LT(index, functions_.size());
    DCHECK_GE(index, 0);
    functions_[index] = fn;
+    symbols_[index] = symbol;
  }

  static OpcodeRegistry* instance_;
  static boost::mutex instance_lock_;
  std::vector<void*> functions_;
+  std::vector<std::string> symbols_;
 };

 }
--- a/common/function-registry/gen_opcodes.py
+++ b/common/function-registry/gen_opcodes.py
@@ -227,6 +227,7 @@ def add_function(fn_meta_data, udf_interface):
  entry["args"] = fn_meta_data[2]
  entry["be_fn"] = fn_meta_data[3]
  entry["sql_names"] = fn_meta_data[4]
+  entry["symbol"] = fn_meta_data[5] if udf_interface else "<no symbol specified>"
  entry["udf_interface"] = udf_interface

  if fn_name in meta_data_entries:
@@ -272,8 +273,10 @@ def generate_be_registry_init(filename):
    for entry in entries:
      opcode = entry["opcode"]
      be_fn = entry["be_fn"]
+      symbol = entry["symbol"]
      # We generate two casts to work around GCC Bug 11407
-      cc_output = "TExprOpcode::%s, (void*)(Expr::ComputeFn)%s" % (opcode, be_fn)
+      cc_output = 'TExprOpcode::%s, (void*)(Expr::ComputeFn)%s, "%s"' \
+                  % (opcode, be_fn, symbol)
      cc_registry_file.write("  this->Add(%s);\n" % (cc_output))

  cc_registry_file.write(cc_registry_epilogue)
@@ -334,11 +337,12 @@ for function in impala_functions.functions:
    print "Invalid function entry in impala_functions.py:\n\t" + repr(function)
    sys.exit(1)
  add_function(function, False)
+
 for function in impala_functions.udf_functions:
-  if len(function) != 5:
-    print "Invalid function entry in impala_functions.py:\n\t" + repr(function)
-    sys.exit(1)
+  assert len(function) == 6, \
+         "Invalid function entry in impala_functions.py:\n\t" + repr(function)
  add_function(function, True)
+
 for function in generated_functions.functions:
  if len(function) != 5:
    print "Invalid function entry in generated_functions.py:\n\t" + repr(function)
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -366,7 +366,10 @@ functions = [
 # These functions are implemented against the UDF interface.
 # TODO: this list should subsume the one above when all builtins are migrated.
 udf_functions = [
-  ['Udf_Math_Pi', 'DOUBLE', [], 'UdfBuiltins::Pi', ['udf_pi']],
-  ['Udf_Math_Abs', 'DOUBLE', ['DOUBLE'], 'UdfBuiltins::Abs', ['udf_abs']],
-  ['Udf_String_Lower', 'STRING', ['STRING'], 'UdfBuiltins::Lower', ['udf_lower']],
+  ['Udf_Math_Pi', 'DOUBLE', [], 'UdfBuiltins::Pi', ['udf_pi'],
+   '_ZN6impala11UdfBuiltins2PiEPN10impala_udf15FunctionContextE'],
+  ['Udf_Math_Abs', 'DOUBLE', ['DOUBLE'], 'UdfBuiltins::Abs', ['udf_abs'],
+   '_ZN6impala11UdfBuiltins3AbsEPN10impala_udf15FunctionContextERKNS1_9DoubleValE'],
+  ['Udf_String_Lower', 'STRING', ['STRING'], 'UdfBuiltins::Lower', ['udf_lower'],
+   '_ZN6impala11UdfBuiltins5LowerEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
 ]