impala/common/function-registry/gen_operators.py

#!/usr/bin/env python
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from string import Template
import os

# This script will generate the implementation of the simple operators for the BE.
# These include:
#   - Arithmetic functions
#   - Binary functions
#   - Cast functions
#
# The script outputs (run: 'impala/common/function-registry/gen_operators.py')
#   - header and implementation for the above functions:
#     - impala/be/generated-sources/opcode/functions.[h/cc]

# C++ body for a unary operator.  The generated function evaluates its single
# child, returns NULL for a NULL operand and otherwise stores
# '${native_op} *val' in the result field and returns that field's address.
unary_op = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  e->result_.${result_field} = ${native_op} *val;\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for an infix binary operator.  Both children are evaluated; if either
# value is NULL the result is NULL, otherwise '(*val1 ${native_op} *val2)' is
# stored in the result field.
binary_op = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op1 = e->children()[0];\n"
  "  ${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->GetValue(row));\n"
  "  Expr* op2 = e->children()[1];\n"
  "  ${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->GetValue(row));\n"
  "  if (val1 == NULL || val2 == NULL) return NULL;\n"
  "  e->result_.${result_field} = (*val1 ${native_op} *val2);\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# Same as binary_op, except that the generated function also returns NULL when
# the second operand is zero (used for integer division and modulo).
binary_op_check_zero = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op1 = e->children()[0];\n"
  "  ${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->GetValue(row));\n"
  "  Expr* op2 = e->children()[1];\n"
  "  ${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->GetValue(row));\n"
  "  if (val1 == NULL || val2 == NULL || *val2 == 0) return NULL;\n"
  "  e->result_.${result_field} = (*val1 ${native_op} *val2);\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for a binary operation implemented as a method of the left operand:
# the result is 'val1->${native_func}(*val2)', or NULL if either operand is NULL.
binary_func = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op1 = e->children()[0];\n"
  "  ${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->GetValue(row));\n"
  "  Expr* op2 = e->children()[1];\n"
  "  ${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->GetValue(row));\n"
  "  if (val1 == NULL || val2 == NULL) return NULL;\n"
  "  e->result_.${result_field} = val1->${native_func}(*val2);\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for a cast done by plain assignment: the child's value is assigned
# to the result field, leaving the conversion to C++.  NULL input gives NULL.
cast = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  e->result_.${result_field} = *val;\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for casting a string to an integer type via StringParser::StringToInt.
# The generated function returns NULL for NULL input or when parsing does not
# report PARSE_SUCCESS.  The assignment is emitted as a single C++ line; its
# literal is only split here to keep the source narrow.
string_to_int = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  StringParser::ParseResult result;\n"
  "  e->result_.${result_field} = "
  "      StringParser::StringToInt<${native_type2}>(val->ptr, val->len, &result);\n"
  "  if (UNLIKELY(result != StringParser::PARSE_SUCCESS)) return NULL;\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for casting a string to a floating point type via
# StringParser::StringToFloat.  The generated function returns NULL for NULL
# input or when parsing does not report PARSE_SUCCESS.  The assignment is
# emitted as a single C++ line; its literal is only split here for width.
string_to_float = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  StringParser::ParseResult result;\n"
  "  e->result_.${result_field} = "
  "      StringParser::StringToFloat<${native_type2}>(val->ptr, val->len, &result);\n"
  "  if (UNLIKELY(result != StringParser::PARSE_SUCCESS)) return NULL;\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for casting a string to a timestamp: the result field is assigned a
# TimestampValue constructed from the string's pointer and length.
string_to_timestamp = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  e->result_.${result_field} = TimestampValue(val->ptr, val->len);\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for casting a value to a string: the value is formatted with
# lexical_cast<string> and handed to result_.SetStringVal.  NULL gives NULL.
numeric_to_string = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  e->result_.SetStringVal(lexical_cast<string>(*val));\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# Tinyint needs a special case: boost thinks it is a char and handles it
# differently (e.g. '0' is written as an empty string).  The generated function
# therefore parses the string as int16_t, narrows the value to int8_t and
# returns NULL when lexical_cast throws bad_lexical_cast.
string_to_tinyint = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  string tmp(val->ptr, val->len);\n"
  "  try {\n"
  "    e->result_.${result_field} = static_cast<int8_t>(lexical_cast<int16_t>(tmp));\n"
  "  } catch (bad_lexical_cast &) {\n"
  "    return NULL;\n"
  "  }\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for casting a tinyint to a string.  The value is first copied into
# an int64_t and that wider integer is what lexical_cast<string> formats.
tinyint_to_string = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  int64_t tmp_val = *val;\n"
  "  e->result_.SetStringVal(lexical_cast<string>(tmp_val));\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")


# Float/double to string needs a special case for nan, because
# lexical_cast<string>(nan) returns "-nan", which is nonsensical.  The generated
# function emits the literal "nan" for nan and uses lexical_cast otherwise.
float_types_to_string = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  Expr* op = e->children()[0];\n"
  "  ${native_type1}* val = reinterpret_cast<${native_type1}*>(op->GetValue(row));\n"
  "  if (val == NULL) return NULL;\n"
  "  if (isnan(*val)) {\n"
  "    e->result_.SetStringVal(string(\"nan\"));\n"
  "  } else {\n"
  "    e->result_.SetStringVal(lexical_cast<string>(*val));\n"
  "  }\n"
  "  return &e->result_.${result_field};\n"
  "}\n\n")

# C++ body for a CASE expression that has a case operand.  Child 0 is the case
# value, followed by (when, then) pairs and an optional trailing else child.
# The generated function returns the 'then' child's value for the first
# non-NULL 'when' that equals the case value; if the case value is NULL or
# nothing matches it returns the else child's value, or NULL without an else.
case = Template(
  "void* ComputeFunctions::${fn_signature}(Expr* e, TupleRow* row) {\n"
  "  CaseExpr* expr = static_cast<CaseExpr*>(e);\n"
  "  int num_children = e->GetNumChildren();\n"
  "  int loop_end = (expr->has_else_expr()) ? num_children - 1 : num_children;\n"
  "  // Make sure we set the right compute function.\n"
  "  DCHECK_EQ(expr->has_case_expr(), true);\n"
  "  // Need at least case, when and then expr, and optionally an else.\n"
  "  DCHECK_GE(num_children, (expr->has_else_expr()) ? 4 : 3);\n"
  "  // All case and when exprs return the same type (we guaranteed that during analysis).\n"
  "  void* case_val = e->children()[0]->GetValue(row);\n"
  "  if (case_val == NULL) {\n"
  "    if (expr->has_else_expr()) {\n"
  "      // Return else value.\n"
  "      return e->children()[num_children - 1]->GetValue(row);\n"
  "    } else {\n"
  "      return NULL;\n"
  "    }\n"
  "  }\n"
  "  for (int i = 1; i < loop_end; i += 2) {\n"
  "    ${native_type1}* when_val =\n"
  "        reinterpret_cast<${native_type1}*>(e->children()[i]->GetValue(row));\n"
  "    if (when_val == NULL) continue;\n"
  "    if (*reinterpret_cast<${native_type1}*>(case_val) == *when_val) {\n"
  "      // Return then value.\n"
  "      return e->children()[i + 1]->GetValue(row);\n"
  "    }\n"
  "  }\n"
  "  if (expr->has_else_expr()) {\n"
  "    // Return else value.\n"
  "    return e->children()[num_children - 1]->GetValue(row);\n"
  "  }\n"
  "  return NULL;\n"
  "}\n\n")

# Default template for each operation name.  An entry of 'functions' that names
# a template explicitly (as its fourth element) bypasses this mapping.
templates = dict.fromkeys(
  ['Add', 'Subtract', 'Multiply', 'Divide',
   'Bitand', 'Bitxor', 'Bitor',
   'Eq', 'Ne', 'Ge', 'Gt', 'Lt', 'Le'],
  binary_op)
# Integer division and modulo must not divide by zero.
templates.update(dict.fromkeys(['Int_divide', 'Mod'], binary_op_check_zero))
templates['Bitnot'] = unary_op
templates['Cast'] = cast

# Type groups usable in the 'functions' table.  Every concrete type maps to a
# one-element list holding itself; the aggregate names below expand to several
# concrete types, in the order listed.
types = dict((name, [name]) for name in (
  'BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT',
  'FLOAT', 'DOUBLE', 'STRING', 'TIMESTAMP'))
types['INT_TYPES'] = ['TINYINT', 'SMALLINT', 'INT', 'BIGINT']
types['FLOAT_TYPES'] = ['FLOAT', 'DOUBLE']
# List concatenation creates fresh lists, so no two entries share a list object.
types['NUMERIC_TYPES'] = types['INT_TYPES'] + types['FLOAT_TYPES']
types['NATIVE_TYPES'] = ['BOOLEAN'] + types['NUMERIC_TYPES']
types['STRCAST_TYPES'] = ['BOOLEAN', 'SMALLINT', 'INT', 'BIGINT']
types['ALL_TYPES'] = types['NATIVE_TYPES'] + ['STRING', 'TIMESTAMP']
types['MAX_TYPES'] = ['BIGINT', 'DOUBLE']

# Table of the functions to generate.  Each entry has the form
#   Operation, [ReturnType], [[Args1], [Args2], ... [ArgsN]] (, template)
# Type names are keys of 'types'.  A name that expands to several concrete types
# yields one generated function per concrete type: the i-th function uses the
# i-th type of every expanded list, while a list that expands to a single type
# is shared by all of them.
# If the optional fourth element is present it is used as the template;
# otherwise the template is looked up in 'templates' by operation name.
# Functions are emitted in the order in which they are listed here.
functions = [
  # Arithmetic Expr
  ['Add', ['NUMERIC_TYPES'], [['NUMERIC_TYPES'], ['NUMERIC_TYPES']] ],
  ['Subtract', ['NUMERIC_TYPES'], [['NUMERIC_TYPES'], ['NUMERIC_TYPES']] ],
  ['Multiply', ['NUMERIC_TYPES'], [['NUMERIC_TYPES'], ['NUMERIC_TYPES']] ],
  ['Divide', ['DOUBLE'], [['DOUBLE'], ['DOUBLE']] ],
  ['Int_divide', ['INT_TYPES'], [['INT_TYPES'], ['INT_TYPES']] ],
  ['Mod', ['INT_TYPES'], [['INT_TYPES'], ['INT_TYPES']] ],
  ['Bitand', ['INT_TYPES'], [['INT_TYPES'], ['INT_TYPES']] ],
  ['Bitxor', ['INT_TYPES'], [['INT_TYPES'], ['INT_TYPES']] ],
  ['Bitor', ['INT_TYPES'], [['INT_TYPES'], ['INT_TYPES']] ],
  ['Bitnot', ['INT_TYPES'], [['INT_TYPES']] ],

  # BinaryPredicates
  # Native types use the infix operator template, strings call a comparison
  # method through binary_func.
  ['Eq', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Ne', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Gt', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Lt', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Ge', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Le', ['BOOLEAN'], [['NATIVE_TYPES'], ['NATIVE_TYPES']] ],
  ['Eq', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Ne', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Gt', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Lt', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Ge', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Le', ['BOOLEAN'], [['STRING'], ['STRING']], binary_func ],
  ['Eq', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],
  ['Ne', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],
  ['Gt', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],
  ['Lt', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],
  ['Ge', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],
  ['Le', ['BOOLEAN'], [['TIMESTAMP'], ['TIMESTAMP']], ],

  # Casts
  # The first argument list is the source type and the second is the target
  # type; both become part of the generated function name.
  ['Cast', ['BOOLEAN'], [['NATIVE_TYPES'], ['BOOLEAN']] ],
  ['Cast', ['TINYINT'], [['NATIVE_TYPES'], ['TINYINT']] ],
  ['Cast', ['SMALLINT'], [['NATIVE_TYPES'], ['SMALLINT']] ],
  ['Cast', ['INT'], [['NATIVE_TYPES'], ['INT']] ],
  ['Cast', ['BIGINT'], [['NATIVE_TYPES'], ['BIGINT']] ],
  ['Cast', ['FLOAT'], [['NATIVE_TYPES'], ['FLOAT']] ],
  ['Cast', ['DOUBLE'], [['NATIVE_TYPES'], ['DOUBLE']] ],
  ['Cast', ['INT_TYPES'], [['STRING'], ['INT_TYPES']], string_to_int ],
  ['Cast', ['FLOAT_TYPES'], [['STRING'], ['FLOAT_TYPES']], string_to_float ],
  ['Cast', ['STRING'], [['STRCAST_TYPES'], ['STRING']], numeric_to_string ],
  ['Cast', ['STRING'], [['TINYINT'], ['STRING']], tinyint_to_string ],
  ['Cast', ['STRING'], [['FLOAT_TYPES'], ['STRING']], float_types_to_string ],
  ['Cast', ['NATIVE_TYPES'], [['TIMESTAMP'], ['NATIVE_TYPES']]],
  ['Cast', ['STRING'], [['TIMESTAMP'], ['STRING']], numeric_to_string ],
  ['Cast', ['TIMESTAMP'], [['STRING'], ['TIMESTAMP']], string_to_timestamp],
  ['Cast', ['TIMESTAMP'], [['NATIVE_TYPES'], ['TIMESTAMP']], ],

  # Case
  # The case expr is special because it has a variable number of function args,
  # but we guarantee that all of them are of the same type during query analysis,
  # so we just list exactly one here.
  # In addition, the return type given here is a dummy, because it is
  # not necessarily the same as the function args type.
  ['Case', ['ALL_TYPES'], [['ALL_TYPES']], case],
]

# Short C++ type name per column type.  In this script it is only used to build
# the suffix of the generated function name (e.g. Add_int_int).
native_types = dict(
  BOOLEAN='bool',
  TINYINT='char',
  SMALLINT='short',
  INT='int',
  BIGINT='long',
  FLOAT='float',
  DOUBLE='double',
  STRING='StringValue',
  TIMESTAMP='TimestampValue',
)

# Portable (fixed-width) C++ type per column type; this is the type the
# generated function bodies cast their operands to.
implemented_types = dict(
  BOOLEAN='bool',
  TINYINT='int8_t',
  SMALLINT='int16_t',
  INT='int32_t',
  BIGINT='int64_t',
  FLOAT='float',
  DOUBLE='double',
  STRING='StringValue',
  TIMESTAMP='TimestampValue',
)
# Name of the e->result_ member that holds a value of each column type.
result_fields = dict(
  BOOLEAN='bool_val',
  TINYINT='tinyint_val',
  SMALLINT='smallint_val',
  INT='int_val',
  BIGINT='bigint_val',
  FLOAT='float_val',
  DOUBLE='double_val',
  STRING='string_val',
  TIMESTAMP='timestamp_val',
)

# C++ operator token for each operation, keyed by the upper-cased operation name.
native_ops = dict(
  ADD='+',
  SUBTRACT='-',
  MULTIPLY='*',
  DIVIDE='/',
  INT_DIVIDE='/',
  MOD='%',
  BITAND='&',
  BITOR='|',
  BITXOR='^',
  BITNOT='~',
  EQ='==',
  NE='!=',
  LT='<',
  LE='<=',
  GT='>',
  GE='>=',
)

# Comparison method that the binary_func template calls on its left operand,
# keyed by the upper-cased operation name.
native_funcs = dict(
  (method.upper(), method)
  for method in ('Eq', 'Le', 'Lt', 'Ne', 'Ge', 'Gt'))

# Text written at the top of the generated functions.cc: license header,
# includes, using-directives and the opening of namespace impala.
cc_preamble = (
  '// Copyright 2012 Cloudera Inc.\n'
  '//\n'
  '// Licensed under the Apache License, Version 2.0 (the "License");\n'
  '// you may not use this file except in compliance with the License.\n'
  '// You may obtain a copy of the License at\n'
  '//\n'
  '// http://www.apache.org/licenses/LICENSE-2.0\n'
  '//\n'
  '// Unless required by applicable law or agreed to in writing, software\n'
  '// distributed under the License is distributed on an "AS IS" BASIS,\n'
  '// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n'
  '// See the License for the specific language governing permissions and\n'
  '// limitations under the License.\n'
  '\n'
  '// This is a generated file, DO NOT EDIT.\n'
  '// To add new functions, see impala/common/function-registry/gen_opcodes.py\n'
  '\n'
  '#include "opcode/functions.h"\n'
  '#include "exprs/expr.h"\n'
  '#include "exprs/case-expr.h"\n'
  '#include "runtime/string-value.inline.h"\n'
  '#include "runtime/tuple-row.h"\n'
  '#include "util/string-parser.h"\n'
  '#include <boost/lexical_cast.hpp>\n'
  '\n'
  'using namespace boost;\n'
  'using namespace std;\n'
  '\n'
  'namespace impala { \n'
  '\n')

# Text written at the end of the generated functions.cc: closes namespace impala.
cc_epilogue = '}\n'

# Text written at the top of the generated functions.h: license header, include
# guard, forward declarations and the opening of class ComputeFunctions.
h_preamble = (
  '// Copyright 2012 Cloudera Inc.\n'
  '//\n'
  '// Licensed under the Apache License, Version 2.0 (the "License");\n'
  '// you may not use this file except in compliance with the License.\n'
  '// You may obtain a copy of the License at\n'
  '//\n'
  '// http://www.apache.org/licenses/LICENSE-2.0\n'
  '//\n'
  '// Unless required by applicable law or agreed to in writing, software\n'
  '// distributed under the License is distributed on an "AS IS" BASIS,\n'
  '// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n'
  '// See the License for the specific language governing permissions and\n'
  '// limitations under the License.\n'
  '\n'
  '// This is a generated file, DO NOT EDIT IT.\n'
  '// To add new functions, see impala/common/function-registry/gen_opcodes.py\n'
  '\n'
  '#ifndef IMPALA_OPCODE_FUNCTIONS_H\n'
  '#define IMPALA_OPCODE_FUNCTIONS_H\n'
  '\n'
  'namespace impala {\n'
  'class Expr;\n'
  'class OpcodeRegistry;\n'
  'class TupleRow;\n'
  '\n'
  'class ComputeFunctions {\n'
  ' public:\n')

# Text written at the end of the generated functions.h: closes the class, the
# namespace and the include guard.
h_epilogue = (
  '};\n'
  '\n'
  '}\n'
  '\n'
  '#endif\n')

# Declaration line emitted into the ComputeFunctions class for every generated
# function.
header_template = Template("  static void* ${fn_signature}(Expr* e, TupleRow* row);\n")

# Directory that receives the generated functions.h / functions.cc.  The path is
# relative, so it is resolved against the current working directory.
BE_PATH = "../../be/generated-sources/opcode/"
# Create the directory (including missing parents) as soon as this module is
# loaded, so the output files can be opened later.
if not os.path.exists(BE_PATH):
  os.makedirs(BE_PATH)

# Build the substitution dictionary for one concrete function.
#   op          - operation name, e.g. 'Add'
#   return_type - a single concrete type name (key of result_fields)
#   arg_types   - list of concrete type names, one per argument
# The returned dictionary holds every placeholder used by the templates in this
# script: fn_name, fn_signature (op followed by '_<native type>' per argument),
# return_type, result_field, args (each argument quoted and followed by ', '),
# native_type1..N, and native_op when the operation has an entry in native_ops.
def initialize_sub(op, return_type, arg_types):
  sub = {
    "fn_name": op,
    "return_type": return_type,
    "result_field": result_fields[return_type],
  }
  op_key = op.upper()
  if op_key in native_ops:
    sub["native_op"] = native_ops[op_key]

  signature_parts = [op]
  quoted_args = []
  for position, arg in enumerate(arg_types):
    signature_parts.append(native_types[arg])
    # Placeholders are numbered from 1: native_type1, native_type2, ...
    sub["native_type%d" % (position + 1)] = implemented_types[arg]
    quoted_args.append("'" + arg + "', ")

  sub["fn_signature"] = "_".join(signature_parts)
  sub["args"] = "".join(quoted_args)
  return sub

# Script entry point: expands every entry of 'functions' into its concrete
# signatures and writes the declarations to functions.h and the definitions to
# functions.cc under BE_PATH.  Exits with status 1 on an invalid declaration.
if __name__ == "__main__":
  # Imported here because the error path below needs sys.exit and the module
  # header does not import sys.
  import sys

  # The generated text is collected in memory and written at the very end, so
  # an invalid declaration aborts before existing output files are truncated.
  h_chunks = [h_preamble]
  cc_chunks = [cc_preamble]

  # Generate functions and headers
  for func_data in functions:
    op = func_data[0]
    # If a specific template has been specified, use that one.
    if len(func_data) >= 4:
      template = func_data[3]
    elif op in templates:
      template = templates[op]
    else:
      # Skip functions with no template (shouldn't be auto-generated)
      continue

    # Expand aggregate type names into flat lists of concrete types.
    return_types = [t for ret in func_data[1] for t in types[ret]]
    signatures = [[t for arg in args for t in types[arg]]
                  for args in func_data[2]]

    # The longest expanded list determines how many functions this entry yields.
    num_functions = max([len(return_types)] +
                        [len(expanded) for expanded in signatures])

    # Validate the input: every expanded list must hold either a single type
    # (shared by all generated functions) or one type per generated function.
    for expanded in [return_types] + signatures:
      if len(expanded) != 1 and len(expanded) != num_functions:
        # func_data is a list, so it has to be converted before concatenation.
        print("Invalid Declaration: " + str(func_data))
        sys.exit(1)

    # Iterate over every function signature to generate
    for i in range(num_functions):
      if len(return_types) == 1:
        return_type = return_types[0]
      else:
        return_type = return_types[i]

      arg_types = []
      for expanded in signatures:
        if len(expanded) == 1:
          arg_types.append(expanded[0])
        else:
          arg_types.append(expanded[i])

      # At this point, 'return_type' is a single type and 'arg_types'
      # is a list of single types
      sub = initialize_sub(op, return_type, arg_types)
      if template is binary_func:
        # binary_func calls a method on the left operand instead of an operator.
        sub["native_func"] = native_funcs[op.upper()]

      h_chunks.append(header_template.substitute(sub))
      cc_chunks.append(template.substitute(sub))

  h_chunks.append(h_epilogue)
  cc_chunks.append(cc_epilogue)

  # Write the header first, then the implementation; each file is closed even
  # if the write fails.
  for file_name, chunks in (('functions.h', h_chunks), ('functions.cc', cc_chunks)):
    out_file = open(BE_PATH + file_name, 'w')
    try:
      out_file.write(''.join(chunks))
    finally:
      out_file.close()