impala/tests/comparison/funcs.py

# Copyright (c) 2014 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from copy import deepcopy
from itertools import ifilter

from tests.comparison.common import ValExpr
from tests.comparison.types import (
    Boolean,
    Char,
    DataType,
    Decimal,
    Float,
    Int,
    Number,
    Timestamp,
    TYPES)

AGG_FUNCS = list()   # All aggregate functions will be added
ANALYTIC_FUNCS = list()   # All analytic functions will be added
FUNCS = list()   # All non-aggregate/analytic functions will be added

class Arg(object):
  '''Represents an argument in a function signature.

     data_type may be either a DataType or a list of DataTypes. A list is used to
     represent a subquery.

     If can_be_null is False, a NULL value should never be passed into the function
     during execution. This is used to maintain consistency across databases. For example
     if Impala and Postgresql both implement function foo but the results differ when
     the args to foo are NULL, then this flag can be used to prevent NULL values.

     If can_be_null_literal is False, the literal value NULL should never be an argument
     to the function. This is provided to workaround problems involving function signature
     resolution during execution. An alternative would be to CAST(NULL AS INT).

     determines_signature is used to signify that this arg is used to determine the
     signature during execution. This implies that the function has multiple signatures
     with the same number of arguments and at least one of the "determines_signature"
     arguments must be non-NULL in order to determine which signature to use during
     execution. An example is "SELECT GREATEST(NULL, NULL)" would result in an error
     during execution in Postgresql because the resulting data type could not be
     determined. An alternative would be to ensure that each modeled function contains
     the full set of possible signatures, then see if "foo(NULL)" would be ambiguous
     and if so use "foo(CAST(NULL AS INT))" instead.
  '''

  def __init__(self,
      data_type,
      require_constant=False,
      min_value=None,
      can_be_null=True,
      can_be_null_literal=True,
      determines_signature=False):
    self.type = data_type
    self.require_constant = require_constant
    self.min_value = min_value
    self.can_be_null = can_be_null
    self.can_be_null_literal = can_be_null_literal
    self.determines_signature = determines_signature

  @property
  def is_subquery(self):
    return isinstance(self.type, list)

  def validate(self, expr, skip_nulls=False):
    if not issubclass(expr.type, self.type):
      raise Exception('Expr type is %s but expected %s' % (expr.type, self.type))
    if self.require_constant and not expr.is_constant:
      raise Exception('A constant is required')
    if self.min_value is not None and expr.val < self.min_value:
      raise Exception('Minumum value not met')
    if skip_nulls and expr.is_constant and expr.val is None:
      return
    if expr.is_constant and expr.val is None and not self.can_be_null_literal:
      raise Exception('A NULL literal is not allowed')

  def __repr__(self):
    _repr = 'Arg<type: '
    if self.is_subquery:
      _repr += 'subquery[' + ', '.join([type_.__name__ for type_ in self.type]) + ']'
    else:
      _repr += self.type.__name__
    if self.require_constant:
      _repr += ', constant: True'
    if self.min_value:
      _repr += ', min: %s' % self.min_value
    _repr += '>'
    return _repr


class Signature(object):

  def __init__(self, func, return_type, *args):
    self.func = func
    self.return_type = return_type
    self.args = list(args)

  @property
  def input_types(self):
    return self.args[1:]


class Func(ValExpr):
  '''Base class for functions'''

  _NAME = None   # Helper for the classmethod name()
  _SIGNATURES = list()   # Helper for the classmethod signatures()

  @classmethod
  def name(cls):
    '''Returns the name of the function. Multiple functions may have the same name.
       For example, COUNT will have a separate Func class for the analytic and aggregate
       versions but both will have the same value of name().
    '''
    return cls.__name__ if cls._NAME is None else cls._NAME

  @classmethod
  def signatures(cls):
    '''Returns the available signatures for the function. Varargs are not supported, a
       subset of possible signatures must be chosen.
    '''
    return cls._SIGNATURES

  @classmethod
  def create_from_args(cls, *val_exprs):
    '''Constructor for instantiating from values. The return types of the exprs will be
       inspected and used to find the function signature. If no signature can be found
       an error will be raised.
    '''
    for signature in cls.signatures():
      if len(signature.args) != len(val_exprs):
        continue
      for idx, arg in enumerate(val_exprs):
        if not issubclass(arg.type, signature.args[idx].type):
          break
      else:
        break
    else:
      raise Exception('No signature matches the given arguments: %s' % (val_exprs, ))
    return cls(signature, *val_exprs)

  def __init__(self, signature, *val_exprs):
    '''"signature" should be one of the available signatures at the class level and
       signifies which function call this instance is intended to represent.
    '''
    if signature not in self.signatures():
      raise Exception('Unknown signature: %s' % (signature, ))
    self.signature = signature
    if val_exprs:
      self.args = list(val_exprs)
    else:
     self.args = list()
     for arg in signature.args:
       if arg.is_subquery:
         self.args.append([subtype(None) for subtype in arg.type])
       else:
         self.args.append(arg.type(arg.min_value))

  @property
  def exact_type(self):
    return self.signature.return_type

  def validate(self, skip_nulls=False):
    if not len(self.args) == len(self.signature.args):
      raise Exception('Signature length mismatch')
    for idx, signature_arg in enumerate(self.signature.args):
      signature_arg.validate(self.args[idx], skip_nulls=skip_nulls)

  def contains_subquery(self):
    for signature_arg in self.signature.args:
      if signature_arg.is_subquery:
        return True
    return any(self.iter_exprs(lambda expr: expr.is_func and expr.contains_subquery))

  def iter_exprs(self, filter=None):
    '''Returns an iterator over all val_exprs including those nested within this
       function's args.
    '''
    for arg in self.args:
      if not isinstance(arg, ValExpr):
        continue
      if not filter or filter(arg):
        yield arg
      for expr in arg.iter_exprs(filter=filter):
        yield expr

  def __hash__(self):
    return hash(type(self)) + hash(self.signature) + hash(tuple(self.args))

  def __eq__(self, other):
    if self is other:
      return True
    if not type(other) == type(self):
      return False
    return self.signature == other.signature and self.args == other.args


class AggFunc(Func):

  def __init__(self, *args):
    Func.__init__(self, *args)
    self.distinct = False

  def validate(self, skip_nulls=False):
    super(AggFunc, self).validate(skip_nulls=skip_nulls)
    for arg in self.args:
      if arg.contains_agg:
        raise Exception('Aggregate functions may not contain other aggregates')
    if self.contains_analytic:
      raise Exception('Aggregate functions may not contain analytics')


class AnalyticFunc(Func):

  HAS_IMPLICIT_WINDOW = False
  SUPPORTS_WINDOWING = True
  REQUIRES_ORDER_BY = False

  def __init__(self, *args):
    Func.__init__(self, *args)
    self.partition_by_clause = None
    self.order_by_clause = None
    self.window_clause = None

  def validate(self, skip_nulls=False):
    super(AnalyticFunc, self).validate(skip_nulls=skip_nulls)
    for arg in self.args:
      if arg.contains_analytic:
        raise Exception('Analytic functions may not contain other analytics')


class PartitionByClause(object):

  def __init__(self, val_exprs):
    self.val_exprs = val_exprs


class WindowClause(object):

  def __init__(self, range_or_rows, start_boundary, end_boundary=None):
    self.range_or_rows = range_or_rows
    self.start_boundary = start_boundary
    self.end_boundary = end_boundary


class WindowBoundary(object):

  UNBOUNDED_PRECEDING = 'UNBOUNDED PRECEDING'
  PRECEDING = 'PRECEDING'
  CURRENT_ROW = 'CURRENT ROW'
  FOLLOWING = 'FOLLOWING'
  UNBOUNDED_FOLLOWING = 'UNBOUNDED FOLLOWING'

  def __init__(self, boundary_type, val_expr=None):
    self.boundary_type = boundary_type
    self.val_expr = val_expr

# It's a lot of work to support this but it should be less error prone than explicitly
# listing each signature.
def create_func(name, returns=None, accepts=[], signatures=[], base_type=Func):
  '''Convenience function for creating a function class. The class is put into the
     global namespace just as though the class had been declared using the "class"
     keyword.

     The name of the class is "name". "base_type" can be used to specify the base class.

     The signature(s) of the class can be defined in one of three ways. "returns" and
     "accepts" can be used together but not in combination with "signatures".

       1) "signatures" should be a list of lists. Each entry corresponds to a single
          signature. Each item in the signature can be either an Arg or a DataType or
          a list of the preceding two types. The first entry in the list is the return
          type, the remainder are the input types. DataType is considered a placeholder
          for all other base types (Char, Number, Boolean, Timestamp). If a signature
          contains DataType, the entire signature will be replace with multiple
          signatures, one for each base type. Number is also considered a placeholder
          but the replacements will be the cross-product of (Int, Float, and Decimal) *
          the number of Number's used, except that the return type is the maximum of
          the input types. A function that accepts a subquery is represented by a list of
          Arg or DataType.

          Ex signatures:
            [Int, Double]: Could be a signature for FLOOR
            [Int, DataType]: Could be a signature for COUNT
                === [Int, Char] + [Int, Number] + [Int, Boolean] + ...
            [Number, Number, Number]: Could be a signature for Multiply
                === ... + [Float, Int, Float] + ... (but not [Int, Float, Float])
            [Boolean, DataType, [DataType]]: Could be a signature for In with a subquery

       2) "returns" and "accepts" is equivalent to
           signatures=[[returns, accepts[0], accepts[1], ..., accepts[n]]]

       3) "accepts" is equivalent to
           signatures=[[accepts[0], accepts[0], accepts[1], ..., accepts[n]]]
  '''
  if (returns or accepts) and signatures:
    raise Exception('Cannot mix signature specification arguments')

  type_name = base_type.__name__.replace('Func', '') + name
  func = type(type_name, (base_type, ), {'_NAME': name, '_SIGNATURES': []})
  globals()[type_name] = func

  if signatures:
    signatures = deepcopy(signatures)

  if base_type == Func:
    FUNCS.append(func)
  if returns:
    signatures = [Signature(func, returns)]
  elif accepts:
    signatures = [Signature(func, accepts[0])]
  if accepts:
    signatures[0].args.extend(accepts)

  # Replace convenience inputs with proper types
  for idx, signature in enumerate(signatures):
    if not isinstance(signature, Signature):
      signature = Signature(func, signature[0], *signature[1:])
      signatures[idx] = signature
    if isinstance(signature.return_type, Arg):
      signature.return_type = signature.return_type.type
    for arg_idx, arg in enumerate(signature.args):
      if not isinstance(arg, Arg):
        signature.args[arg_idx] = Arg(arg)

  # Replace "DataType" args with actual types
  non_wildcard_signatures = list()
  for replacement_type in TYPES:
    for signature_idx, signature in enumerate(signatures):
      replacement_signature = None
      for arg_idx, arg in enumerate(signature.args):
        if arg.is_subquery:
          for sub_idx, subtype in enumerate(arg.type):
            if subtype == DataType:
              if not replacement_signature:
                replacement_signature = deepcopy(signature)
              replacement_signature.args[arg_idx].type[sub_idx] = replacement_type
        elif arg.type == DataType:
            replacement_arg = deepcopy(arg)
            replacement_arg.type = replacement_type
            if not replacement_signature:
              replacement_signature = deepcopy(signature)
            replacement_signature.args[arg_idx] = replacement_arg
      if signature.return_type == DataType:
        if not replacement_signature:
          raise Exception('Wildcard return type requires at least one wildcard input arg')
        replacement_signature.return_type = replacement_type
      if replacement_signature:
        non_wildcard_signatures.append(replacement_signature)
      else:
        non_wildcard_signatures.append(signature)
        # This signature did not contain any "DataType" args, remove it from the list
        # so it isn't processed again.
        del signatures[signature_idx]

  # Replace "Number" args... Number wildcards work differently than DataType wildcards.
  # foo(DataType, DataType) expands to foo(Boolean, Boolean), foo(Char, Char), etc
  # but foo(Number, Number) expands to foo(Decimal, Decimal), foo(Decimal, Int), etc
  # In other words, a cross product needs to be done for Number wildcards. If the return
  # type is also "Number", then it will be replaced with the largest type of the input
  # replacements. Ex, foo(Decimal, Int) would return Decimal.

  # Find wildcard signatures
  signatures = non_wildcard_signatures
  wildcard_signatures = list()
  for signature_idx, signature in enumerate(signatures):
    is_wildcard = False
    for arg_idx, arg in enumerate(signature.args):
      if arg.is_subquery:
        for subtype in arg.type:
          if subtype == Number:
            is_wildcard = True
            break
      elif arg.type == Number:
        is_wildcard = True
      if is_wildcard:
        if signature.return_type == Number:
          signature.return_type = (Number, Int)
        wildcard_signatures.append(signature)
        del signatures[signature_idx]
        break

  # Helper function to reduce code duplication
  def update_return_type_and_append(
      replacement_type,
      replacement_signature,
      wildcard_signatures):
    if isinstance(replacement_signature.return_type, tuple):
      replacement_signature.return_type = \
          (Number, max(replacement_type, replacement_signature.return_type[1]))
    wildcard_signatures.append(replacement_signature)

  # Fully replace each wildcard one at a time so that a cross product is created
  while wildcard_signatures:
    signature = wildcard_signatures.pop()
    is_wildcard = False
    for arg_idx, arg in enumerate(signature.args):
      replacement_signature = None
      if arg.is_subquery:
        if any(ifilter(lambda type_: type_ == Number, arg.type)):
          raise Exception('Number not accepted in subquery signatures')
      elif arg.type == Number:
        for replacement_type in [Decimal, Int, Float]:
          replacement_signature = deepcopy(signature)
          replacement_signature.args[arg_idx].type = replacement_type
          is_wildcard = True
          update_return_type_and_append(
              replacement_type, replacement_signature, wildcard_signatures)
      if is_wildcard:
        break
    if not is_wildcard:
      if isinstance(signature.return_type, tuple):
        signature.return_type = signature.return_type[1]
      signatures.append(signature)

  func._SIGNATURES = signatures
  return func


def create_agg(name, returns=None, accepts=[], signatures=[]):
  func = create_func(name, returns, accepts, signatures, AggFunc)
  AGG_FUNCS.append(func)
  return func


def create_analytic(
    name,
    returns=None,
    accepts=[],
    signatures=[],
    require_order=False,
    supports_window=True):
  func = create_func(name, returns, accepts, signatures, AnalyticFunc)
  func.REQUIRES_ORDER_BY = require_order
  func.SUPPORTS_WINDOWING = supports_window
  ANALYTIC_FUNCS.append(func)
  return func


create_func('IsNull', returns=Boolean, accepts=[DataType])
create_func('IsNotNull', returns=Boolean, accepts=[DataType])
create_func('And', returns=Boolean, accepts=[Boolean, Boolean])
create_func('Or', returns=Boolean, accepts=[Boolean, Boolean])
create_func('Exists', returns=Boolean, accepts=[[DataType]])
create_func('NotExists', returns=Boolean, accepts=[[DataType]])
for func_name in ['In', 'NotIn']:
  # Avoid equality comparison on FLOATs
  create_func(func_name, signatures=[
      [Boolean, Boolean, [Boolean]],
      [Boolean, Boolean, Boolean, Boolean],
      [Boolean, Char, [Char]],
      [Boolean, Char, Char, Char],
      [Boolean, Decimal, [Decimal]],
      [Boolean, Decimal, [Int]],
      [Boolean, Decimal, Decimal, Decimal],
      [Boolean, Decimal, Decimal, Int],
      [Boolean, Decimal, Int, Decimal],
      [Boolean, Int, [Decimal]],
      [Boolean, Int, [Int]],
      [Boolean, Int, Int, Int],
      [Boolean, Int, Decimal, Int],
      [Boolean, Int, Int, Decimal],
      [Boolean, Timestamp, [Timestamp]],
      [Boolean, Timestamp, Timestamp, Timestamp]])
for comparator in ['GreaterThan', 'LessThan']:
  create_func(comparator, signatures=[
      [Boolean, Number, Number],
      [Boolean, Timestamp, Timestamp]])
for comparator in ['GreaterThanOrEquals', 'LessThanOrEquals']:
  # Avoid equality comparison on FLOATs
  create_func(comparator, signatures=[
      [Boolean, Decimal, Decimal],
      [Boolean, Decimal, Int],
      [Boolean, Int, Decimal],
      [Boolean, Int, Int],
      [Boolean, Timestamp, Timestamp]])
for comparator in ['Equals', 'NotEquals']:
  # Avoid equality comparison on FLOATs
  create_func(comparator, signatures=[
      [Boolean, Boolean, Boolean],
      [Boolean, Char, Char],
      [Boolean, Decimal, Decimal],
      [Boolean, Decimal, Int],
      [Boolean, Int, Decimal],
      [Boolean, Int, Int],
      [Boolean, Timestamp, Timestamp]])
create_func('If', returns=DataType,
    accepts=[Boolean, Arg(DataType, determines_signature=True), DataType])

# Don't allow + or - when using floats/doubles. This is done to avoid something like
# (10000.00919 - 10000) * 10000 which would lead to random values.
for operator in ['Plus', 'Minus']:
  create_func(operator, signatures=[
    [Decimal,
     Arg(Decimal, determines_signature=True),
     Arg(Decimal, determines_signature=True)],
    [Decimal,
     Arg(Decimal, determines_signature=True),
     Arg(Int, determines_signature=True)],
    [Decimal,
     Arg(Int, determines_signature=True),
     Arg(Decimal, determines_signature=True)],
    [Int,
     Arg(Int, determines_signature=True),
     Arg(Int, determines_signature=True)]])
create_func('Multiply', signatures=[
    [Number,
     Arg(Number, determines_signature=True),
     Arg(Number, determines_signature=True)]])
# Don't allow INT / INT, Postgresql results in an INT, but a FLOAT in most other databases
create_func('Divide', signatures=[
    [Decimal,
     Arg(Decimal, determines_signature=True),
     Arg(Decimal, determines_signature=True)],
    [Decimal,
     Arg(Decimal, determines_signature=True),
     Arg(Int, determines_signature=True)],
    [Decimal,
     Arg(Int, determines_signature=True),
     Arg(Decimal, determines_signature=True)],
    [Float,
     Arg(Decimal, determines_signature=True),
     Arg(Float, determines_signature=True)],
    [Float,
     Arg(Float, determines_signature=True),
     Arg(Decimal, determines_signature=True)],
    [Float,
     Arg(Float, determines_signature=True),
     Arg(Float, determines_signature=True)],
    [Float,
     Arg(Float, determines_signature=True),
     Arg(Int, determines_signature=True)],
    [Float,
     Arg(Int, determines_signature=True),
     Arg(Float, determines_signature=True)]])

create_func('Abs', signatures=[[Number, Arg(Number, determines_signature=True)]])
# Don't allow FLOAT/DOUBLE to become an INT (ie, an approximation to be treated as a
# precise value).
create_func('Floor', signatures=[[Decimal, Decimal], [Float, Float]])
create_func('Ceil', signatures=[[Decimal, Decimal], [Float, Float]])

# NULL handling in CONCAT differs between Impala and Postgresql
create_func('Concat',
    accepts=[Arg(Char, can_be_null=False), Arg(Char, can_be_null=False)])
create_func('Trim', accepts=[Char])
create_func('Length', returns=Int, accepts=[Char])

for interval in ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Second']:
  create_func('Extract' + interval,
      returns=Int, accepts=[Arg(Timestamp, can_be_null_literal=False)])
  create_func(
      'DateAdd' + interval,
      returns=Timestamp,
      # Determines signature in Postgresql
      accepts=[Arg(Timestamp, determines_signature=True), Int])

create_func('Greatest', signatures=[
    [Number,
     Arg(Number, can_be_null=False, determines_signature=True),
     Arg(Number, can_be_null=False, determines_signature=True)],
    [Timestamp,
     Arg(Timestamp, can_be_null=False, determines_signature=True),
     Arg(Timestamp, can_be_null=False, determines_signature=True)]])
create_func('Least', signatures=[
    [Number,
     Arg(Number, can_be_null=False, determines_signature=True),
     Arg(Number, can_be_null=False, determines_signature=True)],
    [Timestamp,
     Arg(Timestamp, can_be_null=False, determines_signature=True),
     Arg(Timestamp, can_be_null=False, determines_signature=True)]])
create_func('Coalesce', signatures=[
    [DataType,
     Arg(DataType, determines_signature=True),
     Arg(DataType, determines_signature=True)],
    [DataType,
     Arg(DataType, determines_signature=True),
     Arg(DataType, determines_signature=True),
     Arg(DataType, determines_signature=True)]])

# This is added so that query generation can assume that any return type can be
# produced by an aggregate or analytic with only one level of nesting.
# Ex: CAST(SUM(...) AS STRING)
create_func('CastAsChar', signatures=[[Char, Int]])

create_agg('Count', returns=Int, accepts=[Number])
create_agg('Max', signatures=[
  [Number, Arg(Number, determines_signature=True)],
  [Timestamp, Arg(Timestamp, determines_signature=True)]])
create_agg('Min', signatures=[
  [Number, Arg(Number, determines_signature=True)],
  [Timestamp, Arg(Timestamp, determines_signature=True)]])
create_agg('Sum', signatures=[
  # FLOATs not allowed. See comment about Plus/Minus for info.
  [Int, Arg(Int, determines_signature=True)],
  [Decimal, Arg(Decimal, determines_signature=True)]])
create_agg('Avg', signatures=[
  [Float, Arg(Int, determines_signature=True)],
  [Decimal, Arg(Decimal, determines_signature=True)]])

create_analytic('Rank', require_order=True, supports_window=False, returns=Int)
create_analytic('DenseRank', require_order=True, supports_window=False, returns=Int)
create_analytic('RowNumber', require_order=True, supports_window=False, returns=Int)
create_analytic('Lead', require_order=True, supports_window=False, signatures=[
  [DataType, Arg(DataType, determines_signature=True)],
  [DataType,
   Arg(DataType, determines_signature=True),
   Arg(Int, require_constant=True, min_value=1)]])
create_analytic('Lag', require_order=True, supports_window=False, signatures=[
  [DataType, Arg(DataType, determines_signature=True)],
  [DataType,
   Arg(DataType, determines_signature=True),
   Arg(Int, require_constant=True, min_value=1)]])
create_analytic('FirstValue', require_order=True, signatures=[
    [DataType, Arg(DataType, determines_signature=True)]])
create_analytic('LastValue', require_order=True, signatures=[
    [DataType, Arg(DataType, determines_signature=True)]])
create_analytic('Max', signatures=[
  [Number, Arg(Number, determines_signature=True)],
  [Timestamp, Arg(Timestamp, determines_signature=True)]])
create_analytic('Min', signatures=[[Number, Number], [Timestamp, Timestamp]])
create_analytic('Sum', signatures=[[Int, Int], [Decimal, Decimal]])  # FLOATs not allowed
create_analytic('Count', returns=Int, accepts=[Number])
create_analytic('Avg', returns=Float, accepts=[Number])