# impala/tests/query_test/test_decimal_fuzz.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Generates random decimal numbers and verifies that mathematical
# operations return correct results under decimal_v2.

from __future__ import absolute_import, division, print_function
from builtins import range
import decimal
import math
import random

from tests.common.impala_connection import IMPALA_CONNECTION_EXCEPTION
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (
  add_mandatory_exec_option,
  create_single_exec_option_dimension)
from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix


class TestDecimalFuzz(ImpalaTestSuite):

  # Impala's max precision for decimals is 38, so we should have the same in the tests
  decimal.getcontext().prec = 38

  @classmethod
  def add_test_dimensions(cls):
    cls.ImpalaTestMatrix = ImpalaTestMatrix()
    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())

    total_iterations = 10000
    batches = list(range(0, 10))
    cls.iterations = total_iterations // len(batches)
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension("test_batch", *batches))
    add_mandatory_exec_option(cls, 'decimal_v2', 'true')
    add_mandatory_exec_option(cls, 'long_polling_time_ms', 100)

  def weighted_choice(self, options):
    total_weight = sum(options.values())
    numeric_choice = random.uniform(0, total_weight)
    last_choice = None
    for choice, weight in options.items():
      if numeric_choice <= weight:
        return choice
      numeric_choice -= weight
      if weight > 0:
        last_choice = choice
    return last_choice

  def get_decimal(self):
    '''Returns a 3-tuple with string values of (value, precision, scale). The function
    does not always return completely random values, we try to bias it to select
    more interesting values.'''

    def random_precision():
      return random.randint(1, 38)

    def extreme_precision():
      return 38

    precision_weights = {}
    precision_weights[random_precision] = 0.8
    precision_weights[extreme_precision] = 0.2
    precision = self.weighted_choice(precision_weights)()

    def random_scale(precision):
      return random.randint(0, precision)

    def extreme_scale(precision):
      return random.choice([0, precision])

    scale_weights = {}
    scale_weights[random_scale] = 0.9
    scale_weights[extreme_scale] = 0.1
    scale = self.weighted_choice(scale_weights)(precision)

    def random_value(precision):
      '''Generates a completely random value.'''

      def num_digits_random(precision):
        return random.randint(1, precision)

      def num_digits_all(precision):
        return precision

      # Determine how many digits the value is going to have.
      num_digits_weights = {}
      num_digits_weights[num_digits_random] = 0.8
      num_digits_weights[num_digits_all] = 0.2
      num_digits = self.weighted_choice(num_digits_weights)(precision)

      no_zero = '123456789'
      with_zero = '0123456789'
      result = random.choice(no_zero)
      for _ in range(num_digits - 1):
        result += random.choice(with_zero)

      return result

    def special_case_binary_value(precision):
      '''Generates a value that looks like 11111... or 10000... in binary number
      system.'''

      def exponent_random(precision):
        return random.randint(0, int(precision * math.log(10, 2)))

      def exponent_max(precision):
        return int(precision * math.log(10, 2))

      exponent_weights = {}
      exponent_weights[exponent_random] = 0.8
      exponent_weights[exponent_max] = 0.2
      exponent = self.weighted_choice(exponent_weights)(precision)

      value = 2 ** exponent
      if random.random() < 0.5:
        value -= 1
      return '{0}'.format(value)

    def special_case_decimal_value(precision):
      '''Generates a value that looks like 99999... or 10000... in decimal number
      system.'''

      def num_digits_random(precision):
        return random.randint(1, precision)

      def num_digits_max(precision):
        return precision

      num_digits_weights = {}
      num_digits_weights[num_digits_random] = 8
      num_digits_weights[num_digits_max] = 0.2
      num_digits = self.weighted_choice(num_digits_weights)(precision)

      value = 10 ** num_digits

      if num_digits == precision or random.random() < 0.5:
        value -= 1

      return '{0}'.format(value)

    value_weights = {}
    value_weights[random_value] = 0.6
    value_weights[special_case_binary_value] = 0.2
    value_weights[special_case_decimal_value] = 0.2

    value = self.weighted_choice(value_weights)(precision)

    # Randomly determine the placement of the decimal mark.
    # The smallest index where the decimal mark can be placed in the number string.
    min_dot_location = max(len(value) - scale, 0)
    # The largest index where the decimal mark can be placed in the number string.
    max_dot_location = min(precision - scale, len(value))
    dot_location = random.randint(min_dot_location, max_dot_location)

    if dot_location == 0:
      value = '0.' + value
    elif dot_location == len(value):
      pass
    else:
      value = value[:dot_location] + '.' + value[dot_location:]

    if random.random() < 0.5:
      # Negate the number.
      value = '-' + value
    return (value, precision, scale)

  def result_equals(self, expected, actual):
    '''Verify that the expected result is equal to the actual result. We verify equality
    by rounding the expected result to different numbers of places and verifying that the
    actual result is matched in at least one of the cases.'''
    if actual == expected:
      return True

    if actual is None:
      # Overflow
      if abs(expected) > decimal.Decimal("9" * 32):
        # If the expected result is larger than 10^32 - 1, it's not unreasonable for
        # there to be an overflow in Impala because the minimum scale is 6 and
        # 38 (max precision) - 6 = 32.
        return True
      return False

    for num_digits_after_dot in range(39):
      # Reduce the number of digits after the dot in the expected_result to different
      # amounts. If it matches the actual result in at least one of the cases, we
      # consider the actual result to be acceptable.
      truncated_expected = expected.quantize(
          decimal.Decimal("1e-{0}".format(num_digits_after_dot)),
          rounding=decimal.ROUND_HALF_UP)
      if actual == truncated_expected:
        return True
    return False

  def execute_one_decimal_op(self, query_options):
    '''Executes a single query and compares the result to a result that we computed in
    Python.'''
    op = random.choice(['+', '-', '*', '/', '%'])
    value1, precision1, scale1 = self.get_decimal()
    value2, precision2, scale2 = self.get_decimal()

    query = ('select cast({value1} as decimal({precision1},{scale1})) {op} '
        'cast({value2} as decimal({precision2},{scale2}))').format(op=op,
        value1=value1, precision1=precision1, scale1=scale1,
        value2=value2, precision2=precision2, scale2=scale2)

    try:
      result = self.execute_scalar(query, query_options)
    except IMPALA_CONNECTION_EXCEPTION:
      result = None
    if result is not None:
      result = decimal.Decimal(result)

    with decimal.localcontext() as ctx:
      # Set the decimal context to a large precision initially, so that the
      # mathematical operations are performed at a high precision.
      ctx.prec = 80

      try:
        if op == '+':
          expected_result = decimal.Decimal(value1) + decimal.Decimal(value2)
        elif op == '-':
          expected_result = decimal.Decimal(value1) - decimal.Decimal(value2)
        elif op == '*':
          expected_result = decimal.Decimal(value1) * decimal.Decimal(value2)
        elif op == '/':
          expected_result = decimal.Decimal(value1) / decimal.Decimal(value2)
        elif op == '%':
          expected_result = decimal.Decimal(value1) % decimal.Decimal(value2)
        else:
          assert False
      except decimal.InvalidOperation:
        expected_result = None
      except decimal.DivisionByZero:
        expected_result = None
      assert self.result_equals(expected_result, result)

  def test_decimal_ops(self, vector):
    for _ in range(self.iterations):
      self.execute_one_decimal_op(vector.get_exec_option_dict())

  def width_bucket(self, val, min_range, max_range, num_buckets):
    # Multiplying the values by 10**40 guarantees that the numbers can be converted
    # to int without losing information.
    val_int = int(decimal.Decimal(val) * 10**40)
    min_range_int = int(decimal.Decimal(min_range) * 10**40)
    max_range_int = int(decimal.Decimal(max_range) * 10**40)

    if min_range_int >= max_range_int:
      return None
    if val_int < min_range_int:
      return 0
    if val_int > max_range_int:
      return num_buckets + 1

    range_size = max_range_int - min_range_int
    dist_from_min = val_int - min_range_int
    return (num_buckets * dist_from_min) // range_size + 1

  def execute_one_width_bucket(self, query_options):
    val, val_prec, val_scale = self.get_decimal()
    min_range, min_range_prec, min_range_scale = self.get_decimal()
    max_range, max_range_prec, max_range_scale = self.get_decimal()
    num_buckets = random.randint(1, 2147483647)

    query = ('select width_bucket('
        'cast({val} as decimal({val_prec},{val_scale})), '
        'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
        'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
        '{num_buckets})')

    query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
        min_range=min_range, min_range_prec=min_range_prec,
        min_range_scale=min_range_scale,
        max_range=max_range, max_range_prec=max_range_prec,
        max_range_scale=max_range_scale,
        num_buckets=num_buckets)

    expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
    if not expected_result:
      return

    try:
      result = self.execute_scalar(query, query_options)
      assert int(result) == expected_result
    except IMPALA_CONNECTION_EXCEPTION as e:
      if "You need to wrap the arguments in a CAST" not in str(e):
        # Sometimes the decimal inputs are incompatible with each other, so it's ok
        # to ignore this error.
        raise e

  def test_width_bucket(self, vector):
    for _ in range(self.iterations):
      self.execute_one_width_bucket(vector.get_exec_option_dict())