mirror of
https://github.com/apache/impala.git
synced 2026-01-25 18:01:04 -05:00
Running the tests of https://gerrit.cloudera.org/#/c/10859/ it turned out that the width_bucket() function overflows very often. A common problem is that the function tries to cast the 'num_buckets' parameter to the decimal determined by the Frontend. When the Frontend determined the precision and scale of this decimal it only considered the decimal arguments and ignored everything else. Therefore the determined precision and scale is often not suitable for the 'num_buckets' parameter. WidthBucketImpl() has three decimal arguments, all of them have the same byte size, precision, and scale. So it is possible to interpret them as plain integers and still calculate the proper bucket. I included the python test cases from IMPALA-7202 developed by Taras Bobrovytsky. I also extended the backend tests with new test cases. For performance test I used the following query: SELECT sum(width_bucket(cast(l_orderkey AS DECIMAL(30, 10)), 0, 5500000, 1000000)) FROM tpch_parquet.lineitem; The new implementation executed it in ~0.3 seconds. The old implementation executed it in ~0.8 seconds. Change-Id: I68262698144029ef7f54e027e586eaf105f36ab3 Reviewed-on: http://gerrit.cloudera.org:8080/11282 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
306 lines
11 KiB
Python
306 lines
11 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Generates random decimal numbers and verifies that mathematical
|
|
# operations return correct results under decimal_v2.
|
|
|
|
import decimal
|
|
import math
|
|
import pytest
|
|
import random
|
|
|
|
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.test_dimensions import create_single_exec_option_dimension
|
|
from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix
|
|
|
|
class TestDecimalFuzz(ImpalaTestSuite):
|
|
|
|
# Impala's max precision for decimals is 38, so we should have the same in the tests
|
|
decimal.getcontext().prec = 38
|
|
|
|
@classmethod
|
|
def get_workload(cls):
|
|
return 'functional-query'
|
|
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
cls.ImpalaTestMatrix = ImpalaTestMatrix()
|
|
cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
|
|
if cls.exploration_strategy() == 'exhaustive':
|
|
cls.iterations = 50000
|
|
else:
|
|
cls.iterations = 10000
|
|
|
|
def weighted_choice(self, options):
|
|
total_weight = sum(options.itervalues())
|
|
numeric_choice = random.uniform(0, total_weight)
|
|
last_choice = None
|
|
for choice, weight in options.iteritems():
|
|
if numeric_choice <= weight:
|
|
return choice
|
|
numeric_choice -= weight
|
|
if weight > 0:
|
|
last_choice = choice
|
|
return last_choice
|
|
|
|
def get_decimal(self):
|
|
'''Returns a 3-tuple with string values of (value, precision, scale). The function
|
|
does not always return completely random values, we try to bias it to select
|
|
more interesting values.'''
|
|
|
|
def random_precision():
|
|
return random.randint(1, 38)
|
|
|
|
def extreme_precision():
|
|
return 38
|
|
|
|
precision_weights = {}
|
|
precision_weights[random_precision] = 0.8
|
|
precision_weights[extreme_precision] = 0.2
|
|
precision = self.weighted_choice(precision_weights)()
|
|
|
|
def random_scale(precision):
|
|
return random.randint(0, precision)
|
|
|
|
def extreme_scale(precision):
|
|
return random.choice([0, precision])
|
|
|
|
scale_weights = {}
|
|
scale_weights[random_scale] = 0.9
|
|
scale_weights[extreme_scale] = 0.1
|
|
scale = self.weighted_choice(scale_weights)(precision)
|
|
|
|
def random_value(precision):
|
|
'''Generates a completely random value.'''
|
|
|
|
def num_digits_random(precision):
|
|
return random.randint(1, precision)
|
|
|
|
def num_digits_all(precision):
|
|
return precision
|
|
|
|
# Determine how many digits the value is going to have.
|
|
num_digits_weights = {}
|
|
num_digits_weights[num_digits_random] = 0.8
|
|
num_digits_weights[num_digits_all] = 0.2
|
|
num_digits = self.weighted_choice(num_digits_weights)(precision)
|
|
|
|
no_zero = '123456789'
|
|
with_zero = '0123456789'
|
|
result = random.choice(no_zero)
|
|
for _ in range(num_digits - 1):
|
|
result += random.choice(with_zero)
|
|
|
|
return result
|
|
|
|
def special_case_binary_value(precision):
|
|
'''Generates a value that looks like 11111... or 10000... in binary number
|
|
system.'''
|
|
|
|
def exponent_random(precision):
|
|
return random.randint(0, int(precision * math.log(10, 2)))
|
|
|
|
def exponent_max(precision):
|
|
return int(precision * math.log(10, 2))
|
|
|
|
exponent_weights = {}
|
|
exponent_weights[exponent_random] = 0.8
|
|
exponent_weights[exponent_max] = 0.2
|
|
exponent = self.weighted_choice(exponent_weights)(precision)
|
|
|
|
value = 2 ** exponent
|
|
if random.random() < 0.5:
|
|
value -= 1
|
|
return '{0}'.format(value)
|
|
|
|
def special_case_decimal_value(precision):
|
|
'''Generates a value that looks like 99999... or 10000... in decimal number
|
|
system.'''
|
|
|
|
def num_digits_random(precision):
|
|
return random.randint(1, precision)
|
|
|
|
def num_digits_max(precision):
|
|
return precision
|
|
|
|
num_digits_weights = {}
|
|
num_digits_weights[num_digits_random] = 8
|
|
num_digits_weights[num_digits_max] = 0.2
|
|
num_digits = self.weighted_choice(num_digits_weights)(precision)
|
|
|
|
value = 10 ** num_digits
|
|
|
|
if num_digits == precision or random.random() < 0.5:
|
|
value -= 1
|
|
|
|
return '{0}'.format(value)
|
|
|
|
value_weights = {}
|
|
value_weights[random_value] = 0.6
|
|
value_weights[special_case_binary_value] = 0.2
|
|
value_weights[special_case_decimal_value] = 0.2
|
|
|
|
value = self.weighted_choice(value_weights)(precision)
|
|
|
|
# Randomly determine the placement of the decimal mark.
|
|
# The smallest index where the decimal mark can be placed in the number string.
|
|
min_dot_location = max(len(value) - scale, 0)
|
|
# The largest index where the decimal mark can be placed in the number string.
|
|
max_dot_location = min(precision - scale, len(value))
|
|
dot_location = random.randint(min_dot_location, max_dot_location)
|
|
|
|
if dot_location == 0:
|
|
value = '0.' + value
|
|
elif dot_location == len(value):
|
|
pass
|
|
else:
|
|
value = value[:dot_location] + '.' + value[dot_location:]
|
|
|
|
if random.random() < 0.5:
|
|
# Negate the number.
|
|
value = '-' + value
|
|
return (value, precision, scale)
|
|
|
|
def result_equals(self, expected, actual):
|
|
'''Verify that the expected result is equal to the actual result. We verify equality
|
|
by rounding the expected result to different numbers of places and verifying that the
|
|
actual result is matched in at least one of the cases.'''
|
|
if actual == expected:
|
|
return True
|
|
|
|
if actual is None:
|
|
# Overflow
|
|
if abs(expected) > decimal.Decimal("9" * 32):
|
|
# If the expected result is larger than 10^32 - 1, it's not unreasonable for
|
|
# there to be an overflow in Impala because the minimum scale is 6 and
|
|
# 38 (max precision) - 6 = 32.
|
|
return True
|
|
return False
|
|
|
|
for num_digits_after_dot in xrange(39):
|
|
# Reduce the number of digits after the dot in the expected_result to different
|
|
# amounts. If it matches the actual result in at least one of the cases, we
|
|
# consider the actual result to be acceptable.
|
|
truncated_expected = expected.quantize(
|
|
decimal.Decimal("1e-{0}".format(num_digits_after_dot)),
|
|
rounding=decimal.ROUND_HALF_UP)
|
|
if actual == truncated_expected:
|
|
return True
|
|
return False
|
|
|
|
def execute_one_decimal_op(self):
|
|
'''Executes a single query and compares the result to a result that we computed in
|
|
Python.'''
|
|
op = random.choice(['+', '-', '*', '/', '%'])
|
|
value1, precision1, scale1 = self.get_decimal()
|
|
value2, precision2, scale2 = self.get_decimal()
|
|
|
|
query = ('select cast({value1} as decimal({precision1},{scale1})) {op} '
|
|
'cast({value2} as decimal({precision2},{scale2}))').format(op=op,
|
|
value1=value1, precision1=precision1, scale1=scale1,
|
|
value2=value2, precision2=precision2, scale2=scale2)
|
|
|
|
try:
|
|
result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
|
|
except ImpalaBeeswaxException as e:
|
|
result = None
|
|
if result is not None:
|
|
result = decimal.Decimal(result)
|
|
|
|
with decimal.localcontext() as ctx:
|
|
# Set the decimal context to a large precision initially, so that the
|
|
# mathematical operations are performed at a high precision.
|
|
ctx.prec = 80
|
|
|
|
try:
|
|
if op == '+':
|
|
expected_result = decimal.Decimal(value1) + decimal.Decimal(value2)
|
|
elif op == '-':
|
|
expected_result = decimal.Decimal(value1) - decimal.Decimal(value2)
|
|
elif op == '*':
|
|
expected_result = decimal.Decimal(value1) * decimal.Decimal(value2)
|
|
elif op == '/':
|
|
expected_result = decimal.Decimal(value1) / decimal.Decimal(value2)
|
|
elif op == '%':
|
|
expected_result = decimal.Decimal(value1) % decimal.Decimal(value2)
|
|
else:
|
|
assert False
|
|
except decimal.InvalidOperation as e:
|
|
expected_result = None
|
|
except decimal.DivisionByZero as e:
|
|
expected_result = None
|
|
assert self.result_equals(expected_result, result)
|
|
|
|
def test_decimal_ops(self, vector):
|
|
for _ in xrange(self.iterations):
|
|
self.execute_one_decimal_op()
|
|
|
|
def width_bucket(self, val, min_range, max_range, num_buckets):
|
|
# Multiplying the values by 10**40 guarantees that the numbers can be converted
|
|
# to int without losing information.
|
|
val_int = int(decimal.Decimal(val) * 10**40)
|
|
min_range_int = int(decimal.Decimal(min_range) * 10**40)
|
|
max_range_int = int(decimal.Decimal(max_range) * 10**40)
|
|
|
|
if min_range_int >= max_range_int:
|
|
return None
|
|
if val_int < min_range_int:
|
|
return 0
|
|
if val_int > max_range_int:
|
|
return num_buckets + 1
|
|
|
|
range_size = max_range_int - min_range_int
|
|
dist_from_min = val_int - min_range_int
|
|
return (num_buckets * dist_from_min) / range_size + 1
|
|
|
|
def execute_one_width_bucket(self):
|
|
val, val_prec, val_scale = self.get_decimal()
|
|
min_range, min_range_prec, min_range_scale = self.get_decimal()
|
|
max_range, max_range_prec, max_range_scale = self.get_decimal()
|
|
num_buckets = random.randint(1, 2147483647)
|
|
|
|
query = ('select width_bucket('
|
|
'cast({val} as decimal({val_prec},{val_scale})), '
|
|
'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
|
|
'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
|
|
'{num_buckets})')
|
|
|
|
query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
|
|
min_range=min_range, min_range_prec=min_range_prec,
|
|
min_range_scale=min_range_scale,
|
|
max_range=max_range, max_range_prec=max_range_prec,
|
|
max_range_scale=max_range_scale,
|
|
num_buckets=num_buckets)
|
|
|
|
expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
|
|
if not expected_result:
|
|
return
|
|
|
|
try:
|
|
result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
|
|
assert int(result) == expected_result
|
|
except ImpalaBeeswaxException as e:
|
|
if "You need to wrap the arguments in a CAST" not in str(e):
|
|
# Sometimes the decimal inputs are incompatible with each other, so it's ok
|
|
# to ignore this error.
|
|
raise e
|
|
|
|
def test_width_bucket(self, vector):
|
|
for _ in xrange(self.iterations):
|
|
self.execute_one_width_bucket()
|