mirror of
https://github.com/apache/impala.git
synced 2026-01-03 15:00:52 -05:00
This change enables codegen for all builtin aggregate functions, e.g. timestamp functions and group_concat. There are several parts to the change: * Adding support for generic UDAs. Previously, the codegen code did not handle multiple input arguments or NULL return values. * Defaulting to using the UDA interface when there is not a special codegen path (we have implementations of all builtin aggregate functions for the interpreted path). * Removing all the logic to disable codegen for the special cases that now are supported. Also fix the generation of code to get/set NULL bits since I needed to add functionality there anyway. Testing: Add tests that check that codegen was enabled for builtin aggregate functions. Also fix some gaps in the preexisting tests. Also add tests for UDAs that check input/output nulls are handled correctly, in anticipation of enabling codegen for arbitrary UDAs. The tests are run with both codegen enabled and disabled. To avoid flaky tests, we switch the UDF tests to use "unique_database". Perf: Ran local TPC-H and targeted perf. Spent a lot of time on TPC-H Q1, since my original approach regressed it ~5%. In the end the problem was to do with the ordering of loads/stores to the slot and null bit in the generated code: the previous version of the code exploited some properties of the particular aggregate function. I ended up replicating this behaviour to avoid regressing perf. Change-Id: Id9dc21d1d676505d3617e1e4f37557397c4fb260 Reviewed-on: http://gerrit.cloudera.org:8080/4655 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Internal Jenkins
286 lines
14 KiB
Python
286 lines
14 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Validates all aggregate functions across all datatypes
|
|
#
|
|
import pytest
|
|
|
|
from tests.common.environ import USING_OLD_AGGS_JOINS
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.skip import SkipIfOldAggsJoins
|
|
from tests.common.test_dimensions import (
|
|
create_exec_option_dimension,
|
|
create_uncompressed_text_dimension)
|
|
from tests.common.test_result_verifier import assert_codegen_enabled
|
|
from tests.common.test_vector import TestDimension
|
|
|
|
# Test dimensions for TestAggregation: every aggregate function is crossed with
# every data type (invalid combinations are filtered out in is_valid_vector()).
AGG_FUNCTIONS = ['sum', 'count', 'min', 'max', 'avg', 'ndv']
DATA_TYPES = ['int', 'bool', 'double', 'bigint', 'tinyint',
              'smallint', 'float', 'timestamp', 'string']

# Lookup table for TestAggregation results. Keys have the form
# '<agg_func>-<data_type>' for plain aggregates and
# '<agg_func>-distinct-<data_type>' for DISTINCT aggregates; values are the
# expected results of running that aggregate over alltypesagg (where day is
# not null). Combinations that are filtered out (e.g. sum over strings) have
# no entry.
result_lut = {
  # tinyint
  'sum-tinyint': 45000, 'avg-tinyint': 5, 'count-tinyint': 9000,
  'min-tinyint': 1, 'max-tinyint': 9, 'ndv-tinyint': 9,
  # smallint
  'sum-smallint': 495000, 'avg-smallint': 50, 'count-smallint': 9900,
  'min-smallint': 1, 'max-smallint': 99, 'ndv-smallint': 99,
  # int
  'sum-int': 4995000, 'avg-int': 500, 'count-int': 9990,
  'min-int': 1, 'max-int': 999, 'ndv-int': 999,
  # bigint
  'sum-bigint': 49950000, 'avg-bigint': 5000, 'count-bigint': 9990,
  'min-bigint': 10, 'max-bigint': 9990, 'ndv-bigint': 999,
  # bool: min/max come back as the strings 'false'/'true'
  'sum-bool': 5000, 'count-bool': 10000, 'min-bool': 'false',
  'max-bool': 'true', 'avg-bool': 0.5, 'ndv-bool': 2,
  # double (compared with a tolerance in verify_agg_result())
  'sum-double': 50449500.0, 'count-double': 9990, 'min-double': 10.1,
  'max-double': 10089.9, 'avg-double': 5050.0, 'ndv-double': 999,
  # float (compared with a tolerance in verify_agg_result())
  'sum-float': 5494500.0, 'count-float': 9990, 'min-float': 1.10,
  'max-float': 1098.9, 'avg-float': 550.0, 'ndv-float': 999,
  # timestamp (min/max/avg compared after truncating trailing digits)
  'count-timestamp': 10000, 'min-timestamp': '2010-01-01 00:00:00',
  'max-timestamp': '2010-01-10 18:02:05.100000000',
  'avg-timestamp': '2010-01-05 20:47:11.705080000', 'ndv-timestamp': 10000,
  # string
  'count-string': 10000, 'min-string': '0', 'max-string': '999', 'ndv-string': 999,
  # DISTINCT variants
  'sum-distinct-tinyint': 45, 'count-distinct-tinyint': 9, 'min-distinct-tinyint': 1,
  'max-distinct-tinyint': 9, 'avg-distinct-tinyint': 5, 'ndv-distinct-tinyint': 9,
  'sum-distinct-smallint': 4950, 'count-distinct-smallint': 99,
  'min-distinct-smallint': 1, 'max-distinct-smallint': 99,
  'avg-distinct-smallint': 50, 'ndv-distinct-smallint': 99,
  'sum-distinct-int': 499500, 'count-distinct-int': 999, 'min-distinct-int': 1,
  'max-distinct-int': 999, 'avg-distinct-int': 500, 'ndv-distinct-int': 999,
  'sum-distinct-bigint': 4995000, 'count-distinct-bigint': 999, 'min-distinct-bigint': 10,
  'max-distinct-bigint': 9990, 'avg-distinct-bigint': 5000,
  'ndv-distinct-bigint': 999,
  'sum-distinct-bool': 1, 'count-distinct-bool': 2, 'min-distinct-bool': 'false',
  'max-distinct-bool': 'true', 'avg-distinct-bool': 0.5, 'ndv-distinct-bool': 2,
  'sum-distinct-double': 5044950.0, 'count-distinct-double': 999,
  'min-distinct-double': 10.1, 'max-distinct-double': 10089.9,
  'avg-distinct-double': 5050.0, 'ndv-distinct-double': 999,
  'sum-distinct-float': 549450.0, 'count-distinct-float': 999, 'min-distinct-float': 1.1,
  'max-distinct-float': 1098.9, 'avg-distinct-float': 550.0,
  'ndv-distinct-float': 999,
  'count-distinct-timestamp': 10000, 'min-distinct-timestamp': '2010-01-01 00:00:00',
  'max-distinct-timestamp': '2010-01-10 18:02:05.100000000',
  'avg-distinct-timestamp': '2010-01-05 20:47:11.705080000',
  'ndv-distinct-timestamp': 10000,
  'count-distinct-string': 1000, 'min-distinct-string': '0',
  'max-distinct-string': '999', 'ndv-distinct-string': 999,
}
|
|
|
|
class TestAggregation(ImpalaTestSuite):
  """Validates the builtin aggregate functions (sum, count, min, max, avg, ndv),
  with and without DISTINCT, over every primitive data type in alltypesagg.
  Expected values come from the module-level result_lut table, and codegen is
  verified to have been enabled for every stage of each aggregation."""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestAggregation, cls).add_test_dimensions()

    # Add two more dimensions: one per aggregate function and one per data type.
    cls.TestMatrix.add_dimension(TestDimension('agg_func', *AGG_FUNCTIONS))
    cls.TestMatrix.add_dimension(TestDimension('data_type', *DATA_TYPES))
    cls.TestMatrix.add_constraint(lambda v: cls.is_valid_vector(v))

  @classmethod
  def is_valid_vector(cls, vector):
    """Returns False for test vectors whose combination of file format, aggregate
    function and data type is unsupported or redundant."""
    data_type, agg_func = vector.get_value('data_type'), vector.get_value('agg_func')
    file_format = vector.get_value('table_format').file_format
    if file_format not in ['parquet']: return False

    if cls.exploration_strategy() == 'core':
      # Reduce execution time when exploration strategy is 'core'
      if vector.get_value('exec_option')['batch_size'] != 0: return False

    # Avro doesn't have timestamp type.
    # NOTE(review): unreachable while only parquet passes the filter above; kept
    # as a guard in case more file formats are re-enabled.
    non_numeric = data_type in ['bool', 'string']
    if file_format == 'avro' and data_type == 'timestamp':
      return False
    elif non_numeric and agg_func not in ['min', 'max', 'count', 'ndv']:
      # sum/avg are not defined over non-numeric types.
      return False
    elif agg_func == 'sum' and data_type == 'timestamp':
      # sum is not supported for timestamps (avg is).
      return False
    return True

  def test_aggregation(self, vector):
    """Runs the aggregate both plain and with DISTINCT, checks the result against
    result_lut, and verifies that all plan stages were codegen'd."""
    exec_option = vector.get_value('exec_option')
    disable_codegen = exec_option['disable_codegen']
    # The old aggregation node does not support codegen for all aggregate functions.
    check_codegen_enabled = not disable_codegen and not USING_OLD_AGGS_JOINS
    data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))

    query = 'select %s(%s_col) from alltypesagg where day is not null' % (agg_func,
        data_type)
    result = self.execute_query(query, exec_option,
        table_format=vector.get_value('table_format'))
    assert len(result.data) == 1
    self.verify_agg_result(agg_func, data_type, False, result.data[0])

    if check_codegen_enabled:
      # Verify codegen was enabled for both stages of the aggregation.
      assert_codegen_enabled(result.runtime_profile, [1, 3])

    query = 'select %s(DISTINCT(%s_col)) from alltypesagg where day is not null' % (
        agg_func, data_type)
    result = self.execute_query(query, vector.get_value('exec_option'))
    assert len(result.data) == 1
    self.verify_agg_result(agg_func, data_type, True, result.data[0])

    if check_codegen_enabled:
      # Verify codegen was enabled for all stages of the aggregation.
      assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])

  def verify_agg_result(self, agg_func, data_type, distinct, actual_string):
    """Compares the single-row query result 'actual_string' against the expected
    value in result_lut, using approximate comparison where exact equality is not
    meaningful (ndv estimates, floating point, timestamp sub-microsecond digits)."""
    key = '%s-%s%s' % (agg_func, 'distinct-' if distinct else '', data_type)

    if agg_func == 'ndv':
      # NDV is inherently approximate. Compare with some tolerance.
      err = abs(result_lut[key] - int(actual_string))
      rel_err = err / float(result_lut[key])
      assert err <= 1 or rel_err < 0.05
    elif data_type in ('float', 'double') and agg_func != 'count':
      # Compare with a margin of error.
      delta = 1e6 if data_type == 'double' else 1e3
      assert abs(result_lut[key] - float(actual_string)) < delta
    elif data_type == 'timestamp' and agg_func != 'count':
      # Strip off everything past 10s of microseconds.
      ignore_digits = 4
      assert result_lut[key][:-ignore_digits] == actual_string[:-ignore_digits]
    else:
      assert str(result_lut[key]) == actual_string
|
|
|
|
|
|
class TestAggregationQueries(ImpalaTestSuite):
  """Run the aggregation test suite, with codegen enabled and disabled, to exercise our
  non-codegen code"""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestAggregationQueries, cls).add_test_dimensions()

    cls.TestMatrix.add_dimension(
      create_exec_option_dimension(disable_codegen_options=[False, True]))

    if cls.exploration_strategy() == 'core':
      cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))

  def test_non_codegen_tinyint_grouping(self, vector, unique_database):
    # Regression for IMPALA-901. The test includes an INSERT statement, so can only be run
    # on INSERT-able formats - text only in this case, since the bug doesn't depend on the
    # file format.
    if vector.get_value('table_format').file_format == 'text' \
        and vector.get_value('table_format').compression_codec == 'none':
      self.client.execute("create table %s.imp_901 (col tinyint)" % unique_database)
      self.run_test_case('QueryTest/aggregation_no_codegen_only', vector,
          unique_database)

  def test_aggregation(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
    self.run_test_case('QueryTest/aggregation', vector)

  def test_distinct(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
                   "making the result verification fail.")
    if vector.get_value('table_format').file_format == 'kudu':
      pytest.xfail("IMPALA-4042: count(distinct NULL) fails on a view, needed for kudu")
    self.run_test_case('QueryTest/distinct', vector)

  def test_group_concat(self, vector):
    """group_concat distinct tests
    Required to run directly in python because the order in which results will be
    merged at the final, single-node aggregation step is non-deterministic (if the
    first phase is running on multiple nodes). Need to pull the result apart and
    compare the actual items)"""
    exec_option = vector.get_value('exec_option')
    disable_codegen = exec_option['disable_codegen']
    table_format = vector.get_value('table_format')
    # Test group_concat distinct with other aggregate function and groupings.
    # expected result is the row: 2010,'1, 2, 3, 4','1-2-3-4','2|3|1|4',40,4
    query = """select year, group_concat(distinct string_col),
        group_concat(distinct string_col, '-'), group_concat(distinct string_col, '|'),
        count(string_col), count(distinct string_col)
        from alltypesagg where int_col < 5 and year = 2010 group by year"""
    result = self.execute_query(query, exec_option, table_format=table_format)
    row = result.data[0].split("\t")
    assert len(row) == 6
    assert row[0] == '2010'
    delimiter = [', ', '-', '|']
    for i in range(1, 4):
      assert set(row[i].split(delimiter[i - 1])) == set(['1', '2', '3', '4'])
    assert row[4] == '40'
    assert row[5] == '4'
    check_codegen_enabled = not disable_codegen and not USING_OLD_AGGS_JOINS
    if check_codegen_enabled:
      # Verify codegen was enabled for all three stages of the aggregation.
      assert_codegen_enabled(result.runtime_profile, [1, 2, 4])

    # Test group_concat distinct with arrow delimiter, with multiple rows
    query = """select day, group_concat(distinct string_col, "->")
        from (select * from alltypesagg where id % 100 = day order by id limit 99999) a
        group by day order by day"""
    result = self.execute_query(query, exec_option, table_format=table_format)
    # Each day d aggregates the distinct string values d, d+100, ..., d+900.
    string_col = [set(str(day + 100 * k) for k in range(10)) for day in range(1, 11)]
    assert len(result.data) == 10
    for i in range(10):
      row = result.data[i].split("\t")
      assert len(row) == 2
      assert row[0] == str(i + 1)
      assert set(row[1].split("->")) == string_col[i]

    # Test group_concat distinct with merge node
    query = """select group_concat(distinct string_col, ' ') from alltypesagg
        where int_col < 10"""
    result = self.execute_query(query, exec_option, table_format=table_format)
    assert set(result.data[0].split(" ")) == set(['1','2','3','4','5','6','7','8','9'])
    if check_codegen_enabled:
      # Verify codegen was enabled for all four stages of the aggregation.
      assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
|
|
|
|
|
|
class TestTPCHAggregationQueries(ImpalaTestSuite):
  """Aggregation tests that run against the TPC-H dataset, which provides much
  larger inputs for aggregations than functional-query."""

  @classmethod
  def get_workload(cls):
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTPCHAggregationQueries, cls).add_test_dimensions()
    # Restrict the matrix to parquet tables only.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format in ['parquet'])

  def test_tpch_aggregations(self, vector):
    self.run_test_case('tpch-aggregations', vector)

  @SkipIfOldAggsJoins.passthrough_preagg
  def test_tpch_passthrough_aggregations(self, vector):
    self.run_test_case('tpch-passthrough-aggregations', vector)
|