mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
This change adds get_workload() to ImpalaTestSuite and removes it from all test suites that already returned 'functional-query'. get_workload() is also removed from CustomClusterTestSuite which used to return 'tpch'. All other changes besides impala_test_suite.py and custom_cluster_test_suite.py are just mass removals of get_workload() functions. The behavior is only changed in custom cluster tests that didn't override get_workload(). By returning 'functional-query' instead of 'tpch', exploration_strategy() will no longer return 'core' in 'exhaustive' test runs. See IMPALA-3947 on why workload affected exploration_strategy. An example for affected test is TestCatalogHMSFailures which was skipped both in core and exhaustive runs before this change. get_workload() functions that return a different workload than 'functional-query' are not changed - it is possible that some of these also don't handle exploration_strategy() as expected, but individually checking these tests is out of scope in this patch. Change-Id: I9ec6c41ffb3a30e1ea2de773626d1485c69fe115 Reviewed-on: http://gerrit.cloudera.org:8080/22726 Reviewed-by: Riza Suminto <riza.suminto@cloudera.com> Reviewed-by: Daniel Becker <daniel.becker@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
523 lines
24 KiB
Python
523 lines
24 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Validates all aggregate functions across all datatypes
|
|
#
|
|
from __future__ import absolute_import, division, print_function
|
|
from builtins import range
|
|
import pytest
|
|
|
|
from testdata.common import widetable
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.test_dimensions import (
|
|
ALL_CLUSTER_SIZES,
|
|
create_exec_option_dimension,
|
|
create_exec_option_dimension_from_dict,
|
|
create_uncompressed_text_dimension)
|
|
from tests.common.test_result_verifier import (
|
|
assert_codegen_enabled,
|
|
QueryTestResult,
|
|
parse_result_rows)
|
|
from tests.common.test_vector import ImpalaTestDimension
|
|
|
|
# Test dimensions for TestAggregation.
|
|
AGG_FUNCTIONS = ['sum', 'count', 'min', 'max', 'avg', 'ndv']
|
|
DATA_TYPES = ['int', 'bool', 'double', 'bigint', 'tinyint',
|
|
'smallint', 'float', 'timestamp', 'string']
|
|
|
|
# Lookup table for TestAggregation results.
|
|
result_lut = {
|
|
# tinyint
|
|
'sum-tinyint': 45000, 'avg-tinyint': 5, 'count-tinyint': 9000,
|
|
'min-tinyint': 1, 'max-tinyint': 9, 'ndv-tinyint': 9,
|
|
# smallint
|
|
'sum-smallint': 495000, 'avg-smallint': 50, 'count-smallint': 9900,
|
|
'min-smallint': 1, 'max-smallint': 99, 'ndv-smallint': 99,
|
|
# int
|
|
'sum-int': 4995000, 'avg-int': 500, 'count-int': 9990,
|
|
'min-int': 1, 'max-int': 999, 'ndv-int': 999,
|
|
# bigint
|
|
'sum-bigint': 49950000, 'avg-bigint': 5000, 'count-bigint': 9990,
|
|
'min-bigint': 10, 'max-bigint': 9990, 'ndv-bigint': 999,
|
|
# bool
|
|
'sum-bool': 5000, 'count-bool': 10000, 'min-bool': 'false',
|
|
'max-bool': 'true', 'avg-bool': 0.5, 'ndv-bool': 2,
|
|
# double
|
|
'sum-double': 50449500.0, 'count-double': 9990, 'min-double': 10.1,
|
|
'max-double': 10089.9, 'avg-double': 5050.0, 'ndv-double': 999,
|
|
# float
|
|
'sum-float': 5494500.0, 'count-float': 9990, 'min-float': 1.10,
|
|
'max-float': 1098.9, 'avg-float': 550.0, 'ndv-float': 999,
|
|
# timestamp
|
|
'count-timestamp': 10000, 'min-timestamp': '2010-01-01 00:00:00',
|
|
'max-timestamp': '2010-01-10 18:02:05.100000000',
|
|
'avg-timestamp': '2010-01-05 20:47:11.705080000', 'ndv-timestamp': 10000,
|
|
# string
|
|
'count-string': 10000, 'min-string': '0', 'max-string': '999', 'ndv-string': 999,
|
|
# distinct-tinyint
|
|
'sum-distinct-tinyint': 45, 'count-distinct-tinyint': 9, 'min-distinct-tinyint': 1,
|
|
'max-distinct-tinyint': 9, 'avg-distinct-tinyint': 5, 'ndv-distinct-tinyint': 9,
|
|
# distinct-smallint
|
|
'sum-distinct-smallint': 4950, 'count-distinct-smallint': 99,
|
|
'min-distinct-smallint': 1, 'max-distinct-smallint': 99,
|
|
'avg-distinct-smallint': 50, 'ndv-distinct-smallint': 99,
|
|
# distinct-int
|
|
'sum-distinct-int': 499500, 'count-distinct-int': 999, 'min-distinct-int': 1,
|
|
'max-distinct-int': 999, 'avg-distinct-int': 500, 'ndv-distinct-int': 999,
|
|
# distinct-bigint
|
|
'sum-distinct-bigint': 4995000, 'count-distinct-bigint': 999, 'min-distinct-bigint': 10,
|
|
'max-distinct-bigint': 9990, 'avg-distinct-bigint': 5000,
|
|
'ndv-distinct-bigint': 999,
|
|
# distinct-bool
|
|
'sum-distinct-bool': 1, 'count-distinct-bool': 2, 'min-distinct-bool': 'false',
|
|
'max-distinct-bool': 'true', 'avg-distinct-bool': 0.5, 'ndv-distinct-bool': 2,
|
|
# distinct-double
|
|
'sum-distinct-double': 5044950.0, 'count-distinct-double': 999,
|
|
'min-distinct-double': 10.1, 'max-distinct-double': 10089.9,
|
|
'avg-distinct-double': 5050.0, 'ndv-distinct-double': 999,
|
|
# distinct-float
|
|
'sum-distinct-float': 549450.0, 'count-distinct-float': 999, 'min-distinct-float': 1.1,
|
|
'max-distinct-float': 1098.9, 'avg-distinct-float': 550.0,
|
|
'ndv-distinct-float': 999,
|
|
# distinct-timestamp
|
|
'count-distinct-timestamp': 10000, 'min-distinct-timestamp': '2010-01-01 00:00:00',
|
|
'max-distinct-timestamp': '2010-01-10 18:02:05.100000000',
|
|
'avg-distinct-timestamp': '2010-01-05 20:47:11.705080000',
|
|
'ndv-distinct-timestamp': 10000,
|
|
# distinct-string
|
|
'count-distinct-string': 1000, 'min-distinct-string': '0',
|
|
'max-distinct-string': '999', 'ndv-distinct-string': 999,
|
|
}
|
|
|
|
|
|
class TestAggregation(ImpalaTestSuite):
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestAggregation, cls).add_test_dimensions()
|
|
# IMPALA-12383: Exercise with both num_nodes=0 (distributed plan) and num_nodes=1
|
|
# (single node plan).
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_exec_option_dimension(cluster_sizes=ALL_CLUSTER_SIZES)
|
|
)
|
|
|
|
# Add two more dimensions
|
|
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('agg_func', *AGG_FUNCTIONS))
|
|
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('data_type', *DATA_TYPES))
|
|
cls.ImpalaTestMatrix.add_constraint(lambda v: cls.is_valid_vector(v))
|
|
|
|
@classmethod
|
|
def is_valid_vector(cls, vector):
|
|
data_type, agg_func = vector.get_value('data_type'), vector.get_value('agg_func')
|
|
file_format = vector.get_value('table_format').file_format
|
|
if file_format not in ['parquet']: return False
|
|
|
|
if cls.exploration_strategy() == 'core':
|
|
# Reduce execution time when exploration strategy is 'core'
|
|
if vector.get_value('exec_option')['batch_size'] != 0: return False
|
|
|
|
# Avro doesn't have timestamp type
|
|
non_numeric = data_type in ['bool', 'string']
|
|
if file_format == 'avro' and data_type == 'timestamp':
|
|
return False
|
|
elif non_numeric and agg_func not in ['min', 'max', 'count', 'ndv']:
|
|
return False
|
|
elif agg_func == 'sum' and data_type == 'timestamp':
|
|
return False
|
|
return True
|
|
|
|
def test_aggregation(self, vector):
|
|
exec_option = vector.get_value('exec_option')
|
|
disable_codegen = exec_option['disable_codegen']
|
|
data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))
|
|
db_name = self.get_db_name_from_format(vector.get_table_format())
|
|
|
|
query = 'select {0}({1}_col) from {2}.alltypesagg where day is not null'.format(
|
|
agg_func, data_type, db_name)
|
|
result = self.execute_query(query, exec_option)
|
|
assert len(result.data) == 1
|
|
self.verify_agg_result(agg_func, data_type, False, result.data[0])
|
|
|
|
if not disable_codegen:
|
|
# Verify codegen was enabled for the preaggregation.
|
|
# It is deliberately disabled for the merge aggregation.
|
|
assert_codegen_enabled(result.runtime_profile, [1])
|
|
|
|
query = ('select {0}(DISTINCT({1}_col)) from {2}.alltypesagg '
|
|
'where day is not null').format(agg_func, data_type, db_name)
|
|
result = self.execute_query(query, exec_option)
|
|
assert len(result.data) == 1
|
|
self.verify_agg_result(agg_func, data_type, True, result.data[0])
|
|
|
|
if not disable_codegen:
|
|
# Verify codegen was enabled for all stages of the aggregation.
|
|
assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
|
|
|
|
def verify_agg_result(self, agg_func, data_type, distinct, actual_string):
|
|
key = '%s-%s%s' % (agg_func, 'distinct-' if distinct else '', data_type)
|
|
|
|
if agg_func == 'ndv':
|
|
# NDV is inherently approximate. Compare with some tolerance.
|
|
err = abs(result_lut[key] - int(actual_string))
|
|
rel_err = err / float(result_lut[key])
|
|
print(key, result_lut[key], actual_string,
|
|
abs(result_lut[key] - int(actual_string)))
|
|
assert err <= 1 or rel_err < 0.05
|
|
elif data_type in ('float', 'double') and agg_func != 'count':
|
|
# Compare with a margin of error.
|
|
delta = 1e6 if data_type == 'double' else 1e3
|
|
assert abs(result_lut[key] - float(actual_string)) < delta
|
|
elif data_type == 'timestamp' and agg_func != 'count':
|
|
# Strip off everything past 10s of microseconds.
|
|
ignore_digits = 4
|
|
assert result_lut[key][:-ignore_digits] == actual_string[:-ignore_digits]
|
|
else:
|
|
assert str(result_lut[key]) == actual_string
|
|
|
|
|
|
class TestAggregationQueries(ImpalaTestSuite):
|
|
"""Run the aggregation test suite, with codegen enabled and disabled, to exercise our
|
|
non-codegen code"""
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestAggregationQueries, cls).add_test_dimensions()
|
|
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_exec_option_dimension(disable_codegen_options=[False, True]))
|
|
|
|
if cls.exploration_strategy() == 'core':
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_uncompressed_text_dimension(cls.get_workload()))
|
|
|
|
def test_non_codegen_tinyint_grouping(self, vector, unique_database):
|
|
# Regression for IMPALA-901. The test includes an INSERT statement, so can only be run
|
|
# on INSERT-able formats - text only in this case, since the bug doesn't depend on the
|
|
# file format.
|
|
if vector.get_value('table_format').file_format == 'text' \
|
|
and vector.get_value('table_format').compression_codec == 'none':
|
|
self.client.execute("create table %s.imp_901 (col tinyint)" % unique_database)
|
|
self.run_test_case('QueryTest/aggregation_no_codegen_only', vector,
|
|
unique_database)
|
|
|
|
def test_aggregation(self, vector):
|
|
if vector.get_value('table_format').file_format == 'hbase':
|
|
pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
|
|
self.run_test_case('QueryTest/aggregation', vector)
|
|
|
|
def test_group_concat(self, vector):
|
|
"""group_concat distinct tests
|
|
Required to run directly in python because the order in which results will be
|
|
merged at the final, single-node aggregation step is non-deterministic (if the
|
|
first phase is running on multiple nodes). Need to pull the result apart and
|
|
compare the actual items)"""
|
|
exec_option = vector.get_value('exec_option')
|
|
disable_codegen = exec_option['disable_codegen']
|
|
table_format = vector.get_value('table_format')
|
|
# Test group_concat distinct with other aggregate function and groupings.
|
|
# expected result is the row: 2010,'1, 2, 3, 4','1-2-3-4','2|3|1|4',40,4
|
|
query = """select year, group_concat(distinct string_col),
|
|
group_concat(distinct string_col, '-'), group_concat(distinct string_col, '|'),
|
|
count(string_col), count(distinct string_col)
|
|
from alltypesagg where int_col < 5 and year = 2010 group by year"""
|
|
result = self.execute_query(query, exec_option, table_format=table_format)
|
|
row = (result.data)[0].split("\t")
|
|
assert(len(row) == 6)
|
|
assert(row[0] == '2010')
|
|
delimiter = [', ', '-', '|']
|
|
for i in range(1, 4):
|
|
assert(set(row[i].split(delimiter[i - 1])) == set(['1', '2', '3', '4']))
|
|
assert(row[4] == '40')
|
|
assert(row[5] == '4')
|
|
if not disable_codegen:
|
|
# Verify codegen was enabled for all three stages of the aggregation.
|
|
assert_codegen_enabled(result.runtime_profile, [1, 2, 4])
|
|
|
|
# Test group_concat distinct with arrow delimiter, with multiple rows
|
|
query = """select day, group_concat(distinct string_col, "->")
|
|
from (select * from alltypesagg where id % 100 = day order by id limit 99999) a
|
|
group by day order by day"""
|
|
result = self.execute_query(query, exec_option, table_format=table_format)
|
|
string_col = [
|
|
set(['1', '101', '201', '301', '401', '501', '601', '701', '801', '901']),
|
|
set(['2', '102', '202', '302', '402', '502', '602', '702', '802', '902']),
|
|
set(['3', '103', '203', '303', '403', '503', '603', '703', '803', '903']),
|
|
set(['4', '104', '204', '304', '404', '504', '604', '704', '804', '904']),
|
|
set(['5', '105', '205', '305', '405', '505', '605', '705', '805', '905']),
|
|
set(['6', '106', '206', '306', '406', '506', '606', '706', '806', '906']),
|
|
set(['7', '107', '207', '307', '407', '507', '607', '707', '807', '907']),
|
|
set(['8', '108', '208', '308', '408', '508', '608', '708', '808', '908']),
|
|
set(['9', '109', '209', '309', '409', '509', '609', '709', '809', '909']),
|
|
set(['10', '110', '210', '310', '410', '510', '610', '710', '810', '910'])
|
|
]
|
|
assert(len(result.data) == 10)
|
|
for i in range(10):
|
|
row = (result.data)[i].split("\t")
|
|
assert(len(row) == 2)
|
|
assert(row[0] == str(i + 1))
|
|
assert(set(row[1].split("->")) == string_col[i])
|
|
|
|
# Test group_concat distinct with merge node
|
|
query = """select group_concat(distinct string_col, ' ') from alltypesagg
|
|
where int_col < 10"""
|
|
result = self.execute_query(query, exec_option, table_format=table_format)
|
|
assert(set((result.data)[0].split(" ")) == set(
|
|
['1', '2', '3', '4', '5', '6', '7', '8', '9']))
|
|
if not disable_codegen:
|
|
# Verify codegen was enabled for all four stages of the aggregation.
|
|
assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
|
|
|
|
def test_ndv(self):
|
|
"""Test the version of NDV() that accepts a scale value argument against
|
|
different column data types. The scale argument is an integer in range
|
|
[1, 10]."""
|
|
|
|
ndv_results = [
|
|
[2, 9, 96, 988, 980, 1000, 944, 1030, 1020, 990, 1010, 957, 1030, 1027, 9845, 9898],
|
|
[2, 9, 97, 957, 1008, 1016, 1005, 963, 994, 993, 1018, 1004, 963, 1014, 10210,
|
|
10280],
|
|
[2, 9, 98, 977, 1024, 1020, 975, 977, 1002, 991, 994, 1006, 977, 999, 10118, 9923],
|
|
[2, 9, 99, 986, 1009, 1011, 994, 980, 997, 994, 1002, 997, 980, 988, 10148, 9987],
|
|
[2, 9, 99, 995, 996, 1000, 998, 988, 995, 999, 997, 999, 988, 979, 9974, 9960],
|
|
[2, 9, 99, 998, 1005, 999, 1003, 994, 1000, 993, 999, 998, 994, 992, 9899, 9941],
|
|
[2, 9, 99, 993, 1001, 1007, 1000, 998, 1002, 997, 999, 998, 998, 999, 9923, 9931],
|
|
[2, 9, 99, 994, 998, 1002, 1002, 999, 998, 999, 997, 1000, 999, 997, 9937, 9973],
|
|
[2, 9, 99, 995, 997, 998, 1001, 999, 1001, 996, 997, 1000, 999, 998, 9989, 9981],
|
|
[2, 9, 99, 998, 998, 997, 999, 998, 1000, 998, 1000, 998, 998, 1000, 10000, 10003]
|
|
]
|
|
|
|
# For each possible integer value, genereate one query and test it out.
|
|
for i in range(1, 11):
|
|
ndv_stmt = """
|
|
select ndv(bool_col, {0}), ndv(tinyint_col, {0}),
|
|
ndv(smallint_col, {0}), ndv(int_col, {0}),
|
|
ndv(bigint_col, {0}), ndv(float_col, {0}),
|
|
ndv(double_col, {0}), ndv(string_col, {0}),
|
|
ndv(cast(double_col as decimal(5, 0)), {0}),
|
|
ndv(cast(double_col as decimal(10, 5)), {0}),
|
|
ndv(cast(double_col as decimal(20, 10)), {0}),
|
|
ndv(cast(double_col as decimal(38, 33)), {0}),
|
|
ndv(cast(string_col as varchar(20)), {0}),
|
|
ndv(cast(string_col as char(10)), {0}),
|
|
ndv(timestamp_col, {0}), ndv(id, {0})
|
|
from functional_parquet.alltypesagg""".format(i)
|
|
ndv_result = self.execute_query(ndv_stmt)
|
|
ndv_vals = ndv_result.data[0].split('\t')
|
|
|
|
# Verify that each ndv() value (one per column for a total of 11) is identical
|
|
# to the corresponding known value. Since NDV() invokes Hash64() hash function
|
|
# with a fixed seed value, ndv() result is deterministic.
|
|
for j in range(0, 11):
|
|
assert(ndv_results[i - 1][j] == int(ndv_vals[j]))
|
|
|
|
def test_grouping_sets(self, vector):
|
|
"""Tests for ROLLUP, CUBE and GROUPING SETS."""
|
|
if vector.get_value('table_format').file_format == 'hbase':
|
|
pytest.xfail(reason="IMPALA-283 - HBase null handling is inconsistent")
|
|
self.run_test_case('QueryTest/grouping-sets', vector)
|
|
|
|
def test_aggregation_limit(self, vector):
|
|
"""Test that limits are honoured when enforced by aggregation node."""
|
|
# 1-phase
|
|
result = self.execute_query(
|
|
"select distinct l_orderkey from tpch.lineitem limit 10",
|
|
vector.get_value('exec_option'))
|
|
assert len(result.data) == 10
|
|
|
|
# 2-phase with transpose
|
|
result = self.execute_query(
|
|
"select count(distinct l_discount), group_concat(distinct l_linestatus), "
|
|
"max(l_quantity) from tpch.lineitem group by l_tax, l_shipmode limit 10;",
|
|
vector.get_value('exec_option'))
|
|
assert len(result.data) == 10
|
|
|
|
|
|
class TestAggregationQueriesSingleFormat(ImpalaTestSuite):
|
|
"""Run the aggregation test suite similarly as TestAggregationQueries, but with
|
|
stricter constraint. Each test in this class only run against one table format by
|
|
setting uncompressed text dimension for all exploration strategy. However, they
|
|
may not necessarily target uncompressed text table format. This also run with codegen
|
|
enabled and disabled to exercise our non-codegen code, default batch_size,
|
|
and batch_size=1."""
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestAggregationQueriesSingleFormat, cls).add_test_dimensions()
|
|
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_exec_option_dimension(
|
|
batch_sizes=[0, 1], disable_codegen_options=[False, True]))
|
|
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_uncompressed_text_dimension(cls.get_workload()))
|
|
|
|
def test_parquet_count_star_optimization(self, vector, unique_database):
|
|
self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database)
|
|
|
|
def test_kudu_count_star_optimization(self, vector):
|
|
self.run_test_case('QueryTest/kudu-stats-agg', vector)
|
|
|
|
def test_orc_count_star_optimization(self, vector):
|
|
self.run_test_case('QueryTest/orc-stats-agg', vector)
|
|
|
|
def test_sampled_ndv(self):
|
|
"""The SAMPLED_NDV() function is inherently non-deterministic and cannot be
|
|
reasonably made deterministic with existing options so we test it separately.
|
|
The goal of this test is to ensure that SAMPLED_NDV() works on all data types
|
|
and returns approximately sensible estimates. It is not the goal of this test
|
|
to ensure tight error bounds on the NDV estimates. SAMPLED_NDV() is expected
|
|
be inaccurate on small data sets like the ones we use in this test."""
|
|
|
|
# NDV() is used a baseline to compare SAMPLED_NDV(). Both NDV() and SAMPLED_NDV()
|
|
# are based on HyperLogLog so NDV() is roughly the best that SAMPLED_NDV() can do.
|
|
# Expectations: All columns except 'id' and 'timestmap_col' have low NDVs and are
|
|
# expected to be reasonably accurate with SAMPLED_NDV(). For the two high-NDV columns
|
|
# SAMPLED_NDV() is expected to have high variance and error.
|
|
ndv_stmt = """
|
|
select ndv(bool_col), ndv(tinyint_col),
|
|
ndv(smallint_col), ndv(int_col),
|
|
ndv(bigint_col), ndv(float_col),
|
|
ndv(double_col), ndv(string_col),
|
|
ndv(cast(double_col as decimal(5, 0))),
|
|
ndv(cast(double_col as decimal(10, 5))),
|
|
ndv(cast(double_col as decimal(20, 10))),
|
|
ndv(cast(double_col as decimal(38, 33))),
|
|
ndv(cast(string_col as varchar(20))),
|
|
ndv(cast(string_col as char(10))),
|
|
ndv(timestamp_col), ndv(id)
|
|
from functional_parquet.alltypesagg"""
|
|
ndv_result = self.execute_query(ndv_stmt)
|
|
ndv_vals = ndv_result.data[0].split('\t')
|
|
|
|
for sample_perc in [0.1, 0.2, 0.5, 1.0]:
|
|
sampled_ndv_stmt = """
|
|
select sampled_ndv(bool_col, {0}), sampled_ndv(tinyint_col, {0}),
|
|
sampled_ndv(smallint_col, {0}), sampled_ndv(int_col, {0}),
|
|
sampled_ndv(bigint_col, {0}), sampled_ndv(float_col, {0}),
|
|
sampled_ndv(double_col, {0}), sampled_ndv(string_col, {0}),
|
|
sampled_ndv(cast(double_col as decimal(5, 0)), {0}),
|
|
sampled_ndv(cast(double_col as decimal(10, 5)), {0}),
|
|
sampled_ndv(cast(double_col as decimal(20, 10)), {0}),
|
|
sampled_ndv(cast(double_col as decimal(38, 33)), {0}),
|
|
sampled_ndv(cast(string_col as varchar(20)), {0}),
|
|
sampled_ndv(cast(string_col as char(10)), {0}),
|
|
sampled_ndv(timestamp_col, {0}), sampled_ndv(id, {0})
|
|
from functional_parquet.alltypesagg""".format(sample_perc)
|
|
sampled_ndv_result = self.execute_query(sampled_ndv_stmt)
|
|
sampled_ndv_vals = sampled_ndv_result.data[0].split('\t')
|
|
|
|
assert len(sampled_ndv_vals) == len(ndv_vals)
|
|
# Low NDV columns. We expect a reasonaby accurate estimate regardless of the
|
|
# sampling percent.
|
|
for i in range(0, 14):
|
|
self.appx_equals(int(sampled_ndv_vals[i]), int(ndv_vals[i]), 0.1)
|
|
# High NDV columns. We expect the estimate to have high variance and error.
|
|
# Since we give NDV() and SAMPLED_NDV() the same input data, i.e., we are not
|
|
# actually sampling for SAMPLED_NDV(), we expect the result of SAMPLED_NDV() to
|
|
# be bigger than NDV() proportional to the sampling percent.
|
|
# For example, the column 'id' is a PK so we expect the result of SAMPLED_NDV()
|
|
# with a sampling percent of 0.1 to be approximately 10x of the NDV().
|
|
for i in range(14, 16):
|
|
self.appx_equals(int(sampled_ndv_vals[i]) * sample_perc, int(ndv_vals[i]), 2.0)
|
|
|
|
|
|
class TestDistinctAggregation(ImpalaTestSuite):
|
|
"""Run the distinct aggregation test suite, with codegen and shuffle_distinct_exprs
|
|
enabled and disabled."""
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestDistinctAggregation, cls).add_test_dimensions()
|
|
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_exec_option_dimension_from_dict({
|
|
'disable_codegen': [False, True],
|
|
'shuffle_distinct_exprs': [False, True]
|
|
}))
|
|
|
|
if cls.exploration_strategy() == 'core':
|
|
cls.ImpalaTestMatrix.add_constraint(
|
|
lambda v: v.get_value('table_format').file_format == 'text'
|
|
and v.get_value('table_format').compression_codec == 'none')
|
|
|
|
def test_distinct(self, vector):
|
|
if vector.get_value('table_format').file_format == 'hbase':
|
|
pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
|
|
"making the result verication to fail.")
|
|
self.run_test_case('QueryTest/distinct', vector)
|
|
|
|
def test_multiple_distinct(self, vector):
|
|
if vector.get_value('table_format').file_format == 'hbase':
|
|
pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
|
|
"making the result verication to fail.")
|
|
self.run_test_case('QueryTest/multiple-distinct-aggs', vector)
|
|
|
|
|
|
class TestWideAggregationQueries(ImpalaTestSuite):
|
|
"""Test that aggregations with many grouping columns work"""
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestWideAggregationQueries, cls).add_test_dimensions()
|
|
|
|
cls.ImpalaTestMatrix.add_dimension(
|
|
create_exec_option_dimension(disable_codegen_options=[False, True]))
|
|
|
|
# File format doesn't matter for this test.
|
|
cls.ImpalaTestMatrix.add_constraint(
|
|
lambda v: v.get_value('table_format').file_format == 'parquet')
|
|
|
|
def test_many_grouping_columns(self, vector):
|
|
"""Test that an aggregate with many grouping columns works"""
|
|
table_format = vector.get_value('table_format')
|
|
exec_option = vector.get_value('exec_option')
|
|
query = "select distinct * from widetable_1000_cols"
|
|
|
|
# Ensure codegen is enabled.
|
|
result = self.execute_query(query, exec_option, table_format=table_format)
|
|
|
|
# All rows should be distinct.
|
|
expected_result = widetable.get_data(1000, 10, quote_strings=True)
|
|
|
|
types = result.column_types
|
|
labels = result.column_labels
|
|
expected = QueryTestResult(expected_result, types, labels, order_matters=False)
|
|
actual = QueryTestResult(parse_result_rows(result), types, labels,
|
|
order_matters=False)
|
|
assert expected == actual
|
|
|
|
|
|
class TestTPCHAggregationQueries(ImpalaTestSuite):
|
|
# Uses the TPC-H dataset in order to have larger aggregations.
|
|
|
|
@classmethod
|
|
def get_workload(cls):
|
|
return 'tpch'
|
|
|
|
@classmethod
|
|
def add_test_dimensions(cls):
|
|
super(TestTPCHAggregationQueries, cls).add_test_dimensions()
|
|
cls.ImpalaTestMatrix.add_constraint(lambda v:
|
|
v.get_value('table_format').file_format in ['parquet'])
|
|
|
|
def test_tpch_aggregations(self, vector):
|
|
self.run_test_case('tpch-aggregations', vector)
|
|
|
|
def test_tpch_passthrough_aggregations(self, vector):
|
|
self.run_test_case('tpch-passthrough-aggregations', vector)
|
|
|
|
def test_tpch_stress(self, vector):
|
|
self.run_test_case('tpch-stress-aggregations', vector)
|
|
|
|
def test_min_multiple_distinct(self, vector):
|
|
self.run_test_case('min-multiple-distinct-aggs', vector)
|