Files
impala/tests/query_test/test_aggregation.py
Csaba Ringhofer f98b697c7b IMPALA-13929: Make 'functional-query' the default workload in tests
This change adds get_workload() to ImpalaTestSuite and removes it
from all test suites that already returned 'functional-query'.
get_workload() is also removed from CustomClusterTestSuite which
used to return 'tpch'.

All other changes besides impala_test_suite.py and
custom_cluster_test_suite.py are just mass removals of
get_workload() functions.

The behavior is only changed in custom cluster tests that didn't
override get_workload(). By returning 'functional-query' instead
of 'tpch', exploration_strategy() will no longer return 'core' in
'exhaustive' test runs. See IMPALA-3947 on why workload affected
exploration_strategy. An example for affected test is
TestCatalogHMSFailures which was skipped both in core and exhaustive
runs before this change.

get_workload() functions that return a different workload than
'functional-query' are not changed - it is possible that some of
these also don't handle exploration_strategy() as expected, but
individually checking these tests is out of scope in this patch.

Change-Id: I9ec6c41ffb3a30e1ea2de773626d1485c69fe115
Reviewed-on: http://gerrit.cloudera.org:8080/22726
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
Reviewed-by: Daniel Becker <daniel.becker@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-04-08 07:12:55 +00:00

523 lines
24 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Validates all aggregate functions across all datatypes
#
from __future__ import absolute_import, division, print_function
from builtins import range
import pytest
from testdata.common import widetable
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (
ALL_CLUSTER_SIZES,
create_exec_option_dimension,
create_exec_option_dimension_from_dict,
create_uncompressed_text_dimension)
from tests.common.test_result_verifier import (
assert_codegen_enabled,
QueryTestResult,
parse_result_rows)
from tests.common.test_vector import ImpalaTestDimension
# Test dimensions for TestAggregation.
AGG_FUNCTIONS = ['sum', 'count', 'min', 'max', 'avg', 'ndv']
DATA_TYPES = ['int', 'bool', 'double', 'bigint', 'tinyint',
'smallint', 'float', 'timestamp', 'string']
# Lookup table for TestAggregation results.
result_lut = {
# tinyint
'sum-tinyint': 45000, 'avg-tinyint': 5, 'count-tinyint': 9000,
'min-tinyint': 1, 'max-tinyint': 9, 'ndv-tinyint': 9,
# smallint
'sum-smallint': 495000, 'avg-smallint': 50, 'count-smallint': 9900,
'min-smallint': 1, 'max-smallint': 99, 'ndv-smallint': 99,
# int
'sum-int': 4995000, 'avg-int': 500, 'count-int': 9990,
'min-int': 1, 'max-int': 999, 'ndv-int': 999,
# bigint
'sum-bigint': 49950000, 'avg-bigint': 5000, 'count-bigint': 9990,
'min-bigint': 10, 'max-bigint': 9990, 'ndv-bigint': 999,
# bool
'sum-bool': 5000, 'count-bool': 10000, 'min-bool': 'false',
'max-bool': 'true', 'avg-bool': 0.5, 'ndv-bool': 2,
# double
'sum-double': 50449500.0, 'count-double': 9990, 'min-double': 10.1,
'max-double': 10089.9, 'avg-double': 5050.0, 'ndv-double': 999,
# float
'sum-float': 5494500.0, 'count-float': 9990, 'min-float': 1.10,
'max-float': 1098.9, 'avg-float': 550.0, 'ndv-float': 999,
# timestamp
'count-timestamp': 10000, 'min-timestamp': '2010-01-01 00:00:00',
'max-timestamp': '2010-01-10 18:02:05.100000000',
'avg-timestamp': '2010-01-05 20:47:11.705080000', 'ndv-timestamp': 10000,
# string
'count-string': 10000, 'min-string': '0', 'max-string': '999', 'ndv-string': 999,
# distinct-tinyint
'sum-distinct-tinyint': 45, 'count-distinct-tinyint': 9, 'min-distinct-tinyint': 1,
'max-distinct-tinyint': 9, 'avg-distinct-tinyint': 5, 'ndv-distinct-tinyint': 9,
# distinct-smallint
'sum-distinct-smallint': 4950, 'count-distinct-smallint': 99,
'min-distinct-smallint': 1, 'max-distinct-smallint': 99,
'avg-distinct-smallint': 50, 'ndv-distinct-smallint': 99,
# distinct-int
'sum-distinct-int': 499500, 'count-distinct-int': 999, 'min-distinct-int': 1,
'max-distinct-int': 999, 'avg-distinct-int': 500, 'ndv-distinct-int': 999,
# distinct-bigint
'sum-distinct-bigint': 4995000, 'count-distinct-bigint': 999, 'min-distinct-bigint': 10,
'max-distinct-bigint': 9990, 'avg-distinct-bigint': 5000,
'ndv-distinct-bigint': 999,
# distinct-bool
'sum-distinct-bool': 1, 'count-distinct-bool': 2, 'min-distinct-bool': 'false',
'max-distinct-bool': 'true', 'avg-distinct-bool': 0.5, 'ndv-distinct-bool': 2,
# distinct-double
'sum-distinct-double': 5044950.0, 'count-distinct-double': 999,
'min-distinct-double': 10.1, 'max-distinct-double': 10089.9,
'avg-distinct-double': 5050.0, 'ndv-distinct-double': 999,
# distinct-float
'sum-distinct-float': 549450.0, 'count-distinct-float': 999, 'min-distinct-float': 1.1,
'max-distinct-float': 1098.9, 'avg-distinct-float': 550.0,
'ndv-distinct-float': 999,
# distinct-timestamp
'count-distinct-timestamp': 10000, 'min-distinct-timestamp': '2010-01-01 00:00:00',
'max-distinct-timestamp': '2010-01-10 18:02:05.100000000',
'avg-distinct-timestamp': '2010-01-05 20:47:11.705080000',
'ndv-distinct-timestamp': 10000,
# distinct-string
'count-distinct-string': 1000, 'min-distinct-string': '0',
'max-distinct-string': '999', 'ndv-distinct-string': 999,
}
class TestAggregation(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestAggregation, cls).add_test_dimensions()
# IMPALA-12383: Exercise with both num_nodes=0 (distributed plan) and num_nodes=1
# (single node plan).
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension(cluster_sizes=ALL_CLUSTER_SIZES)
)
# Add two more dimensions
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('agg_func', *AGG_FUNCTIONS))
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('data_type', *DATA_TYPES))
cls.ImpalaTestMatrix.add_constraint(lambda v: cls.is_valid_vector(v))
@classmethod
def is_valid_vector(cls, vector):
data_type, agg_func = vector.get_value('data_type'), vector.get_value('agg_func')
file_format = vector.get_value('table_format').file_format
if file_format not in ['parquet']: return False
if cls.exploration_strategy() == 'core':
# Reduce execution time when exploration strategy is 'core'
if vector.get_value('exec_option')['batch_size'] != 0: return False
# Avro doesn't have timestamp type
non_numeric = data_type in ['bool', 'string']
if file_format == 'avro' and data_type == 'timestamp':
return False
elif non_numeric and agg_func not in ['min', 'max', 'count', 'ndv']:
return False
elif agg_func == 'sum' and data_type == 'timestamp':
return False
return True
def test_aggregation(self, vector):
exec_option = vector.get_value('exec_option')
disable_codegen = exec_option['disable_codegen']
data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))
db_name = self.get_db_name_from_format(vector.get_table_format())
query = 'select {0}({1}_col) from {2}.alltypesagg where day is not null'.format(
agg_func, data_type, db_name)
result = self.execute_query(query, exec_option)
assert len(result.data) == 1
self.verify_agg_result(agg_func, data_type, False, result.data[0])
if not disable_codegen:
# Verify codegen was enabled for the preaggregation.
# It is deliberately disabled for the merge aggregation.
assert_codegen_enabled(result.runtime_profile, [1])
query = ('select {0}(DISTINCT({1}_col)) from {2}.alltypesagg '
'where day is not null').format(agg_func, data_type, db_name)
result = self.execute_query(query, exec_option)
assert len(result.data) == 1
self.verify_agg_result(agg_func, data_type, True, result.data[0])
if not disable_codegen:
# Verify codegen was enabled for all stages of the aggregation.
assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
def verify_agg_result(self, agg_func, data_type, distinct, actual_string):
key = '%s-%s%s' % (agg_func, 'distinct-' if distinct else '', data_type)
if agg_func == 'ndv':
# NDV is inherently approximate. Compare with some tolerance.
err = abs(result_lut[key] - int(actual_string))
rel_err = err / float(result_lut[key])
print(key, result_lut[key], actual_string,
abs(result_lut[key] - int(actual_string)))
assert err <= 1 or rel_err < 0.05
elif data_type in ('float', 'double') and agg_func != 'count':
# Compare with a margin of error.
delta = 1e6 if data_type == 'double' else 1e3
assert abs(result_lut[key] - float(actual_string)) < delta
elif data_type == 'timestamp' and agg_func != 'count':
# Strip off everything past 10s of microseconds.
ignore_digits = 4
assert result_lut[key][:-ignore_digits] == actual_string[:-ignore_digits]
else:
assert str(result_lut[key]) == actual_string
class TestAggregationQueries(ImpalaTestSuite):
"""Run the aggregation test suite, with codegen enabled and disabled, to exercise our
non-codegen code"""
@classmethod
def add_test_dimensions(cls):
super(TestAggregationQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension(disable_codegen_options=[False, True]))
if cls.exploration_strategy() == 'core':
cls.ImpalaTestMatrix.add_dimension(
create_uncompressed_text_dimension(cls.get_workload()))
def test_non_codegen_tinyint_grouping(self, vector, unique_database):
# Regression for IMPALA-901. The test includes an INSERT statement, so can only be run
# on INSERT-able formats - text only in this case, since the bug doesn't depend on the
# file format.
if vector.get_value('table_format').file_format == 'text' \
and vector.get_value('table_format').compression_codec == 'none':
self.client.execute("create table %s.imp_901 (col tinyint)" % unique_database)
self.run_test_case('QueryTest/aggregation_no_codegen_only', vector,
unique_database)
def test_aggregation(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
self.run_test_case('QueryTest/aggregation', vector)
def test_group_concat(self, vector):
"""group_concat distinct tests
Required to run directly in python because the order in which results will be
merged at the final, single-node aggregation step is non-deterministic (if the
first phase is running on multiple nodes). Need to pull the result apart and
compare the actual items)"""
exec_option = vector.get_value('exec_option')
disable_codegen = exec_option['disable_codegen']
table_format = vector.get_value('table_format')
# Test group_concat distinct with other aggregate function and groupings.
# expected result is the row: 2010,'1, 2, 3, 4','1-2-3-4','2|3|1|4',40,4
query = """select year, group_concat(distinct string_col),
group_concat(distinct string_col, '-'), group_concat(distinct string_col, '|'),
count(string_col), count(distinct string_col)
from alltypesagg where int_col < 5 and year = 2010 group by year"""
result = self.execute_query(query, exec_option, table_format=table_format)
row = (result.data)[0].split("\t")
assert(len(row) == 6)
assert(row[0] == '2010')
delimiter = [', ', '-', '|']
for i in range(1, 4):
assert(set(row[i].split(delimiter[i - 1])) == set(['1', '2', '3', '4']))
assert(row[4] == '40')
assert(row[5] == '4')
if not disable_codegen:
# Verify codegen was enabled for all three stages of the aggregation.
assert_codegen_enabled(result.runtime_profile, [1, 2, 4])
# Test group_concat distinct with arrow delimiter, with multiple rows
query = """select day, group_concat(distinct string_col, "->")
from (select * from alltypesagg where id % 100 = day order by id limit 99999) a
group by day order by day"""
result = self.execute_query(query, exec_option, table_format=table_format)
string_col = [
set(['1', '101', '201', '301', '401', '501', '601', '701', '801', '901']),
set(['2', '102', '202', '302', '402', '502', '602', '702', '802', '902']),
set(['3', '103', '203', '303', '403', '503', '603', '703', '803', '903']),
set(['4', '104', '204', '304', '404', '504', '604', '704', '804', '904']),
set(['5', '105', '205', '305', '405', '505', '605', '705', '805', '905']),
set(['6', '106', '206', '306', '406', '506', '606', '706', '806', '906']),
set(['7', '107', '207', '307', '407', '507', '607', '707', '807', '907']),
set(['8', '108', '208', '308', '408', '508', '608', '708', '808', '908']),
set(['9', '109', '209', '309', '409', '509', '609', '709', '809', '909']),
set(['10', '110', '210', '310', '410', '510', '610', '710', '810', '910'])
]
assert(len(result.data) == 10)
for i in range(10):
row = (result.data)[i].split("\t")
assert(len(row) == 2)
assert(row[0] == str(i + 1))
assert(set(row[1].split("->")) == string_col[i])
# Test group_concat distinct with merge node
query = """select group_concat(distinct string_col, ' ') from alltypesagg
where int_col < 10"""
result = self.execute_query(query, exec_option, table_format=table_format)
assert(set((result.data)[0].split(" ")) == set(
['1', '2', '3', '4', '5', '6', '7', '8', '9']))
if not disable_codegen:
# Verify codegen was enabled for all four stages of the aggregation.
assert_codegen_enabled(result.runtime_profile, [1, 2, 4, 6])
def test_ndv(self):
"""Test the version of NDV() that accepts a scale value argument against
different column data types. The scale argument is an integer in range
[1, 10]."""
ndv_results = [
[2, 9, 96, 988, 980, 1000, 944, 1030, 1020, 990, 1010, 957, 1030, 1027, 9845, 9898],
[2, 9, 97, 957, 1008, 1016, 1005, 963, 994, 993, 1018, 1004, 963, 1014, 10210,
10280],
[2, 9, 98, 977, 1024, 1020, 975, 977, 1002, 991, 994, 1006, 977, 999, 10118, 9923],
[2, 9, 99, 986, 1009, 1011, 994, 980, 997, 994, 1002, 997, 980, 988, 10148, 9987],
[2, 9, 99, 995, 996, 1000, 998, 988, 995, 999, 997, 999, 988, 979, 9974, 9960],
[2, 9, 99, 998, 1005, 999, 1003, 994, 1000, 993, 999, 998, 994, 992, 9899, 9941],
[2, 9, 99, 993, 1001, 1007, 1000, 998, 1002, 997, 999, 998, 998, 999, 9923, 9931],
[2, 9, 99, 994, 998, 1002, 1002, 999, 998, 999, 997, 1000, 999, 997, 9937, 9973],
[2, 9, 99, 995, 997, 998, 1001, 999, 1001, 996, 997, 1000, 999, 998, 9989, 9981],
[2, 9, 99, 998, 998, 997, 999, 998, 1000, 998, 1000, 998, 998, 1000, 10000, 10003]
]
# For each possible integer value, genereate one query and test it out.
for i in range(1, 11):
ndv_stmt = """
select ndv(bool_col, {0}), ndv(tinyint_col, {0}),
ndv(smallint_col, {0}), ndv(int_col, {0}),
ndv(bigint_col, {0}), ndv(float_col, {0}),
ndv(double_col, {0}), ndv(string_col, {0}),
ndv(cast(double_col as decimal(5, 0)), {0}),
ndv(cast(double_col as decimal(10, 5)), {0}),
ndv(cast(double_col as decimal(20, 10)), {0}),
ndv(cast(double_col as decimal(38, 33)), {0}),
ndv(cast(string_col as varchar(20)), {0}),
ndv(cast(string_col as char(10)), {0}),
ndv(timestamp_col, {0}), ndv(id, {0})
from functional_parquet.alltypesagg""".format(i)
ndv_result = self.execute_query(ndv_stmt)
ndv_vals = ndv_result.data[0].split('\t')
# Verify that each ndv() value (one per column for a total of 11) is identical
# to the corresponding known value. Since NDV() invokes Hash64() hash function
# with a fixed seed value, ndv() result is deterministic.
for j in range(0, 11):
assert(ndv_results[i - 1][j] == int(ndv_vals[j]))
def test_grouping_sets(self, vector):
"""Tests for ROLLUP, CUBE and GROUPING SETS."""
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(reason="IMPALA-283 - HBase null handling is inconsistent")
self.run_test_case('QueryTest/grouping-sets', vector)
def test_aggregation_limit(self, vector):
"""Test that limits are honoured when enforced by aggregation node."""
# 1-phase
result = self.execute_query(
"select distinct l_orderkey from tpch.lineitem limit 10",
vector.get_value('exec_option'))
assert len(result.data) == 10
# 2-phase with transpose
result = self.execute_query(
"select count(distinct l_discount), group_concat(distinct l_linestatus), "
"max(l_quantity) from tpch.lineitem group by l_tax, l_shipmode limit 10;",
vector.get_value('exec_option'))
assert len(result.data) == 10
class TestAggregationQueriesSingleFormat(ImpalaTestSuite):
"""Run the aggregation test suite similarly as TestAggregationQueries, but with
stricter constraint. Each test in this class only run against one table format by
setting uncompressed text dimension for all exploration strategy. However, they
may not necessarily target uncompressed text table format. This also run with codegen
enabled and disabled to exercise our non-codegen code, default batch_size,
and batch_size=1."""
@classmethod
def add_test_dimensions(cls):
super(TestAggregationQueriesSingleFormat, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension(
batch_sizes=[0, 1], disable_codegen_options=[False, True]))
cls.ImpalaTestMatrix.add_dimension(
create_uncompressed_text_dimension(cls.get_workload()))
def test_parquet_count_star_optimization(self, vector, unique_database):
self.run_test_case('QueryTest/parquet-stats-agg', vector, unique_database)
def test_kudu_count_star_optimization(self, vector):
self.run_test_case('QueryTest/kudu-stats-agg', vector)
def test_orc_count_star_optimization(self, vector):
self.run_test_case('QueryTest/orc-stats-agg', vector)
def test_sampled_ndv(self):
"""The SAMPLED_NDV() function is inherently non-deterministic and cannot be
reasonably made deterministic with existing options so we test it separately.
The goal of this test is to ensure that SAMPLED_NDV() works on all data types
and returns approximately sensible estimates. It is not the goal of this test
to ensure tight error bounds on the NDV estimates. SAMPLED_NDV() is expected
be inaccurate on small data sets like the ones we use in this test."""
# NDV() is used a baseline to compare SAMPLED_NDV(). Both NDV() and SAMPLED_NDV()
# are based on HyperLogLog so NDV() is roughly the best that SAMPLED_NDV() can do.
# Expectations: All columns except 'id' and 'timestmap_col' have low NDVs and are
# expected to be reasonably accurate with SAMPLED_NDV(). For the two high-NDV columns
# SAMPLED_NDV() is expected to have high variance and error.
ndv_stmt = """
select ndv(bool_col), ndv(tinyint_col),
ndv(smallint_col), ndv(int_col),
ndv(bigint_col), ndv(float_col),
ndv(double_col), ndv(string_col),
ndv(cast(double_col as decimal(5, 0))),
ndv(cast(double_col as decimal(10, 5))),
ndv(cast(double_col as decimal(20, 10))),
ndv(cast(double_col as decimal(38, 33))),
ndv(cast(string_col as varchar(20))),
ndv(cast(string_col as char(10))),
ndv(timestamp_col), ndv(id)
from functional_parquet.alltypesagg"""
ndv_result = self.execute_query(ndv_stmt)
ndv_vals = ndv_result.data[0].split('\t')
for sample_perc in [0.1, 0.2, 0.5, 1.0]:
sampled_ndv_stmt = """
select sampled_ndv(bool_col, {0}), sampled_ndv(tinyint_col, {0}),
sampled_ndv(smallint_col, {0}), sampled_ndv(int_col, {0}),
sampled_ndv(bigint_col, {0}), sampled_ndv(float_col, {0}),
sampled_ndv(double_col, {0}), sampled_ndv(string_col, {0}),
sampled_ndv(cast(double_col as decimal(5, 0)), {0}),
sampled_ndv(cast(double_col as decimal(10, 5)), {0}),
sampled_ndv(cast(double_col as decimal(20, 10)), {0}),
sampled_ndv(cast(double_col as decimal(38, 33)), {0}),
sampled_ndv(cast(string_col as varchar(20)), {0}),
sampled_ndv(cast(string_col as char(10)), {0}),
sampled_ndv(timestamp_col, {0}), sampled_ndv(id, {0})
from functional_parquet.alltypesagg""".format(sample_perc)
sampled_ndv_result = self.execute_query(sampled_ndv_stmt)
sampled_ndv_vals = sampled_ndv_result.data[0].split('\t')
assert len(sampled_ndv_vals) == len(ndv_vals)
# Low NDV columns. We expect a reasonaby accurate estimate regardless of the
# sampling percent.
for i in range(0, 14):
self.appx_equals(int(sampled_ndv_vals[i]), int(ndv_vals[i]), 0.1)
# High NDV columns. We expect the estimate to have high variance and error.
# Since we give NDV() and SAMPLED_NDV() the same input data, i.e., we are not
# actually sampling for SAMPLED_NDV(), we expect the result of SAMPLED_NDV() to
# be bigger than NDV() proportional to the sampling percent.
# For example, the column 'id' is a PK so we expect the result of SAMPLED_NDV()
# with a sampling percent of 0.1 to be approximately 10x of the NDV().
for i in range(14, 16):
self.appx_equals(int(sampled_ndv_vals[i]) * sample_perc, int(ndv_vals[i]), 2.0)
class TestDistinctAggregation(ImpalaTestSuite):
"""Run the distinct aggregation test suite, with codegen and shuffle_distinct_exprs
enabled and disabled."""
@classmethod
def add_test_dimensions(cls):
super(TestDistinctAggregation, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension_from_dict({
'disable_codegen': [False, True],
'shuffle_distinct_exprs': [False, True]
}))
if cls.exploration_strategy() == 'core':
cls.ImpalaTestMatrix.add_constraint(
lambda v: v.get_value('table_format').file_format == 'text'
and v.get_value('table_format').compression_codec == 'none')
def test_distinct(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
"making the result verication to fail.")
self.run_test_case('QueryTest/distinct', vector)
def test_multiple_distinct(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
"making the result verication to fail.")
self.run_test_case('QueryTest/multiple-distinct-aggs', vector)
class TestWideAggregationQueries(ImpalaTestSuite):
"""Test that aggregations with many grouping columns work"""
@classmethod
def add_test_dimensions(cls):
super(TestWideAggregationQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension(disable_codegen_options=[False, True]))
# File format doesn't matter for this test.
cls.ImpalaTestMatrix.add_constraint(
lambda v: v.get_value('table_format').file_format == 'parquet')
def test_many_grouping_columns(self, vector):
"""Test that an aggregate with many grouping columns works"""
table_format = vector.get_value('table_format')
exec_option = vector.get_value('exec_option')
query = "select distinct * from widetable_1000_cols"
# Ensure codegen is enabled.
result = self.execute_query(query, exec_option, table_format=table_format)
# All rows should be distinct.
expected_result = widetable.get_data(1000, 10, quote_strings=True)
types = result.column_types
labels = result.column_labels
expected = QueryTestResult(expected_result, types, labels, order_matters=False)
actual = QueryTestResult(parse_result_rows(result), types, labels,
order_matters=False)
assert expected == actual
class TestTPCHAggregationQueries(ImpalaTestSuite):
# Uses the TPC-H dataset in order to have larger aggregations.
@classmethod
def get_workload(cls):
return 'tpch'
@classmethod
def add_test_dimensions(cls):
super(TestTPCHAggregationQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ['parquet'])
def test_tpch_aggregations(self, vector):
self.run_test_case('tpch-aggregations', vector)
def test_tpch_passthrough_aggregations(self, vector):
self.run_test_case('tpch-passthrough-aggregations', vector)
def test_tpch_stress(self, vector):
self.run_test_case('tpch-stress-aggregations', vector)
def test_min_multiple_distinct(self, vector):
self.run_test_case('min-multiple-distinct-aggs', vector)