Files
impala/tests/query_test/test_aggregation.py
Dan Hecht ffa7829b70 IMPALA-3918: Remove Cloudera copyrights and add ASF license header
For files that have a Cloudera copyright (and no other copyright
notice), make changes to follow the ASF source file header policy here:

http://www.apache.org/legal/src-headers.html#headers

Specifically:
1) Remove the Cloudera copyright.
2) Modify NOTICE.txt according to
   http://www.apache.org/legal/src-headers.html#notice
   to follow that format and add a line for Cloudera.
3) Replace or add the existing ASF license text with the one given
   on the website.

Much of this change was automatically generated via:

git grep -li 'Copyright.*Cloudera' > modified_files.txt
cat modified_files.txt | xargs perl -n -i -e 'print unless m#Copyright.*Cloudera#i;'
cat modified_files_txt | xargs fix_apache_license.py [1]

Some manual fixups were performed following those steps, especially when
license text was completely missing from the file.

[1] https://gist.github.com/anonymous/ff71292094362fc5c594 with minor
    modification to ORIG_LICENSE to match Impala's license text.

Change-Id: I2e0bd8420945b953e1b806041bea4d72a3943d86
Reviewed-on: http://gerrit.cloudera.org:8080/3779
Reviewed-by: Dan Hecht <dhecht@cloudera.com>
Tested-by: Internal Jenkins
2016-08-09 08:19:41 +00:00

205 lines
8.9 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Validates all aggregate functions across all datatypes
#
import pytest
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (
create_exec_option_dimension,
create_uncompressed_text_dimension)
from tests.common.test_vector import TestDimension
from tests.common.skip import SkipIfOldAggsJoins
agg_functions = ['sum', 'count', 'min', 'max', 'avg']
data_types = ['int', 'bool', 'double', 'bigint', 'tinyint',
'smallint', 'float', 'timestamp']
result_lut = {
# TODO: Add verification for other types
'sum-tinyint': 45000, 'avg-tinyint': 5, 'count-tinyint': 9000,
'min-tinyint': 1, 'max-tinyint': 9,
'sum-smallint': 495000, 'avg-smallint': 50, 'count-smallint': 9900,
'min-smallint': 1, 'max-smallint': 99,
'sum-int': 4995000, 'avg-int': 500, 'count-int': 9990,
'min-int': 1, 'max-int': 999,
'sum-bigint': 49950000, 'avg-bigint': 5000, 'count-bigint': 9990,
'min-bigint': 10, 'max-bigint': 9990,
}
class TestAggregation(ImpalaTestSuite):
@classmethod
def get_workload(self):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestAggregation, cls).add_test_dimensions()
# Add two more dimensions
cls.TestMatrix.add_dimension(TestDimension('agg_func', *agg_functions))
cls.TestMatrix.add_dimension(TestDimension('data_type', *data_types))
cls.TestMatrix.add_constraint(lambda v: cls.is_valid_vector(v))
@classmethod
def is_valid_vector(cls, vector):
data_type, agg_func = vector.get_value('data_type'), vector.get_value('agg_func')
file_format = vector.get_value('table_format').file_format
if file_format not in ['parquet']: return False
if cls.exploration_strategy() == 'core':
# Reduce execution time when exploration strategy is 'core'
if vector.get_value('exec_option')['batch_size'] != 0: return False
# Avro doesn't have timestamp type
if file_format == 'avro' and data_type == 'timestamp':
return False
elif agg_func not in ['min', 'max', 'count'] and data_type == 'bool':
return False
elif agg_func == 'sum' and data_type == 'timestamp':
return False
return True
def test_aggregation(self, vector):
data_type, agg_func = (vector.get_value('data_type'), vector.get_value('agg_func'))
query = 'select %s(%s_col) from alltypesagg where day is not null' % (agg_func,
data_type)
result = self.execute_scalar(query, vector.get_value('exec_option'),
table_format=vector.get_value('table_format'))
if 'int' in data_type:
assert result_lut['%s-%s' % (agg_func, data_type)] == int(result)
# AVG
if vector.get_value('data_type') == 'timestamp' and\
vector.get_value('agg_func') == 'avg':
return
query = 'select %s(DISTINCT(%s_col)) from alltypesagg where day is not null' % (
agg_func, data_type)
result = self.execute_scalar(query, vector.get_value('exec_option'))
class TestAggregationQueries(ImpalaTestSuite):
"""Run the aggregation test suite, with codegen enabled and disabled, to exercise our
non-codegen code"""
@classmethod
def get_workload(self):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestAggregationQueries, cls).add_test_dimensions()
cls.TestMatrix.add_dimension(
create_exec_option_dimension(disable_codegen_options=[False, True]))
if cls.exploration_strategy() == 'core':
cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
@pytest.mark.execute_serially
def test_non_codegen_tinyint_grouping(self, vector):
# Regression for IMPALA-901. The test includes an INSERT statement, so can only be run
# on INSERT-able formats - text only in this case, since the bug doesn't depend on the
# file format.
if vector.get_value('table_format').file_format == 'text' \
and vector.get_value('table_format').compression_codec == 'none':
self.run_test_case('QueryTest/aggregation_no_codegen_only', vector)
def test_aggregation(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
self.run_test_case('QueryTest/aggregation', vector)
def test_distinct(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail("HBase returns columns in alphabetical order for select distinct *, "
"making the result verication to fail.")
self.run_test_case('QueryTest/distinct', vector)
def test_group_concat(self, vector):
"""group_concat distinct tests
Required to run directly in python because the order in which results will be
merged at the final, single-node aggregation step is non-deterministic (if the
first phase is running on multiple nodes). Need to pull the result apart and
compare the actual items)"""
exec_option = vector.get_value('exec_option')
table_format = vector.get_value('table_format')
# Test group_concat distinct with other aggregate function and groupings.
# expected result is the row: 2010,'1, 2, 3, 4','1-2-3-4','2|3|1|4',40,4
query = """select year, group_concat(distinct string_col),
group_concat(distinct string_col, '-'), group_concat(distinct string_col, '|'),
count(string_col), count(distinct string_col)
from alltypesagg where int_col < 5 and year = 2010 group by year"""
result = self.execute_query(query, exec_option, table_format=table_format)
row = (result.data)[0].split("\t")
assert(len(row) == 6)
assert(row[0] == '2010')
delimiter = [', ', '-', '|']
for i in range(1, 4):
assert(set(row[i].split(delimiter[i-1])) == set(['1', '2', '3', '4']))
assert(row[4] == '40')
assert(row[5] == '4')
# Test group_concat distinct with arrow delimiter, with multiple rows
query = """select day, group_concat(distinct string_col, "->")
from (select * from alltypesagg where id % 100 = day order by id limit 99999) a
group by day order by day"""
result = self.execute_query(query, exec_option, table_format=table_format)
string_col = []
string_col.append(set(['1','101','201','301','401','501','601','701','801','901']))
string_col.append(set(['2','102','202','302','402','502','602','702','802','902']))
string_col.append(set(['3','103','203','303','403','503','603','703','803','903']))
string_col.append(set(['4','104','204','304','404','504','604','704','804','904']))
string_col.append(set(['5','105','205','305','405','505','605','705','805','905']))
string_col.append(set(['6','106','206','306','406','506','606','706','806','906']))
string_col.append(set(['7','107','207','307','407','507','607','707','807','907']))
string_col.append(set(['8','108','208','308','408','508','608','708','808','908']))
string_col.append(set(['9','109','209','309','409','509','609','709','809','909']))
string_col.append(set(['10','110','210','310','410','510','610','710','810','910']))
assert(len(result.data) == 10)
for i in range(10):
row = (result.data)[i].split("\t")
assert(len(row) == 2)
assert(row[0] == str(i+1))
assert(set(row[1].split("->")) == string_col[i])
# Test group_concat distinct with merge node
query = """select group_concat(distinct string_col, ' ') from alltypesagg
where int_col < 10"""
result = self.execute_query(query, exec_option, table_format=table_format)
assert(set((result.data)[0].split(" ")) == set(['1','2','3','4','5','6','7','8','9']))
class TestTPCHAggregationQueries(ImpalaTestSuite):
# Uses the TPC-H dataset in order to have larger aggregations.
@classmethod
def get_workload(cls):
return 'tpch'
@classmethod
def add_test_dimensions(cls):
super(TestTPCHAggregationQueries, cls).add_test_dimensions()
cls.TestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format in ['parquet'])
def test_tpch_aggregations(self, vector):
self.run_test_case('tpch-aggregations', vector)
@SkipIfOldAggsJoins.passthrough_preagg
def test_tpch_passthrough_aggregations(self, vector):
self.run_test_case('tpch-passthrough-aggregations', vector)