Mirror of https://github.com/apache/impala.git (synced 2026-01-07 00:02:28 -05:00)
This patch redoes how the aggregation node is implemented. The functionality is now split between aggregation-node, agg-expr and aggregate-functions. This is a work in progress (there's still a lot of debug stuff I added that needs to be cleaned up) but it does pass the tests.

Aggregation-node is now very simple and only deals with the grouping part. Aggregate-expr serves as the glue between the agg node and the aggregate functions. The aggregate functions are implemented with the UDA interface. I've reimplemented our existing aggregate functions with this setup. For true UDAs, the binaries would be loaded in aggregate-expr.

This also includes some preliminary changes in the FE. We now need to annotate each AggNode as executing the update vs. merge phase (root aggs execute update, others execute merge) and whether it needs a finalize step (only the root does). This is more general than our builtins, which are too simple to need this structure.

There is a big TODO here to allow the intermediate types between agg nodes to change. For example, in distinct estimate, the input type is the column type and the output type is a bigint. We'd like the intermediate type to be CHAR(256). This is different because, currently, the intermediate type and output type have always been the same. We've hacked around this by having both the intermediate and output type be TYPE_STRING. I've left this for another patch (changing the BE to support this is trivial).

For aggregates that produce strings, we used to store some additional state past the end of the tuple. The layout was:
  <tuple> <length of 1st string buffer>, <length of 2nd string buffer>, etc.
The rationale for this is that we want to reuse the buffer for min/max and grow the buffer more quickly for group_concat. This breaks down the abstraction between agg-expr and agg-node and is not something UDAs can use in general. Rather than try to hack around this, I think the proper solution is for the intermediate type to not be StringValue and to contain the buffer length itself.

This patch also resurrects the distinct estimate code. The distinct estimate functions exercise all of the code paths.

Change-Id: Ic152a2cd03bc1713967673681e1e6204dcd80346
Reviewed-on: http://gerrit.ent.cloudera.com:8080/564
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: Nong Li <nong@cloudera.com>
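To make the update vs. merge phase annotation from the commit message above concrete, here is a minimal illustrative sketch of the UDA-style lifecycle for a simple AVG aggregate. This is not the patch's code: the real aggregate functions are implemented against the UDA interface in the C++ backend (aggregation-node / agg-expr / aggregate-functions), and the names below (AvgState, avg_update, avg_merge, avg_finalize) are hypothetical, existing only to show the shape of the three phases.

# Illustrative sketch only; the actual implementation lives in the C++ backend.
class AvgState:
  """Intermediate state for an AVG aggregate: running sum and row count."""
  def __init__(self):
    self.sum = 0.0
    self.count = 0

def avg_update(state, value):
  # Update phase: consume one raw input value.
  if value is not None:
    state.sum += value
    state.count += 1

def avg_merge(dst, src):
  # Merge phase: fold an intermediate state produced by another agg node
  # into this node's state.
  dst.sum += src.sum
  dst.count += src.count

def avg_finalize(state):
  # Finalize step: convert the intermediate state into the output type.
  # Only the agg node annotated as finalizing performs this step.
  return state.sum / state.count if state.count else None

A planner would annotate each AggNode with which of these phases it runs and whether it finalizes, which is exactly the per-node annotation the FE change above introduces.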
107 lines
4.4 KiB
Python
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# General Impala query tests
#
import logging
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import ImpalaTestSuite

class TestQueries(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def test_distinct(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail(("HBase returns columns in alphabetical order for select distinct *, "
                    "making result verification fail."))
    self.run_test_case('QueryTest/distinct', vector)

  def test_aggregation(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
    self.run_test_case('QueryTest/aggregation', vector)

  def test_exprs(self, vector):
    # TODO: Enable some of these tests for Avro if possible
    # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
    # support a timestamp type)
    if vector.get_value('table_format').file_format == 'avro':
      pytest.skip()
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
    self.run_test_case('QueryTest/exprs', vector)

  def test_hdfs_scan_node(self, vector):
    self.run_test_case('QueryTest/hdfs-scan-node', vector)

  def test_distinct_estimate(self, vector):
    # These results will vary slightly depending on how the values get split up,
    # so only run with 1 node and on text.
    if vector.get_value('table_format').file_format != 'text':
      pytest.skip()
    vector.get_value('exec_option')['num_nodes'] = 1
    self.run_test_case('QueryTest/distinct-estimate', vector)

  def test_scan_range(self, vector):
    self.run_test_case('QueryTest/hdfs-partitions', vector)

  def test_file_partitions(self, vector):
    self.run_test_case('QueryTest/hdfs-partitions', vector)

  def test_limit(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("IMPALA-283 - select count(*) produces inconsistent results")
    self.run_test_case('QueryTest/limit', vector)

  def test_top_n(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
    self.run_test_case('QueryTest/top-n', vector)

  def test_empty(self, vector):
    self.run_test_case('QueryTest/empty', vector)

  def test_subquery(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail(("jointbl does not have columns with unique values, "
                    "hbase collapses them"))
    self.run_test_case('QueryTest/subquery', vector)

  def test_subquery_limit(self, vector):
    self.run_test_case('QueryTest/subquery-limit', vector)

  def test_mixed_format(self, vector):
    self.run_test_case('QueryTest/mixed-format', vector)

  def test_views(self, vector):
    if vector.get_value('table_format').file_format == "hbase":
      pytest.xfail("TODO: Enable views tests for hbase")
    self.run_test_case('QueryTest/views', vector)

  def test_with_clause(self, vector):
    if vector.get_value('table_format').file_format == "hbase":
      pytest.xfail("TODO: Enable with clause tests for hbase")
    self.run_test_case('QueryTest/with-clause', vector)

  def test_values(self, vector):
    # These tests do not read data from tables, so only run them a single time (text/none).
    table_format = vector.get_value('table_format')
    if (table_format.file_format == 'text' and table_format.compression_codec == 'none'):
      self.run_test_case('QueryTest/values', vector)

  def test_misc(self, vector):
    table_format = vector.get_value('table_format')
    if table_format.file_format in ['hbase', 'rc', 'parquet']:
      msg = ("Failing on rc/snap/block despite resolution of IMP-624,IMP-503. "
             "Failing on parquet because nulltable does not exist in parquet")
      pytest.xfail(msg)
    self.run_test_case('QueryTest/misc', vector)

  def test_overflow(self, vector):
    table_format = vector.get_value('table_format')
    if table_format.file_format != 'text' or table_format.compression_codec != 'none':
      pytest.xfail("Test limited to text/none")
    self.run_test_case('QueryTest/overflow', vector)