Files
impala/tests/query_test/test_queries.py
Nong Li 15db34e356 AggregationNode refactoring
This patch redoes how the aggregation node is implemented. The functionality is
now split between aggregation-node, agg-expr and aggregate-functions. This is a working
progress (there's still a lot of debug stuff I added that needs to be cleaned up) but
it does pass the tests.

Aggregation-node is now very simple and now only deals with the grouping part.
Aggregate-expr serves as the glue between the agg node and the aggregate functions.
The aggregation functions are implemented with the UDA interface. I've reimplemented
our existing aggregate functions with this setup. For true UDAs, the binaries would be
loaded in aggregate-expr.

This also includes some preliminary changes in the FE. We now need to annotate each
AggNode as executing the update vs. merge phase (root aggs execute update, others
execute merge) and if it needs a finalize step (only the root does). This is more
general than our builtins which are too simple to need this structure.

There is a big TODO here to allow the intermediate types between agg nodes to change.
For example, in distinct estimate, the input type is the column type and the output type
is a bigint. We'd like the intermediate type to be CHAR(256). This is different since
currently, the intermediate type and output type have always been the same. We've hacked
around this by having both the intermediate and output type be TYPE_STRING. I've left
this for another patch (changing the BE to support this is trivial).
For aggregates that result in strings, we used to store some additional stuff past the
end of the tuple. The layout was:
<tuple> <length of 1st string buffer>,<length of 2nd string buffer>, etc

The rationale for this is that we want to reuse the buffer for min/max and grow the buffer
more quickly for group_concat. This breaks down the abstraction between agg-expr and
agg-node and is not something UDAs can use in general. Rather than try to hack around
this, I think the proper solution is to the intermediate type not be StringValue and
to contain the buffer length itself.

This patch also resurrects the distinct estimate code. The distinct estimate functions
exercise all of the code paths.

Change-Id: Ic152a2cd03bc1713967673681e1e6204dcd80346
Reviewed-on: http://gerrit.ent.cloudera.com:8080/564
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: Nong Li <nong@cloudera.com>
2014-01-08 10:53:13 -08:00

107 lines
4.4 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# General Impala query tests
#
import logging
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import ImpalaTestSuite
class TestQueries(ImpalaTestSuite):
@classmethod
def get_workload(cls):
return 'functional-query'
def test_distinct(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(("HBase returns columns in alphabetical order for select distinct *, "
"making result verication fail."))
self.run_test_case('QueryTest/distinct', vector)
def test_aggregation(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
self.run_test_case('QueryTest/aggregation', vector)
def test_exprs(self, vector):
# TODO: Enable some of these tests for Avro if possible
# Don't attempt to evaluate timestamp expressions with Avro tables (which)
# don't support a timestamp type)"
if vector.get_value('table_format').file_format == 'avro':
pytest.skip()
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
self.run_test_case('QueryTest/exprs', vector)
def test_hdfs_scan_node(self, vector):
self.run_test_case('QueryTest/hdfs-scan-node', vector)
def test_distinct_estimate(self, vector):
# These results will vary slightly depending on how the values get split up
# so only run with 1 node and on text.
if vector.get_value('table_format').file_format != 'text':
pytest.skip()
vector.get_value('exec_option')['num_nodes'] = 1
self.run_test_case('QueryTest/distinct-estimate', vector)
def test_scan_range(self, vector):
self.run_test_case('QueryTest/hdfs-partitions', vector)
def test_file_partitions(self, vector):
self.run_test_case('QueryTest/hdfs-partitions', vector)
def test_limit(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail("IMPALA-283 - select count(*) produces inconsistent results")
self.run_test_case('QueryTest/limit', vector)
def test_top_n(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
self.run_test_case('QueryTest/top-n', vector)
def test_empty(self, vector):
self.run_test_case('QueryTest/empty', vector)
def test_subquery(self, vector):
if vector.get_value('table_format').file_format == 'hbase':
pytest.xfail(("jointbl does not have columns with unique values, "
"hbase collapses them"))
self.run_test_case('QueryTest/subquery', vector)
def test_subquery_limit(self, vector):
self.run_test_case('QueryTest/subquery-limit', vector)
def test_mixed_format(self, vector):
self.run_test_case('QueryTest/mixed-format', vector)
def test_views(self, vector):
if vector.get_value('table_format').file_format == "hbase":
pytest.xfail("TODO: Enable views tests for hbase")
self.run_test_case('QueryTest/views', vector)
def test_with_clause(self, vector):
if vector.get_value('table_format').file_format == "hbase":
pytest.xfail("TODO: Enable with clause tests for hbase")
self.run_test_case('QueryTest/with-clause', vector)
def test_values(self, vector):
# These tests do not read data from tables, so only run them a single time (text/none).
table_format = vector.get_value('table_format')
if (table_format.file_format == 'text' and table_format.compression_codec == 'none'):
self.run_test_case('QueryTest/values', vector)
def test_misc(self, vector):
table_format = vector.get_value('table_format')
if table_format.file_format in ['hbase', 'rc', 'parquet']:
msg = ("Failing on rc/snap/block despite resolution of IMP-624,IMP-503. "
"Failing on parquet because nulltable does not exist in parquet")
pytest.xfail(msg)
self.run_test_case('QueryTest/misc', vector)
def test_overflow(self, vector):
table_format = vector.get_value('table_format')
if table_format.file_format != 'text' or table_format.compression_codec != 'none':
pytest.xfail("Test limited to text/none")
self.run_test_case('QueryTest/overflow', vector)