mirror of
https://github.com/apache/impala.git
synced 2026-01-26 12:02:21 -05:00
This patch adds support for having multiple aggregate functions in a single SELECT block that use DISTINCT over different sets of columns. Planner design: - The existing tree-based plan shape with a two-phased aggregation is maintained. - Existing plans are not changed. - Aggregates are grouped into 'aggregation classes' based on their expressions in the distinct portion which may be empty for non-distinct aggregates. - The aggregation framework is generalized to simultaneously process multiple aggregation classes within the tree-based plan. This process splits the results of different aggregation classes into separate rows, so a final aggregation is needed to transpose the results into the desired form. - Main challenge: Each aggregation class consumes and produces different tuples, so conceptually a union-type of tuples flows through the runtime. The tuple union is represented by a TupleRow with one tuple per aggregation class. Only one tuple in such a TupleRow is non-NULL. - Backend exec nodes in the aggregation plan will be aware of this tuple-union either explicitly in their implementation or by relying on expressions that distinguish the aggregation classes. - To distinguish the aggregation classes, e.g. in hash exchanges, CASE expressions are crafted to hash/group on the appropriate slots. Deferred FE work: - Beautify/condense the long CASE exprs - Push applicable conjuncts into individual aggregators before the transposition step - Added a few testing TODOs to reduce the size of this patch - Decide whether we want to change existing plans to the new model Execution design: - Previous patches separated out aggregation logic from the exec node into Aggregators. This is extended to support multiple Aggregators per node, with different grouping and aggregating functions. - There is a fast path for aggregations with only one aggregator, which leaves the execution essentially unchanged from before. 
- When there are multiple aggregators, the first aggregation node in the plan replicates its input to each aggregator. The output of this step is rows where only a single tuple is non-null, corresponding to the aggregator that produced the row. - A new expr is introduced, ValidTupleId, which takes one of these rows and returns which tuple is non-null. - For additional aggregation nodes, the input is split apart into 'mini-batches' according to which aggregator the row corresponds to. Testing: - Added analyzer and planner tests - Added end-to-end query tests - Ran hdfs/core tests - Added support in the query generator and ran it in a loop. Change-Id: I055402eaef6d81e5f70e850d9f8a621e766830a4 Reviewed-on: http://gerrit.cloudera.org:8080/10771 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
222 lines
5.1 KiB
Plaintext
222 lines
5.1 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Group lineitem by l_orderkey and count the rows per key, with buffer_pool_limit
# set to 34m. Only the first 10 keys (ascending) are checked.
# NOTE(review): a later case in this file repeats this query, differing only by a
# trailing ';' on the last line; confirm whether both copies are intended.
set buffer_pool_limit=34m;
|
|
select l_orderkey, count(*)
|
|
from lineitem
|
|
group by 1
|
|
order by 1 limit 10
|
|
---- RESULTS
|
|
1,6
|
|
2,1
|
|
3,6
|
|
4,1
|
|
5,3
|
|
6,1
|
|
7,7
|
|
32,6
|
|
33,4
|
|
34,3
|
|
---- TYPES
|
|
BIGINT, BIGINT
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling and passthrough were activated.
# Each row_regex below only matches when the named counter is followed by a
# parenthesised integer whose first digit is non-zero, i.e. a value of at least 1.
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test query with string grouping column and string agg columns
# NOTE(review): a later case in this file runs the same query with
# round(avg(l_tax),2) instead of avg(l_tax); confirm whether both variants are
# intended to coexist.
|
|
set buffer_pool_limit=82m;
|
|
# NOTE(review): presumably restricts execution to a single node; confirm.
set num_nodes=1;
|
|
select l_returnflag, l_orderkey, avg(l_tax), min(l_shipmode)
|
|
from lineitem
|
|
group by 1,2
|
|
order by 1,2 limit 3
|
|
---- RESULTS
|
|
'A',3,0.050000,'RAIL'
|
|
'A',5,0.030000,'AIR'
|
|
'A',6,0.030000,'TRUCK'
|
|
---- TYPES
|
|
STRING, BIGINT, DECIMAL, STRING
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling was activated: the SpilledPartitions counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Group lineitem by l_orderkey and count the rows per key, with buffer_pool_limit
# set to 34m. Only the first 10 keys (ascending) are checked.
set buffer_pool_limit=34m;
|
|
select l_orderkey, count(*)
|
|
from lineitem
|
|
group by 1
|
|
order by 1 limit 10;
|
|
---- RESULTS
|
|
1,6
|
|
2,1
|
|
3,6
|
|
4,1
|
|
5,3
|
|
6,1
|
|
7,7
|
|
32,6
|
|
33,4
|
|
34,3
|
|
---- TYPES
|
|
BIGINT, BIGINT
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling and passthrough were activated: each counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test query with string grouping column
# Rows are ordered by count(*) descending, so the expected rows are the five most
# frequent l_comment values. Leading and trailing spaces inside the quoted
# expected strings are part of the values.
|
|
set buffer_pool_limit=34m;
|
|
# NOTE(review): presumably restricts execution to a single node; confirm.
set num_nodes=1;
|
|
select l_comment, count(*)
|
|
from lineitem
|
|
group by 1
|
|
order by count(*) desc limit 5
|
|
---- RESULTS
|
|
' furiously',943
|
|
' carefully',893
|
|
' carefully ',875
|
|
'carefully ',854
|
|
' furiously ',845
|
|
---- TYPES
|
|
STRING, BIGINT
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling was activated: the SpilledPartitions counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test query with string grouping column and string agg columns
# The average is wrapped in round(..., 2); the expected rows accordingly list it
# with two decimal places (0.05, 0.03).
|
|
set buffer_pool_limit=82m;
|
|
# NOTE(review): presumably restricts execution to a single node; confirm.
set num_nodes=1;
|
|
select l_returnflag, l_orderkey, round(avg(l_tax),2), min(l_shipmode)
|
|
from lineitem
|
|
group by 1,2
|
|
order by 1,2 limit 3;
|
|
---- RESULTS
|
|
'A',3,0.05,'RAIL'
|
|
'A',5,0.03,'AIR'
|
|
'A',6,0.03,'TRUCK'
|
|
---- TYPES
|
|
STRING, BIGINT, DECIMAL, STRING
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling happened in the agg.
# The regex only matches when SpilledPartitions is followed by a parenthesised
# integer of at least 1.
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test with non-scalar intermediate state (avg() uses fixed intermediate value).
# avg() is taken over the grouping column itself, so every expected row repeats
# its key as the average (1,1 / 2,2 / ...).
|
|
set buffer_pool_limit=34m;
|
|
select l_orderkey, avg(l_orderkey)
|
|
from lineitem
|
|
group by 1
|
|
order by 1 limit 5
|
|
---- RESULTS
|
|
1,1
|
|
2,2
|
|
3,3
|
|
4,4
|
|
5,5
|
|
---- TYPES
|
|
BIGINT, DOUBLE
|
|
---- RUNTIME_PROFILE
|
|
# Verify that passthrough and spilling happened in the pre and merge agg.
# Each regex only matches when its counter is followed by a parenthesised integer
# of at least 1.
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test aggregation spill with group_concat distinct
# '|' is the separator passed to group_concat. Every expected row lists a single
# l_linestatus value, so the separator never shows up in RESULTS.
|
|
set buffer_pool_limit=50m;
|
|
select l_orderkey, count(*), group_concat(distinct l_linestatus, '|')
|
|
from lineitem
|
|
group by 1
|
|
order by 1 limit 10
|
|
---- RESULTS
|
|
1,6,'O'
|
|
2,1,'O'
|
|
3,6,'F'
|
|
4,1,'O'
|
|
5,3,'F'
|
|
6,1,'F'
|
|
7,7,'O'
|
|
32,6,'O'
|
|
33,4,'F'
|
|
34,3,'O'
|
|
---- TYPES
|
|
BIGINT, BIGINT, STRING
|
|
---- RUNTIME_PROFILE
|
|
# Verify that at least one of the aggs spilled.
# A single matching SpilledPartitions line with a value of at least 1 satisfies
# the regex.
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Test aggregation with minimum required reservation to exercise IMPALA-2708.
|
|
# Merge aggregation requires 17 buffers and preaggregation requires 16 buffers
|
|
# plus 1mb of hash tables. The buffer size is 256k for this test. The scan needs 24MB.
# Those figures add up to the limit set below:
# (17 + 16) buffers * 256k = 8.25MB, plus 1MB of hash tables, plus 24MB = 33.25MB.
|
|
# NOTE(review): max_row_size=256k appears to be what yields the 256k buffer size
# mentioned above; confirm.
set max_row_size=256k;
|
|
set buffer_pool_limit="33.25M";
|
|
select count(*)
|
|
from (select distinct * from orders) t
|
|
---- TYPES
|
|
BIGINT
|
|
---- RESULTS
|
|
1500000
|
|
---- RUNTIME_PROFILE
|
|
# Verify that spilling was activated: the SpilledPartitions counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Multiple distinct
# Two count(distinct ...) aggregates over different columns (l_orderkey and
# l_partkey) in one select list, with no group by.
|
|
set buffer_pool_limit=30M;
|
|
select count(distinct l_orderkey), count(distinct l_partkey) from lineitem
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
---- RESULTS
|
|
1500000,200000
|
|
---- RUNTIME_PROFILE
|
|
# Verify that passthrough and spilling were activated: each counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Multiple distinct with string col and group by
# Groups by l_linenumber; one distinct aggregate is over l_orderkey and the other
# over l_comment. Only the first five l_linenumber groups are checked.
|
|
set buffer_pool_limit=50m;
|
|
select l_linenumber, count(distinct l_orderkey), count(distinct l_comment) from lineitem
|
|
group by 1 order by 1 limit 5
|
|
---- TYPES
|
|
INT,BIGINT,BIGINT
|
|
---- RESULTS
|
|
1,1500000,1273334
|
|
2,1285828,1102714
|
|
3,1071394,929553
|
|
4,857015,753374
|
|
5,643287,574337
|
|
---- RUNTIME_PROFILE
|
|
# Verify that passthrough and spilling were activated: each counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Multiple distinct and non-distinct, with an intermediate tuple (avg)
# Mixes two distinct aggregates over different columns (avg over l_orderkey,
# count over l_partkey) with two non-distinct aggregates (sum, count).
# This case names tpch_parquet.lineitem explicitly, whereas earlier cases in the
# file use the unqualified lineitem.
|
|
set buffer_pool_limit=40m;
|
|
select avg(distinct l_orderkey), count(distinct l_partkey), sum(l_tax), count(l_suppkey)
|
|
from tpch_parquet.lineitem
|
|
---- TYPES
|
|
DOUBLE,BIGINT,DECIMAL,BIGINT
|
|
---- RESULTS
|
|
2999991.5,200000,240129.67,6001215
|
|
---- RUNTIME_PROFILE
|
|
# Verify that passthrough and spilling were activated: each counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|
|
---- QUERY
|
|
# Multiple distinct and non-distinct, with a group by
# Same aggregate mix as a select list of avg(distinct), count(distinct), sum and
# count, but grouped by l_linenumber; only the first five groups are checked.
# This case names tpch_parquet.lineitem explicitly.
|
|
set buffer_pool_limit=55m;
|
|
select l_linenumber, avg(distinct l_orderkey), count(distinct l_partkey), sum(l_tax), count(l_suppkey)
|
|
from tpch_parquet.lineitem
|
|
group by 1 order by 1 limit 5
|
|
---- TYPES
|
|
INT,DOUBLE,BIGINT,DECIMAL,BIGINT
|
|
---- RESULTS
|
|
1,2999991.5,199893,60025.25,1500000
|
|
2,3000615.766574534,199674,51457.37,1285828
|
|
3,3000079.631604246,199036,42879.38,1071394
|
|
4,3000330.547357981,197222,34279.36,857015
|
|
5,2999188.900650876,191905,25745.25,643287
|
|
---- RUNTIME_PROFILE
|
|
# Verify that passthrough and spilling were activated: each counter must be
# followed by a parenthesised integer of at least 1.
row_regex: .*RowsPassedThrough: .* \([1-9][0-9]*\)
|
|
row_regex: .*SpilledPartitions: .* \([1-9][0-9]*\)
|
|
====
|