impala/testdata/workloads/functional-planner/queries/PlannerTest/aggregation.test
commit f5b7842414 by Tim Armstrong

IMPALA-2502: don't redundantly repartition grouping aggregations

Grouping aggregations previously always repartitioned their input,
even if preceding joins or aggregations had already partitioned the
data on the required key (or an equivalent key). This patch checks
whether the data is already partitioned on the required exprs (or
equivalent ones), and if so skips the pre-aggregation and does only
a merge aggregation.

The patch also does some refactoring of the aggregation planning in
DistributedPlanner to make it easier to implement the change.

Includes planner tests for the three cases that are affected:
grouping aggregations, non-grouping distinct aggregations and
grouping distinct aggregations.
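
For example (tested at the end of this file), in

  select straight_join c_custkey, count(*)
  from tpch_parquet.customer
    inner join [shuffle] tpch_parquet.orders on c_custkey = o_custkey
  group by 1

the shuffle join already hash-partitions its output on c_custkey, so
the grouping aggregation can run as a single finalize step in the
join fragment instead of pre-aggregating and re-partitioning.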

Change-Id: Iffdcfd3629b8a69bd23915e1adba3b8323cbbaef
Reviewed-on: http://gerrit.cloudera.org:8080/2414
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins
2016-03-15 09:21:22 +00:00

# basic aggregation
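# The distributed plan uses the standard two-phase pattern: each scan
# fragment computes a local AGGREGATE over its rows, and a merge
# AGGREGATE above an UNPARTITIONED exchange combines the partial results.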
select count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col),
avg(tinyint_col)
from functional.alltypesagg
---- PLAN
01:AGGREGATE [FINALIZE]
| output: count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
03:AGGREGATE [FINALIZE]
| output: count:merge(*), count:merge(tinyint_col), min:merge(tinyint_col), max:merge(tinyint_col), sum:merge(tinyint_col), avg:merge(tinyint_col)
|
02:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
| output: count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# with grouping
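# Three steps in the distributed plan: a streaming pre-aggregation in the
# scan fragment, a HASH exchange on the grouping exprs, and a merge
# aggregation producing the final groups. The ordinals "2, 1" resolve to
# bigint_col, tinyint_col.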
select tinyint_col, bigint_col, count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col),
avg(tinyint_col)
from functional.alltypesagg
group by 2, 1
---- PLAN
01:AGGREGATE [FINALIZE]
| output: count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
| group by: bigint_col, tinyint_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*), min:merge(tinyint_col), max:merge(tinyint_col), sum:merge(tinyint_col), avg:merge(tinyint_col)
| group by: bigint_col, tinyint_col
|
02:EXCHANGE [HASH(bigint_col,tinyint_col)]
|
01:AGGREGATE [STREAMING]
| output: count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
| group by: bigint_col, tinyint_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# avg substitution
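# The HAVING and ORDER BY exprs are substituted against the aggregation
# output, so count(id) and avg(zip) are materialized by the AGGREGATE node
# even though only avg(id) appears in the select list.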
select avg(id)
from functional.testtbl
having count(id) > 0
order by avg(zip) limit 10
---- PLAN
02:TOP-N [LIMIT=10]
| order by: avg(zip) ASC
|
01:AGGREGATE [FINALIZE]
| output: avg(id), count(id), avg(zip)
| having: count(id) > 0
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 files=0 size=0B
---- DISTRIBUTEDPLAN
02:TOP-N [LIMIT=10]
| order by: avg(zip) ASC
|
04:AGGREGATE [FINALIZE]
| output: avg:merge(id), count:merge(id), avg:merge(zip)
| having: count(id) > 0
|
03:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
| output: avg(id), count(id), avg(zip)
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 files=0 size=0B
====
# Test correct removal of redundant group-by expressions (IMPALA-817)
select int_col + int_col, int_col * int_col
from functional.alltypesagg
group by int_col + int_col, int_col * int_col, int_col + int_col
having (int_col * int_col) < 0 limit 10
---- PLAN
01:AGGREGATE [FINALIZE]
| group by: int_col + int_col, int_col * int_col
| having: int_col * int_col < 0
| limit: 10
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
04:EXCHANGE [UNPARTITIONED]
| limit: 10
|
03:AGGREGATE [FINALIZE]
| group by: int_col + int_col, int_col * int_col
| having: int_col * int_col < 0
| limit: 10
|
02:EXCHANGE [HASH(int_col + int_col,int_col * int_col)]
|
01:AGGREGATE [STREAMING]
| group by: int_col + int_col, int_col * int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# Tests that a having predicate triggers slot materialization (IMPALA-846).
select count(*) from
functional.alltypes t1 inner join functional.alltypestiny t2
on t1.smallint_col = t2.smallint_col
group by t1.tinyint_col, t2.smallint_col
having count(t2.int_col) = count(t1.bigint_col)
---- PLAN
03:AGGREGATE [FINALIZE]
| output: count(*), count(t2.int_col), count(t1.bigint_col)
| group by: t1.tinyint_col, t2.smallint_col
| having: count(t2.int_col) = count(t1.bigint_col)
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t1.smallint_col = t2.smallint_col
| runtime filters: RF000 <- t2.smallint_col
|
|--01:SCAN HDFS [functional.alltypestiny t2]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> t1.smallint_col
====
# Tests proper slot materialization of agg-tuple slots for avg (IMP-1271).
# 't.x > 10' is picked up as an unassigned conjunct, and not as a binding
# predicate because avg gets rewritten into an expr against two slots
# (and getBoundPredicates() cannot handle multi-slot predicates).
select 1 from
(select int_col, avg(bigint_col) x from functional.alltypes
group by int_col) t
where t.x > 10
---- PLAN
01:AGGREGATE [FINALIZE]
| output: avg(bigint_col)
| group by: int_col
| having: avg(bigint_col) > 10
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# test distributed aggregation over unions (IMPALA-831)
# non-distinct agg without grouping over a union
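# The aggregation sits in the same fragment as the union and both scans,
# so partial counts cover both inputs before the merge above the
# UNPARTITIONED exchange.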
select count(*) from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
03:AGGREGATE [FINALIZE]
| output: count(*)
| limit: 10
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
05:AGGREGATE [FINALIZE]
| output: count:merge(*)
| limit: 10
|
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
| output: count(*)
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# non-distinct agg with grouping over a union
select count(*) from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
03:AGGREGATE [FINALIZE]
| output: count(*)
| group by: bigint_col
| limit: 10
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
06:EXCHANGE [UNPARTITIONED]
| limit: 10
|
05:AGGREGATE [FINALIZE]
| output: count:merge(*)
| group by: t.bigint_col
| limit: 10
|
04:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
| output: count(*)
| group by: bigint_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# distinct agg without grouping over a union
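# count(distinct) is planned as two aggregations: the first groups on
# int_col to remove duplicates, the second counts the distinct groups.
# In the distributed plan each phase is split into local and merge steps
# across a HASH(int_col) and an UNPARTITIONED exchange.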
select count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
04:AGGREGATE [FINALIZE]
| output: count(int_col)
| limit: 10
|
03:AGGREGATE
| group by: int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
08:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| limit: 10
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(int_col)
|
06:AGGREGATE
| group by: int_col
|
05:EXCHANGE [HASH(int_col)]
|
03:AGGREGATE [STREAMING]
| group by: int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# distinct agg with grouping over a union
select count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
04:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: t.bigint_col
| limit: 10
|
03:AGGREGATE
| group by: bigint_col, int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
07:EXCHANGE [UNPARTITIONED]
| limit: 10
|
04:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: t.bigint_col
| limit: 10
|
06:AGGREGATE
| group by: t.bigint_col, int_col
|
05:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
| group by: bigint_col, int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg without grouping over a union
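# The non-distinct count(smallint_col) is computed in the first-phase
# aggregation (grouped by int_col) and combined in the second phase,
# which is why count:merge appears even in the single-node plan.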
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
04:AGGREGATE [FINALIZE]
| output: count(int_col), count:merge(smallint_col)
| limit: 10
|
03:AGGREGATE
| output: count(smallint_col)
| group by: int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
08:AGGREGATE [FINALIZE]
| output: count:merge(int_col), count:merge(smallint_col)
| limit: 10
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(int_col), count:merge(smallint_col)
|
06:AGGREGATE
| output: count:merge(smallint_col)
| group by: int_col
|
05:EXCHANGE [HASH(int_col)]
|
03:AGGREGATE [STREAMING]
| output: count(smallint_col)
| group by: int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg with grouping over a union
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
04:AGGREGATE [FINALIZE]
| output: count(int_col), count:merge(smallint_col)
| group by: t.bigint_col
| limit: 10
|
03:AGGREGATE
| output: count(smallint_col)
| group by: bigint_col, int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
07:EXCHANGE [UNPARTITIONED]
| limit: 10
|
04:AGGREGATE [FINALIZE]
| output: count(int_col), count:merge(smallint_col)
| group by: t.bigint_col
| limit: 10
|
06:AGGREGATE
| output: count:merge(smallint_col)
| group by: t.bigint_col, int_col
|
05:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
| output: count(smallint_col)
| group by: bigint_col, int_col
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg with grouping over a union distinct
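# The UNION DISTINCT introduces its own duplicate-removing aggregation
# over all union columns below the two-phase distinct aggregation, giving
# three aggregation levels in the single-node plan.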
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union distinct
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
05:AGGREGATE [FINALIZE]
| output: count(int_col), count:merge(smallint_col)
| group by: t.bigint_col
| limit: 10
|
04:AGGREGATE
| output: count(smallint_col)
| group by: bigint_col, int_col
|
03:AGGREGATE [FINALIZE]
| group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
10:EXCHANGE [UNPARTITIONED]
| limit: 10
|
05:AGGREGATE [FINALIZE]
| output: count(int_col), count:merge(smallint_col)
| group by: t.bigint_col
| limit: 10
|
09:AGGREGATE
| output: count:merge(smallint_col)
| group by: t.bigint_col, int_col
|
08:EXCHANGE [HASH(t.bigint_col)]
|
04:AGGREGATE [STREAMING]
| output: count(smallint_col)
| group by: bigint_col, int_col
|
07:AGGREGATE [FINALIZE]
| group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
06:EXCHANGE [HASH(id,bool_col,tinyint_col,smallint_col,int_col,bigint_col,float_col,double_col,date_string_col,string_col,timestamp_col,year,month)]
|
03:AGGREGATE [STREAMING]
| group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
00:UNION
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# test that aggregations are not placed below an unpartitioned exchange with a limit
select count(*) from (select * from functional.alltypes limit 10) t
---- PLAN
01:AGGREGATE [FINALIZE]
| output: count(*)
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
limit: 10
---- DISTRIBUTEDPLAN
01:AGGREGATE [FINALIZE]
| output: count(*)
|
02:EXCHANGE [UNPARTITIONED]
| limit: 10
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
limit: 10
====
# test that aggregations are not placed below an unpartitioned exchange with a limit,
# here with the limit applied to the result of a union
select count(*) from
(select * from functional.alltypes
union all
(select * from functional.alltypessmall) limit 10) t
---- PLAN
03:AGGREGATE [FINALIZE]
| output: count(*)
|
00:UNION
| limit: 10
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
03:AGGREGATE [FINALIZE]
| output: count(*)
|
04:EXCHANGE [UNPARTITIONED]
| limit: 10
|
00:UNION
| limit: 10
|
|--02:SCAN HDFS [functional.alltypessmall]
| partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# test that limits are applied at the top-level merge aggregation node for non-grouping
# distinct aggregation (IMPALA-1802)
select * from (
select count(distinct cnt) from
(select count(distinct t1.id) as cnt
from functional.alltypesagg t1 join functional.alltypestiny t2 on t1.id = t2.id
limit 10) t
limit 2) v
limit 1
---- PLAN
06:AGGREGATE [FINALIZE]
| output: count(cnt)
| limit: 1
|
05:AGGREGATE
| group by: count(t1.id)
|
04:AGGREGATE [FINALIZE]
| output: count(t1.id)
| limit: 10
|
03:AGGREGATE
| group by: t1.id
|
02:HASH JOIN [INNER JOIN]
| hash predicates: t1.id = t2.id
| runtime filters: RF000 <- t2.id
|
|--01:SCAN HDFS [functional.alltypestiny t2]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg t1]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> t1.id
---- DISTRIBUTEDPLAN
06:AGGREGATE [FINALIZE]
| output: count(cnt)
| limit: 1
|
05:AGGREGATE
| group by: count(t1.id)
|
11:AGGREGATE [FINALIZE]
| output: count:merge(t1.id)
| limit: 10
|
10:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(t1.id)
|
09:AGGREGATE
| group by: t1.id
|
08:EXCHANGE [HASH(t1.id)]
|
03:AGGREGATE [STREAMING]
| group by: t1.id
|
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: t1.id = t2.id
| runtime filters: RF000 <- t2.id
|
|--07:EXCHANGE [BROADCAST]
| |
| 01:SCAN HDFS [functional.alltypestiny t2]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg t1]
partitions=11/11 files=11 size=814.73KB
runtime filters: RF000 -> t1.id
====
# IMPALA-2089: Tests correct elimination of redundant predicates.
# The equivalences between inline-view slots are enforced inside the inline-view plan.
# Equivalences between simple grouping slots (with SlotRef grouping exprs) are enforced
# at the scan, and equivalences between grouping slots with complex grouping exprs are
# enforced at the aggregation.
# a, b, c, d are in the same equivalence class and some predicates are redundant.
select * from
(select tinyint_col a, smallint_col b, int_col + int_col c, coalesce(bigint_col, year) d
from functional.alltypes
group by 1, 2, 3, 4) v
where v.a = v.b and v.b = v.c and v.c = v.d and v.a = v.c and v.a = v.d
---- PLAN
01:AGGREGATE [FINALIZE]
| group by: tinyint_col, smallint_col, int_col + int_col, coalesce(bigint_col, year)
| having: int_col + int_col = coalesce(bigint_col, year), smallint_col = int_col + int_col
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
predicates: functional.alltypes.tinyint_col = functional.alltypes.smallint_col
====
# IMPALA-1917: Test NULL literals inside inline view with grouping aggregation.
select cnt from
(select bool_col, count(*) cnt, cast(NULL as int) as x, cast(NULL as int) as y
from functional.alltypestiny
group by bool_col, x) v
---- PLAN
01:AGGREGATE [FINALIZE]
| output: count(*)
| group by: bool_col, CAST(NULL AS INT)
|
00:SCAN HDFS [functional.alltypestiny]
partitions=4/4 files=4 size=460B
====
# IMPALA-1917: Test NULL literals inside inline view with grouping aggregation.
select cnt from
(select bool_col, count(distinct int_col) cnt, NULL as x, NULL as y
from functional.alltypestiny
group by bool_col, x) v
---- PLAN
02:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: bool_col, NULL
|
01:AGGREGATE
| group by: bool_col, NULL, int_col
|
00:SCAN HDFS [functional.alltypestiny]
partitions=4/4 files=4 size=460B
====
# test simple group_concat with distinct
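# group_concat(distinct) follows the same two-phase shape as
# count(distinct): phase one groups on string_col to dedupe, phase two
# concatenates the distinct values.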
select group_concat(distinct string_col) from functional.alltypesagg
---- PLAN
02:AGGREGATE [FINALIZE]
| output: group_concat(string_col)
|
01:AGGREGATE
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [FINALIZE]
| output: group_concat:merge(string_col)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: group_concat(string_col)
|
04:AGGREGATE
| group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# test group_concat and a group by
select day, group_concat(distinct string_col)
from (select * from functional.alltypesagg where id % 100 = day order by id limit 99999) a
group by day
---- PLAN
03:AGGREGATE [FINALIZE]
| output: group_concat(string_col)
| group by: day
|
02:AGGREGATE
| group by: day, string_col
|
01:TOP-N [LIMIT=99999]
| order by: id ASC
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
predicates: id % 100 = day
---- DISTRIBUTEDPLAN
03:AGGREGATE [FINALIZE]
| output: group_concat(string_col)
| group by: day
|
02:AGGREGATE
| group by: day, string_col
|
04:MERGING-EXCHANGE [UNPARTITIONED]
| order by: id ASC
| limit: 99999
|
01:TOP-N [LIMIT=99999]
| order by: id ASC
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
predicates: id % 100 = day
====
# test group_concat with distinct together with another distinct aggregate function
select count(distinct cast(timestamp_col as string)),
group_concat(distinct cast(timestamp_col as string))
from functional.alltypesagg group by year
---- PLAN
02:AGGREGATE [FINALIZE]
| output: count(CAST(timestamp_col AS STRING)), group_concat(CAST(timestamp_col AS STRING))
| group by: year
|
01:AGGREGATE
| group by: year, CAST(timestamp_col AS STRING)
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE [FINALIZE]
| output: count(CAST(timestamp_col AS STRING)), group_concat(CAST(timestamp_col AS STRING))
| group by: year
|
04:AGGREGATE
| group by: year, CAST(timestamp_col AS STRING)
|
03:EXCHANGE [HASH(year)]
|
01:AGGREGATE [STREAMING]
| group by: year, CAST(timestamp_col AS STRING)
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other non-distinct aggregate functions
select group_concat(distinct string_col), count(*) from functional.alltypesagg
---- PLAN
02:AGGREGATE [FINALIZE]
| output: group_concat(string_col), count:merge(*)
|
01:AGGREGATE
| output: count(*)
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [FINALIZE]
| output: group_concat:merge(string_col), count:merge(*)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: group_concat(string_col), count:merge(*)
|
04:AGGREGATE
| output: count:merge(*)
| group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other aggregate functions, with custom separator
select group_concat(distinct string_col, '-'), sum(int_col), count(distinct string_col)
from functional.alltypesagg
---- PLAN
02:AGGREGATE [FINALIZE]
| output: group_concat(string_col, '-'), count(string_col), sum:merge(int_col)
|
01:AGGREGATE
| output: sum(int_col)
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [FINALIZE]
| output: group_concat:merge(string_col, '-'), count:merge(string_col), sum:merge(int_col)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: group_concat(string_col, '-'), count(string_col), sum:merge(int_col)
|
04:AGGREGATE
| output: sum:merge(int_col)
| group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
| output: sum(int_col)
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other aggregate functions, with custom separator
# and a group by
select month, year, count(*), count(distinct date_string_col),
group_concat(distinct date_string_col, '-') from functional.alltypesagg
group by month, year
---- PLAN
02:AGGREGATE [FINALIZE]
| output: count(date_string_col), group_concat(date_string_col, '-'), count:merge(*)
| group by: month, year
|
01:AGGREGATE
| output: count(*)
| group by: month, year, date_string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE [FINALIZE]
| output: count(date_string_col), group_concat(date_string_col, '-'), count:merge(*)
| group by: month, year
|
04:AGGREGATE
| output: count:merge(*)
| group by: month, year, date_string_col
|
03:EXCHANGE [HASH(month,year)]
|
01:AGGREGATE [STREAMING]
| output: count(*)
| group by: month, year, date_string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# test multiple group_concat distinct, each with a different separator
select group_concat(distinct string_col), group_concat(distinct string_col, '-'),
group_concat(distinct string_col, '---') from functional.alltypesagg
---- PLAN
02:AGGREGATE [FINALIZE]
| output: group_concat(string_col), group_concat(string_col, '-'), group_concat(string_col, '---')
|
01:AGGREGATE
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [FINALIZE]
| output: group_concat:merge(string_col), group_concat:merge(string_col, '-'), group_concat:merge(string_col, '---')
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
| output: group_concat(string_col), group_concat(string_col, '-'), group_concat(string_col, '---')
|
04:AGGREGATE
| group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
| group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# IMPALA-852: Aggregation only in the HAVING clause.
select 1 from functional.alltypestiny having count(*) > 0
---- PLAN
01:AGGREGATE [FINALIZE]
| output: count(*)
| having: count(*) > 0
|
00:SCAN HDFS [functional.alltypestiny]
partitions=4/4 files=4 size=460B
====
# Grouping aggregation where input is partitioned on grouping expr.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
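# Here the shuffle join hash-partitions both inputs on c_custkey, so the
# aggregation runs as a single FINALIZE step in the join fragment with no
# pre-aggregation or extra exchange.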
select straight_join c_custkey, count(*)
from tpch_parquet.customer inner join [shuffle] tpch_parquet.orders on c_custkey = o_custkey
where c_nationkey = 16
group by 1
having count(*) < 150000
limit 1000000
---- DISTRIBUTEDPLAN
06:EXCHANGE [UNPARTITIONED]
| limit: 1000000
|
03:AGGREGATE [FINALIZE]
| output: count(*)
| group by: c_custkey
| having: count(*) < 150000
| limit: 1000000
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: c_custkey = o_custkey
| runtime filters: RF000 <- o_custkey
|
|--05:EXCHANGE [HASH(o_custkey)]
| |
| 01:SCAN HDFS [tpch_parquet.orders]
| partitions=1/1 files=2 size=54.93MB
|
04:EXCHANGE [HASH(c_custkey)]
|
00:SCAN HDFS [tpch_parquet.customer]
partitions=1/1 files=1 size=12.47MB
predicates: c_nationkey = 16
runtime filters: RF000 -> c_custkey
====
# Distinct aggregation where input is partitioned on distinct expr.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
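# The phase-one distinct aggregation (group by c_custkey) runs in the
# join fragment; only the non-grouping count is merged above the
# UNPARTITIONED exchange.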
select col from (
select straight_join count(distinct c_custkey) col
from tpch_parquet.orders inner join [shuffle] tpch_parquet.customer on c_custkey = o_custkey) v
where col > 50
limit 50
---- DISTRIBUTEDPLAN
08:AGGREGATE [FINALIZE]
| output: count:merge(c_custkey)
| having: count(c_custkey) > 50
| limit: 50
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
| output: count(c_custkey)
|
03:AGGREGATE
| group by: c_custkey
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
|
|--06:EXCHANGE [HASH(c_custkey)]
| |
| 01:SCAN HDFS [tpch_parquet.customer]
| partitions=1/1 files=1 size=12.47MB
|
05:EXCHANGE [HASH(o_custkey)]
|
00:SCAN HDFS [tpch_parquet.orders]
partitions=1/1 files=2 size=54.93MB
runtime filters: RF000 -> o_custkey
====
# Distinct grouping aggregation where input is partitioned on distinct and grouping exprs.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
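# Both aggregation phases run in the join fragment because the join
# output is already hash-partitioned on c_custkey, which serves as both
# the distinct and the grouping expr.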
select straight_join c_custkey, count(distinct c_custkey)
from tpch_parquet.orders inner join [shuffle] tpch_parquet.customer on c_custkey = o_custkey
group by 1
---- DISTRIBUTEDPLAN
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE [FINALIZE]
| output: count(c_custkey)
| group by: c_custkey
|
03:AGGREGATE
| group by: c_custkey, c_custkey
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: o_custkey = c_custkey
| runtime filters: RF000 <- c_custkey
|
|--06:EXCHANGE [HASH(c_custkey)]
| |
| 01:SCAN HDFS [tpch_parquet.customer]
| partitions=1/1 files=1 size=12.47MB
|
05:EXCHANGE [HASH(o_custkey)]
|
00:SCAN HDFS [tpch_parquet.orders]
partitions=1/1 files=2 size=54.93MB
runtime filters: RF000 -> o_custkey
====
# Complex aggregation when two joins and an agg end up in same fragment.
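# The partitioned join on (l_orderkey, l_returnflag) leaves the data
# hash-partitioned on exactly the grouping exprs, and the broadcast join
# preserves that partitioning, so the FINALIZE aggregation is colocated
# with both joins.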
select l_orderkey, l_returnflag, count(*) from (
select straight_join *
from tpch_parquet.lineitem
inner join [shuffle] tpch_parquet.orders
on l_orderkey = o_orderkey and l_returnflag = o_clerk
inner join [broadcast] tpch_parquet.customer
on o_custkey = c_custkey and c_phone = o_comment
) v
group by 1, 2
having count(*) > 10
limit 10
---- DISTRIBUTEDPLAN
09:EXCHANGE [UNPARTITIONED]
| limit: 10
|
05:AGGREGATE [FINALIZE]
| output: count(*)
| group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_returnflag
| having: count(*) > 10
| limit: 10
|
04:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: o_custkey = c_custkey, o_comment = c_phone
| runtime filters: RF000 <- c_custkey, RF001 <- c_phone
|
|--08:EXCHANGE [BROADCAST]
| |
| 02:SCAN HDFS [tpch_parquet.customer]
| partitions=1/1 files=1 size=12.47MB
|
03:HASH JOIN [INNER JOIN, PARTITIONED]
| hash predicates: l_orderkey = o_orderkey, l_returnflag = o_clerk
| runtime filters: RF002 <- o_orderkey, RF003 <- o_clerk
|
|--07:EXCHANGE [HASH(o_orderkey,o_clerk)]
| |
| 01:SCAN HDFS [tpch_parquet.orders]
| partitions=1/1 files=2 size=54.93MB
| runtime filters: RF000 -> o_custkey, RF001 -> o_comment
|
06:EXCHANGE [HASH(l_orderkey,l_returnflag)]
|
00:SCAN HDFS [tpch_parquet.lineitem]
partitions=1/1 files=3 size=195.85MB
runtime filters: RF002 -> l_orderkey, RF003 -> l_returnflag
====