Since commit d2d3f4c (on asf-master), TAggregateExpr contains
the logical input types of the aggregate expr. They are included
because a merging aggregate expression takes intermediate values
as input, and the types of those intermediate values aren't
necessarily the same as the original input types. For instance,
NDV() uses a binary blob as its intermediate value, and it's
passed to the merging aggregate expression as a StringVal even
though the input type of NDV() in the query could be DecimalVal.
In this case, we consider DecimalVal the logical input type while
StringVal is the intermediate type. The logical input types are
accessed by the BE via GetConstFnAttr() during interpretation and
during constant propagation in codegen.
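To make the distinction concrete, here is a minimal, self-contained
Java model of the two type lists (hypothetical names; not the Impala
source):

  // Hypothetical model of the bookkeeping described above, not Impala code.
  import java.util.List;

  class MergeAggTypes {
      // What the merging expr physically consumes: intermediate value types.
      final List<String> intermediateInputTypes;
      // What the original call consumed: the logical input types the BE
      // reads via GetConstFnAttr().
      final List<String> logicalInputTypes;

      MergeAggTypes(List<String> intermediate, List<String> logical) {
          this.intermediateInputTypes = intermediate;
          this.logicalInputTypes = logical;
      }

      public static void main(String[] args) {
          // ndv(decimal_col): the intermediate value is a binary blob passed
          // as a StringVal, but the logical input is a DecimalVal.
          MergeAggTypes ndvMerge = new MergeAggTypes(
              List.of("StringVal"), List.of("DecimalVal"));
          System.out.println("consumes: " + ndvMerge.intermediateInputTypes
              + ", logical: " + ndvMerge.logicalInputTypes);
      }
  }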
To handle distinct aggregate expressions (e.g. select
count(distinct ...)), the FE uses 2-phase aggregation, introducing
an extra phase of split/merge aggregation in which the distinct
aggregate expressions' inputs are converted and added to the
group-by expressions of the first phase, while the non-distinct
aggregate expressions go through the normal split/merge treatment.
The bug is that the existing code incorrectly propagates the
intermediate types of the non-grouping aggregate expressions as
the logical input types to the merging aggregate expressions in
the second phase of aggregation. The input aggregate expressions
for the non-distinct aggregate expressions in the second-phase
aggregation are already merging aggregate expressions (from phase
one), in which case their input types must not be treated as the
logical input types.
This change fixes the problem above by checking whether the input
aggregate expression passed to FunctionCallExpr.createMergeAggCall()
is already a merging aggregate expression. If so, the logical input
types are taken from those recorded in its 'mergeAggInputFn_'
instead of from the aggregate expression's own input types.
Change-Id: I158303b20d1afdff23c67f3338b9c4af2ad80691
Reviewed-on: http://gerrit.cloudera.org:8080/6724
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Impala Public Jenkins
# basic aggregation
select count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col),
avg(tinyint_col)
from functional.alltypesagg
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count:merge(*), count:merge(tinyint_col), min:merge(tinyint_col), max:merge(tinyint_col), sum:merge(tinyint_col), avg:merge(tinyint_col)
|
02:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
|  output: count(*), count(tinyint_col), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# with grouping
select tinyint_col, bigint_col, count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col),
avg(tinyint_col)
from functional.alltypesagg
group by 2, 1
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|  group by: bigint_col, tinyint_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE [FINALIZE]
|  output: count:merge(*), min:merge(tinyint_col), max:merge(tinyint_col), sum:merge(tinyint_col), avg:merge(tinyint_col)
|  group by: bigint_col, tinyint_col
|
02:EXCHANGE [HASH(bigint_col,tinyint_col)]
|
01:AGGREGATE [STREAMING]
|  output: count(*), min(tinyint_col), max(tinyint_col), sum(tinyint_col), avg(tinyint_col)
|  group by: bigint_col, tinyint_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# avg substitution
select avg(id)
from functional.testtbl
having count(id) > 0
order by avg(zip) limit 10
---- PLAN
PLAN-ROOT SINK
|
02:TOP-N [LIMIT=10]
|  order by: avg(zip) ASC
|
01:AGGREGATE [FINALIZE]
|  output: avg(id), count(id), avg(zip)
|  having: count(id) > 0
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 files=0 size=0B
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
02:TOP-N [LIMIT=10]
|  order by: avg(zip) ASC
|
04:AGGREGATE [FINALIZE]
|  output: avg:merge(id), count:merge(id), avg:merge(zip)
|  having: count(id) > 0
|
03:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
|  output: avg(id), count(id), avg(zip)
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 files=0 size=0B
====
# Test correct removal of redundant group-by expressions (IMPALA-817)
select int_col + int_col, int_col * int_col
from functional.alltypesagg
group by int_col + int_col, int_col * int_col, int_col + int_col
having (int_col * int_col) < 0 limit 10
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  group by: int_col + int_col, int_col * int_col
|  having: int_col * int_col < 0
|  limit: 10
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
04:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
03:AGGREGATE [FINALIZE]
|  group by: int_col + int_col, int_col * int_col
|  having: int_col * int_col < 0
|  limit: 10
|
02:EXCHANGE [HASH(int_col + int_col,int_col * int_col)]
|
01:AGGREGATE [STREAMING]
|  group by: int_col + int_col, int_col * int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# Tests that a having predicate triggers slot materialization (IMPALA-846).
select count(*) from
functional.alltypes t1 inner join functional.alltypestiny t2
on t1.smallint_col = t2.smallint_col
group by t1.tinyint_col, t2.smallint_col
having count(t2.int_col) = count(t1.bigint_col)
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count(*), count(t2.int_col), count(t1.bigint_col)
|  group by: t1.tinyint_col, t2.smallint_col
|  having: count(t2.int_col) = count(t1.bigint_col)
|
02:HASH JOIN [INNER JOIN]
|  hash predicates: t1.smallint_col = t2.smallint_col
|  runtime filters: RF000 <- t2.smallint_col
|
|--01:SCAN HDFS [functional.alltypestiny t2]
|     partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypes t1]
   partitions=24/24 files=24 size=478.45KB
   runtime filters: RF000 -> t1.smallint_col
====
# Tests proper slot materialization of agg-tuple slots for avg (IMP-1271).
# 't.x > 10' is picked up as an unassigned conjunct, and not as a binding
# predicate because avg gets rewritten into an expr against two slots
# (and getBoundPredicates() cannot handle multi-slot predicates).
select 1 from
(select int_col, avg(bigint_col) x from functional.alltypes
group by int_col) t
where t.x > 10
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: avg(bigint_col)
|  group by: int_col
|  having: avg(bigint_col) > 10
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# test distributed aggregation over unions (IMPALA-831)
# non-distinct agg without grouping over a union
select count(*) from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|  limit: 10
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:AGGREGATE [FINALIZE]
|  output: count:merge(*)
|  limit: 10
|
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
|  output: count(*)
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# non-distinct agg with grouping over a union
select count(*) from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: bigint_col
|  limit: 10
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
05:AGGREGATE [FINALIZE]
|  output: count:merge(*)
|  group by: t.bigint_col
|  limit: 10
|
04:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
|  output: count(*)
|  group by: bigint_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# distinct agg without grouping over a union
select count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
PLAN-ROOT SINK
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col)
|  limit: 10
|
03:AGGREGATE
|  group by: int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:AGGREGATE [FINALIZE]
|  output: count:merge(int_col)
|  limit: 10
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
|  output: count(int_col)
|
06:AGGREGATE
|  group by: int_col
|
05:EXCHANGE [HASH(int_col)]
|
03:AGGREGATE [STREAMING]
|  group by: int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# distinct agg with grouping over a union
select count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
PLAN-ROOT SINK
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col)
|  group by: t.bigint_col
|  limit: 10
|
03:AGGREGATE
|  group by: bigint_col, int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col)
|  group by: t.bigint_col
|  limit: 10
|
06:AGGREGATE
|  group by: t.bigint_col, int_col
|
05:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
|  group by: bigint_col, int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg without grouping over a union
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
limit 10
---- PLAN
PLAN-ROOT SINK
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col), count:merge(smallint_col)
|  limit: 10
|
03:AGGREGATE
|  output: count(smallint_col)
|  group by: int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:AGGREGATE [FINALIZE]
|  output: count:merge(int_col), count:merge(smallint_col)
|  limit: 10
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
|  output: count(int_col), count:merge(smallint_col)
|
06:AGGREGATE
|  output: count:merge(smallint_col)
|  group by: int_col
|
05:EXCHANGE [HASH(int_col)]
|
03:AGGREGATE [STREAMING]
|  output: count(smallint_col)
|  group by: int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg with grouping over a union
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union all
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
PLAN-ROOT SINK
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col), count:merge(smallint_col)
|  group by: t.bigint_col
|  limit: 10
|
03:AGGREGATE
|  output: count(smallint_col)
|  group by: bigint_col, int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
04:AGGREGATE [FINALIZE]
|  output: count(int_col), count:merge(smallint_col)
|  group by: t.bigint_col
|  limit: 10
|
06:AGGREGATE
|  output: count:merge(smallint_col)
|  group by: t.bigint_col, int_col
|
05:EXCHANGE [HASH(t.bigint_col)]
|
03:AGGREGATE [STREAMING]
|  output: count(smallint_col)
|  group by: bigint_col, int_col
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# mixed distinct and non-distinct agg with grouping over a union distinct
select count(smallint_col), count(distinct int_col)
from
(select * from functional.alltypes
union distinct
select * from functional.alltypessmall) t
group by t.bigint_col
limit 10
---- PLAN
PLAN-ROOT SINK
|
05:AGGREGATE [FINALIZE]
|  output: count(int_col), count:merge(smallint_col)
|  group by: t.bigint_col
|  limit: 10
|
04:AGGREGATE
|  output: count(smallint_col)
|  group by: bigint_col, int_col
|
03:AGGREGATE [FINALIZE]
|  group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
10:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
05:AGGREGATE [FINALIZE]
|  output: count(int_col), count:merge(smallint_col)
|  group by: t.bigint_col
|  limit: 10
|
09:AGGREGATE
|  output: count:merge(smallint_col)
|  group by: t.bigint_col, int_col
|
08:EXCHANGE [HASH(t.bigint_col)]
|
04:AGGREGATE [STREAMING]
|  output: count(smallint_col)
|  group by: bigint_col, int_col
|
07:AGGREGATE [FINALIZE]
|  group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
06:EXCHANGE [HASH(id,bool_col,tinyint_col,smallint_col,int_col,bigint_col,float_col,double_col,date_string_col,string_col,timestamp_col,year,month)]
|
03:AGGREGATE [STREAMING]
|  group by: id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col, year, month
|
00:UNION
|  pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# Mixed distinct and non-distinct agg with intermediate type different from input type
# Regression test for IMPALA-5251 to exercise validateMergeAggFn() in FunctionCallExpr.
select avg(l_quantity), ndv(l_discount), count(distinct l_partkey)
from tpch_parquet.lineitem;
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: count(l_partkey), avg:merge(l_quantity), ndv:merge(l_discount)
|
01:AGGREGATE
|  output: avg(l_quantity), ndv(l_discount)
|  group by: l_partkey
|
00:SCAN HDFS [tpch_parquet.lineitem]
   partitions=1/1 files=3 size=193.74MB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: count:merge(l_partkey), avg:merge(l_quantity), ndv:merge(l_discount)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
|  output: count(l_partkey), avg:merge(l_quantity), ndv:merge(l_discount)
|
04:AGGREGATE
|  output: avg:merge(l_quantity), ndv:merge(l_discount)
|  group by: l_partkey
|
03:EXCHANGE [HASH(l_partkey)]
|
01:AGGREGATE [STREAMING]
|  output: avg(l_quantity), ndv(l_discount)
|  group by: l_partkey
|
00:SCAN HDFS [tpch_parquet.lineitem]
   partitions=1/1 files=3 size=193.74MB
====
# test that aggregations are not placed below an unpartitioned exchange with a limit
select count(*) from (select * from functional.alltypes limit 10) t
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*)
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
   limit: 10
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*)
|
02:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
   limit: 10
====
# test that aggregations are not placed below an unpartitioned exchange with a limit
select count(*) from
(select * from functional.alltypes
union all
(select * from functional.alltypessmall) limit 10) t
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|
00:UNION
|  pass-through-operands: all
|  limit: 10
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|
04:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
00:UNION
|  pass-through-operands: all
|  limit: 10
|
|--02:SCAN HDFS [functional.alltypessmall]
|     partitions=4/4 files=4 size=6.32KB
|
01:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
====
# test that limits are applied at the top-level merge aggregation node for non-grouping
# distinct aggregation (IMPALA-1802)
select * from (
select count(distinct cnt) from
(select count(distinct t1.id) as cnt
from functional.alltypesagg t1 join functional.alltypestiny t2 on t1.id = t2.id
limit 10) t
limit 2) v
limit 1
---- PLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: count(cnt)
|  limit: 1
|
05:AGGREGATE
|  group by: count(t1.id)
|
04:AGGREGATE [FINALIZE]
|  output: count(t1.id)
|  limit: 10
|
03:AGGREGATE
|  group by: t1.id
|
02:HASH JOIN [INNER JOIN]
|  hash predicates: t1.id = t2.id
|  runtime filters: RF000 <- t2.id
|
|--01:SCAN HDFS [functional.alltypestiny t2]
|     partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg t1]
   partitions=11/11 files=11 size=814.73KB
   runtime filters: RF000 -> t1.id
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: count(cnt)
|  limit: 1
|
05:AGGREGATE
|  group by: count(t1.id)
|
11:AGGREGATE [FINALIZE]
|  output: count:merge(t1.id)
|  limit: 10
|
10:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
|  output: count(t1.id)
|
09:AGGREGATE
|  group by: t1.id
|
08:EXCHANGE [HASH(t1.id)]
|
03:AGGREGATE [STREAMING]
|  group by: t1.id
|
02:HASH JOIN [INNER JOIN, BROADCAST]
|  hash predicates: t1.id = t2.id
|  runtime filters: RF000 <- t2.id
|
|--07:EXCHANGE [BROADCAST]
|  |
|  01:SCAN HDFS [functional.alltypestiny t2]
|     partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypesagg t1]
   partitions=11/11 files=11 size=814.73KB
   runtime filters: RF000 -> t1.id
====
# IMPALA-2089: Tests correct elimination of redundant predicates.
# The equivalences between inline-view slots are enforced inside the inline-view plan.
# Equivalences between simple grouping slots (with SlotRef grouping exprs) are enforced
# at the scan, and equivalences between grouping slots with complex grouping exprs are
# enforced at the aggregation.
# a, b, c, d are in the same equivalence class and some predicates are redundant.
select * from
(select tinyint_col a, smallint_col b, int_col + int_col c, coalesce(bigint_col, year) d
from functional.alltypes
group by 1, 2, 3, 4) v
where v.a = v.b and v.b = v.c and v.c = v.d and v.a = v.c and v.a = v.d
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  group by: tinyint_col, smallint_col, int_col + int_col, coalesce(bigint_col, year)
|  having: int_col + int_col = coalesce(bigint_col, year), smallint_col = int_col + int_col
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 files=24 size=478.45KB
   predicates: functional.alltypes.tinyint_col = functional.alltypes.smallint_col
====
# IMPALA-1917: Test NULL literals inside inline view with grouping aggregation.
select cnt from
(select bool_col, count(*) cnt, cast(NULL as int) as x, cast(NULL as int) as y
from functional.alltypestiny
group by bool_col, x) v
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: bool_col, CAST(NULL AS INT)
|
00:SCAN HDFS [functional.alltypestiny]
   partitions=4/4 files=4 size=460B
====
# IMPALA-1917: Test NULL literals inside inline view with grouping aggregation.
select cnt from
(select bool_col, count(distinct int_col) cnt, NULL as x, NULL as y
from functional.alltypestiny
group by bool_col, x) v
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: count(int_col)
|  group by: bool_col, NULL
|
01:AGGREGATE
|  group by: bool_col, NULL, int_col
|
00:SCAN HDFS [functional.alltypestiny]
   partitions=4/4 files=4 size=460B
====
# test simple group_concat with distinct
select group_concat(distinct string_col) from functional.alltypesagg
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: group_concat(string_col)
|
01:AGGREGATE
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: group_concat:merge(string_col)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
|  output: group_concat(string_col)
|
04:AGGREGATE
|  group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# test group_concat and a group by
select day, group_concat(distinct string_col)
from (select * from functional.alltypesagg where id % 100 = day order by id limit 99999) a
group by day
---- PLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: group_concat(string_col)
|  group by: day
|
02:AGGREGATE
|  group by: day, string_col
|
01:TOP-N [LIMIT=99999]
|  order by: id ASC
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
   predicates: day = id % 100
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
|  output: group_concat(string_col)
|  group by: day
|
02:AGGREGATE
|  group by: day, string_col
|
04:MERGING-EXCHANGE [UNPARTITIONED]
|  order by: id ASC
|  limit: 99999
|
01:TOP-N [LIMIT=99999]
|  order by: id ASC
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
   predicates: day = id % 100
====
# test group_concat with distinct together with another distinct aggregate function
select count(distinct cast(timestamp_col as string)),
group_concat(distinct cast(timestamp_col as string))
from functional.alltypesagg group by year
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: count(CAST(timestamp_col AS STRING)), group_concat(CAST(timestamp_col AS STRING))
|  group by: year
|
01:AGGREGATE
|  group by: year, CAST(timestamp_col AS STRING)
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE [FINALIZE]
|  output: count(CAST(timestamp_col AS STRING)), group_concat(CAST(timestamp_col AS STRING))
|  group by: year
|
04:AGGREGATE
|  group by: year, CAST(timestamp_col AS STRING)
|
03:EXCHANGE [HASH(year)]
|
01:AGGREGATE [STREAMING]
|  group by: year, CAST(timestamp_col AS STRING)
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other non-distinct aggregate functions
select group_concat(distinct string_col), count(*) from functional.alltypesagg
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: group_concat(string_col), count:merge(*)
|
01:AGGREGATE
|  output: count(*)
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: group_concat:merge(string_col), count:merge(*)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
|  output: group_concat(string_col), count:merge(*)
|
04:AGGREGATE
|  output: count:merge(*)
|  group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
|  output: count(*)
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other aggregate functions, with custom separator
select group_concat(distinct string_col, '-'), sum(int_col), count(distinct string_col)
from functional.alltypesagg
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: group_concat(string_col, '-'), count(string_col), sum:merge(int_col)
|
01:AGGREGATE
|  output: sum(int_col)
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: group_concat:merge(string_col, '-'), count:merge(string_col), sum:merge(int_col)
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
|  output: group_concat(string_col, '-'), count(string_col), sum:merge(int_col)
|
04:AGGREGATE
|  output: sum:merge(int_col)
|  group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
|  output: sum(int_col)
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# test group_concat distinct with other aggregate functions, with custom separator
# and a group by
select month, year, count(*), count(distinct date_string_col),
group_concat(distinct date_string_col, '-') from functional.alltypesagg
group by month, year
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: count(date_string_col), group_concat(date_string_col, '-'), count:merge(*)
|  group by: month, year
|
01:AGGREGATE
|  output: count(*)
|  group by: month, year, date_string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE [FINALIZE]
|  output: count(date_string_col), group_concat(date_string_col, '-'), count:merge(*)
|  group by: month, year
|
04:AGGREGATE
|  output: count:merge(*)
|  group by: month, year, date_string_col
|
03:EXCHANGE [HASH(month,year)]
|
01:AGGREGATE [STREAMING]
|  output: count(*)
|  group by: month, year, date_string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# test multiple group_concat distinct, each with a different separator
select group_concat(distinct string_col), group_concat(distinct string_col, '-'),
group_concat(distinct string_col, '---') from functional.alltypesagg
---- PLAN
PLAN-ROOT SINK
|
02:AGGREGATE [FINALIZE]
|  output: group_concat(string_col), group_concat(string_col, '-'), group_concat(string_col, '---')
|
01:AGGREGATE
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
|  output: group_concat:merge(string_col), group_concat:merge(string_col, '-'), group_concat:merge(string_col, '---')
|
05:EXCHANGE [UNPARTITIONED]
|
02:AGGREGATE
|  output: group_concat(string_col), group_concat(string_col, '-'), group_concat(string_col, '---')
|
04:AGGREGATE
|  group by: string_col
|
03:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
|  group by: string_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=11/11 files=11 size=814.73KB
====
# IMPALA-852: Aggregation only in the HAVING clause.
select 1 from functional.alltypestiny having count(*) > 0
---- PLAN
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
|  output: count(*)
|  having: count(*) > 0
|
00:SCAN HDFS [functional.alltypestiny]
   partitions=4/4 files=4 size=460B
====
# Grouping aggregation where input is partitioned on grouping expr.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
select straight_join c_custkey, count(*)
from tpch_parquet.customer inner join [shuffle] tpch_parquet.orders on c_custkey = o_custkey
where c_nationkey = 16
group by 1
having count(*) < 150000
limit 1000000
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:EXCHANGE [UNPARTITIONED]
|  limit: 1000000
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: c_custkey
|  having: count(*) < 150000
|  limit: 1000000
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
|  hash predicates: c_custkey = o_custkey
|  runtime filters: RF000 <- o_custkey
|
|--05:EXCHANGE [HASH(o_custkey)]
|  |
|  01:SCAN HDFS [tpch_parquet.orders]
|     partitions=1/1 files=2 size=54.00MB
|
04:EXCHANGE [HASH(c_custkey)]
|
00:SCAN HDFS [tpch_parquet.customer]
   partitions=1/1 files=1 size=12.27MB
   predicates: c_nationkey = 16
   runtime filters: RF000 -> c_custkey
====
# Distinct aggregation where input is partitioned on distinct expr.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
select col from (
select straight_join count(distinct c_custkey) col
from tpch_parquet.orders inner join [shuffle] tpch_parquet.customer on c_custkey = o_custkey) v
where col > 50
limit 50
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:AGGREGATE [FINALIZE]
|  output: count:merge(c_custkey)
|  having: count(c_custkey) > 50
|  limit: 50
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE
|  output: count(c_custkey)
|
03:AGGREGATE
|  group by: c_custkey
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
|  hash predicates: o_custkey = c_custkey
|  runtime filters: RF000 <- c_custkey
|
|--06:EXCHANGE [HASH(c_custkey)]
|  |
|  01:SCAN HDFS [tpch_parquet.customer]
|     partitions=1/1 files=1 size=12.27MB
|
05:EXCHANGE [HASH(o_custkey)]
|
00:SCAN HDFS [tpch_parquet.orders]
   partitions=1/1 files=2 size=54.00MB
   runtime filters: RF000 -> o_custkey
====
# Distinct grouping aggregation where input is partitioned on distinct and grouping exprs.
# Planner should not redundantly repartition the data that was already partitioned on
# the required key by the join.
select straight_join c_custkey, count(distinct c_custkey)
from tpch_parquet.orders inner join [shuffle] tpch_parquet.customer on c_custkey = o_custkey
group by 1
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
07:EXCHANGE [UNPARTITIONED]
|
04:AGGREGATE [FINALIZE]
|  output: count(c_custkey)
|  group by: c_custkey
|
03:AGGREGATE
|  group by: c_custkey, c_custkey
|
02:HASH JOIN [INNER JOIN, PARTITIONED]
|  hash predicates: o_custkey = c_custkey
|  runtime filters: RF000 <- c_custkey
|
|--06:EXCHANGE [HASH(c_custkey)]
|  |
|  01:SCAN HDFS [tpch_parquet.customer]
|     partitions=1/1 files=1 size=12.27MB
|
05:EXCHANGE [HASH(o_custkey)]
|
00:SCAN HDFS [tpch_parquet.orders]
   partitions=1/1 files=2 size=54.00MB
   runtime filters: RF000 -> o_custkey
====
# Complex aggregation when two joins and an agg end up in same fragment.
select l_orderkey, l_returnflag, count(*) from (
select straight_join *
from tpch_parquet.lineitem
inner join [shuffle] tpch_parquet.orders
on l_orderkey = o_orderkey and l_returnflag = o_clerk
inner join [broadcast] tpch_parquet.customer
on o_custkey = c_custkey and c_phone = o_comment
) v
group by 1, 2
having count(*) > 10
limit 10
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
09:EXCHANGE [UNPARTITIONED]
|  limit: 10
|
05:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: tpch_parquet.lineitem.l_orderkey, tpch_parquet.lineitem.l_returnflag
|  having: count(*) > 10
|  limit: 10
|
04:HASH JOIN [INNER JOIN, BROADCAST]
|  hash predicates: o_custkey = c_custkey, o_comment = c_phone
|  runtime filters: RF000 <- c_custkey, RF001 <- c_phone
|
|--08:EXCHANGE [BROADCAST]
|  |
|  02:SCAN HDFS [tpch_parquet.customer]
|     partitions=1/1 files=1 size=12.27MB
|
03:HASH JOIN [INNER JOIN, PARTITIONED]
|  hash predicates: l_orderkey = o_orderkey, l_returnflag = o_clerk
|  runtime filters: RF002 <- o_orderkey, RF003 <- o_clerk
|
|--07:EXCHANGE [HASH(o_orderkey,o_clerk)]
|  |
|  01:SCAN HDFS [tpch_parquet.orders]
|     partitions=1/1 files=2 size=54.00MB
|     runtime filters: RF000 -> o_custkey, RF001 -> o_comment
|
06:EXCHANGE [HASH(l_orderkey,l_returnflag)]
|
00:SCAN HDFS [tpch_parquet.lineitem]
   partitions=1/1 files=3 size=193.61MB
   runtime filters: RF002 -> l_orderkey, RF003 -> l_returnflag
====
# IMPALA-4263: Grouping agg needs a merge step because the grouping exprs reference a
# tuple that is made nullable in the join fragment.
select /* +straight_join */ t2.id, count(*)
from functional.alltypes t1
left outer join /* +shuffle */ functional.alltypessmall t2
on t1.id = t2.id
group by t2.id
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
08:EXCHANGE [UNPARTITIONED]
|
07:AGGREGATE [FINALIZE]
|  output: count:merge(*)
|  group by: t2.id
|
06:EXCHANGE [HASH(t2.id)]
|
03:AGGREGATE [STREAMING]
|  output: count(*)
|  group by: t2.id
|
02:HASH JOIN [LEFT OUTER JOIN, PARTITIONED]
|  hash predicates: t1.id = t2.id
|
|--05:EXCHANGE [HASH(t2.id)]
|  |
|  01:SCAN HDFS [functional.alltypessmall t2]
|     partitions=4/4 files=4 size=6.32KB
|
04:EXCHANGE [HASH(t1.id)]
|
00:SCAN HDFS [functional.alltypes t1]
   partitions=24/24 files=24 size=478.45KB
====
# IMPALA-4263: Grouping agg is placed in the join fragment and has no merge step.
select /* +straight_join */ t1.id, count(*)
from functional.alltypes t1
left outer join /* +shuffle */ functional.alltypessmall t2
on t1.id = t2.id
group by t1.id
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
06:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: t1.id
|
02:HASH JOIN [LEFT OUTER JOIN, PARTITIONED]
|  hash predicates: t1.id = t2.id
|
|--05:EXCHANGE [HASH(t2.id)]
|  |
|  01:SCAN HDFS [functional.alltypessmall t2]
|     partitions=4/4 files=4 size=6.32KB
|
04:EXCHANGE [HASH(t1.id)]
|
00:SCAN HDFS [functional.alltypes t1]
   partitions=24/24 files=24 size=478.45KB
====
# IMPALA-4263: Grouping agg is placed in the second join fragment and has no merge step.
# The grouping exprs reference a nullable tuple (t2), but that tuple is made nullable in
# the first join fragment, so it's correct to place the aggregation in the second
# join fragment without a merge step.
select /* +straight_join */ t2.id, count(*)
from functional.alltypes t1
left outer join /* +shuffle */ functional.alltypessmall t2
on t1.int_col = t2.int_col
left outer join /* +shuffle */ functional.alltypestiny t3
on t2.id = t3.id
group by t2.id
---- DISTRIBUTEDPLAN
PLAN-ROOT SINK
|
10:EXCHANGE [UNPARTITIONED]
|
05:AGGREGATE [FINALIZE]
|  output: count(*)
|  group by: t2.id
|
04:HASH JOIN [LEFT OUTER JOIN, PARTITIONED]
|  hash predicates: t2.id = t3.id
|
|--09:EXCHANGE [HASH(t3.id)]
|  |
|  02:SCAN HDFS [functional.alltypestiny t3]
|     partitions=4/4 files=4 size=460B
|
08:EXCHANGE [HASH(t2.id)]
|
03:HASH JOIN [LEFT OUTER JOIN, PARTITIONED]
|  hash predicates: t1.int_col = t2.int_col
|
|--07:EXCHANGE [HASH(t2.int_col)]
|  |
|  01:SCAN HDFS [functional.alltypessmall t2]
|     partitions=4/4 files=4 size=6.32KB
|
06:EXCHANGE [HASH(t1.int_col)]
|
00:SCAN HDFS [functional.alltypes t1]
   partitions=24/24 files=24 size=478.45KB
====