mirror of
https://github.com/apache/impala.git
synced 2026-01-05 12:01:11 -05:00
count(x) with no distinct and no group-by expressions returns NULL on empty input if other distinct aggs (e.g. COUNT(distinct x)) are present. This happens because the COUNT is transformed to SUM(COUNT()), with the inner COUNT being evaluated WITH a group-by expression (e.g. x). SUM over empty input returns NULL, but COUNT should return 0. This patch fixes this by replacing COUNT with zeroifnull(COUNT) before AggregateInfo is generated if there are distinct aggs and no group-bys. The logic in AggregateInfo itself has not been modified. Change-Id: I902e3fdd95767135b2f3fe423e8802ef57366af1 Reviewed-on: http://gerrit.ent.cloudera.com:8080/1921 Reviewed-by: Srinath Shankar <sshankar@cloudera.com> Tested-by: jenkins
402 lines
10 KiB
Plaintext
402 lines
10 KiB
Plaintext
# distinct *
|
|
select distinct *
|
|
from functional.testtbl
|
|
---- PLAN
|
|
01:AGGREGATE [FINALIZE]
|
|
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
---- DISTRIBUTEDPLAN
|
|
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE [MERGE FINALIZE]
|
|
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
|
|
|
|
02:EXCHANGE [PARTITION=HASH(functional.testtbl.id,functional.testtbl.name,functional.testtbl.zip)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
====
|
|
# distinct w/ explicit select list
|
|
select distinct id, zip
|
|
from functional.testtbl
|
|
---- PLAN
|
|
01:AGGREGATE [FINALIZE]
|
|
| group by: id, zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
---- DISTRIBUTEDPLAN
|
|
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE [MERGE FINALIZE]
|
|
| group by: id, zip
|
|
|
|
|
02:EXCHANGE [PARTITION=HASH(id,zip)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: id, zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
====
|
|
# count(distinct)
|
|
select count(distinct id, zip)
|
|
from functional.testtbl
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(if(id IS NULL, NULL, zip))
|
|
|
|
|
01:AGGREGATE
|
|
| group by: id, zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
---- DISTRIBUTEDPLAN
|
|
06:AGGREGATE [MERGE FINALIZE]
|
|
| output: sum(count(if(id IS NULL, NULL, zip)))
|
|
|
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE]
|
|
| output: count(if(id IS NULL, NULL, zip))
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| group by: id, zip
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(id,zip)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: id, zip
|
|
|
|
|
00:SCAN HDFS [functional.testtbl]
|
|
partitions=1/1 size=0B
|
|
====
|
|
# count(distinct) w/ grouping
|
|
select tinyint_col, count(distinct int_col, bigint_col)
|
|
from functional.alltypesagg
|
|
group by 1
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(if(int_col IS NULL, NULL, bigint_col))
|
|
| group by: tinyint_col
|
|
|
|
|
01:AGGREGATE
|
|
| group by: tinyint_col, int_col, bigint_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
---- DISTRIBUTEDPLAN
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(if(int_col IS NULL, NULL, bigint_col))
|
|
| group by: tinyint_col
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| group by: tinyint_col, int_col, bigint_col
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: tinyint_col, int_col, bigint_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
====
|
|
# count(distinct) and sum(distinct) w/ grouping
|
|
select tinyint_col, count(distinct int_col), sum(distinct int_col)
|
|
from functional.alltypesagg
|
|
group by 1
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), sum(int_col)
|
|
| group by: tinyint_col
|
|
|
|
|
01:AGGREGATE
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
---- DISTRIBUTEDPLAN
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), sum(int_col)
|
|
| group by: tinyint_col
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
====
|
|
# count(distinct) w/ grouping; distinct in min() and max()
|
|
# is ignored
|
|
select tinyint_col, count(distinct int_col),
|
|
min(distinct smallint_col), max(distinct string_col)
|
|
from functional.alltypesagg group by 1
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), min(min(smallint_col)), max(max(string_col))
|
|
| group by: tinyint_col
|
|
|
|
|
01:AGGREGATE
|
|
| output: min(smallint_col), max(string_col)
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
---- DISTRIBUTEDPLAN
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), min(min(smallint_col)), max(max(string_col))
|
|
| group by: tinyint_col
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| output: min(min(smallint_col)), max(max(string_col))
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| output: min(smallint_col), max(string_col)
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
====
|
|
# aggregate fns with and without distinct
|
|
select tinyint_col, count(distinct int_col), count(*), sum(distinct int_col),
|
|
sum(int_col), min(smallint_col), max(bigint_col)
|
|
from functional.alltypesagg group by 1
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|
|
| group by: tinyint_col
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
---- DISTRIBUTEDPLAN
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|
|
| group by: tinyint_col
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| output: sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
|
|
| group by: tinyint_col, int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=10/10 size=743.67KB
|
|
====
|
|
# test join on inline views containing distinct aggregates to make sure
|
|
# the aggregation info reports the correct tuple ids (from the 2nd phase
|
|
# distinct aggregation) for the inline-view expression substitution
|
|
select t1.c, t2.c from
|
|
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
|
|
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
|
|
---- PLAN
|
|
06:HASH JOIN [INNER JOIN]
|
|
| hash predicates: count(int_col) = count(bigint_col)
|
|
|
|
|
|--05:AGGREGATE [MERGE FINALIZE]
|
|
| | output: count(bigint_col)
|
|
| |
|
|
| 04:AGGREGATE
|
|
| | group by: bigint_col
|
|
| |
|
|
| 03:SCAN HDFS [functional.alltypestiny]
|
|
| partitions=4/4 size=460B
|
|
|
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(int_col)
|
|
|
|
|
01:AGGREGATE
|
|
| group by: int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny]
|
|
partitions=4/4 size=460B
|
|
---- DISTRIBUTEDPLAN
|
|
06:HASH JOIN [INNER JOIN, BROADCAST]
|
|
| hash predicates: count(int_col) = count(bigint_col)
|
|
|
|
|
|--15:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
| |
|
|
| 14:AGGREGATE [MERGE FINALIZE]
|
|
| | output: sum(count(bigint_col))
|
|
| |
|
|
| 13:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
| |
|
|
| 05:AGGREGATE [MERGE]
|
|
| | output: count(bigint_col)
|
|
| |
|
|
| 12:AGGREGATE [MERGE]
|
|
| | group by: bigint_col
|
|
| |
|
|
| 11:EXCHANGE [PARTITION=HASH(bigint_col)]
|
|
| |
|
|
| 04:AGGREGATE
|
|
| | group by: bigint_col
|
|
| |
|
|
| 03:SCAN HDFS [functional.alltypestiny]
|
|
| partitions=4/4 size=460B
|
|
|
|
|
10:AGGREGATE [MERGE FINALIZE]
|
|
| output: sum(count(int_col))
|
|
|
|
|
09:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE]
|
|
| output: count(int_col)
|
|
|
|
|
08:AGGREGATE [MERGE]
|
|
| group by: int_col
|
|
|
|
|
07:EXCHANGE [PARTITION=HASH(int_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: int_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypestiny]
|
|
partitions=4/4 size=460B
|
|
====
|
|
# Test placement of having predicate into 2nd phase merge agg for
|
|
# distinct + non-distinct aggregates without group by (IMPALA-845).
|
|
# TODO: Fix the incorrect labels for non-distinct agg expr after the
|
|
# 1st phase merge. We'd need to create more smaps during analysis
|
|
# because there are more than two levels of merging for the
|
|
# non-distinct agg expr.
|
|
select count(distinct tinyint_col) from functional.alltypes
|
|
having count(bigint_col) > 0
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(tinyint_col), sum(count(bigint_col))
|
|
| having: zeroifnull(sum(count(bigint_col))) > 0
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(bigint_col)
|
|
| group by: tinyint_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
---- DISTRIBUTEDPLAN
|
|
06:AGGREGATE [MERGE FINALIZE]
|
|
| output: sum(count(tinyint_col)), sum(sum(count(bigint_col)))
|
|
| having: zeroifnull(sum(count(bigint_col))) > 0
|
|
|
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE]
|
|
| output: count(tinyint_col), sum(count(bigint_col))
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| output: sum(count(bigint_col))
|
|
| group by: tinyint_col
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(bigint_col)
|
|
| group by: tinyint_col
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
====
|
|
# test slot materialization on a distinct agg inside an inline view
|
|
# triggered by a predicate in an outer query block (IMPALA-861)
|
|
select 1 from
|
|
(select count(distinct 1) x from functional.alltypes) t
|
|
where t.x is not null
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(1)
|
|
| having: count(1) IS NOT NULL
|
|
|
|
|
01:AGGREGATE
|
|
| group by: 1
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
---- DISTRIBUTEDPLAN
|
|
06:AGGREGATE [MERGE FINALIZE]
|
|
| output: sum(count(1))
|
|
| having: count(1) IS NOT NULL
|
|
|
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE]
|
|
| output: count(1)
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| group by: 1
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(1)]
|
|
|
|
|
01:AGGREGATE
|
|
| group by: 1
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
====
|
|
# test slot materialization on distinct and non-distinct aggs inside an inline view
|
|
# triggered by a predicate in an outer query block (IMPALA-861)
|
|
select 1 from
|
|
(select count(distinct 1) x, count(1) y from functional.alltypes) t
|
|
where t.x + t.y > 10 and t.x > 0 and t.y > 1
|
|
---- PLAN
|
|
02:AGGREGATE [MERGE FINALIZE]
|
|
| output: count(1), sum(count(1))
|
|
| having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(1)
|
|
| group by: 1
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
---- DISTRIBUTEDPLAN
|
|
06:AGGREGATE [MERGE FINALIZE]
|
|
| output: sum(count(1)), sum(sum(count(1)))
|
|
| having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
|
|
|
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
|
|
|
|
02:AGGREGATE [MERGE]
|
|
| output: count(1), sum(count(1))
|
|
|
|
|
04:AGGREGATE [MERGE]
|
|
| output: sum(count(1))
|
|
| group by: 1
|
|
|
|
|
03:EXCHANGE [PARTITION=HASH(1)]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(1)
|
|
| group by: 1
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 size=478.45KB
|
|
====
|