Files
impala/testdata/workloads/functional-planner/queries/PlannerTest/distinct.test
Srinath Shankar 74a975c45b IMPALA-862: count(x) may return null when a similar count(distinct x) is also used
count(x) with no distinct and no group-by expressions returns NULL on empty input
if other distinct aggs (e.g. COUNT(distinct x)) are present.
This happens because the COUNT is transformed to SUM(COUNT()),
with the inner COUNT being evaluated WITH a group-by expression (e.g. x).
SUM over empty input returns NULL, but COUNT should return 0.

This patch fixes this by replacing COUNT with zeroifnull(COUNT) before AggregateInfo
is generated if there are distinct aggs and no group-bys. The logic in AggregateInfo
itself has not been modified.

Change-Id: I902e3fdd95767135b2f3fe423e8802ef57366af1
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1921
Reviewed-by: Srinath Shankar <sshankar@cloudera.com>
Tested-by: jenkins
2014-03-14 23:35:55 -07:00

402 lines
10 KiB
Plaintext

# distinct *
select distinct *
from functional.testtbl
---- PLAN
01:AGGREGATE [FINALIZE]
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
03:AGGREGATE [MERGE FINALIZE]
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
02:EXCHANGE [PARTITION=HASH(functional.testtbl.id,functional.testtbl.name,functional.testtbl.zip)]
|
01:AGGREGATE
| group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
====
# distinct w/ explicit select list
select distinct id, zip
from functional.testtbl
---- PLAN
01:AGGREGATE [FINALIZE]
| group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
03:AGGREGATE [MERGE FINALIZE]
| group by: id, zip
|
02:EXCHANGE [PARTITION=HASH(id,zip)]
|
01:AGGREGATE
| group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
====
# count(distinct)
select count(distinct id, zip)
from functional.testtbl
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(if(id IS NULL, NULL, zip))
|
01:AGGREGATE
| group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
| output: sum(count(if(id IS NULL, NULL, zip)))
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
| output: count(if(id IS NULL, NULL, zip))
|
04:AGGREGATE [MERGE]
| group by: id, zip
|
03:EXCHANGE [PARTITION=HASH(id,zip)]
|
01:AGGREGATE
| group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
partitions=1/1 size=0B
====
# count(distinct) w/ grouping
select tinyint_col, count(distinct int_col, bigint_col)
from functional.alltypesagg
group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(if(int_col IS NULL, NULL, bigint_col))
| group by: tinyint_col
|
01:AGGREGATE
| group by: tinyint_col, int_col, bigint_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
| output: count(if(int_col IS NULL, NULL, bigint_col))
| group by: tinyint_col
|
04:AGGREGATE [MERGE]
| group by: tinyint_col, int_col, bigint_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
| group by: tinyint_col, int_col, bigint_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
====
# count(distinct) and sum(distinct) w/ grouping
select tinyint_col, count(distinct int_col), sum(distinct int_col)
from functional.alltypesagg
group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), sum(int_col)
| group by: tinyint_col
|
01:AGGREGATE
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), sum(int_col)
| group by: tinyint_col
|
04:AGGREGATE [MERGE]
| group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
====
# count(distinct) w/ grouping; distinct in min() and max() is ignored
select tinyint_col, count(distinct int_col),
min(distinct smallint_col), max(distinct string_col)
from functional.alltypesagg group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), min(min(smallint_col)), max(max(string_col))
| group by: tinyint_col
|
01:AGGREGATE
| output: min(smallint_col), max(string_col)
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), min(min(smallint_col)), max(max(string_col))
| group by: tinyint_col
|
04:AGGREGATE [MERGE]
| output: min(min(smallint_col)), max(max(string_col))
| group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
| output: min(smallint_col), max(string_col)
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
====
# aggregate fns with and without distinct
select tinyint_col, count(distinct int_col), count(*), sum(distinct int_col),
sum(int_col), min(smallint_col), max(bigint_col)
from functional.alltypesagg group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
| group by: tinyint_col
|
01:AGGREGATE
| output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
| group by: tinyint_col
|
04:AGGREGATE [MERGE]
| output: sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
| group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
| output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
| group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
partitions=10/10 size=743.67KB
====
# test join on inline views containing distinct aggregates to make sure
# the aggregation info reports the correct tuple ids (from the 2nd phase
# distinct aggregation) for the inline-view expression substitution
select t1.c, t2.c from
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
---- PLAN
06:HASH JOIN [INNER JOIN]
| hash predicates: count(int_col) = count(bigint_col)
|
|--05:AGGREGATE [MERGE FINALIZE]
| | output: count(bigint_col)
| |
| 04:AGGREGATE
| | group by: bigint_col
| |
| 03:SCAN HDFS [functional.alltypestiny]
| partitions=4/4 size=460B
|
02:AGGREGATE [MERGE FINALIZE]
| output: count(int_col)
|
01:AGGREGATE
| group by: int_col
|
00:SCAN HDFS [functional.alltypestiny]
partitions=4/4 size=460B
---- DISTRIBUTEDPLAN
06:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: count(int_col) = count(bigint_col)
|
|--15:EXCHANGE [PARTITION=UNPARTITIONED]
| |
| 14:AGGREGATE [MERGE FINALIZE]
| | output: sum(count(bigint_col))
| |
| 13:EXCHANGE [PARTITION=UNPARTITIONED]
| |
| 05:AGGREGATE [MERGE]
| | output: count(bigint_col)
| |
| 12:AGGREGATE [MERGE]
| | group by: bigint_col
| |
| 11:EXCHANGE [PARTITION=HASH(bigint_col)]
| |
| 04:AGGREGATE
| | group by: bigint_col
| |
| 03:SCAN HDFS [functional.alltypestiny]
| partitions=4/4 size=460B
|
10:AGGREGATE [MERGE FINALIZE]
| output: sum(count(int_col))
|
09:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
| output: count(int_col)
|
08:AGGREGATE [MERGE]
| group by: int_col
|
07:EXCHANGE [PARTITION=HASH(int_col)]
|
01:AGGREGATE
| group by: int_col
|
00:SCAN HDFS [functional.alltypestiny]
partitions=4/4 size=460B
====
# Test placement of having predicate into 2nd phase merge agg for
# distinct + non-distinct aggregates without group by (IMPALA-845).
# TODO: Fix the incorrect labels for non-distinct agg expr after the
# 1st phase merge. We'd need to create more smaps during analysis
# because there are more than two levels of merging for the
# non-distinct agg expr.
select count(distinct tinyint_col) from functional.alltypes
having count(bigint_col) > 0
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(tinyint_col), sum(count(bigint_col))
| having: zeroifnull(sum(count(bigint_col))) > 0
|
01:AGGREGATE
| output: count(bigint_col)
| group by: tinyint_col
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
| output: sum(count(tinyint_col)), sum(sum(count(bigint_col)))
| having: zeroifnull(sum(count(bigint_col))) > 0
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
| output: count(tinyint_col), sum(count(bigint_col))
|
04:AGGREGATE [MERGE]
| output: sum(count(bigint_col))
| group by: tinyint_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
| output: count(bigint_col)
| group by: tinyint_col
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
====
# test slot materialization on a distinct agg inside an inline view
# triggered by a predicate in an outer query block (IMPALA-861)
select 1 from
(select count(distinct 1) x from functional.alltypes) t
where t.x is not null
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(1)
| having: count(1) IS NOT NULL
|
01:AGGREGATE
| group by: 1
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
| output: sum(count(1))
| having: count(1) IS NOT NULL
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
| output: count(1)
|
04:AGGREGATE [MERGE]
| group by: 1
|
03:EXCHANGE [PARTITION=HASH(1)]
|
01:AGGREGATE
| group by: 1
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
====
# test slot materialization on a distinct agg combined with a non-distinct agg
# inside an inline view, triggered by multiple predicates in an outer query
# block (IMPALA-861)
select 1 from
(select count(distinct 1) x, count(1) y from functional.alltypes) t
where t.x + t.y > 10 and t.x > 0 and t.y > 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
| output: count(1), sum(count(1))
| having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
01:AGGREGATE
| output: count(1)
| group by: 1
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
| output: sum(count(1)), sum(sum(count(1)))
| having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
| output: count(1), sum(count(1))
|
04:AGGREGATE [MERGE]
| output: sum(count(1))
| group by: 1
|
03:EXCHANGE [PARTITION=HASH(1)]
|
01:AGGREGATE
| output: count(1)
| group by: 1
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 size=478.45KB
====