impala/testdata/workloads/functional-planner/queries/PlannerTest/distinct.test

# distinct *
# The expected plans implement the distinct as a grouping aggregation over the
# columns the star expands to (id, name, zip), with no aggregate output exprs.
select distinct *
from functional.testtbl
---- PLAN
01:AGGREGATE [FINALIZE]
|  group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
03:AGGREGATE [MERGE FINALIZE]
|  group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
02:EXCHANGE [PARTITION=HASH(functional.testtbl.id,functional.testtbl.name,functional.testtbl.zip)]
|
01:AGGREGATE
|  group by: functional.testtbl.id, functional.testtbl.name, functional.testtbl.zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
====
# distinct w/ explicit select list
# Only the listed columns (id, zip) are expected as grouping exprs; in the
# distributed plan they are also the hash-partition exprs of the exchange
# between the pre-aggregation and the merge aggregation.
select distinct id, zip
from functional.testtbl
---- PLAN
01:AGGREGATE [FINALIZE]
|  group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
04:EXCHANGE [PARTITION=UNPARTITIONED]
|
03:AGGREGATE [MERGE FINALIZE]
|  group by: id, zip
|
02:EXCHANGE [PARTITION=HASH(id,zip)]
|
01:AGGREGATE
|  group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
====
# count(distinct) w/ multiple args and w/o grouping
# Expected as a two-phase aggregation: the 1st phase groups by (id, zip) to
# produce the distinct tuples, the 2nd phase counts them. The count arg is
# rewritten to if(id IS NULL, NULL, zip), which is NULL if either id or zip
# is NULL. In the distributed plan the 2nd phase runs on the hash-partitioned
# data and a final aggregation sums the partial counts after the
# unpartitioned exchange.
select count(distinct id, zip)
from functional.testtbl
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(if(id IS NULL, NULL, zip))
|
01:AGGREGATE
|  group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
|  output: sum(count(if(id IS NULL, NULL, zip)))
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
|  output: count(if(id IS NULL, NULL, zip))
|
04:AGGREGATE [MERGE]
|  group by: id, zip
|
03:EXCHANGE [PARTITION=HASH(id,zip)]
|
01:AGGREGATE
|  group by: id, zip
|
00:SCAN HDFS [functional.testtbl]
   partitions=1/1 size=0B
====
# count(distinct) w/ grouping
# The 1st phase groups by the grouping expr plus the distinct args
# (tinyint_col, int_col, bigint_col); the 2nd phase groups by tinyint_col only.
# The distributed plan hash-partitions on tinyint_col alone, and the 2nd phase
# (02) is the finalizing aggregation below the unpartitioned exchange.
select tinyint_col, count(distinct int_col, bigint_col)
from functional.alltypesagg
group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(if(int_col IS NULL, NULL, bigint_col))
|  group by: tinyint_col
|
01:AGGREGATE
|  group by: tinyint_col, int_col, bigint_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
|  output: count(if(int_col IS NULL, NULL, bigint_col))
|  group by: tinyint_col
|
04:AGGREGATE [MERGE]
|  group by: tinyint_col, int_col, bigint_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
|  group by: tinyint_col, int_col, bigint_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
====
# count(distinct) and sum(distinct) w/ grouping
# Both distinct aggregates are over int_col, and the expected plans compute
# them from a single 1st phase aggregation grouped by (tinyint_col, int_col).
select tinyint_col, count(distinct int_col), sum(distinct int_col)
from functional.alltypesagg
group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), sum(int_col)
|  group by: tinyint_col
|
01:AGGREGATE
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), sum(int_col)
|  group by: tinyint_col
|
04:AGGREGATE [MERGE]
|  group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
====
# count(distinct) w/ grouping combined with min(distinct) and max(distinct);
# distinct in min() and max() is ignored: the expected plans evaluate them like
# non-distinct aggregates (computed in the 1st phase, merged in the 2nd phase)
# and only int_col is added to the 1st phase grouping exprs.
select tinyint_col, count(distinct int_col),
min(distinct smallint_col), max(distinct string_col)
from functional.alltypesagg group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), min(min(smallint_col)), max(max(string_col))
|  group by: tinyint_col
|
01:AGGREGATE
|  output: min(smallint_col), max(string_col)
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), min(min(smallint_col)), max(max(string_col))
|  group by: tinyint_col
|
04:AGGREGATE [MERGE]
|  output: min(min(smallint_col)), max(max(string_col))
|  group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
|  output: min(smallint_col), max(string_col)
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
====
# aggregate fns with and without distinct
# The non-distinct aggregates (count(*), sum, min, max) are expected in the
# 1st phase, grouped by (tinyint_col, int_col), and are merged in the 2nd phase
# (e.g. sum(count(*))), where the distinct aggregates over int_col are computed.
select tinyint_col, count(distinct int_col), count(*), sum(distinct int_col),
sum(int_col), min(smallint_col), max(bigint_col)
from functional.alltypesagg group by 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|  group by: tinyint_col
|
01:AGGREGATE
|  output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
---- DISTRIBUTEDPLAN
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col), sum(int_col), sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|  group by: tinyint_col
|
04:AGGREGATE [MERGE]
|  output: sum(count(*)), sum(sum(int_col)), min(min(smallint_col)), max(max(bigint_col))
|  group by: tinyint_col, int_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
|  output: count(*), sum(int_col), min(smallint_col), max(bigint_col)
|  group by: tinyint_col, int_col
|
00:SCAN HDFS [functional.alltypesagg]
   partitions=10/10 size=743.67KB
====
# test join on inline views containing distinct aggregates to make sure
# the aggregation info reports the correct tuple ids (from the 2nd phase
# distinct aggregation) for the inline-view expression substitution.
# The hash predicate is expected to compare the 2nd phase outputs of the two
# views: count(int_col) = count(bigint_col).
select t1.c, t2.c from
(select count(distinct int_col) as c from functional.alltypestiny) t1 inner join
(select count(distinct bigint_col) as c from functional.alltypestiny) t2 on (t1.c = t2.c)
---- PLAN
06:HASH JOIN [INNER JOIN]
|  hash predicates: count(int_col) = count(bigint_col)
|
|--05:AGGREGATE [MERGE FINALIZE]
|  |  output: count(bigint_col)
|  |
|  04:AGGREGATE
|  |  group by: bigint_col
|  |
|  03:SCAN HDFS [functional.alltypestiny]
|     partitions=4/4 size=460B
|
02:AGGREGATE [MERGE FINALIZE]
|  output: count(int_col)
|
01:AGGREGATE
|  group by: int_col
|
00:SCAN HDFS [functional.alltypestiny]
   partitions=4/4 size=460B
---- DISTRIBUTEDPLAN
06:HASH JOIN [INNER JOIN, BROADCAST]
|  hash predicates: count(int_col) = count(bigint_col)
|
|--15:EXCHANGE [PARTITION=UNPARTITIONED]
|  |
|  14:AGGREGATE [MERGE FINALIZE]
|  |  output: sum(count(bigint_col))
|  |
|  13:EXCHANGE [PARTITION=UNPARTITIONED]
|  |
|  05:AGGREGATE [MERGE]
|  |  output: count(bigint_col)
|  |
|  12:AGGREGATE [MERGE]
|  |  group by: bigint_col
|  |
|  11:EXCHANGE [PARTITION=HASH(bigint_col)]
|  |
|  04:AGGREGATE
|  |  group by: bigint_col
|  |
|  03:SCAN HDFS [functional.alltypestiny]
|     partitions=4/4 size=460B
|
10:AGGREGATE [MERGE FINALIZE]
|  output: sum(count(int_col))
|
09:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
|  output: count(int_col)
|
08:AGGREGATE [MERGE]
|  group by: int_col
|
07:EXCHANGE [PARTITION=HASH(int_col)]
|
01:AGGREGATE
|  group by: int_col
|
00:SCAN HDFS [functional.alltypestiny]
   partitions=4/4 size=460B
====
# Test placement of having predicate into 2nd phase merge agg for
# distinct + non-distinct aggregates without group by (IMPALA-845).
# The predicate is expected on the finalizing aggregation only (02 in the
# single-node plan, 06 in the distributed plan), with the merged count
# wrapped in zeroifnull().
# TODO: Fix the incorrect labels for non-distinct agg expr after the
# 1st phase merge. We'd need to create more smaps during analysis
# because there are more than two levels of merging for the
# non-distinct agg expr.
select count(distinct tinyint_col) from functional.alltypes
having count(bigint_col) > 0
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(tinyint_col), sum(count(bigint_col))
|  having: zeroifnull(sum(count(bigint_col))) > 0
|
01:AGGREGATE
|  output: count(bigint_col)
|  group by: tinyint_col
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
|  output: sum(count(tinyint_col)), sum(sum(count(bigint_col)))
|  having: zeroifnull(sum(count(bigint_col))) > 0
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
|  output: count(tinyint_col), sum(count(bigint_col))
|
04:AGGREGATE [MERGE]
|  output: sum(count(bigint_col))
|  group by: tinyint_col
|
03:EXCHANGE [PARTITION=HASH(tinyint_col)]
|
01:AGGREGATE
|  output: count(bigint_col)
|  group by: tinyint_col
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
====
# test slot materialization on a distinct agg inside an inline view
# triggered by a predicate in an outer query block (IMPALA-861)
# The outer predicate on t.x is expected as a having predicate on the
# finalizing aggregation (count(1) IS NOT NULL).
select 1 from
  (select count(distinct 1) x from functional.alltypes) t
where t.x is not null
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(1)
|  having: count(1) IS NOT NULL
|
01:AGGREGATE
|  group by: 1
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
|  output: sum(count(1))
|  having: count(1) IS NOT NULL
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
|  output: count(1)
|
04:AGGREGATE [MERGE]
|  group by: 1
|
03:EXCHANGE [PARTITION=HASH(1)]
|
01:AGGREGATE
|  group by: 1
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
====
# test slot materialization on a distinct agg and a non-distinct agg inside an
# inline view triggered by predicates in an outer query block (IMPALA-861)
# All three outer conjuncts are expected as having predicates on the
# finalizing aggregation; references to the non-distinct count (t.y) show up
# as zeroifnull(sum(count(1))).
select 1 from
  (select count(distinct 1) x, count(1) y from functional.alltypes) t
where t.x + t.y > 10 and t.x > 0 and t.y > 1
---- PLAN
02:AGGREGATE [MERGE FINALIZE]
|  output: count(1), sum(count(1))
|  having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
01:AGGREGATE
|  output: count(1)
|  group by: 1
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
---- DISTRIBUTEDPLAN
06:AGGREGATE [MERGE FINALIZE]
|  output: sum(count(1)), sum(sum(count(1)))
|  having: count(1) > 0, count(1) + zeroifnull(sum(count(1))) > 10, zeroifnull(sum(count(1))) > 1
|
05:EXCHANGE [PARTITION=UNPARTITIONED]
|
02:AGGREGATE [MERGE]
|  output: count(1), sum(count(1))
|
04:AGGREGATE [MERGE]
|  output: sum(count(1))
|  group by: 1
|
03:EXCHANGE [PARTITION=HASH(1)]
|
01:AGGREGATE
|  output: count(1)
|  group by: 1
|
00:SCAN HDFS [functional.alltypes]
   partitions=24/24 size=478.45KB
====