IMPALA-111: COUNT(DISTINCT col) returns wrong results -- does not ignore NULLs.

This commit is contained in:
Alex Behm
2013-04-23 15:14:26 -07:00
committed by Henry Robinson
parent d02f37e60f
commit c9040aee22
5 changed files with 139 additions and 33 deletions

View File

@@ -354,6 +354,34 @@ public class AggregateInfo {
mergeAggInfo.mergeAggInfo = mergeAggInfo;
}
/**
* Creates an IF function call that returns NULL if any of the slots
* at indexes [firstIdx, lastIdx] return NULL.
* For example, the resulting IF function would like this for 3 slots:
* IF(IsNull(slot1), NULL, IF(IsNull(slot2), NULL, slot3))
* Returns null if firstIdx is greater than lastIdx.
* Returns a SlotRef to the last slot if there is only one slot in range.
*/
private Expr createCountDistinctAggExprParam(int firstIdx, int lastIdx,
ArrayList<SlotDescriptor> slots) {
if (firstIdx > lastIdx) return null;
Expr elseExpr = new SlotRef(slots.get(lastIdx));
if (firstIdx == lastIdx) return elseExpr;
for (int i = lastIdx - 1; i >= firstIdx; --i) {
ArrayList<Expr> ifArgs = Lists.newArrayList();
SlotRef slotRef = new SlotRef(slots.get(i));
// Build expr: IF(IsNull(slotRef), NULL, elseExpr)
Expr isNullPred = new IsNullPredicate(slotRef, false);
ifArgs.add(isNullPred);
ifArgs.add(new NullLiteral());
ifArgs.add(elseExpr);
elseExpr = new FunctionCallExpr("if", ifArgs);
}
return elseExpr;
}
/**
* Create the info for an aggregation node that computes the second phase of of
* DISTINCT aggregate functions.
@@ -382,8 +410,23 @@ public class AggregateInfo {
for (AggregateExpr inputExpr: distinctAggExprs) {
AggregateExpr aggExpr = null;
if (inputExpr.getOp() == AggregateExpr.Operator.COUNT) {
// COUNT(DISTINCT ...) -> COUNT(*)
aggExpr = new AggregateExpr(AggregateExpr.Operator.COUNT, true, false, null);
// COUNT(DISTINCT ...) ->
// COUNT(IF(IsNull(<agg slot 1>), NULL, IF(IsNull(<agg slot 2>), NULL, ...)))
// We need the nested IF to make sure that we do not count
// column-value combinations if any of the distinct columns are NULL.
// This behavior is consistent with MySQL.
Expr ifExpr = createCountDistinctAggExprParam(origGroupingExprs.size(),
origGroupingExprs.size() + inputExpr.getChildren().size() - 1,
inputDesc.getSlots());
Preconditions.checkNotNull(ifExpr);
try {
ifExpr.analyze(analyzer);
} catch (AnalysisException e) {
throw new InternalException("Failed to analyze 'IF' function " +
"in second phase count distinct aggregation.", e);
}
aggExpr = new AggregateExpr(AggregateExpr.Operator.COUNT, false, false,
Lists.newArrayList(ifExpr));
} else {
// SUM(DISTINCT <expr>) -> SUM(<last grouping slot>);
// (MIN(DISTINCT ...) and MAX(DISTINCT ...) have their DISTINCT turned

View File

@@ -97,7 +97,7 @@ from functional.testtbl
Plan Fragment 0
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(if(<slot 2> IS NULL, NULL, <slot 3>))
GROUP BY:
TUPLE IDS: 2
AGGREGATE
@@ -123,7 +123,7 @@ Plan Fragment 1
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(if(<slot 2> IS NULL, NULL, <slot 3>))
GROUP BY:
TUPLE IDS: 2
AGGREGATE
@@ -156,7 +156,7 @@ group by 1
Plan Fragment 0
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(if(<slot 4> IS NULL, NULL, <slot 5>))
GROUP BY: <slot 3>
TUPLE IDS: 2
AGGREGATE
@@ -178,7 +178,7 @@ Plan Fragment 1
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(if(<slot 4> IS NULL, NULL, <slot 5>))
GROUP BY: <slot 3>
TUPLE IDS: 2
AGGREGATE
@@ -231,7 +231,7 @@ group by 1
Plan Fragment 0
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), SUM(<slot 3>)
OUTPUT: COUNT(<slot 3>), SUM(<slot 3>)
GROUP BY: <slot 2>
TUPLE IDS: 2
AGGREGATE
@@ -253,7 +253,7 @@ Plan Fragment 1
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), SUM(<slot 3>)
OUTPUT: COUNT(<slot 3>), SUM(<slot 3>)
GROUP BY: <slot 2>
TUPLE IDS: 2
AGGREGATE
@@ -307,7 +307,7 @@ from functional.alltypesagg group by 1
Plan Fragment 0
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), MIN(<slot 6>), MAX(<slot 7>)
OUTPUT: COUNT(<slot 5>), MIN(<slot 6>), MAX(<slot 7>)
GROUP BY: <slot 4>
TUPLE IDS: 2
AGGREGATE
@@ -329,7 +329,7 @@ Plan Fragment 1
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), MIN(<slot 6>), MAX(<slot 7>)
OUTPUT: COUNT(<slot 5>), MIN(<slot 6>), MAX(<slot 7>)
GROUP BY: <slot 4>
TUPLE IDS: 2
AGGREGATE
@@ -382,7 +382,7 @@ from functional.alltypesagg group by 1
Plan Fragment 0
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
OUTPUT: COUNT(<slot 5>), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
GROUP BY: <slot 4>
TUPLE IDS: 2
AGGREGATE
@@ -404,7 +404,7 @@ Plan Fragment 1
UNPARTITIONED
AGGREGATE
OUTPUT: COUNT(*), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
OUTPUT: COUNT(<slot 5>), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
GROUP BY: <slot 4>
TUPLE IDS: 2
AGGREGATE

View File

@@ -540,7 +540,7 @@ Plan Fragment 0
LIMIT: 10
TUPLE IDS: 3
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(<slot 7>)
GROUP BY: <slot 6>
TUPLE IDS: 3
AGGREGATE
@@ -579,7 +579,7 @@ Plan Fragment 1
LIMIT: 10
TUPLE IDS: 3
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(<slot 7>)
GROUP BY: <slot 6>
TUPLE IDS: 3
AGGREGATE
@@ -2576,7 +2576,7 @@ Plan Fragment 0
LIMIT: 1000
TUPLE IDS: 3
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(<slot 11>)
GROUP BY: <slot 8>, <slot 9>, <slot 10>
TUPLE IDS: 3
AGGREGATE
@@ -2607,7 +2607,7 @@ Plan Fragment 1
LIMIT: 1000
TUPLE IDS: 3
AGGREGATE
OUTPUT: COUNT(*)
OUTPUT: COUNT(<slot 11>)
GROUP BY: <slot 8>, <slot 9>, <slot 10>
TUPLE IDS: 3
AGGREGATE

View File

@@ -839,15 +839,9 @@ bigint, NULL, NULL, NULL, double
====
---- QUERY
# Test ignored distinct in MIN and MAX with NULLs
select min(distinct NULL), max(distinct NULL) from alltypesagg
---- TYPES
NULL, NULL
---- RESULTS
NULL,NULL
---- QUERY
# TODO: Fix count(distinct null) to return 0 instead of 1
select count(distinct NULL) from alltypesagg
---- TYPES
bigint
---- RESULTS
1
====

View File

@@ -6,7 +6,7 @@ from alltypesagg
---- TYPES
bigint
---- RESULTS
100
90
====
---- QUERY
select count(distinct bool_col)
@@ -22,7 +22,7 @@ from alltypesagg
---- TYPES
bigint
---- RESULTS
1000
999
====
---- QUERY
select count(distinct float_col)
@@ -30,7 +30,7 @@ from alltypesagg
---- TYPES
bigint
---- RESULTS
1000
999
====
---- QUERY
select count(distinct double_col)
@@ -38,7 +38,7 @@ from alltypesagg
---- TYPES
bigint
---- RESULTS
1000
999
====
---- QUERY
select count(distinct string_col)
@@ -49,12 +49,12 @@ bigint
1000
====
---- QUERY
select count(distinct string_col)
select count(distinct date_string_col)
from alltypesagg
---- TYPES
bigint
---- RESULTS
1000
10
====
---- QUERY
select count(distinct timestamp_col)
@@ -80,7 +80,7 @@ tinyint, bigint, bigint
7,10,1000
8,10,1000
9,10,1000
NULL,10,900
NULL,9,900
====
---- QUERY
# count(distinct) w/ grouping and non-distinct count()
@@ -98,7 +98,7 @@ tinyint, bigint, bigint
7,100,1000
8,100,1000
9,100,1000
NULL,100,900
NULL,90,900
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
@@ -117,7 +117,7 @@ tinyint, bigint, bigint, bigint
7,10,520,1000
8,10,530,1000
9,10,540,1000
NULL,10,450,900
NULL,9,450,900
====
---- QUERY
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
@@ -137,7 +137,7 @@ tinyint, bigint, bigint, bigint, int, float
7,10,520,1000,7,1096.699951171875
8,10,530,1000,8,1097.800048828125
9,10,540,1000,9,1098.900024414062
NULL,10,450,900,10,1089
NULL,9,450,900,10,1089
====
---- QUERY
# count distinct order by the same agg expr
@@ -203,3 +203,72 @@ true,6
true,8
true,NULL
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct NULL) from alltypesagg
---- TYPES
bigint
---- RESULTS
0
====
---- QUERY
# test count distinct with nulls w/o groupby
select count(distinct int_col, NULL) from alltypesagg
---- TYPES
bigint
---- RESULTS
0
====
---- QUERY
# test count distinct with nulls and groupby
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====
---- QUERY
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====
---- QUERY
# make sure we still return the NULL group even though NULLs are not added to COUNT
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
order by tinyint_col limit 10
---- TYPES
tinyint, bigint
---- RESULTS
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0
NULL,0
====