mirror of
https://github.com/apache/impala.git
synced 2025-12-23 21:08:39 -05:00
IMPALA-111: COUNT(DISTINCT col) returns wrong results -- does not ignore NULLs.
This commit is contained in:
committed by
Henry Robinson
parent
d02f37e60f
commit
c9040aee22
@@ -354,6 +354,34 @@ public class AggregateInfo {
|
||||
mergeAggInfo.mergeAggInfo = mergeAggInfo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an IF function call that returns NULL if any of the slots
|
||||
* at indexes [firstIdx, lastIdx] return NULL.
|
||||
* For example, the resulting IF function would like this for 3 slots:
|
||||
* IF(IsNull(slot1), NULL, IF(IsNull(slot2), NULL, slot3))
|
||||
* Returns null if firstIdx is greater than lastIdx.
|
||||
* Returns a SlotRef to the last slot if there is only one slot in range.
|
||||
*/
|
||||
private Expr createCountDistinctAggExprParam(int firstIdx, int lastIdx,
|
||||
ArrayList<SlotDescriptor> slots) {
|
||||
if (firstIdx > lastIdx) return null;
|
||||
|
||||
Expr elseExpr = new SlotRef(slots.get(lastIdx));
|
||||
if (firstIdx == lastIdx) return elseExpr;
|
||||
|
||||
for (int i = lastIdx - 1; i >= firstIdx; --i) {
|
||||
ArrayList<Expr> ifArgs = Lists.newArrayList();
|
||||
SlotRef slotRef = new SlotRef(slots.get(i));
|
||||
// Build expr: IF(IsNull(slotRef), NULL, elseExpr)
|
||||
Expr isNullPred = new IsNullPredicate(slotRef, false);
|
||||
ifArgs.add(isNullPred);
|
||||
ifArgs.add(new NullLiteral());
|
||||
ifArgs.add(elseExpr);
|
||||
elseExpr = new FunctionCallExpr("if", ifArgs);
|
||||
}
|
||||
return elseExpr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create the info for an aggregation node that computes the second phase of of
|
||||
* DISTINCT aggregate functions.
|
||||
@@ -382,8 +410,23 @@ public class AggregateInfo {
|
||||
for (AggregateExpr inputExpr: distinctAggExprs) {
|
||||
AggregateExpr aggExpr = null;
|
||||
if (inputExpr.getOp() == AggregateExpr.Operator.COUNT) {
|
||||
// COUNT(DISTINCT ...) -> COUNT(*)
|
||||
aggExpr = new AggregateExpr(AggregateExpr.Operator.COUNT, true, false, null);
|
||||
// COUNT(DISTINCT ...) ->
|
||||
// COUNT(IF(IsNull(<agg slot 1>), NULL, IF(IsNull(<agg slot 2>), NULL, ...)))
|
||||
// We need the nested IF to make sure that we do not count
|
||||
// column-value combinations if any of the distinct columns are NULL.
|
||||
// This behavior is consistent with MySQL.
|
||||
Expr ifExpr = createCountDistinctAggExprParam(origGroupingExprs.size(),
|
||||
origGroupingExprs.size() + inputExpr.getChildren().size() - 1,
|
||||
inputDesc.getSlots());
|
||||
Preconditions.checkNotNull(ifExpr);
|
||||
try {
|
||||
ifExpr.analyze(analyzer);
|
||||
} catch (AnalysisException e) {
|
||||
throw new InternalException("Failed to analyze 'IF' function " +
|
||||
"in second phase count distinct aggregation.", e);
|
||||
}
|
||||
aggExpr = new AggregateExpr(AggregateExpr.Operator.COUNT, false, false,
|
||||
Lists.newArrayList(ifExpr));
|
||||
} else {
|
||||
// SUM(DISTINCT <expr>) -> SUM(<last grouping slot>);
|
||||
// (MIN(DISTINCT ...) and MAX(DISTINCT ...) have their DISTINCT turned
|
||||
|
||||
@@ -97,7 +97,7 @@ from functional.testtbl
|
||||
Plan Fragment 0
|
||||
UNPARTITIONED
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(if(<slot 2> IS NULL, NULL, <slot 3>))
|
||||
GROUP BY:
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -123,7 +123,7 @@ Plan Fragment 1
|
||||
UNPARTITIONED
|
||||
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(if(<slot 2> IS NULL, NULL, <slot 3>))
|
||||
GROUP BY:
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -156,7 +156,7 @@ group by 1
|
||||
Plan Fragment 0
|
||||
UNPARTITIONED
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(if(<slot 4> IS NULL, NULL, <slot 5>))
|
||||
GROUP BY: <slot 3>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -178,7 +178,7 @@ Plan Fragment 1
|
||||
UNPARTITIONED
|
||||
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(if(<slot 4> IS NULL, NULL, <slot 5>))
|
||||
GROUP BY: <slot 3>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -231,7 +231,7 @@ group by 1
|
||||
Plan Fragment 0
|
||||
UNPARTITIONED
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), SUM(<slot 3>)
|
||||
OUTPUT: COUNT(<slot 3>), SUM(<slot 3>)
|
||||
GROUP BY: <slot 2>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -253,7 +253,7 @@ Plan Fragment 1
|
||||
UNPARTITIONED
|
||||
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), SUM(<slot 3>)
|
||||
OUTPUT: COUNT(<slot 3>), SUM(<slot 3>)
|
||||
GROUP BY: <slot 2>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -307,7 +307,7 @@ from functional.alltypesagg group by 1
|
||||
Plan Fragment 0
|
||||
UNPARTITIONED
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), MIN(<slot 6>), MAX(<slot 7>)
|
||||
OUTPUT: COUNT(<slot 5>), MIN(<slot 6>), MAX(<slot 7>)
|
||||
GROUP BY: <slot 4>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -329,7 +329,7 @@ Plan Fragment 1
|
||||
UNPARTITIONED
|
||||
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), MIN(<slot 6>), MAX(<slot 7>)
|
||||
OUTPUT: COUNT(<slot 5>), MIN(<slot 6>), MAX(<slot 7>)
|
||||
GROUP BY: <slot 4>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -382,7 +382,7 @@ from functional.alltypesagg group by 1
|
||||
Plan Fragment 0
|
||||
UNPARTITIONED
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
|
||||
OUTPUT: COUNT(<slot 5>), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
|
||||
GROUP BY: <slot 4>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
@@ -404,7 +404,7 @@ Plan Fragment 1
|
||||
UNPARTITIONED
|
||||
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
|
||||
OUTPUT: COUNT(<slot 5>), SUM(<slot 5>), SUM(<slot 6>), SUM(<slot 7>), MIN(<slot 8>), MAX(<slot 9>)
|
||||
GROUP BY: <slot 4>
|
||||
TUPLE IDS: 2
|
||||
AGGREGATE
|
||||
|
||||
@@ -540,7 +540,7 @@ Plan Fragment 0
|
||||
LIMIT: 10
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(<slot 7>)
|
||||
GROUP BY: <slot 6>
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
@@ -579,7 +579,7 @@ Plan Fragment 1
|
||||
LIMIT: 10
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(<slot 7>)
|
||||
GROUP BY: <slot 6>
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
@@ -2576,7 +2576,7 @@ Plan Fragment 0
|
||||
LIMIT: 1000
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(<slot 11>)
|
||||
GROUP BY: <slot 8>, <slot 9>, <slot 10>
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
@@ -2607,7 +2607,7 @@ Plan Fragment 1
|
||||
LIMIT: 1000
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
OUTPUT: COUNT(*)
|
||||
OUTPUT: COUNT(<slot 11>)
|
||||
GROUP BY: <slot 8>, <slot 9>, <slot 10>
|
||||
TUPLE IDS: 3
|
||||
AGGREGATE
|
||||
|
||||
@@ -839,15 +839,9 @@ bigint, NULL, NULL, NULL, double
|
||||
====
|
||||
---- QUERY
|
||||
# Test ignored distinct in MIN and MAX with NULLs
|
||||
select min(distinct NULL), max(distinct NULL) from alltypesagg
|
||||
---- TYPES
|
||||
NULL, NULL
|
||||
---- RESULTS
|
||||
NULL,NULL
|
||||
---- QUERY
|
||||
# TODO: Fix count(distinct null) to return 0 instead of 1
|
||||
select count(distinct NULL) from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
1
|
||||
====
|
||||
|
||||
@@ -6,7 +6,7 @@ from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
100
|
||||
90
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct bool_col)
|
||||
@@ -22,7 +22,7 @@ from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
1000
|
||||
999
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct float_col)
|
||||
@@ -30,7 +30,7 @@ from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
1000
|
||||
999
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct double_col)
|
||||
@@ -38,7 +38,7 @@ from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
1000
|
||||
999
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct string_col)
|
||||
@@ -49,12 +49,12 @@ bigint
|
||||
1000
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct string_col)
|
||||
select count(distinct date_string_col)
|
||||
from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
1000
|
||||
10
|
||||
====
|
||||
---- QUERY
|
||||
select count(distinct timestamp_col)
|
||||
@@ -80,7 +80,7 @@ tinyint, bigint, bigint
|
||||
7,10,1000
|
||||
8,10,1000
|
||||
9,10,1000
|
||||
NULL,10,900
|
||||
NULL,9,900
|
||||
====
|
||||
---- QUERY
|
||||
# count(distinct) w/ grouping and non-distinct count()
|
||||
@@ -98,7 +98,7 @@ tinyint, bigint, bigint
|
||||
7,100,1000
|
||||
8,100,1000
|
||||
9,100,1000
|
||||
NULL,100,900
|
||||
NULL,90,900
|
||||
====
|
||||
---- QUERY
|
||||
# count(distinct) and sum(distinct) w/ grouping and non-distinct count()
|
||||
@@ -117,7 +117,7 @@ tinyint, bigint, bigint, bigint
|
||||
7,10,520,1000
|
||||
8,10,530,1000
|
||||
9,10,540,1000
|
||||
NULL,10,450,900
|
||||
NULL,9,450,900
|
||||
====
|
||||
---- QUERY
|
||||
# count(distinct) and sum(distinct) w/ grouping; distinct in min() and max()
|
||||
@@ -137,7 +137,7 @@ tinyint, bigint, bigint, bigint, int, float
|
||||
7,10,520,1000,7,1096.699951171875
|
||||
8,10,530,1000,8,1097.800048828125
|
||||
9,10,540,1000,9,1098.900024414062
|
||||
NULL,10,450,900,10,1089
|
||||
NULL,9,450,900,10,1089
|
||||
====
|
||||
---- QUERY
|
||||
# count distinct order by the same agg expr
|
||||
@@ -203,3 +203,72 @@ true,6
|
||||
true,8
|
||||
true,NULL
|
||||
====
|
||||
---- QUERY
|
||||
# test count distinct with nulls w/o groupby
|
||||
select count(distinct NULL) from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
0
|
||||
====
|
||||
---- QUERY
|
||||
# test count distinct with nulls w/o groupby
|
||||
select count(distinct int_col, NULL) from alltypesagg
|
||||
---- TYPES
|
||||
bigint
|
||||
---- RESULTS
|
||||
0
|
||||
====
|
||||
---- QUERY
|
||||
# test count distinct with nulls and groupby
|
||||
select tinyint_col, count(distinct NULL) from alltypesagg group by tinyint_col
|
||||
order by tinyint_col limit 10
|
||||
---- TYPES
|
||||
tinyint, bigint
|
||||
---- RESULTS
|
||||
1,0
|
||||
2,0
|
||||
3,0
|
||||
4,0
|
||||
5,0
|
||||
6,0
|
||||
7,0
|
||||
8,0
|
||||
9,0
|
||||
NULL,0
|
||||
====
|
||||
---- QUERY
|
||||
select tinyint_col, count(distinct int_col, NULL) from alltypesagg group by tinyint_col
|
||||
order by tinyint_col limit 10
|
||||
---- TYPES
|
||||
tinyint, bigint
|
||||
---- RESULTS
|
||||
1,0
|
||||
2,0
|
||||
3,0
|
||||
4,0
|
||||
5,0
|
||||
6,0
|
||||
7,0
|
||||
8,0
|
||||
9,0
|
||||
NULL,0
|
||||
====
|
||||
---- QUERY
|
||||
# make sure we still return the NULL group even though NULLs are not added to COUNT
|
||||
select tinyint_col, count(distinct tinyint_col, NULL) from alltypesagg group by tinyint_col
|
||||
order by tinyint_col limit 10
|
||||
---- TYPES
|
||||
tinyint, bigint
|
||||
---- RESULTS
|
||||
1,0
|
||||
2,0
|
||||
3,0
|
||||
4,0
|
||||
5,0
|
||||
6,0
|
||||
7,0
|
||||
8,0
|
||||
9,0
|
||||
NULL,0
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user