mirror of
https://github.com/apache/impala.git
synced 2026-01-05 21:00:54 -05:00
IMPALA-5036: Parquet count star optimization
Instead of materializing empty rows when computing count star, we use the data stored in the Parquet RowGroup.num_rows field. The Parquet scanner tuple is modified to have one slot into which we will write the num rows statistic. The aggregate function is changed from count to a special sum function that gets initialized to 0.

We also add a rewrite rule so that count(<literal>) is rewritten to count(*) in order to make sure that this optimization is applied in all cases.

Testing:
- Added functional and planner tests

Change-Id: I536b85c014821296aed68a0c68faadae96005e62
Reviewed-on: http://gerrit.cloudera.org:8080/6812
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Impala Public Jenkins
This commit is contained in:
committed by
Impala Public Jenkins
parent
d7d6c03674
commit
57d7c614bc
@@ -1255,3 +1255,67 @@ where t2.int_col IN (t1.int_col_1, t1.int_col)
|
||||
---- TYPES
|
||||
TIMESTAMP,BIGINT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Tests the correctness of the Parquet count(*) optimization.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes
|
||||
---- RESULTS
|
||||
7300
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with predicates on the partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
---- RESULTS
|
||||
1220
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with group by partition columns.
|
||||
select year, month, count(1)
|
||||
from functional_parquet.alltypes where month > 10 group by year, month
|
||||
---- RESULTS
|
||||
2009,11,300
|
||||
2009,12,310
|
||||
2010,11,300
|
||||
2010,12,310
|
||||
---- TYPES
|
||||
int, int, bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with both group by and predicates on
|
||||
# partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
group by month
|
||||
---- RESULTS
|
||||
310
|
||||
300
|
||||
310
|
||||
300
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with the result going into a join.
|
||||
select x.bigint_col from functional.alltypes x
|
||||
inner join (
|
||||
select count(1) as a from functional_parquet.alltypes group by year
|
||||
) t on x.id = t.a;
|
||||
---- RESULTS
|
||||
0
|
||||
0
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with the agg function in the having clause.
|
||||
select 1 from functional_parquet.alltypes having count(*) > 1
|
||||
---- RESULTS
|
||||
1
|
||||
---- TYPES
|
||||
tinyint
|
||||
====
|
||||
|
||||
117
testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
vendored
Normal file
117
testdata/workloads/functional-query/queries/QueryTest/parquet-stats-agg.test
vendored
Normal file
@@ -0,0 +1,117 @@
|
||||
====
|
||||
---- QUERY
|
||||
# Tests the correctness of the Parquet count(*) optimization.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes
|
||||
---- RESULTS
|
||||
7300
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Parquet count(*) optimization with predicates on the partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
---- RESULTS
|
||||
1220
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Parquet count(*) optimization with group by partition columns.
|
||||
select year, month, count(1)
|
||||
from functional_parquet.alltypes group by year, month
|
||||
---- RESULTS
|
||||
2009,1,310
|
||||
2009,2,280
|
||||
2009,3,310
|
||||
2009,4,300
|
||||
2009,5,310
|
||||
2009,6,300
|
||||
2009,7,310
|
||||
2009,8,310
|
||||
2009,9,300
|
||||
2009,10,310
|
||||
2009,11,300
|
||||
2009,12,310
|
||||
2010,1,310
|
||||
2010,2,280
|
||||
2010,3,310
|
||||
2010,4,300
|
||||
2010,5,310
|
||||
2010,6,300
|
||||
2010,7,310
|
||||
2010,8,310
|
||||
2010,9,300
|
||||
2010,10,310
|
||||
2010,11,300
|
||||
2010,12,310
|
||||
---- TYPES
|
||||
int, int, bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Parquet count(*) optimization with both group by and predicates on partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
group by month
|
||||
---- RESULTS
|
||||
310
|
||||
300
|
||||
310
|
||||
300
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Parquet count(*) optimization with the result going into a join.
|
||||
select x.bigint_col from functional.alltypes x
|
||||
inner join (
|
||||
select count(1) as a from functional_parquet.alltypes group by year
|
||||
) t on x.id = t.a;
|
||||
---- RESULTS
|
||||
0
|
||||
0
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Parquet count(*) optimization with the agg function in the having clause.
|
||||
select 1 from functional_parquet.alltypes having count(*) > 1
|
||||
---- RESULTS
|
||||
1
|
||||
---- TYPES
|
||||
tinyint
|
||||
====
|
||||
---- QUERY
|
||||
# Verify that 0 is returned for count(*) on an empty table.
|
||||
select count(1) from functional_parquet.emptytable
|
||||
---- RESULTS
|
||||
0
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Verify that 0 is returned when all partitions are pruned.
|
||||
select count(1) from functional_parquet.alltypes where year = -1
|
||||
---- RESULTS
|
||||
0
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
---- QUERY
|
||||
# Test different row group size combinations.
|
||||
select count(*) from functional_parquet.lineitem_multiblock
|
||||
union all
|
||||
select count(*) from functional_parquet.lineitem_multiblock_one_row_group
|
||||
union all
|
||||
select count(*) from functional_parquet.lineitem_sixblocks
|
||||
union all
|
||||
select count(*) from tpch_parquet.lineitem
|
||||
---- RESULTS
|
||||
20000
|
||||
40000
|
||||
40000
|
||||
6001215
|
||||
---- TYPES
|
||||
bigint
|
||||
=====
|
||||
Reference in New Issue
Block a user