IMPALA-5036: Parquet count star optimization

Instead of materializing empty rows when computing count star, we use
the data stored in the Parquet RowGroup.num_rows field. The Parquet
scanner tuple is modified to have one slot into which we will write the
num rows statistic. The aggregate function is changed from count to a
special sum function that gets initialized to 0. We also add a rewrite
rule so that count(<literal>) is rewritten to count(*) in order to make
sure that this optimization is applied in all cases.

Testing:
- Added functional and planner tests

Change-Id: I536b85c014821296aed68a0c68faadae96005e62
Reviewed-on: http://gerrit.cloudera.org:8080/6812
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Impala Public Jenkins
This commit is contained in:
Taras Bobrovytsky
2017-04-06 13:51:39 -07:00
committed by Impala Public Jenkins
parent d7d6c03674
commit 57d7c614bc
27 changed files with 873 additions and 82 deletions

View File

@@ -1255,3 +1255,67 @@ where t2.int_col IN (t1.int_col_1, t1.int_col)
---- TYPES
TIMESTAMP,BIGINT
====
---- QUERY
# IMPALA-5036: Tests the correctness of the Parquet count(*) optimization on a scan
# with no predicates and no grouping. The query uses count(<literal>), which must
# return the same row count as count(*).
select count(1)
from functional_parquet.alltypes
---- RESULTS
7300
---- TYPES
bigint
====
---- QUERY
# IMPALA-5036: Parquet count(*) optimization with predicates on the partition columns.
# Only rows from partitions with year < 2010 and month > 8 may be counted.
select count(1)
from functional_parquet.alltypes where year < 2010 and month > 8
---- RESULTS
1220
---- TYPES
bigint
====
---- QUERY
# IMPALA-5036: Parquet count(*) optimization with group by partition columns.
# Expects one count per (year, month) group that passes the month > 10 filter.
select year, month, count(1)
from functional_parquet.alltypes where month > 10 group by year, month
---- RESULTS
2009,11,300
2009,12,310
2010,11,300
2010,12,310
---- TYPES
int, int, bigint
====
---- QUERY
# IMPALA-5036: Parquet count(*) optimization with both group by and predicates on
# partition columns. The grouping column is not in the select list, so only the
# per-month counts are returned.
select count(1)
from functional_parquet.alltypes where year < 2010 and month > 8
group by month
---- RESULTS
310
300
310
300
---- TYPES
bigint
====
---- QUERY
# IMPALA-5036: Parquet count(*) optimization with the result of the aggregation going
# into a join. The per-year counts produced by the inline view are used as join keys
# against functional.alltypes.id.
select x.bigint_col from functional.alltypes x
inner join (
select count(1) as a from functional_parquet.alltypes group by year
) t on x.id = t.a
---- RESULTS
0
0
---- TYPES
bigint
====
---- QUERY
# IMPALA-5036: Parquet count(*) optimization where the aggregate function appears only
# in the having clause and the select list is a constant.
select 1
from functional_parquet.alltypes
having count(*) > 1
---- RESULTS
1
---- TYPES
tinyint
====