mirror of
https://github.com/apache/impala.git
synced 2026-01-06 06:01:03 -05:00
IMPALA-5036: Parquet count star optimization
Instead of materializing empty rows when computing count star, we use the data stored in the Parquet RowGroup.num_rows field. The Parquet scanner tuple is modified to have one slot into which we will write the num rows statistic. The aggregate function is changed from count to a special sum function that gets initialized to 0. We also add a rewrite rule so that count(<literal>) is rewritten to count(*) in order to make sure that this optimization is applied in all cases. Testing: - Added functional and planner tests Change-Id: I536b85c014821296aed68a0c68faadae96005e62 Reviewed-on: http://gerrit.cloudera.org:8080/6812 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: Impala Public Jenkins
This commit is contained in:
committed by
Impala Public Jenkins
parent
d7d6c03674
commit
57d7c614bc
@@ -1255,3 +1255,67 @@ where t2.int_col IN (t1.int_col_1, t1.int_col)
|
||||
---- TYPES
|
||||
TIMESTAMP,BIGINT
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Tests the correctness of the Parquet count(*) optimization.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes
|
||||
---- RESULTS
|
||||
7300
|
||||
---- TYPES
|
||||
bigint
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with predicates on the partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
---- RESULTS
|
||||
1220
|
||||
---- TYPES
|
||||
bigint
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with group by partition columns.
|
||||
select year, month, count(1)
|
||||
from functional_parquet.alltypes where month > 10 group by year, month
|
||||
---- RESULTS
|
||||
2009,11,300
|
||||
2009,12,310
|
||||
2010,11,300
|
||||
2010,12,310
|
||||
---- TYPES
|
||||
int, int, bigint
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with both group by and predicates on
|
||||
# partition columns.
|
||||
select count(1)
|
||||
from functional_parquet.alltypes where year < 2010 and month > 8
|
||||
group by month
|
||||
---- RESULTS
|
||||
310
|
||||
300
|
||||
310
|
||||
300
|
||||
---- TYPES
|
||||
bigint
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with the result going into a join.
|
||||
select x.bigint_col from functional.alltypes x
|
||||
inner join (
|
||||
select count(1) as a from functional_parquet.alltypes group by year
|
||||
) t on x.id = t.a;
|
||||
---- RESULTS
|
||||
0
|
||||
0
|
||||
---- TYPES
|
||||
bigint
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-5036: Parquet count(*) optimization with the agg function in the having clause.
|
||||
select 1 from functional_parquet.alltypes having count(*) > 1
|
||||
---- RESULTS
|
||||
1
|
||||
---- TYPES
|
||||
tinyint
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user