mirror of
https://github.com/apache/impala.git
synced 2026-02-02 06:00:36 -05:00
This patch provides count(star) optimization for ORC scans, similar to the work done in IMPALA-5036 for Parquet scans. We use the stripes' num-rows statistics when computing the count star instead of materializing empty rows. The aggregate function is changed from a count to a special sum function initialized to 0. This count(star) optimization is disabled for full ACID tables because the scanner might need to read and validate the 'currentTransaction' column in the table's special schema. This patch drops 'parquet' from names related to the count star optimization. It also improves the count(star) operation in general by serving the result just from the file's footer stats for both Parquet and ORC. We unify the optimized count star and zero slot scan functions into HdfsColumnarScanner. The following table shows a performance comparison before and after the patch. The primitive_count_star query targets the tpch10_parquet.lineitem table (10GB scale TPC-H). Meanwhile, the count_star_parq and count_star_orc queries are modified primitive_count_star queries that target the tpch_parquet.lineitem and tpch_orc_def.lineitem tables, respectively.
+-------------------+----------------------+-----------------------+--------+-------------+------------+------------+----------------+-------+----------------+---------+-------+
| Workload          | Query                | File Format           | Avg(s) | Base Avg(s) | Delta(Avg) | StdDev(%)  | Base StdDev(%) | Iters | Median Diff(%) | MW Zval | Tval  |
+-------------------+----------------------+-----------------------+--------+-------------+------------+------------+----------------+-------+----------------+---------+-------+
| tpch_parquet      | count_star_parq      | parquet / none / none | 0.06   | 0.07        | -10.45%    | 2.87%      | * 25.51% *     | 9     | -1.47%         | -1.26   | -1.22 |
| tpch_orc_def      | count_star_orc       | orc / def / none      | 0.06   | 0.08        | -22.37%    | 6.22%      | * 30.95% *     | 9     | -1.85%         | -1.16   | -2.14 |
| TARGETED-PERF(10) | primitive_count_star | parquet / none / none | 0.06   | 0.08        | I -30.40%  | 2.68%      | * 29.63% *     | 9     | I -7.20%       | -2.42   | -3.07 |
+-------------------+----------------------+-----------------------+--------+-------------+------------+------------+----------------+-------+----------------+---------+-------+

Testing:
- Add PlannerTest.testOrcStatsAgg
- Add TestAggregationQueries::test_orc_count_star_optimization
- Exercise count(star) in TestOrc::test_misaligned_orc_stripes
- Pass core tests

Change-Id: I0fafa1182f97323aeb9ee39dd4e8ecd418fa6091
Reviewed-on: http://gerrit.cloudera.org:8080/18327
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
153 lines
3.3 KiB
Plaintext
153 lines
3.3 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Tests the correctness of the ORC count(*) optimization.
|
|
select count(1)
|
|
from functional_orc_def.uncomp_src_alltypes
|
|
---- RESULTS
|
|
7300
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
# Tests the correctness of zero slot scan over ORC.
|
|
# 'NumFileMetadataRead' is not verified here because codegen and non-codegen runs
|
|
# yield different values.
|
|
select 1 from functional_orc_def.alltypestiny
|
|
---- RESULTS
|
|
1
|
|
1
|
|
1
|
|
1
|
|
1
|
|
1
|
|
1
|
|
1
|
|
---- TYPES
|
|
tinyint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
# ORC count(*) optimization with predicates on the partition columns.
|
|
select count(1)
|
|
from functional_orc_def.uncomp_src_alltypes where year < 2010 and month > 8
|
|
---- RESULTS
|
|
1220
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 4
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
# ORC count(*) optimization with group by partition columns.
|
|
select year, month, count(1)
|
|
from functional_orc_def.uncomp_src_alltypes group by year, month
|
|
---- RESULTS
|
|
2009,1,310
|
|
2009,2,280
|
|
2009,3,310
|
|
2009,4,300
|
|
2009,5,310
|
|
2009,6,300
|
|
2009,7,310
|
|
2009,8,310
|
|
2009,9,300
|
|
2009,10,310
|
|
2009,11,300
|
|
2009,12,310
|
|
2010,1,310
|
|
2010,2,280
|
|
2010,3,310
|
|
2010,4,300
|
|
2010,5,310
|
|
2010,6,300
|
|
2010,7,310
|
|
2010,8,310
|
|
2010,9,300
|
|
2010,10,310
|
|
2010,11,300
|
|
2010,12,310
|
|
---- TYPES
|
|
int, int, bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
# ORC count(*) optimization with both group by and predicates on partition columns.
|
|
select count(1)
|
|
from functional_orc_def.uncomp_src_alltypes where year < 2010 and month > 8
|
|
group by month
|
|
---- RESULTS
|
|
310
|
|
300
|
|
310
|
|
300
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 4
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
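# ORC count(*) optimization with the result going into a join.
# NOTE(review): the non-zero NumOrcStripes and RowsRead expected below presumably come
# from the regular (non-optimized) scan of 'x', which materializes columns - confirm.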
|
|
select x.bigint_col from functional_orc_def.uncomp_src_alltypes x
|
|
inner join (
|
|
select count(1) as a from functional_orc_def.uncomp_src_alltypes group by year
|
|
) t on x.id = t.a;
|
|
---- RESULTS
|
|
0
|
|
0
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
aggregation(SUM, RowsRead): 7300
|
|
=====
|
|
---- QUERY
|
|
# ORC count(*) optimization with the agg function in the having clause.
|
|
select 1 from functional_orc_def.uncomp_src_alltypes having count(*) > 1
|
|
---- RESULTS
|
|
1
|
|
---- TYPES
|
|
tinyint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
aggregation(SUM, RowsRead): 0
|
|
====
|
|
---- QUERY
|
|
# Verify that 0 is returned for count(*) on an empty table.
|
|
select count(1) from functional_orc_def.emptytable
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 0
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|
|
---- QUERY
|
|
# Verify that 0 is returned when all partitions are pruned.
|
|
select count(1) from functional_orc_def.uncomp_src_alltypes where year = -1
|
|
---- RESULTS
|
|
0
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumOrcStripes): 0
|
|
aggregation(SUM, NumFileMetadataRead): 0
|
|
aggregation(SUM, RowsRead): 0
|
|
=====
|