mirror of
https://github.com/apache/impala.git
synced 2026-02-01 21:00:29 -05:00
Currently, the entire row is materialized before filtering during a scan. Instead of paying the cost of materializing upfront, for columnar formats we can avoid doing it for rows that are filtered out. Only the columns required for filtering need to be materialized before filtering. For the rest of the columns, materialization can be delayed and done only for rows that survive. This patch implements this technique for the Parquet format only. A new configuration 'parquet_materialization_threshold' is introduced, which is the minimum number of consecutive rows that must be filtered out to avoid materialization. If set to less than 0, it disables late materialization. Performance: Performance was measured for a single-daemon, single-threaded impalad on the TPCH scale 42 lineitem table with 252 million rows, unsorted data. Up to 2.5x improvement was seen for non-page-indexed queries and up to 4x improvement with page indexes. Queries for page index borrowed from blog: https://blog.cloudera.com/speeding-up-select-queries-with-parquet-page-indexes/ More details: https://docs.google.com/spreadsheets/d/17s5OLaFOPo-64kimAPP6n3kJA42vM-iVT24OvsQgfuA/edit?usp=sharing Testing: 1. Ran existing tests 2. Added UT for 'ScratchTupleBatch::GetMicroBatch' 3. Added end-to-end test for late materialization. Change-Id: I46406c913297d5bbbec3ccae62a83bb214ed2c60 Reviewed-on: http://gerrit.cloudera.org:8080/17860 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Qifan Chen <qchen@cloudera.com>
46 lines
1.6 KiB
Plaintext
46 lines
1.6 KiB
Plaintext
# This tests pages skipped by parquet late materialization.
|
|
====
|
|
---- QUERY
|
|
# Test for late materialization on page indexes
# Selects every column but filters on a single l_orderkey value. The profile
# check below requires the NumPagesSkippedByLateMaterialization counter,
# summed across the profile, to be greater than zero.
|
|
select * from tpch_parquet.lineitem where l_orderkey=3209632;
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
|
|
====
|
|
---- QUERY
|
|
# Test for late materialization on non-page index
# Selects every column but filters with a LIKE pattern on l_comment instead of
# an equality on l_orderkey. The profile check below requires the summed
# NumPagesSkippedByLateMaterialization counter to be greater than zero.
|
|
select * from tpch_parquet.lineitem
|
|
where l_comment like '%unusual courts. blithely final theodolit%';
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
|
|
====
|
|
---- QUERY
|
|
# Test late materialization on runtime filters.
# Joins lineitem to orders on the order key; the only predicates are on orders
# columns, so lineitem has no predicate of its own in the query text.
# The profile check first requires a line matching
# "1 of 1 Runtime Filter Published" and then requires the summed
# NumPagesSkippedByLateMaterialization counter to be greater than zero.
# NOTE(review): the SET options presumably ensure the filter arrives before the
# lineitem scan starts (5000 ms wait) -- confirm against the query option docs.
|
|
SET RUNTIME_FILTER_MODE=GLOBAL;
|
|
SET RUNTIME_FILTER_WAIT_TIME_MS=5000;
|
|
select * from tpch_parquet.lineitem l
|
|
join tpch_parquet.orders o on l.l_orderkey = o.o_orderkey
|
|
where o_orderdate='1992-06-22' and o_totalprice = 153827.26;
|
|
---- RUNTIME_PROFILE
|
|
row_regex: .*1 of 1 Runtime Filter Published.*
|
|
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
|
|
====
|
|
---- QUERY
|
|
# Test late materialization on min/max runtime filters.
# Same join shape as the runtime filter test, but ENABLED_RUNTIME_FILTER_TYPES
# is set to MIN_MAX and MINMAX_FILTERING_LEVEL to ROW.
# The profile check requires a line describing a min_max filter (RF00x) on
# l_orderkey, and the summed NumPagesSkippedByLateMaterialization counter to be
# greater than zero.
# NOTE(review): MINMAX_FILTER_THRESHOLD=0.5 presumably loosens the planner's
# criteria for using the min/max filter -- confirm against the option docs.
|
|
SET RUNTIME_FILTER_WAIT_TIME_MS=5000;
|
|
SET MINMAX_FILTERING_LEVEL=ROW;
|
|
SET ENABLED_RUNTIME_FILTER_TYPES=MIN_MAX;
|
|
SET MINMAX_FILTER_THRESHOLD=0.5;
|
|
select * from tpch_parquet.lineitem l
|
|
join tpch_parquet.orders o on l.l_orderkey = o.o_orderkey
|
|
where o_orderdate='1996-12-01' and o_totalprice >= 250000;
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.* RF00.\[min_max\] -. .\.l_orderkey.*
|
|
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
|
|
====
|
|
---- QUERY
|
|
# Test that late materialization on nested columns is disabled.
# Unlike the other tests in this file, the profile check here requires the
# summed NumPagesSkippedByLateMaterialization counter to be exactly 0.
# NOTE(review): 'COMEDY' is presumably not a value present in c_mktsegment, so
# every row would be filtered out -- confirm against the test data.
|
|
select * from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY';
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumPagesSkippedByLateMaterialization): 0
|
|
====
|