mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
This patch enables late materialization for collections to avoid the cost of materializing collections that will never be accessed by the query. For a collection column, late materialization takes effect only when the collection column is not used in any predicate, including the `!empty()` predicate added by the planner. Otherwise, every row must be read to evaluate the predicate and none can be skipped. Therefore, this patch skips registering the `!empty()` predicates if the query contains zipping unnests. This can affect performance if the table contains many empty collections, but should be noticeable only in very extreme cases. The late materialization threshold is set to 1 in HdfsParquetScanner when there is any collection that can be skipped. This patch also adds the detail of `HdfsScanner::parse_status_` to the error message returned by the HdfsParquetScanner to help figure out the root cause. Performance: - Tests with queries involving collection columns in table `tpch_nested_parquet.customer` show that when the selectivity is low, the single-threaded (1 impalad and MT_DOP=1) scanning time can be reduced by about 50%, while when the selectivity is high, the scanning time is almost unchanged. - For queries not involving collections, performance A/B testing shows no regression on TPC-H. Testing: - Added a runtime profile counter NumTopLevelValuesSkipped to record the total number of top-level values skipped for all columns. The counter counts only values that are skipped individually, not those skipped as part of a whole page. - Added e2e test cases in test_parquet_late_materialization.py to ensure that late materialization works using the new counter. Change-Id: Ia21bdfa6811408d66d74367e0a9520e20951105f Reviewed-on: http://gerrit.cloudera.org:8080/22662 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
79 lines
2.7 KiB
Plaintext
79 lines
2.7 KiB
Plaintext
# This tests pages skipped by parquet late materialization.
|
|
====
|
|
---- QUERY
# Page-index case: an equality filter on l_orderkey is expected to let late
# materialization skip at least one page.
SELECT *
FROM tpch_parquet.lineitem
WHERE l_orderkey = 3209632;
---- RUNTIME_PROFILE
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
====
|
|
---- QUERY
# Non-page-index case: the filter is a substring LIKE on l_comment, and pages
# are still expected to be skipped by late materialization.
SELECT *
FROM tpch_parquet.lineitem
WHERE l_comment LIKE '%unusual courts. blithely final theodolit%';
---- RUNTIME_PROFILE
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
====
|
|
---- QUERY
# Late materialization driven by a runtime filter. The profile must report the
# single runtime filter as published and at least one page skipped.
SET RUNTIME_FILTER_MODE=GLOBAL;
SET RUNTIME_FILTER_WAIT_TIME_MS=5000;
SELECT *
FROM tpch_parquet.lineitem l
  INNER JOIN tpch_parquet.orders o
    ON l.l_orderkey = o.o_orderkey
WHERE o_orderdate = '1992-06-22'
  AND o_totalprice = 153827.26;
---- RUNTIME_PROFILE
row_regex: .*1 of 1 Runtime Filter Published.*
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
====
|
|
---- QUERY
# Late materialization driven by a min/max runtime filter. Only MIN_MAX filters
# are enabled, applied at ROW level; the profile must show a min_max filter on
# l_orderkey and at least one page skipped.
SET RUNTIME_FILTER_WAIT_TIME_MS=5000;
SET MINMAX_FILTERING_LEVEL=ROW;
SET ENABLED_RUNTIME_FILTER_TYPES=MIN_MAX;
SET MINMAX_FILTER_THRESHOLD=0.5;
SELECT *
FROM tpch_parquet.lineitem l
  INNER JOIN tpch_parquet.orders o
    ON l.l_orderkey = o.o_orderkey
WHERE o_orderdate = '1996-12-01'
  AND o_totalprice >= 250000;
---- RUNTIME_PROFILE
row_regex:.* RF00.\[min_max\] -. .\.l_orderkey.*
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
====
|
|
---- QUERY
# Single zipping unnest: with the filter on id, exactly 10 top-level values
# are expected to be skipped.
SELECT unnest(arr1)
FROM functional_parquet.complextypes_arrays
WHERE id = 2;
---- RUNTIME_PROFILE
aggregation(SUM, NumTopLevelValuesSkipped): 10
====
|
|
---- QUERY
# Two zipping unnests in one query: exactly 20 top-level values are expected
# to be skipped in total.
SELECT
  unnest(arr1),
  unnest(arr2)
FROM functional_parquet.complextypes_arrays
WHERE id = 2;
---- RUNTIME_PROFILE
aggregation(SUM, NumTopLevelValuesSkipped): 20
====
|
|
---- QUERY
# Late materialization for collections combined with page filtering.
# min(c_phone) in tpch_nested_parquet.customer is '10-100-106-1617'.
# Expected: no row group is dictionary-filtered, while both pages and
# top-level values are skipped.
SELECT count(o_orderkey) > 0
FROM tpch_nested_parquet.customer c
  LEFT OUTER JOIN c.c_orders
WHERE c_phone < '10-100-106-16170'
---- RESULTS
true
---- RUNTIME_PROFILE
aggregation(SUM, NumDictFilteredRowGroups): 0
aggregation(SUM, NumPagesSkippedByLateMaterialization)> 0
aggregation(SUM, NumTopLevelValuesSkipped)> 0
====
|
|
---- QUERY
# Checks that PARQUET_LATE_MATERIALIZATION_THRESHOLD is always 1 when any
# collection can be skipped. Page index reads are disabled for this query;
# exactly 4 top-level values are expected to be skipped.
SET parquet_read_page_index = false;
SET expand_complex_types = true;
SELECT int_array_array
FROM functional_parquet.complextypestbl
WHERE id % 2 = 0;
---- RUNTIME_PROFILE
aggregation(SUM, NumTopLevelValuesSkipped): 4
====
|