mirror of
https://github.com/apache/impala.git
synced 2026-01-03 06:00:52 -05:00
This patch implements min-max filters for runtime filters. Each runtime filter generates a bloom filter or a min-max filter, depending on whether it has HDFS or Kudu targets, respectively. In RuntimeFilterGenerator in the planner, each hash join node generates a bloom and min-max filter for each equi-join predicate, but only those filters that end up being assigned to a target make it into the final plan. Min-max filters are only assigned to Kudu scans if the target expr is a column, as Kudu doesn't support bounds on general exprs, and only if the join op is '=' and not 'is distinct from', as Kudu doesn't support returning NULLs if a bound is set. The PartitionedHashJoinBuilder inserts values into the min-max filters. Codegen is used to eliminate branching on the type of filter. String min-max filters truncate their bounds at 1024 chars, so that the max amount of memory used by min-max filters is negligible. For now, min-max filters are only applied at the KuduScanner, which passes them into the Kudu client. Future work will address applying min-max filters at HDFS scan nodes and applying bloom filters at Kudu scan nodes. Functional Testing: - Added new planner tests and updated the old ones. (In old tests, a lot of runtime filters are renumbered as we always generate min-max filters even if they don't end up getting assigned and they take up some of the RF ids.) - Updated existing runtime filter tests to work with Kudu. - Added e2e tests for min-max filter specific functionality. Perf Testing: - All tests run on Kudu stress cluster (10 nodes) and tpch_100_kudu, timings are averages of 3 runs. - Ran a contrived query with a filter that does not eliminate any rows (full self join of lineitem). The difference in running time was negligible - 24.46s with filters on, 24.15s with filters off for a ~1% slowdown. - Ran a contrived query with a filter that eliminates all rows (self join on lineitem with a join condition that never matches). 
The filters resulted in a significant speedup - 0.26s with filters on, 1.46s with filters off for a ~5.6x speedup. This query is added to targeted-perf. Change-Id: I02bad890f5b5f78388a3041bf38f89369b5e2f1c Reviewed-on: http://gerrit.cloudera.org:8080/7793 Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com> Tested-by: Impala Public Jenkins
67 lines
2.6 KiB
Plaintext
67 lines
2.6 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Explain a simple hash join query.
|
|
explain
|
|
select *
|
|
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
'Max Per-Host Resource Reservation: Memory=34.00MB'
|
|
'Per-Host Resource Estimates: Memory=476.41MB'
|
|
''
|
|
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
'Per-Host Resources: mem-estimate=0B mem-reservation=0B'
|
|
' PLAN-ROOT SINK'
|
|
' | mem-estimate=0B mem-reservation=0B'
|
|
' |'
|
|
' 04:EXCHANGE [UNPARTITIONED]'
|
|
' mem-estimate=0B mem-reservation=0B'
|
|
' tuple-ids=0,1 row-size=454B cardinality=5757710'
|
|
''
|
|
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
|
|
'Per-Host Resources: mem-estimate=388.41MB mem-reservation=34.00MB'
|
|
' DATASTREAM SINK [FRAGMENT=F02, EXCHANGE=04, UNPARTITIONED]'
|
|
' | mem-estimate=0B mem-reservation=0B'
|
|
' 02:HASH JOIN [INNER JOIN, BROADCAST]'
|
|
' | hash predicates: l_orderkey = o_orderkey'
|
|
' | fk/pk conjuncts: l_orderkey = o_orderkey'
|
|
' | runtime filters: RF000[bloom] <- o_orderkey'
|
|
' | mem-estimate=300.41MB mem-reservation=34.00MB spill-buffer=2.00MB'
|
|
' | tuple-ids=0,1 row-size=454B cardinality=5757710'
|
|
' |'
|
|
' |--03:EXCHANGE [BROADCAST]'
|
|
' | mem-estimate=0B mem-reservation=0B'
|
|
' | tuple-ids=1 row-size=191B cardinality=1500000'
|
|
' |'
|
|
' 00:SCAN HDFS [tpch.lineitem, RANDOM]'
|
|
row_regex:.*partitions=1/1 files=1 size=.*
|
|
' runtime filters: RF000[bloom] -> l_orderkey'
|
|
' stats-rows=6001215 extrapolated-rows=disabled'
|
|
' table stats: rows=6001215 size=718.94MB'
|
|
' column stats: all'
|
|
' mem-estimate=88.00MB mem-reservation=0B'
|
|
' tuple-ids=0 row-size=263B cardinality=6001215'
|
|
''
|
|
'F01:PLAN FRAGMENT [RANDOM] hosts=2 instances=2'
|
|
'Per-Host Resources: mem-estimate=88.00MB mem-reservation=0B'
|
|
' DATASTREAM SINK [FRAGMENT=F00, EXCHANGE=03, BROADCAST]'
|
|
' | mem-estimate=0B mem-reservation=0B'
|
|
' 01:SCAN HDFS [tpch.orders, RANDOM]'
|
|
row_regex:.*partitions=1/1 files=1 size=.*
|
|
' stats-rows=1500000 extrapolated-rows=disabled'
|
|
' table stats: rows=1500000 size=162.56MB'
|
|
' column stats: all'
|
|
' mem-estimate=88.00MB mem-reservation=0B'
|
|
' tuple-ids=1 row-size=191B cardinality=1500000'
|
|
====
|
|
---- QUERY
|
|
# Tests the warning about missing table stats in the explain header.
|
|
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
|
|
from functional_avro.alltypes t1
|
|
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
|
|
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
|
|
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
'WARNING: The following tables are missing relevant table and/or column statistics.'
|
|
'functional_avro.alltypes, functional_parquet.alltypessmall'
|
|
====
|