mirror of
https://github.com/apache/impala.git
synced 2026-01-30 15:00:18 -05:00
Defined the BloomFilter class as a wrapper of kudu::BlockBloomFilter. impala::BloomFilter builds runtime bloom filters through the kudu::BlockBloomFilter APIs with FastHash as the default hash algorithm. Removed the duplicated functions from the impala::BloomFilter class. Pushed down bloom filters to Kudu through the Kudu client API. Added a new query option ENABLED_RUNTIME_FILTER_TYPES to set the enabled runtime filter types, which only affects the Kudu scan node for now. By default, the bloom filter is not enabled; only the min-max filter is enabled for Kudu. With this option, the user can enable bloom filters, min-max filters, or both bloom and min-max runtime filters. Added new test cases in PlannerTest and the end-to-end runtime_filters test for pushing down bloom filters to Kudu. Added test cases to compare the number of rows returned from the Kudu scan when applying different types of runtime filters to the same queries. Updated bloom-filter-benchmark due to the bloom-filter implementation change. Bumped Kudu version to d652cab17. Testing: - Passed all exhaustive tests. Performance benchmark: - Ran single_node_perf_run.py on TPC-H at scale 30 for Parquet and Kudu. Verified that the new hash function and bloom-filter implementation don't cause regressions for HDFS bloom filters. For Kudu, there is one regression for query TPCH-Q9 and there are improvements for about 8 queries when applying both bloom and min-max filters. The bloom filter reduces the number of rows returned from the Kudu scan, hence reducing the cost of aggregation and hash join. But bloom filter evaluation adds extra cost to the Kudu scan, which offsets the gain on aggregation and join. The Kudu scan needs to be optimized for bloom filters in follow-up tasks. - Ran bloom-filter microbenchmarks and verified that there is no regression for the Insert/Find/Union functions with or without AVX2 due to the bloom-filter implementation changes. There is a small performance degradation for the Init function, but this function is not in the hot path.
Change-Id: I9100076f68ea299ddb6ec8bc027cac7a47f5d754 Reviewed-on: http://gerrit.cloudera.org:8080/15683 Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
166 lines
7.7 KiB
Python
166 lines
7.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
from copy import deepcopy
|
|
|
|
from tests.common.environ import ImpalaTestClusterProperties
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.skip import SkipIfNotHdfsMinicluster
|
|
from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
|
|
create_kudu_dimension, create_parquet_dimension)
|
|
|
|
# Properties of the cluster the tests run against. The xfail markers on the test classes
# in this file consult it (via is_remote_cluster()) when the module is imported.
IMPALA_TEST_CLUSTER_PROPERTIES = ImpalaTestClusterProperties.get_instance()
|
|
|
|
# debug_action values used on every exploration strategy: either no debug action at
# all, or reservation increases denied with probability 1.0, i.e. the tests run with
# the minimal amount of spilling and with the absolute minimum memory requirement.
CORE_DEBUG_ACTION_DIMS = [
  None,
  '-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0',
]

# Additional debug_action values for the exhaustive strategy. Intermediate denial
# probabilities are used to try and exercise more interesting code paths.
EXHAUSTIVE_DEBUG_ACTION_DIMS = [
  '-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@0.1',
  '-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@0.5',
  '-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@0.9',
]
|
|
|
|
|
|
@pytest.mark.xfail(IMPALA_TEST_CLUSTER_PROPERTIES.is_remote_cluster(),
                   reason='Queries may not spill on larger clusters')
class TestSpillingDebugActionDimensions(ImpalaTestSuite):
  """Spilling tests that are run across the debug_action and mt_dop dimensions.

  The whole class is marked xfail when running against a remote cluster, because the
  queries may not spill there.
  """

  @classmethod
  def get_workload(self):
    """Return the name of the workload these tests belong to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Build the test matrix: Parquet 'tpch' tables crossed with exec options.

    The exec option dimension fixes default_spillable_buffer_size to 256k and varies
    debug_action and mt_dop (0 and 1). On 'exhaustive' the debug_action values are
    CORE_DEBUG_ACTION_DIMS plus EXHAUSTIVE_DEBUG_ACTION_DIMS, otherwise only
    CORE_DEBUG_ACTION_DIMS. A constraint then prunes the mt_dop=0 combinations.
    """
    super(TestSpillingDebugActionDimensions, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.clear_constraints()
    cls.ImpalaTestMatrix.add_dimension(create_parquet_dimension('tpch'))

    # Look the strategy up once; it drives both the dimension values and the
    # constraint chosen below.
    strategy = cls.exploration_strategy()
    debug_action_dims = CORE_DEBUG_ACTION_DIMS
    if strategy == 'exhaustive':
      debug_action_dims = CORE_DEBUG_ACTION_DIMS + EXHAUSTIVE_DEBUG_ACTION_DIMS
    # Tests are calibrated so that they can execute and spill with this page size.
    cls.ImpalaTestMatrix.add_dimension(
        create_exec_option_dimension_from_dict({
            'default_spillable_buffer_size': ['256k'],
            'debug_action': debug_action_dims,
            'mt_dop': [0, 1]}))

    # Pare down the combinations of mt_dop and debug_action that run to reduce test
    # time. The MT code path for joins is more complex, so focus testing there.
    if strategy == 'exhaustive':
      # mt_dop=1 runs every debug action; mt_dop=0 only runs the core ones.
      cls.ImpalaTestMatrix.add_constraint(
          lambda v: v.get_value('exec_option')['mt_dop'] == 1
          or v.get_value('exec_option')['debug_action'] in CORE_DEBUG_ACTION_DIMS)
    elif strategy == 'core':
      # mt_dop=1 runs every debug action; mt_dop=0 only runs without a debug action.
      cls.ImpalaTestMatrix.add_constraint(
          lambda v: v.get_value('exec_option')['mt_dop'] == 1
          or v.get_value('exec_option')['debug_action'] is None)

  def test_spilling(self, vector):
    """Run the QueryTest/spilling test file with the given test vector."""
    self.run_test_case('QueryTest/spilling', vector)

  def test_spilling_aggs(self, vector):
    """Run the QueryTest/spilling-aggs test file with the given test vector."""
    self.run_test_case('QueryTest/spilling-aggs', vector)

  def test_spilling_large_rows(self, vector, unique_database):
    """Test that we can process large rows in spilling operators, with or without
    spilling to disk"""
    self.run_test_case('QueryTest/spilling-large-rows', vector, unique_database)

  def test_spilling_naaj(self, vector):
    """Test spilling null-aware anti-joins"""
    self.run_test_case('QueryTest/spilling-naaj', vector)

  @SkipIfNotHdfsMinicluster.tuned_for_minicluster
  def test_spilling_regression_exhaustive(self, vector):
    """Regression tests for spilling. mem_limits tuned for 3-node minicluster.

    Skipped unless the exploration strategy is 'exhaustive'. The second test file is
    run with default_spillable_buffer_size removed from the exec options.
    """
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip("only run large sorts on exhaustive")
    self.run_test_case('QueryTest/spilling-regression-exhaustive', vector)
    # Work on a deep copy so that the caller's vector keeps its exec options intact.
    new_vector = deepcopy(vector)
    del new_vector.get_value('exec_option')['default_spillable_buffer_size']
    self.run_test_case(
        'QueryTest/spilling-regression-exhaustive-no-default-buffer-size', new_vector)
|
|
|
|
|
|
@pytest.mark.xfail(IMPALA_TEST_CLUSTER_PROPERTIES.is_remote_cluster(),
                   reason='Queries may not spill on larger clusters')
class TestSpillingNoDebugActionDimensions(ImpalaTestSuite):
  """Spilling tests that must not be combined with the debug_action dimension."""

  @classmethod
  def get_workload(self):
    """Return the name of the workload these tests belong to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Build the test matrix: Parquet 'tpch' tables, 64k buffers, mt_dop 0 and 4."""
    super(TestSpillingNoDebugActionDimensions, cls).add_test_dimensions()
    matrix = cls.ImpalaTestMatrix
    matrix.clear_constraints()
    matrix.add_dimension(create_parquet_dimension('tpch'))
    # The tests are calibrated to execute and spill with this page size.
    exec_options = {'default_spillable_buffer_size': ['64k'], 'mt_dop': [0, 4]}
    matrix.add_dimension(create_exec_option_dimension_from_dict(exec_options))

  def test_spilling_naaj_no_deny_reservation(self, vector):
    """Null-aware anti-join tests that need more than the minimum reservation.

    Because of that they will not reliably pass with the deny reservation debug
    action enabled.
    """
    self.run_test_case('QueryTest/spilling-naaj-no-deny-reservation', vector)

  def test_spilling_query_options(self, vector):
    """Check end-to-end that the spilling-related query options work.

    These tests rely on setting debug_action to alternative values via query options.
    """
    self.run_test_case('QueryTest/spilling-query-options', vector)

  def test_spilling_no_debug_action(self, vector):
    """Spilling tests that will not succeed under an arbitrary debug action.

    They either run with no debug action set or set their own debug action.
    """
    self.run_test_case('QueryTest/spilling-no-debug-action', vector)
|
|
|
|
|
|
@pytest.mark.xfail(IMPALA_TEST_CLUSTER_PROPERTIES.is_remote_cluster(),
                   reason='Queries may not spill on larger clusters')
class TestSpillingBroadcastJoins(ImpalaTestSuite):
  """Spilling tests aimed specifically at shared broadcast joins under mt_dop."""

  @classmethod
  def get_workload(self):
    """Return the name of the workload these tests belong to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Build the test matrix: Kudu 'tpch' tables, 256k buffers, mt_dop 3.

    The debug_action values are CORE_DEBUG_ACTION_DIMS, extended with
    EXHAUSTIVE_DEBUG_ACTION_DIMS when the exploration strategy is 'exhaustive'.
    """
    super(TestSpillingBroadcastJoins, cls).add_test_dimensions()
    matrix = cls.ImpalaTestMatrix
    matrix.clear_constraints()
    # Kudu has 9 input splits for lineitem whereas parquet only has 3, so Kudu
    # allows a higher effective dop.
    matrix.add_dimension(create_kudu_dimension('tpch'))
    if cls.exploration_strategy() == 'exhaustive':
      debug_actions = CORE_DEBUG_ACTION_DIMS + EXHAUSTIVE_DEBUG_ACTION_DIMS
    else:
      debug_actions = CORE_DEBUG_ACTION_DIMS
    # The tests are calibrated to execute and spill with this page size.
    exec_options = {
      'default_spillable_buffer_size': ['256k'],
      'debug_action': debug_actions,
      'mt_dop': [3],
    }
    matrix.add_dimension(create_exec_option_dimension_from_dict(exec_options))

  def test_spilling_broadcast_joins(self, vector):
    """Run the QueryTest/spilling-broadcast-joins test file with min-max filters only."""
    # Pushing a runtime bloom filter down to Kudu could reduce the number of probe
    # rows and therefore change the spilling behavior, so enable min-max filters only.
    self.execute_query("SET ENABLED_RUNTIME_FILTER_TYPES=MIN_MAX")
    self.run_test_case('QueryTest/spilling-broadcast-joins', vector)
|