mirror of
https://github.com/apache/impala.git
synced 2026-02-03 09:00:39 -05:00
This enables parallel plans with the join build in a
separate fragment and fixes all of the ensuing fallout.
After this change, mt_dop plans with joins have separate
build fragments. There is still a 1:1 relationship between
join nodes and builders, so the builders are only accessed
by the join node's thread after it is handed off. This lets
us defer the work required to make PhjBuilder and NljBuilder
safe to be shared between nodes.
Planner changes:
* Combined the parallel and distributed planning code paths.
* Misc fixes to generate reasonable thrift structures in the
query exec requests, i.e. containing the right nodes.
* Fixes to resource calculations for the separate build plans.
** Calculate separate join/build resource consumption.
** Simplified the resource estimation by calculating resource
consumption for each fragment separately, and assuming that
all fragments hit their peak resource consumption at the
same time. IMPALA-9255 is the follow-on to make the resource
estimation more accurate.
Scheduler changes:
* Various fixes to handle multiple TPlanExecInfos correctly,
which are generated by the planner for the different cohorts.
* Add logic to colocate build fragments with parent fragments.
Runtime filter changes:
* Build sinks now produce runtime filters, which required
planner and coordinator fixes to handle.
DataSink changes:
* Close the input plan tree before calling FlushFinal() to release
resources. This depends on Send() not holding onto references
to input batches, which was true except for NljBuilder. This
invariant is documented.
Join builder changes:
* Add a common base class for PhjBuilder and NljBuilder with
functions to handle synchronisation with the join node.
* Close plan tree earlier in FragmentInstanceState::Exec()
so that peak resource requirements are lower.
* The NLJ always copies input batches, so that it can close
its input tree.
JoinNode changes:
* Join node blocks waiting for build-side to be ready,
then eventually signals that it's done, allowing the builder
to be cleaned up.
* NLJ and PHJ nodes handle both the integrated builder and
the external builder. There is a 1:1 relationship between
the node and the builder, so we don't deal with thread safety
yet.
* Buffer reservations are transferred between the builder and join
node when running with the separate builder. This is not really
necessary right now, since it is all single-threaded, but will
be important for the shared broadcast.
- The builder transfers memory for probe buffers to the join node
at the end of each build phase.
- At the end of each probe phase, the reservation needs to be handed
back to the builder (or released).
ExecSummary changes:
* The summary logic was modified to handle connecting fragments
via join builds. The logic is an extension of what was used
for exchanges.
Testing:
* Enable --unlock_mt_dop for end-to-end tests
* Migrate some tests to run as part of end-to-end tests instead of
custom cluster.
* Add mt_dop dimension to various end-to-end tests to provide
coverage of join queries, spill-to-disk and cancellation.
* Ran a single node TPC-H and TPC-DS stress test with mt_dop=0
and mt_dop=4.
Perf:
* Ran TPC-H scale factor 30 locally with mt_dop=0. No significant
change.
Change-Id: I4403c8e62d9c13854e7830602ee613f8efc80c58
Reviewed-on: http://gerrit.cloudera.org:8080/14859
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
195 lines
8.5 KiB
Python
195 lines
8.5 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Targeted tests for Impala joins
|
|
#
|
|
import pytest
|
|
from copy import deepcopy
|
|
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.skip import (
|
|
SkipIf,
|
|
SkipIfIsilon,
|
|
SkipIfLocal,
|
|
SkipIfS3,
|
|
SkipIfABFS,
|
|
SkipIfADLS)
|
|
from tests.common.test_vector import ImpalaTestDimension
|
|
|
|
class TestJoinQueries(ImpalaTestSuite):
  """Targeted join tests that run against the 'functional-query' workload.

  The test matrix gets a 'batch_size' and an 'mt_dop' dimension. Most tests copy
  those dimension values into the 'exec_option' dict of a deep-copied vector
  before running the corresponding .test file.
  """
  BATCH_SIZES = [0, 1]
  MT_DOP_VALUES = [0, 4]
  # Additional values for exhaustive tests.
  MT_DOP_VALUES_EXHAUSTIVE = [1]

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Add the 'batch_size' and 'mt_dop' dimensions and constrain the matrix.

    Only the parquet file format is kept. Outside of the 'exhaustive' exploration
    strategy, vectors with batch_size == 1 are dropped.
    """
    super(TestJoinQueries, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(
        ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))

    # Work on a copy: extending cls.MT_DOP_VALUES itself (e.g. via '+=') would
    # mutate the shared class-level list, so the exhaustive values would leak into
    # the constant and accumulate every time this method is called.
    mt_dop_values = list(cls.MT_DOP_VALUES)
    if cls.exploration_strategy() == 'exhaustive':
      mt_dop_values.extend(cls.MT_DOP_VALUES_EXHAUSTIVE)
    cls.ImpalaTestMatrix.add_dimension(
        ImpalaTestDimension('mt_dop', *mt_dop_values))

    # TODO: Look into splitting up join tests to accommodate hbase.
    # Joins with hbase tables produce drastically different results.
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format in ['parquet'])

    if cls.exploration_strategy() != 'exhaustive':
      # Cut down on execution time when not running in exhaustive mode.
      cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1)

  def test_basic_joins(self, vector):
    """Run QueryTest/joins with the vector's batch_size and mt_dop."""
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    self.run_test_case('QueryTest/joins', new_vector)

  def test_single_node_joins_with_limits_exhaustive(self, vector):
    """Run the single-node joins-with-limits test file; exhaustive mode only."""
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip()
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['num_nodes'] = 1
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    del new_vector.get_value('exec_option')['batch_size']  # .test file sets batch_size
    self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', new_vector)

  @SkipIfS3.hbase
  @SkipIfABFS.hbase
  @SkipIfADLS.hbase
  @SkipIfIsilon.hbase
  @SkipIf.skip_hbase
  @SkipIfLocal.hbase
  def test_joins_against_hbase(self, vector):
    """Run QueryTest/joins-against-hbase with the vector's batch_size and mt_dop."""
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    self.run_test_case('QueryTest/joins-against-hbase', new_vector)

  def test_outer_joins(self, vector):
    """Run QueryTest/outer-joins with the vector's batch_size and mt_dop."""
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    self.run_test_case('QueryTest/outer-joins', new_vector)

  def test_single_node_nested_loop_joins(self, vector):
    # Test the execution of nested-loops joins for join types that can only be
    # executed in a single node (right [outer|semi|anti] and full outer joins).
    # Only num_nodes is overridden here; batch_size and mt_dop are not copied
    # from the vector into the exec options.
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['num_nodes'] = 1
    self.run_test_case('QueryTest/single-node-nlj', new_vector)

  def test_single_node_nested_loop_joins_exhaustive(self, vector):
    """Run the exhaustive single-node NLJ test file; exhaustive mode only."""
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip()
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['num_nodes'] = 1
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    self.run_test_case('QueryTest/single-node-nlj-exhaustive', new_vector)

  def test_empty_build_joins(self, vector):
    """Run QueryTest/empty-build-joins with the vector's batch_size and mt_dop."""
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
    self.run_test_case('QueryTest/empty-build-joins', new_vector)
|
|
|
|
class TestTPCHJoinQueries(ImpalaTestSuite):
  """Join tests over the 'tpch' workload.

  Uses the TPC-H dataset in order to have larger joins. Needed for example to test
  the repartitioning codepaths.
  """

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    """Add the 'batch_size' dimension and restrict the matrix.

    Only the parquet file format is kept. Unless the exploration strategy is
    'exhaustive', vectors with batch_size == 1 are dropped as well.
    """
    super(TestTPCHJoinQueries, cls).add_test_dimensions()

    batch_size_dimension = ImpalaTestDimension(
        'batch_size', *TestJoinQueries.BATCH_SIZES)
    cls.ImpalaTestMatrix.add_dimension(batch_size_dimension)

    parquet_only = lambda v: v.get_value('table_format').file_format in ['parquet']
    cls.ImpalaTestMatrix.add_constraint(parquet_only)

    if cls.exploration_strategy() == 'exhaustive':
      return
    # Cut down on execution time when not running in exhaustive mode.
    skip_batch_size_one = lambda v: v.get_value('batch_size') != 1
    cls.ImpalaTestMatrix.add_constraint(skip_batch_size_one)

  @classmethod
  def teardown_class(cls):
    """Issue 'set mem_limit = 0' on the class client, then run the base teardown."""
    # NOTE(review): presumably undoes a mem_limit set by the .test file - confirm.
    reset_mem_limit = 'set mem_limit = 0'
    cls.client.execute(reset_mem_limit)
    super(TestTPCHJoinQueries, cls).teardown_class()

  def test_outer_joins(self, vector):
    """Run the tpch-outer-joins test file with the vector's batch_size."""
    batch_size = vector.get_value('batch_size')
    new_vector = deepcopy(vector)
    exec_option = new_vector.get_value('exec_option')
    exec_option['batch_size'] = batch_size
    self.run_test_case('tpch-outer-joins', new_vector)
|
|
|
|
class TestSemiJoinQueries(ImpalaTestSuite):
  """Semi-join and anti-join tests over the 'functional-query' workload."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Add the 'batch_size' dimension and restrict the matrix.

    Only the parquet file format is kept. Unless the exploration strategy is
    'exhaustive', vectors with batch_size == 1 are dropped as well.
    """
    super(TestSemiJoinQueries, cls).add_test_dimensions()

    batch_size_dimension = ImpalaTestDimension(
        'batch_size', *TestJoinQueries.BATCH_SIZES)
    cls.ImpalaTestMatrix.add_dimension(batch_size_dimension)

    # Joins with hbase tables produce drastically different results.
    parquet_only = lambda v: v.get_value('table_format').file_format in ['parquet']
    cls.ImpalaTestMatrix.add_constraint(parquet_only)

    if cls.exploration_strategy() == 'exhaustive':
      return
    # Cut down on execution time when not running in exhaustive mode.
    skip_batch_size_one = lambda v: v.get_value('batch_size') != 1
    cls.ImpalaTestMatrix.add_constraint(skip_batch_size_one)

  def __load_semi_join_tables(self, db_name):
    """Create SemiJoinTblA and SemiJoinTblB in 'db_name' and insert their rows.

    Each table is created and then filled one row per statement, table A first.
    """
    # Create and load fresh test tables for semi/anti-join tests
    create_stmt = 'create table %s (a int, b int, c int)'
    table_specs = (
        ('%s.SemiJoinTblA', (
            'insert into %s values(1,1,1)',
            'insert into %s values(1,1,10)',
            'insert into %s values(1,2,10)',
            'insert into %s values(1,3,10)',
            'insert into %s values(NULL,NULL,30)',
            'insert into %s values(2,4,30)',
            'insert into %s values(2,NULL,20)',
        )),
        ('%s.SemiJoinTblB', (
            'insert into %s values(1,1,1)',
            'insert into %s values(1,1,10)',
            'insert into %s values(1,2,5)',
            'insert into %s values(1,NULL,10)',
            'insert into %s values(2,10,NULL)',
            'insert into %s values(3,NULL,NULL)',
            'insert into %s values(3,NULL,50)',
        )),
    )
    for name_template, insert_stmts in table_specs:
      fq_tbl_name = name_template % db_name
      self.client.execute(create_stmt % fq_tbl_name)
      for insert_stmt in insert_stmts:
        self.client.execute(insert_stmt % fq_tbl_name)

  def test_semi_joins(self, vector, unique_database):
    """Load the semi-join tables into 'unique_database' and run QueryTest/semi-joins."""
    batch_size = vector.get_value('batch_size')
    new_vector = deepcopy(vector)
    exec_option = new_vector.get_value('exec_option')
    exec_option['batch_size'] = batch_size
    self.__load_semi_join_tables(unique_database)
    self.run_test_case('QueryTest/semi-joins', new_vector, unique_database)

  @pytest.mark.execute_serially
  def test_semi_joins_exhaustive(self, vector):
    """Expensive and memory-intensive semi-join tests."""
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip()
    self.run_test_case('QueryTest/semi-joins-exhaustive', vector)
|