Files
impala/tests/query_test/test_join_queries.py
xqhe dcdbaf1222 IMPALA-5022 part 1: Implement core functions of outer join simplification
Outer joins in SQL can return rows with certain columns filled with
NULLs when a match can not be found. However, such rows can be
rejected by null-rejecting predicates. The conditions in a null-rejecting
predicate that are always evaluated to FALSE for NULLs are referred to
as null-filtering conditions.

In general, an outer join can be converted to an inner join if there
exist null-filtering conditions on the inner tables. In a left outer
join, the right table is the inner table, while in a right outer join
it is the left table. In a full outer join, both tables are inner tables.

The option ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION enables or disables
the entire rewrite. This is False by default until we have done more
thorough functional testing.

For example,
1. A LEFT JOIN B ON A.id = B.id WHERE B.v > 10
= A INNER JOIN B ON A.id = B.id WHERE B.v > 10

2. A RIGHT JOIN B ON A.id = B.id WHERE A.v > 10
= A INNER JOIN B ON A.id = B.id WHERE A.v > 10

3. A FULL JOIN B ON A.id = B.id WHERE A.v > 10
= A LEFT JOIN B ON A.id = B.id WHERE A.v > 10

4. A FULL JOIN B ON A.id = B.id WHERE B.v > 10
= A RIGHT JOIN B ON A.id = B.id WHERE B.v > 10

5. A FULL JOIN B ON A.id = B.id WHERE A.v > 10 AND B.v > 10
= A INNER JOIN B ON A.id = B.id WHERE A.v > 10 AND B.v > 10

6. A LEFT JOIN B ON A.id = B.id INNER JOIN C ON B.id = C.id
= A INNER JOIN B ON A.id = B.id INNER JOIN C ON B.id = C.id

7. A RIGHT JOIN B ON A.id = B.id INNER JOIN C ON A.id = C.id
= A INNER JOIN B ON A.id = B.id INNER JOIN C ON A.id = C.id

8. A FULL JOIN B ON A.id = B.id INNER JOIN C ON A.id = C.id
= A LEFT JOIN B ON A.id = B.id INNER JOIN C ON A.id = C.id

9. A FULL JOIN B ON A.id = B.id INNER JOIN C ON B.id = C.id
= A RIGHT JOIN B ON A.id = B.id INNER JOIN C ON B.id = C.id

10. A FULL JOIN B ON A.id = B.id INNER JOIN C ON A.id + B.id = C.id
= A INNER JOIN B ON A.id = B.id INNER JOIN C ON A.id + B.id = C.id

In this commit, we have supported most of the cases that can convert
an outer join to an inner join, except for converting the embedding
inline view outer join by the join condition like
"SELECT * FROM T1 JOIN (SELECT T3.A A FROM T2 LEFT JOIN T3
ON T3.B=T2.B) T4 ON T4.A=T1.A". We will support it in part 2.

Tests:
* Add new plan tests outer-to-inner-joins.test
* Add new query tests to verify the correctness on transformation
* Ran the full set of verifications in Impala Public Jenkins

Change-Id: Iaa7804033fac68e93f33c387dc68ef67f803e93e
Reviewed-on: http://gerrit.cloudera.org:8080/16266
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-09-22 22:19:41 +00:00

206 lines
9.0 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Targeted tests for Impala joins
#
import pytest
from copy import deepcopy
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import (
SkipIf,
SkipIfIsilon,
SkipIfLocal,
SkipIfS3,
SkipIfABFS,
SkipIfADLS)
from tests.common.test_vector import ImpalaTestDimension
class TestJoinQueries(ImpalaTestSuite):
BATCH_SIZES = [0, 1]
MT_DOP_VALUES = [0, 4]
# Additional values for exhaustive tests.
MT_DOP_VALUES_EXHAUSTIVE = [1]
ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = ['false', 'true']
@classmethod
def get_workload(cls):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestJoinQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
mt_dop_values = cls.MT_DOP_VALUES
if cls.exploration_strategy() == 'exhaustive':
mt_dop_values += cls.MT_DOP_VALUES_EXHAUSTIVE
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('mt_dop', *mt_dop_values))
# TODO: Look into splitting up join tests to accomodate hbase.
# Joins with hbase tables produce drastically different results.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format in ['parquet'])
if cls.exploration_strategy() != 'exhaustive':
# Cut down on execution time when not running in exhaustive mode.
cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1)
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('enable_outer_join_to_inner_transformation',
*TestJoinQueries.ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION))
def test_basic_joins(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
self.run_test_case('QueryTest/joins', new_vector)
def test_single_node_joins_with_limits_exhaustive(self, vector):
if self.exploration_strategy() != 'exhaustive': pytest.skip()
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['num_nodes'] = 1
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
del new_vector.get_value('exec_option')['batch_size'] # .test file sets batch_size
self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', new_vector)
@SkipIfS3.hbase
@SkipIfABFS.hbase
@SkipIfADLS.hbase
@SkipIfIsilon.hbase
@SkipIf.skip_hbase
@SkipIfLocal.hbase
def test_joins_against_hbase(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
self.run_test_case('QueryTest/joins-against-hbase', new_vector)
def test_outer_joins(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
self.run_test_case('QueryTest/outer-joins', new_vector)
def test_outer_to_inner_joins(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['enable_outer_join_to_inner_transformation']\
= vector.get_value('enable_outer_join_to_inner_transformation')
self.run_test_case('QueryTest/outer-to-inner-joins', new_vector)
def test_single_node_nested_loop_joins(self, vector):
# Test the execution of nested-loops joins for join types that can only be
# executed in a single node (right [outer|semi|anti] and full outer joins).
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['num_nodes'] = 1
self.run_test_case('QueryTest/single-node-nlj', new_vector)
def test_single_node_nested_loop_joins_exhaustive(self, vector):
if self.exploration_strategy() != 'exhaustive': pytest.skip()
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['num_nodes'] = 1
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
self.run_test_case('QueryTest/single-node-nlj-exhaustive', new_vector)
def test_empty_build_joins(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
self.run_test_case('QueryTest/empty-build-joins', new_vector)
class TestTPCHJoinQueries(ImpalaTestSuite):
# Uses the TPC-H dataset in order to have larger joins. Needed for example to test
# the repartitioning codepaths.
@classmethod
def get_workload(cls):
return 'tpch'
@classmethod
def add_test_dimensions(cls):
super(TestTPCHJoinQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format in ['parquet'])
if cls.exploration_strategy() != 'exhaustive':
# Cut down on execution time when not running in exhaustive mode.
cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1)
@classmethod
def teardown_class(cls):
cls.client.execute('set mem_limit = 0');
super(TestTPCHJoinQueries, cls).teardown_class()
def test_outer_joins(self, vector):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
self.run_test_case('tpch-outer-joins', new_vector)
class TestSemiJoinQueries(ImpalaTestSuite):
@classmethod
def get_workload(cls):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestSemiJoinQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
# Joins with hbase tables produce drastically different results.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format in ['parquet'])
if cls.exploration_strategy() != 'exhaustive':
# Cut down on execution time when not running in exhaustive mode.
cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1)
def __load_semi_join_tables(self, db_name):
# Create and load fresh test tables for semi/anti-join tests
fq_tbl_name_a = '%s.SemiJoinTblA' % db_name
self.client.execute('create table %s (a int, b int, c int)' % fq_tbl_name_a)
self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_a);
self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_a);
self.client.execute('insert into %s values(1,2,10)' % fq_tbl_name_a);
self.client.execute('insert into %s values(1,3,10)' % fq_tbl_name_a);
self.client.execute('insert into %s values(NULL,NULL,30)' % fq_tbl_name_a);
self.client.execute('insert into %s values(2,4,30)' % fq_tbl_name_a);
self.client.execute('insert into %s values(2,NULL,20)' % fq_tbl_name_a);
fq_tbl_name_b = '%s.SemiJoinTblB' % db_name
self.client.execute('create table %s (a int, b int, c int)' % fq_tbl_name_b)
self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_b);
self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_b);
self.client.execute('insert into %s values(1,2,5)' % fq_tbl_name_b);
self.client.execute('insert into %s values(1,NULL,10)' % fq_tbl_name_b);
self.client.execute('insert into %s values(2,10,NULL)' % fq_tbl_name_b);
self.client.execute('insert into %s values(3,NULL,NULL)' % fq_tbl_name_b);
self.client.execute('insert into %s values(3,NULL,50)' % fq_tbl_name_b);
def test_semi_joins(self, vector, unique_database):
new_vector = deepcopy(vector)
new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
self.__load_semi_join_tables(unique_database)
self.run_test_case('QueryTest/semi-joins', new_vector, unique_database)
@pytest.mark.execute_serially
def test_semi_joins_exhaustive(self, vector):
"""Expensive and memory-intensive semi-join tests."""
if self.exploration_strategy() != 'exhaustive': pytest.skip()
self.run_test_case('QueryTest/semi-joins-exhaustive', vector)