Files
impala/tests/metadata/test_explain.py
paul-rogers a7ea86b768 IMPALA-8021: Add estimated cardinality to EXPLAIN output
Cardinality is vital to understanding why a plan has the form it does,
yet the planner normally emits cardinality information only for the
detailed levels. Unfortunately, most query profiles we see are at the
standard level without this information (except in the summary table),
making it hard to understand what happened.

This patch adds cardinality to the standard EXPLAIN output. It also
changes the displayed cardinality value to be in abbreviated "metric"
form: 1.23K instead of 1234, etc.

Changing the EXPLAIN output has a huge impact on PlannerTest: all the
"golden" test files must change. To avoid doing this twice, this patch
also includes:

IMPALA-7919: Add predicates line in plan output for partition key
predicates

This is also the time to include:

IMPALA-8022: Add cardinality checks to PlannerTest

The comparison code was changed to allow a set of validators, one of
which compares cardinality to ensure it is within 5% of the expected
value. This should ensure we don't change estimates unintentionally.

While many planner tests are concerned with cardinality, many others are
not. Testing showed that the cardinality is actually unstable within
tests. For such tests, added filters to ignore cardinality. The filter
is enabled by default (for backward compatibility) but disabled (to
allow cardinality verification) for the critical tests.

Rebasing the tests was complicated by a bug in the error-matching code,
so this patch also fixes:

IMPALA-8023: Fix PlannerTest to handle error lines consistently

Now, the error output written to the output "save results" file matches
that expected in the "golden" file -- no more handling these specially.

Testing:

* Added cardinality verification.
* Reran all FE tests.
* Rebased all PlannerTest .test files.
* Adjusted the metadata/test_explain.py test to handle the changed
  EXPLAIN output.

Change-Id: Ie9aa2d715b04cbb279aaffec8c5692686562d986
Reviewed-on: http://gerrit.cloudera.org:8080/12136
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2019-01-12 04:03:26 +00:00

179 lines
8.4 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Functional tests running EXPLAIN statements.
#
import re
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfLocal, SkipIfNotHdfsMinicluster, SkipIfEC
from tests.util.filesystem_utils import WAREHOUSE
# Tests the different explain levels [0-3] on a few queries.
# TODO: Clean up this test to use an explain level test dimension and appropriate
# result sub-sections for the expected explain plans.
@SkipIfEC.fix_later
class TestExplain(ImpalaTestSuite):
    """Functional tests that run EXPLAIN at explain levels 0-3.

    Also checks the cardinality printed by EXPLAIN for partitioned tables with
    full, partial and table-level-only row-count statistics.
    """

    # Value for the num_scanner_threads query option to ensure that the memory
    # estimates of scan nodes are consistent even when run on machines with
    # different numbers of cores.
    NUM_SCANNER_THREADS = 1

    @classmethod
    def get_workload(cls):
        """Return the name of the workload these tests belong to."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Constrain the test matrix to uncompressed text with fixed exec options."""
        super(TestExplain, cls).add_test_dimensions()

        def is_supported(v):
            table_format = v.get_value('table_format')
            exec_option = v.get_value('exec_option')
            # 'disable_codegen' is deliberately compared with == False (not `not`)
            # so that exactly the same vectors are selected as before.
            return (table_format.file_format == 'text' and
                    table_format.compression_codec == 'none' and
                    exec_option['batch_size'] == 0 and
                    exec_option['disable_codegen'] == False and
                    exec_option['num_nodes'] != 1)

        cls.ImpalaTestMatrix.add_constraint(is_supported)

    def _run_explain_level(self, vector, level):
        """Run the 'QueryTest/explain-level<level>' test case at that explain level."""
        exec_option = vector.get_value('exec_option')
        exec_option['num_scanner_threads'] = self.NUM_SCANNER_THREADS
        exec_option['explain_level'] = level
        self.run_test_case('QueryTest/explain-level%d' % level, vector)

    @SkipIfNotHdfsMinicluster.plans
    def test_explain_level0(self, vector):
        """Check EXPLAIN output at explain_level=0."""
        self._run_explain_level(vector, 0)

    @SkipIfNotHdfsMinicluster.plans
    def test_explain_level1(self, vector):
        """Check EXPLAIN output at explain_level=1."""
        self._run_explain_level(vector, 1)

    @SkipIfNotHdfsMinicluster.plans
    def test_explain_level2(self, vector):
        """Check EXPLAIN output at explain_level=2."""
        self._run_explain_level(vector, 2)

    @SkipIfNotHdfsMinicluster.plans
    def test_explain_level3(self, vector):
        """Check EXPLAIN output at explain_level=3."""
        self._run_explain_level(vector, 3)

    def test_explain_validate_cardinality_estimates(self, vector, unique_database):
        """Check the cardinality printed by EXPLAIN for partitioned tables."""
        # Tests that the cardinality estimates are correct for partitioned tables.
        # TODO Cardinality estimation tests should eventually be part of the planner
        # tests.
        # TODO Remove this test
        db_name = 'functional'
        tbl_name = 'alltypes'
        cardinality_re = re.compile(r' row-size=\d+B cardinality=(.*)$')

        def explain(query):
            """Return the lines of the level-3 explain plan for 'query'."""
            result = self.execute_query(
                "explain " + query, query_options={'explain_level': 3})
            return result.data

        def check_cardinality(query_result, expected_cardinality):
            """Assert each cardinality found in the plan lines equals the expected one.

            'expected_cardinality' is the string exactly as EXPLAIN prints it,
            e.g. '7.30K'.
            """
            num_matches = 0
            for line in query_result:
                # search(), not match(): the pattern starts with a space and follows
                # other text on the line, so an anchored match against the stripped
                # line can never succeed and the check would be silently skipped.
                m = cardinality_re.search(line.strip())
                if m is None:
                    continue
                num_matches += 1
                assert m.group(1) == expected_cardinality, (
                    "expected cardinality %s in explain line: %s"
                    % (expected_cardinality, line))
            # Guard against the check passing vacuously.
            assert num_matches > 0, "no cardinality found in explain output"

        # All partitions are filtered out, cardinality should be 0.
        check_cardinality(
            explain("select * from %s.%s where year = 1900" % (db_name, tbl_name)), '0')

        # Half of the partitions are filtered out, cardinality should be 3650.
        check_cardinality(
            explain("select * from %s.%s where year = 2010" % (db_name, tbl_name)),
            '3.65K')

        # None of the partitions are filtered out, cardinality should be 7300.
        check_cardinality(
            explain("select * from %s.%s" % (db_name, tbl_name)), '7.30K')

        # Create a partitioned table with a mixed set of available stats.
        mixed_tbl = unique_database + ".t"
        self.execute_query(
            "create table %s (c int) partitioned by (p int)" % mixed_tbl)
        self.execute_query(
            "insert into table %s partition (p) values(1,1),(2,2),(3,3)" % mixed_tbl)

        # Set the number of rows at the table level.
        self.execute_query(
            "alter table %s set tblproperties('numRows'='100')" % mixed_tbl)

        # Should fall back to table-level cardinality when partitions lack stats.
        check_cardinality(explain("select * from %s" % mixed_tbl), '100')

        # Should fall back to table-level cardinality, even for a subset of
        # partitions.
        check_cardinality(explain("select * from %s where p = 1" % mixed_tbl), '100')

        # Set the number of rows for a single partition.
        self.execute_query(
            "alter table %s partition(p=1) set tblproperties('numRows'='50')"
            % mixed_tbl)

        # Use partition stats when available. Partitions without stats are ignored.
        check_cardinality(explain("select * from %s" % mixed_tbl), '50')

        # Fall back to table-level stats when no selected partitions have stats.
        check_cardinality(explain("select * from %s where p = 2" % mixed_tbl), '100')
class TestExplainEmptyPartition(ImpalaTestSuite):
    """Checks EXPLAIN's missing-stats warning for partitions with files but no rows."""

    TEST_DB_NAME = "imp_1708"

    def setup_method(self, method):
        """Drop any leftover test database, then create it under WAREHOUSE."""
        self.cleanup_db(self.TEST_DB_NAME)
        create_db_stmt = "create database if not exists {0} location '{1}/{0}.db'"
        self.execute_query(create_db_stmt.format(self.TEST_DB_NAME, WAREHOUSE))

    def teardown_method(self, method):
        """Drop the test database."""
        self.cleanup_db(self.TEST_DB_NAME)

    @SkipIfLocal.hdfs_client
    def test_non_empty_partition_0_rows(self):
        """Regression test for IMPALA-1708.

        If a partition has 0 rows but > 0 files after COMPUTE STATS, the user must
        not be warned about missing stats. The files are probably corrupted, or
        used for something else.
        """
        db = self.TEST_DB_NAME
        missing_stats_warning = "missing relevant table and/or column statistics"

        self.client.execute("SET EXPLAIN_LEVEL=3")
        self.client.execute("CREATE TABLE %s.empty_partition (col int) "
                            "partitioned by (p int)" % db)
        self.client.execute(
            "ALTER TABLE %s.empty_partition ADD PARTITION (p=NULL)" % db)

        # Put an empty file in the partition so we have > 0 files, but 0 rows
        empty_file = (
            "test-warehouse/%s.db/empty_partition/p=__HIVE_DEFAULT_PARTITION__/empty"
            % db)
        self.filesystem_client.create_file(empty_file, "")
        self.client.execute("REFRESH %s.empty_partition" % db)
        self.client.execute("COMPUTE STATS %s.empty_partition" % db)

        partitions = self.client.execute("SHOW PARTITIONS %s.empty_partition" % db)
        assert "NULL\t0\t1" in str(partitions)

        plan = self.client.execute("EXPLAIN SELECT * FROM %s.empty_partition" % db)
        assert missing_stats_warning not in str(plan)

        # Now add a partition with some data (so it gets selected into the scan), to
        # check that its lack of stats is correctly identified
        self.client.execute(
            "ALTER TABLE %s.empty_partition ADD PARTITION (p=1)" % db)
        data_file = "test-warehouse/%s.db/empty_partition/p=1/rows" % db
        self.filesystem_client.create_file(data_file, "1")
        self.client.execute("REFRESH %s.empty_partition" % db)

        plan = self.client.execute("EXPLAIN SELECT * FROM %s.empty_partition" % db)
        explain_result = str(plan)
        assert missing_stats_warning in explain_result
        # Also test IMPALA-1530 - adding the number of partitions missing stats
        assert "partitions: 1/2 " in explain_result