Mirror of https://github.com/apache/impala.git, synced 2026-02-02 06:00:36 -05:00
This work addresses a limitation in computing the total row count for a Hive table in a scan. The row count could be computed as 0 even though the Hive table contained data; this is stats corruption at the table level. Similar stats corruption exists for a partition. The row count of a table or a partition can also be -1, which indicates missing stats.

With the fix, as long as no partition in a Hive table exhibits missing or corrupt stats, the total row count for the table is computed from the row counts of all partitions. Otherwise, Impala looks at the table-level stats, in particular the table row count. If the table stats are also missing or corrupt, Impala estimates a row count for the table, if feasible. This row count is the sum of the row counts from the partitions with good stats, plus an estimate of the number of rows in the partitions with missing or corrupt stats. The same estimation applies when some partitions have corrupt stats.

One way to observe the fix is through the EXPLAIN output of queries that scan Hive tables with missing or corrupt stats. The cardinality of any full scan should be a positive value (the estimated row count) instead of 'unavailable'. At the beginning of the EXPLAIN output, the table is still listed in the WARNING section for potentially corrupt table statistics.

Testing:
1. Ran unit tests with queries documented in the case against Hive tables
   with the following configurations:
   a. No stats corruption in any partition
   b. Stats corruption in some partitions
   c. Stats corruption in all partitions
2. Added two new tests in test_compute_stats.py:
   a. test_corrupted_stats_in_partitioned_Hive_tables
   b. test_corrupted_stats_in_unpartitioned_Hive_tables
3. Fixed failures in corrupt-stats.test
4. Ran "core" tests

Change-Id: I9f4c64616ff7c0b6d5a48f2b5331325feeff3576
Reviewed-on: http://gerrit.cloudera.org:8080/16098
Reviewed-by: Sahil Takiar <stakiar@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
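The fallback order described above can be sketched in a few lines of Python. This is a minimal illustration of the described logic, not Impala's planner code: the (num_rows, size_bytes) partition representation, the helper names, and the fixed est_bytes_per_row divisor are all hypothetical stand-ins for however Impala actually derives row counts from file sizes.

# Minimal sketch of the row-count fallback described above. All names and
# the est_bytes_per_row constant are hypothetical, not Impala internals.
def estimate_table_row_count(partitions, table_num_rows, est_bytes_per_row=100):
  """partitions: list of (num_rows, size_bytes) pairs, where num_rows is -1
  when stats are missing and 0 with size_bytes > 0 when stats are corrupt."""
  def has_good_stats(rows, size):
    # Stats are usable unless missing (-1) or corrupt (0 rows reported
    # for a partition that clearly holds data).
    return rows > 0 or (rows == 0 and size == 0)

  if all(has_good_stats(r, s) for r, s in partitions):
    # No partition has missing or corrupt stats: sum the partition counts.
    return sum(r for r, _ in partitions)
  if table_num_rows > 0:
    # Otherwise fall back to the table-level row count, if usable.
    return table_num_rows
  # Table stats are missing or corrupt too: add a size-based estimate for
  # the bad partitions to the row counts of the good ones.
  known = sum(r for r, s in partitions if has_good_stats(r, s))
  estimated = sum(max(1, s // est_bytes_per_row)
                  for r, s in partitions if not has_good_stats(r, s))
  return known + estimated

# One partition with 50 known rows plus one small partition without stats
# gives 50 + 1 = 51, the shape of cardinality checked by the test below.
assert estimate_table_row_count([(50, 500), (-1, 80)], table_num_rows=-1) == 51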
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Functional tests running EXPLAIN statements.
#
import re

from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfLocal, SkipIfNotHdfsMinicluster, SkipIfEC
from tests.util.filesystem_utils import WAREHOUSE


# Tests the different explain levels [0-3] on a few queries.
# TODO: Clean up this test to use an explain level test dimension and appropriate
# result sub-sections for the expected explain plans.
@SkipIfEC.fix_later
class TestExplain(ImpalaTestSuite):
  # Value for the num_scanner_threads query option to ensure that the memory estimates of
  # scan nodes are consistent even when run on machines with different numbers of cores.
  NUM_SCANNER_THREADS = 1

  @classmethod
  def get_workload(self):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestExplain, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == 'text' and\
        v.get_value('table_format').compression_codec == 'none' and\
        v.get_value('exec_option')['batch_size'] == 0 and\
        v.get_value('exec_option')['disable_codegen'] == False and\
        v.get_value('exec_option')['num_nodes'] != 1)

  @SkipIfNotHdfsMinicluster.plans
  def test_explain_level0(self, vector):
    vector.get_value('exec_option')['num_scanner_threads'] = self.NUM_SCANNER_THREADS
    vector.get_value('exec_option')['explain_level'] = 0
    self.run_test_case('QueryTest/explain-level0', vector)

  @SkipIfNotHdfsMinicluster.plans
  def test_explain_level1(self, vector):
    vector.get_value('exec_option')['num_scanner_threads'] = self.NUM_SCANNER_THREADS
    vector.get_value('exec_option')['explain_level'] = 1
    self.run_test_case('QueryTest/explain-level1', vector)

  @SkipIfNotHdfsMinicluster.plans
  def test_explain_level2(self, vector):
    vector.get_value('exec_option')['num_scanner_threads'] = self.NUM_SCANNER_THREADS
    vector.get_value('exec_option')['explain_level'] = 2
    self.run_test_case('QueryTest/explain-level2', vector)

  @SkipIfNotHdfsMinicluster.plans
  def test_explain_level3(self, vector):
    vector.get_value('exec_option')['num_scanner_threads'] = self.NUM_SCANNER_THREADS
    vector.get_value('exec_option')['explain_level'] = 3
    self.run_test_case('QueryTest/explain-level3', vector)
  @staticmethod
  def check_row_size_and_cardinality(query_result, expected_row_size=None,
                                     expected_cardinality=None):
    regex = re.compile(r'tuple-ids=.+ row-size=(\d+)B cardinality=(.*)')
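    # Example of a plan line this matches at explain_level=3 (the values
    # shown here are illustrative):
    #   tuple-ids=0 row-size=4B cardinality=7.30K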
    found_match = False
    for res in query_result:
      m = regex.match(res.strip())
      if m:
        found_match = True
        assert len(m.groups()) == 2
        if expected_row_size:
          assert m.groups()[0] == expected_row_size
        if expected_cardinality:
          assert m.groups()[1] == expected_cardinality
    assert found_match, query_result
  def test_explain_validate_cardinality_estimates(self, vector, unique_database):
    # Tests that the cardinality estimates are correct for partitioned tables.
    # TODO Cardinality estimation tests should eventually be part of the planner tests.
    # TODO Remove this test
    db_name = 'functional'
    tbl_name = 'alltypes'

    def check_cardinality(query_result, expected_cardinality):
      self.check_row_size_and_cardinality(
          query_result, expected_cardinality=expected_cardinality)

    # All partitions are filtered out, cardinality should be 0.
    result = self.execute_query("explain select * from %s.%s where year = 1900" % (
        db_name, tbl_name), query_options={'explain_level':3})
    check_cardinality(result.data, '0')

    # Half of the partitions are filtered out, cardinality should be 3650.
    result = self.execute_query("explain select * from %s.%s where year = 2010" % (
        db_name, tbl_name), query_options={'explain_level':3})
    check_cardinality(result.data, '3.65K')

    # None of the partitions are filtered out, cardinality should be 7300.
    result = self.execute_query("explain select * from %s.%s" % (db_name, tbl_name),
        query_options={'explain_level':3})
    check_cardinality(result.data, '7.30K')

    # Create a partitioned table with a mixed set of available stats.
    mixed_tbl = unique_database + ".t"
    self.execute_query(
        "create table %s (c int) partitioned by (p int)" % mixed_tbl)
    self.execute_query(
        "insert into table %s partition (p) values(1,1),(2,2),(3,3)" % mixed_tbl)
    # Set the number of rows at the table level.
    self.execute_query(
        "alter table %s set tblproperties('numRows'='100')" % mixed_tbl)
    # Should fall back to table-level cardinality when partitions lack stats.
    result = self.execute_query("explain select * from %s" % mixed_tbl,
        query_options={'explain_level':3})
    check_cardinality(result.data, '100')
    # Should fall back to table-level cardinality, even for a subset of partitions.
    result = self.execute_query("explain select * from %s where p = 1" % mixed_tbl,
        query_options={'explain_level':3})
    check_cardinality(result.data, '100')
    # Set the number of rows at the table level to -1.
    self.execute_query(
        "alter table %s set tblproperties('numRows'='-1')" % mixed_tbl)
    # Set the number of rows for a single partition.
    self.execute_query(
        "alter table %s partition(p=1) set tblproperties('numRows'='50')" % mixed_tbl)
    # Use partition stats when available. Row counts for partitions without
    # stats are estimated.
    result = self.execute_query("explain select * from %s" % mixed_tbl,
        query_options={'explain_level':3})
    check_cardinality(result.data, '51')
    # Set the number of rows at the table level back to 100.
    self.execute_query(
        "alter table %s set tblproperties('numRows'='100')" % mixed_tbl)
    # Fall back to table-level stats when no selected partitions have stats.
    result = self.execute_query("explain select * from %s where p = 2" % mixed_tbl,
        query_options={'explain_level':3})
    check_cardinality(result.data, '100')
  def test_explain_row_size_estimates(self, vector, unique_database):
    """Tests that EXPLAIN returns the expected row sizes with and without stats.

    The planner tests are probably a more logical place for this, but covering string
    avg_size handling end-to-end seemed easier here.

    Note that row sizes do not include the null indicator bytes, so actual tuple sizes
    are a bit larger."""
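    # For reference: a STRING slot is 12 bytes (an 8-byte pointer plus a 4-byte
    # length), so with stats the row size for 's' is 12 + avg_size(4) = 16 bytes.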
    def check_row_size(query_result, expected_row_size):
      self.check_row_size_and_cardinality(
          query_result, expected_row_size=expected_row_size)

    def execute_explain(query):
      return self.execute_query("explain " + query, query_options={'explain_level': 3})

    FQ_TBL_NAME = unique_database + ".t"
    self.execute_query("create table %s (i int, s string)" % FQ_TBL_NAME)
    # Fill the table with data that leads to avg_size of 4 for 's'.
    self.execute_query("insert into %s values (1, '123'), (2, '12345')" % FQ_TBL_NAME)

    # Always use slot size for fixed sized types.
    result = execute_explain("select i from %s" % FQ_TBL_NAME)
    check_row_size(result.data, '4')

    # If there are no stats, use slot size for variable length types.
    result = execute_explain("select s from %s" % FQ_TBL_NAME)
    check_row_size(result.data, "12")

    self.execute_query("compute stats %s" % FQ_TBL_NAME)

    # Always use slot size for fixed sized types.
    result = execute_explain("select i from %s" % FQ_TBL_NAME)
    check_row_size(result.data, '4')

    # If there are stats, use slot size + avg_size for variable length types.
    result = execute_explain("select s from %s" % FQ_TBL_NAME)
    check_row_size(result.data, "16")


class TestExplainEmptyPartition(ImpalaTestSuite):
  TEST_DB_NAME = "imp_1708"

  def setup_method(self, method):
    self.cleanup_db(self.TEST_DB_NAME)
    self.execute_query("create database if not exists {0} location '{1}/{0}.db'"
                       .format(self.TEST_DB_NAME, WAREHOUSE))

  def teardown_method(self, method):
    self.cleanup_db(self.TEST_DB_NAME)

  @SkipIfLocal.hdfs_client
  def test_non_empty_partition_0_rows(self):
    """Regression test for IMPALA-1708: if a partition has 0 rows but > 0 files after
    COMPUTE STATS, don't warn the user about missing stats. The files are probably
    corrupted, or used for something else."""
    self.client.execute("SET EXPLAIN_LEVEL=3")
    self.client.execute("CREATE TABLE %s.empty_partition (col int) "
                        "partitioned by (p int)" % self.TEST_DB_NAME)
    self.client.execute(
        "ALTER TABLE %s.empty_partition ADD PARTITION (p=NULL)" % self.TEST_DB_NAME)
    # Put an empty file in the partition so we have > 0 files, but 0 rows.
    self.filesystem_client.create_file(
        "test-warehouse/%s.db/empty_partition/p=__HIVE_DEFAULT_PARTITION__/empty" %
        self.TEST_DB_NAME, "")
    self.client.execute("REFRESH %s.empty_partition" % self.TEST_DB_NAME)
    self.client.execute("COMPUTE STATS %s.empty_partition" % self.TEST_DB_NAME)
    assert "NULL\t0\t1" in str(
        self.client.execute("SHOW PARTITIONS %s.empty_partition" % self.TEST_DB_NAME))
    assert "missing relevant table and/or column statistics" not in str(
        self.client.execute(
            "EXPLAIN SELECT * FROM %s.empty_partition" % self.TEST_DB_NAME))

    # Now add a partition with some data (so it gets selected into the scan) to check
    # that its lack of stats is correctly identified.
    self.client.execute(
        "ALTER TABLE %s.empty_partition ADD PARTITION (p=1)" % self.TEST_DB_NAME)
    self.filesystem_client.create_file("test-warehouse/%s.db/empty_partition/p=1/rows" %
                                       self.TEST_DB_NAME, "1")
    self.client.execute("REFRESH %s.empty_partition" % self.TEST_DB_NAME)
    explain_result = str(
        self.client.execute("EXPLAIN SELECT * FROM %s.empty_partition" % self.TEST_DB_NAME))
    assert "missing relevant table and/or column statistics" in explain_result
    # Also test IMPALA-1530 - adding the number of partitions missing stats.
    assert "partitions: 1/2 " in explain_result