Files
impala/tests/query_test/test_tablesample.py
Todd Lipcon effe4e6668 IMPALA-7294. TABLESAMPLE should not allocate array based on total table file count
This changes HdfsTable.getFilesSample() to allocate its intermediate
sampling array based on the number of files in the selected
(post-pruning) partitions, rather than the total number of files in the
table. While the former behavior was correct (the total file count is of
course an upper bound on the pruned file count), it was an unnecessarily
large allocation, which has some downsides around garbage collection.

In addition, this is important for the LocalCatalog implementation of
table sampling, since we do not want to have to load all partition file
lists in order to compute a sample over a pruned subset of partitions.

The original code indicated that this was an optimization to avoid
looping over the partition list an extra time. However, typical
partition lists are relatively small even in the worst case (order of
100k) and looping over 100k in-memory Java objects is not likely to be
the bottleneck in planning any query. This is especially true
considering that we loop over that same list later in the function
anyway, so we probably aren't saving page faults or LLC cache misses
either.

In testing this change I noticed that the existing test for TABLESAMPLE
didn't test TABLESAMPLE when applied in conjunction with a predicate.
I added a new dimension to the test that employs a predicate that
prunes some partitions, to ensure that the code works in that case.
I also added coverage of the "100%" sampling parameter as a sanity check
that it returns the same results as a non-sampled query.

Change-Id: I0248d89bcd9dd4ff8b4b85fef282c19e3fe9bdd5
Reviewed-on: http://gerrit.cloudera.org:8080/10936
Reviewed-by: Philip Zeyliger <philip@cloudera.com>
Reviewed-by: Vuk Ercegovac <vercegovac@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-07-17 22:56:50 +00:00

94 lines
4.3 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Tests the TABLESAMPLE clause.
import pytest
import subprocess
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_vector import ImpalaTestDimension
class TestTableSample(ImpalaTestSuite):
  """Tests the TABLESAMPLE clause by comparing sampled row counts of 'alltypes'.

  The test matrix is extended with two boolean dimensions:
    repeatable: whether the query carries a 'repeatable(1)' clause.
    filtered:   whether the query carries a predicate on 'month'.
  """

  # Seconds to wait for a sampled query before declaring it hung (see IMPALA-6352).
  QUERY_TIMEOUT_S = 500

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite runs against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Adds the 'repeatable' and 'filtered' dimensions and restricts the file formats."""
    super(TestTableSample, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('repeatable', True, False))
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('filtered', True, False))
    # Tablesample is only supported on HDFS tables.
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format not in ('kudu', 'hbase'))
    if cls.exploration_strategy() != 'exhaustive':
      # Cut down on core testing time by limiting the file formats.
      cls.ImpalaTestMatrix.add_constraint(
          lambda v: v.get_value('table_format').file_format in ('parquet', 'text'))

  @staticmethod
  def _dump_impalad_threads(ordinal):
    """Returns gdb's backtrace of all threads of one impalad process.

    'ordinal' is the 1-based position of the process in the output of 'pgrep impalad'.
    If the command exits with a non-zero status, a description of that failure is
    returned instead of raising, so that collecting diagnostics can never replace the
    assertion failure the diagnostics are meant to accompany.
    """
    # shell=True is needed for the $(...) substitution. The only interpolated value is
    # an integer chosen by this class, so no external text reaches the shell.
    cmd = ("gdb -ex \"set pagination 0\" -ex \"thread apply all bt\" "
           "--batch -p $(pgrep impalad | sed -n %dp)" % ordinal)
    try:
      return subprocess.check_output(cmd, shell=True)
    except subprocess.CalledProcessError as e:
      return '<backtrace unavailable: gdb exited with status %s, output: %s>' % (
          e.returncode, e.output)

  @classmethod
  def _timeout_message(cls):
    """Builds the failure message for a timed-out query, with impalad backtraces."""
    return ('Query Timed out. Dumping backtrace of all threads in '
            'impalads:\nthreads in the impalad1: %s \nthreads in the '
            'impalad2: %s \nthreads in the impalad3: %s' %
            tuple(cls._dump_impalad_threads(ordinal) for ordinal in (1, 2, 3)))

  def test_tablesample(self, vector):
    """Checks sampled counts against the unsampled count for several percentages.

    For every percentage below 100 the sampled count must be smaller than the baseline
    count; at 100 it must equal the baseline. With a repeatable clause, the count must
    additionally grow strictly from one percentage to the next.
    """
    # Do not use a .test to avoid making this test flaky.
    # 1. Queries without the repeatable clause are non-deterministic.
    # 2. The results of queries without a repeatable clause could change due to
    # changes in data loading that affect the number or size of files.
    repeatable = vector.get_value('repeatable')
    filtered = vector.get_value('filtered')
    where_clause = "where month between 1 and 6" if filtered else ""
    # Does not depend on the percentage, so it is computed once outside the loop.
    rep_sql = " repeatable(1)" if repeatable else ""

    ImpalaTestSuite.change_database(self.client, vector.get_value('table_format'))
    result = self.client.execute("select count(*) from alltypes %s" % where_clause)
    baseline_count = int(result.data[0])

    prev_count = None
    for perc in [5, 20, 50, 100]:
      sql_stmt = "select count(*) from alltypes tablesample system(%s)%s %s" \
          % (perc, rep_sql, where_clause)
      handle = self.client.execute_async(sql_stmt)
      # IMPALA-6352: flaky test, possibly due to a hung thread. Wait for 500 sec before
      # failing and logging the backtraces of all impalads. The message expression is
      # only evaluated when the assertion fails, so gdb is not run on the success path.
      is_finished = self.client.wait_for_finished_timeout(handle, self.QUERY_TIMEOUT_S)
      assert is_finished, self._timeout_message()
      try:
        result = self.client.fetch(sql_stmt, handle)
      finally:
        # Release the query even if fetching the result fails.
        self.client.close_query(handle)

      count = int(result.data[0])
      if perc < 100:
        assert count < baseline_count
      else:
        assert count == baseline_count
      # Compare against None explicitly: a previous count of 0 is still a previous count.
      if prev_count is not None and repeatable:
        # May not necessarily be true for non-repeatable samples
        assert count > prev_count
      prev_count = count