Files
impala/tests/query_test/test_scanners_fuzz.py
Tim Armstrong c4d284f3cc IMPALA-5483: Automatically disable codegen for small queries
This is similar to the single-node execution optimisation, but applies
to slightly larger queries that should run in a distributed manner but
won't benefit from codegen.

This adds a new query option disable_codegen_rows_threshold that
defaults to 50,000. If fewer than this number of rows are processed
by a plan node per impalad, the cost of codegen almost certainly
outweighs the benefit.

Using rows processed as a threshold is justified by a simple
model that assumes the cost of codegen and execution per row for
the same operation are proportional. E.g. if x is the complexity
of the operation, n is the number of rows processed, C is a
constant factor giving the cost of codegen and Ec/Ei are constant
factor giving the cost of codegen'd and interpreted execution and
d, then the cost of the codegen'd operator is C * x + Ec * x * n
and the cost of the interpreted operator is Ei * x * n. Rearranging
means that interpretation is cheaper if n < C / (Ei - Ec), i.e. that
(at least with the simplified model) it makes sense to choose
interpretation or codegen based on a constant threshold. The
model also implies that it is somewhat safer to choose codegen
because the additional cost of codegen is O(1) but the additional
cost of interpretation is O(n).

I ran some experiments with TPC-H Q1, varying the input table size, to
determine what the cut-over point where codegen was beneficial was.
The cutover was around 150k rows per node for both text and parquet.
At 50k rows per node disabling codegen was very beneficial - around
0.12s versus 0.24s.  To be somewhat conservative I set the default
threshold to 50k rows. On more complex queries, e.g. TPC-H Q10, the
cutover tends to be higher because there are plan nodes that process
many fewer than the max rows.

Fix a couple of minor issues in the frontend - the numNodes_
calculation could return 0 for Kudu, and the single node optimization
didn't handle the case where for a scan node with conjuncts, a limit
and missing stats correctly (it considered the estimate still valid.)

Testing:
Updated e2e tests that set disable_codegen to set
disable_codegen_rows_threshold to 0, so that those tests run both
with and without codegen still.

Added an e2e test to make sure that the optimisation is applied in
the backend.

Added planner tests for various cases where codegen should and shouldn't
be disabled.

Perf:
Added a targeted perf test for a join+agg over a small input, which
benefits from this change.

Change-Id: I273bcee58641f5b97de52c0b2caab043c914b32e
Reviewed-on: http://gerrit.cloudera.org:8080/7153
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins
2017-06-29 21:14:59 +00:00

230 lines
9.7 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from copy import copy
import itertools
import os
import pytest
import random
import shutil
import tempfile
import time
from subprocess import check_call
from tests.common.test_dimensions import create_exec_option_dimension_from_dict
from tests.common.impala_test_suite import ImpalaTestSuite, LOG
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
# Random fuzz testing of HDFS scanners. Existing tables for any HDFS file format
# are corrupted in random ways to flush out bugs with handling of corrupted data.
class TestScannersFuzzing(ImpalaTestSuite):
# Use abort_on_error = False to ensure we scan all the files.
ABORT_ON_ERROR_VALUES = [False]
# Only run on all nodes - num_nodes=1 would not provide additional coverage.
NUM_NODES_VALUES = [0]
# Limit memory to avoid causing other concurrent tests to fail.
MEM_LIMITS = ['512m']
# Test the codegen and non-codegen paths.
DISABLE_CODEGEN_VALUES = [True, False]
# Test a range of batch sizes to exercise different corner cases.
BATCH_SIZES = [0, 1, 16, 10000]
@classmethod
def get_workload(cls):
return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestScannersFuzzing, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension_from_dict({
'abort_on_error' : cls.ABORT_ON_ERROR_VALUES,
'num_nodes' : cls.NUM_NODES_VALUES,
'mem_limit' : cls.MEM_LIMITS}))
# TODO: enable for more table formats once they consistently pass the fuzz test.
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ('avro', 'parquet') or
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec in ('none', 'lzo')))
def test_fuzz_alltypes(self, vector, unique_database):
self.run_fuzz_test(vector, unique_database, "alltypes")
def test_fuzz_decimal_tbl(self, vector, unique_database):
table_format = vector.get_value('table_format')
table_name = "decimal_tbl"
if table_format.file_format == 'avro':
table_name = "avro_decimal_tbl"
if table_format.compression_codec != 'snap' or \
table_format.compression_type != 'block':
pytest.skip()
elif table_format.file_format == 'text' and \
table_format.compression_codec != 'none':
# decimal_tbl is not present for these file formats
pytest.skip()
self.run_fuzz_test(vector, unique_database, table_name, 10)
def test_fuzz_nested_types(self, vector, unique_database):
table_format = vector.get_value('table_format')
if table_format.file_format != 'parquet': pytest.skip()
self.run_fuzz_test(vector, unique_database, "complextypestbl", 10)
# TODO: add test coverage for additional data types like char and varchar
def run_fuzz_test(self, vector, unique_database, table, num_copies=1):
""" Do some basic fuzz testing: create a copy of an existing table with randomly
corrupted files and make sure that we don't crash or behave in an unexpected way.
'unique_database' is used for the table, so it will be cleaned up automatically.
If 'num_copies' is set, create that many corrupted copies of each input file.
SCANNER_FUZZ_SEED can be set in the environment to reproduce the result (assuming that
input files are the same).
SCANNER_FUZZ_KEEP_FILES can be set in the environment to keep the generated files.
"""
# Create and seed a new random number generator for reproducibility.
rng = random.Random()
random_seed = os.environ.get("SCANNER_FUZZ_SEED") or time.time()
LOG.info("Using random seed %d", random_seed)
rng.seed(long(random_seed))
table_format = vector.get_value('table_format')
self.change_database(self.client, table_format)
tmp_table_dir = tempfile.mkdtemp(prefix="tmp-scanner-fuzz-%s" % table,
dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
self.execute_query("create table %s.%s like %s" % (unique_database, table, table))
fuzz_table_location = get_fs_path("/test-warehouse/{0}.db/{1}".format(
unique_database, table))
LOG.info("Generating corrupted version of %s in %s. Local working directory is %s",
table, unique_database, tmp_table_dir)
# Find the location of the existing table and get the full table directory structure.
table_loc = self._get_table_location(table, vector)
check_call(['hdfs', 'dfs', '-copyToLocal', table_loc + "/*", tmp_table_dir])
partitions = self.walk_and_corrupt_table_data(tmp_table_dir, num_copies, rng)
for partition in partitions:
self.execute_query('alter table {0}.{1} add partition ({2})'.format(
unique_database, table, ','.join(partition)))
# Copy all of the local files and directories to hdfs.
to_copy = ["%s/%s" % (tmp_table_dir, file_or_dir)
for file_or_dir in os.listdir(tmp_table_dir)]
check_call(['hdfs', 'dfs', '-copyFromLocal'] + to_copy + [fuzz_table_location])
if "SCANNER_FUZZ_KEEP_FILES" not in os.environ:
shutil.rmtree(tmp_table_dir)
# Querying the corrupted files should not DCHECK or crash.
self.execute_query("refresh %s.%s" % (unique_database, table))
# Execute a query that tries to read all the columns and rows in the file.
# Also execute a count(*) that materializes no columns, since different code
# paths are exercised.
queries = [
'select count(*) from (select distinct * from {0}.{1}) q'.format(
unique_database, table),
'select count(*) from {0}.{1} q'.format(unique_database, table)]
for query, batch_size, disable_codegen in \
itertools.product(queries, self.BATCH_SIZES, self.DISABLE_CODEGEN_VALUES):
query_options = copy(vector.get_value('exec_option'))
query_options['batch_size'] = batch_size
query_options['disable_codegen'] = disable_codegen
query_options['disable_codegen_rows_threshold'] = 0
try:
result = self.execute_query(query, query_options = query_options)
LOG.info('\n'.join(result.log))
except Exception as e:
if 'memory limit exceeded' in str(e).lower():
# Memory limit error should fail query.
continue
msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
LOG.error(msg)
# Parquet and compressed text can fail the query for some parse errors.
# E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
# (IMPALA-4013).
if table_format.file_format != 'parquet' \
and not (table_format.file_format == 'text' and
table_format.compression_codec != 'none'):
raise
def walk_and_corrupt_table_data(self, tmp_table_dir, num_copies, rng):
""" Walks a local copy of a HDFS table directory. Returns a list of partitions, each
as a list of "key=val" pairs. Ensures there is 'num_copies' copies of each file,
and corrupts each of the copies.
"""
partitions = []
# Iterate over the partitions and files we downloaded.
for subdir, dirs, files in os.walk(tmp_table_dir):
if '_impala_insert_staging' in subdir: continue
if len(dirs) != 0: continue # Skip non-leaf directories
rel_subdir = os.path.relpath(subdir, tmp_table_dir)
if rel_subdir != ".":
# Create metadata for any directory partitions.
partitions.append(self.partitions_from_path(rel_subdir))
# Corrupt all of the files that we find.
for filename in files:
filepath = os.path.join(subdir, filename)
copies = [filepath]
for copy_num in range(1, num_copies):
copypath = os.path.join(subdir, "copy{0}_{1}".format(copy_num, filename))
shutil.copyfile(filepath, copypath)
copies.append(copypath)
for filepath in copies:
self.corrupt_file(filepath, rng)
return partitions
def partitions_from_path(self, relpath):
""" Return a list of "key=val" parts from partitions inferred from the directory path.
"""
reversed_partitions = []
while relpath != '':
relpath, suffix = os.path.split(relpath)
reversed_partitions.append(suffix)
return reversed(reversed_partitions)
def corrupt_file(self, path, rng):
""" Corrupt the file at 'path' in the local file system in a randomised way using the
random number generator 'rng'. Rewrites the file in-place.
Logs a message to describe how the file was corrupted, so the error is reproducible.
"""
with open(path, "rb") as f:
data = bytearray(f.read())
if rng.random() < 0.5:
flip_offset = rng.randint(0, len(data) - 1)
flip_val = rng.randint(0, 255)
LOG.info("corrupt_file: Flip byte in %s at %d from %d to %d", path, flip_offset,
data[flip_offset], flip_val)
data[flip_offset] = flip_val
else:
truncation = rng.randint(0, len(data))
LOG.info("corrupt_file: Truncate %s to %d", path, truncation)
data = data[:truncation]
with open(path, "wb") as f:
f.write(data)