Files
impala/tests/query_test/test_scanners.py
Skye Wanderman-Milne 9b51b2b6e6 IMPALA-2835: introduce PARQUET_FALLBACK_SCHEMA_RESOLUTION query option
This patch introduces a new query option,
PARQUET_FALLBACK_SCHEMA_RESOLUTION, which allows the schemas of Parquet
files to be resolved either by name or by position. It's "fallback"
because field IDs will eventually become the primary schema resolution
scheme, and we don't want to create an option whose name we would have
to change later. The default is still resolution by position. I chose a
query option because it makes testing easier and also makes it easier
to diagnose resolution problems quickly in the field. Users who want
the default behavior to be resolution by name (like Hive) can set it
via the --default_query_options flag.
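
As a rough illustration (not part of this change; the value names
"position"/"name" are assumptions based on the description above), the
option could be toggled per session in impala-shell or made the
cluster-wide default via the flag mentioned above:

  -- per-session, in impala-shell (assumed value names)
  set parquet_fallback_schema_resolution=name;
  select * from some_parquet_table;

  # cluster-wide default via the impalad startup flag
  impalad --default_query_options=parquet_fallback_schema_resolution=name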

This patch also introduces a new test section, SHELL, which can be
used to execute shell commands in a .test file. This is useful for
copying files into test tables.
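
As a sketch of what such a section might look like (the ---- SHELL
marker and the paths/results below are illustrative assumptions modeled
on the existing ---- QUERY/---- RESULTS conventions, not taken verbatim
from this change):

  ====
  ---- SHELL
  hadoop fs -cp /test-warehouse/src_tbl/data.parq /test-warehouse/dst_tbl/
  ---- QUERY
  select count(*) from dst_tbl
  ---- RESULTS
  1
  ---- TYPES
  bigint
  ====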

Change-Id: Id0c715ea23792b2a6872610839a40532aabbb5a6
Reviewed-on: http://gerrit.cloudera.org:8080/2384
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
Tested-by: Internal Jenkins
2016-04-02 04:04:25 +00:00


# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This test suite validates the scanners by running queries against ALL file formats and
# their permutations (e.g. compression codec/compression type). This works by exhaustively
# generating the table format test vectors for this specific test suite. This way, other
# tests can run with the normal exploration strategy and the overall test runtime doesn't
# explode.

import logging
import os
import pytest
import random
import re
from copy import deepcopy
from subprocess import call, check_call

from testdata.common import widetable
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.util.test_file_parser import *
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
from tests.util.get_parquet_metadata import get_parquet_metadata
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.common.skip import SkipIfS3, SkipIfIsilon, SkipIfOldAggsJoins, SkipIfLocal

from parquet.ttypes import ConvertedType

class TestScannersAllTableFormats(ImpalaTestSuite):
  BATCH_SIZES = [0, 1, 16]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormats, cls).add_test_dimensions()
    if cls.exploration_strategy() == 'core':
      # The purpose of this test is to get some base coverage of all the file formats.
      # Even in 'core', we'll test each format by using the pairwise strategy.
      cls.TestMatrix.add_dimension(cls.create_table_info_dimension('pairwise'))
    cls.TestMatrix.add_dimension(
        TestDimension('batch_size', *TestScannersAllTableFormats.BATCH_SIZES))

  def test_scanners(self, vector):
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    self.run_test_case('QueryTest/scanners', new_vector)

# Test all the scanners with a simple limit clause. The limit clause triggers
# cancellation in the scanner code paths.
class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormatsWithLimit, cls).add_test_dimensions()

  def test_limit(self, vector):
    # Use a small batch size so changing the limit affects the timing of cancellation
    vector.get_value('exec_option')['batch_size'] = 100
    iterations = 50
    query_template = "select * from alltypes limit %s"
    for i in range(1, iterations):
      # Vary the limit to vary the timing of cancellation
      query = query_template % ((i * 100) % 1000 + 1)
      self.execute_query(query, vector.get_value('exec_option'),
          table_format=vector.get_value('table_format'))

# Test case to verify the scanners work properly when the table metadata (specifically
# the number of columns in the table) does not match the number of columns in the data
# file.
class TestUnmatchedSchema(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUnmatchedSchema, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    # Avro has a more advanced schema evolution process which is covered in more depth
    # in the test_avro_schema_evolution test suite.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'avro')

  def _get_table_location(self, table_name, vector):
    result = self.execute_query_using_client(self.client,
        "describe formatted %s" % table_name, vector)
    for row in result.data:
      if 'Location:' in row:
        return row.split('\t')[1]
    # This should never happen.
    assert 0, 'Unable to get location for table: ' + table_name

  def _create_test_table(self, vector):
    """
    Creates the test table
    Cannot be done in a setup method because we need access to the current test vector
    """
    self._drop_test_table(vector)
    self.execute_query_using_client(self.client,
        "create external table jointbl_test like jointbl", vector)
    # Update the location of the new table to point to the same location as the old table
    location = self._get_table_location('jointbl', vector)
    self.execute_query_using_client(self.client,
        "alter table jointbl_test set location '%s'" % location, vector)

  def _drop_test_table(self, vector):
    self.execute_query_using_client(self.client,
        "drop table if exists jointbl_test", vector)

  def test_unmatched_schema(self, vector):
    table_format = vector.get_value('table_format')
    # jointbl has no columns with unique values. When loaded in hbase, the table looks
    # different, as hbase collapses duplicates.
    if table_format.file_format == 'hbase':
      pytest.skip()
    self._create_test_table(vector)
    self.run_test_case('QueryTest/test-unmatched-schema', vector)
    self._drop_test_table(vector)

# Tests that scanners can read a single-column, single-row, 10MB table
class TestWideRow(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideRow, cls).add_test_dimensions()
    # I can't figure out how to load a huge row into hbase
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'hbase')

  def test_wide_row(self, vector):
    new_vector = deepcopy(vector)
    # Use a 5MB scan range, so we will have to perform 5MB of sync reads
    new_vector.get_value('exec_option')['max_scan_range_length'] = 5 * 1024 * 1024
    # We need > 10 MB of memory because we're creating extra buffers:
    # - 10 MB table / 5 MB scan range = 2 scan ranges, each of which may allocate ~20MB
    # - Sync reads will allocate ~5MB of space
    # The 100MB value used here was determined empirically by raising the limit until the
    # query succeeded for all file formats -- I don't know exactly why we need this much.
    # TODO: figure out exact breakdown of memory usage (IMPALA-681)
    new_vector.get_value('exec_option')['mem_limit'] = 100 * 1024 * 1024
    self.run_test_case('QueryTest/wide-row', new_vector)

class TestWideTable(ImpalaTestSuite):
  # TODO: expand this to more rows when we have the capability
  NUM_COLS = [250, 500, 1000]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideTable, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(TestDimension("num_cols", *cls.NUM_COLS))
    # To cut down on test execution time, only run in exhaustive.
    if cls.exploration_strategy() != 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v: False)

  def test_wide_table(self, vector):
    NUM_COLS = vector.get_value('num_cols')
    # Due to the way HBase handles duplicate row keys, we have a different number of
    # rows in HBase tables compared to HDFS tables.
    NUM_ROWS = 10 if vector.get_value('table_format').file_format != 'hbase' else 2
    DB_NAME = QueryTestSectionReader.get_db_name(vector.get_value('table_format'))
    TABLE_NAME = "%s.widetable_%s_cols" % (DB_NAME, NUM_COLS)

    result = self.client.execute("select count(*) from %s " % TABLE_NAME)
    assert result.data == [str(NUM_ROWS)]

    expected_result = widetable.get_data(NUM_COLS, NUM_ROWS, quote_strings=True)
    result = self.client.execute("select * from %s" % TABLE_NAME)

    if vector.get_value('table_format').file_format == 'hbase':
      assert len(result.data) == NUM_ROWS
      return

    types = parse_column_types(result.schema)
    labels = parse_column_labels(result.schema)
    expected = QueryTestResult(expected_result, types, labels, order_matters=False)
    actual = QueryTestResult(parse_result_rows(result), types, labels,
        order_matters=False)
    assert expected == actual

class TestParquet(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestParquet, cls).add_test_dimensions()
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'parquet')

  def test_parquet(self, vector):
    self.run_test_case('QueryTest/parquet', vector)

  @SkipIfOldAggsJoins.nested_types
  def test_continue_on_error(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/parquet-continue-on-error', vector)

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock has 3 blocks, so
    # each impalad should read 1 scan range.
    # This needs to execute serially because if more than one query is being scheduled
    # at a time, the simple scheduler round robins colocated impalads across all running
    # queries. See IMPALA-2479 for more details.
    table_name = 'functional_parquet.lineitem_multiblock'
    self._multiple_blocks_helper(table_name, 20000, ranges_per_node=1)
    table_name = 'functional_parquet.lineitem_sixblocks'
    # 2 scan ranges per node should be created to read 'lineitem_sixblocks' because
    # there are 6 blocks and 3 scan nodes.
    self._multiple_blocks_helper(table_name, 40000, ranges_per_node=2)

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks_one_row_group(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock_one_row_group
    # has 3 blocks but only one row group across these blocks. We test to see that only
    # one scan range reads everything from this row group.
    table_name = 'functional_parquet.lineitem_multiblock_one_row_group'
    self._multiple_blocks_helper(
        table_name, 40000, one_row_group=True, ranges_per_node=1)

  def _multiple_blocks_helper(
      self, table_name, rows_in_table, one_row_group=False, ranges_per_node=1):
    """ This function executes a simple SELECT query on a multiblock parquet table and
    verifies the number of ranges issued per node and verifies that at least one row
    group was read. If 'one_row_group' is True, then one scan range is expected to read
    the data from the entire table regardless of the number of blocks. 'ranges_per_node'
    indicates how many scan ranges we expect to be issued per node. """
    query = 'select count(l_orderkey) from %s' % table_name
    result = self.client.execute(query)
    assert len(result.data) == 1
    assert result.data[0] == str(rows_in_table)

    runtime_profile = str(result.runtime_profile)
    num_row_groups_list = re.findall('NumRowGroups: ([0-9]*)', runtime_profile)
    scan_ranges_complete_list = re.findall(
        'ScanRangesComplete: ([0-9]*)', runtime_profile)
    num_rows_read_list = re.findall('RowsRead: [0-9.K]* \(([0-9]*)\)', runtime_profile)

    # This will fail if the number of impalads != 3
    # The fourth fragment is the "Averaged Fragment"
    assert len(num_row_groups_list) == 4
    assert len(scan_ranges_complete_list) == 4
    assert len(num_rows_read_list) == 4

    total_num_row_groups = 0
    # Skip the Averaged Fragment; it comes first in the runtime profile.
    for num_row_groups in num_row_groups_list[1:]:
      total_num_row_groups += int(num_row_groups)
      if not one_row_group: assert int(num_row_groups) > 0

    if one_row_group:
      # If it's the one row group test, only one scan range should read all the data
      # from that row group.
      assert total_num_row_groups == 1

    for rows_read in num_rows_read_list[1:]:
      if rows_read != '0': assert rows_read == str(rows_in_table)

    for scan_ranges_complete in scan_ranges_complete_list:
      assert int(scan_ranges_complete) == ranges_per_node

  @SkipIfS3.insert
  def test_annotate_utf8_option(self, vector, unique_database):
    if self.exploration_strategy() != 'exhaustive': pytest.skip("Only run in exhaustive")

    # Create table
    TABLE_NAME = "parquet_annotate_utf8_test"
    qualified_table_name = "%s.%s" % (unique_database, TABLE_NAME)
    query = 'create table %s (a string, b char(10), c varchar(10), d string) ' \
        'stored as parquet' % qualified_table_name
    self.client.execute(query)

    # Insert data that should have UTF8 annotation
    query = 'insert overwrite table %s '\
        'values("a", cast("b" as char(10)), cast("c" as varchar(10)), "d")' \
        % qualified_table_name
    self.execute_query(query, {'parquet_annotate_strings_utf8': True})

    def get_schema_elements():
      # Copy the created file to the local filesystem and parse metadata
      local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
      LOG.info("test_annotate_utf8_option local file name: " + local_file)
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, TABLE_NAME))
      check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
      metadata = get_parquet_metadata(local_file)

      # Extract SchemaElements corresponding to the table columns
      a_schema_element = metadata.schema[1]
      assert a_schema_element.name == 'a'
      b_schema_element = metadata.schema[2]
      assert b_schema_element.name == 'b'
      c_schema_element = metadata.schema[3]
      assert c_schema_element.name == 'c'
      d_schema_element = metadata.schema[4]
      assert d_schema_element.name == 'd'

      os.remove(local_file)
      return a_schema_element, b_schema_element, c_schema_element, d_schema_element

    # Check that the schema uses the UTF8 annotation
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type == ConvertedType.UTF8
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type == ConvertedType.UTF8

    # Re-insert the data with the option disabled; plain strings should not get the
    # UTF8 annotation
    self.execute_query(query, {'parquet_annotate_strings_utf8': False})

    # Check that the schema does not use the UTF8 annotation except for CHAR and VARCHAR
    # columns
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type == None
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type == None

  @SkipIfS3.insert
  def test_resolution_by_name(self, unique_database, vector):
    self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
        use_db=unique_database)

# We use various scan range lengths to exercise corner cases in the HDFS scanner more
# thoroughly. In particular, it will exercise:
# 1. default scan range
# 2. scan range with no tuple
# 3. tuple that spans across multiple scan ranges
# 4. scan range length = 16 for the ParseSse() execution path
MAX_SCAN_RANGE_LENGTHS = [0, 1, 2, 5, 16, 17, 32]

class TestScanRangeLengths(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))

  def test_scan_ranges(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.run_test_case('QueryTest/hdfs-tiny-scan', vector)

# More tests for text scanner
# 1. Test file that ends w/o tuple delimiter
# 2. Test file with escape character
class TestTextScanRangeLengths(ImpalaTestSuite):
  ESCAPE_TABLE_LIST = ["testescape_16_lf", "testescape_16_crlf",
      "testescape_17_lf", "testescape_17_crlf",
      "testescape_32_lf", "testescape_32_crlf"]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTextScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def test_text_scanner(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.execute_query_expect_success(self.client, "drop stats "
        "functional.table_no_newline_part")
    self.execute_query_expect_success(self.client, "compute stats "
        "functional.table_no_newline_part")
    self.run_test_case('QueryTest/hdfs-text-scan', vector)

    # Test various escape char cases. We have to check the count(*) result against
    # the count(col) result because if the scan range is split right after the escape
    # char, the escape char has no effect because we cannot scan backwards to the
    # previous scan range.
    for t in self.ESCAPE_TABLE_LIST:
      expected_result = self.client.execute("select count(col) from " + t)
      result = self.client.execute("select count(*) from " + t)
      assert result.data == expected_result.data

# Missing Coverage: No coverage for truncated file errors or scans.
@SkipIfS3.hive
@SkipIfIsilon.hive
@SkipIfLocal.hive
@pytest.mark.execute_serially
class TestScanTruncatedFiles(ImpalaTestSuite):
  TEST_DB = 'test_truncated_file'

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanTruncatedFiles, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())

    # This test takes about a minute to complete due to the Hive commands that are
    # executed. To cut down on runtime, limit the test to exhaustive exploration
    # strategy.
    # TODO: Test other file formats
    if cls.exploration_strategy() == 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v:
          v.get_value('table_format').file_format == 'text' and
          v.get_value('table_format').compression_codec == 'none')
    else:
      cls.TestMatrix.add_constraint(lambda v: False)

  def setup_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)
    self.client.execute("create database %s location '%s/%s.db'" %
        (TestScanTruncatedFiles.TEST_DB, WAREHOUSE,
         TestScanTruncatedFiles.TEST_DB))

  def teardown_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)

  def test_scan_truncated_file_empty(self, vector):
    self.scan_truncated_file(0)

  def test_scan_truncated_file(self, vector):
    self.scan_truncated_file(10)

  def scan_truncated_file(self, num_rows):
    db_name = TestScanTruncatedFiles.TEST_DB
    tbl_name = "tbl"
    self.execute_query("use %s" % db_name)
    self.execute_query("create table %s (s string)" % tbl_name)
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes" % (db_name, tbl_name)])

    # Update the Impala metadata
    self.execute_query("refresh %s" % tbl_name)

    # Insert overwrite with a truncated file
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes limit %s" % (db_name, tbl_name, num_rows)])

    result = self.execute_query("select count(*) from %s" % tbl_name)
    assert(len(result.data) == 1)
    assert(result.data[0] == str(num_rows))