Adds handling and testing for a specific Parquet data corruption scenario with
plain dictionary encoded values. The problematic scenario is when the repeat or
literal count of the RLE-encoded dictionary indexes is decoded as 0, an invalid
value. There are several other cases of data corruption that are not yet handled
gracefully; this patch only handles this one specific case.

Change-Id: Ibf406c82cdded37966f09c81e4cc1446d2b60d63
Reviewed-on: http://gerrit.cloudera.org:8080/3299
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Alex Behm <alex.behm@cloudera.com>
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This test suite validates the scanners by running queries against ALL file formats and
# their permutations (e.g. compression codec/compression type). This works by exhaustively
# generating the table format test vectors for this specific test suite. This way, other
# tests can run with the normal exploration strategy and the overall test runtime doesn't
# explode.

import logging
import os
import pytest
import random
import re
import tempfile
from copy import deepcopy
from subprocess import call, check_call

from testdata.common import widetable
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.util.test_file_parser import *
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
from tests.util.get_parquet_metadata import get_parquet_metadata
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.common.skip import SkipIfS3, SkipIfIsilon, SkipIfOldAggsJoins, SkipIfLocal

from parquet.ttypes import ConvertedType
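
# For orientation only (not part of the test framework): the header comment above is
# about combinatorics. An exhaustive strategy takes the full cross product of the test
# dimensions, while pairwise only covers each pair of dimension values. A minimal
# sketch with made-up dimension values:
def _exhaustive_vector_count_sketch():
  """Illustrative only: the number of exhaustive vectors is the product of all
  dimension sizes, which is why exhaustive generation is confined to this suite."""
  import itertools
  file_formats = ['text', 'seq', 'rc', 'avro', 'parquet', 'hbase']  # hypothetical
  codecs = ['none', 'gzip', 'snappy']                               # hypothetical
  batch_sizes = [0, 1, 16]
  # 6 * 3 * 3 = 54 exhaustive vectors; a pairwise covering set is much smaller.
  return len(list(itertools.product(file_formats, codecs, batch_sizes)))
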
class TestScannersAllTableFormats(ImpalaTestSuite):
  BATCH_SIZES = [0, 1, 16]
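  # Note: a batch_size of 0 makes the scanner fall back to its default batch size;
  # 1 and 16 exercise very small batches.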

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormats, cls).add_test_dimensions()
    if cls.exploration_strategy() == 'core':
      # The purpose of this test is to get some base coverage of all the file formats.
      # Even in 'core', we'll test each format by using the pairwise strategy.
      cls.TestMatrix.add_dimension(cls.create_table_info_dimension('pairwise'))
    cls.TestMatrix.add_dimension(
        TestDimension('batch_size', *TestScannersAllTableFormats.BATCH_SIZES))

  def test_scanners(self, vector):
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size')
    self.run_test_case('QueryTest/scanners', new_vector)

# Test all the scanners with a simple limit clause. The limit clause triggers
# cancellation in the scanner code paths.
class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersAllTableFormatsWithLimit, cls).add_test_dimensions()

  def test_limit(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = 1
    self._test_limit(vector)
    # IMPALA-3337: when continuing on error, the error log should not show errors
    # (e.g. "Cancelled").
    vector.get_value('exec_option')['abort_on_error'] = 0
    self._test_limit(vector)

  def _test_limit(self, vector):
    # Use a small batch size so changing the limit affects the timing of cancellation.
    vector.get_value('exec_option')['batch_size'] = 100
    iterations = 50
    query_template = "select * from alltypes limit %s"
    for i in range(1, iterations):
      # Vary the limit to vary the timing of cancellation.
      limit = (i * 100) % 1000 + 1
      query = query_template % limit
      result = self.execute_query(query, vector.get_value('exec_option'),
          table_format=vector.get_value('table_format'))
      assert len(result.data) == limit
      # IMPALA-3337: The error log should be empty.
      assert not result.log
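
  # With (i * 100) % 1000 + 1 the limit cycles through 101, 201, ..., 901, 1, 101, ...,
  # so cancellation fires at varying points in the scan.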

# Test case to verify the scanners work properly when the table metadata (specifically
# the number of columns in the table) does not match the number of columns in the data
# file.
class TestUnmatchedSchema(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUnmatchedSchema, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    # Avro has a more advanced schema evolution process which is covered in more depth
    # in the test_avro_schema_evolution test suite.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'avro')

  def _get_table_location(self, table_name, vector):
    result = self.execute_query_using_client(self.client,
        "describe formatted %s" % table_name, vector)
    for row in result.data:
      if 'Location:' in row:
        return row.split('\t')[1]
    # This should never happen.
    assert 0, 'Unable to get location for table: ' + table_name

  def _create_test_table(self, vector):
    """
    Creates the test table.

    Cannot be done in a setup method because we need access to the current test vector.
    """
    self._drop_test_table(vector)
    self.execute_query_using_client(self.client,
        "create external table jointbl_test like jointbl", vector)

    # Update the location of the new table to point to the same location as the old
    # table.
    location = self._get_table_location('jointbl', vector)
    self.execute_query_using_client(self.client,
        "alter table jointbl_test set location '%s'" % location, vector)

  def _drop_test_table(self, vector):
    self.execute_query_using_client(self.client,
        "drop table if exists jointbl_test", vector)

  def test_unmatched_schema(self, vector):
    table_format = vector.get_value('table_format')
    # jointbl has no columns with unique values. When loaded in hbase, the table looks
    # different, as hbase collapses duplicates.
    if table_format.file_format == 'hbase':
      pytest.skip()
    self._create_test_table(vector)
    self.run_test_case('QueryTest/test-unmatched-schema', vector)
    self._drop_test_table(vector)


# Tests that scanners can read a single-column, single-row, 10MB table.
class TestWideRow(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideRow, cls).add_test_dimensions()
    # I can't figure out how to load a huge row into hbase.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'hbase')

  def test_wide_row(self, vector):
    new_vector = deepcopy(vector)
    # Use a 5MB scan range, so we will have to perform 5MB of sync reads.
    new_vector.get_value('exec_option')['max_scan_range_length'] = 5 * 1024 * 1024
    # We need > 10 MB of memory because we're creating extra buffers:
    # - 10 MB table / 5 MB scan range = 2 scan ranges, each of which may allocate ~20MB
    # - Sync reads will allocate ~5MB of space
    # The 100MB value used here was determined empirically by raising the limit until the
    # query succeeded for all file formats -- I don't know exactly why we need this much.
    # TODO: figure out exact breakdown of memory usage (IMPALA-681)
    new_vector.get_value('exec_option')['mem_limit'] = 100 * 1024 * 1024
    self.run_test_case('QueryTest/wide-row', new_vector)


class TestWideTable(ImpalaTestSuite):
  # TODO: expand this to more rows when we have the capability
  NUM_COLS = [250, 500, 1000]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestWideTable, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(TestDimension("num_cols", *cls.NUM_COLS))
    # To cut down on test execution time, only run in exhaustive.
    if cls.exploration_strategy() != 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v: False)

  def test_wide_table(self, vector):
    NUM_COLS = vector.get_value('num_cols')
    # Due to the way HBase handles duplicate row keys, we have a different number of
    # rows in HBase tables compared to HDFS tables.
    NUM_ROWS = 10 if vector.get_value('table_format').file_format != 'hbase' else 2
    DB_NAME = QueryTestSectionReader.get_db_name(vector.get_value('table_format'))
    TABLE_NAME = "%s.widetable_%s_cols" % (DB_NAME, NUM_COLS)

    result = self.client.execute("select count(*) from %s " % TABLE_NAME)
    assert result.data == [str(NUM_ROWS)]

    expected_result = widetable.get_data(NUM_COLS, NUM_ROWS, quote_strings=True)
    result = self.client.execute("select * from %s" % TABLE_NAME)

    if vector.get_value('table_format').file_format == 'hbase':
      assert len(result.data) == NUM_ROWS
      return

    types = parse_column_types(result.schema)
    labels = parse_column_labels(result.schema)
    expected = QueryTestResult(expected_result, types, labels, order_matters=False)
    actual = QueryTestResult(parse_result_rows(result), types, labels,
        order_matters=False)
    assert expected == actual


class TestParquet(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestParquet, cls).add_test_dimensions()
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'parquet')

  def test_parquet(self, vector):
    self.run_test_case('QueryTest/parquet', vector)

  @SkipIfOldAggsJoins.nested_types
  def test_file_metadata_discrepancy(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/parquet-continue-on-error', vector)
    vector.get_value('exec_option')['abort_on_error'] = 1
    self.run_test_case('QueryTest/parquet-abort-on-error', vector)

  def test_corrupt_rle_counts(self, vector, unique_database):
    """IMPALA-3646: Tests that a certain type of file corruption for plain
    dictionary encoded values is gracefully handled. Cases tested:
    - incorrect literal count of 0 for the RLE encoded dictionary indexes
    - incorrect repeat count of 0 for the RLE encoded dictionary indexes
    """
    # Create test table and copy the corrupt files into it.
    self.client.execute(
        "create table %s.bad_rle_counts (c bigint) stored as parquet" % unique_database)
    bad_rle_counts_tbl_loc =\
        get_fs_path("/test-warehouse/%s.db/%s" % (unique_database, "bad_rle_counts"))
    check_call(['hdfs', 'dfs', '-copyFromLocal',
        os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_literal_count.parquet",
        bad_rle_counts_tbl_loc])
    check_call(['hdfs', 'dfs', '-copyFromLocal',
        os.environ['IMPALA_HOME'] + "/testdata/data/bad_rle_repeat_count.parquet",
        bad_rle_counts_tbl_loc])
    # Querying the corrupted files should not DCHECK or crash.
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/parquet-corrupt-rle-counts', vector, unique_database)
    vector.get_value('exec_option')['abort_on_error'] = 1
    self.run_test_case('QueryTest/parquet-corrupt-rle-counts-abort',
        vector, unique_database)
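
  # For orientation: Parquet's RLE/bit-packed hybrid encoding prefixes each run with a
  # varint header. An even header denotes an RLE run of (header >> 1) repeated values;
  # an odd header denotes a bit-packed run of (header >> 1) * 8 literal values. A
  # decoded count of 0 is invalid, which is exactly the corruption in the files above.
  # A minimal, illustrative sketch of the header check (not Impala's implementation):
  @staticmethod
  def _rle_run_count_sketch(header):
    is_literal = header & 1
    count = (header >> 1) * 8 if is_literal else (header >> 1)
    if count == 0:
      raise ValueError("corrupt RLE run: repeat/literal count of 0")
    return ('literal' if is_literal else 'repeat'), count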

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock has 3 blocks, so
    # each impalad should read 1 scan range.
    # It needs to execute serially because if more than one query is being scheduled at
    # a time, the simple scheduler round robins colocated impalads across all running
    # queries. See IMPALA-2479 for more details.
    table_name = 'functional_parquet.lineitem_multiblock'
    self._multiple_blocks_helper(table_name, 20000, ranges_per_node=1)
    table_name = 'functional_parquet.lineitem_sixblocks'
    # 2 scan ranges per node should be created to read 'lineitem_sixblocks' because
    # there are 6 blocks and 3 scan nodes.
    self._multiple_blocks_helper(table_name, 40000, ranges_per_node=2)

  @SkipIfS3.hdfs_block_size
  @SkipIfIsilon.hdfs_block_size
  @SkipIfLocal.multiple_impalad
  @pytest.mark.execute_serially
  def test_multiple_blocks_one_row_group(self, vector):
    # For IMPALA-1881. The table functional_parquet.lineitem_multiblock_one_row_group
    # has 3 blocks but only one row group across these blocks. We test to see that only
    # one scan range reads everything from this row group.
    table_name = 'functional_parquet.lineitem_multiblock_one_row_group'
    self._multiple_blocks_helper(
        table_name, 40000, one_row_group=True, ranges_per_node=1)

  def _multiple_blocks_helper(
      self, table_name, rows_in_table, one_row_group=False, ranges_per_node=1):
    """Executes a simple SELECT query on a multiblock parquet table, verifies the
    number of scan ranges issued per node, and verifies that at least one row group was
    read. If 'one_row_group' is True, then one scan range is expected to read the data
    from the entire table regardless of the number of blocks. 'ranges_per_node'
    indicates how many scan ranges we expect to be issued per node."""

    query = 'select count(l_orderkey) from %s' % table_name
    result = self.client.execute(query)
    assert len(result.data) == 1
    assert result.data[0] == str(rows_in_table)

    runtime_profile = str(result.runtime_profile)
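    # The regexes below pick counters out of the profile text; the matching lines look
    # roughly like this (values illustrative):
    #   NumRowGroups: 1
    #   ScanRangesComplete: 2
    #   RowsRead: 13.26K (13260)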
    num_row_groups_list = re.findall('NumRowGroups: ([0-9]*)', runtime_profile)
    scan_ranges_complete_list = re.findall(
        'ScanRangesComplete: ([0-9]*)', runtime_profile)
    num_rows_read_list = re.findall('RowsRead: [0-9.K]* \(([0-9]*)\)', runtime_profile)

    # This will fail if the number of impalads != 3.
    # The fourth fragment is the "Averaged Fragment".
    assert len(num_row_groups_list) == 4
    assert len(scan_ranges_complete_list) == 4
    assert len(num_rows_read_list) == 4

    total_num_row_groups = 0
    # Skip the Averaged Fragment; it comes first in the runtime profile.
    for num_row_groups in num_row_groups_list[1:]:
      total_num_row_groups += int(num_row_groups)
      if not one_row_group: assert int(num_row_groups) > 0

    if one_row_group:
      # If it's the one row group test, only one scan range should read all the data
      # from that row group.
      assert total_num_row_groups == 1
      for rows_read in num_rows_read_list[1:]:
        if rows_read != '0': assert rows_read == str(rows_in_table)

    for scan_ranges_complete in scan_ranges_complete_list:
      assert int(scan_ranges_complete) == ranges_per_node

  def test_annotate_utf8_option(self, vector, unique_database):
    if self.exploration_strategy() != 'exhaustive': pytest.skip("Only run in exhaustive")

    # Create table.
    TABLE_NAME = "parquet_annotate_utf8_test"
    qualified_table_name = "%s.%s" % (unique_database, TABLE_NAME)
    query = 'create table %s (a string, b char(10), c varchar(10), d string) ' \
        'stored as parquet' % qualified_table_name
    self.client.execute(query)

    # Insert data that should have UTF8 annotation.
    query = 'insert overwrite table %s '\
        'values("a", cast("b" as char(10)), cast("c" as varchar(10)), "d")' \
        % qualified_table_name
    self.execute_query(query, {'parquet_annotate_strings_utf8': True})

    def get_schema_elements():
      # Copy the created file to the local filesystem and parse metadata.
      local_file = '/tmp/utf8_test_%s.parq' % random.randint(0, 10000)
      LOG.info("test_annotate_utf8_option local file name: " + local_file)
      hdfs_file = get_fs_path('/test-warehouse/%s.db/%s/*.parq'
          % (unique_database, TABLE_NAME))
      check_call(['hadoop', 'fs', '-copyToLocal', hdfs_file, local_file])
      metadata = get_parquet_metadata(local_file)

      # Extract SchemaElements corresponding to the table columns.
      a_schema_element = metadata.schema[1]
      assert a_schema_element.name == 'a'
      b_schema_element = metadata.schema[2]
      assert b_schema_element.name == 'b'
      c_schema_element = metadata.schema[3]
      assert c_schema_element.name == 'c'
      d_schema_element = metadata.schema[4]
      assert d_schema_element.name == 'd'

      os.remove(local_file)
      return a_schema_element, b_schema_element, c_schema_element, d_schema_element

    # Check that the schema uses the UTF8 annotation.
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type == ConvertedType.UTF8
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type == ConvertedType.UTF8

    # Re-run the insert; this data should not have UTF8 annotation for strings.
    self.execute_query(query, {'parquet_annotate_strings_utf8': False})

    # Check that the schema does not use the UTF8 annotation except for CHAR and
    # VARCHAR columns.
    a_schema_elt, b_schema_elt, c_schema_elt, d_schema_elt = get_schema_elements()
    assert a_schema_elt.converted_type is None
    assert b_schema_elt.converted_type == ConvertedType.UTF8
    assert c_schema_elt.converted_type == ConvertedType.UTF8
    assert d_schema_elt.converted_type is None

  @SkipIfOldAggsJoins.nested_types
  def test_resolution_by_name(self, unique_database, vector):
    self.run_test_case('QueryTest/parquet-resolution-by-name', vector,
        use_db=unique_database)


# We use various scan range lengths to exercise corner cases in the HDFS scanner more
# thoroughly. In particular, it will exercise:
# 1. default scan range
# 2. scan range with no tuple
# 3. tuples that span across multiple scan ranges
# 4. scan range length = 16 for the ParseSse() execution path
MAX_SCAN_RANGE_LENGTHS = [0, 1, 2, 5, 16, 17, 32]
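# (A max_scan_range_length of 0 leaves the length unrestricted, i.e. case 1 above.)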

class TestScanRangeLengths(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))

  def test_scan_ranges(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.run_test_case('QueryTest/hdfs-tiny-scan', vector)


# More tests for the text scanner:
# 1. Test file that ends without a tuple delimiter
# 2. Test file with escape characters
class TestTextScanRangeLengths(ImpalaTestSuite):
  ESCAPE_TABLE_LIST = ["testescape_16_lf", "testescape_16_crlf",
      "testescape_17_lf", "testescape_17_crlf",
      "testescape_32_lf", "testescape_32_crlf"]

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTextScanRangeLengths, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(
        TestDimension('max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS))
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def test_text_scanner(self, vector):
    vector.get_value('exec_option')['max_scan_range_length'] =\
        vector.get_value('max_scan_range_length')
    self.execute_query_expect_success(self.client, "drop stats "
        "functional.table_no_newline_part")
    self.execute_query_expect_success(self.client, "compute stats "
        "functional.table_no_newline_part")
    self.run_test_case('QueryTest/hdfs-text-scan', vector)

    # Test various escape char cases. We have to check the count(*) result against
    # the count(col) result because if the scan range is split right after the escape
    # char, the escape char has no effect: we cannot scan backwards into the previous
    # scan range.
    for t in self.ESCAPE_TABLE_LIST:
      expected_result = self.client.execute("select count(col) from " + t)
      result = self.client.execute("select count(*) from " + t)
      assert result.data == expected_result.data


# Tests behavior of split "\r\n" delimiters.
class TestTextSplitDelimiters(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTextSplitDelimiters, cls).add_test_dimensions()
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def test_text_split_delimiters(self, vector, unique_database):
    """Creates and queries a datafile that exercises interesting edge cases around split
    "\r\n" delimiters. The data file contains the following 4-byte scan ranges:

      abc\r   First scan range, ends with split \r\n
              - materializes (abc)
      \nde\r  Initial delimiter found, scan range ends with split \r\n
              - materializes (de)
      \nfg\r  Initial delimiter found, scan range ends with \r
              - materializes (fg),(hij)
      hij\r   Initial delimiter is \r at end
              - materializes (klm)
      klm\r   Initial delimiter is split \r\n
              - materializes nothing
      \nno\r  Final scan range, initial delimiter found, ends with \r
              - materializes (no)
    """
    DATA = "abc\r\nde\r\nfg\rhij\rklm\r\nno\r"
    max_scan_range_length = 4
    expected_result = ['abc', 'de', 'fg', 'hij', 'klm', 'no']

    self._create_and_query_test_table(
        vector, unique_database, DATA, max_scan_range_length, expected_result)
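
  # A minimal sketch (plain Python, independent of the scanner) of how DATA above
  # divides into 4-byte scan ranges, handy when checking the docstring by eye:
  # _four_byte_ranges() == ['abc\r', '\nde\r', '\nfg\r', 'hij\r', 'klm\r', '\nno\r']
  @staticmethod
  def _four_byte_ranges(data="abc\r\nde\r\nfg\rhij\rklm\r\nno\r"):
    return [data[i:i + 4] for i in range(0, len(data), 4)]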

  def test_text_split_across_buffers_delimiter(self, vector, unique_database):
    """Creates and queries a datafile that exercises a split "\r\n" across io buffers
    (but within a single scan range). We use a 32MB file and 16MB scan ranges, so there
    are two scan ranges of two io buffers each. The first scan range exercises a split
    delimiter in the main text parsing algorithm. The second scan range exercises
    correctly identifying a split delimiter as the first in a scan range."""
    DEFAULT_IO_BUFFER_SIZE = 8 * 1024 * 1024
    data = ('a' * (DEFAULT_IO_BUFFER_SIZE - 1) + "\r\n" +  # first scan range
            'b' * (DEFAULT_IO_BUFFER_SIZE - 3) + "\r\n" +
            'a' * (DEFAULT_IO_BUFFER_SIZE - 1) + "\r\n" +  # second scan range
            'b' * (DEFAULT_IO_BUFFER_SIZE - 1))
    assert len(data) == DEFAULT_IO_BUFFER_SIZE * 4
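    # Length check: (B-1) + 2 + (B-3) + 2 + (B-1) + 2 + (B-1) = 4B for
    # B = DEFAULT_IO_BUFFER_SIZE, so the file spans exactly four io buffers.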

    max_scan_range_length = DEFAULT_IO_BUFFER_SIZE * 2
    expected_result = data.split("\r\n")

    self._create_and_query_test_table(
        vector, unique_database, data, max_scan_range_length, expected_result)

  def _create_and_query_test_table(self, vector, unique_database, data,
      max_scan_range_length, expected_result):
    TABLE_NAME = "test_text_split_delimiters"
    qualified_table_name = "%s.%s" % (unique_database, TABLE_NAME)
    location = get_fs_path("/test-warehouse/%s_%s" % (unique_database, TABLE_NAME))
    query = "create table %s (s string) location '%s'" % (qualified_table_name, location)
    self.client.execute(query)

    with tempfile.NamedTemporaryFile() as f:
      f.write(data)
      f.flush()
      check_call(['hadoop', 'fs', '-copyFromLocal', f.name, location])
    self.client.execute("refresh %s" % qualified_table_name)

    vector.get_value('exec_option')['max_scan_range_length'] = max_scan_range_length
    query = "select * from %s" % qualified_table_name
    result = self.execute_query_expect_success(
        self.client, query, vector.get_value('exec_option'))

    assert sorted(result.data) == sorted(expected_result)

# Test for IMPALA-1740: Support for skip.header.line.count.
# (Named distinctly from TestTextScanRangeLengths above so it doesn't shadow that
# class.)
class TestTextScannerWithHeader(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTextScannerWithHeader, cls).add_test_dimensions()
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'text')

  def test_text_scanner_with_header(self, vector):
    self.run_test_case('QueryTest/hdfs-text-scan-with-header', vector)


# Missing coverage: no coverage for truncated file errors or scans.
@SkipIfS3.hive
@SkipIfIsilon.hive
@SkipIfLocal.hive
@pytest.mark.execute_serially
class TestScanTruncatedFiles(ImpalaTestSuite):
  TEST_DB = 'test_truncated_file'

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScanTruncatedFiles, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())

    # This test takes about a minute to complete due to the Hive commands that are
    # executed. To cut down on runtime, limit the test to the exhaustive exploration
    # strategy.
    # TODO: Test other file formats
    if cls.exploration_strategy() == 'exhaustive':
      cls.TestMatrix.add_constraint(lambda v:
          v.get_value('table_format').file_format == 'text' and
          v.get_value('table_format').compression_codec == 'none')
    else:
      cls.TestMatrix.add_constraint(lambda v: False)

  def setup_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)
    self.client.execute("create database %s location '%s/%s.db'" %
        (TestScanTruncatedFiles.TEST_DB, WAREHOUSE, TestScanTruncatedFiles.TEST_DB))

  def teardown_method(self, method):
    self.cleanup_db(TestScanTruncatedFiles.TEST_DB)

  def test_scan_truncated_file_empty(self, vector):
    self.scan_truncated_file(0)

  def test_scan_truncated_file(self, vector):
    self.scan_truncated_file(10)

  def scan_truncated_file(self, num_rows):
    db_name = TestScanTruncatedFiles.TEST_DB
    tbl_name = "tbl"
    self.execute_query("use %s" % db_name)
    self.execute_query("create table %s (s string)" % tbl_name)
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes" % (db_name, tbl_name)])

    # Update the Impala metadata.
    self.execute_query("refresh %s" % tbl_name)

    # Insert overwrite with a truncated file.
    call(["hive", "-e", "INSERT OVERWRITE TABLE %s.%s SELECT string_col from "
        "functional.alltypes limit %s" % (db_name, tbl_name, num_rows)])
    result = self.execute_query("select count(*) from %s" % tbl_name)
    assert len(result.data) == 1
    assert result.data[0] == str(num_rows)
|