Files
impala/tests/query_test/test_scanners_all_table_formats.py
Lenni Kuff 0ac0527643 Reduce test execution time by limiting long running tests to exhaustive exec strategy
I looked at the latest run from master and took the test suites that had long
execution times. This cleans those test suites up to either completely disable them
on 'core' or add constraints to limit the number of test vectors. It shouldn't impact
nightly coverage since we still run the same tests exhaustively.

Change-Id: I10c78c35155b00de0c36d9fc0923b2b1fc6b44de
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3119
Reviewed-by: Marcel Kornacker <marcel@cloudera.com>
Tested-by: jenkins
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3125
Reviewed-by: Lenni Kuff <lskuff@cloudera.com>
2014-06-18 16:18:17 -07:00

200 lines
8.0 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This test suite validates the scanners by running queries against ALL file formats and
# their permutations (e.g. compression codec/compression type). This works by exhaustively
# generating the table format test vectors for this specific test suite. This way, other
# tests can run with the normal exploration strategy and the overall test runtime doesn't
# explode.
import logging
import pytest
from copy import deepcopy
from testdata.common import widetable
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.util.test_file_parser import *
from tests.common.test_dimensions import create_single_exec_option_dimension
class TestScannersAllTableFormats(ImpalaTestSuite):
  """Runs the QueryTest/scanners test case over all table formats and batch sizes."""

  # Values fed into the 'batch_size' test dimension.
  BATCH_SIZES = [0, 1, 16]

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Extend the base dimensions with table formats and a 'batch_size' dimension."""
    super(TestScannersAllTableFormats, cls).add_test_dimensions()

    # Table format vectors are always generated with the 'exhaustive' strategy here.
    # The --table_formats flag can still override the resulting set.
    table_format_dimension = cls.create_table_info_dimension('exhaustive')
    cls.TestMatrix.add_dimension(table_format_dimension)

    batch_size_dimension = TestDimension(
        'batch_size', *TestScannersAllTableFormats.BATCH_SIZES)
    cls.TestMatrix.add_dimension(batch_size_dimension)

  def test_scanners(self, vector):
    """Run QueryTest/scanners with the vector's batch_size applied as an exec option."""
    # Work on a copy so the vector passed in is left unmodified.
    scan_vector = deepcopy(vector)
    batch_size = vector.get_value('batch_size')
    scan_vector.get_value('exec_option')['batch_size'] = batch_size
    self.run_test_case('QueryTest/scanners', scan_vector)
# Test all the scanners with a simple limit clause. The limit clause triggers
# cancellation in the scanner code paths.
class TestScannersAllTableFormatsWithLimit(ImpalaTestSuite):
  """Runs a simple limit query against all table formats.

  The limit clause triggers cancellation in the scanner code paths.
  """

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Extend the base dimensions with exhaustively generated table formats."""
    super(TestScannersAllTableFormatsWithLimit, cls).add_test_dimensions()
    # Exhaustively generate all table format vectors. This can still be overridden
    # using the --table_formats flag.
    cls.TestMatrix.add_dimension(cls.create_table_info_dimension('exhaustive'))

  def test_limit(self, vector):
    """Run "select * from alltypes limit N" repeatedly, changing N on every iteration."""
    # Copy the vector so the batch_size override below does not leak into the exec
    # options of the vector that was passed in.
    new_vector = deepcopy(vector)
    exec_option = new_vector.get_value('exec_option')
    table_format = new_vector.get_value('table_format')

    # Use a small batch size so changing the limit affects the timing of cancellation
    exec_option['batch_size'] = 100
    iterations = 50
    query_template = "select * from alltypes limit %s"
    for i in range(1, iterations):
      # Vary the limit to vary the timing of cancellation. The limit must be derived
      # from the loop index: deriving it from the constant iteration count would yield
      # the same limit (1) on every pass.
      limit = (i * 100) % 1000 + 1
      query = query_template % limit
      self.execute_query(query, exec_option, table_format=table_format)
# Test case to verify the scanners work properly when the table metadata (specifically the
# number of columns in the table) does not match the number of columns in the data file.
class TestUnmatchedSchema(ImpalaTestSuite):
  """Checks scanners when the table's column count differs from the data files'."""

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Add table format and single exec option dimensions; exclude avro."""
    super(TestUnmatchedSchema, cls).add_test_dimensions()
    # TODO: Does it add anything to enumerate all the supported compression codecs
    # for each table format?
    cls.TestMatrix.add_dimension(cls.create_table_info_dimension('exhaustive'))
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())

    # Avro has a more advanced schema evolution process which is covered in more depth
    # in the test_avro_schema_evolution test suite.
    cls.TestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'avro')

  def __get_table_location(self, table_name, vector):
    """Return the location reported by "describe formatted" for table_name.

    The location is taken from the second tab-separated field of the first result row
    that contains 'Location:'. Fails with an assertion if no such row exists.
    """
    result = self.execute_query_using_client(self.client,
        "describe formatted %s" % table_name, vector)
    for row in result.data:
      if 'Location:' in row:
        return row.split('\t')[1]
    # This should never happen.
    assert 0, 'Unable to get location for table: ' + table_name

  def __create_test_table(self, vector):
    """
    Creates the test table
    Cannot be done in a setup method because we need access to the current test vector
    """
    # Start from a clean slate in case an earlier run left the table behind.
    self.__drop_test_table(vector)
    self.execute_query_using_client(self.client,
        "create external table jointbl_test like jointbl", vector)

    # Update the location of the new table to point the same location as the old table
    location = self.__get_table_location('jointbl', vector)
    self.execute_query_using_client(self.client,
        "alter table jointbl_test set location '%s'" % location, vector)

  def __drop_test_table(self, vector):
    """Drop jointbl_test if it exists."""
    self.execute_query_using_client(self.client,
        "drop table if exists jointbl_test", vector)

  def test_unmatched_schema(self, vector):
    """Run QueryTest/test-unmatched-schema against a freshly created jointbl_test."""
    table_format = vector.get_value('table_format')
    # jointbl has no columns with unique values. When loaded in hbase, the table looks
    # different, as hbase collapses duplicates.
    if table_format.file_format == 'hbase':
      pytest.skip("jointbl looks different in hbase, which collapses duplicates")

    try:
      self.__create_test_table(vector)
      self.run_test_case('QueryTest/test-unmatched-schema', vector)
    finally:
      # Always clean up, even when table creation or the test case fails part-way.
      # The drop uses "if exists", so it is safe on every path.
      self.__drop_test_table(vector)
# Tests that scanners can read a single-column, single-row, 10MB table
class TestWideRow(ImpalaTestSuite):
  """Runs QueryTest/wide-row with a fixed scan range length and memory limit."""

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Use the base dimensions, minus any hbase table format."""
    super(TestWideRow, cls).add_test_dimensions()

    def is_not_hbase(v):
      return v.get_value('table_format').file_format != 'hbase'

    # I can't figure out how to load a huge row into hbase
    cls.TestMatrix.add_constraint(is_not_hbase)

  def test_wide_row(self, vector):
    """Run the wide-row test case on a copy of the vector with tuned exec options."""
    bytes_per_mb = 1024 * 1024
    wide_row_vector = deepcopy(vector)
    exec_options = wide_row_vector.get_value('exec_option')

    # Use a 5MB scan range, so we will have to perform 5MB of sync reads
    exec_options['max_scan_range_length'] = 5 * bytes_per_mb

    # We need > 10 MB of memory because we're creating extra buffers:
    # - 10 MB table / 5 MB scan range = 2 scan ranges, each of which may allocate ~20MB
    # - Sync reads will allocate ~5MB of space
    # The 80MB value used here was determined empirically by raising the limit until
    # the query succeeded for all file formats -- I don't know exactly why we need this
    # much.
    # TODO: figure out exact breakdown of memory usage (IMPALA-681)
    exec_options['mem_limit'] = 80 * bytes_per_mb

    self.run_test_case('QueryTest/wide-row', wide_row_vector)
class TestWideTable(ImpalaTestSuite):
  """Compares the contents of the widetable_<N>_cols tables with generated data."""

  # Column counts fed into the 'num_cols' test dimension.
  # TODO: expand this to more rows when we have the capability
  NUM_COLS = [250, 500, 1000]

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Exclude hbase, add the 'num_cols' dimension, and run only in exhaustive mode."""
    super(TestWideTable, cls).add_test_dimensions()

    def is_not_hbase(v):
      return v.get_value('table_format').file_format != 'hbase'

    cls.TestMatrix.add_constraint(is_not_hbase)
    cls.TestMatrix.add_dimension(TestDimension("num_cols", *cls.NUM_COLS))

    if cls.exploration_strategy() == 'exhaustive':
      return
    # To cut down on test execution time, only run in exhaustive: a constraint that
    # rejects every vector is added for any other strategy.
    cls.TestMatrix.add_constraint(lambda v: False)

  def test_wide_table(self, vector):
    """Check the row count and the full contents of one wide table."""
    col_count = vector.get_value('num_cols')
    row_count = 10
    db_name = QueryTestSectionReader.get_db_name(vector.get_value('table_format'))
    table_name = "%s.widetable_%s_cols" % (db_name, col_count)

    # First make sure the table holds the expected number of rows.
    count_result = self.client.execute("select count(*) from %s " % table_name)
    assert count_result.data == [str(row_count)]

    # Then compare every row against the generated data, ignoring row order.
    expected_rows = widetable.get_data(col_count, row_count, quote_strings=True)
    select_result = self.client.execute("select * from %s" % table_name)
    types = parse_column_types(select_result.schema)
    labels = parse_column_labels(select_result.schema)
    expected = QueryTestResult(expected_rows, types, labels, order_matters=False)
    actual = QueryTestResult(
        parse_result_rows(select_result), types, labels, order_matters=False)
    assert expected == actual
class TestParquet(ImpalaTestSuite):
  """Runs the QueryTest/parquet test case for parquet table formats only."""

  @classmethod
  def get_workload(cls):
    """Return the workload name for this suite."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Use the base dimensions, restricted to the parquet file format."""
    super(TestParquet, cls).add_test_dimensions()

    def is_parquet(v):
      return v.get_value('table_format').file_format == 'parquet'

    cls.TestMatrix.add_constraint(is_parquet)

  def test_parquet(self, vector):
    """Run the parquet test case with the given vector."""
    self.run_test_case('QueryTest/parquet', vector)