mirror of
https://github.com/apache/impala.git
synced 2026-01-05 12:01:11 -05:00
This is the first set of changes required to start getting our functional test infrastructure moved from JUnit to Python. After investigating a number of options, I decided to go with a python test executor named py.test (http://pytest.org/). It is very flexible, open source (MIT licensed), and will enable us to do some cool things like parallel test execution. As part of this change, we now use our "test vectors" for query test execution. This will be very nice because it means that if you load the "core" dataset you know you will be able to run the "core" query tests (specified by --exploration_strategy when running the tests). You will see that now each combination of table format + query exec options is treated like an individual test case. This will make it much easier to debug exactly where something failed. These new tests can be run using the script at tests/run-tests.sh
141 lines
6.0 KiB
Python
Executable File
141 lines
6.0 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# This script is used to generate test "vectors" based on a dimension input file.
|
|
# A vector in this context is simply a permutation of the values in the
|
|
# dimension input file. For example, in this case the script is generating test vectors
|
|
# for the Impala / Hive benchmark suite so interesting dimensions are data set,
|
|
# file format, and compression algorithm. More can be added later.
|
|
# The output of running this script is a list of vectors. Currently two different vector
|
|
# outputs are generated - an "exhaustive" vector which contains all permutations and a
|
|
# "pairwise" vector that contains a subset of the vectors by choosing all combinations of
|
|
# pairs (the pairwise strategy). More information about pairwise can be found at
|
|
# http://www.pairwise.org.
|
|
#
|
|
# The end goal is to have a reduced set of test vectors that provides coverage but doesn't take
|
|
# as long to run as the exhaustive set of vectors along with a set of vectors that provide
|
|
# full coverage. This is especially important for benchmarks which work on very large data
|
|
# sets.
|
|
#
|
|
# The output files can then be read in by other scripts, tools, and tests.
|
|
# One major use case is the generate_schema_statements.py script, which uses the vector
|
|
# files to dynamically build schema for running benchmark and functional tests.
|
|
#
|
|
# The pairwise generation is done using the Python 'AllPairs' module. This module can be
|
|
# downloaded from http://pypi.python.org/pypi/AllPairs/2.0.1
|
|
#
|
|
import collections
|
|
import csv
|
|
import math
|
|
import os
|
|
import sys
|
|
from itertools import product
|
|
from optparse import OptionParser
|
|
import metacomm.combinatorics.all_pairs2
|
|
all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2
|
|
|
|
parser = OptionParser()
|
|
parser.add_option("-w", "--workload", dest="workload",
|
|
help="The workload to generate test vectors for")
|
|
(options, args) = parser.parse_args()
|
|
|
|
if options.workload is None:
|
|
print "A workload name must be specified."
|
|
parser.print_help()
|
|
sys.exit(1)
|
|
|
|
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
|
|
|
|
# This array also defines the order of the dimension values. This ordering
|
|
# is important because it is used to apply constraints. Add new items to the
|
|
# end of the list.
|
|
KNOWN_DIMENSION_NAMES = ['file_format', 'dataset', 'compression_codec',
|
|
'compression_type']
|
|
|
|
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
|
|
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
|
|
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
|
|
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')
|
|
|
|
class VectorGenerator:
|
|
def __init__(self, input_vectors):
|
|
self.input_vectors = input_vectors
|
|
|
|
def generate_pairwise_matrix(self, filter_func = None):
|
|
if filter_func is None:
|
|
filter_func = lambda vector: True
|
|
return all_pairs(self.input_vectors, filter_func = is_valid_combination)
|
|
|
|
def generate_exhaustive_matrix(self, filter_func = None):
|
|
if filter_func is None:
|
|
filter_func = lambda vector: True
|
|
return [list(vec) for vec in product(*self.input_vectors) if filter_func(vec)]
|
|
|
|
# Add vector value constraints to this function. This
|
|
def is_valid_combination(vector):
|
|
if len(vector) == 4:
|
|
return not (
|
|
(vector[FILE_FORMAT_IDX] == 'text' and vector[COMPRESSION_IDX] != 'none') or
|
|
(vector[COMPRESSION_IDX] == 'none' and vector[COMPRESSION_TYPE_IDX] != 'none') or
|
|
(vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or
|
|
(vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
|
|
(vector[FILE_FORMAT_IDX] == 'trevni' and
|
|
(vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip')))
|
|
|
|
# The pairwise generator may call this with different vector lengths. In that case this
|
|
# should always return true.
|
|
return True
|
|
|
|
# Vector files have the format: <dimension name>: value1, value2, ... this function
|
|
# adds all specified dimensions to a map of dimension name-to-value
|
|
def read_dimension_file(file_name):
|
|
dimension_map = collections.defaultdict(list)
|
|
with open(file_name, 'rb') as input_file:
|
|
for line in input_file.readlines():
|
|
if line.strip().startswith('#'):
|
|
continue
|
|
values = line.split(':')
|
|
if len(values) != 2:
|
|
print 'Invalid dimension file format. Expected format is <dimension name>: val1,'\
|
|
' val2, ... Found: ' + line
|
|
sys.exit(1)
|
|
if not values[0] in KNOWN_DIMENSION_NAMES:
|
|
print 'Unknown dimension name: ' + values[0]
|
|
print 'Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES)
|
|
sys.exit(1)
|
|
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
|
|
return dimension_map
|
|
|
|
def write_vectors_to_csv(output_dir, output_file, matrix):
|
|
output_text = "# Generated File."
|
|
for row in matrix:
|
|
row = ['%s: %s' % (KNOWN_DIMENSION_NAMES[i], row[i]) for i in range(0, len(row))]
|
|
output_text += '\n' + ', '.join(row)
|
|
|
|
output_path = os.path.join(output_dir, output_file)
|
|
print 'Writing test vectors to: ' + output_path
|
|
with open(output_path, 'wb') as output_file:
|
|
output_file.write(output_text)
|
|
output_file.write('\n')
|
|
|
|
dimension_file = os.path.join(WORKLOAD_DIR, options.workload,
|
|
'%s_dimensions.csv' % options.workload)
|
|
if not os.path.isfile(dimension_file):
|
|
print 'Dimension file not found: ' + dimension_file
|
|
sys.exit(1)
|
|
|
|
print 'Reading dimension file: ' + dimension_file
|
|
vector_map = read_dimension_file(dimension_file)
|
|
vectors = []
|
|
|
|
# This ordering matters! We need to know the order to apply the proper constraints.
|
|
for dimension_name in KNOWN_DIMENSION_NAMES:
|
|
vectors.append(vector_map[dimension_name])
|
|
vg = VectorGenerator(vectors)
|
|
|
|
output_dir = os.path.join(WORKLOAD_DIR, options.workload)
|
|
write_vectors_to_csv(output_dir, '%s_pairwise.csv' % options.workload,
|
|
vg.generate_pairwise_matrix(is_valid_combination))
|
|
write_vectors_to_csv(output_dir, '%s_exhaustive.csv' % options.workload,
|
|
vg.generate_exhaustive_matrix(is_valid_combination))
|