mirror of
https://github.com/apache/impala.git
synced 2026-01-01 18:00:30 -05:00
Change-Id: Ia01181a1e10eb108419122d347e9d869a69e8922 Reviewed-on: http://gerrit.ent.cloudera.com:8080/102 Reviewed-by: Ishaan Joshi <ishaan@cloudera.com> Tested-by: Ishaan Joshi <ishaan@cloudera.com>
145 lines
6.3 KiB
Python
Executable File
145 lines
6.3 KiB
Python
Executable File
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to generate test "vectors" based on a dimension input file.
# A vector in this context is simply a permutation of the values in the
# dimension input file. For example, in this case the script is generating test vectors
# for the Impala / Hive benchmark suite so interesting dimensions are data set,
# file format, and compression algorithm. More can be added later.
# The output of running this script is a list of vectors. Currently two different vector
# outputs are generated - an "exhaustive" vector which contains all permutations and a
# "pairwise" vector that contains a subset of the vectors by choosing all combinations of
# pairs (the pairwise strategy). More information about pairwise can be found at
# http://www.pairwise.org.
#
# The end goal is to have a reduced set of test vectors to provide coverage but don't take
# as long to run as the exhaustive set of vectors along with a set of vectors that provide
# full coverage. This is especially important for benchmarks which work on very large data
# sets.
#
# The output files can then be read in by other scripts, tools, and tests.
# One major use case is the generate_schema_statements.py script, which uses the vector
# files to dynamically build schema for running benchmark and functional tests.
#
# The pairwise generation is done using the Python 'AllPairs' module. This module can be
# downloaded from http://pypi.python.org/pypi/AllPairs/2.0.1
#
import collections
import csv
import math
import os
import sys
from itertools import product
from optparse import OptionParser

import metacomm.combinatorics.all_pairs2

all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2

# Parse command-line options. The workload name is required: it selects both
# the dimension input file and the directory the vector files are written to.
parser = OptionParser()
parser.add_option("-w", "--workload", dest="workload",
                  help="The workload to generate test vectors for")
(options, args) = parser.parse_args()

if options.workload is None:
  # Parenthesized print works under both Python 2 and Python 3; the bare
  # print statement was Python-2-only.
  print("A workload name must be specified.")
  parser.print_help()
  sys.exit(1)

# Root directory containing one subdirectory per workload.
# Raises KeyError if the environment is not set up - fail fast is intended.
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

# This list also defines the order of the dimension values. The ordering is
# important because it is used to apply constraints. Add new items to the
# end of the list.
KNOWN_DIMENSION_NAMES = [
    'file_format',
    'dataset',
    'compression_codec',
    'compression_type',
]

# Positional index of each dimension within a generated vector row.
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')

class VectorGenerator:
  """Generates test vector matrices from per-dimension value lists.

  input_vectors is a list of lists, one inner list per dimension, ordered to
  match KNOWN_DIMENSION_NAMES so positional constraints can be applied.
  """
  def __init__(self, input_vectors):
    self.input_vectors = input_vectors

  def generate_pairwise_matrix(self, filter_func = None):
    """Returns vectors covering all value pairs, filtered by filter_func.

    Bug fix: the caller-supplied filter_func is now actually forwarded to
    all_pairs; previously it was ignored and is_valid_combination was always
    used, making the parameter dead.
    """
    if filter_func is None:
      filter_func = lambda vector: True
    return all_pairs(self.input_vectors, filter_func = filter_func)

  def generate_exhaustive_matrix(self, filter_func = None):
    """Returns every permutation of the input values that passes filter_func."""
    if filter_func is None:
      filter_func = lambda vector: True
    return [list(vec) for vec in product(*self.input_vectors) if filter_func(vec)]

# Add vector value constraints to this function. A full-length vector is valid
# only if none of the known-bad format/codec/codec-type combinations hold.
def is_valid_combination(vector):
  # The pairwise generator may call this with partial vectors of different
  # lengths. In that case this should always return True.
  if len(vector) != 4:
    return True

  file_format = vector[FILE_FORMAT_IDX]
  codec = vector[COMPRESSION_IDX]
  codec_type = vector[COMPRESSION_TYPE_IDX]

  # Text tables only support no compression or lzo, and lzo is text-only.
  if file_format == 'text' and codec not in ['none', 'lzo']:
    return False
  if file_format != 'text' and codec == 'lzo':
    return False
  # Codec and codec type must agree: either both 'none' or neither.
  if (codec == 'none') != (codec_type == 'none'):
    return False
  # Record-level compression only applies to sequence files.
  if file_format != 'seq' and codec_type == 'record':
    return False
  # Parquet and HBase tables are only generated uncompressed.
  if file_format == 'parquet' and codec != 'none':
    return False
  if file_format == 'hbase' and codec != 'none':
    return False
  # Avro supports only no compression, snappy, or deflate.
  if file_format == 'avro' and codec not in ['none', 'snap', 'def']:
    return False
  return True

# Vector files have the format: <dimension name>: value1, value2, ... this function
# adds all specified dimensions to a map of dimension name-to-values.
def read_dimension_file(file_name):
  """Parses a dimension file into a dict of dimension name -> list of values.

  Lines whose first non-whitespace character is '#' are skipped. Exits the
  program on a malformed line or an unknown dimension name.
  """
  dimension_map = collections.defaultdict(list)
  # Open in text mode ('r', not 'rb'): the content is parsed as text.
  with open(file_name, 'r') as input_file:
    # Iterate the file directly instead of materializing via readlines().
    for line in input_file:
      if line.strip().startswith('#'):
        continue
      values = line.split(':')
      if len(values) != 2:
        print('Invalid dimension file format. Expected format is <dimension name>: val1,'
              ' val2, ... Found: ' + line)
        sys.exit(1)
      if values[0] not in KNOWN_DIMENSION_NAMES:
        print('Unknown dimension name: ' + values[0])
        print('Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES))
        sys.exit(1)
      dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
  return dimension_map

def write_vectors_to_csv(output_dir, output_file, matrix):
  """Writes the vector matrix to output_dir/output_file, one vector per line.

  Each row is formatted as 'dim_name: value, dim_name: value, ...' with the
  dimension names taken positionally from KNOWN_DIMENSION_NAMES. A
  '# Generated File.' header line is written first.
  """
  lines = ["# Generated File."]
  for row in matrix:
    # enumerate() instead of range(len(...)); join instead of quadratic +=.
    lines.append(', '.join('%s: %s' % (KNOWN_DIMENSION_NAMES[i], value)
                           for i, value in enumerate(row)))

  output_path = os.path.join(output_dir, output_file)
  print('Writing test vectors to: ' + output_path)
  # Renamed handle so it no longer shadows the output_file parameter; text
  # mode ('w') since str data is written.
  with open(output_path, 'w') as output_handle:
    output_handle.write('\n'.join(lines))
    output_handle.write('\n')

# Locate and validate the dimension input file for the requested workload.
dimension_file = os.path.join(WORKLOAD_DIR, options.workload,
                              '%s_dimensions.csv' % options.workload)
if not os.path.isfile(dimension_file):
  print('Dimension file not found: ' + dimension_file)
  sys.exit(1)

print('Reading dimension file: ' + dimension_file)
vector_map = read_dimension_file(dimension_file)
vectors = []

# This ordering matters! We need to know the order to apply the proper constraints.
for dimension_name in KNOWN_DIMENSION_NAMES:
  vectors.append(vector_map[dimension_name])
vg = VectorGenerator(vectors)

# Emit both the reduced (pairwise) and the full (exhaustive) vector files,
# applying the same validity constraints to each.
output_dir = os.path.join(WORKLOAD_DIR, options.workload)
write_vectors_to_csv(output_dir, '%s_pairwise.csv' % options.workload,
                     vg.generate_pairwise_matrix(is_valid_combination))
write_vectors_to_csv(output_dir, '%s_exhaustive.csv' % options.workload,
                     vg.generate_exhaustive_matrix(is_valid_combination))