#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to generate test "vectors" based on a dimension input file.
# A vector in this context is simply a permutation of the values in the
# dimension input file. For example, in this case the script is generating test vectors
# for the Impala / Hive benchmark suite so interesting dimensions are data set,
# file format, and compression algorithm. More can be added later.
# The output of running this script is a list of vectors. Currently two different vector
# outputs are generated - an "exhaustive" vector which contains all permutations and a
# "pairwise" vector that contains a subset of the vectors by choosing all combinations of
# pairs (the pairwise strategy). More information about pairwise can be found at
# http://www.pairwise.org.
#
# The end goal is to have a reduced set of test vectors that provides coverage but doesn't
# take as long to run as the exhaustive set of vectors, along with a set of vectors that
# provides full coverage. This is especially important for benchmarks which work on very
# large data sets.
#
# The output files can then be read by other scripts, tools, and tests.
# One major use case is the generate_schema_statements.py script, which uses the vector
# files to dynamically build schema for running benchmark and functional tests.
#
# The pairwise generation is done using the Python 'AllPairs' module.
# This module can be downloaded from http://pypi.python.org/pypi/AllPairs/2.0.1
#
import collections
import csv
import math
import os
import sys
from itertools import product
from optparse import OptionParser

import metacomm.combinatorics.all_pairs2
all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2

parser = OptionParser()
parser.add_option("-w", "--workload", dest="workload",
                  help="The workload to generate test vectors for")
(options, args) = parser.parse_args()

if options.workload is None:
  print("A workload name must be specified.")
  parser.print_help()
  sys.exit(1)

WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

# This array also defines the order of the dimension values. This ordering
# is important because it is used to apply constraints. Add new items to the
# end of the list.
KNOWN_DIMENSION_NAMES = ['file_format', 'dataset', 'compression_codec', 'compression_type']
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')


class VectorGenerator:
  """Generates test-vector matrices from a list of dimension value lists.

  input_vectors is a list of lists - one list of possible values per
  dimension, ordered to match KNOWN_DIMENSION_NAMES.
  """
  def __init__(self, input_vectors):
    self.input_vectors = input_vectors

  def generate_pairwise_matrix(self, filter_func=None):
    """Returns a reduced set of vectors covering all value pairs.

    Only vectors for which filter_func returns True are considered.
    BUG FIX: the previous version accepted filter_func but ignored it,
    always passing is_valid_combination to all_pairs.
    """
    if filter_func is None:
      filter_func = lambda vector: True
    return all_pairs(self.input_vectors, filter_func=filter_func)

  def generate_exhaustive_matrix(self, filter_func=None):
    """Returns every permutation of dimension values accepted by filter_func."""
    if filter_func is None:
      filter_func = lambda vector: True
    return [list(vec) for vec in product(*self.input_vectors) if filter_func(vec)]

# Add vector value constraints to the is_valid_combination function below.
def is_valid_combination(vector):
  """Returns True if this combination of dimension values is allowed.

  The vector is ordered per KNOWN_DIMENSION_NAMES. The constraints encode
  which codec/compression-type values make sense for each file format
  (e.g. 'lzo' only applies to text, 'record' compression only to seq files,
  and parquet/hbase tables are only generated uncompressed).
  """
  if len(vector) == 4:
    return not (
        (vector[FILE_FORMAT_IDX] == 'text' and
         vector[COMPRESSION_IDX] not in ['none', 'lzo']) or
        (vector[FILE_FORMAT_IDX] != 'text' and vector[COMPRESSION_IDX] == 'lzo') or
        (vector[COMPRESSION_IDX] == 'none' and vector[COMPRESSION_TYPE_IDX] != 'none') or
        (vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or
        (vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
        (vector[FILE_FORMAT_IDX] == 'parquet' and vector[COMPRESSION_IDX] != 'none') or
        (vector[FILE_FORMAT_IDX] == 'hbase' and vector[COMPRESSION_IDX] != 'none') or
        (vector[FILE_FORMAT_IDX] == 'avro' and
         vector[COMPRESSION_IDX] not in ['none', 'snap', 'def']))
  # The pairwise generator may call this with different vector lengths. In that
  # case this should always return true.
  return True


# Vector files have the format: <dimension name>: value1, value2, ... this
# function adds all specified dimensions to a map of dimension name-to-values.
def read_dimension_file(file_name):
  """Parses a dimension file and returns a dict of name -> list of values.

  Exits the script with an error message if a line is malformed or names an
  unknown dimension. Comment ('#') and blank lines are skipped.
  """
  dimension_map = collections.defaultdict(list)
  with open(file_name, 'rb') as input_file:
    for line in input_file:
      line = line.strip()
      # Skip comments and blank lines. The previous version only skipped
      # comments, so a trailing blank line caused a spurious format error.
      if not line or line.startswith('#'):
        continue
      values = line.split(':')
      if len(values) != 2:
        print('Invalid dimension file format. Expected format is '
              '<dimension name>: val1, val2, ... Found: ' + line)
        sys.exit(1)
      if not values[0] in KNOWN_DIMENSION_NAMES:
        print('Unknown dimension name: ' + values[0])
        print('Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES))
        sys.exit(1)
      dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
  return dimension_map


def write_vectors_to_csv(output_dir, output_file, matrix):
  """Writes each vector in the matrix as one 'name: value, ...' line.

  output_file is the file name; it is joined with output_dir to form the
  destination path. (The previous version shadowed this parameter with the
  open file handle.)
  """
  lines = ['# Generated File.']
  for row in matrix:
    lines.append(', '.join('%s: %s' % (KNOWN_DIMENSION_NAMES[i], value)
                           for i, value in enumerate(row)))
  output_path = os.path.join(output_dir, output_file)
  print('Writing test vectors to: ' + output_path)
  # Join once instead of repeated string concatenation in the loop.
  with open(output_path, 'wb') as output:
    output.write('\n'.join(lines))
    output.write('\n')


dimension_file = os.path.join(WORKLOAD_DIR, options.workload,
                              '%s_dimensions.csv' % options.workload)
if not os.path.isfile(dimension_file):
  print('Dimension file not found: ' + dimension_file)
  sys.exit(1)

print('Reading dimension file: ' + dimension_file)
vector_map = read_dimension_file(dimension_file)

vectors = []
# This ordering matters! We need to know the order to apply the proper constraints.
for dimension_name in KNOWN_DIMENSION_NAMES:
  vectors.append(vector_map[dimension_name])

vg = VectorGenerator(vectors)
output_dir = os.path.join(WORKLOAD_DIR, options.workload)
write_vectors_to_csv(output_dir, '%s_pairwise.csv' % options.workload,
                     vg.generate_pairwise_matrix(is_valid_combination))
write_vectors_to_csv(output_dir, '%s_exhaustive.csv' % options.workload,
                     vg.generate_exhaustive_matrix(is_valid_combination))