#!/usr/bin/env python # Copyright (c) 2012 Cloudera, Inc. All rights reserved. # # This script is used to run benchmark queries. It runs the set queries specified in the # given workload(s) under /queries. This script will first try to warm the # buffer cache before running the query. There is a command line options to control how # many iterations to run each query. # # By default, the script will have minimal output. Verbose output can be turned on with # the -v option which will output the normal query output. In addition, the -p option # can be passed which will enable gprof instrumentation and output the sampled call # stacks. The -v and -p option are used by the perf regression tests. # import csv import logging import math import os import pickle import sys import subprocess import threading from collections import defaultdict from functools import partial from optparse import OptionParser from os.path import isfile, isdir from random import choice from sys import exit from time import sleep from tests.common.workload_runner import WorkloadRunner, QueryExecutionDetail # Options # TODO: Find ways to reduce the number of options. parser = OptionParser() parser.add_option("-p", "--profiler", dest="profiler", action="store_true", default = False, help="If set, also run google pprof for sample profiling.") parser.add_option("-v", "--verbose", dest="verbose", action="store_true", default = False, help="If set, outputs all benchmark diagnostics.") parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core", help="The exploration strategy to use for running benchmark: 'core', "\ "'pairwise', or 'exhaustive'") parser.add_option("-w", "--workloads", dest="workloads", default="hive-benchmark", help="The workload(s) and scale factors to run in a comma-separated "\ " list format. Optional scale factors for each workload are specified"\ " using colons. For example: -w tpcds,tpch:400gb,tpch:1gb. "\ "Some valid workloads: 'hive-benchmark', 'tpch', 'tpcds', ...") parser.add_option("--impalad", dest="impalad", default="localhost:21000", help="A comma-separated list of impalad instances to run the "\ "workload against.") parser.add_option("--exec_options", dest="exec_options", default='', help="Runquery exec option string.") parser.add_option("--compare_with_hive", dest="compare_with_hive", action="store_true", default= False, help="Run all queries using Hive as well as Impala") parser.add_option("--results_csv_file", dest="results_csv_file", default=os.environ['IMPALA_HOME'] + "/benchmark_results.csv", help="The output file where benchmark results are saved") parser.add_option("--hive_results_csv_file", dest="hive_results_csv_file", default=os.environ['IMPALA_HOME'] + "/hive_benchmark_results.csv", help="The output file where Hive benchmark results are saved") parser.add_option("--hive_cmd", dest="hive_cmd", default="hive -e", help="The command to use for executing hive queries") parser.add_option("-i", "--iterations", type="int", dest="iterations", default=1, help="Number of times to run each query.") parser.add_option("--prime_cache", dest="prime_cache", action="store_true", default= False, help="Whether or not to prime the buffer cache. ") parser.add_option("--num_clients", type="int", dest="num_clients", default=1, help="Number of clients (threads) to use when executing each query.") parser.add_option("--query_names", dest="query_names", default=None, help="A comma-separated list of query names to execute.") parser.add_option("--table_formats", dest="table_formats", default=None, help=\ "Override the default test vectors and run using only the specified "\ "table formats. Ex. --table_formats=seq/snap/block,text/none") parser.add_option("--skip_impala", dest="skip_impala", action="store_true", default= False, help="If set, queries will only run against Hive.") parser.add_option("--use_kerberos", dest="use_kerberos", action="store_true", default=False, help="If set, enables talking to a kerberized impalad") parser.add_option("--continue_on_query_error", dest="continue_on_query_error", action="store_true", default=False, help="If set, continue execution "\ "on each query error.") parser.add_option("-V", "--verify_results", dest="verify_results", action="store_true", default=False, help="If set, verifies query results") parser.add_option("-c", "--client_type", dest="client_type", default='beeswax', help="Client type. Valid options are 'beeswax' or 'jdbc'") # These options are used for configuring failure testing parser.add_option("--failure_frequency", type="int", dest="failure_frequency", default=0, help="Interval (in seconds) to inject each failure or 0 to disable"\ " failure injection.") parser.add_option("--cm_cluster_name", dest="cm_cluster_name", help="The CM name of the cluster under test") parser.add_option("--cm_server_host", dest="cm_server_host", help="The host of the CM server.") parser.add_option("--cm_username", dest="cm_username", default='admin', help="The username to the CM server") parser.add_option("--cm_password", dest="cm_password", default='admin', help="The password to the CM server") options, args = parser.parse_args() logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s') LOG = logging.getLogger('run-workload') def save_results(result_map, output_csv_file, is_impala_result=True): """ Writes the results to an output CSV files """ if result_map is None: LOG.error('Result map is None') return csv_writer = csv.writer(open(output_csv_file, 'wb'), delimiter='|', quoting=csv.QUOTE_MINIMAL) for query_tuple, execution_results in result_map.iteritems(): for result in execution_results: query, query_name = query_tuple append_row_to_csv_file(csv_writer, query, query_name, result[0] if is_impala_result else result[1]) def append_row_to_csv_file(csv_writer, query, query_name, result): """ Write the results to a CSV file with '|' as the delimiter. """ # Replace non-existent values with N/A for reporting results. std_dev, avg_time = result.execution_result.std_dev, result.execution_result.avg_time if not std_dev: std_dev = 'N/A' if not avg_time: avg_time = 'N/A' compression_str = '%s/%s' % (result.compression_codec, result.compression_type) if compression_str == 'none/none': compression_str = 'none' csv_writer.writerow([result.executor, result.workload, result.scale_factor, query, query_name, result.file_format, compression_str, avg_time, std_dev, options.num_clients, options.iterations, result.execution_result.runtime_profile]) def enumerate_query_files(base_directory): """ Recursively scan the given directory for all test query files. """ query_files = list() for item in os.listdir(base_directory): full_path = os.path.join(base_directory, item) if isfile(full_path) and item.endswith('.test'): query_files.append(full_path) elif isdir(full_path): query_files += enumerate_query_files(full_path) return query_files def parse_workload_scale_factor(workload_scale_factor): parsed_workload_scale_factor = workload_scale_factor.split(':') if len(parsed_workload_scale_factor) == 1: return parsed_workload_scale_factor[0], '' elif len(parsed_workload_scale_factor) == 2: return parsed_workload_scale_factor else: LOG.error("Error parsing workload. Proper format is workload[:scale factor]") sys.exit(1) def write_results(result_map, partial_results = False): suffix = '.partial' if partial_results else '' if not options.skip_impala: LOG.info("Results saving to: %s" % options.results_csv_file) save_results(result_map, options.results_csv_file + suffix, is_impala_result=True) if options.skip_impala or options.compare_with_hive: LOG.info("Hive Results saving to: %s" % options.hive_results_csv_file) save_results(result_map, options.hive_results_csv_file + suffix, is_impala_result=False) def run_workloads(workload_runner, failure_injector=None): stop_on_error = (failure_injector is None) and (not options.continue_on_query_error) if failure_injector is not None: failure_injector.start() for workload_and_scale_factor in options.workloads.split(','): workload, scale_factor = parse_workload_scale_factor(workload_and_scale_factor) workload_runner.run_workload(workload, scale_factor, table_formats=options.table_formats, query_names=options.query_names, exploration_strategy=options.exploration_strategy, stop_on_query_error=stop_on_error) if failure_injector is not None: failure_injector.cancel() def process_results(workload_runner, is_partial_result=False): write_results(workload_runner.get_results(), is_partial_result) LOG.info(workload_runner.get_summary_str()) if __name__ == "__main__": """ Driver for the run-benchmark script. It runs all the workloads specified on the command line and writes them to a csv file. """ if options.use_kerberos: try: import sasl except ImportError: print 'The sasl module is needed to query a kerberized impalad' sys.exit(1) VALID_CLIENT_TYPES = ['beeswax', 'jdbc'] if options.client_type not in VALID_CLIENT_TYPES: LOG.error("Invalid client type %s" % options.client_type) sys.exit(1); workload_runner = WorkloadRunner( client_type=options.client_type, hive_cmd=options.hive_cmd, impalad=options.impalad, iterations=options.iterations, num_clients=options.num_clients, compare_with_hive=options.compare_with_hive, skip_impala=options.skip_impala, exec_options=options.exec_options, profiler=options.profiler, verbose=options.verbose, prime_cache=options.prime_cache, use_kerberos=options.use_kerberos, verify_results=options.verify_results) failure_injector = None if options.failure_frequency > 0: # If not doing failure testing there is no reason to import these modules which # have additional dependencies. from common.failure_injector import FailureInjector from common.impala_cluster import ImpalaCluster cluster = ImpalaCluster(options.cm_server_host, options.cm_cluster_name, username=options.cm_username, password=options.cm_password) failure_injector = FailureInjector(cluster, failure_frequency=options.failure_frequency, impalad_exclude_list=options.impalad.split(',')) try: run_workloads(workload_runner, failure_injector) except Exception, e: if failure_injector is not None: failure_injector.cancel() process_results(workload_runner, is_partial_result=True) raise process_results(workload_runner, is_partial_result=False)