mirror of
https://github.com/apache/impala.git
synced 2026-01-01 00:00:20 -05:00
This patch makes the workload runner's logging more concise and informative. Specifically, it: - logs the time taken for each iteration of a query; - changes the default log level to INFO; - makes the output less verbose. Change-Id: I5f964cf76269fd64ce127b9e4c51fe1deafd1d1b Reviewed-on: http://gerrit.ent.cloudera.com:8080/1076 Reviewed-by: Ishaan Joshi <ishaan@cloudera.com> Tested-by: Ishaan Joshi <ishaan@cloudera.com>
270 lines
13 KiB
Python
Executable File
270 lines
13 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# This script is used to run benchmark queries. It runs the set of queries specified in the
|
|
# given workload(s) under <workload name>/queries. This script will first try to warm the
|
|
# buffer cache before running the query. There is a command-line option to control how
|
|
# many iterations to run each query.
|
|
#
|
|
# By default, the script will have minimal output. Verbose output can be turned on with
|
|
# the -v option which will output the normal query output. In addition, the -p option
|
|
# can be passed which will enable gprof instrumentation and output the sampled call
|
|
# stacks. The -v and -p options are used by the perf regression tests.
|
|
#
|
|
import csv
|
|
import logging
|
|
import math
|
|
import os
|
|
import pickle
|
|
import sys
|
|
import subprocess
|
|
import threading
|
|
from collections import defaultdict
|
|
from functools import partial
|
|
from optparse import OptionParser
|
|
from os.path import isfile, isdir
|
|
from random import choice
|
|
from sys import exit
|
|
from time import sleep
|
|
from tests.common.workload_runner import WorkloadRunner
|
|
from tests.util.plugin_runner import PluginRunner
|
|
|
|
# Options
|
|
# TODO: Group the options for user friendliness and readability.
|
|
# TODO: Find a way to reduce the number of options.
|
|
# Command line definition for the workload runner. The parsed values are exposed through
# the module-level `options` object, which the functions below read directly.
parser = OptionParser()
parser.add_option("-p", "--profiler", dest="profiler",
                  action="store_true", default=False,
                  help="If set, also run google pprof for sample profiling.")
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                  default=False, help="If set, outputs all benchmark diagnostics.")
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
                  help="The exploration strategy to use for running benchmark: 'core', "
                  "'pairwise', or 'exhaustive'")
parser.add_option("-w", "--workloads", dest="workloads", default="hive-benchmark",
                  help="The workload(s) and scale factors to run in a comma-separated "
                  "list format. Optional scale factors for each workload are specified"
                  " using colons. For example: -w tpcds,tpch:400gb,tpch:1gb. "
                  "Some valid workloads: 'hive-benchmark', 'tpch', 'tpcds', ...")
parser.add_option("--impalad", dest="impalad", default="localhost:21000",
                  help="A comma-separated list of impalad instances to run the "
                  "workload against.")
parser.add_option("--exec_options", dest="exec_options", default='',
                  help="Runquery exec option string.")
parser.add_option("--compare_with_hive", dest="compare_with_hive", action="store_true",
                  default=False, help="Run all queries using Hive as well as Impala")
# NOTE: the two CSV defaults below index os.environ directly, so the script raises a
# KeyError at startup when IMPALA_HOME is not set.
parser.add_option("--results_csv_file", dest="results_csv_file",
                  default=os.environ['IMPALA_HOME'] + "/benchmark_results.csv",
                  help="The output file where benchmark results are saved")
parser.add_option("--hive_results_csv_file", dest="hive_results_csv_file",
                  default=os.environ['IMPALA_HOME'] + "/hive_benchmark_results.csv",
                  help="The output file where Hive benchmark results are saved")
parser.add_option("--hive_cmd", dest="hive_cmd", default="hive -e",
                  help="The command to use for executing hive queries")
parser.add_option("-i", "--iterations", type="int", dest="iterations", default=1,
                  help="Number of times to run each query.")
parser.add_option("--num_clients", type="int", dest="num_clients", default=1,
                  help="Number of clients (threads) to use when executing each query.")
parser.add_option("--query_names", dest="query_names", default=None,
                  help="A comma-separated list of query names to execute.")
parser.add_option("--table_formats", dest="table_formats", default=None,
                  help="Override the default test vectors and run using only the "
                  "specified table formats. Ex. --table_formats=seq/snap/block,text/none")
parser.add_option("--skip_impala", dest="skip_impala", action="store_true",
                  default=False, help="If set, queries will only run against Hive.")
parser.add_option("--execution_scope", dest="execution_scope", default="query",
                  help=("The unit of execution for a workload and its test vectors. The "
                        "default scope is 'query'; In this scope, user provided "
                        "execution parameters like number of iterations and number of "
                        "clients will operate on a single query, which is a "
                        "combination of the workload, scale factor and test vector. "
                        "When 'workload' is specified as the scope, the execution "
                        "parameters will operate over a set of queries under a "
                        "workload, scale factor and a test vector"))
parser.add_option("--shuffle_query_exec_order", dest="shuffle_queries",
                  action="store_true", default=False,
                  help=("Randomizes the order of query execution. Useful when the "
                        "execution scope is a workload"))

parser.add_option("--use_kerberos", dest="use_kerberos", action="store_true",
                  default=False, help="If set, enables talking to a kerberized impalad")
parser.add_option("--continue_on_query_error", dest="continue_on_query_error",
                  action="store_true", default=False,
                  help="If set, continue execution on each query error.")
parser.add_option("-V", "--verify_results", dest="verify_results", action="store_true",
                  default=False, help="If set, verifies query results")
parser.add_option("-c", "--client_type", dest="client_type", default='beeswax',
                  help="Client type. Valid options are 'beeswax' or 'jdbc'")
parser.add_option("--plugin_names", dest="plugin_names", default=None,
                  help=("Set of comma-separated plugin names with scope; Plugins are"
                        " specified as <plugin_name>[:<scope>]. If no scope is specified,"
                        " it defaults to Query. Plugin names are case sensitive"))

# These options are used for configuring failure testing
parser.add_option("--failure_frequency", type="int", dest="failure_frequency", default=0,
                  help="Interval (in seconds) to inject each failure or 0 to disable"
                  " failure injection.")
parser.add_option("--cm_cluster_name", dest="cm_cluster_name",
                  help="The CM name of the cluster under test")
parser.add_option("--cm_server_host", dest="cm_server_host",
                  help="The host of the CM server.")
parser.add_option("--cm_username", dest="cm_username", default='admin',
                  help="The username to the CM server")
parser.add_option("--cm_password", dest="cm_password", default='admin',
                  help="The password to the CM server")
options, args = parser.parse_args()

# Log at INFO level by default, prefixing every message with the logger name.
logging.basicConfig(level=logging.INFO, format='[%(name)s]: %(message)s')
LOG = logging.getLogger('run-workload')
|
|
|
|
def save_results(result_map, output_csv_file, is_impala_result=True):
  """
  Writes the results to an output CSV file, using '|' as the field delimiter.

  result_map maps each query to a list of result pairs; element 0 of a pair is written
  when is_impala_result is True, element 1 otherwise. If result_map is None an error is
  logged and no file is written.

  TODO: This should be changed to use JSON as the container format. This is especially
  important as we add more fields to the results.
  """
  if result_map is None:
    LOG.error('Result map is None')
    return

  # The default field size limit is too small to write big runtime profiles. Set
  # the limit to an arbitrarily large value.
  csv.field_size_limit(sys.maxint)
  result_index = 0 if is_impala_result else 1

  # Use a context manager so the output file is flushed and closed even if writing a
  # row fails part way through.
  with open(output_csv_file, 'wb') as output_file:
    csv_writer = csv.writer(output_file, delimiter='|', quoting=csv.QUOTE_MINIMAL)
    for query, exec_results in result_map.iteritems():
      for result in exec_results:
        append_row_to_csv_file(csv_writer, query, result[result_index])
|
|
|
|
def append_row_to_csv_file(csv_writer, query, result):
  """
  Emit a single row describing `result` for `query` through the given csv writer.

  Unsuccessful results are only logged; no row is written for them.
  """
  if not result.success:
    # The query failed and returned an empty result: report it and skip the blank row.
    LOG.info("All threads failed for Query:%s, Scale Factor:%s, Table Format:%s" %
             (query.name, query.scale_factor, query.table_format_str))
    return

  # Missing (falsy) statistics are reported as N/A. Note this updates `result` in place.
  if not result.std_dev:
    result.std_dev = 'N/A'
  if not result.avg_time:
    result.avg_time = 'N/A'

  compression_str = '%s/%s' % (query.test_vector.compression_codec,
                               query.test_vector.compression_type)
  if compression_str == 'none/none':
    compression_str = 'none'

  # The iteration count stored with the results is a query execution parameter. At the
  # 'workload' scope each query runs only once, so 1 is recorded instead.
  if options.execution_scope.lower() == 'query':
    num_iters = options.iterations
  else:
    num_iters = 1

  row = [
      result.executor_name,
      query.workload,
      query.scale_factor,
      query.name,
      query.query_str,
      query.test_vector.file_format,
      compression_str,
      result.avg_time,
      result.std_dev,
      options.num_clients,
      num_iters,
      result.runtime_profile,
  ]
  csv_writer.writerow(row)
|
|
|
|
def enumerate_query_files(base_directory):
  """
  Return the paths of all '.test' files found under base_directory.

  Sub-directories are searched recursively. Paths are built by joining each entry name
  onto base_directory, and appear in the order os.listdir() yields the entries.
  """
  found = []
  for entry in os.listdir(base_directory):
    path = os.path.join(base_directory, entry)
    if isdir(path):
      # Descend into the sub-directory and splice its matches in at this position.
      found.extend(enumerate_query_files(path))
    elif isfile(path) and entry.endswith('.test'):
      found.append(path)
  return found
|
|
|
|
def parse_workload_scale_factor(workload_scale_factor):
  """
  Splits a 'workload[:scale factor]' string into a (workload, scale_factor) tuple.

  The scale factor is the empty string when it is omitted. If the input contains more
  than one colon, an error is logged and the process exits with status 1.
  """
  parts = workload_scale_factor.split(':')
  if len(parts) > 2:
    LOG.error("Error parsing workload. Proper format is workload[:scale factor]")
    sys.exit(1)

  # str.split always yields at least one element, so parts[0] is safe here. Always
  # return a tuple so both valid input forms produce the same type.
  scale_factor = parts[1] if len(parts) == 2 else ''
  return parts[0], scale_factor
|
|
|
|
def write_results(result_map, partial_results = False):
  """
  Saves the Impala and/or Hive results in result_map to their configured CSV files.

  Impala results are saved unless --skip_impala is set. Hive results are saved when
  --skip_impala or --compare_with_hive is set. When partial_results is True,
  '.partial' is appended to each output file name.
  """
  suffix = '.partial' if partial_results else ''

  if not options.skip_impala:
    impala_csv_file = options.results_csv_file + suffix
    # Log the path that is actually written, including any '.partial' suffix.
    LOG.info("Saving results to: %s" % impala_csv_file)
    save_results(result_map, impala_csv_file, is_impala_result=True)
  if options.skip_impala or options.compare_with_hive:
    hive_csv_file = options.hive_results_csv_file + suffix
    LOG.info("Saving hive results to: %s" % hive_csv_file)
    save_results(result_map, hive_csv_file, is_impala_result=False)
|
|
|
|
def run_workloads(workload_runner, failure_injector=None):
  """
  Run every workload listed in the --workloads option through workload_runner.

  If a failure_injector is given it is started before the first workload and cancelled
  after the last one. Workload-scoped plugins, if any are configured, run before and
  after each workload.
  """
  inject_failures = failure_injector is not None
  # Only stop at the first failing query when no failures are being injected and the
  # user did not ask to continue on errors.
  stop_on_error = not (inject_failures or options.continue_on_query_error)

  if inject_failures:
    failure_injector.start()

  for workload_spec in options.workloads.split(','):
    workload, scale_factor = parse_workload_scale_factor(workload_spec)
    if plugin_runner:
      plugin_runner.run_plugins_pre(scope="Workload")
    workload_runner.run_workload(
        workload,
        scale_factor,
        table_formats=options.table_formats,
        query_names=options.query_names,
        exploration_strategy=options.exploration_strategy,
        stop_on_query_error=stop_on_error)
    if plugin_runner:
      plugin_runner.run_plugins_post(scope="Workload")

  if inject_failures:
    failure_injector.cancel()
|
|
|
|
def process_results(workload_runner, is_partial_result=False):
  """
  Write the runner's results to the CSV file(s), then print its summary to stdout.
  """
  results = workload_runner.get_results()
  write_results(results, is_partial_result)
  # Single-argument print(...) produces the same output as the print statement.
  print('\nSUMMARY OF RESULTS')
  print('------------------')
  print(workload_runner.get_summary_str())
|
|
|
|
if __name__ == "__main__":
  # Driver for the run-benchmark script.
  #
  # It runs all the workloads specified on the command line and writes them to a csv
  # file.
  if options.use_kerberos:
    # Fail fast if the sasl module cannot be imported.
    try:
      import sasl
    except ImportError:
      print('The sasl module is needed to query a kerberized impalad')
      sys.exit(1)

  VALID_CLIENT_TYPES = ['beeswax', 'jdbc']
  if options.client_type not in VALID_CLIENT_TYPES:
    LOG.error("Invalid client type %s" % options.client_type)
    sys.exit(1)

  # run_workloads() reads plugin_runner as a module-level name, so it is always bound.
  plugin_runner = None
  if options.plugin_names:
    plugin_runner = PluginRunner(options.plugin_names.split(','))

  # The runner is configured from every parsed option plus the plugin runner.
  workload_config = vars(options)
  workload_config['plugin_runner'] = plugin_runner
  workload_runner = WorkloadRunner(**workload_config)

  failure_injector = None
  if options.failure_frequency > 0:
    # If not doing failure testing there is no reason to import these modules which
    # have additional dependencies.
    from common.failure_injector import FailureInjector
    from common.impala_cluster import ImpalaCluster

    cluster = ImpalaCluster(options.cm_server_host, options.cm_cluster_name,
                            username=options.cm_username,
                            password=options.cm_password)
    failure_injector = FailureInjector(cluster,
                                       failure_frequency=options.failure_frequency,
                                       impalad_exclude_list=options.impalad.split(','))

  try:
    run_workloads(workload_runner, failure_injector)
  except Exception:
    # Stop injecting failures and save whatever results exist before re-raising.
    if failure_injector is not None:
      failure_injector.cancel()
    process_results(workload_runner, is_partial_result=True)
    raise
  else:
    process_results(workload_runner, is_partial_result=False)
|