mirror of
https://github.com/apache/impala.git
synced 2026-01-01 09:00:42 -05:00
341 lines
14 KiB
Python
Executable File
341 lines
14 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# This module is used to run benchmark queries. It runs the set queries specified in the
|
|
# given workload(s) under <workload name>/queries. This script will first try to warm the
|
|
# buffer cache before running the query. There is also a parameter to control how
|
|
# many iterations to run each query.
|
|
import csv
|
|
import logging
|
|
import math
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
import threading
|
|
from collections import defaultdict
|
|
from optparse import OptionParser
|
|
from functools import partial
|
|
from os.path import isfile, isdir
|
|
from tests.common.query_executor import *
|
|
from tests.common.test_dimensions import *
|
|
from tests.common.test_result_verifier import *
|
|
from tests.util.calculation_util import calculate_median
|
|
from tests.util.test_file_parser import *
|
|
from time import sleep
|
|
from random import choice
|
|
|
|
# globals
# Both environment variables are required; a missing one fails fast with a
# KeyError at import time.
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
IMPALA_HOME = os.environ['IMPALA_HOME']
# Destination file for query profile output (release build of the service).
PROFILE_OUTPUT_FILE = os.path.join(IMPALA_HOME, 'be/build/release/service/profile.tmp')
# Command template used to warm the buffer cache before running a query;
# %s is substituted with the query text (see prime_buffer_cache_local).
PRIME_CACHE_CMD = os.path.join(IMPALA_HOME, "testdata/bin/cache_tables.py") + " -q \"%s\""

# NOTE(review): opened (read mode) and never closed; held for the life of the
# process. Not referenced elsewhere in this view -- confirm it is still needed.
dev_null = open(os.devnull)

logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
LOG = logging.getLogger('workload_runner')
|
|
|
|
class QueryExecutionDetail(object):
  """Value object tying one query execution result to its run context.

  Bundles the executor name, workload, scale factor and table format
  (file format / compression codec / compression type) together with the
  execution result so results can later be reported per configuration.
  """
  def __init__(self, executor, workload, scale_factor, file_format, compression_codec,
               compression_type, execution_result):
    self.execution_result = execution_result
    self.compression_type = compression_type
    self.compression_codec = compression_codec
    self.file_format = file_format
    self.scale_factor = scale_factor
    self.workload = workload
    self.executor = executor
|
|
|
|
|
|
# Runs query files and captures results from the specified workload(s)
|
|
# The usage is:
|
|
# 1) Initialize WorkloadRunner with desired execution parameters.
|
|
# 2) Call workload_runner.run_workload() passing in a workload name(s) and scale
|
|
# factor(s).
|
|
# Internally, for each workload, this module looks up and parses that workload's
|
|
# query files and reads the workload's test vector to determine what combination(s)
|
|
# of file format / compression to run with. The queries are then executed
|
|
# and the results are displayed as well as saved to a CSV file.
|
|
class WorkloadRunner(object):
  """Runs query files and captures results from the specified workload(s).

  Usage:
    1) Initialize WorkloadRunner with the desired execution parameters.
    2) Call run_workload() with a workload name and scale factor.

  For each workload, the runner looks up and parses that workload's query
  files, reads the workload's test vector to determine which file format /
  compression combinations to run, executes the queries, and records the
  results both as a printable summary and in a result map.
  """

  def __init__(self, **kwargs):
    # Enable debug logging as early as possible.
    self.verbose = kwargs.get('verbose', False)
    if self.verbose:
      LOG.setLevel(level=logging.DEBUG)

    # Client / executor configuration.
    self.client_type = kwargs.get('client_type', 'beeswax')
    self.skip_impala = kwargs.get('skip_impala', False)
    self.compare_with_hive = kwargs.get('compare_with_hive', False)
    self.hive_cmd = kwargs.get('hive_cmd', 'hive -e ')
    self.TARGET_IMPALADS = kwargs.get('impalad', 'localhost:21000').split(",")

    # Execution knobs.
    self.iterations = kwargs.get('iterations', 2)
    self.num_clients = kwargs.get('num_clients', 1)
    self.exec_options = kwargs.get('exec_options', str())
    self.prime_cache = kwargs.get('prime_cache', False)
    # Anything not targeting localhost is treated as a remote cluster.
    self.remote = not self.TARGET_IMPALADS[0].startswith('localhost')
    self.profiler = kwargs.get('profiler', False)
    self.use_kerberos = kwargs.get('use_kerberos', False)
    # Hive runs either for comparison against Impala or instead of it.
    self.run_using_hive = self.compare_with_hive or self.skip_impala
    self.verify_results = kwargs.get('verify_results', False)
    # TODO: Need to find a way to get this working without runquery
    #self.gprof_cmd = 'google-pprof --text ' + self.runquery_path + ' %s | head -n 60'
    self.__summary = str()
    self.__result_map = defaultdict(list)
|
|
|
|
# Parse for the tables used in this query
|
|
@staticmethod
|
|
def __parse_tables(query):
|
|
"""
|
|
Parse the tables used in this query.
|
|
"""
|
|
table_predecessor = ['from', 'join']
|
|
tokens = query.split(' ')
|
|
tables = []
|
|
next_is_table = 0
|
|
for t in tokens:
|
|
t = t.lower()
|
|
if next_is_table == 1:
|
|
tables.append(t)
|
|
next_is_table = 0
|
|
if t in table_predecessor:
|
|
next_is_table = 1
|
|
return tables
|
|
|
|
def prime_remote_or_local_cache(self, query, remote, hive=False):
|
|
"""
|
|
Prime either the local cache or buffer cache for a remote machine.
|
|
"""
|
|
if remote:
|
|
# TODO: Need to find what (if anything) we should do in the remote case
|
|
return
|
|
else:
|
|
self.prime_buffer_cache_local(query)
|
|
|
|
def prime_buffer_cache_local(self, query):
|
|
"""
|
|
Prime the buffer cache on mini-dfs.
|
|
|
|
We can prime the buffer cache by accessing the local file system.
|
|
"""
|
|
# TODO: Consider making cache_tables a module rather than directly calling the script
|
|
command = PRIME_CACHE_CMD % query
|
|
os.system(command)
|
|
|
|
def create_executor(self, db_name, executor_name):
|
|
# Add additional query exec options here
|
|
query_options = {
|
|
'hive': lambda: (execute_using_hive,
|
|
HiveQueryExecOptions(self.iterations,
|
|
hive_cmd=self.hive_cmd,
|
|
db_name=db_name,
|
|
)),
|
|
'impala_beeswax': lambda: (execute_using_impala_beeswax,
|
|
ImpalaBeeswaxExecOptions(self.iterations,
|
|
exec_options=self.exec_options,
|
|
use_kerberos=self.use_kerberos,
|
|
db_name=db_name,
|
|
impalad=choice(self.TARGET_IMPALADS))),
|
|
'jdbc': lambda: (execute_using_jdbc,
|
|
JdbcQueryExecOptions(self.iterations,
|
|
impalad=choice(self.TARGET_IMPALADS),
|
|
db_name=db_name)),
|
|
} [executor_name]()
|
|
return query_options
|
|
|
|
def run_query(self, executor_name, db_name, query, prime_cache, exit_on_error):
|
|
"""
|
|
Run a query command and return the result.
|
|
|
|
Takes in a match functional that is used to parse stderr/out to extract the results.
|
|
"""
|
|
if prime_cache:
|
|
self.prime_remote_or_local_cache(query, self.remote, executor_name == 'hive')
|
|
|
|
threads = []
|
|
results = []
|
|
|
|
output = None
|
|
execution_result = None
|
|
for client in xrange(self.num_clients):
|
|
name = "Client Thread " + str(client)
|
|
exec_tuple = self.create_executor(db_name, executor_name)
|
|
threads.append(QueryExecutor(name, exec_tuple[0], exec_tuple[1], query))
|
|
for thread in threads:
|
|
LOG.debug(thread.name + " starting")
|
|
thread.start()
|
|
|
|
for thread in threads:
|
|
thread.join()
|
|
if not thread.success() and exit_on_error:
|
|
LOG.error("Thread: %s returned with error. Exiting." % thread.name)
|
|
raise RuntimeError, "Error executing query. Aborting"
|
|
|
|
results.append((thread.get_results()))
|
|
LOG.debug(thread.name + " completed")
|
|
return self.__get_median_execution_result(results)
|
|
|
|
def __get_median_execution_result(self, results):
|
|
"""
|
|
Returns an ExecutionResult object whose avg/stddev is the median of all results.
|
|
|
|
This is used when running with multiple clients to select a good representative value
|
|
for the overall execution time.
|
|
"""
|
|
# Choose a result to update with the mean avg/stddev values. It doesn't matter which
|
|
# one, so just pick the first one.
|
|
final_result = results[0]
|
|
if len(results) == 1:
|
|
return final_result
|
|
final_result.avg_time = calculate_median([result.avg_time for result in results])
|
|
if self.iterations > 1:
|
|
final_result.std_dev = calculate_median([result.std_dev for result in results])
|
|
return final_result
|
|
|
|
@staticmethod
|
|
def __enumerate_query_files(base_directory):
|
|
"""
|
|
Recursively scan the given directory for all test query files.
|
|
"""
|
|
query_files = list()
|
|
for item in os.listdir(base_directory):
|
|
full_path = os.path.join(base_directory, item)
|
|
if isfile(full_path) and item.endswith('.test'):
|
|
query_files.append(full_path)
|
|
elif isdir(full_path):
|
|
query_files += WorkloadRunner.__enumerate_query_files(full_path)
|
|
return query_files
|
|
|
|
@staticmethod
|
|
def __extract_queries_from_test_files(workload):
|
|
"""
|
|
Enumerate all the query files for a workload and extract the query strings.
|
|
|
|
TODO: Update this to use the new test file parser
|
|
"""
|
|
workload_base_dir = os.path.join(WORKLOAD_DIR, workload)
|
|
if not isdir(workload_base_dir):
|
|
raise ValueError,\
|
|
"Workload '%s' not found at path '%s'" % (workload, workload_base_dir)
|
|
|
|
query_dir = os.path.join(workload_base_dir, 'queries')
|
|
if not isdir(query_dir):
|
|
raise ValueError, "Workload query directory not found at path '%s'" % (query_dir)
|
|
|
|
query_map = defaultdict(list)
|
|
for query_file_name in WorkloadRunner.__enumerate_query_files(query_dir):
|
|
LOG.debug('Parsing Query Test File: ' + query_file_name)
|
|
sections = parse_query_test_file(query_file_name)
|
|
test_name = re.sub('/', '.', query_file_name.split('.')[0])[1:]
|
|
for section in sections:
|
|
query_map[test_name].append((section['QUERY_NAME'],
|
|
(section['QUERY'], section['RESULTS'])))
|
|
return query_map
|
|
|
|
def execute_queries(self, query_map, workload, scale_factor, query_names,
|
|
stop_on_query_error, test_vector):
|
|
"""
|
|
Execute the queries for combinations of file format, compression, etc.
|
|
|
|
The values needed to build the query are stored in the first 4 columns of each row.
|
|
"""
|
|
# TODO : Find a clean way to get rid of globals.
|
|
file_format, data_group, codec, compression_type = [test_vector.file_format,
|
|
test_vector.dataset, test_vector.compression_codec, test_vector.compression_type]
|
|
|
|
executor_name = self.client_type
|
|
# We want to indicate this is IMPALA beeswax (currently dont' support hive beeswax)
|
|
executor_name = 'impala_beeswax' if executor_name == 'beeswax' else executor_name
|
|
|
|
query_name_filter = None
|
|
if query_names:
|
|
query_name_filter = [name.lower() for name in query_names.split(',')]
|
|
LOG.info("Running Test Vector - File Format: %s Compression: %s / %s" %\
|
|
(file_format, codec, compression_type))
|
|
for test_name in query_map.keys():
|
|
for query_name, query_and_expected_result in query_map[test_name]:
|
|
query, results = query_and_expected_result
|
|
if not query_name:
|
|
query_name = query
|
|
if query_name_filter and (query_name.lower() not in query_name_filter):
|
|
LOG.info("Skipping query '%s'" % query_name)
|
|
continue
|
|
|
|
db_name = QueryTestSectionReader.get_db_name(test_vector, scale_factor)
|
|
query_string = QueryTestSectionReader.build_query(query.strip(), test_vector, '')
|
|
table_format_str = '%s/%s/%s' % (file_format, codec, compression_type)
|
|
self.__summary += "\nQuery (%s): %s\n" % (table_format_str, query_name)
|
|
execution_result = QueryExecutionResult()
|
|
if not self.skip_impala:
|
|
self.__summary += " Impala Results: "
|
|
LOG.debug('Running: \n%s\n' % query_string)
|
|
if query_name != query:
|
|
LOG.info('Query Name: \n%s\n' % query_name)
|
|
|
|
execution_result = self.run_query(executor_name, db_name, query_string,
|
|
self.prime_cache, stop_on_query_error)
|
|
|
|
# Don't verify insert results and allow user to continue on error if there is
|
|
# a verification failure
|
|
if execution_result is not None and\
|
|
self.verify_results and 'insert' not in query.lower():
|
|
try:
|
|
verify_results(results.split('\n'), execution_result.data,
|
|
contains_order_by(query))
|
|
except AssertionError, e:
|
|
if stop_on_query_error:
|
|
raise
|
|
LOG.error(e)
|
|
|
|
self.__summary += "%s\n" % execution_result
|
|
|
|
hive_execution_result = QueryExecutionResult()
|
|
if self.compare_with_hive or self.skip_impala:
|
|
self.__summary += " Hive Results: "
|
|
hive_execution_result = self.run_query('hive', db_name,
|
|
query_string,
|
|
self.prime_cache,
|
|
False)
|
|
self.__summary += "%s\n" % hive_execution_result
|
|
LOG.debug("---------------------------------------------------------------------")
|
|
|
|
execution_detail = QueryExecutionDetail(executor_name, workload, scale_factor,
|
|
file_format, codec, compression_type, execution_result)
|
|
|
|
hive_execution_detail = QueryExecutionDetail('hive', workload, scale_factor,
|
|
file_format, codec, compression_type, hive_execution_result)
|
|
|
|
self.__result_map[(query_name, query)].append((execution_detail,
|
|
hive_execution_detail))
|
|
|
|
def get_summary_str(self):
|
|
return self.__summary
|
|
|
|
def get_results(self):
|
|
return self.__result_map
|
|
|
|
def run_workload(self, workload, scale_factor=str(), table_formats=None,
|
|
query_names=None, exploration_strategy='core',
|
|
stop_on_query_error=True):
|
|
"""
|
|
Run queries associated with each workload specified on the commandline.
|
|
|
|
For each workload specified in, look up the associated query files. Extract valid
|
|
queries in each file and execute them using the specified number of execution
|
|
iterations. Finally, write results to an output CSV file for reporting.
|
|
"""
|
|
LOG.info('Running workload: %s / Scale factor: %s' % (workload, scale_factor))
|
|
query_map = WorkloadRunner.__extract_queries_from_test_files(workload)
|
|
|
|
test_vectors = None
|
|
if table_formats:
|
|
table_formats = table_formats.split(',')
|
|
dataset = get_dataset_from_workload(workload)
|
|
test_vectors =\
|
|
[TableFormatInfo.create_from_string(dataset, tf) for tf in table_formats]
|
|
else:
|
|
test_vectors = [vector.value for vector in\
|
|
load_table_info_dimension(workload, exploration_strategy)]
|
|
|
|
args = [query_map, workload, scale_factor, query_names, stop_on_query_error]
|
|
execute_queries_partial = partial(self.execute_queries, *args)
|
|
map(execute_queries_partial, test_vectors)
|