#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This module is used to run benchmark queries. It runs the set of queries specified
# in the given workload(s) under the workload's queries/ directory. This script will
# first try to warm the buffer cache before running the query. There is also a
# parameter to control how many iterations to run each query.
import csv
import logging
import math
import os
import re
import sys
import subprocess
import threading
from collections import defaultdict
from optparse import OptionParser
from functools import partial
from os.path import isfile, isdir
from tests.common.query_executor import *
from tests.common.test_dimensions import *
from tests.common.test_result_verifier import *
from tests.util.calculation_util import calculate_median
from tests.util.test_file_parser import *
from time import sleep
from random import choice

# globals
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
IMPALA_HOME = os.environ['IMPALA_HOME']
PROFILE_OUTPUT_FILE = os.path.join(IMPALA_HOME, 'be/build/release/service/profile.tmp')
PRIME_CACHE_CMD = os.path.join(IMPALA_HOME, "testdata/bin/cache_tables.py") + " -q \"%s\""
dev_null = open(os.devnull)

logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
LOG = logging.getLogger('workload_runner')


class QueryExecutionDetail(object):
  def __init__(self, executor, workload, scale_factor, file_format, compression_codec,
               compression_type, execution_result):
    self.executor = executor
    self.workload = workload
    self.scale_factor = scale_factor
    self.file_format = file_format
    self.compression_codec = compression_codec
    self.compression_type = compression_type
    self.execution_result = execution_result


# Runs query files and captures results from the specified workload(s).
# The usage is:
# 1) Initialize WorkloadRunner with the desired execution parameters.
# 2) Call workload_runner.run_workload() passing in a workload name and scale factor.
# (See the illustrative usage sketch at the end of this module.)
#
# Internally, for each workload, this module looks up and parses that workload's
# query files and reads the workload's test vector to determine what combination(s)
# of file format / compression to run with. The queries are then executed
# and the results are displayed as well as saved to a CSV file.
class WorkloadRunner(object):
  def __init__(self, **kwargs):
    self.verbose = kwargs.get('verbose', False)
    if self.verbose:
      LOG.setLevel(level=logging.DEBUG)

    self.client_type = kwargs.get('client_type', 'beeswax')
    self.skip_impala = kwargs.get('skip_impala', False)
    self.compare_with_hive = kwargs.get('compare_with_hive', False)
    self.hive_cmd = kwargs.get('hive_cmd', 'hive -e ')
    self.TARGET_IMPALADS = kwargs.get('impalad', 'localhost:21000').split(",")
    self.iterations = kwargs.get('iterations', 2)
    self.num_clients = kwargs.get('num_clients', 1)
    self.exec_options = kwargs.get('exec_options', str())
    self.prime_cache = kwargs.get('prime_cache', False)
    self.remote = not self.TARGET_IMPALADS[0].startswith('localhost')
    self.profiler = kwargs.get('profiler', False)
    self.use_kerberos = kwargs.get('use_kerberos', False)
    self.run_using_hive = kwargs.get('compare_with_hive', False) or self.skip_impala
    self.verify_results = kwargs.get('verify_results', False)
    # TODO: Need to find a way to get this working without runquery
    #self.gprof_cmd = 'google-pprof --text ' + self.runquery_path + ' %s | head -n 60'
    self.__summary = str()
    self.__result_map = defaultdict(list)

  # Parse for the tables used in this query
  @staticmethod
  def __parse_tables(query):
    """
    Parse the tables used in this query.
""" table_predecessor = ['from', 'join'] tokens = query.split(' ') tables = [] next_is_table = 0 for t in tokens: t = t.lower() if next_is_table == 1: tables.append(t) next_is_table = 0 if t in table_predecessor: next_is_table = 1 return tables def prime_remote_or_local_cache(self, query, remote, hive=False): """ Prime either the local cache or buffer cache for a remote machine. """ if remote: # TODO: Need to find what (if anything) we should do in the remote case return else: self.prime_buffer_cache_local(query) def prime_buffer_cache_local(self, query): """ Prime the buffer cache on mini-dfs. We can prime the buffer cache by accessing the local file system. """ # TODO: Consider making cache_tables a module rather than directly calling the script command = PRIME_CACHE_CMD % query os.system(command) def create_executor(self, db_name, executor_name): # Add additional query exec options here query_options = { 'hive': lambda: (execute_using_hive, HiveQueryExecOptions(self.iterations, hive_cmd=self.hive_cmd, db_name=db_name, )), 'impala_beeswax': lambda: (execute_using_impala_beeswax, ImpalaBeeswaxExecOptions(self.iterations, exec_options=self.exec_options, use_kerberos=self.use_kerberos, db_name=db_name, impalad=choice(self.TARGET_IMPALADS))), 'jdbc': lambda: (execute_using_jdbc, JdbcQueryExecOptions(self.iterations, impalad=choice(self.TARGET_IMPALADS), db_name=db_name)), } [executor_name]() return query_options def run_query(self, executor_name, db_name, query, prime_cache, exit_on_error): """ Run a query command and return the result. Takes in a match functional that is used to parse stderr/out to extract the results. """ if prime_cache: self.prime_remote_or_local_cache(query, self.remote, executor_name == 'hive') threads = [] results = [] output = None execution_result = None for client in xrange(self.num_clients): name = "Client Thread " + str(client) exec_tuple = self.create_executor(db_name, executor_name) threads.append(QueryExecutor(name, exec_tuple[0], exec_tuple[1], query)) for thread in threads: LOG.debug(thread.name + " starting") thread.start() for thread in threads: thread.join() if not thread.success() and exit_on_error: LOG.error("Thread: %s returned with error. Exiting." % thread.name) raise RuntimeError, "Error executing query. Aborting" results.append((thread.get_results())) LOG.debug(thread.name + " completed") return self.__get_median_execution_result(results) def __get_median_execution_result(self, results): """ Returns an ExecutionResult object whose avg/stddev is the median of all results. This is used when running with multiple clients to select a good representative value for the overall execution time. """ # Choose a result to update with the mean avg/stddev values. It doesn't matter which # one, so just pick the first one. final_result = results[0] if len(results) == 1: return final_result final_result.avg_time = calculate_median([result.avg_time for result in results]) if self.iterations > 1: final_result.std_dev = calculate_median([result.std_dev for result in results]) return final_result @staticmethod def __enumerate_query_files(base_directory): """ Recursively scan the given directory for all test query files. 
""" query_files = list() for item in os.listdir(base_directory): full_path = os.path.join(base_directory, item) if isfile(full_path) and item.endswith('.test'): query_files.append(full_path) elif isdir(full_path): query_files += WorkloadRunner.__enumerate_query_files(full_path) return query_files @staticmethod def __extract_queries_from_test_files(workload): """ Enumerate all the query files for a workload and extract the query strings. TODO: Update this to use the new test file parser """ workload_base_dir = os.path.join(WORKLOAD_DIR, workload) if not isdir(workload_base_dir): raise ValueError,\ "Workload '%s' not found at path '%s'" % (workload, workload_base_dir) query_dir = os.path.join(workload_base_dir, 'queries') if not isdir(query_dir): raise ValueError, "Workload query directory not found at path '%s'" % (query_dir) query_map = defaultdict(list) for query_file_name in WorkloadRunner.__enumerate_query_files(query_dir): LOG.debug('Parsing Query Test File: ' + query_file_name) sections = parse_query_test_file(query_file_name) test_name = re.sub('/', '.', query_file_name.split('.')[0])[1:] for section in sections: query_map[test_name].append((section['QUERY_NAME'], (section['QUERY'], section['RESULTS']))) return query_map def execute_queries(self, query_map, workload, scale_factor, query_names, stop_on_query_error, test_vector): """ Execute the queries for combinations of file format, compression, etc. The values needed to build the query are stored in the first 4 columns of each row. """ # TODO : Find a clean way to get rid of globals. file_format, data_group, codec, compression_type = [test_vector.file_format, test_vector.dataset, test_vector.compression_codec, test_vector.compression_type] executor_name = self.client_type # We want to indicate this is IMPALA beeswax (currently dont' support hive beeswax) executor_name = 'impala_beeswax' if executor_name == 'beeswax' else executor_name query_name_filter = None if query_names: query_name_filter = [name.lower() for name in query_names.split(',')] LOG.info("Running Test Vector - File Format: %s Compression: %s / %s" %\ (file_format, codec, compression_type)) for test_name in query_map.keys(): for query_name, query_and_expected_result in query_map[test_name]: query, results = query_and_expected_result if not query_name: query_name = query if query_name_filter and (query_name.lower() not in query_name_filter): LOG.info("Skipping query '%s'" % query_name) continue db_name = QueryTestSectionReader.get_db_name(test_vector, scale_factor) query_string = QueryTestSectionReader.build_query(query.strip(), test_vector, '') table_format_str = '%s/%s/%s' % (file_format, codec, compression_type) self.__summary += "\nQuery (%s): %s\n" % (table_format_str, query_name) execution_result = QueryExecutionResult() if not self.skip_impala: self.__summary += " Impala Results: " LOG.debug('Running: \n%s\n' % query_string) if query_name != query: LOG.info('Query Name: \n%s\n' % query_name) execution_result = self.run_query(executor_name, db_name, query_string, self.prime_cache, stop_on_query_error) # Don't verify insert results and allow user to continue on error if there is # a verification failure if execution_result is not None and\ self.verify_results and 'insert' not in query.lower(): try: verify_results(results.split('\n'), execution_result.data, contains_order_by(query)) except AssertionError, e: if stop_on_query_error: raise LOG.error(e) self.__summary += "%s\n" % execution_result hive_execution_result = QueryExecutionResult() if self.compare_with_hive 
          self.__summary += " Hive Results: "
          hive_execution_result = self.run_query('hive', db_name, query_string,
                                                 self.prime_cache, False)
          self.__summary += "%s\n" % hive_execution_result

        LOG.debug("---------------------------------------------------------------------")

        execution_detail = QueryExecutionDetail(executor_name, workload, scale_factor,
            file_format, codec, compression_type, execution_result)
        hive_execution_detail = QueryExecutionDetail('hive', workload, scale_factor,
            file_format, codec, compression_type, hive_execution_result)
        self.__result_map[(query_name, query)].append((execution_detail,
                                                       hive_execution_detail))

  def get_summary_str(self):
    return self.__summary

  def get_results(self):
    return self.__result_map

  def run_workload(self, workload, scale_factor=str(), table_formats=None,
                   query_names=None, exploration_strategy='core',
                   stop_on_query_error=True):
    """
    Run queries associated with each workload specified on the command line.

    For each workload specified, look up the associated query files. Extract valid
    queries from each file and execute them using the specified number of execution
    iterations. Finally, write results to an output CSV file for reporting.
    """
    LOG.info('Running workload: %s / Scale factor: %s' % (workload, scale_factor))
    query_map = WorkloadRunner.__extract_queries_from_test_files(workload)

    test_vectors = None
    if table_formats:
      table_formats = table_formats.split(',')
      dataset = get_dataset_from_workload(workload)
      test_vectors =\
          [TableFormatInfo.create_from_string(dataset, tf) for tf in table_formats]
    else:
      test_vectors = [vector.value for vector in\
          load_table_info_dimension(workload, exploration_strategy)]

    args = [query_map, workload, scale_factor, query_names, stop_on_query_error]
    execute_queries_partial = partial(self.execute_queries, *args)
    map(execute_queries_partial, test_vectors)
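
# Illustrative usage sketch (not part of the original module): initialize the runner,
# then call run_workload() with a workload name and scale factor, as described in the
# class comment above. The workload name 'tpch' and the option values below are
# assumptions for demonstration only; any workload directory under
# $IMPALA_WORKLOAD_DIR containing a queries/ subdirectory would work.
if __name__ == '__main__':
  runner = WorkloadRunner(impalad='localhost:21000', iterations=2, num_clients=1,
                          verbose=True)
  # Run a hypothetical 'tpch' workload using the 'core' exploration strategy.
  runner.run_workload('tpch', scale_factor=str(), exploration_strategy='core')
  LOG.info(runner.get_summary_str())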