#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This module is used to run benchmark queries. It runs the set of queries specified
# in the given workload(s) under <workload name>/queries. This script will first try
# to warm the buffer cache before running the query. There is also a parameter to
# control how many iterations to run each query.
import csv
import logging
import math
import os
import re
import sys
import subprocess
import threading
from collections import defaultdict
from optparse import OptionParser
from functools import partial
from os.path import isfile, isdir
from tests.common.query_executor import *
from tests.common.test_dimensions import *
from tests.common.test_result_verifier import *
from tests.util.calculation_util import calculate_median
from tests.util.test_file_parser import *
from time import sleep
from random import choice
# globals
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
IMPALA_HOME = os.environ['IMPALA_HOME']
PROFILE_OUTPUT_FILE = os.path.join(IMPALA_HOME, 'be/build/release/service/profile.tmp')
PRIME_CACHE_CMD = os.path.join(IMPALA_HOME, "testdata/bin/cache_tables.py") + " -q \"%s\""
dev_null = open(os.devnull)
logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
LOG = logging.getLogger('workload_runner')

class QueryExecutionDetail(object):
  def __init__(self, executor, workload, scale_factor, file_format, compression_codec,
               compression_type, execution_result):
    self.executor = executor
    self.workload = workload
    self.scale_factor = scale_factor
    self.file_format = file_format
    self.compression_codec = compression_codec
    self.compression_type = compression_type
    self.execution_result = execution_result

# Runs query files and captures results from the specified workload(s).
# The usage is:
# 1) Initialize WorkloadRunner with the desired execution parameters.
# 2) Call workload_runner.run_workload() passing in the workload name(s) and scale
#    factor(s).
# Internally, for each workload, this module looks up and parses that workload's
# query files and reads the workload's test vector to determine what combination(s)
# of file format / compression to run with. The queries are then executed
# and the results are displayed as well as saved to a CSV file.
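#
# Illustrative usage sketch (added for clarity; the workload name, table format string
# and option values below are assumptions, not taken from this module):
#
#   runner = WorkloadRunner(impalad='localhost:21000', iterations=3, num_clients=2)
#   runner.run_workload('tpch', scale_factor='', table_formats='text/none')
#   print runner.get_summary_str()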
class WorkloadRunner(object):
  def __init__(self, **kwargs):
    self.verbose = kwargs.get('verbose', False)
    if self.verbose:
      LOG.setLevel(level=logging.DEBUG)
    self.client_type = kwargs.get('client_type', 'beeswax')
    self.skip_impala = kwargs.get('skip_impala', False)
    self.compare_with_hive = kwargs.get('compare_with_hive', False)
    self.hive_cmd = kwargs.get('hive_cmd', 'hive -e ')
    self.TARGET_IMPALADS = kwargs.get('impalad', 'localhost:21000').split(",")
    self.iterations = kwargs.get('iterations', 2)
    self.num_clients = kwargs.get('num_clients', 1)
    self.exec_options = kwargs.get('exec_options', str())
    self.prime_cache = kwargs.get('prime_cache', False)
    self.remote = not self.TARGET_IMPALADS[0].startswith('localhost')
    self.profiler = kwargs.get('profiler', False)
    self.use_kerberos = kwargs.get('use_kerberos', False)
    self.run_using_hive = kwargs.get('compare_with_hive', False) or self.skip_impala
    self.verify_results = kwargs.get('verify_results', False)
    # TODO: Need to find a way to get this working without runquery
    #self.gprof_cmd = 'google-pprof --text ' + self.runquery_path + ' %s | head -n 60'
    self.__summary = str()
    self.__result_map = defaultdict(list)

  # Parse for the tables used in this query
  @staticmethod
  def __parse_tables(query):
    """
    Parse the tables used in this query.
    """
    table_predecessor = ['from', 'join']
    tokens = query.split(' ')
    tables = []
    next_is_table = 0
    for t in tokens:
      t = t.lower()
      if next_is_table == 1:
        tables.append(t)
        next_is_table = 0
      if t in table_predecessor:
        next_is_table = 1
    return tables

  def prime_remote_or_local_cache(self, query, remote, hive=False):
    """
    Prime either the local cache or buffer cache for a remote machine.
    """
    if remote:
      # TODO: Need to find what (if anything) we should do in the remote case
      return
    else:
      self.prime_buffer_cache_local(query)

  def prime_buffer_cache_local(self, query):
    """
    Prime the buffer cache on mini-dfs.
    We can prime the buffer cache by accessing the local file system.
    """
    # TODO: Consider making cache_tables a module rather than directly calling the script
    command = PRIME_CACHE_CMD % query
    os.system(command)

  def create_executor(self, db_name, executor_name):
    # Add additional query exec options here
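    # Note (added): e.g. create_executor(db_name, 'impala_beeswax') returns a tuple of
    # (execute_using_impala_beeswax, ImpalaBeeswaxExecOptions) with the target impalad
    # chosen at random from self.TARGET_IMPALADS.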
    query_options = {
      'hive': lambda: (execute_using_hive,
          HiveQueryExecOptions(self.iterations,
                               hive_cmd=self.hive_cmd,
                               db_name=db_name)),
      'impala_beeswax': lambda: (execute_using_impala_beeswax,
          ImpalaBeeswaxExecOptions(self.iterations,
                                   exec_options=self.exec_options,
                                   use_kerberos=self.use_kerberos,
                                   db_name=db_name,
                                   impalad=choice(self.TARGET_IMPALADS))),
      'jdbc': lambda: (execute_using_jdbc,
          JdbcQueryExecOptions(self.iterations,
                               impalad=choice(self.TARGET_IMPALADS),
                               db_name=db_name)),
    }[executor_name]()
    return query_options

  def run_query(self, executor_name, db_name, query, prime_cache, exit_on_error):
    """
    Run a query command and return the result.
    Takes in a match function that is used to parse stderr/out to extract the results.
    """
    if prime_cache:
      self.prime_remote_or_local_cache(query, self.remote, executor_name == 'hive')
    threads = []
    results = []
    output = None
    execution_result = None
    for client in xrange(self.num_clients):
      name = "Client Thread " + str(client)
      exec_tuple = self.create_executor(db_name, executor_name)
      threads.append(QueryExecutor(name, exec_tuple[0], exec_tuple[1], query))
    for thread in threads:
      LOG.debug(thread.name + " starting")
      thread.start()
    for thread in threads:
      thread.join()
      if not thread.success() and exit_on_error:
        LOG.error("Thread: %s returned with error. Exiting." % thread.name)
        raise RuntimeError, "Error executing query. Aborting"
      results.append(thread.get_results())
      LOG.debug(thread.name + " completed")
    return self.__get_median_execution_result(results)

  def __get_median_execution_result(self, results):
    """
    Returns an ExecutionResult object whose avg/stddev is the median of all results.
    This is used when running with multiple clients to select a good representative
    value for the overall execution time.
    """
    # Choose a result to update with the median avg/stddev values. It doesn't matter
    # which one, so just pick the first one.
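    # Worked example (added): with three clients whose avg_time values are 2.0, 3.5 and
    # 10.0 seconds, the reported avg_time becomes 3.5, so a single slow outlier client
    # does not skew the representative value.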
    final_result = results[0]
    if len(results) == 1:
      return final_result
    final_result.avg_time = calculate_median([result.avg_time for result in results])
    if self.iterations > 1:
      final_result.std_dev = calculate_median([result.std_dev for result in results])
    return final_result

  @staticmethod
  def __enumerate_query_files(base_directory):
    """
    Recursively scan the given directory for all test query files.
    """
    query_files = list()
    for item in os.listdir(base_directory):
      full_path = os.path.join(base_directory, item)
      if isfile(full_path) and item.endswith('.test'):
        query_files.append(full_path)
      elif isdir(full_path):
        query_files += WorkloadRunner.__enumerate_query_files(full_path)
    return query_files

  @staticmethod
  def __extract_queries_from_test_files(workload):
    """
    Enumerate all the query files for a workload and extract the query strings.
    TODO: Update this to use the new test file parser
    """
    workload_base_dir = os.path.join(WORKLOAD_DIR, workload)
    if not isdir(workload_base_dir):
      raise ValueError,\
          "Workload '%s' not found at path '%s'" % (workload, workload_base_dir)
    query_dir = os.path.join(workload_base_dir, 'queries')
    if not isdir(query_dir):
      raise ValueError, "Workload query directory not found at path '%s'" % (query_dir)
    query_map = defaultdict(list)
    for query_file_name in WorkloadRunner.__enumerate_query_files(query_dir):
      LOG.debug('Parsing Query Test File: ' + query_file_name)
      sections = parse_query_test_file(query_file_name)
      test_name = re.sub('/', '.', query_file_name.split('.')[0])[1:]
      for section in sections:
        query_map[test_name].append((section['QUERY_NAME'],
                                     (section['QUERY'], section['RESULTS'])))
    return query_map

  def execute_queries(self, query_map, workload, scale_factor, query_names,
                      stop_on_query_error, test_vector):
    """
    Execute the queries for combinations of file format, compression, etc.
    The test vector supplies the file format, dataset, compression codec and
    compression type used to build each query.
    """
    # TODO: Find a clean way to get rid of globals.
    file_format, data_group, codec, compression_type = [test_vector.file_format,
        test_vector.dataset, test_vector.compression_codec, test_vector.compression_type]
    executor_name = self.client_type
    # We want to indicate this is IMPALA beeswax (we currently don't support hive beeswax)
    executor_name = 'impala_beeswax' if executor_name == 'beeswax' else executor_name
    query_name_filter = None
    if query_names:
      query_name_filter = [name.lower() for name in query_names.split(',')]
    LOG.info("Running Test Vector - File Format: %s Compression: %s / %s" %
             (file_format, codec, compression_type))
    for test_name in query_map.keys():
      for query_name, query_and_expected_result in query_map[test_name]:
        query, results = query_and_expected_result
        if not query_name:
          query_name = query
        if query_name_filter and (query_name.lower() not in query_name_filter):
          LOG.info("Skipping query '%s'" % query_name)
          continue
        db_name = QueryTestSectionReader.get_db_name(test_vector, scale_factor)
        query_string = QueryTestSectionReader.build_query(query.strip(), test_vector, '')
        table_format_str = '%s/%s/%s' % (file_format, codec, compression_type)
        self.__summary += "\nQuery (%s): %s\n" % (table_format_str, query_name)
        execution_result = QueryExecutionResult()
        if not self.skip_impala:
          self.__summary += " Impala Results: "
          LOG.debug('Running: \n%s\n' % query_string)
          if query_name != query:
            LOG.info('Query Name: \n%s\n' % query_name)
          execution_result = self.run_query(executor_name, db_name, query_string,
                                            self.prime_cache, stop_on_query_error)
          # Don't verify insert results and allow user to continue on error if there is
          # a verification failure
          if execution_result is not None and\
              self.verify_results and 'insert' not in query.lower():
            try:
              verify_results(results.split('\n'), execution_result.data,
                             contains_order_by(query))
            except AssertionError, e:
              if stop_on_query_error:
                raise
              LOG.error(e)
          self.__summary += "%s\n" % execution_result
        hive_execution_result = QueryExecutionResult()
        if self.compare_with_hive or self.skip_impala:
          self.__summary += " Hive Results: "
          hive_execution_result = self.run_query('hive', db_name, query_string,
                                                 self.prime_cache, False)
          self.__summary += "%s\n" % hive_execution_result
        LOG.debug("---------------------------------------------------------------------")
        execution_detail = QueryExecutionDetail(executor_name, workload, scale_factor,
                                                file_format, codec, compression_type,
                                                execution_result)
        hive_execution_detail = QueryExecutionDetail('hive', workload, scale_factor,
                                                     file_format, codec, compression_type,
                                                     hive_execution_result)
        self.__result_map[(query_name, query)].append((execution_detail,
                                                       hive_execution_detail))

  def get_summary_str(self):
    return self.__summary

  def get_results(self):
    return self.__result_map

  def run_workload(self, workload, scale_factor=str(), table_formats=None,
                   query_names=None, exploration_strategy='core',
                   stop_on_query_error=True):
    """
    Run queries associated with each workload specified on the commandline.
    For each specified workload, look up the associated query files. Extract the valid
    queries in each file and execute them using the specified number of execution
    iterations. Finally, write results to an output CSV file for reporting.
    """
    LOG.info('Running workload: %s / Scale factor: %s' % (workload, scale_factor))
    query_map = WorkloadRunner.__extract_queries_from_test_files(workload)
    test_vectors = None
    if table_formats:
      table_formats = table_formats.split(',')
      dataset = get_dataset_from_workload(workload)
      test_vectors =\
          [TableFormatInfo.create_from_string(dataset, tf) for tf in table_formats]
    else:
      test_vectors = [vector.value for vector in
          load_table_info_dimension(workload, exploration_strategy)]
    args = [query_map, workload, scale_factor, query_names, stop_on_query_error]
    execute_queries_partial = partial(self.execute_queries, *args)
    map(execute_queries_partial, test_vectors)