Files
impala/tests/common/query_executor.py
Lenni Kuff ef48f65e76 Add test framework for running Impala query tests via Python
This is the first set of changes required to start getting our functional test
infrastructure moved from JUnit to Python. After investigating a number of
options, I decided to go with a Python test framework named py.test
(http://pytest.org/). It is very flexible, open source (MIT licensed), and will
enable us to do some cool things like parallel test execution.

As part of this change, we now use our "test vectors" for query test execution.
This is very nice because it means that if you load the "core" dataset you know
you will be able to run the "core" query tests (specified by
--exploration_strategy when running the tests).

You will see that each combination of table format + query exec options is now
treated as an individual test case. This will make it much easier to debug
exactly where something failed.

These new tests can be run using the script at tests/run-tests.sh
2014-01-08 10:46:50 -08:00

279 lines
9.6 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Module used for executing queries and gathering results, and for executing multiple
# queries concurrently. The QueryExecutor is meant to be very generic and doesn't have
# the knowledge of how to actually execute a query. It just takes an executor function
# and a query option object and returns the QueryExecutionResult.
# For example (in pseudo-code):
#
# def execute_using_impala_beeswax(query, query_options):
#   ...
#
# exec_options = ImpalaBeeswaxExecOptions()
# qe = QueryExecutor(execute_using_impala_beeswax, exec_options)
# qe.run()
# execution_result = qe.get_results()
#
import logging
import math
import re
import shlex
import threading
from collections import defaultdict
from random import randint
from subprocess import Popen, PIPE

from tests.beeswax.impala_beeswax import *

# Setup logging for this module.
logging.basicConfig(level=logging.INFO, format='%(threadName)s: %(message)s')
LOG = logging.getLogger('query_executor')
LOG.setLevel(level=logging.DEBUG)

# Globals.
hive_result_regex = r'Time taken: (\d*)\.(\d*) seconds'

# Contains details about the execution result of a query.
class QueryExecutionResult(object):
  def __init__(self, avg_time=None, std_dev=None, data=None, note=None):
    self.avg_time = avg_time
    self.std_dev = std_dev
    self.__note = note
    self.success = False
    self.data = data

  def set_result_note(self, note):
    self.__note = note

  def __str__(self):
    """Print human readable query execution details"""
    message = str()
    if self.__note: message = "%s, " % self.__note
    message += 'Avg Time: %s, Std Dev: %s' % (self.avg_time, self.std_dev)
    return message

# Base class for query exec options.
class QueryExecOptions(object):
  def __init__(self, iterations, **kwargs):
    self.options = kwargs
    self.iterations = iterations

# Base class for Impala query exec options.
class ImpalaQueryExecOptions(QueryExecOptions):
  def __init__(self, iterations, **kwargs):
    QueryExecOptions.__init__(self, iterations, **kwargs)
    self.impalad = self.options.get('impalad', 'localhost:21000')

# Constructs exec_options for query execution through beeswax.
# TODO: Make argument handling better.
class ImpalaBeeswaxExecOptions(ImpalaQueryExecOptions):
  def __init__(self, iterations, **kwargs):
    ImpalaQueryExecOptions.__init__(self, iterations, **kwargs)
    self.use_kerberos = kwargs.get('use_kerberos', False)
    self._build_exec_options(kwargs.get('exec_options', None))

  def _build_exec_options(self, exec_options):
    """Read the exec_options into a dictionary"""
    self.exec_options = dict()
    if exec_options:
      # exec_options are separated by ';' on the command line.
      options = exec_options.split(';')
      for option in options:
        key, value = option.split(':')
        # The keys in ImpalaService QueryOptions are upper case.
        self.exec_options[key.upper()] = value

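# Illustrative sketch (not part of the original module): given the ';'-separated,
# ':'-delimited string format parsed above, an invocation such as
#
#   opts = ImpalaBeeswaxExecOptions(iterations=1, exec_options='num_nodes:1;batch_size:0')
#
# would populate opts.exec_options as {'NUM_NODES': '1', 'BATCH_SIZE': '0'}, assuming
# NUM_NODES and BATCH_SIZE are valid ImpalaService query options (validation happens
# later, in establish_beeswax_connection).
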
# Hive query exec options.
class HiveQueryExecOptions(QueryExecOptions):
  def __init__(self, iterations, **kwargs):
    QueryExecOptions.__init__(self, iterations, **kwargs)
    self.hive_cmd = self.options.get('hive_cmd', 'hive -e ')

  def build_argument_string(self):
    """ Builds the actual argument string that is passed to hive """
    return str()

# The QueryExecutor is used to run the given query using the target executor (Hive,
# Impala, Impala Beeswax).
class QueryExecutor(threading.Thread):
  def __init__(self, name, query_exec_func, exec_options, query):
    """
    Initialize the QueryExecutor

    The query_exec_func needs to be a function that accepts a QueryExecOption parameter
    and returns a QueryExecutionResult and output string. The output string is used so
    callers can decide whether or not to display the output or do other manipulation
    on it.
    """
    self.query_exec_func = query_exec_func
    self.query_exec_options = exec_options
    self.query = query
    self.output_result = None
    self.execution_result = None
    threading.Thread.__init__(self)
    self.name = name

  def _execute_query(self):
    self.execution_result = self.query_exec_func(self.query, self.query_exec_options)
    LOG.debug('Result:\n -> %s\n' % self.execution_result)

  def success(self):
    return self.execution_result.success

  def run(self):
    """ Runs the actual query """
    self._execute_query()

  def get_results(self):
    """ Returns the result of the query execution """
    return self.execution_result

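# Illustrative sketch (not part of the original module): because QueryExecutor is a
# threading.Thread, several queries can be run concurrently by calling start() on each
# executor and join()-ing before reading results, e.g.:
#
#   executors = [QueryExecutor('q%d' % i, execute_using_impala_beeswax, exec_options, q)
#                for i, q in enumerate(queries)]
#   for e in executors: e.start()
#   for e in executors: e.join()
#   results = [e.get_results() for e in executors]
#
# Here 'queries' and 'exec_options' are assumed to be supplied by the caller.
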
# Standalone Functions
def establish_beeswax_connection(query, query_options):
  # TODO: Make this generic, for hive etc.
  use_kerberos = query_options.use_kerberos
  client = ImpalaBeeswaxClient(query_options.impalad, use_kerberos=use_kerberos)
  # Try to connect.
  client.connect()
  LOG.debug('Connected to %s' % query_options.impalad)
  # Set the exec options.
  exec_options = query_options.exec_options
  for exec_option in exec_options.keys():
    # TODO: Move the validation to the ImpalaBeeswaxExecOptions.
    if not client.query_options.get(exec_option):
      LOG.error('Illegal exec_option: %s' % exec_option)
      return (False, None)
    # Change the default value to the user specified value.
    client.query_options[exec_option] = exec_options[exec_option]
  return (True, client)

def execute_using_impala_beeswax(query, query_options):
  """Executes a query using beeswax.

  A new client is created per query, then destroyed. Returns QueryExecutionResult()
  """
  # Create a client object to talk to impalad.
  exec_result = QueryExecutionResult()
  (success, client) = establish_beeswax_connection(query, query_options)
  if not success:
    return exec_result
  # Execute the query.
  results = []
  for i in xrange(query_options.iterations):
    LOG.debug("Running iteration %d" % (i + 1))
    result = QueryResult()
    try:
      result = client.execute(query)
    except Exception, e:
      LOG.error(e)
      client.close_connection()
      return exec_result
    results.append(result)
  # We only need to print the results of one successful run, not all of them.
  LOG.debug('Data:\n%s\n' % results[0].get_data())
  client.close_connection()
  # Get rid of the client object.
  del client
  # Construct the execution result.
  return construct_execution_result(query_options.iterations, results)

def construct_execution_result(iterations, results):
  """Calculate average running time and standard deviation.

  The summary of the first result is used as the summary for the entire execution.
  """
  # Use the output from the first result.
  exec_result = QueryExecutionResult()
  exec_result.data = results[0].data
  exec_result.beeswax_result = results[0]
  exec_result.set_result_note(results[0].summary)
  runtimes = [r.time_taken for r in results]
  exec_result.success = True
  exec_result.avg_time = calculate_avg(runtimes)
  if iterations > 1:
    exec_result.std_dev = calculate_stddev(runtimes)
  return exec_result

def execute_shell_cmd(cmd):
  """Executes a command in the shell, pipes the output to local variables"""
  LOG.debug('Executing: %s' % (cmd,))
  # Popen needs a list as its first parameter. The first element is the command,
  # with the rest being arguments.
  p = Popen(shlex.split(cmd), shell=False, stdout=PIPE, stderr=PIPE)
  stdout, stderr = p.communicate()
  rc = p.returncode
  return rc, stdout, stderr

def execute_using_hive(query, query_options):
  """Executes a query via hive"""
  query_string = (query + ';') * query_options.iterations
  cmd = query_options.hive_cmd + " \"%s\"" % query_string
  return run_query_capture_results(cmd, match_hive_query_results,
                                   query_options.iterations, exit_on_error=False)

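# Illustrative sketch (not part of the original module): with the default hive_cmd of
# 'hive -e ' and iterations=2, a query like 'select count(*) from t' is repeated inline,
# producing a command roughly like
#
#   hive -e "select count(*) from t;select count(*) from t;"
#
# so a single hive invocation runs every iteration back to back.
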
def run_query_capture_results(cmd, query_result_match_function, iterations,
                              exit_on_error):
  """
  Runs the given query command and returns the execution result.

  Takes in a match function that is used to parse stderr/stdout to extract the results.
  """
  execution_result = QueryExecutionResult()
  try:
    rc, stdout, stderr = execute_shell_cmd(cmd)
  except Exception, e:
    LOG.error('Error while executing query command: %s' % e)
    return execution_result
  if rc != 0:
    LOG.error(('Command returned with an error:\n'
               'rc: %d\n'
               'STDERR:\n%s'
               'STDOUT:\n%s'
               % (rc, stderr, stdout)))
    return execution_result
  # The command completed.
  execution_result = query_result_match_function(stdout, stderr, iterations)
  if not execution_result.success:
    LOG.error("Query did not run successfully")
    LOG.error("STDERR:\n%s\nSTDOUT:\n%s" % (stderr, stdout))
  return execution_result

def match_hive_query_results(stdout, stderr, iterations):
  """
  Parse query execution details for hive.

  Parses the query execution details (avg time, stddev) from the runquery output.
  Returns a QueryExecutionResult object.
  """
  execution_times = list()
  # Hive prints one 'Time taken: <sec>.<fraction> seconds' line per query, so collect
  # every match (one per iteration).
  for match in re.finditer(hive_result_regex, stderr):
    execution_times.append(float('%s.%s' % (match.group(1), match.group(2))))

  execution_result = QueryExecutionResult()
  if len(execution_times) == iterations:
    execution_result.avg_time = calculate_avg(execution_times)
    if iterations > 1:
      execution_result.std_dev = calculate_stddev(execution_times)
    execution_result.success = True
  return execution_result

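# Illustrative sketch (not part of the original module): for iterations=2, hive's stderr
# would be expected to contain two lines such as
#
#   Time taken: 2.345 seconds
#   Time taken: 2.512 seconds
#
# which the regex above turns into execution_times == [2.345, 2.512].
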
# Util functions
# TODO: Move util functions to a common module.
def calculate_avg(values):
  return sum(values) / float(len(values))

def calculate_stddev(values):
  """Return the standard deviation of a numeric iterable."""
  avg = calculate_avg(values)
  return math.sqrt(calculate_avg([(val - avg) ** 2 for val in values]))
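
# Illustrative sketch (not part of the original module): these helpers compute the
# population standard deviation, e.g.
#
#   calculate_avg([2.0, 4.0, 6.0])     -> 4.0
#   calculate_stddev([2.0, 4.0, 6.0])  -> sqrt(((2-4)**2 + (4-4)**2 + (6-4)**2) / 3) ~= 1.633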