impala/tests/common/query_executor.py

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Module used for executing queries and gathering results.
# The QueryExecutor is meant to be generic and doesn't
# have the knowledge of how to actually execute a query. It takes a query and its config
# and executes it against an executor function.
# For example (in pseudo-code):
#
# def exec_func(query, config):
# ...
#
# config = BeeswaxQueryExecConfig()
# executor = QueryExecutor('beeswax', query, exec_func, config, exit_on_error)
# executor.execute()
# result = executor.result

import logging
import os
import re
import shlex

from collections import defaultdict, deque
from datetime import datetime
from random import randint
from subprocess import Popen, PIPE
from tests.common.query import Query, QueryResult
from tests.beeswax.impala_beeswax import ImpalaBeeswaxClient, ImpalaBeeswaxResult
from threading import Thread, Lock

# Setup logging for this module.
logging.basicConfig(level=logging.INFO, format='[%(name)s] %(threadName)s: %(message)s')
LOG = logging.getLogger('query_executor')
LOG.setLevel(level=logging.INFO)

# globals.
# Pattern for the "Time taken: <secs>.<fraction> seconds" line in hive's output.
# Raw string so the backslash escapes reach the regex engine untouched; the decimal
# point is escaped so that it only matches a literal '.'.
hive_result_regex = r'Time taken: (\d*)\.(\d*) seconds'

## TODO: Split executors into their own modules.
class QueryExecConfig(object):
  """Common base for all query execution configs.

  Attributes:
    plugin_runner: optional plugin runner object, or None when no plugins are used.
  """

  def __init__(self, plugin_runner=None):
    # Stored as-is; this class never calls into the runner itself.
    self.plugin_runner = plugin_runner


class ImpalaQueryExecConfig(QueryExecConfig):
  """Execution config shared by the executors that connect to an impalad.

  Attributes:
    impalad (str): address of the impalad to use, formatted as <host>:<port>
  """

  def __init__(self, plugin_runner=None, impalad='localhost:21000'):
    super(ImpalaQueryExecConfig, self).__init__(plugin_runner=plugin_runner)
    # Backing field for the impalad property below.
    self._impalad = impalad

  @property
  def impalad(self):
    """The impalad address this config currently points to."""
    return self._impalad

  @impalad.setter
  def impalad(self, address):
    """Point this config at a different impalad."""
    self._impalad = address


class JdbcQueryExecConfig(ImpalaQueryExecConfig):
  """Impala query execution config for the JDBC client.

  Attributes:
    transport: value handed to the JDBC client script through its -t flag.
  """

  # Resolved at import time, so IMPALA_HOME must be set in the environment.
  JDBC_CLIENT_PATH = os.path.join(os.environ['IMPALA_HOME'], 'bin/run-jdbc-client.sh')

  def __init__(self, plugin_runner=None, impalad='localhost:21050', transport=None):
    super(JdbcQueryExecConfig, self).__init__(plugin_runner=plugin_runner,
        impalad=impalad)
    self.transport = transport

  @property
  def jdbc_client_cmd(self):
    """Command line prefix used to launch the JDBC client.

    Rebuilt on every access because the impalad this config points to can change.
    """
    client_args = ' -i "%s" -t %s' % (self._impalad, self.transport)
    return JdbcQueryExecConfig.JDBC_CLIENT_PATH + client_args

class BeeswaxQueryExecConfig(ImpalaQueryExecConfig):
  """Impala query execution config for beeswax

  Args:
    use_kerberos (boolean)
    exec_options (str): String formatted as "opt1:val1;opt2:val2"
    impalad (str): address of impalad <host>:<port>
    plugin_runner: optional plugin runner, forwarded to the base class

  Attributes:
    use_kerberos (boolean)
    exec_options (dict str -> str): execution options, keyed by upper-cased name
  """

  def __init__(self, use_kerberos=False, exec_options=None, impalad='localhost:21000',
      plugin_runner=None):
    super(BeeswaxQueryExecConfig, self).__init__(plugin_runner=plugin_runner,
        impalad=impalad)
    self.use_kerberos = use_kerberos
    self.exec_options = dict()
    self.__build_options(exec_options)

  def __build_options(self, exec_options):
    """Read the exec_options into self.exec_options

    Args:
      exec_options (str): String formatted as "opt1:val1;opt2:val2". None or an
        empty string leaves self.exec_options untouched.

    Raises:
      ValueError: if a non-empty option has no ':' separating name and value.
    """
    if not exec_options:
      return
    # exec_options are separated by ; on the command line
    for option in exec_options.split(';'):
      if not option.strip():
        # Tolerate empty segments such as the one produced by a trailing ';'.
        continue
      # Split on the first ':' only, so that a value may itself contain ':'.
      key, separator, value = option.partition(':')
      if not separator:
        raise ValueError("Malformed exec option '%s', expected 'name:value'" % option)
      # The keys in ImpalaService QueryOptions are upper case.
      self.exec_options[key.upper()] = value


class HiveQueryExecConfig(QueryExecConfig):
  """Execution config for queries that are run through the hive command line.

  Attributes:
    hive_cmd (str): command prefix used to invoke hive; the query is appended to it.
  """

  def __init__(self, plugin_runner=None, hive_cmd='hive -e'):
    super(HiveQueryExecConfig, self).__init__(plugin_runner=plugin_runner)
    self.hive_cmd = hive_cmd

  def build_argument_string(self):
    """Return the extra argument string passed to hive (currently always empty)."""
    return ''


class QueryExecutor(object):
  """Runs one query through a pluggable execution function.

  Args:
    name (str): executor name, eg. "hive"
    query: the query to execute; handed unchanged to func
    func (function): called as func(query, config), must return a QueryResult.
      Eg. execute_using_impala_beeswax
    config (QueryExecConfig): execution config handed unchanged to func
    exit_on_error (boolean): Raise as soon as a query fails.

  Attributes:
    exec_func (function): the execution function given as func
    exec_config (QueryExecConfig): the config given as config
    query: the query to execute
    exit_on_error (boolean): Raise as soon as a query fails.
    executor_name (str): eg. "hive"
    result (QueryResult): Result of the most recent execute() call; before that, a
      fresh QueryResult built from the query and config.
  """

  def __init__(self, name, query, func, config, exit_on_error):
    self.executor_name = name
    self.query = query
    self.exec_func = func
    self.exec_config = config
    self.exit_on_error = exit_on_error
    # Placeholder until execute() replaces it with the real outcome.
    self.__result = QueryResult(query, query_config=self.exec_config)

  def prepare(self, impalad):
    """Get the query ready to run against the given impalad.

    Currently this only updates the impalad stored in the config. Hive executors do
    not talk to an impalad, so for them this does nothing.
    """
    if self.executor_name == 'hive':
      return
    self.exec_config.impalad = impalad

  def execute(self):
    """Run the query through the execution function and store its result.

    Raises:
      RuntimeError: if the query failed and exit_on_error is set.
    """
    outcome = self.exec_func(self.query, self.exec_config)
    self.__result = outcome
    if outcome.success:
      return
    if self.exit_on_error:
      raise RuntimeError(outcome.query_error)
    LOG.info("Continuing execution")

  @property
  def result(self):
    """The QueryResult describing a single run of the query."""
    return self.__result

def establish_beeswax_connection(query, query_config):
  """Open a beeswax connection to the impalad named in the config.

  Args:
    query: unused by this function.
    query_config (QueryExecConfig): supplies impalad, use_kerberos and exec_options.

  Returns:
    (boolean, ImpalaBeeswaxClient): The flag is always True; errors raised while
      connecting or setting the options are not caught here.
  """

  # TODO: Make this generic, for hive etc.
  beeswax_client = ImpalaBeeswaxClient(query_config.impalad,
                                       use_kerberos=query_config.use_kerberos)
  beeswax_client.connect()
  # Apply the exec options to the freshly connected client.
  beeswax_client.set_query_options(query_config.exec_options)
  LOG.info("Connected to %s" % query_config.impalad)
  return (True, beeswax_client)

def execute_using_impala_beeswax(query, query_config):
  """Executes a query using beeswax.

  A new client is created per query, then destroyed.

  Args:
    query: query object; its query_str is executed, after a 'use <db>' when its db
      attribute is set.
    query_config (QueryExecConfig)

  Returns:
    QueryResult: success is only set when the query ran without raising; otherwise
      query_error holds the error text.
  """

  # Create a client object to talk to impalad
  exec_result = QueryResult(query, query_config=query_config)
  plugin_runner = query_config.plugin_runner
  (success, client) = establish_beeswax_connection(query.query_str, query_config)
  if not success: return exec_result
  try:
    # We need to issue a use database here.
    if query.db:
      client.execute('use %s' % query.db)
    # create a map for query options and the query names to send to the plugin
    context = build_context(query, query_config)
    if plugin_runner: plugin_runner.run_plugins_pre(context=context, scope="Query")
  except Exception:
    # Setup failed before the query was issued: do not leak the connection, but
    # still let the caller see the original error.
    client.close_connection()
    raise
  result = ImpalaBeeswaxResult()
  try:
    result = client.execute(query.query_str)
  except Exception as e:
    LOG.error(e)
    exec_result.query_error = str(e)
  finally:
    client.close_connection()
    if plugin_runner: plugin_runner.run_plugins_post(context=context, scope="Query")
  # Returning after (not inside) the finally block, so that KeyboardInterrupt and
  # SystemExit raised while the query runs are not silently discarded.
  return construct_exec_result(result, exec_result)

def build_context(query, query_config):
  """Build context based on query config for plugin_runner.

  The context is a shallow copy of the config's instance attributes plus a 'query'
  entry. It is a copy because vars() returns the object's live __dict__, and adding
  the 'query' key to that would silently add a `query` attribute to the config.

  Args:
    query: the query being executed; stored under the 'query' key as given.
    query_config (QueryExecConfig)

  Returns:
    dict: attribute name -> value for every instance attribute of query_config, plus
      'query'.
  """

  context = dict(vars(query_config))
  context['query'] = query
  return context

def construct_exec_result(result, exec_result):
  """Copy the outcome of a successful ImpalaBeeswaxResult into a QueryResult.

  Args:
    result (ImpalaBeeswaxResult): Source of the data.
    exec_result (QueryResult): Destination; updated in place.

  Returns:
    QueryResult: exec_result itself. It is left untouched when result.success is
      not set.
  """

  if result.success:
    exec_result.success = True
    copied_fields = ('data', 'runtime_profile', 'start_time',
                     'time_taken', 'summary', 'exec_summary')
    for field in copied_fields:
      setattr(exec_result, field, getattr(result, field))
  return exec_result

def execute_shell_cmd(cmd):
  """Run a command without a shell and capture everything it prints.

  Args:
    cmd (str): Full command line; tokenized with shlex.split before being run.

  Returns:
    (int, str, str): return code, stdout, stderr
  """

  LOG.debug('Executing: %s' % (cmd,))
  # Popen wants an argument list: the program first, followed by its arguments.
  argv = shlex.split(cmd)
  process = Popen(argv, shell=False, stdout=PIPE, stderr=PIPE)
  # communicate() waits for the process to exit, so returncode is set afterwards.
  captured_out, captured_err = process.communicate()
  return process.returncode, captured_out, captured_err

def execute_using_hive(query, query_config):
  """Executes a query via hive

  Args:
    query: query object; its query_str is executed, after a 'use <db>' when its db
      attribute is set.
    query_config (HiveQueryExecConfig): supplies hive_cmd.

  Returns:
    QueryResult
  """
  query_string = query.query_str + ';'
  if query.db:
    query_string = 'use %s;%s' % (query.db, query_string)
  # NOTE: the query is embedded between double quotes without escaping, and the
  # command is later tokenized with shlex.split, so a query that itself contains a
  # double quote will not reach hive intact.
  cmd = query_config.hive_cmd + " \"%s\"" % query_string
  # exit_on_error must be passed explicitly: run_query_capture_results declares it
  # without a default, so leaving it out raised a TypeError.
  return run_query_capture_results(cmd, query, parse_hive_query_results,
    exit_on_error=False)

def parse_hive_query_results(stdout, stderr, iterations=None):
  """
  Parse query execution details for hive.

  Scans stderr for the first line matching hive_result_regex and uses it as the
  execution time.

  Args:
    stdout (str): captured standard output of the hive command (currently unused).
    stderr (str): captured standard error; searched for the 'Time taken' line.
    iterations: unused. Optional so that the function can be called with just
      (stdout, stderr), which is how run_query_capture_results invokes its parser.

  Returns:
    QueryResult: time_taken is the parsed time in seconds, or 0.0 if no timing line
      was found. No result data is attached.
  """
  time_taken = 0.0
  for line in stderr.split('\n'):
    match = re.search(hive_result_regex, line)
    if match:
      time_taken = float('%s.%s' % (match.group(1), match.group(2)))
      break
  # TODO: Get hive results
  return create_exec_result(time_taken, None)

def execute_using_jdbc(query, query_config):
  """Run a query through the Impala JDBC client and return its QueryResult.

  The statement is prefixed with a 'use <db>' when the query names a database.
  """
  statement = query.query_str + ';'
  if query.db:
    statement = 'use %s; %s' % (query.db, statement)
  # The statement goes to the client as a double-quoted -q argument.
  query_arg = " -q \"%s\"" % statement
  jdbc_cmd = query_config.jdbc_client_cmd + query_arg
  return run_query_capture_results(jdbc_cmd, query, parse_jdbc_query_results,
    exit_on_error=False)

def parse_jdbc_query_results(stdout, stderr):
  """
  Parse query execution results for the Impala JDBC client

  Args:
    stdout (str): captured standard output of the JDBC test client.
    stderr (str): captured standard error (unused).

  Returns:
    QueryResult: time_taken comes from the first 'row(s) in <secs>.<frac>s' line
      (0.0 if there is none); data is the text between the first
      '[START]----' / '----[END]' marker pair, if any.
  """
  # Raw string, and the decimal point is escaped so it only matches a literal '.'.
  jdbc_result_regex = r'row\(s\) in (\d*)\.(\d*)s'
  time_taken = 0.0
  for line in stdout.split('\n'):
    match = re.search(jdbc_result_regex, line)
    if match:
      time_taken = float('%s.%s' % (match.group(1), match.group(2)))
      break
  data_blocks = re.findall(r'\[START\]----\n(.*?)\n----\[END\]', stdout, re.DOTALL)
  # Output without a data block used to raise an IndexError here; report "no data".
  result_data = data_blocks[0] if data_blocks else None
  return create_exec_result(time_taken, result_data)

def create_exec_result(time_taken, result_data):
  """Build a successful QueryResult from a parsed time and optional result data.

  Args:
    time_taken: execution time to record on the result.
    result_data: query output; only logged and attached when it is truthy.

  Returns:
    QueryResult: with success set to True.
  """
  outcome = QueryResult()
  if result_data:
    LOG.debug('Data:\n%s\n' % result_data)
    outcome.data = result_data
  outcome.time_taken = time_taken
  outcome.success = True
  return outcome

def run_query_capture_results(cmd, query, query_result_parse_function,
    exit_on_error=False):
  """
  Runs the given query command and returns the execution result.

  Args:
    cmd (str): full command line to run.
    query: the query being executed; attached to the returned result.
    query_result_parse_function (function): called as f(stdout, stderr) once the
      command has exited with return code 0; must return a QueryResult.
    exit_on_error (boolean): accepted for compatibility but not used here; failures
      are reported through the returned result. Optional, so callers that do not
      pass it keep working.

  Returns:
    QueryResult: the parser's result with query and start_time filled in, or a
      QueryResult carrying query_error if the command could not be run or returned
      a non-zero code.
  """
  exec_result = QueryResult(query)
  start_time = datetime.now()
  try:
    rc, stdout, stderr = execute_shell_cmd(cmd)
  except Exception as e:
    LOG.error('Error while executing query command: %s' % e)
    exec_result.query_error = str(e)
    return exec_result
  if rc != 0:
    msg = ('Command returned with an error:\n'
           'rc: %d\n'
           'STDERR:\n%s'
           'STDOUT:\n%s'
           % (rc, stderr, stdout))
    LOG.error(msg)
    exec_result.query_error = msg
    return exec_result
  # The command completed
  exec_result = query_result_parse_function(stdout, stderr)
  exec_result.query = query
  exec_result.start_time = start_time
  return exec_result