Mirror of https://github.com/apache/impala.git
On Python 3, when Impyla receives a result containing a string that is not
valid UTF-8, it returns that value as bytes. TPC-DS Q30 on scale 20 has a
result that contains invalid UTF-8, so bin/run-workload.py can fail while
trying to dump the results to JSON. This modifies CustomJSONEncoder to handle
serializing bytes by decoding them to a string, with invalid byte sequences
escaped using backslashes.

Testing:
- Ran bin/run-workload.py against TPC-DS scale 20

Change-Id: Ibe31c656de4fc65f8580c7b3b49bf655b8a5ecea
Reviewed-on: http://gerrit.cloudera.org:8080/23602
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
Reviewed-by: Jason Fehr <jfehr@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
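
The fix relies on Python's "backslashreplace" decode error handler. A minimal
sketch of that behavior, in plain Python and independent of the script below:

    raw = b"ab\xff"  # not valid UTF-8; on Python 3, Impyla returns such values as bytes
    text = raw.decode("utf-8", errors="backslashreplace")  # literal text 'ab\xff', no exception
    import json
    json.dumps(text)  # succeeds, whereas json.dumps(raw) raises TypeError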
#!/usr/bin/env impala-python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This script is used as the driver to run performance benchmarks.
# It does the following:
# - Parses the user-defined options and validates them.
# - Matches each workload to its set of queries and constructs the required objects.
# - Runs each workload in serial order (a workload is a combination of dataset and
#   scale factor).
# - Pretty-prints the results of each query's execution.
# - Stores the execution details in JSON format.
#

from __future__ import absolute_import, division, print_function
import getpass
import json
import logging
import os
import prettytable

from collections import deque
from copy import deepcopy
from datetime import datetime
from decimal import Decimal
from itertools import groupby
from optparse import OptionParser
from random import shuffle
from sys import exit

from tests.common.test_dimensions import TableFormatInfo
from tests.performance.query import Query, HiveQueryResult
from tests.performance.query_executor import QueryExecConfig
from tests.performance.workload_runner import WorkloadRunner
from tests.performance.workload import Workload
from tests.util.plugin_runner import PluginRunner

parser = OptionParser()
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                  default=False, help="If set, outputs all benchmark diagnostics.")
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
                  help=("The exploration strategy to use for running the benchmark: "
                        "'core', 'pairwise', or 'exhaustive'"))
parser.add_option("-w", "--workloads", dest="workloads", default="tpcds",
                  help=("The workload(s) and scale factors to run in a comma-separated "
                        "list format. Optional scale factors for each workload are "
                        "specified using colons. For example: -w tpcds,tpch:400gb,tpch:1gb. "
                        "Some valid workloads: 'tpch', 'tpcds', ..."))
parser.add_option("--impalads", dest="impalads", default="localhost",
                  help=("A comma-separated list of impalad instances to run the "
                        "workload against."))
parser.add_option("--exec_options", dest="exec_options", default=str(),
                  help=("Run query exec option string "
                        "(formatted as 'opt1:val1;opt2:val2')."))
parser.add_option("--results_json_file", dest="results_json_file",
                  default=os.environ['IMPALA_HOME'] + "/benchmark_results.json",
                  help="The output file where benchmark results are saved")
parser.add_option("-i", "--query_iterations", type="int", dest="query_iterations",
                  default=1, help="Number of times to run each query within a workload")
parser.add_option("-x", "--workload_iterations", type="int", dest="workload_iterations",
                  default=1, help="Number of times to run each workload.")
parser.add_option("--num_clients", type="int", dest="num_clients", default=1,
                  help="Number of clients (threads) to use when executing each query.")
parser.add_option("--query_names", dest="query_names", default=str(),
                  help="A comma-separated list of regular expressions. A query is"
                       " executed if it matches any of the expressions.")
parser.add_option("--table_formats", dest="table_formats", default=str(),
                  help=("Override the default test vectors and run using only the"
                        " specified table formats. Ex. --table_formats=seq/snap/block"
                        ",text/none"))
parser.add_option("--shuffle_query_exec_order", dest="shuffle_queries",
                  action="store_true", default=False, help=("Randomizes the order "
                  "of query execution. Useful when the execution scope is a workload."))
parser.add_option("--plan_first", dest="plan_first", action="store_true", default=False,
                  help=("Runs EXPLAIN before running the query so that metadata loading"
                        " is excluded from the timing"))

parser.add_option("--use_kerberos", dest="use_kerberos", action="store_true",
                  default=False, help="If set, enables talking to a kerberized impalad")
parser.add_option("--continue_on_query_error", dest="continue_on_query_error",
                  action="store_true", default=False,
                  help="If set, continue execution on each query error.")
parser.add_option("-c", "--client_type", dest="client_type", default='hs2',
                  choices=['beeswax', 'jdbc', 'hs2'],
                  help="Client type. Valid options are 'beeswax', 'jdbc', or 'hs2'.")
parser.add_option("--plugin_names", dest="plugin_names", default=None,
                  help=("Set of comma-separated plugin names with scope; plugins are"
                        " specified as <plugin_name>[:<scope>]. If no scope is specified,"
                        " it defaults to Query. Plugin names are case sensitive."))
parser.add_option("--exec_engine", dest="exec_engine", default="impala",
                  choices=['impala', 'hive'],
                  help=("Which SQL engine to use; 'impala' and 'hive' are valid options"))
parser.add_option("--hiveserver", dest="hiveserver", default="localhost",
                  help=("Host that has the HiveServer2 service running"))
parser.add_option("--user", dest="user", default=getpass.getuser(),
                  help=("User account under which the workload/query will run"))
parser.add_option("--get_password", dest="get_password", default=False,
                  action="store_true", help=("Prompt for the password for the user account"))
parser.add_option("--use_ssl", dest="use_ssl", action="store_true", default=False,
                  help=("Whether to use SSL or not"))

options, args = parser.parse_args()

options.password = None
if options.get_password:
  options.password = getpass.getpass()
options.get_password = None

LOG = logging.getLogger('run-workload')


class WorkloadConfig(object):
  """Converts the options dict into a class."""
  def __init__(self, **config):
    self.__dict__.update(config)
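
# For example (illustrative): WorkloadConfig(num_clients=2).num_clients evaluates to 2.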


class CustomJSONEncoder(json.JSONEncoder):
  """Override the JSONEncoder's default method.

  This class is needed for two reasons:
  - JSON does not have a datetime type. We intercept a datetime object and convert it
    into a standard ISO string.
  - JSON does not know how to serialize arbitrary objects. We intercept the objects and
    provide their __dict__ representations.
  """
  def default(self, obj):
    if isinstance(obj, Decimal):
      return str(obj)
    if isinstance(obj, datetime):
      # Convert datetime into a standard ISO string.
      return obj.isoformat()
    if isinstance(obj, bytes):
      # Impyla can leave a string value as bytes when it is unable to decode it to
      # UTF-8. TPC-DS has queries that produce non-UTF-8 results (e.g. Q30 on scale
      # 20). Convert the bytes to a string so that JSON encoding works; invalid byte
      # sequences are preserved as backslash escapes.
      return obj.decode(encoding="utf-8", errors="backslashreplace")
    elif isinstance(obj, (Query, HiveQueryResult, QueryExecConfig, TableFormatInfo)):
      # Serialize these objects manually by returning their __dict__ representations.
      return obj.__dict__
    else:
      # Defer to the base class, which raises TypeError for unserializable objects.
      return super(CustomJSONEncoder, self).default(obj)
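
# A minimal usage sketch (illustrative, not part of the driver logic): with this encoder,
#   json.dumps({'when': datetime.now(), 'cost': Decimal('1.5'), 'raw': b'\xff'},
#              cls=CustomJSONEncoder)
# succeeds, emitting an ISO timestamp, the string "1.5", and an escaped string for the
# invalid bytes, instead of raising TypeError.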


def prettytable_print(results, failed=False):
  """Print a list of query results in a prettytable."""
  column_names = ['Query', 'Start Time', 'Time Taken (s)', 'Client ID']
  if failed: column_names.append('Error')
  table = prettytable.PrettyTable(column_names)
  table.align = 'l'
  table.float_format = '.2'
  # Group the results by table format.
  for table_format_str, gr in groupby(results, lambda x: x.query.table_format_str):
    print("Table Format: %s" % table_format_str)
    for result in gr:
      start_time = result.start_time.strftime("%Y-%m-%d %H:%M:%S") if result.start_time \
          is not None else '-'
      row = [result.query.name, start_time, result.time_taken, result.client_name]
      if failed: row.append(result.query_error)
      table.add_row(row)
    print(table.get_string(sortby='Client ID'))
    table.clear_rows()
    print()


def print_result_summary(results):
  """Print failed and successful queries for a given result list."""
  failed_results = [x for x in results if not x.success]
  successful_results = [x for x in results if x.success]
  prettytable_print(successful_results)
  if failed_results: prettytable_print(failed_results, failed=True)


def get_workload_scale_factor():
  """Extract the workload -> scale factor mapping from the command line.

  The expected string is "workload_1[:scale_factor_1],...,workload_n[:scale_factor_n]".
  """
  workload_str = options.workloads
  workload_tuples = split_and_strip(workload_str)
  assert len(workload_tuples) > 0, "At least one workload must be specified"
  for workload_tuple in workload_tuples:
    # Each member should conform to workload[:scale_factor]
    workload_tuple = split_and_strip(workload_tuple, delim=":")
    assert len(workload_tuple) in [1, 2], "Error parsing workload:scale_factor"
    if len(workload_tuple) == 1: workload_tuple.append(str())
    yield workload_tuple
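
# For example (illustrative): with --workloads=tpcds,tpch:400gb this generator yields
# ['tpcds', ''] followed by ['tpch', '400gb'].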


def split_and_strip(input_string, delim=","):
  """Convert a string into a list using the given delimiter."""
  if not input_string: return list()
  return list(map(str.strip, input_string.split(delim)))
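
# For example (illustrative): split_and_strip(" a , b ") returns ['a', 'b'], and
# split_and_strip("") returns [].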


def create_workload_config():
  """Parse command line inputs.

  Some user inputs need to be transformed from delimited strings to lists in order to
  be consumed by the performance framework. Additionally, plugin_names are converted
  into objects, and need to be added to the config.
  """
  config = deepcopy(vars(options))
  # We don't need workloads and query_names in the config map as they're already
  # specified in the workload object.
  del config['workloads']
  del config['query_names']
  config['plugin_runner'] = plugin_runner
  # Transform a few options from strings to lists.
  config['table_formats'] = split_and_strip(config['table_formats'])
  impalads = split_and_strip(config['impalads'])
  # Randomize the order of impalads.
  shuffle(impalads)
  config['impalads'] = deque(impalads)
  return WorkloadConfig(**config)


def _validate_options():
  """Basic validation for some command line options."""
  # The sasl module must be importable on a secure setup.
  if options.use_kerberos: import sasl

  # If Hive is the exec engine, hs2 is the only supported interface.
  if options.exec_engine.lower() == "hive" and options.client_type != "hs2":
    raise RuntimeError("The only supported client type for the Hive engine is hs2")

  # Check for duplicate workload/scale_factor combinations.
  workloads = split_and_strip(options.workloads)
  if not len(set(workloads)) == len(workloads):
    raise RuntimeError("Duplicate workload/scale factor combinations are not allowed")

  # The list of Impalads must be provided as a comma-separated list of either host:port
  # combinations or just hosts.
  for impalad in split_and_strip(options.impalads):
    if len(impalad.split(":")) not in [1, 2]:
      raise RuntimeError("Impalads must be of the form host:port or host.")


if __name__ == "__main__":
  logging.basicConfig(level=logging.INFO, format='[%(name)s]: %(message)s')
  # Check for badly formed user options.
  _validate_options()

  # Initialize the PluginRunner.
  plugin_runner = None
  if options.plugin_names:
    plugin_runner = PluginRunner(split_and_strip(options.plugin_names))

  # Initialize workloads.
  workload_runners = list()
  query_name_filters = split_and_strip(options.query_names)
  # Create a workload config object.
  for workload_name, scale_factor in get_workload_scale_factor():
    config = create_workload_config()
    workload = Workload(workload_name, query_name_filters=query_name_filters)
    workload_runners.append(WorkloadRunner(workload, scale_factor, config))

  # Run all the workloads serially.
  result_map = dict()
  exit_code = 0
  for workload_runner in workload_runners:
    try:
      if plugin_runner: plugin_runner.run_plugins_pre(scope="Workload")
      workload_runner.run()
      if plugin_runner: plugin_runner.run_plugins_post(scope="Workload")
    finally:
      key = "%s_%s" % (workload_runner.workload.name, workload_runner.scale_factor)
      result_map[key] = workload_runner.results

    if not all(result.success for result in workload_runner.results): exit_code = 1

    # Print the results.
    print("\nWorkload: {0}, Scale Factor: {1}\n".format(
        workload_runner.workload.name.upper(), workload_runner.scale_factor))
    print_result_summary(workload_runner.results)

  # Store the results.
  with open(options.results_json_file, 'w') as f:
    json.dump(result_map, f, cls=CustomJSONEncoder, ensure_ascii=False)

  exit(exit_code)
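
# Example invocation (hypothetical flags, assembled from the options defined above):
#   bin/run-workload.py -w tpcds:20 --impalads=localhost:21050 \
#     --results_json_file=/tmp/benchmark_results.json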