Added execution summary, modified benchmark to handle JSON

- Added execution summary to the beeswax client and QueryResult
- Modified report-benchmark-results to handle JSON and perform
  execution summary comparison between runs
- Added comments to the new workload runner

Change-Id: I9c3c5f2fdc5d8d1e70022c4077334bc44e3a2d1d
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3598
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: jenkins
(cherry picked from commit fd0b1406be2511c202e02fa63af94fbbe5e18eee)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3618
This commit is contained in:
Taras Bobrovytsky
2014-06-23 19:20:11 -07:00
committed by jenkins
parent 3bed0be1df
commit e94de02469
9 changed files with 1105 additions and 527 deletions

View File

@@ -18,7 +18,18 @@ from tests.util.test_file_parser import QueryTestSectionReader
# TODO: This interface needs to be more robust; At the moment, it has two users with
# completely different uses (the benchmark suite and the impala test suite)
class Query(object):
"""Represents a query and all the information neede to execute it"""
"""Represents a query and all the information neede to execute it
Attributes:
query_str (str): The SQL query string.
name (str): query name?
scale_factor (str): for example 300gb, used to determine the database.
test_vector (?): Specifies some parameters
results (list of ?): ?
workload_name (str): for example tpch, tpcds, visa (used to determine directory)
db (str): ? represents the database
table_format_str (str): ?
"""
def __init__(self, **kwargs):
self.query_str = kwargs.get('query_str')
self.name = kwargs.get('name')
@@ -41,6 +52,7 @@ class Query(object):
self.db == other.db)
def __build_query(self):
"""Populates db, query_str, table_format_str"""
self.db = QueryTestSectionReader.get_db_name(self.test_vector, self.scale_factor)
self.query_str = QueryTestSectionReader.build_query(self.query_str.strip())
self.table_format_str = '%s/%s/%s' % (self.test_vector.file_format,
@@ -56,16 +68,27 @@ class Query(object):
class QueryResult(object):
"""Contains the results of a query execution.
A query execution results contains the following fields:
query - The query object
time_taken - Time taken to execute the query
start_time - The time at which the client submits the query.
data - Query results
client_name - The thread id
runtime_profile - Saved runtime profile of the query's execution.
query_error - Empty string if the query succeeded. Error returned by the client if
it failed.
Parameters:
Required:
query (Query): The query object associated with this result.
start_time (datetime): Timestamp at the start of execution.
query_config (BeeswaxQueryExecConfig)
client_name (int): The thread id
Optional:
time_taken (float): Time taken to execute the query.
summary (str): query execution summary (ex. returned 10 rows)
data (list of str): Query results returned by Impala.
runtime_profile (str): Saved runtime profile of the query's execution.
exec_summary (TExecSummary)
success (bool): True if the execution was successful.
Attributes - these are modified by another class:
query_error (str): Empty string if the query succeeded. Error returned by the client
if it failed.
executor_name (str)
"""
def __init__(self, query, **kwargs):
self.query = query
self.time_taken = kwargs.get('time_taken', 0.0)
@@ -75,6 +98,7 @@ class QueryResult(object):
self.query_config = kwargs.get('query_config')
self.client_name = kwargs.get('client_name')
self.runtime_profile = kwargs.get('runtime_profile', str())
self.exec_summary = kwargs.get('exec_summary', str())
self.success = kwargs.get('success', False)
self.query_error = str()
self.executor_name = str()

View File

@@ -50,13 +50,22 @@ hive_result_regex = 'Time taken: (\d*).(\d*) seconds'
## TODO: Split executors into their own modules.
class QueryExecConfig(object):
"""Base Class for Execution Configs"""
"""Base Class for Execution Configs
Attributes:
plugin_runner (PluginRunner?)
"""
def __init__(self, plugin_runner=None):
self.plugin_runner = plugin_runner
class ImpalaQueryExecConfig(QueryExecConfig):
"""Base class for Impala query execution config"""
"""Base class for Impala query execution config
Attributes:
impalad (str): address of impalad <host>:<port>
"""
def __init__(self, plugin_runner=None, impalad='localhost:21000'):
super(ImpalaQueryExecConfig, self).__init__(plugin_runner=plugin_runner)
self._impalad = impalad
@@ -71,8 +80,14 @@ class ImpalaQueryExecConfig(QueryExecConfig):
class JdbcQueryExecConfig(ImpalaQueryExecConfig):
"""Impala query execution config for jdbc"""
"""Impala query execution config for jdbc
Attributes:
transport (?): ?
"""
JDBC_CLIENT_PATH = os.path.join(os.environ['IMPALA_HOME'], 'bin/run-jdbc-client.sh')
def __init__(self, plugin_runner=None, impalad='localhost:21050', transport=None):
super(JdbcQueryExecConfig, self).__init__(plugin_runner=plugin_runner,
impalad=impalad)
@@ -87,9 +102,20 @@ class JdbcQueryExecConfig(ImpalaQueryExecConfig):
return JdbcQueryExecConfig.JDBC_CLIENT_PATH + ' -i "%s" -t %s' % (self._impalad,
self.transport)
class BeeswaxQueryExecConfig(ImpalaQueryExecConfig):
"""Impala query execution config for beeswax"""
"""Impala query execution config for beeswax
Args:
use_kerberos (boolean)
exec_options (str): String formatted as "opt1:val1;opt2:val2"
impalad (str): address of impalad <host>:<port>
plugin_runner (?): ?
Attributes:
use_kerberos (boolean)
exec_options (dict str -> str): execution options
"""
def __init__(self, use_kerberos=False, exec_options=None, impalad='localhost:21000',
plugin_runner=None):
super(BeeswaxQueryExecConfig, self).__init__(plugin_runner=plugin_runner,
@@ -99,7 +125,12 @@ class BeeswaxQueryExecConfig(ImpalaQueryExecConfig):
self.__build_options(exec_options)
def __build_options(self, exec_options):
"""Read the exec_options into a dictionary"""
"""Read the exec_options into self.exec_options
Args:
exec_options (str): String formatted as "opt1:val1;opt2:val2"
"""
if exec_options:
# exec_options are separated by ; on the command line
options = exec_options.split(';')
@@ -121,13 +152,27 @@ class HiveQueryExecConfig(QueryExecConfig):
class QueryExecutor(object):
def __init__(self, name, query, func, config, exit_on_error):
"""
Executes a query.
"""Executes a query.
The query_exec_func needs to be a function that accepts a QueryExecOption parameter
and returns a QueryResult.
"""
Args:
name (str): eg. "hive"
query (str): string containing SQL query to be executed
func (function): Function that accepts a QueryExecOption parameter and returns a
QueryResult. Eg. execute_using_impala_beeswax
config (QueryExecOption)
exit_on_error (boolean): Exit right after an error encountered.
Attributes:
exec_func (function): Function that accepts a QueryExecOption parameter and returns a
QueryResult.
exec_config (QueryExecOption)
query (str): string containing SQL query to be executed
exit_on_error (boolean): Exit right after an error encountered.
executor_name (str): eg. "hive"
result (QueryResult): Contains the result after execute method is called.
"""
def __init__(self, name, query, func, config, exit_on_error):
self.exec_func = func
self.exec_config = config
self.query = query
@@ -163,7 +208,15 @@ class QueryExecutor(object):
return self.__result
def establish_beeswax_connection(query, query_config):
"""Establish a connection to the user specified impalad"""
"""Establish a connection to the user specified impalad.
Args:
query_config (QueryExecConfig)
Returns:
(boolean, ImpalaBeeswaxClient): True if successful
"""
# TODO: Make this generic, for hive etc.
use_kerberos = query_config.use_kerberos
client = ImpalaBeeswaxClient(query_config.impalad, use_kerberos=use_kerberos)
@@ -177,8 +230,16 @@ def establish_beeswax_connection(query, query_config):
def execute_using_impala_beeswax(query, query_config):
"""Executes a query using beeswax.
A new client is created per query, then destroyed. Returns QueryResult()
A new client is created per query, then destroyed.
Args:
query (str): string containing the query to be executed.
query_config (QueryExecConfig)
Returns:
QueryResult
"""
# Create a client object to talk to impalad
exec_result = QueryResult(query, query_config=query_config)
plugin_runner = query_config.plugin_runner
@@ -203,23 +264,52 @@ def execute_using_impala_beeswax(query, query_config):
return construct_exec_result(result, exec_result)
def build_context(query, query_config):
"""Build context based on query config for plugin_runner.
Why not pass QueryExecConfig to plugins directly?
Args:
query (str)
query_config (QueryExecConfig)
Returns:
dict str -> str
"""
context = vars(query_config)
context['query'] = query
return context
def construct_exec_result(result, exec_result):
""" Transform an ImpalaBeeswaxResult object to a QueryResult object.
Args:
result (ImpalaBeeswaxResult): Transfers data from here.
exec_result (QueryResult): Transfers data to here.
Returns:
QueryResult
"""
Transform an ImpalaBeeswaxResult object to a QueryResult object.
"""
# Return immediately if the query failed.
if not result.success: return exec_result
exec_result.success = True
for attr in ['data', 'runtime_profile', 'start_time', 'time_taken', 'summary']:
attrs = ['data', 'runtime_profile', 'start_time',
'time_taken', 'summary', 'exec_summary']
for attr in attrs:
setattr(exec_result, attr, getattr(result, attr))
return exec_result
def execute_shell_cmd(cmd):
"""Executes a command in the shell, pipes the output to local variables"""
"""Executes a command in the shell, pipes the output to local variables
Args:
cmd (str): Command to be executed.
Returns:
(str, str, str): return code, stdout, stderr
"""
LOG.debug('Executing: %s' % (cmd,))
# Popen needs a list as its first parameter.
# The first element is the command, with the rest being arguments.

View File

@@ -34,12 +34,21 @@ LOG.setLevel(level=logging.DEBUG)
class Scheduler(object):
"""Schedules the submission of workloads across one of more clients.
A workload execution expects the following arguments:
query_executors: A list of initialized query executor objects.
shuffle: Change the order of execution of queries in a workload. By default, the queries
are executed sorted by query name.
num_clients: The degree of parallelism.
impalads: A list of impalads to connect to. Ignored when the executor is hive.
Args:
query_executors (list of QueryExecutor): the objects should be initialized.
shuffle (boolean): If True, change the order of execution of queries in a workload.
By default, the queries are executed sorted by query name.
num_clients (int): Number of concurrent clients.
impalads (list of str): A list of impalads to connect to. Ignored when the executor
is hive.
Attributes:
query_executors (list of QueryExecutor): initialized query executors
shuffle (boolean): shuffle query executors
iterations (int): number of iterations ALL query executors will run
query_iterations (int): number of times each query executor will execute
impalads (list of str?): list of impalads for execution. It is rotated after each execution.
num_clients (int): Number of concurrent clients
"""
def __init__(self, **kwargs):
self.query_executors = kwargs.get('query_executors')
@@ -77,7 +86,12 @@ class Scheduler(object):
return self.impalads[-1]
def __run_queries(self, thread_num):
"""Runs the list of query executors"""
"""This method is run by every thread concurrently.
Args:
thread_num (int): Thread number. Used for setting the client name in the result.
"""
# each thread gets its own copy of query_executors
query_executors = deepcopy(sorted(self.query_executors, key=lambda x: x.query.name))
for j in xrange(self.iterations):

View File

@@ -27,7 +27,18 @@ class Workload(object):
A workload is the internal representation for the set of queries on a dataset. It
consists of the dataset name, and a mapping of query names to query strings.
Args:
name (str): workload name. (Eg. tpch)
query_name_filters (list of str): List of regular expressions used for matching query
names
Attributes:
name (str): workload name (Eg. tpch)
__query_map (dict): contains a query name -> string mapping; mapping of query name to
section (ex. "TPCH-Q10" -> "select * from...")
"""
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
def __init__(self, name, query_name_filters=None):
@@ -82,7 +93,15 @@ class Workload(object):
Transform all the queries in the workload's query map to query objects based on the
input test vector and scale factor.
Args:
test_vector (?): query vector
scale_factor (str): eg. "300gb"
Returns:
(list of Query): these will be consumed by ?
"""
queries = list()
for query_name, query_str in self.__query_map.iteritems():
queries.append(Query(name=query_name,

View File

@@ -37,6 +37,19 @@ class WorkloadRunner(object):
Internally, for each workload, this module looks up and parses that workload's
query files and reads the workload's test vector to determine what combination(s)
of file format / compression to run with.
Args:
workload (Workload)
scale_factor (str): eg. "300gb"
config (WorkloadConfig)
Attributes:
workload (Workload)
scale_factor (str): eg. "300gb"
config (WorkloadConfig)
exit_on_error (boolean)
results (list of QueryResult)
__test_vectors (list of ?)
"""
def __init__(self, workload, scale_factor, config):
self.workload = workload
@@ -106,7 +119,7 @@ class WorkloadRunner(object):
self.exit_on_error)
query_executors.append(query_executor)
# Initialize the scheduler.
scheduler= Scheduler(query_executors=query_executors,
scheduler = Scheduler(query_executors=query_executors,
shuffle=self.config.shuffle_queries,
iterations=self.config.workload_iterations,
query_iterations=self.config.query_iterations,