Files
impala/tests/common/scheduler.py
Taras Bobrovytsky e94de02469 Added execution summary, modified benchmark to handle JSON
- Added execution summary to the beeswax client and QueryResult
- Modified report-benchmark-results to handle JSON and perform
  execution summary comparison between runs
- Added comments to the new workload runner

Change-Id: I9c3c5f2fdc5d8d1e70022c4077334bc44e3a2d1d
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3598
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: jenkins
(cherry picked from commit fd0b1406be2511c202e02fa63af94fbbe5e18eee)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3618
2014-07-25 21:06:00 -07:00

136 lines
5.5 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# The WorkloadExecutor class encapsulates the execution of a workload. A workload is
# defined as a set of queries for a given data set, scale factor and a specific test
# vector. It treats a workload as the unit of parallelism.
import logging
import os
from collections import defaultdict, deque
from copy import deepcopy
from random import shuffle
from sys import exit
from threading import Lock, Thread, Event
# Root logger: INFO and above, each record prefixed with the logger name and the
# name of the emitting thread.
logging.basicConfig(format='%(name)s %(threadName)s: %(message)s', level=logging.INFO)

# Module logger; lowered to DEBUG so this module's debug records pass the
# logger-level check even though the root logger is configured at INFO.
LOG = logging.getLogger('scheduler')
LOG.setLevel(logging.DEBUG)
class Scheduler(object):
    """Schedules the submission of workloads across one or more clients.

    Each client is a daemon thread that works on its own deep copy of the query
    executors, so executor state is never shared between threads.

    Args:
        query_executors (list of QueryExecutor): the objects should be initialized.
            Must contain at least one executor; the first one's `query.db` is used
            to build the thread names.
        shuffle (boolean): If True, change the order of execution of queries in a
            workload. By default, the queries are executed sorted by query name.
        iterations (int): Number of times every client runs the whole workload.
        query_iterations (int): Number of consecutive runs of each query.
        num_clients (int): Number of concurrent clients.
        impalads (iterable of str): Impalads to connect to. Ignored when the executor
            is hive. Anything that is not already a deque is copied into one.

    Attributes:
        query_executors (list of QueryExecutor): initialized query executors
        shuffle (boolean): shuffle query executors
        iterations (int): number of iterations ALL query executors will run
        query_iterations (int): number of times each query executor will execute
        impalads (deque of str, or None): impalads for execution. It is rotated
            after each execution.
        num_clients (int): Number of concurrent clients

    Raises:
        TypeError: if `query_executors` is not supplied.
        IndexError: if `query_executors` is empty.
    """

    def __init__(self, **kwargs):
        self.query_executors = kwargs.get('query_executors')
        self.shuffle = kwargs.get('shuffle', False)
        self.iterations = kwargs.get('iterations', 1)
        self.query_iterations = kwargs.get('query_iterations', 1)
        # Rotation relies on deque.rotate(), so accept any iterable (e.g. a plain
        # list) and convert it. A deque passed by the caller is kept as-is.
        impalads = kwargs.get('impalads')
        if impalads is not None and not isinstance(impalads, deque):
            impalads = deque(impalads)
        self.impalads = impalads
        self.num_clients = kwargs.get('num_clients', 1)
        self.__exit = Event()
        self.__results = list()
        self.__result_dict_lock = Lock()
        # Serializes rotate-and-read on the impalad deque shared by all clients.
        self.__impalad_lock = Lock()
        self.__thread_name = "[%s " % self.query_executors[0].query.db + "Thread %d]"
        self.__threads = []
        self.__create_workload_threads()

    @property
    def results(self):
        """Return execution results."""
        return self.__results

    def __create_workload_threads(self):
        """Create (but do not start) one workload thread per client.

        Each workload thread is analogous to a client and is identified by a unique
        number that is embedded in the thread name.
        """
        for thread_num in range(self.num_clients):
            thread = Thread(target=self.__run_queries, args=[thread_num],
                            name=self.__thread_name % thread_num)
            # Daemon threads do not keep the interpreter alive on their own.
            thread.daemon = True
            self.__threads.append(thread)

    def __get_next_impalad(self):
        """Rotate the impalad deque by one and return the impalad now at its end.

        Returns:
            The next impalad, or None when no impalads were configured.
        """
        with self.__impalad_lock:
            if not self.impalads:
                return None
            self.impalads.rotate(-1)
            return self.impalads[-1]

    def __record_result(self, query_executor, query_name, iteration, thread_num):
        """Log and store the executor's current result.

        Args:
            query_executor: executor whose `result` is recorded.
            query_name (str): name used in the log message.
            iteration (int): 1-based query iteration number.
            thread_num (int): 0-based client number; stored 1-based on the result.

        Returns:
            The result's `time_taken`, or 0 when the executor has no result (for
            example because it failed before producing one).
        """
        result = query_executor.result
        if result is None:
            return 0
        LOG.info("%s query iteration %d finished in %.2f seconds",
                 query_name, iteration, result.time_taken)
        result.client_name = thread_num + 1
        with self.__result_dict_lock:
            self.__results.append(result)
        return result.time_taken

    def __run_queries(self, thread_num):
        """This method is run by every thread concurrently.

        Args:
            thread_num (int): Thread number. Used for setting the client name in the
                result.
        """
        # Each thread gets its own copy of query_executors, ordered by query name.
        executors = deepcopy(sorted(self.query_executors, key=lambda x: x.query.name))
        for workload_iteration in range(self.iterations):
            # Randomize the order of execution for each iteration if specified.
            if self.shuffle:
                shuffle(executors)
            workload_time_sec = 0
            for query_executor in executors:
                query_name = query_executor.query.name
                LOG.info("Running Query: %s", query_name)
                for query_iteration in range(self.query_iterations):
                    if self.__exit.is_set():
                        LOG.error("Another thread failed, exiting.")
                        # Raises SystemExit, which ends only the current thread.
                        exit(1)
                    try:
                        query_executor.prepare(self.__get_next_impalad())
                        query_executor.execute()
                    # QueryExecutor only throws an exception if the query fails and
                    # abort_on_error is set to True. If abort_on_error is False, then
                    # the exception is logged on the console and execution moves on
                    # to the next query.
                    except Exception as e:
                        LOG.error("Query %s Failed: %s", query_name, str(e))
                        # Tell every client to stop before its next query run.
                        self.__exit.set()
                    finally:
                        workload_time_sec += self.__record_result(
                            query_executor, query_name, query_iteration + 1, thread_num)
            if self.query_iterations == 1:
                LOG.info("Workload iteration %d finished in %s seconds",
                         workload_iteration + 1, workload_time_sec)

    def run(self):
        """Run the query pipelines concurrently and wait for all of them to end.

        The threads are created once in the constructor, so this can be called only
        once per Scheduler (a thread cannot be started twice).
        """
        for thread in self.__threads:
            LOG.info("Starting %s", thread.name)
            thread.start()
        for thread in self.__threads:
            thread.join()
            LOG.info("Finished %s", thread.name)