Files
impala/tests/performance/workload_runner.py
Todd Lipcon 30bb0b3d89 tests: ensure consistent logging format across tests
Many of the test modules included calls to 'logging.basicConfig' at
global scope in their implementation. This meant that by just importing
one of these files, other tests would inherit their logging format. This
is typically a bad idea in Python -- modules should not have side
effects like this on import.

The format was additionally inconsistent. In some cases we had a "--"
prepended to the format, and in others we didn't. The "--" is very
useful since it lets developers copy-paste query-test output back into
the shell to reproduce an issue.

This patch fixes the above by centralizing the logging configuration in
a pytest hook that runs prior to all pytests. A few other non-pytest
related tools now configure logging in their "main" code which is only
triggered when the module is executed directly.

I tested that, with this change, logs still show up properly in the .xml
output files from 'run-tests.py', as well as when running tests manually
via impala-py.test.

Change-Id: I55ef0214b43f87da2d71804913ba4caa964f789f
Reviewed-on: http://gerrit.cloudera.org:8080/11225
Reviewed-by: Philip Zeyliger <philip@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-08-18 04:21:00 +00:00

156 lines
5.5 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
from tests.common.test_dimensions import (
TableFormatInfo,
get_dataset_from_workload,
load_table_info_dimension)
from tests.performance.query_executor import (
BeeswaxQueryExecConfig,
HiveHS2QueryConfig,
ImpalaHS2QueryConfig,
JdbcQueryExecConfig,
QueryExecutor)
from tests.performance.query_exec_functions import (
execute_using_hive_hs2,
execute_using_impala_beeswax,
execute_using_impala_hs2,
execute_using_jdbc)
from tests.performance.scheduler import Scheduler
# Module-level logger for this file; WorkloadRunner raises it to DEBUG when
# config.verbose is set.
LOG = logging.getLogger('workload_runner')
class WorkloadRunner(object):
  """Runs query files and captures results from the specified workload(s).

  The usage is:
  1) Initialize WorkloadRunner with desired execution parameters.
  2) Call workload_runner.run()

  Internally, for each workload, this module looks up and parses that workload's
  query files and reads the workload's test vector to determine what combination(s)
  of file format / compression to run with.

  Args:
    workload (Workload)
    scale_factor (str): eg. "300gb"
    config (WorkloadConfig)

  Attributes:
    workload (Workload)
    scale_factor (str): eg. "300gb"
    config (WorkloadConfig)
    exit_on_error (boolean): True unless config.continue_on_query_error is set.
    results (list of ImpalaQueryResult): accumulated across all test vectors.
    _test_vectors (list): table-format vectors the workload is run against.
  """

  def __init__(self, workload, scale_factor, config):
    self.workload = workload
    self.scale_factor = scale_factor
    self.config = config
    self.exit_on_error = not self.config.continue_on_query_error
    # Verbose mode only widens this module's logger, not the root logger.
    if self.config.verbose: LOG.setLevel(level=logging.DEBUG)
    self._generate_test_vectors()
    self._results = list()

  @property
  def results(self):
    """Results collected so far, across all executed test vectors."""
    return self._results

  def _generate_test_vectors(self):
    """Generate test vector objects.

    If the user has specified a set for table_formats, generate them, otherwise
    generate vectors for all table formats within the specified exploration
    strategy.
    """
    self._test_vectors = []
    if self.config.table_formats:
      dataset = get_dataset_from_workload(self.workload.name)
      for tf in self.config.table_formats:
        self._test_vectors.append(TableFormatInfo.create_from_string(dataset, tf))
    else:
      vectors = load_table_info_dimension(self.workload.name,
          self.config.exploration_strategy)
      self._test_vectors = [vector.value for vector in vectors]

  def _create_executor(self, executor_name):
    """Look up the (exec_function, exec_config) pair for executor_name.

    The dispatch table maps each supported executor name to a lambda so that
    only the config object for the selected executor is actually constructed.

    Args:
      executor_name (str): "<exec_engine>_<client_type>", e.g. "impala_beeswax".

    Returns:
      (callable, QueryExecConfig) tuple for the requested executor.

    Raises:
      KeyError: if executor_name is not supported; the message lists the
        valid executor names (a bare dict lookup used to raise an
        uninformative KeyError here).
    """
    factories = {
        'impala_beeswax': lambda: (execute_using_impala_beeswax,
            BeeswaxQueryExecConfig(plugin_runner=self.config.plugin_runner,
                exec_options=self.config.exec_options,
                use_kerberos=self.config.use_kerberos)),
        'impala_jdbc': lambda: (execute_using_jdbc,
            JdbcQueryExecConfig(plugin_runner=self.config.plugin_runner)),
        'impala_hs2': lambda: (execute_using_impala_hs2,
            ImpalaHS2QueryConfig(plugin_runner=self.config.plugin_runner,
                use_kerberos=self.config.use_kerberos)),
        'hive_hs2': lambda: (execute_using_hive_hs2,
            HiveHS2QueryConfig(hiveserver=self.config.hiveserver,
                plugin_runner=self.config.plugin_runner,
                exec_options=self.config.exec_options,
                user=self.config.user,
                use_kerberos=self.config.use_kerberos))
    }
    try:
      return factories[executor_name]()
    except KeyError:
      # Preserve the exception type but make the failure actionable.
      raise KeyError("Unknown executor '%s'; supported executors: %s" %
          (executor_name, ', '.join(sorted(factories))))

  def _execute_queries(self, queries):
    """Execute a set of queries.

    Create query executors for each query, and pass them along with config
    information to the scheduler, which runs them and appends its results to
    self._results.
    """
    executor_name = "{0}_{1}".format(self.config.exec_engine, self.config.client_type)
    exec_func, exec_config = self._create_executor(executor_name)
    # Build an executor for each query.
    query_executors = []
    for query in queries:
      query_executor = QueryExecutor(executor_name,
          query,
          exec_func,
          exec_config,
          self.exit_on_error)
      query_executors.append(query_executor)
    # Initialize the scheduler; 'plan_first' may be absent on older configs,
    # so default it to False.
    scheduler = Scheduler(query_executors=query_executors,
        shuffle=self.config.shuffle_queries,
        iterations=self.config.workload_iterations,
        query_iterations=self.config.query_iterations,
        impalads=self.config.impalads,
        num_clients=self.config.num_clients,
        plan_first=getattr(self.config, 'plan_first', False))
    scheduler.run()
    self._results.extend(scheduler.results)

  def run(self):
    """Runs the workload against all test vectors serially and stores the results."""
    for test_vector in self._test_vectors:
      # Transform the query strings to Query objects for a combination of
      # scale factor and the test vector.
      queries = self.workload.construct_queries(test_vector, self.scale_factor)
      self._execute_queries(queries)