Files
impala/tests/performance/query_executor.py
Todd Lipcon 30bb0b3d89 tests: ensure consistent logging format across tests
Many of the test modules included calls to 'logging.basicConfig' at
global scope in their implementation. This meant that by just importing
one of these files, other tests would inherit their logging format. This
is typically a bad idea in Python -- modules should not have side
effects like this on import.

The format was additionally inconsistent. In some cases we had a "--"
prepended to the format, and in others we didn't. The "--" is very
useful since it lets developers copy-paste query-test output back into
the shell to reproduce an issue.

This patch fixes the above by centralizing the logging configuration in
a pytest hook that runs prior to all pytests. A few other non-pytest
related tools now configure logging in their "main" code which is only
triggered when the module is executed directly.

I tested that, with this change, logs still show up properly in the .xml
output files from 'run-tests.py' as well as when running tests manually
from impala-py.test

Change-Id: I55ef0214b43f87da2d71804913ba4caa964f789f
Reviewed-on: http://gerrit.cloudera.org:8080/11225
Reviewed-by: Philip Zeyliger <philip@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-08-18 04:21:00 +00:00

277 lines
9.0 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Module used for executing queries and gathering results.
# The QueryExecutor is meant to be generic and doesn't
# have the knowledge of how to actually execute a query. It takes a query and its config
# and executes it against an executor function.
# For example (in pseudo-code):
#
# def exec_func(query, config):
# ...
#
# config = BeeswaxQueryExecConfig()
# executor = QueryExecutor('beeswax', query, exec_func, config, exit_on_error=True)
# executor.execute()
# result = executor.result
import logging
import os
import re
from tests.performance.query import Query
# Setup logging for this module.
LOG = logging.getLogger('query_executor')
LOG.setLevel(level=logging.INFO)

# Globals.
# Pattern for a 'Time taken: <digits><sep><digits> seconds' line; the two groups capture
# the digit runs on either side of the separator. Written as a raw string so that '\d'
# reaches the regex engine verbatim instead of relying on Python passing an unrecognized
# string escape through unchanged (which newer interpreters warn about).
# NOTE(review): the '.' between the groups is unescaped, so it matches any single
# character rather than only a literal decimal point. Kept as-is so that whatever
# callers match today is unchanged -- confirm whether r'\.' was intended.
hive_result_regex = r'Time taken: (\d*).(\d*) seconds'

# Match any CRUD statement that can follow EXPLAIN.
# The statement may begin with SQL line comments starting with --
COMMENT_LINES_REGEX = r'(?:\s*--.*\n)*'
DDL_CRUD_PATTERN = re.compile(
    COMMENT_LINES_REGEX +
    r'\s*((CREATE|DELETE|INSERT|SELECT|UPDATE|UPSERT|WITH)\s|VALUES\s*\()',
    re.IGNORECASE)
## TODO: Split executors into their own modules.
class QueryExecConfig(object):
  """Common base class for the query execution configs.

  Attributes:
    plugin_runner: Whatever was passed to the constructor (None by default). This class
      only stores it.
  """

  def __init__(self, plugin_runner=None):
    # Stored verbatim; nothing in this class interprets the value.
    self.plugin_runner = plugin_runner
class ImpalaQueryExecConfig(QueryExecConfig):
  """Execution config shared by the Impala-based executors.

  Attributes:
    impalad (str): address of impalad <host>:<port>
  """

  def __init__(self, plugin_runner=None, impalad='localhost:21000'):
    super(ImpalaQueryExecConfig, self).__init__(plugin_runner=plugin_runner)
    # Backing field for the 'impalad' property below.
    self._impalad = impalad

  @property
  def impalad(self):
    """The impalad address currently held by this config."""
    return self._impalad

  @impalad.setter
  def impalad(self, address):
    self._impalad = address
class JdbcQueryExecConfig(ImpalaQueryExecConfig):
  """Impala query execution config for jdbc

  Attributes:
    transport: Value substituted after the -t flag of the jdbc client command. Must not
      be None by the time jdbc_client_cmd is read.
  """

  # Resolved once, when the class body runs; needs IMPALA_HOME in the environment.
  JDBC_CLIENT_PATH = os.path.join(os.environ['IMPALA_HOME'], 'bin/run-jdbc-client.sh')

  def __init__(self, plugin_runner=None, impalad='localhost:21050', transport=None):
    super(JdbcQueryExecConfig, self).__init__(impalad=impalad,
                                              plugin_runner=plugin_runner)
    self.transport = transport

  @property
  def jdbc_client_cmd(self):
    """Command line string used to run the jdbc client.

    Built afresh on every access, since the impalad it points to can change.
    """
    assert self.transport is not None
    client_args = ' -i "%s" -t %s' % (self._impalad, self.transport)
    return JdbcQueryExecConfig.JDBC_CLIENT_PATH + client_args
class ImpalaHS2QueryConfig(ImpalaQueryExecConfig):
  """Impala query execution config for HS2.

  Attributes:
    use_kerberos (boolean): stored as passed to the constructor (False by default)
  """

  def __init__(self, use_kerberos=False, impalad="localhost:21050", plugin_runner=None):
    # The base class keeps the impalad address and the plugin runner.
    super(ImpalaHS2QueryConfig, self).__init__(
        plugin_runner=plugin_runner,
        impalad=impalad)
    # TODO Use a config dict for query execution options similar to HS2
    self.use_kerberos = use_kerberos
class HiveHS2QueryConfig(QueryExecConfig):
  """Query execution config for Hive over HS2.

  Args:
    plugin_runner: forwarded to QueryExecConfig, which stores it as self.plugin_runner
    exec_options (str): String formatted as "key1:val1;key2:val2"
    use_kerberos (boolean)
    user: stored as given
    hiveserver (str): stored as given

  Attributes:
    exec_options (dict str -> str): execution options, keyed by lower-cased option name
    use_kerberos (boolean)
    user
    hiveserver (str)
  """

  def __init__(self,
               plugin_runner=None,
               exec_options=None,
               use_kerberos=False,
               user=None,
               hiveserver='localhost'):
    # Forward plugin_runner to the base class. Calling the base __init__ without it
    # would leave self.plugin_runner as None no matter what the caller passed.
    super(HiveHS2QueryConfig, self).__init__(plugin_runner=plugin_runner)
    self.exec_options = dict()
    self._build_options(exec_options)
    self.use_kerberos = use_kerberos
    self.user = user
    self.hiveserver = hiveserver

  def _build_options(self, exec_options):
    """Read the exec_options into self.exec_options

    Args:
      exec_options (str): String formatted as "key1:val1;key2:val2". None or an empty
        string leaves self.exec_options untouched.

    Raises:
      ValueError: if a non-blank option contains no ':' separator.
    """
    if not exec_options:
      return
    # exec_options are separated by ; on the command line
    for option in exec_options.split(';'):
      if not option.strip():
        # Blank fragment, e.g. from a trailing or doubled ';' -- nothing to record.
        continue
      # Split on the first ':' only, so that a value may itself contain ':'.
      key, value = option.split(':', 1)
      # The keys in HiveService QueryOptions are lower case.
      self.exec_options[key.lower()] = value
class BeeswaxQueryExecConfig(ImpalaQueryExecConfig):
  """Impala query execution config for beeswax

  Args:
    use_kerberos (boolean)
    exec_options (str): String formatted as "opt1:val1;opt2:val2"
    impalad (str): address of impalad <host>:<port>
    plugin_runner: forwarded to the base class, which stores it

  Attributes:
    use_kerberos (boolean)
    exec_options (dict str -> str): execution options, keyed by upper-cased option name
  """

  def __init__(self, use_kerberos=False, exec_options=None, impalad='localhost:21000',
               plugin_runner=None):
    super(BeeswaxQueryExecConfig, self).__init__(plugin_runner=plugin_runner,
                                                 impalad=impalad)
    self.use_kerberos = use_kerberos
    self.exec_options = dict()
    self._build_options(exec_options)

  def _build_options(self, exec_options):
    """Read the exec_options into self.exec_options

    Args:
      exec_options (str): String formatted as "opt1:val1;opt2:val2". None or an empty
        string leaves self.exec_options untouched.

    Raises:
      ValueError: if a non-blank option contains no ':' separator.
    """
    if not exec_options:
      return
    # exec_options are separated by ; on the command line
    for option in exec_options.split(';'):
      if not option.strip():
        # Blank fragment, e.g. from a trailing or doubled ';' -- nothing to record.
        continue
      # Split on the first ':' only, so that a value may itself contain ':'.
      key, value = option.split(':', 1)
      # The keys in ImpalaService QueryOptions are upper case.
      self.exec_options[key.upper()] = value
class QueryExecutor(object):
  """Executes one or more queries.

  Args:
    name (str): eg. "hive"
    query (Query): Container holding 1 or more ;-delimited SQL statements to be executed
    func (function): Called as func(query, config) for every statement; its return value
      is kept as the result. Eg. execute_using_impala_beeswax
    config (QueryExecConfig): passed through unchanged to func
    exit_on_error (boolean): Exit right after an error encountered.

  Attributes:
    exec_func (function): The func argument.
    exec_config (QueryExecConfig): The config argument.
    query (Query): Container holding 1 or more ;-delimited SQL statements to be executed
    exit_on_error (boolean): Exit right after an error encountered.
    executor_name (str): eg. "hive"
    result (ImpalaQueryResult): Contains the result after execute method is called.
  """

  def __init__(self, name, query, func, config, exit_on_error):
    self.exec_func = func
    self.exec_config = config
    self.query = query
    self.exit_on_error = exit_on_error
    self.executor_name = name
    # Populated by execute(); None until then.
    self._result = None

  def prepare(self, impalad):
    """Prepare the query to be run.

    For now, this sets the impalad that the query connects to. If the executor is hive,
    it's a no op.
    """
    if 'hive' not in self.executor_name:
      self.exec_config.impalad = impalad

  def execute(self, plan_first=False):
    """Execute a set of SQL statements using the given execution function.

    SQL statements can be the familiar SELECT, INSERT, DELETE, UPDATE and UPSERT DML
    commands as well as utilities like SET, SHOW, VERSION, etc.

    If plan_first is true, EXPLAIN the "explainable" queries in the set
    first so timing does not include the initial metadata loading
    required for planning.

    This function furnishes a query result object in self._result, for the last
    query in the batch ONLY.

    Raises:
      RuntimeError: if a statement's result reports failure and exit_on_error is set.
    """
    assert isinstance(self.query, Query)
    # self.query.query_str is rewritten per statement below and restored in 'finally'.
    orig_query_str = self.query.query_str
    try:
      # Break out multiple statements in self.query.
      # NOTE(review): this is a plain split, so a ';' inside a string literal or a
      # trailing ';' produces extra fragments that are executed as well -- confirm that
      # callers never pass such query strings.
      statements = self.query.query_str.split(';')
      if plan_first:
        for stmt in statements:
          if not DDL_CRUD_PATTERN.match(stmt):
            # Don't EXPLAIN this statement
            continue
          self.query.query_str = 'EXPLAIN ' + stmt + ';'
          # Lazy logging args: the statement is only formatted if DEBUG is enabled.
          LOG.debug('Planning %s', self.query.query_str)
          self.exec_func(self.query, self.exec_config)
      # Now actually execute
      for stmt in statements:
        self.query.query_str = stmt + ';'
        LOG.debug('Executing %s', self.query.query_str)
        self._result = self.exec_func(self.query, self.exec_config)
        if not self._result.success:
          if self.exit_on_error:
            raise RuntimeError(self._result.query_error)
          else:
            LOG.info("Continuing execution")
    finally:
      self.query.query_str = orig_query_str
      # We do not need to restore the SET options we changed for the next batch
      # (Query object) because the scheduler runs each Query on its own connection
      # (XXX pretty strange for a performance test. :)

  @property
  def result(self):
    """Getter for the result of the query execution.

    A result is a ImpalaQueryResult object that contains the details of a single run of
    the query.
    """
    return self._result