mirror of
https://github.com/apache/impala.git
synced 2026-01-07 00:02:28 -05:00
This patch introduces new abstractions and changes the way queries are run via the workload runner. A new class 'Workload' is introduced, which represents the notion of a workload in the performance framework (i.e., a set of query names mapped to query strings). The new workflow is: - run-workload acts as a driver. It accepts user parameters for which queries to run and their execution strategy. It generates workload objects and passes them to the workload-runner. - The workload runner takes a workload and its execution parameters, and generates a set of test vectors over which the workload is run iteratively. - A workload is executed by initializing a QueryExecutor for each query being run in a test vector. The workload executor is then responsible for execution and gathering results. - The execution details of every query being executed are stored and returned to the driver (run-workload). Change-Id: Ia16360140d65e6733e534e823bc5d5614622ab5f Reviewed-on: http://gerrit.ent.cloudera.com:8080/3616 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: jenkins
398 lines
17 KiB
Python
Executable File
398 lines
17 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
# The base class that should be used for almost all Impala tests
|
|
|
|
import logging
|
|
import os
|
|
import pprint
|
|
import pytest
|
|
from functools import wraps
|
|
from random import choice
|
|
from tests.common.impala_service import ImpaladService
|
|
from tests.common.impala_connection import ImpalaConnection, create_connection
|
|
from tests.common.test_dimensions import *
|
|
from tests.common.test_result_verifier import *
|
|
from tests.common.test_vector import *
|
|
from tests.common.query import Query
|
|
from tests.util.test_file_parser import *
|
|
from tests.util.thrift_util import create_transport
|
|
from tests.common.base_test_suite import BaseTestSuite
|
|
from tests.common.query_executor import JdbcQueryExecConfig, execute_using_jdbc
|
|
from tests.util.hdfs_util import HdfsConfig, get_hdfs_client, get_hdfs_client_from_conf
|
|
|
|
# Imports required for Hive Metastore Client
|
|
from hive_metastore import ThriftHiveMetastore
|
|
from thrift.transport import TTransport, TSocket
|
|
from thrift.protocol import TBinaryProtocol
|
|
|
|
# Module-wide logging: INFO and above, every message prefixed with '-- '.
logging.basicConfig(level=logging.INFO, format='-- %(message)s')
LOG = logging.getLogger('impala_test_suite')

# Comma separated list of impalads taken from the 'impalad' pytest option. Blank
# entries and surrounding whitespace are discarded: ''.split(',') returns [''], so a
# length check on the raw split result could never detect a missing value.
IMPALAD_HOST_PORT_LIST = [
    host_port.strip()
    for host_port in pytest.config.option.impalad.split(',')
    if host_port.strip()]
assert len(IMPALAD_HOST_PORT_LIST) > 0, 'Must specify at least 1 impalad to target'
# The first impalad in the list is the default target.
IMPALAD = IMPALAD_HOST_PORT_LIST[0]
# The part of the default impalad entry before ':' combined with the
# 'impalad_hs2_port' option.
IMPALAD_HS2_HOST_PORT =\
    IMPALAD.split(':')[0] + ":" + pytest.config.option.impalad_hs2_port
HIVE_HS2_HOST_PORT = pytest.config.option.hive_server2
# Raises KeyError at import time when IMPALA_WORKLOAD_DIR is not set in the environment.
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
HDFS_CONF = HdfsConfig(pytest.config.option.minicluster_xml_conf)
|
|
|
|
# Base class for Impala tests. All Impala test cases should inherit from this class.
|
|
class ImpalaTestSuite(BaseTestSuite):
|
|
@classmethod
def add_test_dimensions(cls):
  """
  Hook through which a suite registers its test dimensions.

  After delegating to the parent class, the default implementation adds the table info
  dimension built for the current exploration strategy, followed by the exec option
  dimension. Suites that need more or different dimensions can override this method.
  """
  super(ImpalaTestSuite, cls).add_test_dimensions()

  strategy = cls.exploration_strategy()
  table_info_dimension = cls.create_table_info_dimension(strategy)
  cls.TestMatrix.add_dimension(table_info_dimension)

  exec_option_dimension = cls.__create_exec_option_dimension()
  cls.TestMatrix.add_dimension(exec_option_dimension)
|
|
|
|
@classmethod
def setup_class(cls):
  """
  Setup section that runs before each test suite.

  Populates the following class attributes:
    hive_transport / hive_client - an opened transport and the Hive Metastore Thrift
                                   client that uses it
    client                       - Impala client connected to the default impalad
    impalad_test_service         - ImpaladService for the default impalad's host
    hdfs_client                  - HDFS client, built either from the minicluster
                                   configuration or from the 'namenode_http_address'
                                   option when that option is set
  """
  cls.hive_client, cls.client = [None, None]
  # Create a Hive Metastore Client (used for executing some test SETUP steps).
  metastore_host, metastore_port = pytest.config.option.metastore_server.split(':')
  trans_type = 'kerberos' if pytest.config.option.use_kerberos else 'buffered'
  cls.hive_transport = create_transport(
      host=metastore_host,
      port=metastore_port,
      service=pytest.config.option.hive_service_name,
      transport_type=trans_type)
  protocol = TBinaryProtocol.TBinaryProtocol(cls.hive_transport)
  cls.hive_client = ThriftHiveMetastore.Client(protocol)
  cls.hive_transport.open()

  # Create a connection to Impala.
  cls.client = cls.create_impala_client(IMPALAD)

  cls.impalad_test_service = ImpaladService(IMPALAD.split(':')[0])
  if pytest.config.option.namenode_http_address is None:
    cls.hdfs_client = get_hdfs_client_from_conf(HDFS_CONF)
  else:
    # The explicitly requested namenode address was previously parsed and then
    # discarded, so the option had no effect on the client that was created.
    # NOTE(review): assumes get_hdfs_client accepts (host, port) - confirm against
    # tests.util.hdfs_util.
    host, port = pytest.config.option.namenode_http_address.split(":")
    cls.hdfs_client = get_hdfs_client(host, port)
|
|
|
|
@classmethod
def teardown_class(cls):
  """Teardown section that runs after each test suite"""
  # Release the Hive Metastore transport first, then the Impala client connection.
  # Either one is skipped when it is unset (falsy).
  hive_transport = cls.hive_transport
  if hive_transport:
    hive_transport.close()

  impala_client = cls.client
  if impala_client:
    impala_client.close()
|
|
|
|
@classmethod
def create_impala_client(cls, host_port=IMPALAD):
  """
  Creates a connection to the impalad at 'host_port' (the default impalad when
  omitted), calls connect() on it and returns it. Kerberos usage follows the
  'use_kerberos' pytest option.
  """
  kerberized = pytest.config.option.use_kerberos
  impala_client = create_connection(host_port=host_port, use_kerberos=kerberized)
  impala_client.connect()
  return impala_client
|
|
|
|
@classmethod
def cleanup_db(cls, db_name):
  """
  Drops the database 'db_name' together with the tables, views, functions and
  aggregate functions it contains. Nothing is dropped when 'db_name' is not listed
  by 'show databases'.

  Regardless of whether the database exists, the class-level client is switched to
  the 'default' database and set_configuration({'sync_ddl': 1}) is applied to it.
  """
  client = cls.client
  client.execute("use default")
  client.set_configuration({'sync_ddl': 1})

  if db_name not in client.execute("show databases").data:
    return

  # To drop a db, we need to first drop everything it contains.
  # We use quoted identifiers to avoid name clashes with keywords.
  quoted_db = "`" + db_name + "`"
  for tbl_name in client.execute("show tables in " + quoted_db).data:
    full_tbl_name = '`%s`.`%s`' % (db_name, tbl_name)
    description = client.execute("describe formatted " + full_tbl_name)
    # Views have to be removed with 'drop view' rather than 'drop table'.
    if 'VIRTUAL_VIEW' in '\n'.join(description.data):
      client.execute("drop view " + full_tbl_name)
    else:
      client.execute("drop table " + full_tbl_name)

  # Regular functions first, then aggregate functions; both listings share a format.
  for listing_stmt in ("show functions in ", "show aggregate functions in "):
    for fn_result in client.execute(listing_stmt + quoted_db).data:
      # First column is the return type, second is the function signature
      fn_name = fn_result.split('\t')[1]
      client.execute("drop function `%s`.%s" % (db_name, fn_name))

  client.execute("drop database " + quoted_db)
|
|
|
|
def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False,
    encoding=None):
  """
  Runs the queries in the specified test based on the vector values

  Runs each query targeting the file format/compression specified in the test
  vector and with the exec options specified in the test vector. If
  multiple_impalad=True a connection to a random impalad will be chosen to execute
  each test section; those extra connections are closed before this method returns.
  Otherwise, the default impalad client will be used.
  Additionally, the encoding for all test data can be specified using the 'encoding'
  parameter. This is useful when data is ingested in a different encoding (ex.
  latin). If not set, the default system encoding will be used.

  A section containing CATCH is expected to raise an error whose text contains the
  CATCH value; all other sections have the result of their final query verified.
  When the 'update_results' pytest option is set the sections are written to a .test
  file under /tmp once every section has been processed.
  """
  table_format_info = vector.get_value('table_format')
  exec_options = vector.get_value('exec_option')

  target_impalad_clients = list()
  try:
    if multiple_impalad:
      # Built incrementally so that connections opened before a failure are still
      # closed below. A real list is also what random.choice() requires.
      for host_port in IMPALAD_HOST_PORT_LIST:
        target_impalad_clients.append(ImpalaTestSuite.create_impala_client(host_port))
    else:
      target_impalad_clients.append(self.client)

    # Change the database to reflect the file_format, compression codec etc, or the
    # user specified database for all targeted impalad.
    for impalad_client in target_impalad_clients:
      ImpalaTestSuite.change_database(impalad_client,
          table_format_info, use_db, pytest.config.option.scale_factor)
      impalad_client.set_configuration(exec_options)

    sections = self.load_query_test_file(self.get_workload(), test_file_name,
        encoding=encoding)
    for test_section in sections:
      if 'QUERY' not in test_section:
        assert 0, 'Error in test file %s. Test cases require a -- QUERY section.\n%s' %\
            (test_file_name, pprint.pformat(test_section))

      if 'SETUP' in test_section:
        self.execute_test_case_setup(test_section['SETUP'], table_format_info)

      # TODO: support running query tests against different scale factors
      query = QueryTestSectionReader.build_query(test_section['QUERY'])

      if 'QUERY_NAME' in test_section:
        LOG.info('Query Name: \n%s\n' % test_section['QUERY_NAME'])

      # Support running multiple queries within the same test section, only verifying
      # the result of the final query. The main use case is to allow for 'USE database'
      # statements before a query executes, but it is not limited to that.
      # TODO: consider supporting result verification of all queries in the future
      result = None
      target_impalad_client = choice(target_impalad_clients)
      try:
        for statement in query.split(';'):
          result = self.__execute_query(target_impalad_client, statement)
      except Exception as e:
        if 'CATCH' in test_section:
          # The failure was expected; only its message needs to match.
          assert test_section['CATCH'].strip() in str(e)
          continue
        raise

      # Reaching this point means no error was raised, which is only acceptable for
      # a CATCH section when the expected error text is empty.
      if 'CATCH' in test_section:
        assert test_section['CATCH'].strip() == ''

      assert result is not None
      assert result.success

      # Decode the results read back if the data is stored with a specific encoding.
      if encoding: result.data = [row.decode(encoding) for row in result.data]

      verify_raw_results(test_section, result,
                         vector.get_value('table_format').file_format,
                         pytest.config.option.update_results)
    if pytest.config.option.update_results:
      output_file = os.path.join('/tmp', test_file_name.replace('/','_') + ".test")
      write_test_file(output_file, sections, encoding=encoding)
  finally:
    # Only connections created by this call are closed; the shared default client
    # (self.client) stays open.
    if multiple_impalad:
      for impalad_client in target_impalad_clients:
        impalad_client.close()
|
|
|
|
def execute_test_case_setup(self, setup_section, table_format):
  """
  Executes a test case 'SETUP' section

  'SETUP' sections are mainly used by insert tests, which need the target tables to
  be empty before each test case runs. Every line of the section must be one of the
  supported actions:
    RESET <table name> - Drop and recreate the table
    DROP PARTITIONS <table name> - Drop all partitions from the table
  After either action an 'invalidate metadata' is issued for the table through the
  default Impala client. Any other line fails an assertion.
  """
  commands = QueryTestSectionReader.build_query(setup_section)
  for raw_row in commands.split('\n'):
    row = raw_row.lstrip()
    if row.startswith('RESET'):
      keyword, action = 'RESET', self.__reset_table
    elif row.startswith('DROP PARTITIONS'):
      keyword, action = 'DROP PARTITIONS', self.__drop_partitions
    else:
      assert False, 'Unsupported setup command: %s' % row
      # Only reachable when assertions are disabled; the row is then ignored.
      continue

    # Everything after the keyword names the table the action applies to.
    db_name, table_name = QueryTestSectionReader.get_table_name_components(
        table_format, row.split(keyword)[1])
    action(db_name, table_name)
    self.client.execute("invalidate metadata " + db_name + "." + table_name)
|
|
|
|
@classmethod
def change_database(cls, impala_client, table_format=None,
    db_name=None, scale_factor=None):
  """
  Issues a 'use <database>' statement through 'impala_client'.

  The database is 'db_name' when given. Otherwise it is derived from 'table_format'
  (which must then be provided) and 'scale_factor', where a missing scale factor is
  passed on as the empty string.

  Note that the client's configuration is cleared as a side effect.
  """
  if db_name is None:
    assert table_format is not None
    db_name = QueryTestSectionReader.get_db_name(table_format,
        scale_factor if scale_factor else '')
  query = 'use %s' % db_name
  # Clear the exec_options before executing a USE statement.
  # The USE statement should not fail for negative exec_option tests.
  impala_client.clear_configuration()
  impala_client.execute(query)
|
|
|
|
def execute_wrapper(function):
  """
  Decorator that issues a use database query before the wrapped method executes.

  Database names depend on the table's input format while the table names stay the
  same, so the target database has to be selected before a query is executed. This
  decorator gives the different execute methods a common way of doing that. The
  wrapped method accepts two extra keyword arguments, neither of which is forwarded
  to 'function':
    table_format - the table format passed to change_database()
    vector       - a test vector; its 'table_format' value takes precedence when
                   both keywords are supplied
  When neither yields a table format the current database is left unchanged.
  """
  @wraps(function)
  def wrapper(*args, **kwargs):
    # Always strip both keywords. Checking for truthiness before deleting used to
    # leak an explicit table_format=None / vector=None through to 'function', which
    # does not accept those parameters.
    table_format = kwargs.pop('table_format', None) or None
    vector = kwargs.pop('vector', None)
    if vector:
      table_format = vector.get_value('table_format')
    # self is the implicit first argument
    if table_format is not None:
      args[0].change_database(args[0].client, table_format)
    return function(*args, **kwargs)
  return wrapper
|
|
|
|
@execute_wrapper
def execute_query_expect_success(self, impalad_client, query, query_options=None):
  """Runs 'query' through 'impalad_client', asserts that the result is flagged as
  successful and returns the result"""
  query_result = self.__execute_query(impalad_client, query, query_options)
  assert query_result.success
  return query_result
|
|
|
|
@execute_wrapper
def execute_query(self, query, query_options=None):
  """Runs 'query' through the suite's default client, applying 'query_options' first
  when they are given, and returns the result"""
  default_client = self.client
  return self.__execute_query(default_client, query, query_options)
|
|
|
|
def execute_query_using_client(self, client, query, vector):
  """Switches 'client' to the database matching the vector's 'table_format' value,
  then runs 'query' through that client and returns the result"""
  table_format = vector.get_value('table_format')
  self.change_database(client, table_format)
  query_result = client.execute(query)
  return query_result
|
|
|
|
@execute_wrapper
def execute_query_async(self, query, query_options=None):
  """Submits 'query' through the default client's execute_async() and returns
  whatever that call returns. 'query_options', when given, are applied to the client
  first."""
  # Only touch the client's configuration when options were actually supplied, the
  # same way __execute_query does; previously None was passed to set_configuration().
  if query_options is not None: self.client.set_configuration(query_options)
  return self.client.execute_async(query)
|
|
|
|
@execute_wrapper
def close_query(self, query):
  """Closes 'query' through the suite's default client and returns the client's
  response"""
  # NOTE(review): 'query' is presumably the value returned by execute_query_async -
  # confirm against the connection class.
  default_client = self.client
  return default_client.close_query(query)
|
|
|
|
@execute_wrapper
def execute_scalar(self, query, query_options=None):
  """Runs 'query' through the default client and returns its single result row, or
  None when no rows come back. More than one row fails an assertion."""
  rows = self.__execute_query(self.client, query, query_options).data
  assert len(rows) <= 1, 'Multiple values returned from scalar'
  if len(rows) == 1:
    return rows[0]
  return None
|
|
|
|
def exec_and_compare_hive_and_impala_hs2(self, stmt):
  """Compare Hive and Impala results when executing the same statement over HS2

  Runs 'stmt' through JDBC, first against HIVE_HS2_HOST_PORT and then against
  IMPALAD_HS2_HOST_PORT, and asserts that both return the same rows in the same
  order."""
  # execute_using_jdbc expects a Query object. Convert the query string into a Query
  # object
  query = Query()
  query.query_str = stmt
  # Run the statement targeting Hive
  exec_opts = JdbcQueryExecConfig(impalad=HIVE_HS2_HOST_PORT)
  hive_results = execute_using_jdbc(query, exec_opts).data

  # Run the statement targeting Impala
  exec_opts = JdbcQueryExecConfig(impalad=IMPALAD_HS2_HOST_PORT)
  impala_results = execute_using_jdbc(query, exec_opts).data

  # Compare the results
  assert (impala_results is not None) and (hive_results is not None)
  # zip() stops at the shorter input, so without this check a result set that is
  # merely a prefix of the other one would be reported as equal.
  assert len(impala_results) == len(hive_results),\
      'Impala returned %d rows, Hive returned %d rows' %\
      (len(impala_results), len(hive_results))
  for impala, hive in zip(impala_results, hive_results):
    assert impala == hive
|
|
|
|
def load_query_test_file(self, workload, file_name, valid_section_names=None,
    encoding=None):
  """
  Parses the query test file <WORKLOAD_DIR>/<workload>/queries/<file_name>.test and
  returns the parser's output. 'valid_section_names' and 'encoding' are handed to
  the parser as given; with valid_section_names=None a default list of valid section
  names is used. A missing file fails an assertion.
  """
  queries_dir = os.path.join(WORKLOAD_DIR, workload, 'queries')
  test_file_path = os.path.join(queries_dir, file_name + '.test')
  assert os.path.isfile(test_file_path), 'Test file not found: %s' % file_name
  return parse_query_test_file(test_file_path, valid_section_names, encoding=encoding)
|
|
|
|
def __drop_partitions(self, db_name, table_name):
  """Drops every partition of db_name.table_name through the Hive Metastore client"""
  metastore = self.hive_client
  # NOTE(review): the trailing 0 is presumably the metastore's 'max parts' argument
  # and the trailing True its 'delete data' flag - confirm against the Thrift API.
  partition_names = metastore.get_partition_names(db_name, table_name, 0)
  for partition_name in partition_names:
    metastore.drop_partition_by_name(db_name, table_name, partition_name, True)
|
|
|
|
def __execute_query(self, impalad_client, query, query_options=None):
  """Runs 'query' through 'impalad_client' and returns the result. 'query_options'
  are applied to the client beforehand unless they are None."""
  if query_options is None:
    return impalad_client.execute(query)
  impalad_client.set_configuration(query_options)
  return impalad_client.execute(query)
|
|
|
|
def __execute_query_new_client(self, query, query_options=None,
    use_kerberos=False):
  """Executes the given query through a newly created client connected to the
  default impalad and returns the result. 'query_options', when given, are applied
  to the new client first."""
  # NOTE(review): 'use_kerberos' is accepted but never read; create_impala_client()
  # takes the Kerberos setting from the pytest options instead.
  new_client = self.create_impala_client()
  # Skip configuration when no options were supplied, the same way __execute_query
  # does; previously None was passed to set_configuration().
  if query_options is not None: new_client.set_configuration(query_options)
  # NOTE(review): the new client is never closed here - confirm whether the returned
  # result still needs the connection before adding a close().
  return new_client.execute(query)
|
|
|
|
def __reset_table(self, db_name, table_name):
  """Resets a table: its definition is read from the Hive Metastore, the table is
  dropped and then created again from that definition"""
  metastore = self.hive_client
  table_definition = metastore.get_table(db_name, table_name)
  assert table_definition is not None
  metastore.drop_table(db_name, table_name, True)
  metastore.create_table(table_definition)
|
|
|
|
@classmethod
def create_table_info_dimension(cls, exploration_strategy):
  """
  Returns the 'table_format' test dimension for this suite's workload.

  If the user has specified a specific set of table formats to run against (the
  comma separated 'table_formats' pytest option), then those are used. Otherwise the
  dimension is loaded from the workload test vectors for 'exploration_strategy'.
  """
  requested_formats = pytest.config.option.table_formats
  if not requested_formats:
    return load_table_info_dimension(cls.get_workload(), exploration_strategy)

  # The dataset does not depend on the individual format, so look it up only once.
  dataset = get_dataset_from_workload(cls.get_workload())
  table_formats = [TableFormatInfo.create_from_string(dataset, tf)
                   for tf in requested_formats.split(',')]
  return TestDimension('table_format', *table_formats)
|
|
|
|
@classmethod
def __create_exec_option_dimension(cls):
  """Builds the exec option dimension. The 'core' exploration strategy is limited to
  ALL_NODES_ONLY cluster sizes with codegen left enabled; every other strategy uses
  all cluster sizes and all disable_codegen options. All batch sizes are always
  used."""
  if cls.exploration_strategy() == 'core':
    cluster_sizes, disable_codegen_options = ALL_NODES_ONLY, [False]
  else:
    cluster_sizes, disable_codegen_options =\
        ALL_CLUSTER_SIZES, ALL_DISABLE_CODEGEN_OPTIONS
  return create_exec_option_dimension(cluster_sizes, disable_codegen_options,
                                      ALL_BATCH_SIZES)
|
|
|
|
@classmethod
def exploration_strategy(cls):
  """
  Returns the exploration strategy to use for this suite's workload.

  The 'workload_exploration_strategy' pytest option holds comma separated
  workload:strategy pairs. The strategy of the first pair whose workload equals
  cls.get_workload() is returned; without a match (or without the option) the
  'exploration_strategy' option is returned.

  Raises ValueError for an entry that is not of the form workload:strategy.
  """
  default_strategy = pytest.config.option.exploration_strategy
  overrides = pytest.config.option.workload_exploration_strategy
  if overrides:
    for override in overrides.split(','):
      workload_and_strategy = override.split(':')
      if len(workload_and_strategy) != 2:
        # Call syntax works on Python 2 and 3; report the entry as the user wrote it.
        raise ValueError('Invalid workload:strategy format: %s' % override)
      workload, strategy = workload_and_strategy
      if cls.get_workload() == workload:
        return strategy
  return default_strategy
|