mirror of
https://github.com/apache/impala.git
synced 2025-12-30 21:02:41 -05:00
- Added execution summary to the beeswax client and QueryResult - Modified report-benchmark-results to handle JSON and perform execution summary comparison between runs - Added comments to the new workload runner Change-Id: I9c3c5f2fdc5d8d1e70022c4077334bc44e3a2d1d Reviewed-on: http://gerrit.ent.cloudera.com:8080/3598 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: jenkins (cherry picked from commit fd0b1406be2511c202e02fa63af94fbbe5e18eee) Reviewed-on: http://gerrit.ent.cloudera.com:8080/3618
114 lines
4.3 KiB
Python
114 lines
4.3 KiB
Python
#!/usr/bin/env python
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import os
|
|
import fnmatch
|
|
import re
|
|
|
|
from collections import defaultdict
|
|
from tests.common.query import Query
|
|
from tests.util.test_file_parser import parse_query_test_file
|
|
|
|
class Workload(object):
    """Represents a workload.

    A workload is the internal representation for the set of queries on a dataset.
    It consists of the workload name and a mapping of query names to query strings.
    The mapping is built in the constructor from every '*.test' file found under
    <WORKLOAD_DIR>/<name>/queries.

    Args:
      name (str): workload name. (Eg. tpch)
      query_name_filters (list of str): regular expressions used for selecting
        queries by name. Each filter is stripped, anchored with ^ and $, and
        matched case-insensitively. When None or empty, every named query is
        selected.

    Attributes:
      name (str): workload name (Eg. tpch)
      query_map (dict): query name -> query string mapping
        (ex. "TPCH-Q10" -> "select * from...")

    Raises:
      ValueError: if <WORKLOAD_DIR>/<name>/queries is not a directory.
      AssertionError: if no '*.test' file is found, or if no query name matches
        the filters.
    """

    # Root directory holding the workloads. Looked up once, when the class body is
    # executed; a missing environment variable raises KeyError at that point.
    WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

    def __init__(self, name, query_name_filters=None):
        self.__name = name
        self.__query_map = dict()
        # Build the query name -> string mapping in the c'tor. We want to fail fast
        # and early if the user input is bad.
        self.__validate_and_load(query_name_filters)

    @property
    def name(self):
        """The workload name passed to the constructor."""
        return self.__name

    @property
    def query_map(self):
        """The query name -> query string dict (the internal dict, not a copy)."""
        return self.__query_map

    def __validate_and_load(self, query_name_filters):
        """Validate the workload and populate the query map.

        Raises:
          ValueError: if the workload's 'queries' directory does not exist.
          AssertionError: if there are no query files, or no query name matches.
        """
        # Call strip() on each item (rather than str.strip) so that any string type
        # is accepted as a filter.
        filters = [f.strip() for f in query_name_filters or []]
        self.__base_dir = os.path.join(Workload.WORKLOAD_DIR, self.__name, 'queries')
        # Check whether the workload name corresponds to an existing directory.
        if not os.path.isdir(self.__base_dir):
            raise ValueError(
                "Workload %s not found in %s" % (self.__name, self.__base_dir))

        # Parse all query files for the given workload.
        sections = []
        for file_name in self.__list_query_files():
            sections.extend(parse_query_test_file(file_name))

        # Only sections with a non-blank name can be selected.
        all_query_names = [
            s['QUERY_NAME'] for s in sections if s['QUERY_NAME'].strip()]
        # With no filters the pattern is empty, which matches every name.
        regex = re.compile('|'.join('^%s$' % f for f in filters), re.I)
        matched_query_names = set(n for n in all_query_names if regex.match(n))
        # At least one query has to be selected. Raised explicitly (not via an
        # assert statement) so the check is still performed under `python -O`.
        if not matched_query_names:
            raise AssertionError("No matching queries found for %s" % self.__name)

        # Add the selected queries to the query map. If a name occurs more than
        # once, the section parsed last wins.
        for section in sections:
            if section['QUERY_NAME'] in matched_query_names:
                self.__query_map[section['QUERY_NAME']] = section['QUERY']

    def __list_query_files(self):
        """Return a list of all the .test files that contain queries.

        The workload's 'queries' directory is searched recursively.

        Raises:
          AssertionError: if no '*.test' file is found.
        """
        query_files = []
        for root, _, file_names in os.walk(self.__base_dir):
            query_files.extend(
                os.path.join(root, f) for f in fnmatch.filter(file_names, '*.test'))
        # Raised explicitly so the check is still performed under `python -O`.
        if not query_files:
            raise AssertionError("No Query Files found in %s" % self.__base_dir)
        return query_files

    def construct_queries(self, test_vector, scale_factor):
        """Transform the query map into a list of Query objects.

        One Query is created per entry of the workload's query map, combining the
        query name and string with the given test vector and scale factor.

        Args:
          test_vector: passed unchanged to every Query as 'test_vector'. Its type
            is not defined in this module.
          scale_factor (str): eg. "300gb"; passed unchanged to every Query.

        Returns:
          (list of Query): one Query per selected query of this workload.
        """
        return [
            Query(name=query_name,
                  query_str=query_str,
                  workload=self.__name,
                  scale_factor=scale_factor,
                  test_vector=test_vector)
            for query_name, query_str in self.__query_map.items()
        ]
|
|
|