mirror of
https://github.com/apache/impala.git
synced 2026-01-07 09:02:19 -05:00
Problems with perf queries (run-workload.py):
- TPCH picks up stress test specific queries (TPCH-AGG1/2/3)
- TPCDS picks up queries that were intended just to validate that data was loaded properly but that aren't interesting from a perf perspective (TPCDS-COUNT-<table>)
- TPCDS picks up both decimal_v1 and decimal_v2 queries. This is mostly harmless as for queries with matching names only one gets run but it causes some queries with mismatched names to be run twice (TPCDS-Q39-1/2 vs. TPCDS-Q39.1/2)

Problems with stress queries (concurrent_select.py):
- TPCDS fails to pick up Q22A as it does not use the decimal_v2 queries, even though decimal_v2 is the default now.

This problem is exacerbated by the fact that the two scripts have different code paths for selecting the queries, so in the past changes that were made to one path were not always made to the other. This patch merges the two paths to reduce code duplication and prevent these sorts of issues in the future, and fixes the above issues.

One complication is that historically the stress test has used query names in the form 'q1' whereas the perf test has used query names in the form 'TPCH-Q1'. This patch standardizes on using 'TPCH-Q1'.

Testing:
- Added a test that checks that the perf tests pick up the expected number of queries.
- Manually ran the scripts and verified that the correct queries are selected.

Change-Id: Id1966d6ca8babdda07d47e089b75ba06d0318c0d
Reviewed-on: http://gerrit.cloudera.org:8080/12503
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
84 lines
2.8 KiB
Python
84 lines
2.8 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
|
|
import os
|
|
import fnmatch
|
|
import re
|
|
|
|
from tests.performance.query import Query
|
|
from tests.util.test_file_parser import load_tpc_queries
|
|
|
|
class Workload(object):
    """Represents a workload.

    A workload is the internal representation for the set of queries on a
    dataset. It consists of the dataset name, and a mapping of query names to
    query strings.

    Args:
        name (str): workload name. (Eg. tpch)
        query_name_filters (list of str): List of regular expressions used for
            matching query names. Forwarded unchanged to load_tpc_queries();
            defaults to None.

    Attributes:
        name (str): workload name (Eg. tpch)
        _query_map (dict): contains a query name -> string mapping; mapping of
            query name to section (ex. "TPCH-Q10" -> "select * from...")

    Raises:
        AssertionError: if load_tpc_queries() returns no queries for the given
            workload name and filters.
    """

    # Looked up once, when the class body is evaluated; a KeyError is raised at
    # that point if IMPALA_WORKLOAD_DIR is not set in the environment.
    WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

    def __init__(self, name, query_name_filters=None):
        self._name = name
        # Build the query name -> string mapping in the c'tor. We want to fail
        # fast and early if the user input is bad.
        self._query_map = load_tpc_queries(
            self._name, query_name_filters=query_name_filters)
        # Raised explicitly rather than through an `assert` statement so that
        # the check is not stripped when Python runs with -O. The exception
        # type and message are the same as the assert produced.
        if not self._query_map:
            raise AssertionError(
                "No matching queries found for %s" % self._name)

    @property
    def name(self):
        """Return the workload name given to the constructor."""
        return self._name

    @property
    def query_map(self):
        """Return the query name -> query string mapping built in __init__."""
        return self._query_map

    def construct_queries(self, test_vector, scale_factor):
        """Transform a query map into a list of query objects.

        Transform all the queries in the workload's query map to query objects
        based on the input test vector and scale factor.

        Args:
            test_vector: query vector; passed through to each Query as
                `test_vector` without inspection (its type is not visible in
                this class).
            scale_factor (str): eg. "300gb"; passed through to each Query.

        Returns:
            (list of Query): one Query per entry in the query map, each tagged
            with this workload's name.
        """
        # dict.items() exists on both Python 2 and Python 3, unlike
        # dict.iteritems(), which was removed in Python 3.
        return [
            Query(name=query_name,
                  query_str=query_str,
                  workload=self._name,
                  scale_factor=scale_factor,
                  test_vector=test_vector)
            for query_name, query_str in self._query_map.items()
        ]
|
|
|