Files
impala/tests/comparison/query_profile.py
casey cdbb6f80ee Misc query generator improvements
Changes:

1) Improve generation of boolean expressions to be more realistic.
   Previously functions like And and Or were very unlikely to be chosen
   because they only use Booleans and the generator prefers functions
   that use Ints. This is especially bad now that ON expressions may be
   arbitrary (previously they were hard coded to use Ands and Equals).
   Now And/Or are special-cased when generating Boolean expressions so
   they'll be more likely to be used.
2) Randomly choose query options. This is very basic. With this change
   Impala gets OOM killed because there are 3 nodes running locally and
   each of them is configured to use 80% of the total memory. Another
   update will be needed to start the mini-cluster with less memory.
3) Fix bug where a generated function tree would not be a tree at all,
   it would just be one function.

Change-Id: I75a7e6a1d96e5d92368f73c1e5e6a6b288932497
Reviewed-on: http://gerrit.cloudera.org:8080/2445
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Internal Jenkins
2016-03-05 05:33:32 +00:00

620 lines
23 KiB
Python

# Copyright (c) 2014 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logging import getLogger
from random import choice, randint, random
from db_types import (
Boolean,
Char,
Decimal,
Float,
Int,
TYPES,
Timestamp)
from funcs import (
And,
Equals,
GreaterThan,
GreaterThanOrEquals,
In,
IsDistinctFrom,
IsNotDistinctFrom,
IsNotDistinctFromOp,
LessThan,
LessThanOrEquals,
NotEquals,
NotIn,
Or,
WindowBoundary)
from random_val_generator import RandomValGenerator
# Module-level shorthands for the WindowBoundary constants. They are used as parts of
# the keys of the 'ANALYTIC_WINDOW' weights in DefaultProfile.
UNBOUNDED_PRECEDING = WindowBoundary.UNBOUNDED_PRECEDING
PRECEDING = WindowBoundary.PRECEDING
CURRENT_ROW = WindowBoundary.CURRENT_ROW
FOLLOWING = WindowBoundary.FOLLOWING
UNBOUNDED_FOLLOWING = WindowBoundary.UNBOUNDED_FOLLOWING
# getLogger() without a name returns the root logger.
LOG = getLogger()
class DefaultProfile(object):
  '''Default source of the random decisions made while generating a query.

  The configuration is held in four dictionaries created by __init__(): "_bounds",
  "_weights", "_flags" and "_probabilities". Subclasses adjust those dictionaries or
  override individual methods.
  '''

  def __init__(self):
    # Bounds are (min, max) values, the actual value used will be selected from the
    # bounds and each value within the range has an equal probability of being selected.
    self._bounds = {
        'MAX_NESTED_QUERY_COUNT': (0, 2),
        'MAX_NESTED_EXPR_COUNT': (0, 2),
        'SELECT_ITEM_COUNT': (1, 5),
        'WITH_TABLE_COUNT': (1, 3),
        'TABLE_COUNT': (1, 2),
        'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
        'ANALYTIC_WINDOW_OFFSET': (1, 100)}

    # Below are interdependent weights used to determine probabilities. The probability
    # of any item being selected should be (item weight) / sum(weights). A weight of
    # zero means the item will never be selected.
    self._weights = {
        'SELECT_ITEM_CATEGORY': {
            'AGG': 3,
            'ANALYTIC': 1,
            'BASIC': 10},
        'TYPES': {
            Boolean: 1,
            Char: 1,
            Decimal: 1,
            Float: 1,
            Int: 10,
            Timestamp: 1},
        'RELATIONAL_FUNCS': {
            Equals: 40,
            GreaterThan: 2,
            GreaterThanOrEquals: 2,
            In: 2,
            IsDistinctFrom: 2,
            IsNotDistinctFrom: 1,
            IsNotDistinctFromOp: 1,
            LessThan: 2,
            LessThanOrEquals: 2,
            NotEquals: 2,
            NotIn: 2},
        'CONJUNCT_DISJUNCTS': {
            And: 5,
            Or: 1},
        'ANALYTIC_WINDOW': {
            ('ROWS', UNBOUNDED_PRECEDING, None): 1,
            ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
            ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', PRECEDING, None): 1,
            ('ROWS', PRECEDING, PRECEDING): 2,
            ('ROWS', PRECEDING, CURRENT_ROW): 1,
            ('ROWS', PRECEDING, FOLLOWING): 2,
            ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, None): 1,
            ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
            ('ROWS', CURRENT_ROW, FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', FOLLOWING, FOLLOWING): 2,
            ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
            # Ranges not yet supported
            ('RANGE', UNBOUNDED_PRECEDING, None): 0,
            ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
            ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', PRECEDING, None): 0,
            ('RANGE', PRECEDING, PRECEDING): 0,
            ('RANGE', PRECEDING, CURRENT_ROW): 0,
            ('RANGE', PRECEDING, FOLLOWING): 0,
            ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, None): 0,
            ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
            ('RANGE', CURRENT_ROW, FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', FOLLOWING, FOLLOWING): 0,
            ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0},
        'JOIN': {
            'INNER': 90,
            'LEFT': 30,
            'RIGHT': 10,
            'FULL_OUTER': 3,
            'CROSS': 1},
        'SUBQUERY_PREDICATE': {
            ('Exists', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('Exists', 'AGG', 'UNCORRELATED'): 1,
            ('Exists', 'NON_AGG', 'CORRELATED'): 1,
            ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotExists', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('NotExists', 'AGG', 'UNCORRELATED'): 0,  # Not supported
            ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
            ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,  # Not supported
            ('In', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('In', 'AGG', 'UNCORRELATED'): 0,  # Not supported
            ('In', 'NON_AGG', 'CORRELATED'): 1,
            ('In', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('NotIn', 'AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('Scalar', 'AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'NON_AGG', 'CORRELATED'): 0,  # Not supported
            ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1},
        'QUERY_EXECUTION': {  # Used by the discrepancy searcher
            'CREATE_TABLE_AS': 1,
            'RAW': 10,
            'VIEW': 1}}

    # On/off switches
    self._flags = {
        'ANALYTIC_DESIGNS': {
            'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
            'DETERMINISTIC_ORDER_BY': True,
            'NO_ORDER_BY': True,
            'ONLY_SELECT_ITEM': True,
            'UNBOUNDED_WINDOW': True,
            'RANK_FUNC': True}}

    # Independent probabilities where 1 means 100%. These values may be ignored depending
    # on the context. For example, GROUP_BY is almost always ignored and instead
    # determined by the SELECT item weights above, since mixing aggregate and
    # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
    # only applied if all of the SELECT items are non-aggregate.
    self._probabilities = {
        'OPTIONAL_QUERY_CLAUSES': {
            'WITH': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
            'FROM': 1,
            'WHERE': 0.5,
            'GROUP_BY': 0.1,  # special case, doesn't really do much, see comment above
            'HAVING': 0.25,
            'UNION': 0.1,
            'ORDER_BY': 0.1},
        'OPTIONAL_ANALYTIC_CLAUSES': {
            'PARTITION_BY': 0.5,
            'ORDER_BY': 0.5,
            'WINDOW': 0.5},  # will only be used if ORDER BY is chosen
        'MISC': {
            'INLINE_VIEW': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
            'SELECT_DISTINCT': 0.1,
            'SCALAR_SUBQUERY': 0.1,
            'ONLY_USE_EQUALITY_JOIN_PREDICATES': 0.8,
            'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE': 0.7,
            'UNION_ALL': 0.5}}  # Determines use of "ALL" but not "UNION"

    # NOTE(review): this attribute is not read anywhere in this class.
    self.__type_weights = {}

    self.constant_generator = RandomValGenerator()

  def _get_config_value(self, start_config, *keys):
    '''Walk "start_config" following "keys" in order and return the value reached.'''
    value = start_config
    for key in keys:
      value = value[key]
    return value

  def weights(self, *keys):
    '''Convenience method for getting the values of named weights'''
    return self._get_config_value(self._weights, *keys)

  def bounds(self, *keys):
    '''Convenience method for getting the values of named bounds'''
    return self._get_config_value(self._bounds, *keys)

  def probability(self, *keys):
    '''Convenience method for getting the value of named probabilities'''
    return self._get_config_value(self._probabilities, *keys)

  def _choose_from_bounds(self, *bounds):
    '''Returns a value that is within the given bounds. Each value has an equal chance
       of being chosen. "bounds" is either the key path of a named bound or a literal
       (lower, upper) pair.
    '''
    if isinstance(bounds[0], str):
      lower, upper = self.bounds(*bounds)
    else:
      lower, upper = bounds
    return randint(lower, upper)

  def _choose_from_weights(self, *weights):
    '''Returns a value that is selected from the keys of weights with the probability
       determined by the values of weights. "weights" is either the key path of a
       named weight dictionary or a single dictionary of choice -> weight.

       Raises a ValueError if there is no choice with a positive weight.
    '''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    # Choices with a weight <= 0 can never be returned by the loop below, so they must
    # not contribute to the total either.
    total_weight = sum(weight for weight in weights.values() if weight > 0)
    if total_weight <= 0:
      raise ValueError('At least one choice with a positive weight is required')
    numeric_choice = randint(1, total_weight)
    for choice_, weight in weights.items():
      if weight <= 0:
        continue
      if numeric_choice <= weight:
        return choice_
      numeric_choice -= weight

  def _choose_from_filtered_weights(self, filter, *weights):
    '''Convenience method, apply the given filter before choosing a value.'''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    return self._choose_from_weights(dict(
        (choice_, weight) for choice_, weight in weights.items() if filter(choice_)))

  def _decide_from_probability(self, *keys):
    '''Return True with the probability stored under the given key path.'''
    return random() < self.probability(*keys)

  def get_max_nested_query_count(self):
    '''Return the maximum number of queries the top level query may contain.'''
    return self._choose_from_bounds('MAX_NESTED_QUERY_COUNT')

  def use_with_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WITH')

  def only_use_equality_join_predicates(self):
    return self._decide_from_probability('MISC', 'ONLY_USE_EQUALITY_JOIN_PREDICATES')

  def only_use_aggregates_in_having_clause(self):
    return self._decide_from_probability('MISC', 'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE')

  def get_with_clause_table_ref_count(self):
    '''Return the number of table ref entries a WITH clause should contain.'''
    return self._choose_from_bounds('WITH_TABLE_COUNT')

  def get_select_item_count(self):
    return self._choose_from_bounds('SELECT_ITEM_COUNT')

  def choose_nested_expr_count(self):
    return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

  def allowed_analytic_designs(self):
    '''Return the names of the analytic designs whose flag is switched on.'''
    return [design for design, is_enabled in self._flags['ANALYTIC_DESIGNS'].items()
            if is_enabled]

  def use_partition_by_clause_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'PARTITION_BY')

  def use_order_by_clause_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'ORDER_BY')

  def use_window_in_analytic(self):
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'WINDOW')

  def choose_window_type(self):
    return self._choose_from_weights('ANALYTIC_WINDOW')

  def get_window_offset(self):
    return self._choose_from_bounds('ANALYTIC_WINDOW_OFFSET')

  def get_offset_for_analytic_lead_or_lag(self):
    return self._choose_from_bounds('ANALYTIC_LEAD_LAG_OFFSET')

  def get_table_count(self):
    return self._choose_from_bounds('TABLE_COUNT')

  def use_inline_view(self):
    return self._decide_from_probability('MISC', 'INLINE_VIEW')

  def choose_table(self, table_exprs):
    '''Return one of "table_exprs", each with equal probability.'''
    return choice(table_exprs)

  def choose_join_type(self, join_types):
    '''Return a join type chosen by the 'JOIN' weights, limited to "join_types".'''
    return self._choose_from_filtered_weights(
        lambda join_type: join_type in join_types, 'JOIN')

  def choose_join_condition_count(self):
    '''Same bounds as the nested expr count but never less than one.'''
    return max(1, self._choose_from_bounds('MAX_NESTED_EXPR_COUNT'))

  def use_where_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WHERE')

  def use_scalar_subquery(self):
    return self._decide_from_probability('MISC', 'SCALAR_SUBQUERY')

  def choose_subquery_predicate_category(self, func_name, allow_correlated):
    '''Return a (name, 'AGG' | 'NON_AGG', 'CORRELATED' | 'UNCORRELATED') key of the
       'SUBQUERY_PREDICATE' weights, or None if no category is permitted.

       A "func_name" without an entry in the weights is treated as 'Scalar'. Aggregate
       categories are skipped when the 'AGG' select item weight is zero or missing.
       Correlated categories are skipped when "allow_correlated" is false or when the
       upper 'TABLE_COUNT' bound is zero.
    '''
    weights = self.weights('SUBQUERY_PREDICATE')
    func_names = set(name for name, _, _ in weights)
    if func_name not in func_names:
      func_name = 'Scalar'
    allow_agg = self.weights('SELECT_ITEM_CATEGORY').get('AGG', 0)
    if allow_correlated and self.bounds('TABLE_COUNT')[1] == 0:
      allow_correlated = False
    candidates = dict(
        ((name, use_agg, use_correlated), weight)
        for (name, use_agg, use_correlated), weight in weights.items()
        if weight
        and name == func_name
        and (allow_agg or use_agg == 'NON_AGG')
        and (allow_correlated or use_correlated == 'UNCORRELATED'))
    if candidates:
      return self._choose_from_weights(candidates)
    return None

  def use_distinct(self):
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_distinct_in_func(self):
    # Shares the 'SELECT_DISTINCT' probability with use_distinct().
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_group_by_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'GROUP_BY')

  def use_having_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'HAVING')

  def use_union_clause(self):
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'UNION')

  def use_union_all(self):
    return self._decide_from_probability('MISC', 'UNION_ALL')

  def get_query_execution(self):
    return self._choose_from_weights('QUERY_EXECUTION')

  def use_having_without_groupby(self):
    return True

  def use_nested_with(self):
    return True

  def use_lateral_join(self):
    return False

  def use_boolean_expr_for_lateral_join(self):
    return False

  def get_num_boolean_exprs_for_lateral_join(self):
    # This is a count, so return an int like the subclass overrides do.
    return 0

  # Workaround for Hive null ordering differences, and lack of 'NULL FIRST', 'NULL LAST'
  # specifications. The ref db will order nulls as specified for ASC sorting to make it
  # identical to Hive. Valid return values are: 'BEFORE', 'AFTER', or 'DEFAULT',
  # the latter means no specification needed.
  def nulls_order_asc(self):
    return 'DEFAULT'

  def choose_val_expr(self, val_exprs, types=TYPES):
    '''Return one of "val_exprs" whose type is in "types". The type is chosen first,
       using the 'TYPES' weights, then a value of that type is chosen uniformly.
    '''
    if not val_exprs:
      raise Exception('At least one value is required')
    if not types:
      raise Exception('At least one type is required')
    available_types = set(types) & set(val_exprs.by_type)
    if not available_types:
      raise Exception('None of the provided values return any of the required types')
    val_type = self.choose_type(available_types)
    return choice(val_exprs.by_type[val_type])

  def choose_constant(self, return_type=None, allow_null=True):
    '''Return a random constant wrapped in "return_type". A type is chosen with
       choose_type() if none is given. When "allow_null" is false, generated None
       values are discarded and generation is retried.
    '''
    if not return_type:
      return_type = self.choose_type()
    while True:
      val = self.constant_generator.generate_val(return_type)
      if val is not None or allow_null:
        return return_type(val)

  def choose_type(self, types=TYPES):
    '''Return one of "types", chosen using the 'TYPES' weights.'''
    type_weights = self.weights('TYPES')
    weights = dict((type_, type_weights[type_]) for type_ in types)
    if not weights:
      raise Exception('None of the requested types are enabled')
    return self._choose_from_weights(weights)

  def choose_conjunct_disjunct_fill_ratio(self):
    '''Return the ratio of ANDs and ORs to use in a boolean function tree. For example,
       when creating a WHERE condition that consists of 10 nested functions, a ratio of
       0.1 means 1 out of the 10 functions in the WHERE clause will be an AND or OR.
    '''
    return random() * random()

  def choose_relational_func_fill_ratio(self):
    '''Return the ratio of relational functions to use in a boolean function tree. This
       ratio is applied after 'choose_conjunct_disjunct_fill_ratio()'.
    '''
    return random() * random()

  def choose_conjunct_disjunct(self):
    return self._choose_from_weights('CONJUNCT_DISJUNCTS')

  def choose_relational_func_signature(self, signatures):
    '''Return a relational signature chosen from "signatures". A signature is considered
       to be relational if it returns a boolean and accepts more than one argument.
       Signatures that accept a subquery argument are not considered.
    '''
    if not signatures:
      raise Exception('At least one signature is required')
    signatures = [signature for signature in signatures
                  if signature.return_type == Boolean
                  and len(signature.args) > 1
                  and not any(arg.is_subquery for arg in signature.args)]
    if not signatures:
      raise Exception(
          'None of the provided signatures corresponded to a relational function')
    func_weights = self.weights('RELATIONAL_FUNCS')
    missing_funcs = set(signature.func for signature in signatures) - set(func_weights)
    if missing_funcs:
      # str.join() only accepts strings so the functions must be converted to names.
      raise Exception("Weights are missing for functions: %s" % ", ".join(
          sorted(getattr(func, '__name__', repr(func)) for func in missing_funcs)))
    return self.choose_func_signature(signatures, func_weights)

  def _get_signature_weight_and_length(self, signature, type_weights):
    '''Return a (weight, length) tuple for "signature". The weight is the product of
       the type weights of the return type and of every argument type (every subtype
       in the case of a subquery argument). The length is the number of types that
       were multiplied.
    '''
    weight = type_weights[signature.return_type]
    length = 1
    for arg in signature.args:
      if arg.is_subquery:
        for subtype in arg.type:
          weight *= type_weights[subtype]
          length += 1
      else:
        weight *= type_weights[arg.type]
        length += 1
    return weight, length

  def _scale_weights_by_other_lengths(self, weights, lengths):
    '''Multiply, in place, each value of "weights" by every distinct value of "lengths"
       other than the length stored under the same key.
    '''
    distinct_lengths = set(lengths.values())
    for key in weights:
      for other_length in distinct_lengths - set([lengths[key]]):
        weights[key] *= other_length

  def _get_func_weights_from_types(self, signatures, type_weights):
    '''Return a dictionary of func -> weight derived from the type weights of the
       given signatures. See choose_func_signature() for the details.
    '''
    func_weights = dict()
    # The length of the signature that provided the weight in func_weights
    signature_length_by_func = dict()
    for signature in signatures:
      signature_weight, signature_length = \
          self._get_signature_weight_and_length(signature, type_weights)
      if not signature_weight:
        continue
      if signature.func not in func_weights \
          or signature_weight > func_weights[signature.func]:
        func_weights[signature.func] = signature_weight
        signature_length_by_func[signature.func] = signature_length
    if not func_weights:
      raise Exception('All functions disallowed based on signature types')
    self._scale_weights_by_other_lengths(func_weights, signature_length_by_func)
    return func_weights

  def choose_func_signature(self, signatures, _func_weights=None):
    '''Return a signature chosen from "signatures".

       If "_func_weights" (a dictionary of func -> weight) is given, the function is
       chosen using those weights, limited to the functions found in "signatures".
       Otherwise the function weights are derived from the 'TYPES' weights.
    '''
    if not signatures:
      raise Exception('At least one signature is required')

    type_weights = self.weights('TYPES')

    if _func_weights:
      # Only functions that have at least one of the provided signatures may be
      # chosen, otherwise there would be nothing to select from in the second step.
      available_funcs = set(signature.func for signature in signatures)
      func_weights = dict((func, weight) for func, weight in _func_weights.items()
                          if func in available_funcs)
      if not func_weights:
        raise Exception('None of the weighted functions have a provided signature')
    else:
      # First a function will be chosen then a signature. This is done so that the number
      # of signatures a function has doesn't influence its likelihood of being chosen.
      # The weight of a signature is the product of the weights of its return type and
      # argument types. If any of those types has a weight of zero, the signature will
      # not be used. The weight of a function is the maximum weight out of all of its
      # signatures. Finally each function weight is multiplied by every distinct
      # signature length (number of types, including the return type) other than its own.
      #
      # Example: type_weights = {Int: 10, Float: 1},
      #          funcs = [foo(Int) -> Int, foo(Float) -> Float, bar(Int, Float) -> Int]
      #
      #          weight of foo(Int) = 10 * 10 = 100, length = 2
      #          weight of foo(Float) = 1 * 1 = 1, length = 2
      #          weight of bar(Int, Float) = 10 * 10 * 1 = 100, length = 3
      #          func_weights = {foo: 100 * 3, bar: 100 * 2}
      #
      # Note that this only selects a function, the function signature will be selected
      # later. This is done to prevent function with a greater number of signatures from
      # being selected more frequently.
      func_weights = self._get_func_weights_from_types(signatures, type_weights)
    func = self._choose_from_weights(func_weights)

    # Same idea as above but for the signatures of the selected function.
    func_signatures = func.signatures()
    signature_weights = dict()
    signature_lengths = dict()
    for idx, signature in enumerate(func_signatures):
      if signature not in signatures:
        continue
      signature_weight, signature_length = \
          self._get_signature_weight_and_length(signature, type_weights)
      if signature_weight:
        signature_weights[idx] = signature_weight
        signature_lengths[idx] = signature_length
    self._scale_weights_by_other_lengths(signature_weights, signature_lengths)
    idx = self._choose_from_weights(signature_weights)
    return func_signatures[idx]

  def allow_func_signature(self, signature):
    '''Return False if the return type or any argument type (any subtype in the case
       of a subquery argument) of "signature" has a zero 'TYPES' weight, else True.
    '''
    weights = self.weights('TYPES')
    if not weights[signature.return_type]:
      return False
    for arg in signature.args:
      if arg.is_subquery:
        if not all(weights[subtype] for subtype in arg.type):
          return False
      elif not weights[arg.type]:
        return False
    return True
class ImpalaNestedTypesProfile(DefaultProfile):
  '''Variation of DefaultProfile that enables lateral joins and raises the
     probabilities of WITH clauses and inline views.
  '''

  def __init__(self):
    super(ImpalaNestedTypesProfile, self).__init__()
    raised_probability = 0.3
    self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = raised_probability
    self._probabilities['MISC']['INLINE_VIEW'] = raised_probability

  def use_lateral_join(self):
    '''True for half of the calls.'''
    roll = random()
    return roll < 0.5

  def use_boolean_expr_for_lateral_join(self):
    '''True for one out of five calls.'''
    roll = random()
    return roll < 0.2

  def get_num_boolean_exprs_for_lateral_join(self):
    '''Zero with a probability of 0.8, otherwise one plus one more for every
       consecutive random draw below 0.6.
    '''
    if random() >= 0.8:
      expr_count = 1
      while random() < 0.6:
        expr_count += 1
      return expr_count
    return 0

  def get_table_count(self):
    '''Starts at one and keeps growing while a random draw is below 0.85 raised to
       the current count.
    '''
    table_count = 1
    while True:
      if random() >= 0.85 ** table_count:
        return table_count
      table_count += 1
# This profile was added for ad-hoc testing.
class TestFunctionProfile(DefaultProfile):
  '''Profile that prefers functions with "DistinctFrom" in their name.'''

  def choose_func_signature(self, signatures, _func_weights=None):
    '''Return a signature chosen from "signatures". If any of the signatures belong to
       a function with "DistinctFrom" in its name, only those are considered.

       "_func_weights" is accepted and forwarded so this override stays compatible
       with the base class, whose choose_relational_func_signature() passes it.
    '''
    if not signatures:
      raise Exception('At least one signature is required')
    preferred_signatures = [signature for signature in signatures
                            if "DistinctFrom" in signature.func._NAME]
    if preferred_signatures:
      signatures = preferred_signatures
    if _func_weights:
      # Drop the weights of functions that are no longer candidates so that the base
      # class cannot choose a function without a matching signature.
      candidate_funcs = set(signature.func for signature in signatures)
      _func_weights = dict((func, weight) for func, weight in _func_weights.items()
                           if func in candidate_funcs)
    return super(TestFunctionProfile, self).choose_func_signature(
        signatures, _func_weights)
class HiveProfile(DefaultProfile):
  '''Variation of DefaultProfile with Hive specific restrictions.'''

  def __init__(self):
    super(HiveProfile, self).__init__()

  def use_having_without_groupby(self):
    return False

  def use_nested_with(self):
    return False

  def nulls_order_asc(self):
    return 'BEFORE'

  def allow_func_signature(self, signature):
    '''Reject every "DateAdd*" signature and every "Greatest"/"Least" signature whose
       argument types are not all equal to the return type. Anything else is left to
       the decision of DefaultProfile.
    '''
    func_name = signature.func._NAME
    if func_name.startswith('DateAdd'):
      return False
    if func_name in ('Greatest', 'Least'):
      # If there is no return type, the first argument type becomes the reference.
      expected_type = signature.return_type
      for arg_type in [arg.type for arg in signature.args]:
        if expected_type is None:
          expected_type = arg_type
        elif expected_type != arg_type:
          return False
    return DefaultProfile.allow_func_signature(self, signature)
# Every class visible in the module namespace at this point whose name ends with
# 'Profile'. This statement is at module level, where locals() is the module namespace.
# The order follows the iteration order of that namespace.
PROFILES = [var for var in locals().values()
if isinstance(var, type) and var.__name__.endswith('Profile')]