mirror of
https://github.com/apache/impala.git
synced 2026-01-07 09:02:19 -05:00
Changes: 1) Improve generation of boolean expressions to be more realistic. Previously functions like And and Or were very unlikely to be chosen because they only use Booleans and the generator prefers functions that use Ints. This is especially bad now that ON expressions may be arbitrary (previously they were hard-coded to use Ands and Equals). Now And/Or are special-cased when generating a Boolean expression so they'll be more likely to be used. 2) Randomly choose query options. This is very basic. With this change Impala gets OOM killed because there are 3 nodes running locally and each of them is configured to use 80% of the total memory. Another update will be needed to start the mini-cluster with less memory. 3) Fix bug where a generated function tree would not be a tree at all; it would just be one function. Change-Id: I75a7e6a1d96e5d92368f73c1e5e6a6b288932497 Reviewed-on: http://gerrit.cloudera.org:8080/2445 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: Internal Jenkins
620 lines
23 KiB
Python
620 lines
23 KiB
Python
# Copyright (c) 2014 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from logging import getLogger
|
|
from random import choice, randint, random
|
|
|
|
from db_types import (
|
|
Boolean,
|
|
Char,
|
|
Decimal,
|
|
Float,
|
|
Int,
|
|
TYPES,
|
|
Timestamp)
|
|
from funcs import (
|
|
And,
|
|
Equals,
|
|
GreaterThan,
|
|
GreaterThanOrEquals,
|
|
In,
|
|
IsDistinctFrom,
|
|
IsNotDistinctFrom,
|
|
IsNotDistinctFromOp,
|
|
LessThan,
|
|
LessThanOrEquals,
|
|
NotEquals,
|
|
NotIn,
|
|
Or,
|
|
WindowBoundary)
|
|
from random_val_generator import RandomValGenerator
|
|
|
|
# Short aliases for the window boundary kinds. They are used to build the keys of the
# 'ANALYTIC_WINDOW' weights in DefaultProfile.
UNBOUNDED_PRECEDING = WindowBoundary.UNBOUNDED_PRECEDING
PRECEDING = WindowBoundary.PRECEDING
CURRENT_ROW = WindowBoundary.CURRENT_ROW
FOLLOWING = WindowBoundary.FOLLOWING
UNBOUNDED_FOLLOWING = WindowBoundary.UNBOUNDED_FOLLOWING

# Use a logger named after this module instead of the root logger so that log records
# can be attributed to (and filtered by) their origin.
LOG = getLogger(__name__)
|
|
|
|
class DefaultProfile(object):
  '''Default settings used to make the random decisions of the query generator.

  The settings are split into four groups (bounds, weights, flags and probabilities,
  see __init__()). The methods of this class make random choices based on those
  settings. Subclasses adjust the settings or override individual methods.
  '''

  def __init__(self):
    # Bounds are (min, max) values, the actual value used will be selected from the
    # bounds and each value within the range has an equal probability of being selected.
    self._bounds = {
        'MAX_NESTED_QUERY_COUNT': (0, 2),
        'MAX_NESTED_EXPR_COUNT': (0, 2),
        'SELECT_ITEM_COUNT': (1, 5),
        'WITH_TABLE_COUNT': (1, 3),
        'TABLE_COUNT': (1, 2),
        'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
        'ANALYTIC_WINDOW_OFFSET': (1, 100)}

    # Below are interdependent weights used to determine probabilities. The probability
    # of any item being selected should be (item weight) / sum(weights). A weight of
    # zero means the item will never be selected.
    self._weights = {
        'SELECT_ITEM_CATEGORY': {
            'AGG': 3,
            'ANALYTIC': 1,
            'BASIC': 10},
        'TYPES': {
            Boolean: 1,
            Char: 1,
            Decimal: 1,
            Float: 1,
            Int: 10,
            Timestamp: 1},
        'RELATIONAL_FUNCS': {
            Equals: 40,
            GreaterThan: 2,
            GreaterThanOrEquals: 2,
            In: 2,
            IsDistinctFrom: 2,
            IsNotDistinctFrom: 1,
            IsNotDistinctFromOp: 1,
            LessThan: 2,
            LessThanOrEquals: 2,
            NotEquals: 2,
            NotIn: 2},
        'CONJUNCT_DISJUNCTS': {
            And: 5,
            Or: 1},
        'ANALYTIC_WINDOW': {
            ('ROWS', UNBOUNDED_PRECEDING, None): 1,
            ('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
            ('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
            ('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', PRECEDING, None): 1,
            ('ROWS', PRECEDING, PRECEDING): 2,
            ('ROWS', PRECEDING, CURRENT_ROW): 1,
            ('ROWS', PRECEDING, FOLLOWING): 2,
            ('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, None): 1,
            ('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
            ('ROWS', CURRENT_ROW, FOLLOWING): 2,
            ('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
            ('ROWS', FOLLOWING, FOLLOWING): 2,
            ('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
            # Ranges not yet supported
            ('RANGE', UNBOUNDED_PRECEDING, None): 0,
            ('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
            ('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
            ('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', PRECEDING, None): 0,
            ('RANGE', PRECEDING, PRECEDING): 0,
            ('RANGE', PRECEDING, CURRENT_ROW): 0,
            ('RANGE', PRECEDING, FOLLOWING): 0,
            ('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, None): 0,
            ('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
            ('RANGE', CURRENT_ROW, FOLLOWING): 0,
            ('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
            ('RANGE', FOLLOWING, FOLLOWING): 0,
            ('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0},
        'JOIN': {
            'INNER': 90,
            'LEFT': 30,
            'RIGHT': 10,
            'FULL_OUTER': 3,
            'CROSS': 1},
        'SUBQUERY_PREDICATE': {
            ('Exists', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('Exists', 'AGG', 'UNCORRELATED'): 1,
            ('Exists', 'NON_AGG', 'CORRELATED'): 1,
            ('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotExists', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('NotExists', 'AGG', 'UNCORRELATED'): 0,  # Not supported
            ('NotExists', 'NON_AGG', 'CORRELATED'): 1,
            ('NotExists', 'NON_AGG', 'UNCORRELATED'): 0,  # Not supported
            ('In', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('In', 'AGG', 'UNCORRELATED'): 0,  # Not supported
            ('In', 'NON_AGG', 'CORRELATED'): 1,
            ('In', 'NON_AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('NotIn', 'AGG', 'UNCORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'CORRELATED'): 1,
            ('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'AGG', 'CORRELATED'): 0,  # Not supported
            ('Scalar', 'AGG', 'UNCORRELATED'): 1,
            ('Scalar', 'NON_AGG', 'CORRELATED'): 0,  # Not supported
            ('Scalar', 'NON_AGG', 'UNCORRELATED'): 1},
        'QUERY_EXECUTION': {  # Used by the discrepancy searcher
            'CREATE_TABLE_AS': 1,
            'RAW': 10,
            'VIEW': 1}}

    # On/off switches
    self._flags = {
        'ANALYTIC_DESIGNS': {
            'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
            'DETERMINISTIC_ORDER_BY': True,
            'NO_ORDER_BY': True,
            'ONLY_SELECT_ITEM': True,
            'UNBOUNDED_WINDOW': True,
            'RANK_FUNC': True}}

    # Independent probabilities where 1 means 100%. These values may be ignored depending
    # on the context. For example, GROUP_BY is almost always ignored and instead
    # determined by the SELECT item weights above, since mixing aggregate and
    # non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
    # only applied if all of the SELECT items are non-aggregate.
    self._probabilities = {
        'OPTIONAL_QUERY_CLAUSES': {
            'WITH': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
            'FROM': 1,
            'WHERE': 0.5,
            'GROUP_BY': 0.1,  # special case, doesn't really do much, see comment above
            'HAVING': 0.25,
            'UNION': 0.1,
            'ORDER_BY': 0.1},
        'OPTIONAL_ANALYTIC_CLAUSES': {
            'PARTITION_BY': 0.5,
            'ORDER_BY': 0.5,
            'WINDOW': 0.5},  # will only be used if ORDER BY is chosen
        'MISC': {
            'INLINE_VIEW': 0.1,  # MAX_NESTED_QUERY_COUNT bounds take precedence
            'SELECT_DISTINCT': 0.1,
            'SCALAR_SUBQUERY': 0.1,
            'ONLY_USE_EQUALITY_JOIN_PREDICATES': 0.8,
            'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE': 0.7,
            'UNION_ALL': 0.5}}  # Determines use of "ALL" but not "UNION"

    self.__type_weights = {}

    self.constant_generator = RandomValGenerator()

  def _get_config_value(self, start_config, *keys):
    '''Walk down the nested dict "start_config" following "keys" and return the value
    found at the end of the path.
    '''
    value = start_config
    for key in keys:
      value = value[key]
    return value

  def weights(self, *keys):
    '''Convenience method for getting the values of named weights'''
    return self._get_config_value(self._weights, *keys)

  def bounds(self, *keys):
    '''Convenience method for getting the values of named bounds'''
    return self._get_config_value(self._bounds, *keys)

  def probability(self, *keys):
    '''Convenience method for getting the value of named probabilities'''
    return self._get_config_value(self._probabilities, *keys)

  def _choose_from_bounds(self, *bounds):
    '''Returns a value that is within the given bounds. Each value has an equal chance
    of being chosen.

    "bounds" is either the key path of a named bound (strings) or an explicit
    (lower, upper) pair. Both ends are inclusive.
    '''
    if isinstance(bounds[0], str):
      lower, upper = self.bounds(*bounds)
    else:
      lower, upper = bounds
    return randint(lower, upper)

  def _choose_from_weights(self, *weights):
    '''Returns a value that is selected from the keys of weights with the probability
    determined by the values of weights.

    "weights" is either the key path of named weights (strings) or a single dict that
    maps each candidate to its weight. Candidates with a weight of zero or less are
    never selected. Raises a ValueError if no candidate has a positive weight.
    '''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    # Only positive weights take part, both in the total and in the selection below.
    candidates = [(choice_, weight) for choice_, weight in weights.items() if weight > 0]
    if not candidates:
      raise ValueError('At least one item with a positive weight is required')
    numeric_choice = randint(1, sum(weight for _, weight in candidates))
    # Walk through the candidates, consuming the chosen number until it falls within
    # the weight of a candidate.
    for choice_, weight in candidates:
      if numeric_choice <= weight:
        return choice_
      numeric_choice -= weight

  def _choose_from_filtered_weights(self, filter, *weights):
    '''Convenience method, apply the given filter before choosing a value.'''
    if isinstance(weights[0], str):
      weights = self.weights(*weights)
    else:
      weights = weights[0]
    return self._choose_from_weights(dict(
        (choice_, weight) for choice_, weight in weights.items() if filter(choice_)))

  def _decide_from_probability(self, *keys):
    '''Return True with the probability that is stored under the given key path.'''
    return random() < self.probability(*keys)

  def get_max_nested_query_count(self):
    '''Return the maximum number of queries the top level query may contain.'''
    return self._choose_from_bounds('MAX_NESTED_QUERY_COUNT')

  def use_with_clause(self):
    '''Randomly decide based on the 'WITH' probability.'''
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WITH')

  def only_use_equality_join_predicates(self):
    '''Randomly decide based on the 'ONLY_USE_EQUALITY_JOIN_PREDICATES' probability.'''
    return self._decide_from_probability('MISC', 'ONLY_USE_EQUALITY_JOIN_PREDICATES')

  def only_use_aggregates_in_having_clause(self):
    '''Randomly decide based on the 'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE'
    probability.
    '''
    return self._decide_from_probability('MISC', 'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE')

  def get_with_clause_table_ref_count(self):
    '''Return the number of table ref entries a WITH clause should contain.'''
    return self._choose_from_bounds('WITH_TABLE_COUNT')

  def get_select_item_count(self):
    '''Return a value chosen from the 'SELECT_ITEM_COUNT' bounds.'''
    return self._choose_from_bounds('SELECT_ITEM_COUNT')

  def choose_nested_expr_count(self):
    '''Return a value chosen from the 'MAX_NESTED_EXPR_COUNT' bounds.'''
    return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')

  def allowed_analytic_designs(self):
    '''Return the names of the 'ANALYTIC_DESIGNS' flags that are switched on.'''
    return [design for design, is_enabled in self._flags['ANALYTIC_DESIGNS'].items()
            if is_enabled]

  def use_partition_by_clause_in_analytic(self):
    '''Randomly decide based on the analytic 'PARTITION_BY' probability.'''
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'PARTITION_BY')

  def use_order_by_clause_in_analytic(self):
    '''Randomly decide based on the analytic 'ORDER_BY' probability.'''
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'ORDER_BY')

  def use_window_in_analytic(self):
    '''Randomly decide based on the analytic 'WINDOW' probability.'''
    return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'WINDOW')

  def choose_window_type(self):
    '''Return one of the keys of the 'ANALYTIC_WINDOW' weights, which are tuples of
    (window kind, start boundary, end boundary).
    '''
    return self._choose_from_weights('ANALYTIC_WINDOW')

  def get_window_offset(self):
    '''Return a value chosen from the 'ANALYTIC_WINDOW_OFFSET' bounds.'''
    return self._choose_from_bounds('ANALYTIC_WINDOW_OFFSET')

  def get_offset_for_analytic_lead_or_lag(self):
    '''Return a value chosen from the 'ANALYTIC_LEAD_LAG_OFFSET' bounds.'''
    return self._choose_from_bounds('ANALYTIC_LEAD_LAG_OFFSET')

  def get_table_count(self):
    '''Return a value chosen from the 'TABLE_COUNT' bounds.'''
    return self._choose_from_bounds('TABLE_COUNT')

  def use_inline_view(self):
    '''Randomly decide based on the 'INLINE_VIEW' probability.'''
    return self._decide_from_probability('MISC', 'INLINE_VIEW')

  def choose_table(self, table_exprs):
    '''Return one of "table_exprs", each has the same chance of being chosen.'''
    return choice(table_exprs)

  def choose_join_type(self, join_types):
    '''Return one of "join_types", chosen according to the 'JOIN' weights.'''
    return self._choose_from_filtered_weights(
        lambda join_type: join_type in join_types, 'JOIN')

  def choose_join_condition_count(self):
    '''Return a value chosen from the 'MAX_NESTED_EXPR_COUNT' bounds, but at least 1.'''
    return max(1, self._choose_from_bounds('MAX_NESTED_EXPR_COUNT'))

  def use_where_clause(self):
    '''Randomly decide based on the 'WHERE' probability.'''
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WHERE')

  def use_scalar_subquery(self):
    '''Randomly decide based on the 'SCALAR_SUBQUERY' probability.'''
    return self._decide_from_probability('MISC', 'SCALAR_SUBQUERY')

  def choose_subquery_predicate_category(self, func_name, allow_correlated):
    '''Return a key of the 'SUBQUERY_PREDICATE' weights, a tuple of
    (function name, 'AGG' or 'NON_AGG', 'CORRELATED' or 'UNCORRELATED'), chosen from
    the entries for "func_name". The entries for 'Scalar' are used if "func_name" has
    no entries of its own.

    Entries are excluded if their weight is zero, if they are 'AGG' while the 'AGG'
    select item weight is zero, or if they are 'CORRELATED' while correlation is not
    allowed. Returns None if no entry remains.
    '''
    weights = self.weights('SUBQUERY_PREDICATE')
    func_names = set(name for name, _, _ in weights)
    if func_name not in func_names:
      func_name = 'Scalar'
    allow_agg = self.weights('SELECT_ITEM_CATEGORY').get('AGG', 0)
    if allow_correlated and self.bounds('TABLE_COUNT')[1] == 0:
      allow_correlated = False
    weights = dict(((name, use_agg, use_correlated), weight)
                   for (name, use_agg, use_correlated), weight in weights.items()
                   if name == func_name
                   and (allow_agg or use_agg == 'NON_AGG')
                   and (allow_correlated or use_correlated == 'UNCORRELATED')
                   and weight)
    if weights:
      return self._choose_from_weights(weights)
    return None

  def use_distinct(self):
    '''Randomly decide based on the 'SELECT_DISTINCT' probability.'''
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_distinct_in_func(self):
    '''Randomly decide based on the 'SELECT_DISTINCT' probability (the same setting
    that use_distinct() uses).
    '''
    return self._decide_from_probability('MISC', 'SELECT_DISTINCT')

  def use_group_by_clause(self):
    '''Randomly decide based on the 'GROUP_BY' probability.'''
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'GROUP_BY')

  def use_having_clause(self):
    '''Randomly decide based on the 'HAVING' probability.'''
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'HAVING')

  def use_union_clause(self):
    '''Randomly decide based on the 'UNION' probability.'''
    return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'UNION')

  def use_union_all(self):
    '''Randomly decide based on the 'UNION_ALL' probability.'''
    return self._decide_from_probability('MISC', 'UNION_ALL')

  def get_query_execution(self):
    '''Return one of the keys of the 'QUERY_EXECUTION' weights.'''
    return self._choose_from_weights('QUERY_EXECUTION')

  def use_having_without_groupby(self):
    '''Always True in this profile.'''
    return True

  def use_nested_with(self):
    '''Always True in this profile.'''
    return True

  def use_lateral_join(self):
    '''Always False in this profile.'''
    return False

  def use_boolean_expr_for_lateral_join(self):
    '''Always False in this profile.'''
    return False

  def get_num_boolean_exprs_for_lateral_join(self):
    '''Always 0 in this profile.'''
    # This is a count, so return a number (0 == False, existing callers are unaffected).
    return 0

  # Workaround for Hive null ordering differences, and lack of 'NULL FIRST', 'NULL LAST'
  # specifications. The ref db will order nulls as specified for ASC sorting to make it
  # identical to Hive. Valid return values are: 'BEFORE', 'AFTER', or 'DEFAULT',
  # the latter means no specification needed.
  def nulls_order_asc(self):
    return 'DEFAULT'

  def choose_val_expr(self, val_exprs, types=TYPES):
    '''Return one of "val_exprs" whose type is one of "types". The type is chosen first
    (see choose_type()), then a value of that type is chosen with equal chance.

    "val_exprs" must provide a "by_type" mapping of type -> values. An Exception is
    raised if there is nothing to choose from.
    '''
    if not val_exprs:
      raise Exception('At least one value is required')
    if not types:
      raise Exception('At least one type is required')
    available_types = set(types) & set(val_exprs.by_type)
    if not available_types:
      raise Exception('None of the provided values return any of the required types')
    val_type = self.choose_type(available_types)
    return choice(val_exprs.by_type[val_type])

  def choose_constant(self, return_type=None, allow_null=True):
    '''Return a randomly generated constant, wrapped in "return_type". A type is chosen
    with choose_type() if "return_type" is not given. Unless "allow_null" is set,
    generated None values are discarded and a new value is generated.
    '''
    if not return_type:
      return_type = self.choose_type()
    val = self.constant_generator.generate_val(return_type)
    while val is None and not allow_null:
      val = self.constant_generator.generate_val(return_type)
    return return_type(val)

  def choose_type(self, types=TYPES):
    '''Return one of "types", chosen according to the 'TYPES' weights. An Exception is
    raised if none of the types has a non-zero weight.
    '''
    type_weights = self.weights('TYPES')
    # Types with a weight of zero are disabled. Leaving them out here makes the check
    # below meaningful, they could never be chosen anyhow.
    weights = dict(
        (type_, type_weights[type_]) for type_ in types if type_weights[type_])
    if not weights:
      raise Exception('None of the requested types are enabled')
    return self._choose_from_weights(weights)

  def choose_conjunct_disjunct_fill_ratio(self):
    '''Return the ratio of ANDs and ORs to use in a boolean function tree. For example,
    when creating a WHERE condition that consists of 10 nested functions, a ratio of
    0.1 means 1 out of the 10 functions in the WHERE clause will be an AND or OR.
    '''
    return random() * random()

  def choose_relational_func_fill_ratio(self):
    '''Return the ratio of relational functions to use in a boolean function tree. This
    ratio is applied after 'choose_conjunct_disjunct_fill_ratio()'.
    '''
    return random() * random()

  def choose_conjunct_disjunct(self):
    '''Return one of the keys of the 'CONJUNCT_DISJUNCTS' weights.'''
    return self._choose_from_weights('CONJUNCT_DISJUNCTS')

  def choose_relational_func_signature(self, signatures):
    '''Return a relational signature chosen from "signatures". A signature is considered
    to be relational if it returns a boolean and accepts more than one argument.
    Signatures that accept a subquery are not considered.

    An Exception is raised if there is no relational signature or if the function of a
    relational signature is missing from the 'RELATIONAL_FUNCS' weights.
    '''
    if not signatures:
      raise Exception('At least one signature is required')
    # A list (instead of filter()) keeps the emptiness check below valid on Python 3.
    signatures = [
        s for s in signatures
        if s.return_type == Boolean
        and len(s.args) > 1
        and not any(a.is_subquery for a in s.args)]
    if not signatures:
      raise Exception(
          'None of the provided signatures corresponded to a relational function')
    func_weights = self.weights('RELATIONAL_FUNCS')
    missing_funcs = set(s.func for s in signatures) - set(func_weights)
    if missing_funcs:
      raise Exception("Weights are missing for functions: %s"
          % ", ".join(sorted(getattr(func, '__name__', str(func))
                             for func in missing_funcs)))
    return self.choose_func_signature(signatures, func_weights)

  def choose_func_signature(self, signatures, _func_weights=None):
    '''Return a signature chosen from "signatures".

    First a function will be chosen then a signature. This is done so that the number
    of signatures a function has doesn't influence its likelihood of being chosen.

    "_func_weights" optionally maps functions to their weights. Only the functions
    that appear in "signatures" are considered. If it is not given (or none of its
    functions appear in "signatures"), the functions are weighted based on the types
    they use, see _get_type_based_func_weights().
    '''
    if not signatures:
      raise Exception('At least one signature is required')

    type_weights = self.weights('TYPES')

    func_weights = None
    if _func_weights:
      # A function without any of the given signatures must not be chosen, there would
      # be no signature to return for it.
      funcs_in_signatures = set(signature.func for signature in signatures)
      func_weights = dict((func, weight) for func, weight in _func_weights.items()
                          if func in funcs_in_signatures)
    if not func_weights:
      func_weights = self._get_type_based_func_weights(signatures, type_weights)
    func = self._choose_from_weights(func_weights)

    # Same idea as the type based function weights but for the signatures of the
    # selected function. The signatures are identified by their index.
    signature_weights = dict()
    signature_lengths = dict()
    for idx, signature in enumerate(func.signatures()):
      if signature not in signatures:
        continue
      weight, length = self._get_signature_weight_and_length(signature, type_weights)
      if weight:
        signature_weights[idx] = weight
        signature_lengths[idx] = length
    self._scale_weights_by_other_lengths(signature_weights, signature_lengths)
    idx = self._choose_from_weights(signature_weights)
    return func.signatures()[idx]

  def _get_type_based_func_weights(self, signatures, type_weights):
    '''Return a dict of function -> weight for the functions of "signatures".

    The weight of a signature is the product of the weights of its return type and its
    argument types, so a signature that uses a type with a weight of zero will not be
    used. The weight of a function is the maximum weight out of all of its signatures.
    To compensate for signatures of different lengths, the weight of each function is
    then multiplied by every distinct signature length other than its own.

    Example: type_weights = {Int: 10, Float: 1},
             signatures = [foo(Int) -> Int, foo(Float) -> Float, bar(Int, Float) -> Int]

             weight, length of foo(Int) = 10 * 10, 2
             weight, length of foo(Float) = 1 * 1, 2
             weight, length of bar(Int, Float) = 10 * 10 * 1, 3
             distinct lengths = {2, 3}
             func_weights = {foo: 100 * 3, bar: 100 * 2}

    An Exception is raised if all of the signatures have a weight of zero.
    '''
    func_weights = dict()
    # The length of the signature that determined the weight in func_weights
    length_by_func = dict()
    for signature in signatures:
      weight, length = self._get_signature_weight_and_length(signature, type_weights)
      if not weight:
        continue
      if signature.func not in func_weights or weight > func_weights[signature.func]:
        func_weights[signature.func] = weight
        length_by_func[signature.func] = length
    if not func_weights:
      raise Exception('All functions disallowed based on signature types')
    self._scale_weights_by_other_lengths(func_weights, length_by_func)
    return func_weights

  def _get_signature_weight_and_length(self, signature, type_weights):
    '''Return a tuple of (weight, length) for "signature". The weight is the product of
    the weights of the return type and all argument types. The length is the number of
    types that were multiplied.
    '''
    weight = type_weights[signature.return_type]
    length = 1
    for arg in signature.args:
      # The type of a subquery arg is a collection of types, each of them counts.
      arg_types = arg.type if arg.is_subquery else [arg.type]
      for arg_type in arg_types:
        weight *= type_weights[arg_type]
        length += 1
    return weight, length

  def _scale_weights_by_other_lengths(self, weights, lengths):
    '''Multiply, in place, each value of "weights" by every distinct value of "lengths"
    except for the length stored under the same key.
    '''
    distinct_lengths = set(lengths.values())
    for key in weights:
      for other_length in distinct_lengths - set([lengths[key]]):
        weights[key] *= other_length

  def allow_func_signature(self, signature):
    '''Return False if the return type or any of the argument types of "signature" has
    a weight of zero in the 'TYPES' weights, True otherwise.
    '''
    weights = self.weights('TYPES')
    if not weights[signature.return_type]:
      return False
    for arg in signature.args:
      # The type of a subquery arg is a collection of types.
      arg_types = arg.type if arg.is_subquery else [arg.type]
      if not all(weights[arg_type] for arg_type in arg_types):
        return False
    return True
|
|
|
|
|
|
class ImpalaNestedTypesProfile(DefaultProfile):
  '''A profile that raises the probability of WITH clauses and inline views to 0.3,
  enables lateral joins and does not use the 'TABLE_COUNT' bounds for the table count.
  '''

  def __init__(self):
    super(ImpalaNestedTypesProfile, self).__init__()
    query_clause_probabilities = self._probabilities['OPTIONAL_QUERY_CLAUSES']
    query_clause_probabilities['WITH'] = 0.3
    misc_probabilities = self._probabilities['MISC']
    misc_probabilities['INLINE_VIEW'] = 0.3

  def use_lateral_join(self):
    '''Return True with a probability of 50%.'''
    roll = random()
    return roll < 0.5

  def use_boolean_expr_for_lateral_join(self):
    '''Return True with a probability of 20%.'''
    roll = random()
    return roll < 0.2

  def get_num_boolean_exprs_for_lateral_join(self):
    '''Return 0 with a probability of 80%. Otherwise return a count that starts at 1
    and is incremented as long as a random draw stays below 0.6.
    '''
    if random() >= 0.8:
      expr_count = 1
      while random() < 0.6:
        expr_count += 1
      return expr_count
    return 0

  def get_table_count(self):
    '''Return a table count of at least 1. The chance of adding one more table is
    0.85 ** (current count), so it shrinks as the count grows.
    '''
    table_count = 1
    while True:
      if random() >= 0.85 ** table_count:
        return table_count
      table_count += 1
|
|
|
|
|
|
# This profile was added for ad-hoc testing.
|
|
class TestFunctionProfile(DefaultProfile):
  '''A profile that prefers the signatures of functions with "DistinctFrom" in their
  name whenever such signatures are available.
  '''

  def choose_func_signature(self, signatures, _func_weights=None):
    '''Return a signature chosen from "signatures". If any signature belongs to a
    function with "DistinctFrom" in its name, only those signatures are considered.

    "_func_weights" has the same meaning as in DefaultProfile.choose_func_signature().
    It must be accepted here because choose_relational_func_signature() passes it.
    '''
    if not signatures:
      raise Exception('At least one signature is required')
    # A list (instead of filter()) keeps the emptiness check valid on Python 3.
    preferred_signatures = [s for s in signatures if "DistinctFrom" in s.func._NAME]
    if preferred_signatures:
      signatures = preferred_signatures
    if _func_weights:
      # Drop the weights of functions that have no signature left to choose from, such
      # a function must not be selected.
      remaining_funcs = set(s.func for s in signatures)
      _func_weights = dict((func, weight) for func, weight in _func_weights.items()
                           if func in remaining_funcs)
    return super(TestFunctionProfile, self).choose_func_signature(
        signatures, _func_weights)
|
|
|
|
|
|
class HiveProfile(DefaultProfile):
  '''A profile with Hive specific overrides: no HAVING without GROUP BY, no nested
  WITH, nulls ordered 'BEFORE' and additional restrictions on function signatures.
  '''

  def __init__(self):
    super(HiveProfile, self).__init__()

  def use_having_without_groupby(self):
    '''Always False in this profile.'''
    return False

  def use_nested_with(self):
    '''Always False in this profile.'''
    return False

  def nulls_order_asc(self):
    '''Always 'BEFORE' in this profile.'''
    return 'BEFORE'

  def allow_func_signature(self, signature):
    '''Return False for functions whose name starts with 'DateAdd' and for 'Greatest'
    and 'Least' signatures that mix types. Everything else is decided by
    DefaultProfile.allow_func_signature().
    '''
    func_name = signature.func._NAME
    if func_name.startswith('DateAdd'):
      return False
    if func_name in ('Greatest', 'Least'):
      # Every arg type must match the return type. If there is no return type, the
      # type of the first arg is used as the reference instead.
      expected_type = signature.return_type
      for arg_type in [arg.type for arg in signature.args]:
        if expected_type is None:
          expected_type = arg_type
        elif expected_type != arg_type:
          return False
    return DefaultProfile.allow_func_signature(self, signature)
|
|
|
|
|
|
# All of the classes defined in (or imported into) this module whose name ends with
# 'Profile'. At module level globals() is the same namespace as locals().
PROFILES = [
    var for var in globals().values()
    if isinstance(var, type) and var.__name__.endswith('Profile')]
|