Files
impala/tests/comparison/query_generator.py
casey b013495e1d Misc updates to the query generator (part 1 of 2)
Summary of changes:

  1) Simplified type system. The old system was overly complicated for
     the task of query generation. The modeling of types used to mirror
     the types used in Impala. For simplicity, new system only uses a
     subset of types, Boolean, Char, Decimal, Float, Int, and Timestamp.

  2) Functions now have fully typed signatures. Previously you had to
     know which functions accepted which inputs, now arbitrary
     permutations of functions can be generated. The chance of being
     able to add a new function without needing to change the query
     generation logic is much higher now.

  3) Query generation profiles. The randomness of the previous version
     was hardcoded in various places throughout the query generator.
     Now there is a profile to determine which SQL features should be
     used. There is still a lot of room for improvement in terms of
     intuitiveness and documentation for configuring the profiles.

  4) Greater diversity of queries. Besides the function permutations,
     various restrictions to simplify query generation have been
     removed. Also constants are used in queries.

  5) Eliminate spinning and infinite loops. The old version would
     sometimes "hope" that a generated SQL element would be compatible
     with the context and, if not, it would try again, which could lead
     to noticeable spinning and/or infinite loops.

  6) Catch up with Impala 2.0 features: subqueries, analytics, and
     Char/VarChar.

Change-Id: Ia25f4e85d6a06f7958a906aa42d9f90d63675bc0
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/5640
Reviewed-by: Casey Ching <casey@cloudera.com>
Tested-by: jenkins
2014-12-19 03:30:44 -08:00

1219 lines
54 KiB
Python

# Copyright (c) 2014 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from copy import deepcopy
from itertools import ifilter
from logging import getLogger
from random import shuffle, choice, randint, randrange
from tests.comparison.common import TableExprList, ValExpr, ValExprList
from tests.comparison.funcs import (
AGG_FUNCS,
AggFunc,
ANALYTIC_FUNCS,
AnalyticFunc,
And,
Coalesce,
Equals,
Func,
FUNCS,
PartitionByClause,
Trim,
WindowBoundary,
WindowClause)
from tests.comparison.types import (
Char,
Boolean,
Int,
JOINABLE_TYPES,
String,
TYPES,
VarChar)
from tests.comparison.query import (
FromClause,
GroupByClause,
HavingClause,
InlineView,
JoinClause,
LimitClause,
OrderByClause,
Query,
SelectClause,
SelectItem,
Subquery,
UnionClause,
WhereClause,
WithClause,
WithClauseInlineView)
UNBOUNDED_PRECEDING = WindowBoundary.UNBOUNDED_PRECEDING
PRECEDING = WindowBoundary.PRECEDING
CURRENT_ROW = WindowBoundary.CURRENT_ROW
FOLLOWING = WindowBoundary.FOLLOWING
UNBOUNDED_FOLLOWING = WindowBoundary.UNBOUNDED_FOLLOWING
LOG = getLogger(__name__)
class QueryGenerator(object):
def __init__(self, query_profile):
self.profile = query_profile
self.queries_under_construction = list()
self.max_nested_query_count = None
@property
def current_query(self):
if self.queries_under_construction:
return self.queries_under_construction[-1]
@property
def root_query(self):
if self.queries_under_construction:
return self.queries_under_construction[0]
def create_query(self,
table_exprs,
allow_with_clause=True,
allow_union_clause=True,
select_item_data_types=None,
required_select_item_type=None,
required_table_expr_col_type=None,
create_aggregate=None,
table_alias_prefix='t'):
'''Create a random query using various language features.
The initial call to this method should only use tables in the table_exprs
parameter, and not inline views or WITH definitions. The other types of
table exprs may be added as part of the query generation.
Due to implementation limitations, nested queries are not distributed evenly by
default even if the query profile assigns an equal likelihood of use to each
possible nested query decision. This is because the implementation chooses nested
queries in a fixed order (WITH -> FROM -> WHERE). For example if only one nested
query were allowed and each clause had a 50% chance of using a subquery, the
resulting set of generated queries would contain a subquery in the WITH clause
50% of the time, in the FROM 25% of the time, and WHERE 12.5% of the time. If the
max nested queries were two, it's most likely that the generated query would
contain a WITH clause which has the second nested query within it....
If select_item_data_types is specified it must be a sequence or iterable of
DataType. The generated query.select_clause.select will have data types suitable
for use in a UNION.
required_select_item_type may be set to a DataType to force at least one of the
SELECT items to be of the given type. This can be used to ensure that inline
views will have at least one joinable column.
required_table_expr_col_type may be set to ensure that at least one of the
TableExprs used in the FROM clause will have a column of the given type. This
can be used to unsure that a correlation condition can be made for a Subquery.
create_aggregate can be set to True or False to force or disable the creation
of an aggregate query. This is used during Subquery creation where the context
may require an aggregate or non-aggregate.
'''
query = Query()
query.parent = self.current_query
self.queries_under_construction.append(query)
self.profile.query = query
# Make a copy so tables can be added if a WITH clause is used
table_exprs = TableExprList(table_exprs)
if self.max_nested_query_count is None:
self.max_nested_query_count = self.profile.get_max_nested_query_count()
elif self.max_nested_query_count == 0:
raise Exception('Maximum nested query count exceeded')
else:
self.max_nested_query_count -= 1
with_clause = None
if allow_with_clause \
and self.allow_more_nested_queries \
and self.profile.use_with_clause():
with_clause = self._create_with_clause(table_exprs)
table_exprs.extend(with_clause.table_exprs)
query.with_clause = with_clause
from_clause = self._create_from_clause(
table_exprs, table_alias_prefix, required_table_expr_col_type)
query.from_clause = from_clause
select_clause = self._create_select_clause(
from_clause.visible_table_exprs,
select_item_data_types,
required_select_item_type,
create_aggregate)
query.select_clause = select_clause
if self.profile.use_where_clause():
query.where_clause = self._create_where_clause(
from_clause.visible_table_exprs, table_exprs, table_alias_prefix)
# If agg and non-agg SELECT items are present then a GROUP BY is required otherwise
# it's optional and is effectively a "SELECT DISTINCT".
if (select_clause.agg_items and select_clause.basic_items) \
or (create_aggregate is None \
and select_clause.basic_items \
and self.profile.use_group_by_clause()):
group_by_items = [item for item in select_clause.basic_items
if not item.val_expr.is_constant]
if group_by_items:
query.group_by_clause = GroupByClause(group_by_items)
# Impala doesn't support DISTINCT with analytics or "SELECT DISTINCT" when
# GROUP BY is used.
if not select_clause.analytic_items \
and not (query.group_by_clause and not select_clause.agg_items) \
and self.profile.use_distinct():
if select_clause.agg_items:
self._enable_distinct_on_random_agg_items(select_clause.agg_items)
else:
select_clause.distinct = True
if self.profile.use_having_clause() \
and (query.group_by_clause or select_clause.agg_items):
basic_select_item_exprs = \
ValExprList(item.val_expr for item in select_clause.basic_items)
query.having_clause = self._create_having_clause(
from_clause.visible_table_exprs, basic_select_item_exprs)
if allow_union_clause \
and self.allow_more_nested_queries \
and self.profile.use_union_clause():
data_type_candidates_by_base_type = defaultdict(list)
for data_type in TYPES:
data_type_candidates_by_base_type[data_type.get_base_type()].append(data_type)
select_item_data_types = list()
for select_item in select_clause.items:
select_item_data_types.append(
choice(data_type_candidates_by_base_type[select_item.val_expr.base_type]))
query.union_clause = UnionClause(self.create_query(
table_exprs,
allow_with_clause=False,
select_item_data_types=select_item_data_types))
query.union_clause.all = self.profile.use_union_all()
self.queries_under_construction.pop()
if self.queries_under_construction:
self.profile.query = self.queries_under_construction[-1]
else:
self.profile.query = None
self.max_nested_query_count = None
return query
@property
def allow_more_nested_queries(self):
return self.max_nested_query_count > 0
def _create_with_clause(self, table_exprs):
# Make a copy so newly created tables can be added and made available for use in
# future table definitions.
table_exprs = TableExprList(table_exprs)
with_clause_inline_views = TableExprList()
for with_clause_inline_view_idx \
in xrange(self.profile.get_with_clause_table_ref_count()):
query = self.create_query(table_exprs)
with_clause_alias_count = getattr(self.root_query, 'with_clause_alias_count', 0) + 1
self.root_query.with_clause_alias_count = with_clause_alias_count
with_clause_inline_view = \
WithClauseInlineView(query, 'with_%s' % with_clause_alias_count)
table_exprs.append(with_clause_inline_view)
with_clause_inline_views.append(with_clause_inline_view)
if not self.allow_more_nested_queries:
break
return WithClause(with_clause_inline_views)
def _create_select_clause(self,
table_exprs,
select_item_data_types,
required_select_item_type,
create_aggregate):
# XXX: Prevent GROUP BY float_col, the groupings will differ across databases.
if select_item_data_types:
select_item_data_types = tuple(select_item_data_types)
if select_item_data_types \
and required_select_item_type \
and not issubclass(required_select_item_type, select_item_data_types):
raise Exception('Required select item type is not in allowed types')
if select_item_data_types:
desired_item_count = len(select_item_data_types)
else:
desired_item_count = self.profile.get_select_item_count()
basic_count, agg_count, analytic_count = \
self.profile.split_select_item_count(desired_item_count, create_aggregate)
select_items = list()
for item_idx in xrange(desired_item_count):
if select_item_data_types:
col_type = select_item_data_types[item_idx]
else:
if desired_item_count - 1 == item_idx and required_select_item_type:
col_types = [required_select_item_type]
else:
col_types = table_exprs.col_types
col_type = self.profile.choose_type(col_types)
if required_select_item_type and issubclass(col_type, required_select_item_type):
required_select_item_type = None
category_idx = randint(0, basic_count + agg_count + analytic_count - 1)
if category_idx < basic_count:
select_items.append(self._create_basic_select_item(table_exprs, col_type))
basic_count -= 1
elif category_idx < basic_count + agg_count:
select_items.append(('AGG', col_type))
agg_count -= 1
else:
select_items.append(('ANALYTIC', col_type))
analytic_count -= 1
select_item_exprs = \
ValExprList(item.val_expr for item in select_items if type(item) == SelectItem)
for idx, item in enumerate(select_items):
if type(item) == tuple and item[0] == 'AGG':
select_items[idx] = \
self._create_agg_select_item(table_exprs, select_item_exprs, item[1])
for item in select_items:
if type(item) == tuple:
continue
if item.is_agg:
select_item_exprs.append(item.val_expr)
for idx, item in enumerate(select_items):
if type(item) == tuple:
select_items[idx] = self._create_analytic_select_item(
table_exprs,
select_item_exprs,
len(select_item_exprs) == 0 and analytic_count == 1,
item[1])
# So far all the SELECT items are defined and set but none of them have aliases. If
# an item is a simple column reference, then it will only get an alias if there is a
# conflict with another simple column ref. All other item types, such as functions
# or constants, will always have an alias.
item_name_counts = defaultdict(int)
for item in select_items:
if item.alias or not item.val_expr.is_col:
continue
if item.val_expr.name in item_name_counts:
item.alias = '*CONFLICT*'
else:
item_name_counts[item.val_expr.name] += 1
for item in select_items:
if item.alias == '*CONFLICT*' or (not item.val_expr.is_col and not item.alias):
# Use names close to the Impala functional test database so that bugs in
# resolution will be more likely to surface.
alias = '%s_col' % item.type.__name__.lower()
if alias in item_name_counts:
item.alias = alias + '_%s' % (item_name_counts[alias] + 1)
else:
item.alias = alias
item_name_counts[alias] += 1
return SelectClause(select_items)
def _create_basic_select_item(self, table_exprs, return_type):
max_children = self.profile.choose_nested_expr_count()
if max_children:
value = self._create_func_tree(return_type)
value = self._populate_func_with_vals(value, table_exprs)
elif return_type in table_exprs.col_types:
value = self.profile.choose_val_expr(table_exprs.cols_by_type[return_type])
else:
value = self.profile.choose_constant(return_type)
return SelectItem(value)
def _create_func_tree(self, return_type, allow_subquery=False):
'''Returns an instance of a basic function that has all of it's arguments either set
to None or another instance of a function that has it's arguments set likewise. The
caller should replace the None values with column references or constants as
desired. The depth of the tree is determined by the query profile (self.profile).
'''
signatures = self._funcs_to_allowed_signatures(FUNCS)
root_signatures = self._find_matching_signatures(
signatures, return_type=return_type, allow_subquery=allow_subquery)
root_signature = self.profile.choose_func_signature(root_signatures)
func = root_signature.func(root_signature) # An instance of a function
max_children = self.profile.choose_nested_expr_count()
if max_children:
# Impala does not allow functions that contain subqueries to have arguments
# that contain subqueries. Ex: ... WHERE (int_col IN (SELECT 1)) IN (SELECT TRUE)
subquery_allowed_null_args = list()
subquery_not_allowed_null_args = list()
if func.contains_subquery:
null_args = subquery_not_allowed_null_args
else:
null_args = subquery_allowed_null_args
null_args = [(func, idx) for idx, arg in enumerate(func.args)
if type(arg) != list and arg.val is None]
while max_children \
and (subquery_allowed_null_args or subquery_not_allowed_null_args):
idx = randrange(
len(subquery_allowed_null_args) + len(subquery_not_allowed_null_args))
if idx < len(subquery_allowed_null_args):
null_args = subquery_allowed_null_args
else:
null_args = subquery_not_allowed_null_args
shuffle(null_args)
parent_func, parent_arg_idx = null_args.pop()
child_signatures = self._find_matching_signatures(
signatures,
return_type=parent_func.args[parent_arg_idx].type,
allow_subquery=(allow_subquery and null_args == subquery_allowed_null_args))
child_signature = self.profile.choose_func_signature(child_signatures)
child_func = child_signature.func(child_signature)
parent_func.args[parent_arg_idx] = child_func
if child_func.contains_subquery:
null_args = subquery_not_allowed_null_args
else:
null_args = subquery_allowed_null_args
null_args.extend((child_func, idx) for idx, arg in enumerate(child_func.args)
if type(arg) != list and arg.val is None)
max_children -= 1
return func
def _funcs_to_allowed_signatures(self, funcs):
'''Return a list of the signatures contained in "funcs" that are eligible for use
based on the query profile.
'''
return [signature for func in funcs for signature in func.signatures()
if self.profile.allow_func_signature(signature)]
def _find_matching_signatures(self,
signatures,
return_type=None,
accepts=None,
allow_subquery=False):
matching_signatures = list()
for signature in signatures:
if return_type and not issubclass(signature.return_type, return_type):
continue
if accepts and not any(not arg.is_subquery and issubclass(arg.type, accepts)
for arg in signature.args):
continue
if not allow_subquery and any(arg.is_subquery for arg in signature.args):
continue
matching_signatures.append(signature)
return matching_signatures
def _populate_func_with_vals(self,
func,
table_exprs=TableExprList(),
val_exprs=ValExprList(),
table_alias_prefix='',
allow_subquery=False,
_allow_table_exprs=None):
if not _allow_table_exprs and func.is_agg:
_allow_table_exprs = True
elif _allow_table_exprs is None and func.contains_agg:
_allow_table_exprs = False
elif _allow_table_exprs is None:
_allow_table_exprs = True
# If a function's return type depends on some of its args then at least one of those
# args must not be the NULL literal. Example: IF(false, NULL, NULL) is considered
# invalid because the return type cannot be determined.
has_non_null_literal_arg = False
for idx, arg in enumerate(func.args):
signature_arg = func.signature.args[idx]
if signature_arg.is_subquery \
or (allow_subquery \
and self.allow_more_nested_queries \
and self.profile.use_scalar_subquery()):
usage = self.profile.choose_subquery_predicate_category(
func.name(),
self.current_query.from_clause.table_exprs.joinable_cols_by_type)
if usage is not None \
and self.allow_more_nested_queries \
and (usage[1] == 'UNCORRELATED'
or self.current_query.from_clause.table_exprs.joinable_cols_by_type):
use_scalar_subquery = (usage[0] == 'Scalar')
use_agg_subquery = (usage[1] == 'AGG')
use_correlated_subquery = (usage[2] == 'CORRELATED')
if use_correlated_subquery:
join_expr_type = self.profile.choose_type(list(
self.current_query.from_clause.table_exprs.joinable_cols_by_type))
else:
join_expr_type = None
select_item_data_types = \
[signature_arg.type] if use_scalar_subquery else signature_arg.type
query = self.create_query(
table_exprs,
select_item_data_types=select_item_data_types,
required_table_expr_col_type=join_expr_type,
create_aggregate=use_agg_subquery,
# Don't use UNION + LIMIT; https://issues.cloudera.org/browse/IMPALA-1379
allow_union_clause=(not signature_arg.is_subquery),
table_alias_prefix=(table_alias_prefix +
('t' if use_correlated_subquery else '')))
if use_scalar_subquery and not use_agg_subquery:
# Impala will assume the query will return more than one row unless a LIMIT 1
# is added. An ORDER BY will also be added under the assumption that we want
# deterministic results.
query.order_by_clause = OrderByClause([Int(1)])
query.limit_clause = LimitClause(Int(1))
if use_correlated_subquery:
outer_table_expr = choice(
self.current_query.from_clause.table_exprs.by_col_type[join_expr_type])
inner_table_expr = choice(
query.from_clause.table_exprs.by_col_type[join_expr_type])
correlation_condition = self._create_relational_join_condition(
outer_table_expr, inner_table_expr)
if query.where_clause:
query.where_clause.boolean_expr = And.create_from_args(
query.where_clause.boolean_expr, correlation_condition)
else:
query.where_clause = WhereClause(correlation_condition)
func.args[idx] = Subquery(query)
else:
replacement_func = self._create_func_tree(func.type)
return self._populate_func_with_vals(
replacement_func,
table_exprs=table_exprs,
val_exprs=val_exprs,
table_alias_prefix=table_alias_prefix,
allow_subquery=allow_subquery,
_allow_table_exprs=_allow_table_exprs)
else:
if arg.is_constant and arg.val is None:
candidate_val_exprs = ValExprList()
if val_exprs:
candidate_val_exprs.extend(val_exprs.by_type[arg.type])
if _allow_table_exprs:
candidate_val_exprs.extend(table_exprs.cols_by_type[arg.type])
if candidate_val_exprs:
val = self.profile.choose_val_expr(candidate_val_exprs)
else:
val = self.profile.choose_constant(
return_type=arg.type,
allow_null=(signature_arg.can_be_null \
and signature_arg.can_be_null_literal \
and (has_non_null_literal_arg \
or not signature_arg.determines_signature)))
func.args[idx] = val
arg = val
elif arg.is_func:
func.args[idx] = self._populate_func_with_vals(
arg,
table_exprs=table_exprs,
val_exprs=val_exprs,
_allow_table_exprs=_allow_table_exprs)
if not signature_arg.can_be_null and not arg.is_constant:
val = self.profile.choose_constant(return_type=arg.type, allow_null=False)
func.args[idx] = Coalesce.create_from_args(arg, val)
if not arg.is_constant or not arg.val is None:
has_non_null_literal_arg = True
return func
def _create_agg_select_item(self, table_exprs, basic_select_item_exprs, return_type):
value = self._create_agg_func_tree(return_type)
value = self._populate_func_with_vals(value, table_exprs, basic_select_item_exprs)
return SelectItem(value)
def _create_agg_func_tree(self, return_type):
return self._create_agg_or_analytic_tree(return_type, agg_funcs=AGG_FUNCS)
def _create_agg_or_analytic_tree(self, return_type, agg_funcs=[], analytic_funcs=[]):
'''Returns an instance of a function that is guaranteed to either be or contain an
aggregate or analytic function. The arguments of the returned function will either
be None or an instance of a function as in _create_func_tree.
The chosen aggregate or analytic functions will be restricted to the list of
functions in agg_funcs and analytic_funcs.
If analytic_funcs is non-empty the returned function will be guaranteed to
be an analytic or contain at least on analytic function.
return_type must be set and refers to the data type of the function output.
agg_funcs and analytic_funcs should be used to determine the class of the
returned function. The caller is responsible for restricting the return_type
to types that can be generated by permutations of available functions. If the max
nested expr count in the query profile is at least one, then any return_type
should be possible to generate but this is not guaranteed.
'''
# The creation of aggregate and analytic functions is so similar that they are
# combined here. "basic" function creation is much simpler so that is kept separate.
# What's going to happen is there will be essentially two important data structures:
#
# 1) A tree of functions, the root of which will be returned. The leaves of the
# tree are "place holders" which are actually instances of a concrete DataType,
# such as Int, with a value of None. In other words, the arguments to all
# functions are either other functions or SQL NULL.
#
# 2) A mapping to place holders from the type of function that they may be replaced
# with. The actual data structure is
# dict<func class> -> list<tuple<tree node, index of place holder in tree node>>
# where "func class" is one of "AggFunc", "AnalyticFunc" or "Func".
#
# This means once a child function is generated, a spot where it can be placed into
# the tree can easily be identified. Although the work is actually done in reverse
# order, a place holder is chosen, then a replacement is generated.
if not agg_funcs and not analytic_funcs:
raise Exception('At least one analytic or aggregate function is required')
basic_signatures_by_return_type = self._group_signatures_by_return_type(
self._find_matching_signatures(
self._funcs_to_allowed_signatures(FUNCS),
allow_subquery=False))
agg_signatures_by_return_type = self._group_signatures_by_return_type(
self._funcs_to_allowed_signatures(agg_funcs))
analytic_signatures_by_return_type = self._group_signatures_by_return_type(
self._funcs_to_allowed_signatures(analytic_funcs))
if analytic_funcs:
return_class = AnalyticFunc
return_signatures_by_return_type = analytic_signatures_by_return_type
else:
return_class = AggFunc
return_signatures_by_return_type = agg_signatures_by_return_type
min_children, max_children = self.profile.bounds('MAX_NESTED_EXPR_COUNT')
if max_children == 0 and return_type not in return_signatures_by_return_type:
raise Exception(('At least one child expr is required to create a %s expr'
+ ' using an %s function')
% (return_type.__name__,
'aggregate' if return_class == AggFunc else 'analytic'))
if min_children == 0 and return_type not in return_signatures_by_return_type:
min_children = 1
max_children = self.profile._choose_from_bounds(min_children, max_children)
if not max_children:
signature = self.profile.choose_func_signature(
return_signatures_by_return_type[return_type])
return signature.func(signature)
root_func = None
# Every time a function is created its arguments will initially be NULL. Those NULL
# arguments may be replaced by other functions up to "max_children". The types of
# valid child functions depends on the type of parent function.
#
# * Analytics may not contain other analytics
# * Analytics may contain aggregates or basic functions
# * Aggregates may not contain other aggregates or analytics
# * Aggregates may contain basic functions
# * Basic functions may contain any function so long as the above conditions are
# not broken (indirectly).
null_args_by_func_allowed = {AggFunc: list(), AnalyticFunc: list(), Func: list()}
while max_children:
null_arg_pool = None
parent_func, parent_arg_idx = None, None
chosen_signature= None
# Since aggregate (and let's assume analytic functions) return a limited set of
# types, some prep work may be needed if the return type isn't in the set of
# directly producible types. For example if a Boolean is desired and there are
# only two children remaining, the next child must be something that accepts an
# aggregate and returns a Boolean, such as GreaterThan.
if (not root_func and max_children == 1) \
or (root_func and max_children == 2 \
and ((return_class == AggFunc and not root_func.contains_agg)
or (return_class == AnalyticFunc and not root_func.contains_analytic))):
if root_func:
idx = randrange(len(null_args_by_func_allowed[return_class])
+ len(null_args_by_func_allowed[Func]))
use_return_class = idx < len(null_args_by_func_allowed[return_class])
if use_return_class:
null_arg_pool = null_args_by_func_allowed[return_class]
signature_pools_by_return_type = return_signatures_by_return_type
else:
null_arg_pool = null_args_by_func_allowed[Func]
signature_pools_by_return_type = basic_signatures_by_return_type
shuffle(null_arg_pool)
parent_func, parent_arg_idx = null_arg_pool.pop()
parent_arg_type = parent_func.args[parent_arg_idx].type
if use_return_class:
signature_pool = signature_pools_by_return_type[parent_arg_type]
else:
# This is a basic functions so it needs to accept one of the types returned
# by the desired return_class.
signature_pool = self._find_matching_signatures(
signature_pools_by_return_type[parent_arg_type],
accepts=tuple(return_signatures_by_return_type))
else:
signature_pool = list()
if return_type in return_signatures_by_return_type:
signature_pool.extend(return_signatures_by_return_type[return_type])
if return_type in basic_signatures_by_return_type:
signature_pool.extend(self._find_matching_signatures(
basic_signatures_by_return_type[return_type],
accepts=tuple(return_signatures_by_return_type)))
chosen_signature = self.profile.choose_func_signature(signature_pool)
elif max_children == 1 \
and ((return_class == AggFunc and not root_func.contains_agg)
or (return_class == AnalyticFunc and not root_func.contains_analytic)):
# This is the last iteration and the generated function still doesn't contain
# an instance of the desired function class. From the setup above, at least
# one of the available place holder arguments can be replaced by an aggregate or
# analytic.
null_arg_pool = null_args_by_func_allowed[return_class]
if not null_arg_pool:
raise Exception(
'No leaves in the expr tree may be replaced by an %s function'
% ('aggregate' if return_class == AggFunc else 'analytic'))
shuffle(null_arg_pool)
return_types = tuple(return_signatures_by_return_type)
while null_arg_pool:
parent_func, parent_arg_idx = null_arg_pool.pop()
parent_arg_type = parent_func.args[parent_arg_idx].type
if issubclass(parent_arg_type, return_types):
break
if not null_arg_pool:
raise Exception('No functions could accept an %s function'
% ('aggregate' if return_class == AggFunc else 'analytic'))
chosen_signature = self.profile.choose_func_signature(
return_signatures_by_return_type[parent_arg_type])
elif not root_func:
# No restrictions, just choose a root_func that returns the needed type.
signature_pool = list()
if return_type in return_signatures_by_return_type:
signature_pool.extend(return_signatures_by_return_type[return_type])
if return_type in basic_signatures_by_return_type:
signature_pool.extend(basic_signatures_by_return_type[return_type])
chosen_signature= self.profile.choose_func_signature(signature_pool)
else:
# A root_func was chosen and it's children are in one or more of the
# null_args_by_func_allowed pools. A pool will be chosen, then a child function.
null_arg_counts_by_pool = dict((pool_category, len(pool)) for pool_category, pool
in null_args_by_func_allowed.iteritems())
# There is a special case that would lead to a dead end. If there is only one
# distinct place holder across all the pools and an analytic is still needed,
# then that place holder cannot be replaced by an aggregate since aggregates
# are not allowed to contain analytics. Example: an analytic is impossible once
# the tree is FLOOR(AVG(NULL)).
if return_class == AnalyticFunc \
and not root_func.contains_analytic \
and len(null_args_by_func_allowed[AnalyticFunc]) == 1 \
and null_args_by_func_allowed[AnalyticFunc][0] \
in null_args_by_func_allowed[AggFunc]:
null_arg_counts_by_pool[AggFunc] = 0
null_arg_pool_category = \
self.profile._choose_from_weights(null_arg_counts_by_pool)
null_arg_pool = null_args_by_func_allowed[null_arg_pool_category]
shuffle(null_arg_pool)
parent_func, parent_arg_idx = null_arg_pool.pop()
parent_arg_type = parent_func.args[parent_arg_idx].type
if null_arg_pool_category == AggFunc:
signature_pool = agg_signatures_by_return_type[parent_arg_type]
elif null_arg_pool_category == AnalyticFunc:
signature_pool = analytic_signatures_by_return_type[parent_arg_type]
else:
signature_pool = basic_signatures_by_return_type[parent_arg_type]
chosen_signature = self.profile.choose_func_signature(signature_pool)
chosen_func = chosen_signature.func(chosen_signature)
if root_func:
max_children -= 1
else:
root_func = chosen_func
if parent_func:
# Remove the place holder from all of the other pools.
for pool_category, pool in null_args_by_func_allowed.iteritems():
for null_arg_idx, (func, arg_idx) in enumerate(pool):
if func is parent_func and arg_idx == parent_arg_idx:
del pool[null_arg_idx]
# Replace the place holder with the child function
parent_func.args[parent_arg_idx] = chosen_func
parent_func.validate()
chosen_func.parent = parent_func
else:
chosen_func.parent = None
# Place the args of the chosen function into the appropriate pools. Aggregate and
# analytic functions have different rules about which functions they accept as
# arguments. If the chosen function is neither then the tree must be inspected to
# find the first aggregate or analytic ancestor.
child_null_arg_pools = set()
node = chosen_func
while True:
if node.is_analytic:
child_null_arg_pools.add(AggFunc)
child_null_arg_pools.add(Func)
break
elif node.is_agg:
child_null_arg_pools.add(Func)
break
node = node.parent
if not node:
# This means the root_func is a non-aggregate and non-analytic, it can accept
# anything.
child_null_arg_pools.add(AggFunc)
child_null_arg_pools.add(Func)
if return_class == AnalyticFunc:
child_null_arg_pools.add(AnalyticFunc)
break
for func_allowed in child_null_arg_pools:
null_args = null_args_by_func_allowed[func_allowed]
for idx, arg in enumerate(chosen_func.args):
if arg.val is not None:
# Some functions come with unusual arguments pre-populated.
# Ex: The analytic function LEAD(foo, constant) will have "constant" non-NULL.
continue
if func_allowed == AggFunc:
returnable_types = agg_signatures_by_return_type
elif func_allowed == AnalyticFunc:
returnable_types = analytic_signatures_by_return_type
else:
returnable_types = basic_signatures_by_return_type
# Skip place holders that are impossible to replace. The function selection
# logic depends on this.
if arg.type not in returnable_types:
continue
null_args.append((chosen_func, idx))
if not any(null_args_by_func_allowed.itervalues()):
# Some analytic functions take no arguments. Ex: ROW_NUM()
break
if (return_class == AggFunc and not root_func.contains_agg) \
or (return_class == AnalyticFunc and not root_func.contains_analytic):
raise Exception('Unable to create an %s function'
% ('aggregate' if return_class == AggFunc else 'analytic'))
return root_func
def _create_analytic_select_item(self,
table_exprs,
select_item_exprs,
is_only_item,
return_type):
'''Create and return a list of SelectItems where each item includes an anayltic
function in its val_expr.
table_exprs must be a non-empty collection of tables exprs to choose columns from.
select_items may be an empty collection. If not empty, anayltic function
expressions will be restricted so that PARTITION BY and ORDER BY values are
chosen from within this set. If empty, any random expr may be chosen.
is_only_item should be True if this analytic item will be the only SelectItem in
the SELECT clause. When True, an additional analytic variation that would
otherwise produce a non-deterministic result set is enabled.
'''
# Results may not be deterministic when analytics have an ORDER BY. The code below
# will assume we want deterministic results though we know this may not be true
# for certain use cases such as just searching for crashes where results will be
# thrown away.
#
# These cases will produce a deterministic result when ORDER BY is used
#
# 1) There is only one SELECT item and the analytic function has no argument
# or the ORDER BY includes the argument.
#
# 2) The analytic has set of ORDER BY expressions that has a deterministic
# result. Ex: ORDER BY primary_key. This is only reliable if there are no
# JOINs.
#
# 3) A window is explicitly specified as UNBOUNDED PRECEDING to UNBOUNDED
# FOLLOWING and either the analytic function is an aggregate or the
# analytic function is FIRST_VALUE or LAST_VALUE.
#
# 4) RANK is a special case, it is intrinsically deterministic
excluded_designs = list()
if len(self.queries_under_construction) > 1:
excluded_designs.append('TOP_LEVEL_QUERY_WITHOUT_LIMIT')
if not is_only_item:
excluded_designs.append('ONLY_ITEM')
if len(table_exprs) == 1:
unique_col_combos = []
if table_exprs[0].unique_cols:
if select_item_exprs:
select_cols = set(val_expr for val_expr in select_item_exprs if val_expr.is_col)
for unique_cols in table_exprs[0].unique_cols:
if unique_cols <= select_cols:
unique_col_combos.append(unique_cols)
else:
unique_col_combos = table_exprs[0].unique_cols
if not unique_col_combos:
excluded_designs.append('DETERMINISTIC_ORDER')
else:
excluded_designs.append('DETERMINISTIC_ORDER')
allow_agg = any(ifilter(lambda expr: expr.contains_agg, select_item_exprs))
value = self._create_analytic_func_tree(return_type, excluded_designs, allow_agg)
value = self._populate_func_with_vals(
value,
table_exprs=TableExprList(),
val_exprs=ValExprList())
self._populate_analytic_clauses(value, table_exprs, select_item_exprs)
return SelectItem(value)
def _create_analytic_func_tree(self, return_type, excluded_designs, allow_agg):
# The results of queries with analytic functions may not be deterministic. Let's
# assume deterministic results are required. To generate deterministic results, the
# structure of queries will be limited to one of several "designs". The goal of each
# design is to ensure that results are deterministic. See the notes in
# _create_analytic_select_item for more info on designs.
designs = set(self.profile.allowed_analytic_designs())
for design in excluded_designs:
if design in designs:
designs.remove(design)
window_weigts = self.profile.weights('ANALYTIC_WINDOW')
if 'UNBOUNDED_WINDOW' in designs \
and window_weigts[('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING)] == 0 \
and window_weigts[('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING)] == 0:
designs.remove('UNBOUNDED_WINDOW')
if not designs:
raise Exception('No options for creating a deterministic analytic function are'
' available')
available_funcs = list()
for func in ANALYTIC_FUNCS:
if 'TOP_LEVEL_QUERY_WITHOUT_LIMIT' in designs \
or 'DETERMINISTIC_ORDER_BY' in designs \
or 'ONLY_SELECT_ITEM' in designs \
or ('RANK_FUNC' in designs and func.name() == 'Rank') \
or ('NO_ORDER_BY' in designs and not func.REQUIRES_ORDER_BY) \
or ('UNBOUNDED_WINDOW' in designs and func.SUPPORTS_WINDOWING):
available_funcs.append(func)
if not available_funcs:
raise Exception('No analytic functions available')
# In general this may be a tree-like val expr that is guaranteed to contain at least
# one analytic function. The arguments to the function are already outlined but the
# analytic clause is still undetermined.
root_func = self._create_agg_or_analytic_tree(
return_type,
analytic_funcs=available_funcs,
agg_funcs=(AGG_FUNCS if allow_agg else []))
# The following designs are preferred because they place no restrictions on the
# analytic. From now on, if "designs" is non-empty this means that a design must
# be chosen.
if 'TOP_LEVEL_QUERY_WITHOUT_LIMIT' in designs \
or 'DETERMINISTIC_ORDER_BY' in designs \
or 'ONLY_SELECT_ITEM' in designs:
designs.clear()
else:
designs = list(designs)
# Set the analytic clauses, such as PARTITION BY, etc, for each analytic function.
analytic_funcs = list()
if root_func.is_analytic:
analytic_funcs.append(root_func)
for func in root_func.iter_exprs(lambda expr: expr.is_analytic):
analytic_funcs.append(func)
for func in analytic_funcs:
if designs:
func.design = choice([design for design in designs
if design != 'RANK' or func.name() != 'Rank'])
else:
func.design = 'NO_RESTRICTIONS'
use_window = (func.design == 'UNBOUNDED_WINDOW') \
or (func.design != 'NO_ORDER_BY'
and (func.SUPPORTS_WINDOWING and self.profile.use_window_in_analytic()))
if use_window:
window_range_or_rows, window_start_boundary_type, window_end_boundary_type \
= self.profile.choose_window_type()
if window_start_boundary_type in (PRECEDING, FOLLOWING):
start_val = Int(self.profile._choose_from_bounds('ANALYTIC_WINDOW_OFFSET'))
else:
start_val = None
if window_end_boundary_type in (PRECEDING, FOLLOWING):
end_val = Int(self.profile.get_window_offset())
if window_start_boundary_type == PRECEDING \
and window_end_boundary_type == PRECEDING:
if start_val.val < end_val.val:
start_val, end_val = end_val, start_val
elif start_val.val == end_val.val:
end_val.val -= 1
elif window_start_boundary_type == FOLLOWING \
and window_end_boundary_type == FOLLOWING:
if start_val.val > end_val.val:
start_val, end_val = end_val, start_val
elif start_val.val == end_val.val:
start_val.val -= 1
else:
end_val = None
func.window_clause = WindowClause(
window_range_or_rows,
WindowBoundary(window_start_boundary_type, start_val),
WindowBoundary(window_end_boundary_type, end_val) \
if window_end_boundary_type else None)
if self.profile.use_partition_by_clause_in_analytic():
func.partition_by_clause = PartitionByClause([])
if (func.design == 'DETERMINISTIC_ORDER') \
or use_window \
or func.REQUIRES_ORDER_BY \
or (func.design != 'NO_ORDER_BY' \
and self.profile.use_order_by_clause_in_analytic()):
func.order_by_clause = OrderByClause([])
if func.name() in ('Lead', 'Lag') and len(func.args) == 2:
func.args[1].val = self.profile.get_offset_for_analytic_lead_or_lag()
return root_func
def _group_signatures_by_return_type(self, signatures):
groups = defaultdict(list)
for signature in signatures:
groups[signature.return_type].append(signature)
return dict(groups)
def _populate_analytic_clauses(self, func, table_exprs, val_exprs):
for arg in func.args:
if arg.is_func:
self._populate_analytic_clauses(arg, table_exprs, val_exprs)
if func.is_analytic:
if func.design == 'DETERMINISTIC_ORDER':
unique_cols = shuffle(list(choice(table_exprs[0].unique_cols)))
else:
unique_cols = list()
if func.partition_by_clause:
if len(unique_cols) > 1:
func.partition_by_clause.val_exprs.append(unique_cols.pop())
elif len(val_exprs) > 1:
shuffle(val_exprs)
func.partition_by_clause.val_exprs.append(val_exprs[0])
else:
func.partition_by_clause = None
if func.order_by_clause:
if unique_cols:
order_by_exprs = unique_cols
elif val_exprs:
order_by_exprs = val_exprs[:2]
else:
cols = list(table_exprs.cols)
shuffle(cols)
order_by_exprs = cols[:2]
for order_by_expr in order_by_exprs:
func.order_by_clause.exprs_to_order.append(
(order_by_expr, choice([None, 'ASC', 'DESC', 'DESC'])))
def _create_analytic_partition_by_or_order_by_exprs(self,
table_exprs,
select_items,
required_exprs=None):
# TODO: Make more complicated exprs by combining the ones below. Ex: instead of
# returning [a, b, c] return [(a + b) * c, etc].
# TODO: The current implementation is more restrictive than it needs to be. I think
# we only need to know if a GROUP BY query is being generated, and in that case
# limit column exprs to those of the set of GROUP BY columns.
if select_items:
val_exprs = [choice(select_items).val_expr
for _ in xrange(self.profile.get_col_count_to_use_in_val_expr())]
else:
val_exprs = [self._create_val_expr(table_exprs)
for _ in xrange(self.profile.get_col_count_to_use_in_val_expr())]
if required_exprs:
used_cols = set()
for expr in val_exprs:
used_cols.update(expr.count_col_refs().keys())
remaining_cols = required_exprs - used_cols
if remaining_cols:
val_exprs.extend(remaining_cols)
return val_exprs
def _create_from_clause(self,
table_exprs,
table_alias_prefix,
required_table_expr_col_type):
from_clause = None
table_count = self.profile.get_table_count()
for idx in xrange(table_count):
if idx == 0:
candidate_table_exprs = TableExprList(table_exprs)
if required_table_expr_col_type:
candidate_table_exprs = \
candidate_table_exprs.by_col_type[required_table_expr_col_type]
if not candidate_table_exprs:
raise Exception('No tables have the required column type')
if table_count > 0 \
and required_table_expr_col_type not in JOINABLE_TYPES:
candidate_table_exprs = TableExprList(
table_expr for table_expr in candidate_table_exprs
if table_expr.joinable_cols_by_type)
if not candidate_table_exprs:
raise Exception('No tables have any joinable types')
table_expr = self._create_table_expr(candidate_table_exprs)
table_expr.alias = table_alias_prefix + str(idx + 1)
from_clause = FromClause(table_expr)
if not table_expr.joinable_cols:
# A CROSS JOIN is still possible but let's assume that isn't wanted.
break
else:
join_clause = self._create_join_clause(from_clause, table_exprs)
join_clause.table_expr.alias = table_alias_prefix + str(idx + 1)
from_clause.join_clauses.append(join_clause)
# Note: the HAVING clause creation assumes the only case when a table will not have
# an alias is the case below.
if len(from_clause.table_exprs) == 1 and not from_clause.table_expr.is_inline_view:
from_clause.table_expr.alias = None
return from_clause
def _create_table_expr(self, table_exprs, required_type=None):
if not table_exprs:
raise Exception('At least one table_expr is required')
if self.allow_more_nested_queries and self.profile.use_inline_view():
return self._create_inline_view(table_exprs, required_type=required_type)
if required_type:
candidiate_table_exprs = TableExprList()
for table_expr in table_exprs:
for col in table_expr.cols:
if issubclass(col.type, required_type):
candidiate_table_exprs.append(table_expr)
break
if not candidiate_table_exprs:
raise Exception('No tables have a column with the required data type')
table_exprs = candidiate_table_exprs
return deepcopy(self.profile.choose_table(table_exprs))
def _create_inline_view(self, table_exprs, required_type=None):
return InlineView(self.create_query(
table_exprs, required_select_item_type=required_type))
def _create_join_clause(self, from_clause, table_exprs):
join_type = self.profile.choose_join_type(JoinClause.JOINS_TYPES)
if join_type == 'CROSS':
table_expr = self._create_table_expr(table_exprs)
else:
available_join_expr_types = set(from_clause.table_exprs.joinable_cols_by_type) \
& set(table_exprs.col_types)
if not available_join_expr_types:
raise Exception('No tables have any colums eligible for joining')
join_expr_type = self.profile.choose_type(tuple(available_join_expr_types))
table_expr = self._create_table_expr(table_exprs, required_type=join_expr_type)
if join_type in ('LEFT ANTI', 'LEFT SEMI'):
table_expr.is_visible = False
join_clause = JoinClause(join_type, table_expr)
if join_type != 'CROSS':
join_table_expr_candidates = from_clause.table_exprs.by_col_type[join_expr_type]
if not join_table_expr_candidates:
raise Exception('table_expr has no common joinable columns')
related_table_expr = choice(join_table_expr_candidates)
join_clause.boolean_expr = self._create_relational_join_condition(
table_expr, related_table_expr)
return join_clause
def _create_relational_join_condition(self, left_table_expr, right_table_expr):
if not left_table_expr.joinable_cols_by_type:
raise Exception('All columns in table disallowed: %s' % left_table_expr)
if not right_table_expr.joinable_cols_by_type:
raise Exception('All columns in table disallowed: %s' % right_table_expr)
common_col_types = set(left_table_expr.joinable_cols_by_type) \
& set(right_table_expr.joinable_cols_by_type)
if not common_col_types:
raise Exception('Tables have no joinable columns in common')
predicates = list()
for _ in xrange(1 + self.profile.choose_nested_expr_count()):
col_type = choice(list(common_col_types))
left_col = choice(left_table_expr.joinable_cols_by_type[col_type])
right_col = choice(right_table_expr.joinable_cols_by_type[col_type])
# As of this writing Impala doesn't trim CHARs when comparing against VARCHAR. It's
# expected that the user will explicitly do this. Eventually an implicit trim should
# be done.
if issubclass(right_col.type, (String, VarChar)) and left_col.type == Char:
left_col = Trim.create_from_args(left_col)
elif left_col.type == Char and issubclass(right_col.type, (String, VarChar)):
right_col = Trim.create_from_args(right_col)
predicates.append(Equals.create_from_args(left_col, right_col))
while len(predicates) > 1:
predicates.append(And.create_from_args(predicates.pop(), predicates.pop()))
return predicates[0]
def _create_where_clause(self,
from_clause_table_exprs,
table_exprs,
table_alias_prefix):
predicate = self._create_func_tree(Boolean, allow_subquery=True)
predicate = self._populate_func_with_vals(
predicate,
table_exprs=from_clause_table_exprs,
table_alias_prefix=table_alias_prefix)
if predicate.contains_subquery and not from_clause_table_exprs[0].alias:
# TODO: Figure out if an alias is really needed.
from_clause_table_exprs[0].alias = 't1'
return WhereClause(predicate)
def _create_having_clause(self, table_exprs, basic_select_item_exprs):
predicate = self._create_agg_func_tree(Boolean)
predicate = self._populate_func_with_vals(
predicate, table_exprs=table_exprs, val_exprs=basic_select_item_exprs)
# https://issues.cloudera.org/browse/IMPALA-1423
# Make sure any cols used have a table identifier. As of this writing the only
# single table FROM clauses don't use table aliases. Setting a table alias
# automatically propagates as a column table identifier ("t1.col" instead of "col").
for arg in predicate.iter_exprs():
if isinstance(arg, ValExpr) and arg.is_col and not arg.owner.alias:
# TODO: Figure out if an alias is really needed.
arg.owner.alias = 't1'
return HavingClause(predicate)
def _enable_distinct_on_random_agg_items(self, agg_items):
'''Randomly choose an agg func and set it to use DISTINCT'''
# Impala has a limitation where 'DISTINCT' may only be applied to one agg
# expr. If an agg expr is used more than once, each usage may
# or may not include DISTINCT.
#
# Examples:
# OK: SELECT COUNT(DISTINCT a) + SUM(DISTINCT a) + MAX(a)...
# Not OK: SELECT COUNT(DISTINCT a) + COUNT(DISTINCT b)...
#
# Given a select list like:
# COUNT(a), SUM(a), MAX(b)
#
# We want to ouput one of:
# COUNT(DISTINCT a), SUM(DISTINCT a), AVG(b)
# COUNT(DISTINCT a), SUM(a), AVG(b)
# COUNT(a), SUM(a), AVG(DISTINCT b)
#
# This will be done by first grouping all agg funcs by their inner
# expr:
# {a: [COUNT(a), SUM(a)],
# b: [MAX(b)]}
#
# then choosing a random val (which is a list of aggs) in the above dict, and
# finally randomly adding DISTINCT to items in the list.
exprs_to_funcs = defaultdict(list)
for item in agg_items:
for expr, funcs in self._group_agg_funcs_by_expr(item.val_expr).iteritems():
exprs_to_funcs[expr].extend(funcs)
funcs = choice(exprs_to_funcs.values())
for func in funcs:
if self.profile.use_distinct_in_func():
func.distinct = True
def _group_agg_funcs_by_expr(self, val_expr):
'''Group exprs and return a dict mapping the expr to the agg items
it is used in.
Example: COUNT(a) * SUM(a) - MAX(b) + MIN(c) -> {a: [COUNT(a), SUM(a)],
b: [MAX(b)],
c: [MIN(c)]}
'''
exprs_to_funcs = defaultdict(list)
if val_expr.is_agg:
exprs_to_funcs[tuple(val_expr.args)].append(val_expr)
elif val_expr.is_func:
for arg in val_expr.args:
for expr, funcs in self._group_agg_funcs_by_expr(arg).iteritems():
exprs_to_funcs[expr].extend(funcs)
# else: The remaining case could happen if the original expr was something like
# "SUM(a) + b + 1" where b is a GROUP BY field.
return exprs_to_funcs