Files
impala/tests/comparison/statement_generator.py
Michael Brown db7facdee0 IMPALA-4351,IMPALA-4353: [qgen] randomly generate INSERT statements
- Generate INSERT statements that are either INSERT ... VALUES or INSERT
  ... SELECT

- On both types of INSERTs, we either insert into all columns, or into
  some column list. If the column list exists, all primary keys will be
  present, and 0 or more additional columns will also be in the list.
  The ordering of the column list is random.

- For INSERT ... SELECT, occasionally generate a WITH clause

- For INSERT ... VALUES, generate non-null constants for the primary
  keys, but for the non-primary keys, randomly generate a value
  expression.

The type system in the random statement/query generator isn't
sophisticated enough to know the implicit type of a SELECT item or a value
expression. It knows it will be some INT-based type, but not if it's
going to be a SMALLINT or a BIGINT. To get around this, the easiest
thing seems to be to explicitly cast the SELECT items or value
expressions to the columns' so-called exact_type attribute.

Much of the testing here involved running discrepancy_searcher.py
--explain-only on both tpch_kudu and a random HDFS table, using both the
default profile and DML-only profile. This was done to quickly find bugs
in the statement generation, as they tend to bubble up as analysis
errors. I expect to make other changes as follow on patches and more
random statements find small test issues.

For actual use against Kudu data, you need to migrate data from Kudu
into PostgreSQL 5 (instructions in tests/comparison/POSTGRES.txt) and run
something like:

tests/comparison/discrepancy_searcher.py \
  --use-postgresql \
  --postgresql-port 5433 \
  --profile dmlonly \
  --timeout 300 \
  --db-name tpch_kudu \
  --query-count 10

Change-Id: I842b41f0eed07ab30ec76d8fc3cdd5affb525af6
Reviewed-on: http://gerrit.cloudera.org:8080/5486
Reviewed-by: Jim Apple <jbapple-impala@apache.org>
Tested-by: Impala Public Jenkins
2017-01-13 01:31:47 +00:00

158 lines
6.7 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from copy import deepcopy
from tests.comparison.common import Table
from tests.comparison.funcs import CastFunc
from tests.comparison.query import (
InsertClause,
InsertStatement,
Query,
StatementExecutionMode,
ValuesClause,
ValuesRow)
from tests.comparison.query_generator import QueryGenerator
class InsertStatementGenerator(object):
  """
  Generator of random INSERT statements (INSERT ... VALUES or INSERT ... SELECT).

  Every random decision (column list, source clause, row count, constants) is delegated
  to the profile object supplied at construction time. SELECT queries and function
  trees are produced by a QueryGenerator built from that same profile.
  """

  def __init__(self, profile):
    # QueryProfile-like object
    self.profile = profile
    # used to generate SELECT queries for INSERT ... SELECT statements;
    # to ensure state is completely reset, this is created anew with each call to
    # generate_statement()
    self.select_stmt_generator = None

  def generate_statement(self, tables, dml_table):
    """
    Return a randomly generated INSERT statement.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT will be into this table.

    Raises an Exception if tables is not a non-empty list of Table objects, if dml_table
    is not a Table, or if the profile chooses an unsupported INSERT source clause.
    """
    if not (isinstance(tables, list) and len(tables) > 0 and
            all((isinstance(t, Table) for t in tables))):
      raise Exception('tables must be a not-empty list of Table objects')
    if not isinstance(dml_table, Table):
      raise Exception('dml_table must be a Table')

    # A fresh QueryGenerator per statement guarantees no state leaks between calls.
    self.select_stmt_generator = QueryGenerator(self.profile)

    # Tables with primary keys get the "ignore" conflict action; all others get the
    # default one.
    # NOTE(review): presumably this lets randomly generated duplicate keys be skipped
    # rather than fail the statement -- confirm against InsertStatement.
    if dml_table.primary_keys:
      conflict_action = InsertStatement.CONFLICT_ACTION_IGNORE
    else:
      conflict_action = InsertStatement.CONFLICT_ACTION_DEFAULT
    insert_statement = InsertStatement(conflict_action=conflict_action)
    insert_statement.execution = StatementExecutionMode.DML_TEST

    # Choose whether this is a
    #   INSERT INTO table SELECT/VALUES
    # or
    #   INSERT INTO table (col1, col2, ...) SELECT/VALUES
    # If the method returns None, it's the former.
    insert_column_list = self.profile.choose_insert_column_list(dml_table)
    insert_statement.insert_clause = InsertClause(
        dml_table, column_list=insert_column_list)
    # We still need to internally track the columns we're inserting. Keep in mind None
    # means "all" without an explicit column list. Since we've already created the
    # InsertClause object though, we can fill this in for ourselves.
    if insert_column_list is None:
      insert_column_list = dml_table.cols
    insert_item_data_types = [col.type for col in insert_column_list]

    # Decide whether this is INSERT VALUES or INSERT SELECT. The profile hands back a
    # class (not an instance), hence the issubclass() checks.
    insert_source_clause = self.profile.choose_insert_source_clause()
    if issubclass(insert_source_clause, Query):
      # Use QueryGenerator()'s public interface to generate the SELECT.
      select_query = self.select_stmt_generator.generate_statement(
          tables, select_item_data_types=insert_item_data_types)
      # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
      # generator's type system is nowhere near sophisticated enough to know how random
      # expressions will be implicitly cast in the databases. This requires less work
      # to implement. IMPALA-4693 considers alternative approaches.
      self._cast_select_items(select_query, insert_column_list)
      # Move any WITH clause from the generated SELECT up to the INSERT statement.
      insert_statement.with_clause = deepcopy(select_query.with_clause)
      select_query.with_clause = None
      insert_statement.select_query = select_query
    elif issubclass(insert_source_clause, ValuesClause):
      insert_statement.values_clause = self._generate_values_clause(insert_column_list)
    else:
      raise Exception('unsupported INSERT source clause: {0}'.format(
          insert_source_clause))
    return insert_statement

  def _generate_values_item(self, col):
    """
    Return one value expression to insert into Column col.

    Primary key columns get a non-null constant. For any other column the profile
    decides between a (possibly null) constant and a random function tree, the latter
    being cast to the column's exact type.
    """
    if col.is_primary_key:
      return self.profile.choose_constant(return_type=col.exact_type, allow_null=False)
    if 'constant' == self.profile.choose_values_item_expr():
      return self.profile.choose_constant(return_type=col.exact_type, allow_null=True)
    func_tree = self.select_stmt_generator.create_func_tree(
        col.type, allow_subquery=False)
    val = self.select_stmt_generator.populate_func_with_vals(func_tree)
    # Only the generic type, not the exact type, of the value will be known. To
    # avoid a lot of failed queries due to precision errors, we cast the val to
    # the exact type of the column. This will still not prevent "out of range"
    # conditions, as we don't try to evaluate the random expressions.
    return CastFunc(val, col.exact_type)

  def _generate_values_clause(self, columns):
    """
    Return a VALUES clause containing a variable number of rows.

    The values corresponding to primary keys will be non-null constants. Any other
    columns could be null, constants, or function trees that may or may not evaluate to
    null.

    columns is the list of Columns being inserted into; each generated row holds one
    value per column, in the same order.
    """
    row_count = self.profile.choose_insert_values_row_count()
    values_rows = []
    # range() rather than the Python-2-only xrange(): the row count is small, and
    # range() exists on both Python 2 and Python 3.
    for _ in range(row_count):
      values_rows.append(
          ValuesRow([self._generate_values_item(col) for col in columns]))
    return ValuesClause(values_rows)

  def _cast_select_items(self, select_query, column_list):
    """
    For a given Query select_query and a column_list (list of Columns), cast each select
    item in select_query to the exact type of the column.

    Select items are matched to columns by position, and are modified in place.

    A Query may have a UNION, recursively do this down the line.
    """
    for col_idx, select_item in enumerate(select_query.select_clause.items):
      select_item.val_expr = CastFunc(
          select_item.val_expr, column_list[col_idx].exact_type)
    if select_query.union_clause:
      self._cast_select_items(select_query.union_clause.query, column_list)
def get_generator(statement_type):
  """
  Look up and return the generator class that produces statements of statement_type.

  statement_type is a statement class: InsertStatement maps to
  InsertStatementGenerator and Query maps to QueryGenerator. The class itself is
  returned, not an instance. Any other statement_type raises KeyError.
  """
  generators_by_statement_type = {
      InsertStatement: InsertStatementGenerator,
      Query: QueryGenerator,
  }
  generator_class = generators_by_statement_type[statement_type]
  return generator_class