mirror of
https://github.com/apache/impala.git
synced 2025-12-30 03:01:44 -05:00
- Generate INSERT statements that are either INSERT ... VALUES or INSERT ... SELECT - On both types of INSERTs, we either insert into all columns, or into some column list. If the column list exists, all primary keys will be present, and 0 or more additional columns will also be in the list. The ordering of the column list is random. - For INSERT ... SELECT, occasionally generate a WITH clause - For INSERT ... VALUES, generate non-null constants for the primary keys, but for the non-primary keys, randomly generate a value expression. The type system in the random statement/query generator isn't sophisticated enough to know the implicit type of a SELECT item or a value expression. It knows it will be some INT-based type, but not if it's going to be a SMALLINT or a BIGINT. To get around this, the easiest thing seems to be to explicitly cast the SELECT items or value expressions to the columns' so-called exact_type attribute. Much of the testing here involved running discrepancy_searcher.py --explain-only on both tpch_kudu and a random HDFS table, using both the default profile and DML-only profile. This was done to quickly find bugs in the statement generation, as they tend to bubble up as analysis errors. I expect to make other changes in follow-on patches as more random statements find small test issues. For actual use against Kudu data, you need to migrate data from Kudu into PostgreSQL 5 (instructions in tests/comparison/POSTGRES.txt) and run something like: tests/comparison/discrepancy_searcher.py \ --use-postgresql \ --postgresql-port 5433 \ --profile dmlonly \ --timeout 300 \ --db-name tpch_kudu \ --query-count 10 Change-Id: I842b41f0eed07ab30ec76d8fc3cdd5affb525af6 Reviewed-on: http://gerrit.cloudera.org:8080/5486 Reviewed-by: Jim Apple <jbapple-impala@apache.org> Tested-by: Impala Public Jenkins
158 lines
6.7 KiB
Python
158 lines
6.7 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from copy import deepcopy
|
|
|
|
from tests.comparison.common import Table
|
|
from tests.comparison.funcs import CastFunc
|
|
from tests.comparison.query import (
|
|
InsertClause,
|
|
InsertStatement,
|
|
Query,
|
|
StatementExecutionMode,
|
|
ValuesClause,
|
|
ValuesRow)
|
|
from tests.comparison.query_generator import QueryGenerator
|
|
|
|
|
|
class InsertStatementGenerator(object):
  """
  Generator of random INSERT statements.

  Each call to generate_statement() produces an InsertStatement whose source is either
  a SELECT query or a VALUES clause, as chosen by the profile handed to the constructor.
  """

  def __init__(self, profile):
    # QueryProfile-like object
    self.profile = profile
    # used to generate SELECT queries for INSERT ... SELECT statements;
    # to ensure state is completely reset, this is created anew with each call to
    # generate_statement()
    self.select_stmt_generator = None

  def generate_statement(self, tables, dml_table):
    """
    Return a randomly generated INSERT statement.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT will be into this table.

    An Exception is raised if tables is not a non-empty list of Table objects, if
    dml_table is not a Table, or if the profile chooses an INSERT source clause that is
    neither a Query nor a ValuesClause.
    """
    if not (isinstance(tables, list) and len(tables) > 0 and
            all((isinstance(t, Table) for t in tables))):
      raise Exception('tables must be a not-empty list of Table objects')

    if not isinstance(dml_table, Table):
      raise Exception('dml_table must be a Table')

    self.select_stmt_generator = QueryGenerator(self.profile)

    # Tables with primary keys get the IGNORE conflict action; all others get the
    # default one.
    if dml_table.primary_keys:
      insert_statement = InsertStatement(
          conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE)
    else:
      insert_statement = InsertStatement(
          conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT)

    insert_statement.execution = StatementExecutionMode.DML_TEST

    # Choose whether this is a
    # INSERT INTO table SELECT/VALUES
    # or
    # INSERT INTO table (col1, col2, ...) SELECT/VALUES
    # If the method returns None, it's the former.
    insert_column_list = self.profile.choose_insert_column_list(dml_table)
    insert_statement.insert_clause = InsertClause(
        dml_table, column_list=insert_column_list)
    # We still need to internally track the columns we're inserting. Keep in mind None
    # means "all" without an explicit column list. Since we've already created the
    # InsertClause object though, we can fill this in for ourselves.
    if insert_column_list is None:
      insert_column_list = dml_table.cols
    insert_item_data_types = [col.type for col in insert_column_list]

    # Decide whether this is INSERT VALUES or INSERT SELECT
    insert_source_clause = self.profile.choose_insert_source_clause()

    if issubclass(insert_source_clause, Query):
      # Use QueryGenerator()'s public interface to generate the SELECT.
      select_query = self.select_stmt_generator.generate_statement(
          tables, select_item_data_types=insert_item_data_types)
      # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
      # generator's type system is nowhere near sophisticated enough to know how random
      # expressions will be implicitly cast in the databases. This requires less work
      # to implement. IMPALA-4693 considers alternative approaches.
      self._cast_select_items(select_query, insert_column_list)
      # The WITH clause is moved from the SELECT onto the INSERT statement itself: the
      # statement gets a copy and the query's own reference is cleared.
      insert_statement.with_clause = deepcopy(select_query.with_clause)
      select_query.with_clause = None
      insert_statement.select_query = select_query
    elif issubclass(insert_source_clause, ValuesClause):
      insert_statement.values_clause = self._generate_values_clause(insert_column_list)
    else:
      raise Exception('unsupported INSERT source clause: {0}'.format(
          insert_source_clause))
    return insert_statement

  def _generate_values_clause(self, columns):
    """
    Return a VALUES clause containing a variable number of rows.

    columns is the list of columns being inserted into; every row gets one value per
    column, in the same order.

    The values corresponding to primary keys will be non-null constants. Any other
    columns could be null, constants, or function trees that may or may not evaluate to
    null.
    """
    values_rows = []
    # range() instead of xrange(): xrange() does not exist in Python 3, while range()
    # is available in both Python 2 and Python 3.
    for _ in range(self.profile.choose_insert_values_row_count()):
      values_row = []
      for col in columns:
        if col.is_primary_key:
          val = self.profile.choose_constant(return_type=col.exact_type, allow_null=False)
        elif 'constant' == self.profile.choose_values_item_expr():
          val = self.profile.choose_constant(return_type=col.exact_type, allow_null=True)
        else:
          func_tree = self.select_stmt_generator.create_func_tree(
              col.type, allow_subquery=False)
          val = self.select_stmt_generator.populate_func_with_vals(func_tree)
          # Only the generic type, not the exact type, of the value will be known. To
          # avoid a lot of failed queries due to precision errors, we cast the val to
          # the exact type of the column. This will still not prevent "out of range"
          # conditions, as we don't try to evaluate the random expressions.
          val = CastFunc(val, col.exact_type)
        values_row.append(val)
      values_rows.append(ValuesRow(values_row))
    return ValuesClause(values_rows)

  def _cast_select_items(self, select_query, column_list):
    """
    For a given Query select_query and a column_list (list of Columns), cast each select
    item in select_query to the exact type of the column.

    The select items are modified in place: the Nth item's val_expr is wrapped in a
    CastFunc to the exact_type of the Nth column in column_list.

    A Query may have a UNION, recursively do this down the line.
    """
    for col_idx, select_item in enumerate(select_query.select_clause.items):
      cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type)
      select_item.val_expr = cast_val_expr
    if select_query.union_clause:
      self._cast_select_items(select_query.union_clause.query, column_list)
|
|
|
|
|
|
def get_generator(statement_type):
  """
  Look up the statement generator class that corresponds to statement_type.

  statement_type is a statement class: InsertStatement maps to
  InsertStatementGenerator and Query maps to QueryGenerator. Any other hashable value
  raises KeyError.
  """
  generators_by_statement_type = dict([
      (InsertStatement, InsertStatementGenerator),
      (Query, QueryGenerator),
  ])
  return generators_by_statement_type[statement_type]
|