Files
impala/tests/comparison/statement_generator.py
Michael Brown 0154ace61f IMPALA-4359: qgen: add UPSERT support
UPSERTs are very similar to INSERTs, so the UPSERT support is simply
folded into that of INSERT. We do this by adding another "conflict
action", CONFLICT_ACTION_UPDATE. The object responsible for holding the
conflict_action attribute is now the InsertClause. This is needed here
because the SqlWriter now needs to know the conflict_action both when
writing the InsertClause (Impala) and at the tail end of the
InsertStatement (PostgreSQL). We also add a few properties to the
InsertStatement interface so that the PostgresqlSqlWriter can form the
correct "DO UPDATE" conflict action, in which primary key columns and
updatable columns must be known. More information on that here:

https://www.postgresql.org/docs/9.5/static/sql-insert.html

By default, we will tend to generate 3 UPSERTs for every 1 INSERT.

In addition to adding unit tests to make sure UPSERTs are properly
written, I used discrepancy_searcher.py --profile dmlonly, both with and
without --explain-only, to run tests. I made sure we were generating
syntactically valid UPSERT statements, and that the INSERT/UPSERT ratio
was roughly 1/3 after 100 statements.

Change-Id: I6382f6ab22ba29c117e39a5d90592d3637df4b25
Reviewed-on: http://gerrit.cloudera.org:8080/5795
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Impala Public Jenkins
2017-02-04 06:13:34 +00:00

159 lines
7.0 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from copy import deepcopy
from tests.comparison.common import Table
from tests.comparison.funcs import CastFunc
from tests.comparison.query import (
InsertClause,
InsertStatement,
Query,
StatementExecutionMode,
ValuesClause,
ValuesRow)
from tests.comparison.query_generator import QueryGenerator
class InsertStatementGenerator(object):
  """
  Generator of random INSERT and UPSERT statements.

  Every random decision is delegated to the profile given to the constructor. The
  methods of this class call these profile methods: choose_insert_column_list(),
  choose_insert_vs_upsert(), choose_insert_source_clause(),
  choose_insert_values_row_count(), choose_values_item_expr() and choose_constant().
  """

  def __init__(self, profile):
    """
    Store the profile. Nothing is generated until generate_statement() is called.
    """
    # QueryProfile-like object
    self.profile = profile
    # used to generate SELECT queries for INSERT/UPSERT ... SELECT statements;
    # to ensure state is completely reset, this is created anew with each call to
    # generate_statement()
    self.select_stmt_generator = None

  def generate_statement(self, tables, dml_table):
    """
    Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very
    similar to INSERTs, which is why this generator handles both.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT/UPSERT will be into this table.

    The returned object is an InsertStatement whose insert_clause is always set, and
    which carries either a select_query (plus the WITH clause lifted out of it) or a
    values_clause.

    Raises Exception if tables is not a non-empty list of Table objects, if dml_table is
    not a Table, or if the profile picks a source clause that is neither a Query nor a
    ValuesClause subclass.
    """
    if not (isinstance(tables, list) and len(tables) > 0 and
            all((isinstance(t, Table) for t in tables))):
      raise Exception('tables must be a not-empty list of Table objects')

    if not isinstance(dml_table, Table):
      raise Exception('dml_table must be a Table')

    # A fresh QueryGenerator per statement, so nothing carries over between calls.
    self.select_stmt_generator = QueryGenerator(self.profile)

    insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST)

    # Choose whether this is a
    #   INSERT/UPSERT INTO table SELECT/VALUES
    # or
    #   INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES
    # If the method returns None, it's the former.
    insert_column_list = self.profile.choose_insert_column_list(dml_table)

    if dml_table.primary_keys:
      # Having primary keys implies the table is a Kudu table, which makes it subject to
      # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs.
      conflict_action = self.profile.choose_insert_vs_upsert()
    else:
      conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT

    insert_statement.insert_clause = InsertClause(
        dml_table, column_list=insert_column_list, conflict_action=conflict_action)

    # We still need to internally track the columns we're inserting. Keep in mind None
    # means "all" without an explicit column list. Since we've already created the
    # InsertClause object though, we can fill this in for ourselves.
    if insert_column_list is None:
      insert_column_list = dml_table.cols
    insert_item_data_types = [col.type for col in insert_column_list]

    # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT
    insert_source_clause = self.profile.choose_insert_source_clause()

    if issubclass(insert_source_clause, Query):
      # Use QueryGenerator()'s public interface to generate the SELECT.
      select_query = self.select_stmt_generator.generate_statement(
          tables, select_item_data_types=insert_item_data_types)
      # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
      # generator's type system is not near sophisticated enough to know how random
      # expressions will be implicitly casted in the databases. This requires less work
      # to implement. IMPALA-4693 considers alternative approaches.
      self._cast_select_items(select_query, insert_column_list)
      # The WITH clause is moved from the SELECT onto the statement itself: copy it
      # over, then clear it on the query.
      insert_statement.with_clause = deepcopy(select_query.with_clause)
      select_query.with_clause = None
      insert_statement.select_query = select_query
    elif issubclass(insert_source_clause, ValuesClause):
      insert_statement.values_clause = self._generate_values_clause(insert_column_list)
    else:
      raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format(
          insert_source_clause))
    return insert_statement

  def _generate_values_clause(self, columns):
    """
    Return a VALUES clause containing a variable number of rows.

    The values corresponding to primary keys will be non-null constants. Any other
    columns could be null, constants, or function trees that may or may not evaluate to
    null.

    columns is the list of columns being inserted into; each row holds one value per
    column, in the same order. The number of rows comes from
    profile.choose_insert_values_row_count().

    self.select_stmt_generator must already be set (generate_statement() does this),
    because non-constant values are built with it.
    """
    values_rows = []
    # range() rather than the Python-2-only xrange(), so this also runs on Python 3.
    # On Python 2 this merely builds a short-lived list of row indexes.
    for _ in range(self.profile.choose_insert_values_row_count()):
      values_row = []
      for col in columns:
        if col.is_primary_key:
          val = self.profile.choose_constant(
              return_type=col.exact_type, allow_null=False)
        elif 'constant' == self.profile.choose_values_item_expr():
          val = self.profile.choose_constant(
              return_type=col.exact_type, allow_null=True)
        else:
          func_tree = self.select_stmt_generator.create_func_tree(
              col.type, allow_subquery=False)
          val = self.select_stmt_generator.populate_func_with_vals(func_tree)
          # Only the generic type, not the exact type, of the value will be known. To
          # avoid a lot of failed queries due to precision errors, we cast the val to
          # the exact type of the column. This will still not prevent "out of range"
          # conditions, as we don't try to evaluate the random expressions.
          val = CastFunc(val, col.exact_type)
        values_row.append(val)
      values_rows.append(ValuesRow(values_row))
    return ValuesClause(values_rows)

  def _cast_select_items(self, select_query, column_list):
    """
    For a given Query select_query and a column_list (list of Columns), cast each select
    item in select_query to the exact type of the column.

    Select items are matched to columns by position, so column_list must have at least
    as many entries as select_query has select items. select_query is modified in place
    and nothing is returned.

    A Query may have a UNION, recursively do this down the line.
    """
    for col_idx, select_item in enumerate(select_query.select_clause.items):
      cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type)
      select_item.val_expr = cast_val_expr
    if select_query.union_clause:
      self._cast_select_items(select_query.union_clause.query, column_list)
def get_generator(statement_type):
  """
  Look up the generator class registered for statement_type.

  InsertStatement maps to InsertStatementGenerator and Query maps to QueryGenerator.
  The class itself is returned, not an instance. A statement_type that is not one of
  those keys raises KeyError.
  """
  generator_classes = dict([
      (InsertStatement, InsertStatementGenerator),
      (Query, QueryGenerator),
  ])
  return generator_classes[statement_type]