Files
impala/tests/query_test/test_exprs.py
Joe McDonnell 1908e44c3c IMPALA-4551: Limit the size of SQL statements
Various BI tools generate and run SQL. When used incorrectly or
misconfigured, the tools can generate extremely large SQLs.
Some of these SQL statements reach 10s of megabytes. Large SQL
statements impose costs throughout execution, including
statement rewrite logic in the frontend and codegen in the
backend. The resource usage of these statements can impact
the stability of the system or the ability to run other SQL
statements.

This implements two new query options that provide controls
to reject large SQL statements.
 - The first, MAX_STATEMENT_LENGTH_BYTES is a cap on the
   total size of the SQL statement (in bytes). It is
   applied before any parsing or analysis. It uses a
   default value of 16MB.
 - The second, STATEMENT_EXPRESSION_LIMIT, is a limit on
   the total number of expressions in a statement or any
   views that it references. The limit is applied upon the
   first round of analysis, but it is not reapplied when
   statement rewrite rules are applied. Certain expressions
   such as literals in IN lists or VALUES clauses are not
   analyzed and do not count towards the limit. It uses
   a default value of 250,000.
The two are complementary. Since enforcing the statement
expression limit requires parsing and analyzing the
statement, the MAX_STATEMENT_LENGTH_BYTES sets an upper
bound on the size of statement that needs to be parsed
and analyzed. Testing confirms that even statements
approaching 16MB get through the first round of analysis
within a few seconds and then are rejected.

This also changes the logging in tests/common/impala_connection.py
to limit the total SQL size that it will print to 128KB. This
prevents the JUnitXML (which includes this logging) from being too
large. Existing tests do not run SQL larger than about 80KB, so
this only applies to tests added in this change that run multi-MB
SQLs to verify limits.

Testing:
 - This adds frontend tests that verify the low level
   semantics about how expressions are counted and verifies
   that the expression limits are enforced.
 - This adds end-to-end tests that verify both the
   MAX_STATEMENT_LENGTH_BYTES and STATEMENT_EXPRESSION_LIMIT
   at their default values.
 - There is also an end-to-end test that runs in exhaustive
   mode that runs a SQL with close to 250,000 expressions.

Change-Id: I5675fb4a08c1dc51ae5bcf467cbb969cc064602c
Reviewed-on: http://gerrit.cloudera.org:8080/14012
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2019-08-14 04:36:31 +00:00

246 lines
10 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
import re
from random import randint
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import create_exec_option_dimension
from tests.common.test_dimensions import create_uncompressed_text_dimension
from tests.common.test_vector import ImpalaTestDimension
from tests.util.test_file_parser import QueryTestSectionReader
class TestExprs(ImpalaTestSuite):
    """Expression tests driven by the QueryTest/exprs and special-strings files.

    Every test runs once with 'enable_expr_rewrites' set to 0 and once set to 1.
    """

    @classmethod
    def get_workload(cls):
        """Return the name of the workload these tests run against."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Add the 'enable_expr_rewrites' dimension; restrict formats on 'core'."""
        super(TestExprs, cls).add_test_dimensions()
        # Test with and without expr rewrites to cover regular expr evaluations
        # as well as constant folding, in particular, timestamp literals.
        rewrite_dimension = ImpalaTestDimension('enable_expr_rewrites', 0, 1)
        cls.ImpalaTestMatrix.add_dimension(rewrite_dimension)
        if cls.exploration_strategy() != 'core':
            return
        # On 'core', test only with a file format that supports codegen
        # (uncompressed text).
        cls.ImpalaTestMatrix.add_constraint(
            lambda v: (v.get_value('table_format').file_format == 'text' and
                       v.get_value('table_format').compression_codec == 'none'))

    def test_exprs(self, vector):
        """Run QueryTest/exprs and check the current database afterwards.

        Avro is skipped; HBase and Kudu are marked as expected failures.
        """
        exec_options = vector.get_value('exec_option')
        exec_options['enable_expr_rewrites'] = vector.get_value('enable_expr_rewrites')

        table_format = vector.get_value('table_format')
        file_format = table_format.file_format
        # TODO: Enable some of these tests for Avro if possible
        if file_format == 'avro':
            # Don't attempt to evaluate timestamp expressions with Avro tables
            # (which don't support a timestamp type).
            pytest.skip()
        elif file_format == 'hbase':
            pytest.xfail(
                "A lot of queries check for NULLs, which hbase does not recognize")
        elif file_format == 'kudu':
            # Can't load LikeTbl without KUDU-1570.
            pytest.xfail(
                "Need support for Kudu tables with nullable PKs (KUDU-1570)")

        self.run_test_case('QueryTest/exprs', vector)

        # This will change the current database to matching table format and then
        # execute select current_database(). An error will be thrown if multiple
        # values are returned.
        expected_db = QueryTestSectionReader.get_db_name(table_format)
        current_db = self.execute_scalar('select current_database()', vector=vector)
        assert current_db == expected_db

    def test_special_strings(self, vector):
        """Test handling of expressions with "special" strings."""
        exec_options = vector.get_value('exec_option')
        exec_options['enable_expr_rewrites'] = vector.get_value('enable_expr_rewrites')
        self.run_test_case('QueryTest/special-strings', vector)
# Tests very deep expression trees and expressions with many children. Impala defines
# a 'safe' upper bound on the expr depth and the number of expr children in the
# FE Expr.java and any changes to those limits should be reflected in this test.
# The expr limits primarily guard against stack overflows or similar problems
# causing crashes. Therefore, this test succeeds if no Impalads crash.
class TestExprLimits(ImpalaTestSuite):
    """Tests for expression depth/children limits and statement size limits."""

    # Keep these in sync with Expr.java
    EXPR_CHILDREN_LIMIT = 10000
    EXPR_DEPTH_LIMIT = 1000

    # Default statement limits that the tests below expect the server to enforce
    # (16MB statement length, 250,000 expressions per statement).
    DEFAULT_MAX_STATEMENT_LENGTH_BYTES = 16 * 1024 * 1024
    DEFAULT_STATEMENT_EXPRESSION_LIMIT = 250000

    @classmethod
    def get_workload(cls):
        """Return the name of the workload these tests run against."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Restrict the matrix to uncompressed text, with codegen both on and off."""
        super(TestExprLimits, cls).add_test_dimensions()
        if cls.exploration_strategy() != 'exhaustive':
            # Ensure the test runs with codegen enabled and disabled, even when the
            # exploration strategy is not exhaustive.
            cls.ImpalaTestMatrix.clear_dimension('exec_option')
            cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
                cluster_sizes=[0], disable_codegen_options=[False, True],
                batch_sizes=[0]))

        # There is no reason to run these tests using all dimensions.
        cls.ImpalaTestMatrix.add_dimension(
            create_uncompressed_text_dimension(cls.get_workload()))

    def test_expr_child_limit(self, vector):
        """Run an IN predicate and a CASE expr sized from EXPR_CHILDREN_LIMIT."""
        # IN predicate with EXPR_CHILDREN_LIMIT - 1 comma-separated literals.
        in_values = ",".join(str(i) for i in range(self.EXPR_CHILDREN_LIMIT - 1))
        self.__exec_query("select 1 IN(" + in_values + ")")

        # CASE expr. Floor division keeps the repeat count an int on Python 3 too.
        num_when_clauses = self.EXPR_CHILDREN_LIMIT // 2
        case_query = "select case " + " when true then 1" * num_when_clauses + " end"
        self.__exec_query(case_query)

    def test_expr_depth_limit(self, vector):
        """Run deeply nested predicates, arithmetic, function calls and casts."""
        # Compound predicates
        and_query = "select " + self.__gen_deep_infix_expr("true", " and false")
        self.__exec_query(and_query)
        or_query = "select " + self.__gen_deep_infix_expr("true", " or false")
        self.__exec_query(or_query)

        # Arithmetic expr
        arith_query = "select " + self.__gen_deep_infix_expr("1", " + 1")
        self.__exec_query(arith_query)

        func_query = "select " + self.__gen_deep_func_expr("lower(", "'abc'", ")")
        self.__exec_query(func_query)

        # Casts.
        cast_query = "select " + self.__gen_deep_func_expr("cast(", "1", " as int)")
        self.__exec_query(cast_query)

    def test_under_statement_expression_limit(self):
        """Generate a huge case statement that barely fits within the statement
        expression limit and verify that it runs."""
        # This takes 20+ minutes, so only run it on exhaustive.
        # TODO: Determine whether this needs to run serially. It uses >5 GB of memory.
        if self.exploration_strategy() != 'exhaustive':
            pytest.skip("Only test limit of codegen on exhaustive")
        case = self.__gen_huge_case("int_col", 32, 2, " ")
        query = "select {0} as huge_case from functional_parquet.alltypes".format(case)
        self.__exec_query(query)

    def test_max_statement_size(self):
        """Generate a huge case statement that exceeds the default 16MB limit and
        verify that it gets rejected."""
        # Raw strings: the pattern contains regex escapes for literal parentheses.
        expected_err_tmpl = (r"Statement length of {0} bytes exceeds the maximum "
                             r"statement length \({1} bytes\)")
        max_length = self.DEFAULT_MAX_STATEMENT_LENGTH_BYTES

        # Case 1: a valid SQL that would parse correctly
        case = self.__gen_huge_case("int_col", 75, 2, " ")
        query = "select {0} as huge_case from functional.alltypes".format(case)
        err = self.execute_query_expect_failure(self.client, query)
        assert re.search(expected_err_tmpl.format(len(query), max_length), str(err))

        # Case 2: a string of 'a' characters that does not parse. This will still
        # fail with the same message, because the check is before parsing.
        invalid_sql = 'a' * (max_length + 1)
        err = self.execute_query_expect_failure(self.client, invalid_sql)
        assert re.search(
            expected_err_tmpl.format(len(invalid_sql), max_length), str(err))

    def test_statement_expression_limit(self):
        """Generate a huge case statement that barely fits within the 16MB limit but
        exceeds the statement expression limit. Verify that it fails."""
        case = self.__gen_huge_case("int_col", 66, 2, " ")
        query = "select {0} as huge_case from functional.alltypes".format(case)
        # The statement must get past the length check to reach the expression check.
        assert len(query) < self.DEFAULT_MAX_STATEMENT_LENGTH_BYTES
        # Raw strings: '\(' and '\n' are interpreted by the regex engine.
        expected_err_re = (r"Exceeded the statement expression limit \({0}\)\n"
                           r"Statement has .* expressions.").format(
                               self.DEFAULT_STATEMENT_EXPRESSION_LIMIT)
        err = self.execute_query_expect_failure(self.client, query)
        assert re.search(expected_err_re, str(err))

    def __gen_huge_case(self, col_name, fanout, depth, indent):
        """Return the text of a nested CASE expression over 'col_name'.

        Each level has 'fanout' WHEN branches. While 'depth' is above zero every
        THEN value is itself a parenthesized CASE one level shallower; at depth
        zero the THEN value is the branch index. 'indent' is the whitespace put in
        front of each WHEN/END line and grows by one space per nesting level.
        The WHEN conditions use random constants, so the output is not repeatable.
        """
        toks = ["case\n"]
        for i in range(fanout):
            add = randint(1, 1000000)
            divisor = randint(1, 10000000)
            mod = randint(0, divisor)
            # Generate a mathematical expr that can't be easily optimised out.
            when_expr = "{0} + {1} % {2} = {3}".format(col_name, add, divisor, mod)
            if depth == 0:
                then_expr = str(i)
            else:
                then_expr = "({0})".format(
                    self.__gen_huge_case(col_name, fanout, depth - 1, indent + " "))
            toks.append(indent)
            toks.append("when {0} then {1}\n".format(when_expr, then_expr))
        toks.append(indent)
        toks.append("end")
        return ''.join(toks)

    def __gen_deep_infix_expr(self, prefix, repeat_suffix):
        """Return 'prefix' followed by EXPR_DEPTH_LIMIT - 1 copies of 'repeat_suffix'."""
        return prefix + repeat_suffix * (self.EXPR_DEPTH_LIMIT - 1)

    def __gen_deep_func_expr(self, open_func, base_arg, close_func):
        """Return 'base_arg' wrapped in EXPR_DEPTH_LIMIT - 1 nested calls.

        'open_func' and 'close_func' are the text before and after each argument.
        """
        nesting = self.EXPR_DEPTH_LIMIT - 1
        return open_func * nesting + base_arg + close_func * nesting

    def __exec_query(self, sql_str):
        """Execute 'sql_str' and fail the test if it raises or reports failure."""
        try:
            impala_ret = self.execute_query(sql_str)
        except Exception as e:  # consider any exception a failure
            pytest.fail("Failed to execute query %s: %s" % (sql_str, e))
        # Checked outside the try block so that this assertion is not caught by the
        # handler above and re-reported with the (possibly huge) SQL text repeated.
        assert impala_ret.success, "Failed to execute query %s" % (sql_str)
class TestUtcTimestampFunctions(ImpalaTestSuite):
    """Tests for UTC timestamp functions, i.e. functions whose behavior does not
    depend on the flag --use_local_tz_for_unix_timestamp_conversions. Any test added
    here should also be run by the custom cluster test test_local_tz_conversion.py
    so that it is checked with the conversion flag set to true as well."""

    @classmethod
    def get_workload(cls):
        """Return the name of the workload these tests run against."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Add the 'enable_expr_rewrites' dimension; restrict formats on 'core'."""
        super(TestUtcTimestampFunctions, cls).add_test_dimensions()
        # Test with and without expr rewrites to cover regular expr evaluations
        # as well as constant folding, in particular, timestamp literals.
        rewrite_dimension = ImpalaTestDimension('enable_expr_rewrites', 0, 1)
        cls.ImpalaTestMatrix.add_dimension(rewrite_dimension)
        if cls.exploration_strategy() != 'core':
            return
        # On 'core', test only with a file format that supports codegen
        # (uncompressed text).
        cls.ImpalaTestMatrix.add_constraint(
            lambda v: (v.get_value('table_format').file_format == 'text' and
                       v.get_value('table_format').compression_codec == 'none'))

    def test_utc_functions(self, vector):
        """Run QueryTest/utc-timestamp-functions with the vector's rewrite setting."""
        exec_options = vector.get_value('exec_option')
        exec_options['enable_expr_rewrites'] = vector.get_value('enable_expr_rewrites')
        self.run_test_case('QueryTest/utc-timestamp-functions', vector)