diff --git a/be/src/service/impala-server.cc b/be/src/service/impala-server.cc index d0966ba5a..9c4c73216 100644 --- a/be/src/service/impala-server.cc +++ b/be/src/service/impala-server.cc @@ -1106,6 +1106,14 @@ Status ImpalaServer::ExecuteInternal( } #endif + size_t statement_length = query_ctx.client_request.stmt.length(); + int32_t max_statement_length = + query_ctx.client_request.query_options.max_statement_length_bytes; + if (max_statement_length > 0 && statement_length > max_statement_length) { + return Status(ErrorMsg(TErrorCode::MAX_STATEMENT_LENGTH_EXCEEDED, + statement_length, max_statement_length)); + } + RETURN_IF_ERROR((*request_state)->UpdateQueryStatus( exec_env_->frontend()->GetExecRequest(query_ctx, &result))); diff --git a/be/src/service/query-options-test.cc b/be/src/service/query-options-test.cc index 79f50cc9c..715422dd2 100644 --- a/be/src/service/query-options-test.cc +++ b/be/src/service/query-options-test.cc @@ -161,7 +161,10 @@ TEST(QueryOptions, SetByteOptions) { {8 * 1024, RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}}, {MAKE_OPTIONDEF(runtime_bloom_filter_size), {RuntimeFilterBank::MIN_BLOOM_FILTER_SIZE, - RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}}}; + RuntimeFilterBank::MAX_BLOOM_FILTER_SIZE}}, + {MAKE_OPTIONDEF(max_statement_length_bytes), + {MIN_MAX_STATEMENT_LENGTH_BYTES, I32_MAX}}, + }; TestByteCaseSet(options, case_set_i64); TestByteCaseSet(options, case_set_i32); } @@ -239,6 +242,8 @@ TEST(QueryOptions, SetIntOptions) { {MAKE_OPTIONDEF(exec_time_limit_s), {0, I32_MAX}}, {MAKE_OPTIONDEF(thread_reservation_limit), {-1, I32_MAX}}, {MAKE_OPTIONDEF(thread_reservation_aggregate_limit), {-1, I32_MAX}}, + {MAKE_OPTIONDEF(statement_expression_limit), + {MIN_STATEMENT_EXPRESSION_LIMIT, I32_MAX}}, }; for (const auto& test_case : case_set) { const OptionDef& option_def = test_case.first; diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index a84dd49b2..d48d0ecf0 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -815,6 +815,32 @@ Status impala::SetQueryOption(const string& key, const string& value, query_options->__set_default_transactional_type(enum_type); break; } + case TImpalaQueryOptions::STATEMENT_EXPRESSION_LIMIT: { + StringParser::ParseResult result; + const int32_t statement_expression_limit = + StringParser::StringToInt(value.c_str(), value.length(), &result); + if (result != StringParser::PARSE_SUCCESS || + statement_expression_limit < MIN_STATEMENT_EXPRESSION_LIMIT) { + return Status(Substitute("Invalid statement expression limit: $0 " + "Valid values are in [$1, $2]", value, MIN_STATEMENT_EXPRESSION_LIMIT, + std::numeric_limits::max())); + } + query_options->__set_statement_expression_limit(statement_expression_limit); + break; + } + case TImpalaQueryOptions::MAX_STATEMENT_LENGTH_BYTES: { + int64_t max_statement_length_bytes; + RETURN_IF_ERROR(ParseMemValue(value, "max statement length bytes", + &max_statement_length_bytes)); + if (max_statement_length_bytes < MIN_MAX_STATEMENT_LENGTH_BYTES || + max_statement_length_bytes > std::numeric_limits::max()) { + return Status(Substitute("Invalid maximum statement length: $0 " + "Valid values are in [$1, $2]", max_statement_length_bytes, + MIN_MAX_STATEMENT_LENGTH_BYTES, std::numeric_limits::max())); + } + query_options->__set_max_statement_length_bytes(max_statement_length_bytes); + break; + } default: if (IsRemovedQueryOption(key)) { LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'"; diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index 6bb7153d7..ff408ddc4 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -47,7 +47,7 @@ typedef std::unordered_map // time we add or remove a query option to/from the enum TImpalaQueryOptions. #define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::DEFAULT_TRANSACTIONAL_TYPE + 1);\ + TImpalaQueryOptions::MAX_STATEMENT_LENGTH_BYTES + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -174,12 +174,21 @@ typedef std::unordered_map QUERY_OPT_FN(spool_query_results, SPOOL_QUERY_RESULTS,\ TQueryOptionLevel::DEVELOPMENT)\ QUERY_OPT_FN(default_transactional_type, DEFAULT_TRANSACTIONAL_TYPE,\ - TQueryOptionLevel::ADVANCED) + TQueryOptionLevel::ADVANCED)\ + QUERY_OPT_FN(statement_expression_limit, STATEMENT_EXPRESSION_LIMIT,\ + TQueryOptionLevel::REGULAR)\ + QUERY_OPT_FN(max_statement_length_bytes, MAX_STATEMENT_LENGTH_BYTES,\ + TQueryOptionLevel::REGULAR) ; /// Enforce practical limits on some query options to avoid undesired query state. - static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB - static const int64_t ROW_SIZE_LIMIT = 1LL << 40; // 1 TB +static const int64_t SPILLABLE_BUFFER_LIMIT = 1LL << 40; // 1 TB +static const int64_t ROW_SIZE_LIMIT = 1LL << 40; // 1 TB + +/// Limits on the query size are intended to be large. Prevent them from being set +/// to small values (which can prevent clients from executing anything). +static const int32_t MIN_STATEMENT_EXPRESSION_LIMIT = 1 << 10; // 1024 +static const int32_t MIN_MAX_STATEMENT_LENGTH_BYTES = 1 << 10; // 1 KB /// Converts a TQueryOptions struct into a map of key, value pairs. Options that /// aren't set and lack defaults in common/thrift/ImpalaInternalService.thrift are diff --git a/common/thrift/ImpalaInternalService.thrift b/common/thrift/ImpalaInternalService.thrift index 3e1289598..be7982285 100644 --- a/common/thrift/ImpalaInternalService.thrift +++ b/common/thrift/ImpalaInternalService.thrift @@ -367,6 +367,18 @@ struct TQueryOptions { // See comment in ImpalaService.thrift 87: optional TTransactionalType default_transactional_type = TTransactionalType.NONE; + + // See comment in ImpalaService.thrift. + // The default of 250,000 is set to a high value to avoid impacting existing users, but + // testing indicates a statement with this number of expressions can run. + 88: optional i32 statement_expression_limit = 250000 + + // See comment in ImpalaService.thrift + // The default is set to 16MB. It is likely that a statement of this size would exceed + // the statement expression limit. Setting a limit on the total statement size avoids + // the cost of parsing and analyzing the statement, which is required to enforce the + // statement expression limit. + 89: optional i32 max_statement_length_bytes = 16777216 } // Impala currently has two types of sessions: Beeswax and HiveServer2 diff --git a/common/thrift/ImpalaService.thrift b/common/thrift/ImpalaService.thrift index ae030515d..d5c57aa4c 100644 --- a/common/thrift/ImpalaService.thrift +++ b/common/thrift/ImpalaService.thrift @@ -415,6 +415,20 @@ enum TImpalaQueryOptions { // Speficies the default transactional type for new HDFS tables. // Valid values: none, insert_only DEFAULT_TRANSACTIONAL_TYPE = 86 + + // Limit on the total number of expressions in the statement. Statements that exceed + // the limit will get an error during analysis. This is intended to set an upper + // bound on the complexity of statements to avoid resource impacts such as excessive + // time in analysis or codegen. This is enforced only for the first pass of analysis + // before any rewrites are applied. + STATEMENT_EXPRESSION_LIMIT = 87 + + // Limit on the total length of a SQL statement. Statements that exceed the maximum + // length will get an error before parsing/analysis. This is complementary to the + // statement expression limit, because statements of a certain size are highly + // likely to violate the statement expression limit. Rejecting them early avoids + // the cost of parsing/analysis. + MAX_STATEMENT_LENGTH_BYTES = 88 } // The summary of a DML statement. diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index bb45c3d21..d9d4d68d6 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -430,6 +430,9 @@ error_codes = ( ("LZ4_DECOMPRESS_SAFE_FAILED", 141, "LZ4: LZ4_decompress_safe failed"), ("LZ4_COMPRESS_DEFAULT_FAILED", 142, "LZ4: LZ4_compress_default failed"), + + ("MAX_STATEMENT_LENGTH_EXCEEDED", 143, "Statement length of $0 bytes exceeds the " + "maximum statement length ($1 bytes)"), ) import sys diff --git a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java index d5312f977..29cb6f873 100644 --- a/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java +++ b/fe/src/main/java/org/apache/impala/analysis/AnalysisContext.java @@ -454,6 +454,14 @@ public class AnalysisContext { Preconditions.checkNotNull(analysisResult_.stmt_); analysisResult_.analyzer_ = createAnalyzer(stmtTableCache); analysisResult_.stmt_.analyze(analysisResult_.analyzer_); + // Enforce the statement expression limit at the end of analysis so that there is an + // accurate count of the total number of expressions. The first analyze() call is not + // very expensive (~seconds) even for large statements. The limit on the total length + // of the SQL statement (max_statement_length_bytes) provides an upper bound. + // It is important to enforce this before expression rewrites, because rewrites are + // expensive with large expression trees. For example, a SQL that takes a few seconds + // to analyze the first time may take 10 minutes for rewrites. + analysisResult_.analyzer_.checkStmtExprLimit(); boolean isExplain = analysisResult_.isExplainStmt(); // Apply expr and subquery rewrites. diff --git a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java index f0656ec86..75c1fbe7a 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Analyzer.java +++ b/fe/src/main/java/org/apache/impala/analysis/Analyzer.java @@ -455,6 +455,12 @@ public class Analyzer { // Expr rewriter for normalizing and rewriting expressions. private final ExprRewriter exprRewriter_; + // Total number of expressions across the statement (including all subqueries). This + // is used to enforce a limit on the total number of expressions. Incremented by + // incrementNumStmtExprs(). Note that this does not include expressions that do not + // require analysis (e.g. some literal expressions). + private int numStmtExprs_ = 0; + public GlobalState(StmtTableCache stmtTableCache, TQueryCtx queryCtx, AuthorizationFactory authzFactory) { this.stmtTableCache = stmtTableCache; @@ -2847,6 +2853,17 @@ public class Analyzer { public int decrementCallDepth() { return --callDepth_; } public int getCallDepth() { return callDepth_; } + public int incrementNumStmtExprs() { return globalState_.numStmtExprs_++; } + public int getNumStmtExprs() { return globalState_.numStmtExprs_; } + public void checkStmtExprLimit() throws AnalysisException { + int statementExpressionLimit = getQueryOptions().getStatement_expression_limit(); + if (getNumStmtExprs() > statementExpressionLimit) { + String errorStr = String.format("Exceeded the statement expression limit (%d)\n" + + "Statement has %d expressions.", statementExpressionLimit, getNumStmtExprs()); + throw new AnalysisException(errorStr); + } + } + public boolean hasMutualValueTransfer(SlotId a, SlotId b) { return hasValueTransfer(a, b) && hasValueTransfer(b, a); } diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java index 53a00817e..1cfa93819 100644 --- a/fe/src/main/java/org/apache/impala/analysis/Expr.java +++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java @@ -347,6 +347,9 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl // analysisDone(). private boolean isAnalyzed_ = false; + // True if this has already been counted towards the number of statement expressions + private boolean isCountedForNumStmtExprs_ = false; + protected Expr() { type_ = Type.INVALID; selectivity_ = -1.0; @@ -369,6 +372,7 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl numDistinctValues_ = other.numDistinctValues_; isConstant_ = other.isConstant_; fn_ = other.fn_; + isCountedForNumStmtExprs_ = other.isCountedForNumStmtExprs_; children_ = Expr.cloneList(other.children_); } @@ -420,6 +424,7 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl throw new AnalysisException(String.format("Exceeded the maximum depth of an " + "expression tree (%s).", EXPR_DEPTH_LIMIT)); } + incrementNumStmtExprs(analyzer); } for (Expr child: children_) { child.analyze(analyzer); @@ -452,6 +457,24 @@ abstract public class Expr extends TreeNode implements ParseNode, Cloneabl } } + /** + * Helper function to properly count the number of statement expressions. + * If this expression has not been counted already and this is not a WITH clause, + * increment the number of statement expressions. This function guarantees that an + * expression will be counted at most once. + */ + private void incrementNumStmtExprs(Analyzer analyzer) { + // WITH clauses use a separate Analyzer with its own GlobalState. Skip counting + // this expression towards that GlobalState. If the view defined by the WITH + // clause is referenced, it will be counted during that analysis. + if (analyzer.hasWithClause()) return; + // If the expression is already counted, do not count it again. This is important + // for expressions that can be cloned (e.g. when doing Expr::trySubstitute()). + if (isCountedForNumStmtExprs_) return; + analyzer.incrementNumStmtExprs(); + isCountedForNumStmtExprs_ = true; + } + /** * Compute and return evalcost of this expr given the evalcost of all children has been * computed. Should be called bottom-up whenever the structure of subtree is modified. diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java index 23910f400..5fcb747e7 100644 --- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java +++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java @@ -2399,6 +2399,165 @@ public class AnalyzeExprsTest extends AnalyzerTest { testFuncExprDepthLimit("cast(", "1", " as int)"); } + /** Generates a specific number of expressions by repeating a column reference. + * It generates a string containing 'num_copies' expressions. If 'use_alias' is true, + * each column reference has a distinct alias. This allows the repeated columns to be + * used in places that require distinct names (e.g. the definition of a view). + */ + String getRepeatedColumnReference(String col_name, int num_copies, boolean use_alias) { + StringBuilder repColRef = new StringBuilder(); + + for (int i = 0; i < num_copies; ++i) { + if (i != 0) repColRef.append(", "); + repColRef.append(col_name); + if (use_alias) repColRef.append(String.format(" alias%d", i)); + } + return repColRef.toString(); + } + + /** Get the error message for a statement with 'actual_num_expressions' that exceeds + * the statment_expression_limit 'limit'. + */ + String getExpressionLimitErrorStr(int actual_num_expressions, int limit) { + return String.format("Exceeded the statement expression limit (%d)\n" + + "Statement has %d expressions", limit, actual_num_expressions); + } + + @Test + public void TestStatementExprLimit() { + // To keep it simple and fast, we use a low value for statement_expression_limit. + // Since the statement_expression_limit is evaluated before rewrites, turn off + // expression rewrites. + TQueryOptions queryOpts = new TQueryOptions(); + queryOpts.setStatement_expression_limit(20); + queryOpts.setEnable_expr_rewrites(false); + AnalysisContext ctx = createAnalysisCtx(queryOpts); + + // Known SQL patterns (repeated reference to column) that generate 20 and 21 + // expressions, respectively. + String repCols20 = getRepeatedColumnReference("int_col", 20, true); + String repCols21 = getRepeatedColumnReference("int_col", 21, true); + + // Select from table + AnalyzesOk(String.format("select %s from functional.alltypes", repCols20), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + AnalysisError(String.format("select %s from functional.alltypes", repCols21), + ctx, getExpressionLimitErrorStr(21, 20)); + + // WHERE clause + // This statement has 20 expressions without the WHERE clause, as tested above. + // So, this will only be an error if the bool_col in the WHERE clause counts. + AnalysisError(String.format("select %s from functional.alltypes where bool_col", + repCols20), ctx, getExpressionLimitErrorStr(21, 20)); + + // Create table as select + AnalyzesOk(String.format("create table exprlimit1 as " + + "select %s from functional.alltypes", repCols20), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + AnalysisError(String.format("create table exprlimit1 as " + + "select %s from functional.alltypes", repCols21), ctx, + getExpressionLimitErrorStr(21, 20)); + + // Create view + AnalyzesOk(String.format("create view exprlimit1 as " + + "select %s from functional.alltypes", repCols20), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + AnalysisError(String.format("create view exprlimit1 as " + + "select %s from functional.alltypes", repCols21), ctx, + getExpressionLimitErrorStr(21, 20)); + + // Subquery + AnalyzesOk(String.format("select * from (select %s from functional.alltypes) x", + repCols20), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + AnalysisError(String.format("select * from (select %s from functional.alltypes) x", + repCols21), ctx, getExpressionLimitErrorStr(21, 20)); + + // WITH clause + AnalyzesOk(String.format("with v as (select %s from functional.alltypes) " + + "select * from v", repCols20), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + AnalysisError(String.format("with v as (select %s from functional.alltypes) " + + "select * from v", repCols21), ctx, getExpressionLimitErrorStr(21, 20)); + + // View is counted (functional.alltypes_parens has a few expressions) + AnalysisError(String.format("select %s from functional.alltypes_parens", repCols20), + ctx, getExpressionLimitErrorStr(32, 20)); + + // If a view is referenced multiple times, it counts multiple times. + AnalysisError(String.format("with v as (select %s from functional.alltypes) " + + "select * from v v1,v v2", repCols20), ctx, getExpressionLimitErrorStr(40, 20)); + + // If a view is not referenced, it doesn't count. + AnalyzesOk(String.format("with v as (select %s from functional.alltypes) select 1", + repCols21), ctx); + // This is zero because literals do not count. + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0); + + // EXPLAIN is not exempted from the limit + AnalysisError(String.format("explain select %s from functional.alltypes", repCols21), + ctx, getExpressionLimitErrorStr(21, 20)); + + // Wide table doesn't impact * + AnalyzesOk("select * from functional.widetable_1000_cols", ctx); + + // Literal expressions don't count towards the limit, so build a list of literals + // to use to test various cases. + StringBuilder literalList = new StringBuilder(); + for (int i = 0; i < 200; ++i) { + if (i != 0) literalList.append(", "); + literalList.append(i); + } + + // Literals in the select list do not count + AnalyzesOk(String.format("select %s", literalList.toString()), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0); + + // Literals in an IN list do not count + AnalyzesOk(String.format("select int_col IN (%s) from functional.alltypes", + literalList.toString()), ctx); + + // Literals in a VALUES clause do not count + StringBuilder valuesList = new StringBuilder(); + for (int i = 0; i < 200; ++i) { + if (i != 0) valuesList.append(", "); + valuesList.append("(" + i + ")"); + } + AnalyzesOk("insert into functional.insert_overwrite_nopart (col1) VALUES " + + valuesList.toString(), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 0); + + // Expressions that are operators applied to constants still count + StringBuilder constantExpr = new StringBuilder(); + for (int i = 0; i < 21; ++i) { + if (i != 0) constantExpr.append(" * "); + constantExpr.append(i); + } + // 21 literals with 20 multiplications requires 20 ArithmeticExprs + AnalyzesOk(String.format("select %s", constantExpr.toString()), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + constantExpr.append(" * 100"); + // 22 literals with 21 multiplications requires 21 ArithmeticExprs + AnalysisError(String.format("select %s", constantExpr.toString()), + ctx, getExpressionLimitErrorStr(21, 20)); + + // Put a non-literal in an IN list to verify it still counts appropriately. + StringBuilder inListExpr = new StringBuilder(); + for (int i = 0; i < 19; i++) { + if (i != 0) inListExpr.append(" * "); + inListExpr.append(i); + } + // 19 literals with 18 multiplications = 18 expressions. int_col and IN are another + // two expressions. This has 20 expressions total. + AnalyzesOk(String.format("select int_col IN (%s) from functional.alltypes", + inListExpr.toString()), ctx); + assertEquals(ctx.getAnalyzer().getNumStmtExprs(), 20); + // Adding one more expression pushes us over the limit. + inListExpr.append(" * 100"); + AnalysisError(String.format("select int_col IN (%s) from functional.alltypes", + inListExpr.toString()), ctx, getExpressionLimitErrorStr(21, 20)); + } + // Verifies the resulting expr decimal type is expectedType under decimal v1 or // decimal v2. private void testDecimalExpr(String expr, Type decimalExpectedType, boolean isV2) { diff --git a/tests/common/impala_connection.py b/tests/common/impala_connection.py index 8b275fb6d..f1a33bb2d 100644 --- a/tests/common/impala_connection.py +++ b/tests/common/impala_connection.py @@ -41,6 +41,26 @@ LOG.propagate = False PROGRESS_LOG_RE = re.compile( r'^Query [a-z0-9:]+ [0-9]+% Complete \([0-9]+ out of [0-9]+\)$') +MAX_SQL_LOGGING_LENGTH = 128 * 1024 + + +# test_exprs.py's TestExprLimits executes extremely large SQLs (multiple MBs). It is the +# only test that runs SQL larger than 128KB. Logging these SQLs in execute() increases +# the size of the JUnitXML files, causing problems for users of JUnitXML like Jenkins. +# This function limits the size of the SQL logged if it is larger than 128KB. +def log_sql_stmt(sql_stmt): + """If the 'sql_stmt' is shorter than MAX_SQL_LOGGING_LENGTH, log it unchanged. If + it is larger than MAX_SQL_LOGGING_LENGTH, truncate it and comment it out.""" + if (len(sql_stmt) <= MAX_SQL_LOGGING_LENGTH): + LOG.info("{0};\n".format(sql_stmt)) + else: + # The logging output should be valid SQL, so the truncated SQL is commented out. + LOG.info("-- Skip logging full SQL statement of length {0}".format(len(sql_stmt))) + LOG.info("-- Logging a truncated version, commented out:") + for line in sql_stmt[0:MAX_SQL_LOGGING_LENGTH].split("\n"): + LOG.info("-- {0}".format(line)) + LOG.info("-- [...]") + # Common wrapper around the internal types of HS2/Beeswax operation/query handles. class OperationHandle(object): def __init__(self, handle, sql_stmt): @@ -180,11 +200,13 @@ class BeeswaxConnection(ImpalaConnection): self.__beeswax_client.close_dml(operation_handle.get_handle()) def execute(self, sql_stmt, user=None): - LOG.info("-- executing against %s\n%s;\n" % (self.__host_port, sql_stmt)) + LOG.info("-- executing against %s\n" % (self.__host_port)) + log_sql_stmt(sql_stmt) return self.__beeswax_client.execute(sql_stmt, user=user) def execute_async(self, sql_stmt, user=None): - LOG.info("-- executing async: %s\n%s;\n" % (self.__host_port, sql_stmt)) + LOG.info("-- executing async: %s\n" % (self.__host_port)) + log_sql_stmt(sql_stmt) beeswax_handle = self.__beeswax_client.execute_query_async(sql_stmt, user=user) return OperationHandle(beeswax_handle, sql_stmt) @@ -312,8 +334,9 @@ class ImpylaHS2Connection(ImpalaConnection): return r def execute_async(self, sql_stmt, user=None): - LOG.info("-- executing against {0} at {1}\n{2};\n".format( - self._is_hive and 'Hive' or 'Impala', self.__host_port, sql_stmt)) + LOG.info("-- executing against {0} at {1}\n".format( + self._is_hive and 'Hive' or 'Impala', self.__host_port)) + log_sql_stmt(sql_stmt) if user is not None: raise NotImplementedError("Not yet implemented for HS2 - authentication") try: diff --git a/tests/query_test/test_exprs.py b/tests/query_test/test_exprs.py index eb50ee7c4..6722ca80e 100644 --- a/tests/query_test/test_exprs.py +++ b/tests/query_test/test_exprs.py @@ -16,6 +16,8 @@ # under the License. import pytest +import re +from random import randint from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.test_dimensions import create_exec_option_dimension @@ -131,6 +133,67 @@ class TestExprLimits(ImpalaTestSuite): cast_query = "select " + self.__gen_deep_func_expr("cast(", "1", " as int)") self.__exec_query(cast_query) + def test_under_statement_expression_limit(self): + """Generate a huge case statement that barely fits within the statement expression + limit and verify that it runs.""" + # This takes 20+ minutes, so only run it on exhaustive. + # TODO: Determine whether this needs to run serially. It use >5 GB of memory. + if self.exploration_strategy() != 'exhaustive': + pytest.skip("Only test limit of codegen on exhaustive") + case = self.__gen_huge_case("int_col", 32, 2, " ") + query = "select {0} as huge_case from functional_parquet.alltypes".format(case) + self.__exec_query(query) + + def test_max_statement_size(self): + """Generate a huge case statement that exceeds the default 16MB limit and verify + that it gets rejected.""" + + expected_err_tmpl = ("Statement length of {0} bytes exceeds the maximum " + "statement length \({1} bytes\)") + size_16mb = 16 * 1024 * 1024 + + # Case 1: a valid SQL that would parse correctly + case = self.__gen_huge_case("int_col", 75, 2, " ") + query = "select {0} as huge_case from functional.alltypes".format(case) + err = self.execute_query_expect_failure(self.client, query) + assert re.search(expected_err_tmpl.format(len(query), size_16mb), str(err)) + + # Case 2: a string of 'a' characters that does not parse. This will still fail + # with the same message, because the check is before parsing. + invalid_sql = 'a' * (size_16mb + 1) + err = self.execute_query_expect_failure(self.client, invalid_sql) + assert re.search(expected_err_tmpl.format(len(invalid_sql), size_16mb), str(err)) + + def test_statement_expression_limit(self): + """Generate a huge case statement that barely fits within the 16MB limit but exceeds + the statement expression limit. Verify that it fails.""" + case = self.__gen_huge_case("int_col", 66, 2, " ") + query = "select {0} as huge_case from functional.alltypes".format(case) + assert len(query) < 16 * 1024 * 1024 + expected_err_re = ("Exceeded the statement expression limit \({0}\)\n" + "Statement has .* expressions.").format(250000) + err = self.execute_query_expect_failure(self.client, query) + assert re.search(expected_err_re, str(err)) + + def __gen_huge_case(self, col_name, fanout, depth, indent): + toks = ["case\n"] + for i in xrange(fanout): + add = randint(1, 1000000) + divisor = randint(1, 10000000) + mod = randint(0, divisor) + # Generate a mathematical expr that can't be easily optimised out. + when_expr = "{0} + {1} % {2} = {3}".format(col_name, add, divisor, mod) + if depth == 0: + then_expr = "{0}".format(i) + else: + then_expr = "({0})".format( + self.__gen_huge_case(col_name, fanout, depth - 1, indent + " ")) + toks.append(indent) + toks.append("when {0} then {1}\n".format(when_expr, then_expr)) + toks.append(indent) + toks.append("end") + return ''.join(toks) + def __gen_deep_infix_expr(self, prefix, repeat_suffix): expr = prefix for i in xrange(self.EXPR_DEPTH_LIMIT - 1): @@ -150,8 +213,8 @@ class TestExprLimits(ImpalaTestSuite): try: impala_ret = self.execute_query(sql_str) assert impala_ret.success, "Failed to execute query %s" % (sql_str) - except: # consider any exception a failure - assert False, "Failed to execute query %s" % (sql_str) + except Exception as e: # consider any exception a failure + assert False, "Failed to execute query %s: %s" % (sql_str, e) class TestUtcTimestampFunctions(ImpalaTestSuite): """Tests for UTC timestamp functions, i.e. functions that do not depend on the behavior