mirror of
https://github.com/apache/impala.git
synced 2025-12-20 10:29:58 -05:00
IMPALA-5845: Limit the number of non-fatal errors logging to INFO
RuntimeState::LogError() both aggregates errors for the coordinator and logs each error to the log file, depending on the vlog_level. This can flood the INFO log if the specified vlog_level is 1 and make it difficult to analyze other, more significant log lines. This patch limits the number of errors logged to INFO based on the max_error_logs_per_instance flag (default is 2000). When this number is exceeded, vlog_level=1 will be downgraded to vlog_level=2. To allow easy debugging in the future, this flag will be ignored if the user sets query option max_errors < 0, in which case all errors targeting vlog_level 1 will be logged. This patch also fixes a bug where the error count is not increased for a non-general error code that is already in the 'error_log_' map. Testing: - Add test_logging.py::TestLoggingCore Change-Id: I924768ec461735c172fbf75d6415033bbdb77f9b Reviewed-on: http://gerrit.cloudera.org:8080/18565 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
97d3b25be3
commit
7273cfdfb9
@@ -118,12 +118,6 @@ QueryState::QueryState(
|
|||||||
}
|
}
|
||||||
TQueryOptions& query_options =
|
TQueryOptions& query_options =
|
||||||
const_cast<TQueryOptions&>(query_ctx_.client_request.query_options);
|
const_cast<TQueryOptions&>(query_ctx_.client_request.query_options);
|
||||||
// max_errors does not indicate how many errors in total have been recorded, but rather
|
|
||||||
// how many are distinct. It is defined as the sum of the number of generic errors and
|
|
||||||
// the number of distinct other errors.
|
|
||||||
if (query_options.max_errors <= 0) {
|
|
||||||
query_options.max_errors = 100;
|
|
||||||
}
|
|
||||||
if (query_options.batch_size <= 0) {
|
if (query_options.batch_size <= 0) {
|
||||||
query_options.__set_batch_size(DEFAULT_BATCH_SIZE);
|
query_options.__set_batch_size(DEFAULT_BATCH_SIZE);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -59,7 +59,11 @@
|
|||||||
|
|
||||||
using strings::Substitute;
|
using strings::Substitute;
|
||||||
|
|
||||||
DECLARE_int32(max_errors);
|
DEFINE_int32(max_error_logs_per_instance, 2000,
|
||||||
|
"Maximum number of non-fatal error to be logged in log level 1 (INFO). "
|
||||||
|
"Once this number exceeded, further non-fatal error will be logged at log level 2 "
|
||||||
|
"(DEBUG) severity. This flag is ignored if user set negative max_errors query "
|
||||||
|
"option. Default to 2000");
|
||||||
|
|
||||||
namespace impala {
|
namespace impala {
|
||||||
|
|
||||||
@@ -190,11 +194,37 @@ string RuntimeState::ErrorLog() {
|
|||||||
|
|
||||||
bool RuntimeState::LogError(const ErrorMsg& message, int vlog_level) {
|
bool RuntimeState::LogError(const ErrorMsg& message, int vlog_level) {
|
||||||
lock_guard<SpinLock> l(error_log_lock_);
|
lock_guard<SpinLock> l(error_log_lock_);
|
||||||
// All errors go to the log, unreported_error_count_ is counted independently of the
|
// All errors go to the log. If the amount of errors logged to vlog level 1 exceed
|
||||||
// size of the error_log to account for errors that were already reported to the
|
// or equal max_error_logs_per_instance, then that error will be downgraded to vlog
|
||||||
// coordinator
|
// level 2.
|
||||||
VLOG(vlog_level) << "Error from query " << PrintId(query_id()) << ": " << message.msg();
|
int user_max_errors = query_options().max_errors;
|
||||||
if (ErrorCount(error_log_) < query_options().max_errors) {
|
if (vlog_level == 1 && user_max_errors >= 0
|
||||||
|
&& vlog_1_errors >= FLAGS_max_error_logs_per_instance) {
|
||||||
|
vlog_level = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (VLOG_IS_ON(vlog_level)) {
|
||||||
|
VLOG(vlog_level) << "Error from query " << PrintId(query_id()) << ": "
|
||||||
|
<< message.msg();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vlog_level == 1 && user_max_errors >= 0) {
|
||||||
|
vlog_1_errors++;
|
||||||
|
DCHECK_LE(vlog_1_errors, FLAGS_max_error_logs_per_instance);
|
||||||
|
if (vlog_1_errors == FLAGS_max_error_logs_per_instance) {
|
||||||
|
VLOG(vlog_level) << "Query " << PrintId(query_id()) << " printed "
|
||||||
|
<< FLAGS_max_error_logs_per_instance
|
||||||
|
<< " non-fatal error to log level 1 (INFO). Further non-fatal "
|
||||||
|
<< "error will be downgraded to log level 2 (DEBUG).";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
TErrorCode::type code = message.error();
|
||||||
|
if (ErrorCount(error_log_) < max_errors()
|
||||||
|
|| (code != TErrorCode::GENERAL && error_log_.find(code) != error_log_.end())) {
|
||||||
|
// Appending general error is expensive since it writes the entire message to the
|
||||||
|
// error_log_ map. Meanwhile, appending non-general (specific) error that already
|
||||||
|
// exist in error_log_ is cheap since it only increment count.
|
||||||
AppendError(&error_log_, message);
|
AppendError(&error_log_, message);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -166,6 +166,15 @@ class RuntimeState {
|
|||||||
return Status::OK();
|
return Status::OK();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return maximum number of non-fatal error to report to client through coordinator.
|
||||||
|
/// max_errors does not indicate how many errors in total have been recorded, but rather
|
||||||
|
/// how many are distinct. It is defined as the sum of the number of generic errors and
|
||||||
|
/// the number of distinct other errors. Default to 100 if non-positive number is
|
||||||
|
/// specified in max_errors query option.
|
||||||
|
inline int max_errors() const {
|
||||||
|
return query_options().max_errors <= 0 ? 100 : query_options().max_errors;
|
||||||
|
}
|
||||||
|
|
||||||
/// Log an error that will be sent back to the coordinator based on an instance of the
|
/// Log an error that will be sent back to the coordinator based on an instance of the
|
||||||
/// ErrorMsg class. The runtime state aggregates log messages based on type with one
|
/// ErrorMsg class. The runtime state aggregates log messages based on type with one
|
||||||
/// exception: messages with the GENERAL type are not aggregated but are kept
|
/// exception: messages with the GENERAL type are not aggregated but are kept
|
||||||
@@ -318,6 +327,9 @@ class RuntimeState {
|
|||||||
/// Logs error messages.
|
/// Logs error messages.
|
||||||
ErrorLogMap error_log_;
|
ErrorLogMap error_log_;
|
||||||
|
|
||||||
|
/// Track how many error has been printed to VLOG(1).
|
||||||
|
int64_t vlog_1_errors = 0;
|
||||||
|
|
||||||
/// Global QueryState and original thrift descriptors for this fragment instance.
|
/// Global QueryState and original thrift descriptors for this fragment instance.
|
||||||
QueryState* const query_state_;
|
QueryState* const query_state_;
|
||||||
const TPlanFragment* const fragment_;
|
const TPlanFragment* const fragment_;
|
||||||
|
|||||||
61
tests/custom_cluster/test_logging.py
Normal file
61
tests/custom_cluster/test_logging.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoggingCore(CustomClusterTestSuite):
|
||||||
|
"""Test existence of certain log lines under some scenario."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_workload(cls):
|
||||||
|
return 'functional-query'
|
||||||
|
|
||||||
|
def _test_max_errors(self, max_error_logs_per_instance, max_errors, expect_downgraded):
|
||||||
|
"""Test that number of non-fatal error printed to INFO log is limited by
|
||||||
|
max_errors and max_error_logs_per_instance."""
|
||||||
|
|
||||||
|
query = ("select id, bool_col, tinyint_col, smallint_col "
|
||||||
|
"from functional.alltypeserror order by id")
|
||||||
|
client = self.create_impala_client()
|
||||||
|
|
||||||
|
self.execute_query_expect_success(client, query, {'max_errors': max_errors})
|
||||||
|
self.assert_impalad_log_contains("INFO", "Error parsing row",
|
||||||
|
max_error_logs_per_instance if expect_downgraded else 8)
|
||||||
|
self.assert_impalad_log_contains("INFO",
|
||||||
|
"printed {0} non-fatal error to log level 1".format(max_error_logs_per_instance),
|
||||||
|
1 if expect_downgraded else 0)
|
||||||
|
|
||||||
|
@pytest.mark.execute_serially
|
||||||
|
@CustomClusterTestSuite.with_args(cluster_size=1,
|
||||||
|
impalad_args="--max_error_logs_per_instance=2")
|
||||||
|
def test_max_errors(self):
|
||||||
|
self._test_max_errors(2, 4, True)
|
||||||
|
|
||||||
|
@pytest.mark.execute_serially
|
||||||
|
@CustomClusterTestSuite.with_args(cluster_size=1,
|
||||||
|
impalad_args="--max_error_logs_per_instance=3")
|
||||||
|
def test_max_errors_0(self):
|
||||||
|
self._test_max_errors(3, 0, True)
|
||||||
|
|
||||||
|
@pytest.mark.execute_serially
|
||||||
|
@CustomClusterTestSuite.with_args(cluster_size=1,
|
||||||
|
impalad_args="--max_error_logs_per_instance=2")
|
||||||
|
def test_max_errors_no_downgrade(self):
|
||||||
|
self._test_max_errors(2, -1, False)
|
||||||
Reference in New Issue
Block a user