IMPALA-14480: Optional OpenTelemetry DCHECKs

The code in span-manager.cc contains aggressive DCHECKS that rely on
the query lifecycle to be deterministic. In reality, the query
lifecycle is not completely deterministic due to multiple threads
being involved in execution, result retrieval, query shutdown, etc.

On debug builds only, a new flag named, otel_trace_exhaustive_dchecks
will be available with a default of 'false'. If set to 'true', then
optional DCHECKs will be enabled in the SpanManager class to enable
identification of edge cases where the query lifecycle proceeds in an
unexpected way.

The DCHECKs that are controlled by the new flag are those that rely
on a specific ordering of start/end child span and add child span
event calls.

Change-Id: Id6507f3f0e23ecf7c2bece9a6b6c2d86bfac1e57
Reviewed-on: http://gerrit.cloudera.org:8080/23518
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
jasonmfehr
2025-09-17 12:33:41 -07:00
committed by Impala Public Jenkins
parent fdad9d3204
commit 336034debd
4 changed files with 68 additions and 17 deletions

View File

@@ -61,6 +61,10 @@ DEFINE_validator(otel_trace_exporter, [](const char* flagname, const string& val
return false;
}); // flag otel_trace_exporter
DEFINE_bool_hidden(otel_trace_exhaustive_dchecks, false, "Specifies whether or not to "
"enable exhaustive DCHECKs in OpenTelemetry code. These DCHECKs fail if spans are "
"not properly started and ended.");
//
// Start of HTTP related flags.
//

View File

@@ -49,6 +49,10 @@ DECLARE_string(cluster_id);
DECLARE_int32(otel_trace_retry_policy_max_attempts);
DECLARE_int32(otel_trace_retry_policy_max_backoff_s);
#ifndef NDEBUG
DECLARE_bool(otel_trace_exhaustive_dchecks);
#endif
// Names of attributes only on Root spans.
static constexpr char const* ATTR_ERROR_MESSAGE = "ErrorMessage";
static constexpr char const* ATTR_QUERY_START_TIME = "QueryStartTime";
@@ -181,8 +185,12 @@ void SpanManager::AddChildSpanEvent(const nostd::string_view& name) {
LOG(WARNING) << strings::Substitute("Attempted to add event '$0' with no active "
"child span trace_id=\"$1\" span_id=\"$2\"\n$3", name.data(), root_->TraceId(),
root_->SpanId(), GetStackTrace());
DCHECK(current_child_) << strings::Substitute("Cannot add event '$0' when child span "
"is not active.", name.data());
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK(current_child_) << strings::Substitute("Cannot add event '$0' when child "
"span is not active.", name.data());
}
#endif
}
} // function AddChildSpanEvent
@@ -219,7 +227,11 @@ void SpanManager::EndChildSpanInit() {
} // function EndChildSpanInit
inline void SpanManager::DoEndChildSpanInit(const Status* cause) {
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::INIT);
}
#endif
EndChildSpan(
cause,
@@ -241,7 +253,11 @@ void SpanManager::EndChildSpanSubmitted() {
} // function EndChildSpanSubmitted
inline void SpanManager::DoEndChildSpanSubmitted(const Status* cause) {
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::SUBMITTED);
}
#endif
EndChildSpan(cause);
} // function DoEndChildSpanSubmitted
@@ -257,7 +273,11 @@ void SpanManager::EndChildSpanPlanning() {
} // function EndChildSpanPlanning
inline void SpanManager::DoEndChildSpanPlanning(const Status* cause) {
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::PLANNING);
}
#endif
EndChildSpan(
cause,
OtelAttributesMap{
@@ -283,7 +303,11 @@ inline void SpanManager::DoEndChildSpanAdmissionControl(const Status* cause) {
return; // <-- EARLY RETURN
}
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::ADMISSION_CONTROL);
}
#endif
bool queued = false;
string adm_result;
@@ -344,7 +368,11 @@ inline void SpanManager::DoEndChildSpanQueryExecution(const Status* cause) {
return; // <-- EARLY RETURN
}
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::QUERY_EXEC);
}
#endif
OtelAttributesMap attrs;
if (client_request_state_->exec_request().stmt_type == TStmtType::QUERY) {
@@ -399,7 +427,11 @@ void SpanManager::StartChildSpanClose(const Status* cause) {
} // function StartChildSpanClose
void SpanManager::EndChildSpanClose() {
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK_CHILD_SPAN_TYPE(ChildSpanType::CLOSE);
}
#endif
lock_guard<mutex> l(child_span_mu_);
@@ -459,9 +491,13 @@ inline void SpanManager::ChildSpanBuilder(const ChildSpanType& span_type,
"another child span '$1' is still active trace_id=\"$2\" span_id=\"$3\"\n$4",
to_string(span_type), to_string(child_span_type_), root_->TraceId(),
root_->SpanId(), GetStackTrace());
DCHECK(false) << strings::Substitute("Should not start child span '$0' when child "
"span '$1' is already active.", to_string(span_type),
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK(false) << strings::Substitute("Should not start child span '$0' when "
"child span '$1' is already active.", to_string(span_type),
to_string(child_span_type_));
}
#endif
{
lock_guard<mutex> crs_lock(*(client_request_state_->lock()));
@@ -529,8 +565,12 @@ inline void SpanManager::EndChildSpan(const Status* cause,
LOG(WARNING) << strings::Substitute("Attempted to end a non-active child span "
"trace_id=\"$0\" span_id=\"$1\"\n$2", root_->TraceId(), root_->SpanId(),
GetStackTrace());
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK(current_child_) << "Cannot end child span when one is not active.";
}
#endif
}
} // function EndChildSpan
inline void SpanManager::EndActiveChildSpan(const Status* cause) {
@@ -555,8 +595,12 @@ inline void SpanManager::EndActiveChildSpan(const Status* cause) {
LOG(WARNING) << "Attempted to start Close child span while another Close child "
"span is already active trace_id=\"$0\" span_id=\"$1\"\n$2", root_->TraceId(),
current_child_->SpanId(), GetStackTrace();
DCHECK(false) << "Cannot start a new Close child span while a Close child span is "
"already active.";
#ifndef NDEBUG
if (FLAGS_otel_trace_exhaustive_dchecks) {
DCHECK(false) << "Cannot start a new Close child span while a Close child span "
"is already active.";
}
#endif
break;
default:
// No-op, no active child span to end.

View File

@@ -67,7 +67,8 @@ public:
void AddChildSpanEvent(const opentelemetry::nostd::string_view& name);
// Functions to start child spans. If another child span is active, it will be ended,
// a warning will be logged, and a DCHECK failed.
// a warning will be logged, and, if the otel_trace_exhaustive_dchecks flag is true,
// a DCHECK will fail.
void StartChildSpanInit();
void StartChildSpanSubmitted();
void StartChildSpanPlanning();
@@ -116,7 +117,9 @@ private:
std::shared_ptr<TimedSpan> root_;
// TimedSpan instance for the current child span and the mutex to protect it.
// To ensure no deadlocks, locks must be acquired in the following order:
// To ensure no deadlocks, locks must be acquired in the following order. Note that
// ClientRequestState::lock_ only needs to be held when interacting with the
// client_request_state_ variable. It should not be held otherwise.
// 1. SpanManager::child_span_mu_
// 2. ClientRequestState::lock_
std::unique_ptr<TimedSpan> current_child_;

View File

@@ -33,7 +33,7 @@ from tests.util.retry import retry
OUT_DIR = "out_dir_traces"
TRACE_FILE = "export-trace.jsonl"
TRACE_FLAGS = "--otel_trace_enabled=true --otel_trace_exporter=file " \
"--otel_file_flush_interval_ms=500 " \
"--otel_trace_exhaustive_dchecks --otel_file_flush_interval_ms=500 " \
"--otel_file_pattern={out_dir_traces}/" + TRACE_FILE