mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-14433: Fix OpenTelemetry Tracing Deadlock
All functions in the SpanManager class operate under the assumption that child_span_mu_ in the SpanManager class will be locked before the ClientRequestState lock. However, the ImpalaServer::ExecuteInternal function takes the ClientRequestState lock before calling SpanManager::EndChildSpanPlanning. If another function in the SpanManager class has already taken the child_span_mu_ lock and is waiting for the ClientRequestState lock, a deadlock occurs. This issue was found by running end-to-end tests with OpenTelemetry tracing enabled and a release buildof Impala. Testing accomplished by re-running the end-to-end tests with OpenTelemetry tracing enabled and verifying that the deadlock no longer occurs. Change-Id: I7b43dba794cfe61d283bdd476e4056b9304d8947 Reviewed-on: http://gerrit.cloudera.org:8080/23422 Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com> Reviewed-by: Michael Smith <michael.smith@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
@@ -252,6 +252,7 @@ void SpanManager::StartChildSpanPlanning() {
|
||||
|
||||
void SpanManager::EndChildSpanPlanning() {
|
||||
lock_guard<mutex> l(child_span_mu_);
|
||||
lock_guard<mutex> crs_lock(*(client_request_state_->lock()));
|
||||
DoEndChildSpanPlanning();
|
||||
} // function EndChildSpanPlanning
|
||||
|
||||
|
||||
@@ -92,15 +92,11 @@ public:
|
||||
// client_request_state_->lock().
|
||||
void EndChildSpanInit();
|
||||
void EndChildSpanSubmitted();
|
||||
void EndChildSpanPlanning();
|
||||
void EndChildSpanAdmissionControl(const Status& cause);
|
||||
void EndChildSpanQueryExecution();
|
||||
void EndChildSpanClose();
|
||||
|
||||
// Function to end the Planning child span. This function takes ownership of
|
||||
// child_span_mu_ BUT NOT client_request_state_->lock(). The code calling this function
|
||||
// already holds client_request_state_->lock().
|
||||
void EndChildSpanPlanning();
|
||||
|
||||
private:
|
||||
// Tracer instance used to construct spans.
|
||||
opentelemetry::nostd::shared_ptr<opentelemetry::trace::Tracer> tracer_;
|
||||
@@ -120,6 +116,9 @@ private:
|
||||
std::shared_ptr<TimedSpan> root_;
|
||||
|
||||
// TimedSpan instance for the current child span and the mutex to protect it.
|
||||
// To ensure no deadlocks, locks must be acquired in the following order:
|
||||
// 1. SpanManager::child_span_mu_
|
||||
// 2. ClientRequestState::lock_
|
||||
std::unique_ptr<TimedSpan> current_child_;
|
||||
std::mutex child_span_mu_;
|
||||
|
||||
|
||||
@@ -1422,10 +1422,10 @@ Status ImpalaServer::ExecuteInternal(const TQueryCtx& query_ctx,
|
||||
if (result.__isset.result_set_metadata) {
|
||||
(*query_handle)->set_result_metadata(result.result_set_metadata);
|
||||
}
|
||||
}
|
||||
|
||||
if ((*query_handle)->otel_trace_query()) {
|
||||
(*query_handle)->otel_span_manager()->EndChildSpanPlanning();
|
||||
}
|
||||
if ((*query_handle)->otel_trace_query()) {
|
||||
(*query_handle)->otel_span_manager()->EndChildSpanPlanning();
|
||||
}
|
||||
|
||||
VLOG(2) << "Execution request: "
|
||||
|
||||
Reference in New Issue
Block a user