Mirror of https://github.com/apache/impala.git, synced 2025-12-19 18:12:08 -05:00
IMPALA-13703: Cancel running queries before shutdown deadline
Currently, when the graceful shutdown deadline is reached, the Impala daemon
exits immediately, leaving any running queries unfinished. This is not quite
graceful, as it may leave resources unreleased, such as scratch files in
remote storage.

This patch adds a new state to the graceful shutdown process. Before reaching
the shutdown deadline, the Impala daemon tries to cancel any remaining running
queries within a configurable time limit, controlled by the new flag
shutdown_query_cancel_period_s. If this time limit exceeds 20% of the total
shutdown deadline, it is automatically capped at that value. The idea is to
cancel queries only near the end of the graceful shutdown deadline; the 20%
threshold lets us take a more aggressive approach to ensure a graceful
shutdown. If all queries are successfully canceled within this period, the
server shuts down immediately. Otherwise, it shuts down once the deadline is
reached, with queries still running.

Tests:
 - Passed core tests.
 - Added test cases test_shutdown_coordinator_cancel_query,
   test_shutdown_executor_with_query_cancel_period, and
   test_shutdown_coordinator_and_executor_cancel_query.
 - Manually tested shutting down a coordinator or an executor with
   long-running queries; the queries were canceled.

Change-Id: I1cac2e100d329644e21fdceb0b23901b08079130
Reviewed-on: http://gerrit.cloudera.org:8080/22422
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Reviewed-by: Abhishek Rawat <arawat@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
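To make the timing concrete, the following is a minimal standalone sketch of the capping rule described above (the patch itself implements it as GetShutdownCancelThreshold() in impala-server.cc, shown in the diff below); the function and variable names here are illustrative, not part of the patch:

#include <algorithm>
#include <cstdint>

// Sketch only: effective cancellation window in milliseconds, given the two
// flags described above. The window is capped at 20% of the total deadline.
int64_t EffectiveCancelThresholdMs(int64_t shutdown_deadline_s,
    int64_t shutdown_query_cancel_period_s) {
  const double kMaxRatio = 0.2;  // SHUTDOWN_CANCEL_MAX_RATIO in the patch
  return std::min(
      static_cast<int64_t>(shutdown_deadline_s * kMaxRatio * 1000L),
      shutdown_query_cancel_period_s * 1000L);
}

// Example: deadline 60s, cancel period 30s -> min(12000, 30000) = 12000 ms,
// so query cancellation starts roughly 12 seconds before the shutdown deadline.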
@@ -43,6 +43,8 @@

#include "common/names.h"

using std::unordered_set;
using std::vector;
using namespace impala;

// TODO: this logging should go into a per query log.
@@ -228,47 +230,75 @@ void QueryExecMgr::AcquireQueryStateLocked(QueryState* qs) {
  DCHECK(refcnt > 0);
}

void QueryExecMgr::CancelQueriesForFailedCoordinators(
    const std::unordered_set<BackendIdPB>& current_membership) {
  // Build a list of queries that are scheduled by failed coordinators (as
  // evidenced by their absence from the cluster membership list).
  std::vector<QueryCancellationTask> to_cancel;
void QueryExecMgr::CollectQueriesToCancel(std::function<bool(QueryState*)> filter,
    bool is_coord_active, vector<QueryExecMgr::QueryCancellationTask>* to_cancel) {
  DCHECK(to_cancel != nullptr);
  ExecEnv::GetInstance()->query_exec_mgr()->qs_map_.DoFuncForAllEntries(
      [&](QueryState* qs) {
        if (qs != nullptr && !qs->IsCancelled()) {
          if (current_membership.find(qs->GetCoordinatorBackendId())
              == current_membership.end()) {
        if (qs != nullptr && !qs->IsCancelled() && filter(qs)) {
          // decremented by ReleaseQueryState()
          AcquireQueryStateLocked(qs);
            to_cancel.push_back(QueryCancellationTask(qs));
          }
          to_cancel->push_back(QueryCancellationTask(qs, is_coord_active));
        }
      });
}

// Since we are the only producer for the cancellation thread pool, we can find the
// remaining capacity of the pool and submit the new cancellation requests without
// blocking.
void QueryExecMgr::CancelQueries(const QueryCancellationTask& to_cancel) {
  QueryState* qs = to_cancel.GetQueryState();
  DCHECK(qs != nullptr);
  if (qs == nullptr) return;
  VLOG(1) << "CancelFromThreadPool(): cancel query " << PrintId(qs->query_id());
  qs->Cancel();
  qs->is_coord_active_.Store(to_cancel.IsCoordActive());
  ReleaseQueryState(qs);
}

bool QueryExecMgr::ProcessCancelQueries(
    const vector<QueryCancellationTask>& to_cancel, bool handle_full_queue) {
  int query_num_to_cancel = to_cancel.size();
  int remaining_queue_size = QUERY_EXEC_MGR_MAX_CANCELLATION_QUEUE_SIZE
  const int remaining_queue_size = QUERY_EXEC_MGR_MAX_CANCELLATION_QUEUE_SIZE
      - cancellation_thread_pool_->GetQueueSize();
  bool all_handled = true;

  if (query_num_to_cancel > remaining_queue_size) {
    // Fill the queue up to maximum limit, and ignore the rest which will get cancelled
    // eventually anyways when QueryState::ReportExecStatus() hits the timeout.
    if (handle_full_queue) {
      LOG_EVERY_N(WARNING, 60) << "QueryExecMgr cancellation queue is full";
      query_num_to_cancel = remaining_queue_size;
      for (int i = query_num_to_cancel; i < to_cancel.size(); ++i) {
        ReleaseQueryState(to_cancel[i].GetQueryState());
      }
    }
  for (int i = 0; i < query_num_to_cancel; ++i) {
    cancellation_thread_pool_->Offer(to_cancel[i]);
    } else {
      all_handled = false;
    }
  }

void QueryExecMgr::CancelFromThreadPool(const QueryCancellationTask& cancellation_task) {
  QueryState* qs = cancellation_task.GetQueryState();
  VLOG(1) << "CancelFromThreadPool(): cancel query " << PrintId(qs->query_id());
  qs->Cancel();
  qs->is_coord_active_.Store(false);
  ReleaseQueryState(qs);
  for (int i = 0; i < query_num_to_cancel; ++i) {
    cancellation_thread_pool_->Offer(to_cancel[i]);
  }
  return all_handled;
}

void QueryExecMgr::CancelQueriesForFailedCoordinators(
    const unordered_set<BackendIdPB>& current_membership) {
  vector<QueryCancellationTask> to_cancel;
  CollectQueriesToCancel(
      [&](QueryState* qs) {
        return current_membership.find(qs->GetCoordinatorBackendId())
            == current_membership.end();
      },
      false /* is_coord_active */, &to_cancel);
  ProcessCancelQueries(to_cancel, true /* handle_full_queue */);
}

bool QueryExecMgr::CancelQueriesForGracefulShutdown() {
  vector<QueryCancellationTask> to_cancel;
  CollectQueriesToCancel(
      [&](QueryState* qs) { return true; }, true /* is_coord_active */, &to_cancel);
  // If the queue is full for cancellation, the caller, which is the shutdown thread
  // should handle this by retrying later for the rest.
  return ProcessCancelQueries(to_cancel, false /* handle_full_queue */);
}

void QueryExecMgr::CancelFromThreadPool(const QueryCancellationTask& cancellation_task) {
  CancelQueries(cancellation_task);
}
@@ -73,19 +73,28 @@ class QueryExecMgr : public CacheLineAligned {
  void CancelQueriesForFailedCoordinators(
      const std::unordered_set<BackendIdPB>& current_membership);

  /// Before graceful shutdown, cancel all the queries within the daemon.
  /// Returns false if the job hasn't been done yet due to the full cancellation queue.
  /// Otherwise returns true.
  bool CancelQueriesForGracefulShutdown();

  /// Work item for QueryExecMgr::cancellation_thread_pool_.
  /// This class needs to support move construction and assignment for use in ThreadPool.
  class QueryCancellationTask {
   public:
    // Empty constructor needed to make ThreadPool happy.
    QueryCancellationTask() : qs_(nullptr) {}
    QueryCancellationTask(QueryState* qs) : qs_(qs) {}
    QueryCancellationTask() : qs_(nullptr), is_coord_active_(true) {}
    QueryCancellationTask(QueryState* qs, bool is_coord_active)
      : qs_(qs), is_coord_active_(is_coord_active) {}

    QueryState* GetQueryState() const { return qs_; }
    bool IsCoordActive() const { return is_coord_active_; }

   private:
    // QueryState to be cancelled.
    QueryState* qs_;
    // Is the coordinator active.
    bool is_coord_active_;
  };

 private:
@@ -117,5 +126,20 @@ class QueryExecMgr : public CacheLineAligned {
  /// called from the cancellation thread pool. The cancellation_task contains the
  /// QueryState to be cancelled.
  void CancelFromThreadPool(const QueryCancellationTask& cancellation_task);

  // Helper function to collect queries to cancel based on a filter.
  // The 'to_cancel' vector (must not be nullptr) will store any queries
  // to cancel.
  void CollectQueriesToCancel(std::function<bool(QueryState*)> filter,
      bool is_coord_active, std::vector<QueryCancellationTask>* to_cancel);

  // Helper function to enqueue cancellation requests and handle cases where the
  // queue is full accordingly.
  // Returns true if all requests in 'to_cancel' were successfully processed.
  bool ProcessCancelQueries(
      const vector<QueryCancellationTask>& to_cancel, bool handle_full_queue);

  /// Helper function to cancel queries.
  void CancelQueries(const QueryCancellationTask& to_cancel);
};
}
@@ -31,7 +31,10 @@ enum class CancellationWorkCause {
  TERMINATED_BY_SERVER,
  // The query is being terminated because a backend failed. We can skip cancelling the
  // query if the fragment instances running on that backend all completed.
  BACKEND_FAILED
  BACKEND_FAILED,
  // The Impala Server terminated the query during shutdown and the query has not been
  // done yet when the graceful shutdown deadline is reached.
  GRACEFUL_SHUTDOWN
};

/// Work item for ImpalaServer::cancellation_thread_pool_.
@@ -56,10 +59,18 @@ class CancellationWork {
        failed_backends, false);
  }

  // Construct a GRACEFUL_SHUTDOWN CancellationWork instance.
  static CancellationWork GracefulShutdown(
      const TUniqueId& query_id, const Status& error, bool unregister) {
    return CancellationWork(
        query_id, CancellationWorkCause::GRACEFUL_SHUTDOWN, error, {}, unregister);
  }

  const TUniqueId& query_id() const { return query_id_; }
  CancellationWorkCause cause() const { return cause_; }
  const Status& error() const {
    DCHECK_ENUM_EQ(cause_, CancellationWorkCause::TERMINATED_BY_SERVER);
    DCHECK(cause_ == CancellationWorkCause::TERMINATED_BY_SERVER
        || cause_ == CancellationWorkCause::GRACEFUL_SHUTDOWN);
    return error_;
  }
  const std::vector<NetworkAddressPB>& failed_backends() const {
@@ -40,6 +40,8 @@
#include <rapidjson/stringbuffer.h>
#include <sys/types.h>

#include "runtime/query-exec-mgr.h"

#include "catalog/catalog-server.h"
#include "catalog/catalog-util.h"
#include "common/compiler-util.h"
@@ -326,6 +328,13 @@ DEFINE_int64(shutdown_deadline_s, 60 * 60, "Default time limit in seconds for th
    "down process. If this duration elapses after the shut down process is started, "
    "the daemon shuts down regardless of any running queries.");

DEFINE_int64(shutdown_query_cancel_period_s, 60,
    "Time limit in seconds for canceling running queries before the shutdown deadline. "
    "If this value is greater than 0, the Impala daemon will attempt to cancel running "
    "queries starting from this period before reaching the deadline. However, if this "
    "period exceeds 20% of the total shutdown deadline, it will be capped at 20% of the "
    "total shutdown duration.");

#ifndef NDEBUG
DEFINE_int64(stress_metadata_loading_pause_injection_ms, 0, "Simulates metadata loading"
    "for a given query by injecting a sleep equivalent to this configuration in "
@@ -485,6 +494,8 @@ const char* ImpalaServer::QUERY_ERROR_FORMAT = "Query $0 failed:\n$1\n";

// Interval between checks for query expiration.
const int64_t EXPIRATION_CHECK_INTERVAL_MS = 1000L;
// The max allowed ratio of the shutdown query cancel period to the shutdown deadline.
constexpr double SHUTDOWN_CANCEL_MAX_RATIO = 0.2;

// Template to return error messages for client requests that could not be found, belonged
// to the wrong session, or had a mismatched secret. We need to use this particular string
@@ -502,7 +513,8 @@ ThreadSafeRandom ImpalaServer::rng_(GetRandomSeed32());

ImpalaServer::ImpalaServer(ExecEnv* exec_env)
  : exec_env_(exec_env),
    services_started_(false) {
    services_started_(false),
    shutdown_deadline_cancel_queries_(false) {
  // Initialize default config
  InitializeConfigVariables();
@@ -2082,6 +2094,7 @@ void ImpalaServer::CancelFromThreadPool(const CancellationWork& cancellation_wor
  Status error;
  switch (cancellation_work.cause()) {
    case CancellationWorkCause::TERMINATED_BY_SERVER:
    case CancellationWorkCause::GRACEFUL_SHUTDOWN:
      error = cancellation_work.error();
      break;
    case CancellationWorkCause::BACKEND_FAILED: {
@@ -3418,6 +3431,12 @@ Status ImpalaServer::CheckNotShuttingDown() const {
      TErrorCode::SERVER_SHUTTING_DOWN, ShutdownStatusToString(GetShutdownStatus())));
}

static int64_t GetShutdownCancelThreshold() {
  return std::min(
      static_cast<int64_t>(FLAGS_shutdown_deadline_s * SHUTDOWN_CANCEL_MAX_RATIO * 1000L),
      FLAGS_shutdown_query_cancel_period_s * 1000L);
}

ShutdownStatusPB ImpalaServer::GetShutdownStatus() const {
  ShutdownStatusPB result;
  int64_t shutdown_time = shutting_down_.Load();
@@ -3429,6 +3448,10 @@ ShutdownStatusPB ImpalaServer::GetShutdownStatus() const {
  result.set_grace_remaining_ms(
      max<int64_t>(0, FLAGS_shutdown_grace_period_s * 1000 - elapsed_ms));
  result.set_deadline_remaining_ms(max<int64_t>(0, shutdown_deadline - now));
  if (FLAGS_shutdown_query_cancel_period_s > 0) {
    result.set_cancel_deadline_remaining_ms(
        max<int64_t>(0, shutdown_deadline - now - GetShutdownCancelThreshold()));
  }
  result.set_finstances_executing(
      ImpaladMetrics::IMPALA_SERVER_NUM_FRAGMENTS_IN_FLIGHT->GetValue());
  result.set_client_requests_registered(
@@ -3439,11 +3462,16 @@ ShutdownStatusPB ImpalaServer::GetShutdownStatus() const {
}

string ImpalaServer::ShutdownStatusToString(const ShutdownStatusPB& shutdown_status) {
  return Substitute("shutdown grace period left: $0, deadline left: $1, "
      "queries registered on coordinator: $2, queries executing: $3, "
      "fragment instances: $4",
  return Substitute("shutdown grace period left: $0, deadline left: $1, $2"
      "queries registered on coordinator: $3, queries executing: $4, "
      "fragment instances: $5",
      PrettyPrinter::Print(shutdown_status.grace_remaining_ms(), TUnit::TIME_MS),
      PrettyPrinter::Print(shutdown_status.deadline_remaining_ms(), TUnit::TIME_MS),
      FLAGS_shutdown_query_cancel_period_s > 0 ?
          Substitute("cancel deadline left: $0, ",
              PrettyPrinter::Print(
                  shutdown_status.cancel_deadline_remaining_ms(), TUnit::TIME_MS)) :
          "",
      shutdown_status.client_requests_registered(),
      shutdown_status.backend_queries_executing(),
      shutdown_status.finstances_executing());
@@ -3491,10 +3519,20 @@ Status ImpalaServer::StartShutdown(
  }
  if (set_deadline) {
    shutdown_status->set_deadline_remaining_ms(relative_deadline_s * 1000L);
    if (FLAGS_shutdown_query_cancel_period_s > 0) {
      shutdown_status->set_cancel_deadline_remaining_ms(
          relative_deadline_s * 1000L - GetShutdownCancelThreshold());
    }
  }
  return Status::OK();
}

bool ImpalaServer::CancelQueriesForGracefulShutdown() {
  LOG(INFO) << "Start to cancel all running queries for graceful shutdown.";
  // Cancel all active queries this Impala daemon involves.
  return ExecEnv::GetInstance()->query_exec_mgr()->CancelQueriesForGracefulShutdown();
}

[[noreturn]] void ImpalaServer::ShutdownThread() {
  while (true) {
    SleepForMs(1000);
@@ -3506,6 +3544,10 @@ Status ImpalaServer::StartShutdown(
      break;
    } else if (shutdown_status.deadline_remaining_ms() <= 0) {
      break;
    } else if (FLAGS_shutdown_query_cancel_period_s > 0
        && !shutdown_deadline_cancel_queries_
        && shutdown_status.cancel_deadline_remaining_ms() <= 0) {
      shutdown_deadline_cancel_queries_ = CancelQueriesForGracefulShutdown();
    }
  }
@@ -862,6 +862,10 @@ class ImpalaServer : public ImpalaServiceIf,
  Status CloseSessionInternal(const TUniqueId& session_id, const SecretArg& secret,
      bool ignore_if_absent) WARN_UNUSED_RESULT;

  /// Cancel all running queries associated with this Impala daemon.
  /// Returns true if the cancellation of all running queries has already been triggered.
  bool CancelQueriesForGracefulShutdown();

  /// The output of a runtime profile. The output of a profile can be in one of three
  /// formats: string, thrift, or json. The format is specified by TRuntimeProfileFormat.
  /// The struct is a union of all output profiles types. The struct is similar to a
@@ -1673,6 +1677,11 @@ class ImpalaServer : public ImpalaServiceIf,
  /// atomically if a new shutdown command with a shorter deadline comes in.
  AtomicInt64 shutdown_deadline_{0};

  /// Flag that records if the cancel queries before shutdown deadline has been started.
  /// The flag is set when the shutdown cancel queries period has been reached and all
  /// running queries have been sent for cancellation.
  std::atomic_bool shutdown_deadline_cancel_queries_;

  /// Stores the last version number for the admission heartbeat that was sent.
  /// Incremented every time a new admission heartbeat is sent.
  int64_t admission_heartbeat_version_ = 0;
@@ -329,6 +329,10 @@ message ShutdownStatusPB {

  // Number of queries still executing on backend.
  optional int64 backend_queries_executing = 5;

  // Milliseconds remaining in query cancel before shutdown deadline. 0 if the deadline
  // has expired.
  optional int64 cancel_deadline_remaining_ms = 6;
}

message RemoteShutdownResultPB {
@@ -489,18 +489,40 @@ class TestRestart(CustomClusterTestSuite):
    client.close()


def parse_shutdown_result(result):
  """Parse the shutdown result string and return the strings (grace left,
  deadline left, queries registered, queries executing)."""
def _get_shutdown_pattern(with_cancel):
  base_pattern = (r'shutdown grace period left: ([0-9ms]*), '
                  r'deadline left: ([0-9ms]*)')
  cancel_part = r', cancel deadline left: ([0-9ms]*)' if with_cancel else ''
  end_pattern = (r', queries registered on coordinator: ([0-9]*), queries executing: '
                 r'([0-9]*), fragment instances: [0-9]*')
  return base_pattern + cancel_part + end_pattern


def parse_shutdown_result_with_cancel(result):
  """Parse shutdown string with cancel deadline."""
  assert len(result.data) == 1
  summary = result.data[0]
  match = re.match(r'shutdown grace period left: ([0-9ms]*), deadline left: ([0-9ms]*), '
                   r'queries registered on coordinator: ([0-9]*), queries executing: '
                   r'([0-9]*), fragment instances: [0-9]*', summary)
  match = re.match(_get_shutdown_pattern(True), summary)
  assert match is not None, summary
  return match.groups()


def parse_shutdown_result(result):
  """Parse shutdown string without cancel deadline."""
  assert len(result.data) == 1
  summary = result.data[0]
  match = re.match(_get_shutdown_pattern(False), summary)
  assert match is not None, summary
  return match.groups()


def get_remain_shutdown_query_cancel(exec_shutdown_deadline_s,
                                     exec_shutdown_query_cancel_s):
  max_allowed_cancel_s = int(exec_shutdown_deadline_s * 0.2)
  return exec_shutdown_deadline_s - min(exec_shutdown_query_cancel_s,
                                        max_allowed_cancel_s)


class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
  IDLE_SHUTDOWN_GRACE_PERIOD_S = 1
  IMPALA_SHUTDOWN_SIGNAL = signal.SIGRTMIN
@@ -641,22 +663,25 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):

  EXEC_SHUTDOWN_GRACE_PERIOD_S = 5
  EXEC_SHUTDOWN_DEADLINE_S = 10
  EXEC_SHUTDOWN_QUERY_CANCEL_S = 30

  @pytest.mark.execute_serially
  @SkipIfNotHdfsMinicluster.scheduling
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s=0 \
          --hostname={hostname}".format(grace_period=EXEC_SHUTDOWN_GRACE_PERIOD_S,
          deadline=EXEC_SHUTDOWN_DEADLINE_S, hostname=socket.gethostname()))
  def test_shutdown_executor(self):
    self.do_test_shutdown_executor(fetch_delay_s=0)
    self.do_test_shutdown_executor(fetch_delay_s=0, has_query_cancel_period=False)

  @pytest.mark.execute_serially
  @SkipIfNotHdfsMinicluster.scheduling
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s=0 \
          --stress_status_report_delay_ms={status_report_delay_ms} \
          --hostname={hostname}".format(grace_period=EXEC_SHUTDOWN_GRACE_PERIOD_S,
          deadline=EXEC_SHUTDOWN_DEADLINE_S, status_report_delay_ms=5000,
@@ -667,9 +692,22 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    print(self.exploration_strategy)
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip()
    self.do_test_shutdown_executor(fetch_delay_s=5)
    self.do_test_shutdown_executor(fetch_delay_s=5, has_query_cancel_period=False)

  def do_test_shutdown_executor(self, fetch_delay_s):
  @pytest.mark.execute_serially
  @SkipIfNotHdfsMinicluster.scheduling
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s={query_cancel_period} \
          --hostname={hostname}".format(grace_period=EXEC_SHUTDOWN_GRACE_PERIOD_S,
          deadline=EXEC_SHUTDOWN_DEADLINE_S,
          query_cancel_period=EXEC_SHUTDOWN_QUERY_CANCEL_S,
          hostname=socket.gethostname()))
  def test_shutdown_executor_with_query_cancel_period(self):
    self.do_test_shutdown_executor(fetch_delay_s=0, has_query_cancel_period=True)

  def do_test_shutdown_executor(self, fetch_delay_s, has_query_cancel_period):
    """Implementation of test that shuts down and then restarts an executor. This should
    not disrupt any queries that start after the shutdown or complete before the shutdown
    time limit. The test is parameterized by 'fetch_delay_s', the amount to delay before
@@ -697,6 +735,14 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):

    # Shut down and wait for the shutdown state to propagate through statestore.
    result = self.execute_query_expect_success(self.client, SHUTDOWN_EXEC2)
    if has_query_cancel_period:
      assert parse_shutdown_result_with_cancel(result) == (
          "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S),
          "{0}s000ms".format(self.EXEC_SHUTDOWN_DEADLINE_S),
          "{0}s000ms".format(get_remain_shutdown_query_cancel(
              self.EXEC_SHUTDOWN_DEADLINE_S, self.EXEC_SHUTDOWN_QUERY_CANCEL_S)),
          "0", "1")
    else:
      assert parse_shutdown_result(result) == (
          "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S),
          "{0}s000ms".format(self.EXEC_SHUTDOWN_DEADLINE_S), "0", "1")
@@ -743,11 +789,20 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    # Test that a query will fail when the executor shuts down after the limit.
    deadline_expiry_handle = self.__exec_and_wait_until_running(SLOW_QUERY)
    result = self.execute_query_expect_success(self.client, SHUTDOWN_EXEC2)
    if has_query_cancel_period:
      assert parse_shutdown_result_with_cancel(result) == (
          "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S),
          "{0}s000ms".format(self.EXEC_SHUTDOWN_DEADLINE_S),
          "{0}s000ms".format(get_remain_shutdown_query_cancel(
              self.EXEC_SHUTDOWN_DEADLINE_S, self.EXEC_SHUTDOWN_QUERY_CANCEL_S)),
          "0", "1")
    else:
      assert parse_shutdown_result(result) == (
          "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S),
          "{0}s000ms".format(self.EXEC_SHUTDOWN_DEADLINE_S), "0", "1")
    self.cluster.impalads[1].wait_for_exit()
    self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle)
    self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle,
        has_query_cancel_period)

    # Test that we can reduce the deadline after setting it to a high value.
    # Run a query that will fail as a result of the reduced deadline.
@@ -758,12 +813,18 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    LOW_DEADLINE = 5
    result = self.execute_query_expect_success(
        self.client, SHUTDOWN_EXEC3.format(HIGH_DEADLINE))
    if has_query_cancel_period:
      grace, deadline, _, _, _ = parse_shutdown_result_with_cancel(result)
    else:
      grace, deadline, _, _ = parse_shutdown_result(result)
    assert grace == "{0}s000ms".format(self.EXEC_SHUTDOWN_GRACE_PERIOD_S)
    assert deadline == "{0}m{1}s".format(HIGH_DEADLINE // 60, HIGH_DEADLINE % 60)

    result = self.execute_query_expect_success(
        self.client, SHUTDOWN_EXEC3.format(VERY_HIGH_DEADLINE))
    if has_query_cancel_period:
      _, deadline, _, _, _ = parse_shutdown_result_with_cancel(result)
    else:
      _, deadline, _, _ = parse_shutdown_result(result)
    LOG.info("Deadline is {0}".format(deadline))
    min_string, sec_string = re.match("([0-9]*)m([0-9]*)s", deadline).groups()
@@ -772,22 +833,31 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):

    result = self.execute_query_expect_success(
        self.client, SHUTDOWN_EXEC3.format(LOW_DEADLINE))
    if has_query_cancel_period:
      _, deadline, _, _, queries_executing = parse_shutdown_result_with_cancel(result)
    else:
      _, deadline, _, queries_executing = parse_shutdown_result(result)
    assert deadline == "{0}s000ms".format(LOW_DEADLINE)
    assert int(queries_executing) > 0, "Slow query should still be running."
    self.cluster.impalads[2].wait_for_exit()
    self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle)
    self.__check_deadline_expired(SLOW_QUERY, deadline_expiry_handle,
        has_query_cancel_period)

  COORD_SHUTDOWN_GRACE_PERIOD_S = 5
  COORD_SHUTDOWN_DEADLINE_S = 120
  COORD_SHUTDOWN_FAST_DEADLINE_S = 20
  COORD_SHUTDOWN_QUERY_CANCEL_PERIOD_S = 10

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s={query_cancel_period} \
          --hostname={hostname}".format(
          grace_period=COORD_SHUTDOWN_GRACE_PERIOD_S,
          deadline=COORD_SHUTDOWN_DEADLINE_S, hostname=socket.gethostname()),
          deadline=COORD_SHUTDOWN_DEADLINE_S,
          query_cancel_period=COORD_SHUTDOWN_QUERY_CANCEL_PERIOD_S,
          hostname=socket.gethostname()),
      default_query_options=[("num_scanner_threads", "1")])
  @needs_session(TCLIService.TProtocolVersion.HIVE_CLI_SERVICE_PROTOCOL_V6,
      close_session=False)
@@ -806,7 +876,7 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):

    # Shut down the coordinator. Operations that start after this point should fail.
    result = self.execute_query_expect_success(self.client, SHUTDOWN)
    grace, deadline, registered, _ = parse_shutdown_result(result)
    grace, deadline, _, registered, _ = parse_shutdown_result_with_cancel(result)
    assert grace == "{0}s000ms".format(self.COORD_SHUTDOWN_GRACE_PERIOD_S)
    assert deadline == "{0}m".format(self.COORD_SHUTDOWN_DEADLINE_S // 60), "4"
    assert registered == "3"
@@ -860,6 +930,80 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    self.client.close_query(before_shutdown_handle)
    self.cluster.impalads[0].wait_for_exit()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s={query_cancel_period} \
          --hostname={hostname}".format(
          grace_period=COORD_SHUTDOWN_GRACE_PERIOD_S,
          deadline=COORD_SHUTDOWN_FAST_DEADLINE_S,
          query_cancel_period=COORD_SHUTDOWN_QUERY_CANCEL_PERIOD_S,
          hostname=socket.gethostname()),
      default_query_options=[("num_scanner_threads", "1")])
  def test_shutdown_coordinator_cancel_query(self):
    """Test that shuts down the coordinator with a short deadline, the slow query should
    be cancelled before the deadline is reached."""
    # Start a slow query running.
    # Set NUM_SCANNER_THREADS=1 above to make the runtime more predictable.
    SLOW_QUERY = """select * from tpch_parquet.lineitem where sleep(1) < l_orderkey"""
    SHUTDOWN = ": shutdown()"

    slow_query_handle = self.__exec_and_wait_until_running(SLOW_QUERY)

    # Shut down the coordinator.
    result = self.execute_query_expect_success(self.client, SHUTDOWN)
    grace, deadline, cancel, registered, _ = parse_shutdown_result_with_cancel(result)
    assert grace == "{0}s000ms".format(self.COORD_SHUTDOWN_GRACE_PERIOD_S)
    assert deadline == "{0}s000ms".format(self.COORD_SHUTDOWN_FAST_DEADLINE_S)
    assert cancel == "{0}s000ms".format(get_remain_shutdown_query_cancel(
        self.COORD_SHUTDOWN_FAST_DEADLINE_S, self.COORD_SHUTDOWN_FAST_DEADLINE_S))
    assert registered == "2"

    # This query is too slow to complete before the deadline, because the
    # query_cancel_period is set, this query should be cancelled before shutdown.
    self.__check_deadline_expired(SLOW_QUERY, slow_query_handle, True)
    self.cluster.impalads[0].wait_for_exit()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--shutdown_grace_period_s={grace_period} \
          --shutdown_deadline_s={deadline} \
          --shutdown_query_cancel_period_s={query_cancel_period} \
          --hostname={hostname}".format(
          grace_period=COORD_SHUTDOWN_GRACE_PERIOD_S,
          deadline=COORD_SHUTDOWN_FAST_DEADLINE_S,
          query_cancel_period=COORD_SHUTDOWN_QUERY_CANCEL_PERIOD_S,
          hostname=socket.gethostname()),
      default_query_options=[("num_scanner_threads", "1")])
  def test_shutdown_coordinator_and_executor_cancel_query(self):
    """Test that shuts down the executor and coordinator, the slow query should
    be cancelled before the deadline is reached."""
    # Start two slow queries running.
    # Set NUM_SCANNER_THREADS=1 above to make the runtime more predictable.
    SLOW_QUERY = """select * from tpch_parquet.lineitem where sleep(1) < l_orderkey"""
    SHUTDOWN = ": shutdown()"
    SHUTDOWN_EXEC2 = ": shutdown('localhost:27001')"

    slow_query_handle = self.__exec_and_wait_until_running(SLOW_QUERY)

    # Shut down the executor.
    result = self.execute_query_expect_success(self.client, SHUTDOWN_EXEC2)
    grace, deadline, cancel, registered, running =\
        parse_shutdown_result_with_cancel(result)
    assert grace == "{0}s000ms".format(self.COORD_SHUTDOWN_GRACE_PERIOD_S)
    assert deadline == "{0}s000ms".format(self.COORD_SHUTDOWN_FAST_DEADLINE_S)
    assert cancel == "{0}s000ms".format(get_remain_shutdown_query_cancel(
        self.COORD_SHUTDOWN_FAST_DEADLINE_S, self.COORD_SHUTDOWN_FAST_DEADLINE_S))
    assert registered == "0"
    assert running > 0
    self.cluster.impalads[1].wait_for_exit()
    # The slow query should be cancelled.
    self.__check_deadline_expired(SLOW_QUERY, slow_query_handle, True)
    # Shut down the coordinator.
    self.execute_query_expect_success(self.client, SHUTDOWN)
    self.cluster.impalads[0].wait_for_exit()

  def __exec_and_wait_until_running(self, query, timeout=20):
    """Execute 'query' with self.client and wait until it is in the RUNNING state.
    'timeout' controls how long we will wait"""
@@ -884,13 +1028,17 @@ class TestGracefulShutdown(CustomClusterTestSuite, HS2TestSuite):
    assert backends_match is not None, profile
    return int(backends_match.group(1))

  def __check_deadline_expired(self, query, handle):
  def __check_deadline_expired(self, query, handle, has_query_cancel_period):
    """Check that the query with 'handle' fails because of a backend hitting the
    deadline and shutting down."""
    deadline and shutting down. If query_cancel_period is set, the query should
    be cancelled by the server before shutdown."""
    try:
      self.client.fetch(query, handle)
      assert False, "Expected query to fail"
    except Exception as e:
      if has_query_cancel_period:
        assert 'Cancelled' in str(e)
      else:
        assert 'Failed due to unreachable impalad(s)' in str(e)

  @pytest.mark.execute_serially