IMPALA-14493: Cap memory usage of global admission service

The global admission service can experience OOM errors under high concurrency because its process memory tracker is inaccurate and doesn't account for all memory allocations. Ensuring that the memory tracker accurately accounts for every allocation could be difficult, so this patch takes a simpler approach: it introduces a hard memory cap based on tcmalloc statistics, which accurately reflect the true process memory usage. If a new query is submitted while tcmalloc memory usage is over the process limit, the query will be rejected immediately to protect against OOM. Adds a new flag, enable_admission_service_mem_safeguard, allowing this feature to be enabled or disabled. By default, this feature is turned on. Tests: Added test test_admission_service_low_mem_limit. Passed exhaustive tests. Change-Id: I2ee2c942a73fcd69358851fc2fdc0fc4fe531c73 Reviewed-on: http://gerrit.cloudera.org:8080/23542 Reviewed-by: Abhishek Rawat <arawat@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2025-10-13 16:36:28 -07:00
parent ff8bb33b91
commit 1bc7cdbff6
4 changed files with 57 additions and 2 deletions
--- a/tests/custom_cluster/test_admission_controller.py
+++ b/tests/custom_cluster/test_admission_controller.py
@@ -1100,7 +1100,8 @@ class TestAdmissionController(TestAdmissionControllerBase):
  @CustomClusterTestSuite.with_args(
      impalad_args=impalad_admission_ctrl_flags(max_requests=10, max_queued=10,
        pool_max_mem=10 * 1024 * 1024, proc_mem_limit=2 * 1024 * 1024,
-        queue_wait_timeout_ms=1000),
+        queue_wait_timeout_ms=1000)
+      + " --enable_admission_service_mem_safeguard=false",
      statestored_args=_STATESTORED_ARGS)
  def test_timeout_reason_host_memory(self):
    self.client.set_configuration_option('enable_trivial_query_for_admission', 'false')
@@ -1134,7 +1135,8 @@ class TestAdmissionController(TestAdmissionControllerBase):
  @CustomClusterTestSuite.with_args(
      impalad_args=impalad_admission_ctrl_flags(max_requests=10, max_queued=10,
        pool_max_mem=2 * 1024 * 1024, proc_mem_limit=20 * 1024 * 1024,
-        queue_wait_timeout_ms=1000),
+        queue_wait_timeout_ms=1000)
+      + " --enable_admission_service_mem_safeguard=false",
      statestored_args=_STATESTORED_ARGS)
  def test_timeout_reason_pool_memory(self):
    self.client.set_configuration_option('enable_trivial_query_for_admission', 'false')
@@ -2297,6 +2299,23 @@ class TestAdmissionControllerWithACService(TestAdmissionController):
    client1.close()
    client2.close()

+  @SkipIfNotHdfsMinicluster.tuned_for_minicluster
+  @pytest.mark.execute_serially
+  @CustomClusterTestSuite.with_args(
+    impalad_args="--vmodule admission-controller=3 --mem_limit=10MB ")
+  def test_admission_service_low_mem_limit(self):
+    EXPECTED_REASON = "Admission rejected due to memory pressure"
+    # Test whether it will fail for a normal query.
+    failed_query_handle = self.client.execute_async(
+            "select * from functional_parquet.alltypes limit 100")
+    self.client.wait_for_impala_state(failed_query_handle, ERROR, 20)
+    profile = self.client.get_runtime_profile(failed_query_handle)
+    assert EXPECTED_REASON in profile, \
+      "Expected reason '{0}' not found in profile: {1}".format(EXPECTED_REASON, profile)
+    self.client.close_query(failed_query_handle)
+    # Test it should pass all the trivial queries.
+    self._test_trivial_queries_suc()
+
  @SkipIfNotHdfsMinicluster.tuned_for_minicluster
  @pytest.mark.execute_serially
  def test_retained_removed_coords_size(self):