mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-14493: Cap memory usage of global admission service
The global admission service can experience OOM errors under high concurrency because its process memory tracker is inaccurate and doesn't account for all memory allocations. Ensuring memory tracker accurately accounts for every allocation could be difficult, this patch uses a simpler solution to introduce a hard memory cap using tcmalloc statistics, which accurately reflect the true process memory usage. If a new query is submitted while tcmalloc memory usage is over the process limit, the query will be rejected immediately to protect from OOM. Adds a new flag enable_admission_service_mem_safeguard allowing this feature to be enabled or disabled. By default, this feature is turned on Tests: Added test test_admission_service_low_mem_limit. Passed exhaustive tests. Change-Id: I2ee2c942a73fcd69358851fc2fdc0fc4fe531c73 Reviewed-on: http://gerrit.cloudera.org:8080/23542 Reviewed-by: Abhishek Rawat <arawat@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
@@ -1100,7 +1100,8 @@ class TestAdmissionController(TestAdmissionControllerBase):
|
||||
@CustomClusterTestSuite.with_args(
|
||||
impalad_args=impalad_admission_ctrl_flags(max_requests=10, max_queued=10,
|
||||
pool_max_mem=10 * 1024 * 1024, proc_mem_limit=2 * 1024 * 1024,
|
||||
queue_wait_timeout_ms=1000),
|
||||
queue_wait_timeout_ms=1000)
|
||||
+ " --enable_admission_service_mem_safeguard=false",
|
||||
statestored_args=_STATESTORED_ARGS)
|
||||
def test_timeout_reason_host_memory(self):
|
||||
self.client.set_configuration_option('enable_trivial_query_for_admission', 'false')
|
||||
@@ -1134,7 +1135,8 @@ class TestAdmissionController(TestAdmissionControllerBase):
|
||||
@CustomClusterTestSuite.with_args(
|
||||
impalad_args=impalad_admission_ctrl_flags(max_requests=10, max_queued=10,
|
||||
pool_max_mem=2 * 1024 * 1024, proc_mem_limit=20 * 1024 * 1024,
|
||||
queue_wait_timeout_ms=1000),
|
||||
queue_wait_timeout_ms=1000)
|
||||
+ " --enable_admission_service_mem_safeguard=false",
|
||||
statestored_args=_STATESTORED_ARGS)
|
||||
def test_timeout_reason_pool_memory(self):
|
||||
self.client.set_configuration_option('enable_trivial_query_for_admission', 'false')
|
||||
@@ -2297,6 +2299,23 @@ class TestAdmissionControllerWithACService(TestAdmissionController):
|
||||
client1.close()
|
||||
client2.close()
|
||||
|
||||
@SkipIfNotHdfsMinicluster.tuned_for_minicluster
|
||||
@pytest.mark.execute_serially
|
||||
@CustomClusterTestSuite.with_args(
|
||||
impalad_args="--vmodule admission-controller=3 --mem_limit=10MB ")
|
||||
def test_admission_service_low_mem_limit(self):
|
||||
EXPECTED_REASON = "Admission rejected due to memory pressure"
|
||||
# Test whether it will fail for a normal query.
|
||||
failed_query_handle = self.client.execute_async(
|
||||
"select * from functional_parquet.alltypes limit 100")
|
||||
self.client.wait_for_impala_state(failed_query_handle, ERROR, 20)
|
||||
profile = self.client.get_runtime_profile(failed_query_handle)
|
||||
assert EXPECTED_REASON in profile, \
|
||||
"Expected reason '{0}' not found in profile: {1}".format(EXPECTED_REASON, profile)
|
||||
self.client.close_query(failed_query_handle)
|
||||
# Test it should pass all the trivial queries.
|
||||
self._test_trivial_queries_suc()
|
||||
|
||||
@SkipIfNotHdfsMinicluster.tuned_for_minicluster
|
||||
@pytest.mark.execute_serially
|
||||
def test_retained_removed_coords_size(self):
|
||||
|
||||
Reference in New Issue
Block a user