IMPALA-10397: Reduce flakiness in test_single_workload

This test failed recently due to a timeout while waiting for executors to
come up. The logs showed that the executors came up on time, but the
coordinator did not recognize them. This patch attempts to reduce the
flakiness by increasing the timeout and by adding more logging so that a
future occurrence is easier to diagnose.
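For context, the waits in this test go through the test service's
wait_for_metric_value(), which polls a metric until it reaches a target
value or the timeout expires. A minimal sketch of that polling pattern
(a hypothetical standalone helper, not the actual implementation in
Impala's test infrastructure) shows why per-iteration logging makes this
kind of timeout diagnosable:

  import logging
  import time

  LOG = logging.getLogger("test_auto_scaling")

  def wait_for_metric_value(get_metric, expected, timeout=60, interval=1):
    # Poll until the metric reaches 'expected' or 'timeout' seconds pass.
    # Logging every observation lets us tell "executors never came up"
    # apart from "they came up but the coordinator never saw them".
    deadline = time.time() + timeout
    while time.time() < deadline:
      value = get_metric()
      LOG.info("Waiting for metric to reach %s, currently at %s",
               expected, value)
      if value == expected:
        return
      time.sleep(interval)
    raise AssertionError(
        "Metric did not reach %s within %s seconds" % (expected, timeout))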

Testing:
Ran the test in a loop locally for a few hours.

Change-Id: I73ea5eb663db6d03832b19ed323670590946f514
Reviewed-on: http://gerrit.cloudera.org:8080/17028
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Author: Bikramjeet Vig
Date: 2021-02-05 13:40:07 -08:00
Committer: Impala Public Jenkins
Parent: 1f7b413d11
Commit: f888d36295

@@ -27,7 +27,8 @@ from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.skip import SkipIfEC
 
 LOG = logging.getLogger("test_auto_scaling")
+TOTAL_BACKENDS_METRIC_NAME = "cluster-membership.backends.total"
 
 
 class TestAutoScaling(CustomClusterTestSuite):
   @classmethod
@@ -43,7 +43,7 @@ class TestAutoScaling(CustomClusterTestSuite):
   """This class contains tests that exercise the logic related to scaling clusters up and
   down by adding and removing groups of executors."""
   INITIAL_STARTUP_TIME_S = 10
-  STATE_CHANGE_TIMEOUT_S = 45
+  STATE_CHANGE_TIMEOUT_S = 60
   # This query will scan two partitions (month = 1, 2) and thus will have 1 fragment
   # instance per executor on groups of size 2. Each partition has 2 rows, so it performs
   # two comparisons and should take around 1 second to complete.
@@ -51,13 +51,20 @@ class TestAutoScaling(CustomClusterTestSuite):
     and id + random() < sleep(500)"""
 
   def _get_total_admitted_queries(self):
-    return self.impalad_test_service.get_total_admitted_queries("default-pool")
+    admitted_queries = self.impalad_test_service.get_total_admitted_queries(
+        "default-pool")
+    LOG.info("Current total admitted queries: %s", admitted_queries)
+    return admitted_queries
 
   def _get_num_backends(self):
-    return self.impalad_test_service.get_metric_value("cluster-membership.backends.total")
+    metric_val = self.impalad_test_service.get_metric_value(TOTAL_BACKENDS_METRIC_NAME)
+    LOG.info("Getting metric %s : %s", TOTAL_BACKENDS_METRIC_NAME, metric_val)
+    return metric_val
 
   def _get_num_running_queries(self):
-    return self.impalad_test_service.get_num_running_queries("default-pool")
+    running_queries = self.impalad_test_service.get_num_running_queries("default-pool")
+    LOG.info("Current running queries: %s", running_queries)
+    return running_queries
 
   @SkipIfEC.fix_later
   def test_single_workload(self):
@@ -124,7 +131,7 @@ class TestAutoScaling(CustomClusterTestSuite):
 
     # Wait for workers to spin down
     self.impalad_test_service.wait_for_metric_value(
-      "cluster-membership.backends.total", 1,
+      TOTAL_BACKENDS_METRIC_NAME, 1,
       timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)
     assert self.impalad_test_service.get_metric_value(
       "cluster-membership.executor-groups.total") == 0
@@ -155,7 +162,7 @@ class TestAutoScaling(CustomClusterTestSuite):
     # Wait for workers to spin up
     cluster_size = GROUP_SIZE + 1  # +1 to include coordinator.
     self.impalad_test_service.wait_for_metric_value(
-      "cluster-membership.backends.total", cluster_size,
+      TOTAL_BACKENDS_METRIC_NAME, cluster_size,
       timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)
 
     # Wait until we admitted at least 10 queries
@@ -184,7 +191,7 @@ class TestAutoScaling(CustomClusterTestSuite):
 
     # Wait for workers to spin down
     self.impalad_test_service.wait_for_metric_value(
-      "cluster-membership.backends.total", 1,
+      TOTAL_BACKENDS_METRIC_NAME, 1,
       timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)
     assert self.impalad_test_service.get_metric_value(
       "cluster-membership.executor-groups.total") == 0
@@ -242,7 +249,7 @@ class TestAutoScaling(CustomClusterTestSuite):
 
     # Wait for workers to spin down
    self.impalad_test_service.wait_for_metric_value(
-      "cluster-membership.backends.total", 1,
+      TOTAL_BACKENDS_METRIC_NAME, 1,
       timeout=self.STATE_CHANGE_TIMEOUT_S, interval=1)
     assert self.impalad_test_service.get_metric_value(
       "cluster-membership.executor-groups.total") == 0