IMPALA-8924, IMPALA-8934: Result spooling failpoint tests, fix DCHECKs

Adds several "failpoint" tests to test_result_spooling.py. These tests use debug_actions spread throughout buffered-plan-root-sink.cc to trigger failures while result spooling is running. The tests validate that all queries gracefully fail and do not cause any impalad crashes. Fixed a few bugs that came up when adding these tests, as well as the crash reported in IMPALA-8924 (which is now covered by the failpoint tests added in this patch). The first bug fixed was a DCHECK in SpillableRowBatchQueue::IsEmpty() where the method was being called after the queue had been closed. The fix is to only call IsEmpty() if IsOpen() returns true. The second bug was an issue in the cancellation path where BufferedPlanRootSink::GetNext would enter an infinite loop if the query was cancelled and then GetNext was called. The fix is to check the cancellation state in the outer while loop. Testing: * Added new tests to test_result_spooling.py * Ran core tests Change-Id: Ib96f797bc8a5ba8baf9fb28abd1f645345bbe932 Reviewed-on: http://gerrit.cloudera.org:8080/14214 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-25 02:03:09 -05:00 · 2019-09-10 13:31:01 -07:00
parent bca1b43efb
commit 391942d79d
5 changed files with 137 additions and 14 deletions
--- a/tests/query_test/test_result_spooling.py
+++ b/tests/query_test/test_result_spooling.py
@@ -25,6 +25,7 @@ from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_dimensions import create_exec_option_dimension
 from tests.common.test_vector import ImpalaTestDimension
 from tests.util.cancel_util import cancel_query_and_validate_state
+from tests.util.failpoints_util import execute_query_expect_debug_action_failure


 class TestResultSpooling(ImpalaTestSuite):
@@ -179,6 +180,19 @@ class TestResultSpooling(ImpalaTestSuite):
    finally:
      self.client.close_query(handle)

+  def test_exec_tree_failpoint(self, vector):
+    """Inject a failure during exec tree execution. The GETNEXT:DELAY is necessary to
+    ensure the client issues a fetch request before the MEM_LIMIT_EXCEEDED exception is
+    thrown. Unlike the tests in TestResultSpoolingFailpoints, this test injects a
+    failure during exec tree execution, rather than in the result spooling code."""
+    vector.get_value('exec_option')['batch_size'] = 10
+    vector.get_value('exec_option')['debug_action'] = \
+        '4:GETNEXT:MEM_LIMIT_EXCEEDED|0:GETNEXT:DELAY'
+    vector.get_value('exec_option')['spool_query_results'] = 'true'
+    query = "select 1 from functional.alltypessmall a join functional.alltypessmall b " \
+        "on a.id = b.id"
+    execute_query_expect_debug_action_failure(self, query, vector)
+
  def __validate_query(self, query, exec_options):
    """Compares the results of the given query with and without result spooling
    enabled."""
@@ -344,3 +358,47 @@ class TestResultSpoolingCancellation(ImpalaTestSuite):
          "Unexpected status code from cancel request: {0}".format(cancel_result)
    finally:
      if handle: self.client.close_query(handle)
+
+
+class TestResultSpoolingFailpoints(ImpalaTestSuite):
+  """Test result spooling failure handling. Uses debug actions to inject failures at
+  various points of result spooling execution (e.g. when results are actually getting
+  spooled)."""
+
+  _debug_actions = [
+      # Inject a failure in BufferedPlanRootSink::Open.
+      'BPRS_BEFORE_OPEN:FAIL',
+      # Inject a failure immediately before BufferedPlanRootSink::Send adds a batch to
+      # the queue. NOTE(review): with probability 1.0 the failure presumably fires on
+      # the first RowBatch, not a random one -- lower it if randomness is intended.
+      'BPRS_BEFORE_ADD_BATCH:FAIL@1.0',
+      # Inject a failure in BufferedPlanRootSink::FlushFinal.
+      'BPRS_BEFORE_FLUSH_FINAL:FAIL',
+      # Inject a failure immediately before BufferedPlanRootSink::GetNext reads a
+      # batch from the queue. NOTE(review): with probability 1.0 the failure presumably
+      # fires on the first RowBatch read, not a random one.
+      'BPRS_BEFORE_GET_BATCH:FAIL@1.0']
+
+  _query = "select * from functional.alltypes"
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestResultSpoolingFailpoints, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('debug_action',
+        *cls._debug_actions))
+
+    # Result spooling should be independent of file format, so only testing for
+    # table_format=parquet/none in order to avoid a test dimension explosion.
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
+        v.get_value('table_format').file_format == 'parquet' and
+        v.get_value('table_format').compression_codec == 'none')
+
+  def test_failpoints(self, vector):
+    vector.get_value('exec_option')['batch_size'] = 10
+    vector.get_value('exec_option')['debug_action'] = vector.get_value('debug_action')
+    vector.get_value('exec_option')['spool_query_results'] = 'true'
+    execute_query_expect_debug_action_failure(self, self._query, vector)