mirror of
https://github.com/apache/impala.git
synced 2026-01-04 18:00:57 -05:00
This patch is a partial fix for the issue where an finst would not detect that it should cancel if the query limit had not been hit. It changes the UpdateExecStatus() RPC to return a cancelled status to an finst if the query has finished because it hit a limit. For certain queries, this allows them to finish much more quickly than they otherwise would. However, there's still a few-second delay for the finst to pick up the cancellation signal, because the UpdateExecStatus() RPC is only called every few seconds. A complete fix would also call CancelInternal() when returned_all_results_ was set to true. That would be a much larger change. The improvement here is to bound the delay between query completion and fragment teardown to a few seconds. Change-Id: I59f45e64978c9ab9914b5c33e86009960b4a88c4 Reviewed-on: http://gerrit.cloudera.org:8080/5987 Tested-by: Impala Public Jenkins Reviewed-by: Henry Robinson <henry@cloudera.com>
95 lines
4.2 KiB
Python
95 lines
4.2 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
import time
|
|
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.impala_cluster import ImpalaCluster
|
|
from tests.verifiers.metric_verifier import MetricVerifier
|
|
|
|
# TODO: Debug actions leak into other tests in the same suite (if not explicitly
|
|
# unset). Ensure they get unset between tests.
|
|
class TestFragmentLifecycleWithDebugActions(ImpalaTestSuite):
    """Using the debug action interface, check that failed queries correctly clean
    up *all* fragments.

    Each test sets the DEBUG_ACTION query option to inject a failure, runs a query
    that is expected to raise ImpalaBeeswaxException, and then waits for the
    in-flight fragment metric to reach 0 on every impalad of the cluster.
    """

    # Name of the metric that the tests wait on.
    IN_FLIGHT_FRAGMENTS = "impala-server.num-fragments-in-flight"

    @classmethod
    def get_workload(cls):
        """Return the name of the workload used by this suite."""
        return 'functional'

    @pytest.mark.execute_serially
    def test_failure_in_prepare(self):
        """A query whose scan node fails in PREPARE must leave no fragments behind."""
        # Build the verifiers before running the query, as the original test did.
        verifiers = [MetricVerifier(i.service) for i in ImpalaCluster().impalads]
        # Fail the scan node
        self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'")
        try:
            self.client.execute("SELECT COUNT(*) FROM functional.alltypes")
            # 'assert "<message>"' is always true; the condition must be an explicit
            # False for the test to fail when the query unexpectedly succeeds.
            assert False, "Query should have thrown an error"
        except ImpalaBeeswaxException:
            pass

        for v in verifiers:
            v.wait_for_metric(self.IN_FLIGHT_FRAGMENTS, 0)

    @pytest.mark.execute_serially
    def test_failure_in_prepare_multi_fragment(self):
        """If one fragment fails, the others must be cleaned up by the ensuing
        cancellation."""
        verifiers = [MetricVerifier(i.service) for i in ImpalaCluster().impalads]
        # Fail the scan node
        self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'")

        # Force a query plan that will have three fragments or more.
        try:
            # Implicit string concatenation keeps source indentation out of the SQL
            # text (a backslash continuation inside the literal would embed it).
            self.client.execute(
                "SELECT COUNT(*) FROM functional.alltypes a JOIN [SHUFFLE] "
                "functional.alltypes b on a.id = b.id")
            assert False, "Query should have thrown an error"
        except ImpalaBeeswaxException:
            pass

        for v in verifiers:
            # Long timeout required because fragments may be blocked while sending
            # data. The default value of --datastream_sender_timeout_ms is 120s
            # before they wake up and cancel themselves.
            #
            # TODO: Fix when we have cancellable RPCs.
            v.wait_for_metric(self.IN_FLIGHT_FRAGMENTS, 0, timeout=125)
|
|
|
|
class TestFragmentLifecycle(ImpalaTestSuite):
    """Fragment lifecycle tests that do not rely on debug actions."""

    def test_finst_cancel_when_query_complete(self):
        """Regression test for IMPALA-4295: if a query returns all its rows before
        all its finsts have completed, it should cancel the finsts and complete
        promptly."""
        start = time.time()

        # Query designed to produce 1024 (the limit) rows very quickly from the
        # first union child, but the second one takes a very long time to complete.
        # Without fix for IMPALA-4295, the whole query waits for the second child to
        # complete.

        # Due to IMPALA-5671, the limit must be a multiple of the row batch size - if
        # it's reached during production of a row batch, processing moves to the
        # second child, and the query will take a long time to complete.
        self.client.execute(
            "with l as (select 1 from functional.alltypes), r as"
            " (select count(*) from tpch_parquet.lineitem a cross join tpch_parquet.lineitem b)"
            "select * from l union all (select * from r) LIMIT 1024")
        # Compute the elapsed time once so the assertion message can report it.
        duration = time.time() - start

        # Query typically completes in < 2s, but if cross join is fully evaluated,
        # will take > 10 minutes. Pick 2 minutes as a reasonable midpoint to avoid
        # false negatives.
        assert duration < 120, \
            "Query took too long to complete: {0}s".format(duration)
|