impala/tests/query_test/test_lifecycle.py
Henry Robinson 89c3c61a92 IMPALA-1599: Make Prepare() async for plan fragments
When codegen is enabled, Prepare() for plan fragments can be relatively
expensive (the cost comes mostly from preparing and optimizing the LLVM
bitcode module). However, we call it on the critical path for query
setup, during the ExecRemoteFragment() RPC issued by the
coordinator. This can lead to high startup latencies for queries,
particularly those with many fragments.

This patch moves Prepare() to the fragment exec thread, which allows the
coordinator to start many fragments in quick succession. Doing so
complicates cancellation: previously, cancellation could not occur
during Prepare(), because the coordinator waited for all
ExecRemoteFragment() RPCs to finish before issuing any cancellation.

To address the new complexity, this patch loosens the existing contract
for concurrent calls to PlanFragmentExecutor::Prepare() and
Cancel(). Previously, Cancel() could not be called until Prepare() had
returned. Now the two may be called in any order, including
concurrently, and they coordinate to ensure the correct execution
order: Cancel() always blocks until Prepare() has finished, unless
Cancel() was called strictly before Prepare() started.
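
This contract can be modeled with a condition variable. Below is a
minimal Python sketch of the coordination; Impala's real implementation
is C++ inside PlanFragmentExecutor, and every name here is illustrative,
not the actual API:

import threading

class ExecutorSketch(object):
  """Illustrative model of the relaxed Prepare()/Cancel() contract;
  not Impala's actual code."""

  def __init__(self):
    self._cv = threading.Condition()
    self._prepare_started = False
    self._prepare_done = False
    self._cancelled = False

  def prepare(self):
    with self._cv:
      if self._cancelled:
        # Cancel() ran strictly before Prepare(): skip the expensive work.
        return False
      self._prepare_started = True
    # The expensive step (codegen in Impala) runs outside the lock, so a
    # concurrent Cancel() can still record its request meanwhile.
    pass  # placeholder for preparing/optimizing the LLVM module
    with self._cv:
      self._prepare_done = True
      self._cv.notify_all()
      return not self._cancelled

  def cancel(self):
    with self._cv:
      self._cancelled = True
      # Block until a concurrently-running Prepare() finishes. If Prepare()
      # has not started, return at once; Prepare() will observe the flag.
      while self._prepare_started and not self._prepare_done:
        self._cv.wait()

If cancel() runs strictly first, a later prepare() returns False without
doing the expensive work; in every other interleaving, cancel() waits
for prepare() to finish before returning.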

Making this change allows us to rework the previous logic, under which
Prepare() always had to be called before a fragment was registered (so
that any concurrent Cancel() call could not 'find' the fragment). The
order is now register -> prepare -> exec -> unregister, and cancellation
may occur at any time after ExecPlanFragment() returns.
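
In the same illustrative Python, the new per-fragment flow on the exec
thread looks roughly like this; the registry and method names are
hypothetical, not Impala's API:

class _StubExecutor(object):
  """Stand-in with the same interface as the sketch above."""
  def prepare(self):
    return True  # pretend Prepare() succeeded and no cancel arrived
  def exec_fragment(self):
    pass         # pretend to run the fragment to completion

def run_fragment(registry, fragment_id, executor):
  # Register first: as soon as ExecPlanFragment() returns, a concurrent
  # CancelPlanFragment() can find the fragment and call cancel() on it.
  registry[fragment_id] = executor
  try:
    # Prepare() now happens here, off the RPC's critical path.
    if executor.prepare():
      executor.exec_fragment()
  finally:
    registry.pop(fragment_id, None)  # unregister last

registry = {}
run_fragment(registry, 'fragment-1', _StubExecutor())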

Change-Id: Ie39737dc419d7708dd881e68d1035e05d3256d19
Reviewed-on: http://gerrit.cloudera.org:8080/539
Reviewed-by: Henry Robinson <henry@cloudera.com>
Tested-by: Internal Jenkins
2016-02-02 20:10:02 +00:00

66 lines
2.5 KiB
Python

# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.impala_cluster import ImpalaCluster
from tests.verifiers.metric_verifier import MetricVerifier


class TestFragmentLifecycle(ImpalaTestSuite):
  """Using the debug action interface, check that failed queries correctly clean up
  *all* fragments."""

  @classmethod
  def get_workload(cls):
    return 'functional'

  @pytest.mark.execute_serially
  def test_failure_in_prepare(self):
    # Fail the scan node (node id 0) during its PREPARE phase on all backends.
    self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'")
    try:
      self.client.execute("SELECT COUNT(*) FROM functional.alltypes")
      assert False, "Query should have thrown an error"
    except ImpalaBeeswaxException:
      pass

    # Every fragment should eventually be torn down on every impalad.
    verifiers = [MetricVerifier(i.service) for i in ImpalaCluster().impalads]
    for v in verifiers:
      v.wait_for_metric("impala-server.num-fragments-in-flight", 0)

  @pytest.mark.execute_serially
  def test_failure_in_prepare_multi_fragment(self):
    # Test that if one fragment fails, the others are cleaned up during the ensuing
    # cancellation.

    # Fail the scan node (node id 0) during its PREPARE phase on all backends.
    self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'")
    # Force a query plan that will have three fragments or more.
    try:
      self.client.execute("SELECT COUNT(*) FROM functional.alltypes a JOIN [SHUFFLE] "
                          "functional.alltypes b ON a.id = b.id")
      assert False, "Query should have thrown an error"
    except ImpalaBeeswaxException:
      pass

    verifiers = [MetricVerifier(i.service) for i in ImpalaCluster().impalads]
    for v in verifiers:
      # A long timeout is required because fragments may be blocked while sending
      # data; the default is 60s before they wake up and cancel themselves.
      #
      # TODO: Fix when we have cancellable RPCs.
      v.wait_for_metric("impala-server.num-fragments-in-flight", 0, timeout=65)