mirror of
https://github.com/apache/impala.git
synced 2025-12-20 18:37:21 -05:00
This adds trace events for data stream RPCs and
dumps them when they take longer than
--impala_slow_rpc_threshold_ms.
I needed to modify the KRPC code to do this because it
currently only dumps traces for RPCs with deadlines.
I plan to add some version of this upstream in Kudu
so that we don't diverge our KRPC implementation.
Example output from test_exchange_small_buffer:
I1111 08:38:53.732910 26509 rpcz_store.cc:265] Call impala.DataStreamService.TransmitData from 127.0.0.1:42434 (request call id 43) took 7799ms. Request Metrics: {}
I1111 08:38:53.732928 26509 rpcz_store.cc:269] Trace:
1111 08:38:45.933412 (+ 0us) impala-service-pool.cc:167] Inserting onto call queue
1111 08:38:45.933449 (+ 37us) impala-service-pool.cc:254] Handling call
1111 08:38:45.933470 (+ 21us) krpc-data-stream-mgr.cc:227] Added early sender
1111 08:38:47.906542 (+1973072us) krpc-data-stream-recvr.cc:327] Enqueuing deferred RPC
1111 08:38:53.732858 (+5826316us) krpc-data-stream-recvr.cc:506] Processing deferred RPC
1111 08:38:53.732860 (+ 2us) krpc-data-stream-recvr.cc:399] Deserializing batch
1111 08:38:53.732888 (+ 28us) krpc-data-stream-recvr.cc:426] Enqueuing deserialized batch
1111 08:38:53.732895 (+ 7us) inbound_call.cc:162] Queueing success response
Disabled +-clang-diagnostic-gnu-zero-variadic-macro-arguments because it
had false positives on the TRACE_TO invocations.
Testing:
* Ran exhaustive and ASAN tests
* Ran stress test
Change-Id: Ic7af4b45c43ec731d742d3696112c5f800849947
Reviewed-on: http://gerrit.cloudera.org:8080/14668
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
82 lines
3.6 KiB
Python
82 lines
3.6 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
|
from tests.common.environ import build_flavor_timeout
|
|
from tests.common.skip import SkipIfBuildType
|
|
from tests.util.filesystem_utils import IS_S3, IS_ADLS, IS_ISILON
|
|
|
|
# IMPALA-6100: slow build types get an additional margin for error, so they use this
# larger value (same unit as DELAY_MS, i.e. milliseconds).
SLOW_BUILD_TIMEOUT = 20000

# Delay used for --stress_datastream_recvr_delay_ms in the tests below.
# IMPALA-6381: Isilon can behave as a slow build, so it always gets the slow-build
# value; every other target uses whatever build_flavor_timeout() returns.
_flavor_delay_ms = build_flavor_timeout(10000, slow_build_timeout=SLOW_BUILD_TIMEOUT)
DELAY_MS = SLOW_BUILD_TIMEOUT if IS_ISILON else _flavor_delay_ms
|
|
|
|
@SkipIfBuildType.not_dev_build
class TestExchangeDelays(CustomClusterTestSuite):
  """Tests for handling delays in finding data stream receivers.

  Every test hands a flag string to CustomClusterTestSuite.with_args in which
  --stress_datastream_recvr_delay_ms is taken from DELAY_MS, while
  --datastream_sender_timeout_ms differs from test to test. All tests also pass
  --impala_slow_rpc_threshold_ms=500.
  NOTE(review): the 500ms threshold is presumably there so that slow data stream RPCs
  get logged while these tests run -- confirm against the flag's definition.
  """

  @classmethod
  def get_workload(cls):
    """Return the name of the workload this suite runs against."""
    return 'functional-query'

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      "--stress_datastream_recvr_delay_ms={0}".format(DELAY_MS)
      + " --datastream_sender_timeout_ms=5000"
      + " --impala_slow_rpc_threshold_ms=500")
  def test_exchange_small_delay(self, vector):
    """Test delays in registering data stream receivers where the first one or two
    batches will time out before the receiver registers, but subsequent batches will
    arrive after the receiver registers. Before IMPALA-2987, this scenario resulted in
    incorrect results.
    """
    self.run_test_case('QueryTest/exchange-delays', vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      "--stress_datastream_recvr_delay_ms={0}".format(DELAY_MS)
      + " --datastream_sender_timeout_ms=1"
      + " --impala_slow_rpc_threshold_ms=500")
  def test_exchange_large_delay(self, vector):
    """Test delays in registering data stream receivers where all of the batches sent
    will time out before the receiver registers. Before IMPALA-2987, this scenario
    resulted in the query hanging.
    """
    self.run_test_case('QueryTest/exchange-delays', vector)

  # The SQL used for test_exchange_large_delay_zero_rows requires that the scan complete
  # before the fragment sends the EOS message. A slow scan can cause this test to fail,
  # because the receivers could be set up before the fragment starts sending (and thus
  # can't time out). Use a longer delay for platforms that have slow scans:
  # IMPALA-6811: S3/ADLS have slow scans.
  # IMPALA-6866: Isilon has slow scans (and is counted as a slow build above).
  #
  # NOTE: these statements execute in the class body, after the decorators of the two
  # tests above have already been evaluated, so only the test below sees the extra
  # delay. The augmented assignment reads the module-level DELAY_MS and binds the sum
  # in the class namespace; the module-level value itself is left unchanged.
  SLOW_SCAN_EXTRA_DELAY_MS = 10000
  if IS_S3 or IS_ADLS or IS_ISILON:
    DELAY_MS += SLOW_SCAN_EXTRA_DELAY_MS

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      "--stress_datastream_recvr_delay_ms={0}".format(DELAY_MS)
      + " --datastream_sender_timeout_ms=1"
      + " --impala_slow_rpc_threshold_ms=500")
  def test_exchange_large_delay_zero_rows(self, vector):
    """Test the special case when no batches are sent and the EOS message times out."""
    self.run_test_case('QueryTest/exchange-delays-zero-rows', vector)
|