mirror of
https://github.com/apache/impala.git
synced 2026-01-17 03:00:37 -05:00
IMPALA-7312 added the query option FETCH_ROWS_TIMEOUT_MS, but it only applies to fetch requests against a query that has already transitioned to the 'FINISHED' state. This patch changes the timeout so that it applies to queries in the 'RUNNING' state as well.

Before this patch, fetch requests issued while a query was 'RUNNING' blocked until the query transitioned to the 'FINISHED' state, and then fetched the results and returned them. After this patch, fetch requests against queries in the 'RUNNING' state will block for 'FETCH_ROWS_TIMEOUT_MS' and then return.

For HS2 clients, fetch requests that return while a query is 'RUNNING' set their TStatusCode to STILL_EXECUTING_STATUS. For Beeswax clients, fetch requests that return while a query is 'RUNNING' set the 'ready' flag to false. For both clients, hasMoreRows is set to true.

If the following sequence of events occurs:
* A fetch request is issued and blocks on a 'RUNNING' query
* The query transitions to the 'FINISHED' state
* The fetch request attempts to read multiple batches

then the time spent waiting for the query to finish is deducted from the timeout used when waiting for rows to be produced by the Coordinator fragment.

Fixed a bug in the current usage of FETCH_ROWS_TIMEOUT_MS where the time units for FETCH_ROWS_TIMEOUT_MS and MonotonicStopWatch were not being converted properly.

Tests:
* Moved existing fetch timeout tests from hs2/test_fetch.py into a new test file hs2/test_fetch_timeout.py.
* Added several new tests to hs2/test_fetch_timeout.py to validate that the timeout is applied to 'RUNNING' queries and that the timeout applies across a 'RUNNING' and 'FINISHED' query.
* Added new tests to query_test/test_fetch.py to validate the timeout while using the Beeswax protocol.
Change-Id: I2cba6bf062dcc1af19471d21857caa797c1ea4a4 Reviewed-on: http://gerrit.cloudera.org:8080/14332 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
149 lines
6.8 KiB
Python
149 lines
6.8 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import re
|
|
|
|
from time import sleep
|
|
from tests.common.impala_test_suite import ImpalaTestSuite
|
|
from tests.common.test_dimensions import extend_exec_option_dimension
|
|
from tests.util.parse_util import parse_duration_string_ms
|
|
|
|
|
|
class TestFetch(ImpalaTestSuite):
  """Fetch tests whose outcome does not depend on whether result spooling is on."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Build the default test matrix, then keep only the Parquet table format."""
    super(TestFetch, cls).add_test_dimensions()

    def is_parquet(test_vector):
      return test_vector.get_value('table_format').file_format == 'parquet'

    # Fetching results should not depend on the file format, so a single format
    # (Parquet) is enough coverage.
    cls.ImpalaTestMatrix.add_constraint(is_parquet)

  def test_rows_sent_counters(self, vector):
    """Check the fetch-related entries of the runtime profile: ClientFetchWaitTimer,
    NumRowsFetched, RowMaterializationRate and RowMaterializationTimer (ImpalaServer
    section) must all hold plausible values after the results have been fetched."""
    expected_rows = 25
    sql = "select sleep(100) from functional.alltypes limit {0}".format(expected_rows)
    query_handle = self.execute_query_async(sql, vector.get_value('exec_option'))
    try:
      # Block until the query reports 'FINISHED', i.e. results can be fetched.
      # NOTE(review): the trailing 30 is presumably a timeout in seconds - confirm.
      finished_state = self.client.QUERY_STATES['FINISHED']
      self.wait_for_state(query_handle, finished_state, 30)
      # Idle for 2.5 seconds before fetching so the ClientFetchWaitTimer ends up >= 1s.
      sleep(2.5)
      # The fetch is what updates the fetch related counters.
      fetch_result = self.client.fetch(sql, query_handle)
      assert fetch_result.success

      profile = self.client.get_runtime_profile(query_handle)

      fetch_wait = re.search("ClientFetchWaitTimer: (.*)", profile)
      assert fetch_wait
      assert len(fetch_wait.groups()) == 1
      assert parse_duration_string_ms(fetch_wait.group(1)) > 1000

      assert "NumRowsFetched: {0} ({0})".format(expected_rows) in profile
      assert re.search("RowMaterializationRate: [1-9]", profile)

      # Every row calls sleep(100), so materializing all of them should take at
      # least 1s in total.
      materialization = re.search("RowMaterializationTimer: (.*)", profile)
      assert materialization
      assert len(materialization.groups()) == 1
      assert parse_duration_string_ms(materialization.group(1)) > 1000
    finally:
      # Always release the query, even when an assertion above failed.
      self.client.close_query(query_handle)
|
|
|
|
|
|
class TestFetchAndSpooling(ImpalaTestSuite):
  """Tests that apply when result spooling is enabled or disabled."""

  @classmethod
  def add_test_dimensions(cls):
    """Constrain the test matrix to Parquet and extend the exec option dimension with
    'spool_query_results' set to 'true'."""
    super(TestFetchAndSpooling, cls).add_test_dimensions()
    # Result fetching should be independent of file format, so only test against
    # Parquet files.
    cls.ImpalaTestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'parquet')
    extend_exec_option_dimension(cls, 'spool_query_results', 'true')

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'functional-query'

  def test_rows_sent_counters(self, vector):
    """Validate that RowsSent and RowsSentRate are set to valid values in
    the PLAN_ROOT_SINK section of the runtime profile."""
    num_rows = 10
    exec_options = vector.get_value('exec_option')
    # The option value is a string (add_test_dimensions() registers it as 'true'), so
    # it has to be compared textually: bool() of any non-empty string, including
    # 'false' or '0', is True and would pick the spooling debug action by mistake.
    # NOTE(review): '1' is accepted on the assumption that the server also treats it
    # as true for boolean query options - confirm.
    spooling_enabled = (
        'spool_query_results' in exec_options and
        str(exec_options['spool_query_results']).strip().lower() in ('true', '1'))
    # A different debug action is used depending on whether spooling is enabled.
    # NOTE(review): presumably each one injects a 1000 ms sleep at the named point.
    if spooling_enabled:
      exec_options['debug_action'] = "BPRS_BEFORE_ADD_BATCH:SLEEP@1000"
    else:
      exec_options['debug_action'] = "BPRS_BEFORE_ADD_ROWS:SLEEP@1000"
    result = self.execute_query(
        "select id from functional.alltypes limit {0}".format(num_rows), exec_options)
    assert "RowsSent: {0} ({0})".format(num_rows) in result.runtime_profile
    # Raw string: '\d' and '\.' are regex escapes, not valid Python string escapes.
    rows_sent_rate = re.search(r"RowsSentRate: (\d*\.?\d*)", result.runtime_profile)
    assert rows_sent_rate
    assert float(rows_sent_rate.group(1)) > 0
|
|
|
|
|
|
class TestFetchTimeout(ImpalaTestSuite):
  """Basic, protocol-agnostic coverage for FETCH_ROWS_TIMEOUT_MS (the HS2-specific
  tests live in tests/hs2/test_fetch_timeout.py). They are needed because part of the
  FETCH_ROWS_TIMEOUT_MS code is HS2/Beeswax specific. In contrast to the tests in
  hs2/test_fetch_timeout.py, nothing here checks that an individual RPC call times
  out; each test only sets a low timeout and asserts that the query still works
  end-to-end."""

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests run against."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Build the default test matrix, keep only Parquet, and extend the exec option
    dimension with 'spool_query_results' set to 'true'."""
    super(TestFetchTimeout, cls).add_test_dimensions()

    def is_parquet(test_vector):
      return test_vector.get_value('table_format').file_format == 'parquet'

    # Fetching results should not depend on the file format, so a single format
    # (Parquet) is enough coverage.
    cls.ImpalaTestMatrix.add_constraint(is_parquet)
    extend_exec_option_dimension(cls, 'spool_query_results', 'true')

  def test_fetch_timeout(self, vector):
    """Run a query with a low fetch timeout while RowBatch production is delayed, and
    assert that it nevertheless succeeds and returns the expected number of rows."""
    expected_rows = 100
    exec_options = vector.get_value('exec_option')
    exec_options['batch_size'] = 1
    exec_options['fetch_rows_timeout_ms'] = 1
    exec_options['debug_action'] = '0:GETNEXT:DELAY'
    sql = "select * from functional.alltypes limit {0}".format(expected_rows)
    outcome = self.execute_query(sql, exec_options)
    assert outcome.success
    assert len(outcome.data) == expected_rows

  def test_fetch_before_finished_timeout(self, vector):
    """Check that FETCH_ROWS_TIMEOUT_MS also applies to a query that has not reached
    the 'finished' state yet. Counterpart of tests/hs2/test_fetch_timeout.py::
    TestFetchTimeout::test_fetch_before_finished_timeout(_with_result_spooling)."""
    expected_rows = 10
    exec_options = vector.get_value('exec_option')
    # The debug action sleep (5000) is longer than the fetch timeout (1000 ms).
    # NOTE(review): presumably this keeps the query from finishing before the first
    # fetch timeout expires - confirm.
    exec_options['debug_action'] = 'CRS_BEFORE_COORD_STARTS:SLEEP@5000'
    exec_options['fetch_rows_timeout_ms'] = '1000'
    sql = "select * from functional.alltypes limit {0}".format(expected_rows)
    outcome = self.execute_query(sql, exec_options)
    assert outcome.success
    assert len(outcome.data) == expected_rows
|