Files
impala/tests/metadata/test_load.py
Qifan Chen 2d6300ac59 IMPALA-10999 Flakiness in TestAsyncLoadData.test_async_load
This patch addresses the flakiness by allowing the RUNNING state to be a
legitimate exec state returned from execute_query_async_using_client() in
Python. This call submits the load query to the Impala backend.

The corresponding Impala backend code for the beeswax protocol is
ImpalaServer::query(), which uses a wait thread executing
ClientRequestState::Wait() to advance the exec state from RUNNING to
FINISHED. Sometimes, when this wait thread does not run fast enough to
do so, the state returned to the main thread is still RUNNING.

The fix is purely a modification to the test itself.
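
The essence of the change, as a minimal sketch (hypothetical helper name,
not the literal diff; running_state/finished_state are the protocol-specific
constants used in the test):

  def check_submit_state(client, handle, running_state, finished_state):
    # Right after the async submit, the beeswax wait thread may not have
    # promoted the query from RUNNING to FINISHED yet, so both are legal.
    state = client.get_state(handle)
    assert state in (running_state, finished_state)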

Testing:
  1. Ran core tests successfully.

Change-Id: Ic2ac954b0494b7413ce0ec405718fcc354dba9e0
Reviewed-on: http://gerrit.cloudera.org:8080/18268
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2022-03-08 04:48:06 +00:00

209 lines
9.3 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Functional tests for LOAD DATA statements.

import time

from beeswaxd.BeeswaxService import QueryState
from copy import deepcopy
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (
    create_client_protocol_dimension,
    create_exec_option_dimension,
    create_single_exec_option_dimension,
    create_uncompressed_text_dimension)
from tests.common.skip import SkipIfLocal
from tests.common.test_vector import ImpalaTestDimension
from tests.util.filesystem_utils import (WAREHOUSE)

TEST_TBL_PART = "test_load"
TEST_TBL_NOPART = "test_load_nopart"
STAGING_PATH = 'test-warehouse/test_load_staging'
ALLTYPES_PATH = "test-warehouse/alltypes/year=2010/month=1/100101.txt"
MULTIAGG_PATH = 'test-warehouse/alltypesaggmultifiles/year=2010/month=1/day=1'
HIDDEN_FILES = ["{0}/3/.100101.txt".format(STAGING_PATH),
                "{0}/3/_100101.txt".format(STAGING_PATH)]


@SkipIfLocal.hdfs_client
class TestLoadData(ImpalaTestSuite):

  @classmethod
  def get_workload(self):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestLoadData, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def _clean_test_tables(self):
    self.client.execute("drop table if exists functional.{0}".format(TEST_TBL_NOPART))
    self.client.execute("drop table if exists functional.{0}".format(TEST_TBL_PART))
    self.filesystem_client.delete_file_dir(STAGING_PATH, recursive=True)

  def teardown_method(self, method):
    self._clean_test_tables()

  def setup_method(self, method):
    # Defensively clean the data dirs if they exist.
    self._clean_test_tables()

    # Create staging directories for load data inpath. The staging directory is laid out
    # as follows:
    #   - It has 6 sub directories, numbered 1-6
    #   - The directories are populated with files from a subset of partitions in
    #     existing partitioned tables.
    #   - Sub Directories 1-4 have single files copied from alltypes/
    #   - Sub Directories 5-6 have multiple files (4) copied from alltypesaggmultifiles
    #   - Sub Directory 3 also has hidden files, in both supported formats.
    #   - All sub-dirs contain a hidden directory
    for i in xrange(1, 6):
      stagingDir = '{0}/{1}'.format(STAGING_PATH, i)
      self.filesystem_client.make_dir(stagingDir, permission=777)
      self.filesystem_client.make_dir('{0}/_hidden_dir'.format(stagingDir),
                                      permission=777)

    # Copy single file partitions from alltypes.
    for i in xrange(1, 4):
      self.filesystem_client.copy(ALLTYPES_PATH,
                                  "{0}/{1}/100101.txt".format(STAGING_PATH, i))

    # Copy multi file partitions from alltypesaggmultifiles.
    file_names = self.filesystem_client.ls(MULTIAGG_PATH)
    for i in xrange(4, 6):
      for file_ in file_names:
        self.filesystem_client.copy(
            "{0}/{1}".format(MULTIAGG_PATH, file_),
            '{0}/{1}/{2}'.format(STAGING_PATH, i, file_))

    # Create two hidden files, with a leading . and _
    for file_ in HIDDEN_FILES:
      self.filesystem_client.copy(ALLTYPES_PATH, file_)

    # Create both the test tables.
    self.client.execute("create table functional.{0} like functional.alltypes"
        " location '{1}/{0}'".format(TEST_TBL_PART, WAREHOUSE))
    self.client.execute("create table functional.{0} like functional.alltypesnopart"
        " location '{1}/{0}'".format(TEST_TBL_NOPART, WAREHOUSE))

  def test_load(self, vector):
    self.run_test_case('QueryTest/load', vector)
    # The hidden files should not have been moved as part of the load operation.
    for file_ in HIDDEN_FILES:
      assert self.filesystem_client.exists(file_), "{0} does not exist".format(file_)


@SkipIfLocal.hdfs_client
class TestAsyncLoadData(ImpalaTestSuite):

  @classmethod
  def get_workload(self):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestAsyncLoadData, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))
    # Test all clients: hs2, hs2-http and beeswax
    cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
    # Test two exec modes per client
    cls.ImpalaTestMatrix.add_dimension(
        ImpalaTestDimension('enable_async_load_data_execution', True, False))
    # Disable codegen = false
    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
        disable_codegen_options=[False]))

  # This test exercises both sync and async compilation of the load query at the
  # backend through the beeswax and hs2 clients. The objective is to ensure the
  # load query completes successfully in both modes.
  def test_async_load(self, vector, unique_database):
    enable_async_load_data = vector.get_value('enable_async_load_data_execution')
    protocol = vector.get_value('protocol')
    client = self.create_impala_client(protocol=protocol)
    is_hs2 = protocol in ['hs2', 'hs2-http']
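    # The hs2/hs2-http clients report query state as strings, while the beeswax
    # client uses the QueryState enum, so pick the expected constants per protocol.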
    running_state = "RUNNING_STATE" if is_hs2 else QueryState.RUNNING
    finished_state = "FINISHED_STATE" if is_hs2 else QueryState.FINISHED

    # Form a fully qualified table name, dropping the '-' from protocol 'hs2-http',
    # since '-' is not allowed in an Impala table name even when delimited with ``.
    qualified_table_name = '{0}.{1}_{2}_{3}'.format(unique_database, TEST_TBL_NOPART,
        protocol if protocol != 'hs2-http' else 'hs2http', enable_async_load_data)

    # Form a staging path that is protocol and enable_async_load_data dependent to
    # allow creating distinct HDFS directories for each test object in parallel.
    staging_path = "{0}_{1}_{2}".format(STAGING_PATH, protocol, enable_async_load_data)

    # Put some data into the staging path.
    self.filesystem_client.delete_file_dir(staging_path, recursive=True)
    self.filesystem_client.make_dir(staging_path, permission=777)
    self.filesystem_client.copy(ALLTYPES_PATH, "{0}/100101.txt".format(staging_path))

    # Create a table with the staging path.
    self.client.execute("create table {0} like functional.alltypesnopart \
        location \'/{1}\'".format(qualified_table_name, staging_path))

    try:
      # The load data is going to need the metadata of the table. To avoid flakiness
      # about metadata loading, this selects from the table first to get the metadata
      # loaded.
      self.execute_query_expect_success(client,
          "select count(*) from {0}".format(qualified_table_name))

      # Configure whether to use async LOAD and add an appropriate delay of 3 seconds.
      new_vector = deepcopy(vector)
      new_vector.get_value('exec_option')['enable_async_load_data_execution'] = \
          enable_async_load_data
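      # debug_action format is "<label>:<action>@<arg>": SLEEP@3000 makes the
      # coordinator sleep 3000ms at the CRS_DELAY_BEFORE_LOAD_DATA point before the
      # LOAD DATA is processed, so sync and async handling can be told apart by timing.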
      delay = "CRS_DELAY_BEFORE_LOAD_DATA:SLEEP@3000"
      new_vector.get_value('exec_option')['debug_action'] = "{0}".format(delay)

      load_stmt = "load data inpath \'/{1}\' \
          into table {0}".format(qualified_table_name, staging_path)
      exec_start = time.time()
      handle = self.execute_query_async_using_client(client, load_stmt, new_vector)
      exec_end = time.time()
      exec_time = exec_end - exec_start
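      # State observed right after the async submit returns. Under beeswax the wait
      # thread running ClientRequestState::Wait() may not have promoted the query to
      # FINISHED yet, so this can legitimately still be RUNNING (see IMPALA-10999).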
      exec_end_state = client.get_state(handle)

      # Wait for the statement to finish with a timeout of 10 seconds.
      wait_start = time.time()
      self.wait_for_state(handle, finished_state, 10, client=client)
      wait_end = time.time()
      wait_time = wait_end - wait_start
      self.close_query_using_client(client, handle)

      if enable_async_load_data:
        # In async mode:
        # The compilation of the LOAD is processed in the exec step without delay, and
        # the processing of the LOAD plan happens in the wait step with the delay. The
        # wait time should therefore take more than 3 seconds.
        assert(exec_end_state == running_state)
        assert(wait_time >= 3)
      else:
        # In sync mode:
        # The entire LOAD is processed in the exec step with the delay, so exec_time
        # should be more than 3 seconds. Since the load query is submitted async, it is
        # possible that the returned exec state is still RUNNING because the wait
        # thread executing ClientRequestState::Wait() has not yet had time to advance
        # the exec state from RUNNING to FINISHED.
        assert(exec_end_state == running_state or exec_end_state == finished_state)
        assert(exec_time >= 3)
    finally:
      client.close()
      self.client.execute("drop table if exists {0}".format(qualified_table_name))
      self.filesystem_client.delete_file_dir(staging_path, recursive=True)