Files
impala/tests/query_test/test_queries.py
Riza Suminto 28cff4022d IMPALA-14333: Run impala-py.test using Python3
Running exhaustive tests with env var IMPALA_USE_PYTHON3_TESTS=true
reveals some tests that require adjustment. This patch makes such
adjustments, which mostly revolve around encoding differences and
string vs bytes types in Python3. This patch also switches the default
to run pytest with Python3 by setting IMPALA_USE_PYTHON3_TESTS=true.
The following are the details:

Change hash() function in conftest.py to crc32() to produce
deterministic hash. Hash randomization is enabled by default since
Python 3.3 (see
https://docs.python.org/3/reference/datamodel.html#object.__hash__).
This causes test sharding (like --shard_tests=1/2) to produce an
inconsistent set of tests per shard. Always restart the minicluster
during custom cluster tests if the --shard_tests argument is set,
because test order may change and affect test correctness, depending on
whether the tests run on a fresh minicluster or not.

Moved one test case from delimited-latin-text.test to
test_delimited_text.py for easier binary comparison.

Add bytes_to_str() as a utility function to decode bytes in Python3.
This is often needed when inspecting the return value of
subprocess.check_output() as a string.

Implement DataTypeMetaclass.__lt__ to substitute
DataTypeMetaclass.__cmp__ that is ignored in Python3 (see
https://peps.python.org/pep-0207/).

Fix WEB_CERT_ERR difference in test_ipv6.py.

Fix trivial integer parsing in test_restart_services.py.

Fix various encoding issues in test_saml2_sso.py,
test_shell_commandline.py, and test_shell_interactive.py.

Change timeout in Impala.for_each_impalad() from sys.maxsize to 2^31-1.

Switch to binary comparison in test_iceberg.py where needed.

Specify text mode when calling tempfile.NamedTemporaryFile().

Simplify create_impala_shell_executable_dimension to skip testing dev
and python2 impala-shell when IMPALA_USE_PYTHON3_TESTS=true. The reason
is that several UTF-8 related tests in test_shell_commandline.py break
in the Python3 pytest + Python2 impala-shell combo. This skipping already
happens automatically on build OSes without system Python2 available,
such as RHEL9 (the IMPALA_SYSTEM_PYTHON2 env var is empty).

Removed unused vector argument and fixed some trivial flake8 issues.

Several tests require logic modifications due to intermittent issues in
Python3 pytest. These include:

Add _run_query_with_client() in test_ranger.py to allow reusing a single
Impala client for running several queries. Ensure clients are closed
when the test is done. Mark several tests in test_ranger.py with
SkipIfFS.hive because they run queries through beeline + HiveServer2,
but the Ozone and S3 build environments do not start HiveServer2 by
default.

Increase the sleep period from 0.1 to 0.5 seconds per iteration in
test_statestore.py and mark TestStatestore to execute serially. This is
because TServer appears to shut down more slowly when run concurrently
with other tests. Handle the deprecation of Thread.setDaemon() as well.

Always force_restart=True each test method in TestLoggingCore,
TestShellInteractiveReconnect, and TestQueryRetries to prevent them from
reusing minicluster from previous test method. Some of these tests
destruct minicluster (kill impalad) and will produce minidump if metrics
verifier for next tests fail to detect healthy minicluster state.

Testing:
Pass exhaustive tests with IMPALA_USE_PYTHON3_TESTS=true.

Change-Id: I401a93b6cc7bcd17f41d24e7a310e0c882a550d4
Reviewed-on: http://gerrit.cloudera.org:8080/23319
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-09-03 10:01:29 +00:00

489 lines
21 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# General Impala query tests
from __future__ import absolute_import, division, print_function
from copy import deepcopy
import re
from subprocess import check_call
import pytest
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfFS, SkipIfNotHdfsMinicluster
from tests.common.test_dimensions import (
add_exec_option_dimension,
create_client_protocol_dimension,
create_exec_option_dimension,
create_exec_option_dimension_from_dict,
create_single_exec_option_dimension,
create_uncompressed_json_dimension,
create_uncompressed_text_dimension,
default_protocol_or_parquet_constraint,
extend_exec_option_dimension,
single_compression_constraint,
FILE_FORMAT_TO_STORED_AS_MAP,
)
from tests.util.filesystem_utils import get_fs_path
class TestQueries(ImpalaTestSuite):
  """General query tests, run over the file-format matrix and both client
  protocols."""

  # Debug actions used to exercise the asynchronous codegen code paths.
  debug_actions = \
      "BEFORE_CODEGEN_IN_ASYNC_CODEGEN_THREAD:JITTER@1000" \
      "|AFTER_STARTING_ASYNC_CODEGEN_IN_FRAGMENT_THREAD:JITTER@1000"

  @classmethod
  def add_test_dimensions(cls):
    super(TestQueries, cls).add_test_dimensions()
    if cls.exploration_strategy() == 'exhaustive':
      single_node_option = [0, 100]
    else:
      single_node_option = [0]
    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
        exec_single_node_option=single_node_option))
    if cls.exploration_strategy() == 'core':
      cls.ImpalaTestMatrix.add_constraint(
          lambda v: v.get_value('table_format').file_format == 'parquet')
    cls.ImpalaTestMatrix.add_constraint(single_compression_constraint)
    # Run these queries through both beeswax and HS2 for protocol coverage,
    # but keep the protocol and table-format dimensions orthogonal instead of
    # running every combination.
    cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
    cls.ImpalaTestMatrix.add_constraint(default_protocol_or_parquet_constraint)
    # Extra dimension to cover the small query opt in exhaustive runs.
    if cls.exploration_strategy() == 'exhaustive':
      extend_exec_option_dimension(cls, "async_codegen", 1)
      extend_exec_option_dimension(cls, "debug_action", cls.debug_actions)
      cls.ImpalaTestMatrix.add_constraint(cls.debug_action_constraint)

  @classmethod
  def debug_action_constraint(cls, vector):
    """Keep only the test vectors where the async-codegen debug actions are
    meaningful."""
    exec_option = vector.get_value("exec_option")
    is_async = exec_option.get("async_codegen") == 1
    using_async_debug_actions = exec_option.get("debug_action") == cls.debug_actions
    if not is_async:
      # On the synchronous codegen path the async debug actions are never
      # executed, so drop the vectors that set them; otherwise every test
      # would run twice (once with and once without the no-op actions).
      return not using_async_debug_actions
    # Asynchronous codegen tests require codegen to be enabled and always run
    # the debug actions. Also restrict to uncompressed Parquet over the hs2
    # protocol to save time.
    return (exec_option["disable_codegen"] == 0
            and using_async_debug_actions
            and vector.get_value('table_format').file_format == 'parquet'
            and vector.get_value('table_format').compression_codec == 'none'
            and vector.get_value('protocol') == 'hs2'
            and exec_option["exec_single_node_rows_threshold"] == 0)

  def test_analytic_fns(self, vector):
    # TODO: Enable some of these tests for Avro if possible
    # Avro tables have no TIMESTAMP type, so timestamp expressions cannot be
    # evaluated against them.
    table_format = vector.get_value('table_format')
    if table_format.file_format == 'avro':
      pytest.xfail("%s doesn't support TIMESTAMP" % (table_format.file_format))
    if table_format.file_format == 'hbase':
      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
    self.run_test_case('QueryTest/analytic-fns', vector)

  def test_limit(self, vector):
    fmt = vector.get_value('table_format').file_format
    if fmt == 'hbase':
      pytest.xfail("IMPALA-283 - select count(*) produces inconsistent results")
    if fmt == 'kudu':
      pytest.xfail("Limit queries without order by clauses are non-deterministic")
    self.run_test_case('QueryTest/limit', vector)

  def test_top_n(self, vector):
    fmt = vector.get_value('table_format').file_format
    if fmt == 'hbase':
      pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
    # QueryTest/top-n is also run in test_sort with disable_outermost_topn = 1
    self.run_test_case('QueryTest/top-n', vector)
    if fmt in ['parquet', 'orc']:
      # Pin the timestamp options so both formats give identical results.
      new_vector = deepcopy(vector)
      options = new_vector.get_value('exec_option')
      options['convert_legacy_hive_parquet_utc_timestamps'] = 1
      options['timezone'] = '"Europe/Budapest"'
      self.run_test_case('QueryTest/top-n-complex', new_vector)

  def test_union(self, vector):
    self.run_test_case('QueryTest/union', vector)
    # IMPALA-3586: The passthrough and materialized children are interleaved.
    # A small batch size exercises the transition between materialized and
    # passthrough children.
    query_string = ("select count(c) from ( "
                    "select bigint_col + 1 as c from functional.alltypes limit 15 "
                    "union all "
                    "select bigint_col as c from functional.alltypes limit 15 "
                    "union all "
                    "select bigint_col + 1 as c from functional.alltypes limit 15 "
                    "union all "
                    "(select bigint_col as c from functional.alltypes limit 15)) t")
    exec_option = vector.get_value('exec_option')
    exec_option['batch_size'] = 10
    result = self.execute_query(query_string, exec_option)
    assert result.data[0] == '60'

  def test_intersect(self, vector):
    self.run_test_case('QueryTest/intersect', vector)

  def test_except(self, vector):
    if vector.get_value('table_format').file_format == "hbase":
      pytest.xfail(reason="IMPALA-14333 - HBase does not return rows "
                   "where tinyint_col is NULL")
    self.run_test_case('QueryTest/except', vector)

  def test_sort(self, vector):
    fmt = vector.get_value('table_format').file_format
    if fmt == 'hbase':
      pytest.xfail(reason="IMPALA-283 - select count(*) produces inconsistent results")
    new_vector = deepcopy(vector)
    options = new_vector.get_value('exec_option')
    options['disable_outermost_topn'] = 1
    options['analytic_rank_pushdown_threshold'] = 0
    self.run_test_case('QueryTest/sort', new_vector)
    # The top-n file doubles as extra sort coverage once outermost top-n is
    # disabled above.
    self.run_test_case('QueryTest/top-n', new_vector)
    if fmt in ['parquet', 'orc']:
      # Pin the timestamp options so both formats give identical results.
      options['convert_legacy_hive_parquet_utc_timestamps'] = 1
      options['timezone'] = '"Europe/Budapest"'
      self.run_test_case('QueryTest/sort-complex', new_vector)

  def test_partitioned_top_n(self, vector):
    """Test partitioned Top-N operator."""
    fmt = vector.get_value('table_format').file_format
    if fmt == "hbase":
      pytest.xfail(reason="IMPALA-14333 - HBase does not return rows "
                   "where tinyint_col is NULL")
    self.run_test_case('QueryTest/partitioned-top-n', vector)
    if fmt in ['parquet', 'orc']:
      self.run_test_case('QueryTest/partitioned-top-n-complex', vector)

  def test_inline_view(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("jointbl does not have columns with unique values, "
                   "hbase collapses them")
    self.run_test_case('QueryTest/inline-view', vector)

  def test_inline_view_limit(self, vector):
    self.run_test_case('QueryTest/inline-view-limit', vector)

  def test_subquery(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("Table alltypesagg is populated differently in database "
                   "functional and functional_hbase: there are nulls in column "
                   "int_col in the former and none in the latter. "
                   "Testing query: select int_col from alltypesagg where int_col "
                   "is null")
    self.run_test_case('QueryTest/subquery', vector)

  def test_subquery_single_node(self, vector):
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['num_nodes'] = 1
    self.run_test_case('QueryTest/subquery-single-node', new_vector)

  def test_alias(self, vector):
    self.run_test_case('QueryTest/alias', vector)

  def test_subquery_in_constant_lhs(self, vector):
    self.run_test_case('QueryTest/subquery-in-constant-lhs', vector)

  def test_empty(self, vector):
    self.run_test_case('QueryTest/empty', vector)

  def test_views(self, vector):
    if vector.get_value('table_format').file_format == "hbase":
      pytest.xfail("TODO: Enable views tests for hbase")
    self.run_test_case('QueryTest/views', vector)

  def test_with_clause(self, vector):
    if vector.get_value('table_format').file_format == "hbase":
      pytest.xfail("TODO: Enable with clause tests for hbase")
    self.run_test_case('QueryTest/with-clause', vector)

  # TODO: Although it is not specified this test only runs in exhaustive.
  def test_misc(self, vector):
    table_format = vector.get_value('table_format')
    if table_format.file_format in ['hbase', 'rc', 'parquet', 'kudu']:
      msg = ("Failing on rc/snap/block despite resolution of IMP-624,IMP-503. "
             "Failing on kudu and parquet because tables do not exist")
      pytest.xfail(msg)
    self.run_test_case('QueryTest/misc', vector)

  def test_null_data(self, vector):
    if vector.get_value('table_format').file_format == 'hbase':
      pytest.xfail("null data does not appear to work in hbase")
    self.run_test_case('QueryTest/null_data', vector)
# Tests in this class are only run against text/none either because that's the only
# format that is supported, or the tests don't exercise the file format.
class TestQueriesTextTables(ImpalaTestSuite):
  """Tests run only against text/none, either because that is the only format
  supported or because the tests do not exercise the file format."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestQueriesTextTables, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_overflow(self, vector):
    self.run_test_case('QueryTest/overflow', vector)

  def test_strict_mode(self, vector):
    options = vector.get_value('exec_option')
    options['strict_mode'] = 1
    options['abort_on_error'] = 0
    self.run_test_case('QueryTest/strict-mode', vector)
    options['abort_on_error'] = 1
    self.run_test_case('QueryTest/strict-mode-abort', vector)

  def test_range_constant_propagation(self, vector):
    self.run_test_case('QueryTest/range-constant-propagation', vector)

  def test_distinct_estimate(self, vector):
    # Results vary slightly with how the values get split up, so run on a
    # single node and on text only.
    vector.get_value('exec_option')['num_nodes'] = 1
    self.run_test_case('QueryTest/distinct-estimate', vector)

  def test_random(self, vector):
    # Results vary slightly with how the values get split up, so run on a
    # single node and on text only.
    vector.get_value('exec_option')['num_nodes'] = 1
    self.run_test_case('QueryTest/random', vector)

  def test_values(self, vector):
    self.run_test_case('QueryTest/values', vector)
# Tests in this class are only run against json/none either because that's the only
# format that is supported, or the tests don't exercise the file format.
class TestQueriesJsonTables(ImpalaTestSuite):
  """Tests run only against json/none, either because that is the only format
  supported or because the tests do not exercise the file format."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestQueriesJsonTables, cls).add_test_dimensions()
    # Verify that every client protocol works against JSON tables.
    cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_json_dimension(cls.get_workload()))
    add_exec_option_dimension(cls, 'disable_optimized_json_count_star', [0, 1])

  def test_complex(self, vector):
    # Continue past parse errors rather than aborting the whole query.
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/complex_json', vector)

  def test_multiline(self, vector):
    # Continue past parse errors rather than aborting the whole query.
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/multiline_json', vector)

  def test_malformed(self, vector):
    # Continue past parse errors rather than aborting the whole query.
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/malformed_json', vector)

  def test_overflow(self, vector):
    # Continue past parse errors rather than aborting the whole query.
    vector.get_value('exec_option')['abort_on_error'] = 0
    self.run_test_case('QueryTest/overflow_json', vector)
# Tests in this class are only run against Parquet because the tests don't exercise the
# file format.
class TestQueriesParquetTables(ImpalaTestSuite):
  """Tests run only against Parquet because they do not exercise the file
  format."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestQueriesParquetTables, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'parquet')

  @pytest.mark.execute_serially
  def test_very_large_strings(self, vector):
    """Regression test for IMPALA-1619. Doesn't need to be run on all file formats.
    Executes serially to avoid large random spikes in mem usage."""
    # IMPALA-9856: disable query result spooling so we do not need to deal
    # with the extra memory reservation required by BufferedPlanRootSink.
    new_vector = deepcopy(vector)
    new_vector.get_value('exec_option')['spool_query_results'] = "0"
    self.run_test_case('QueryTest/large_strings', new_vector)

  def test_single_node_large_sorts(self, vector):
    if self.exploration_strategy() != 'exhaustive':
      pytest.skip("only run large sorts on exhaustive")
    options = vector.get_value('exec_option')
    options['disable_outermost_topn'] = 1
    options['num_nodes'] = 1
    self.run_test_case('QueryTest/single-node-large-sorts', vector)
# Tests for queries in HDFS-specific tables, e.g. AllTypesAggMultiFilesNoPart.
class TestHdfsQueries(ImpalaTestSuite):
  """Queries against HDFS-specific tables, e.g. AllTypesAggMultiFilesNoPart."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestHdfsQueries, cls).add_test_dimensions()
    # Extra dimension to cover the small query opt in exhaustive runs.
    if cls.exploration_strategy() == 'exhaustive':
      single_node_option = [0, 100]
    else:
      single_node_option = [0]
    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
        exec_single_node_option=single_node_option))
    # Kudu doesn't support AllTypesAggMultiFilesNoPart (KUDU-1271, KUDU-1570).
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format != 'kudu')

  def test_hdfs_scan_node(self, vector):
    self.run_test_case('QueryTest/hdfs-scan-node', vector)

  def test_file_partitions(self, vector):
    self.run_test_case('QueryTest/hdfs-partitions', vector)
class TestPartitionKeyScans(ImpalaTestSuite):
  """Tests for queries that exercise partition key scan optimisation. These
  should be run against all HDFS table types with and without mt_dop to
  exercise both scanner code paths. We run with mt_dop=0 and 1 only so
  that the same number of rows flow through the plan."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestPartitionKeyScans, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format not in ('kudu', 'hbase'))
    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension_from_dict({
        'mt_dop': [0, 1], 'exec_single_node_rows_threshold': [0]}))

  def test_partition_key_scans(self, vector):
    self.run_test_case('QueryTest/partition-key-scans', vector)

  @SkipIfNotHdfsMinicluster.scheduling
  def test_partition_key_scans_plan_rows(self, vector):
    """Tests that assume the query is scheduled across three nodes."""
    self.run_test_case('QueryTest/partition-key-scans-plan-rows', vector)

  def test_partition_key_scans_with_joins(self, vector):
    self.run_test_case('QueryTest/partition-key-scans-with-joins', vector)
class TestPartitionKeyScansWithMultipleBlocks(ImpalaTestSuite):
  """Tests for queries that exercise partition key scan optimisation with data
  files that contain multiple blocks."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestPartitionKeyScansWithMultipleBlocks, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format not in ('kudu', 'hbase'))

  def _build_alltypes_multiblocks_table(self, vector, unique_database):
    """Create <unique_database>.alltypes_multiblocks and populate one
    partition by copying functional.alltypes data with a 1024-byte block
    size so the file spans multiple HDFS blocks."""
    table_format = vector.get_value('table_format')
    file_format = table_format.file_format
    src_tbl_name = 'functional' + table_format.db_suffix() + '.alltypes'
    src_tbl_loc = self._get_table_location(src_tbl_name, vector)
    source_file = src_tbl_loc + '/year=2010/month=12/*'
    tbl_loc = get_fs_path("/test-warehouse/%s.db/alltypes_multiblocks"
                          % (unique_database))
    file_path = tbl_loc + "/year=2010/month=12"
    check_call(['hdfs', 'dfs', '-mkdir', '-p', file_path])
    self.client.execute("""create table if not exists %s.alltypes_multiblocks
        like functional.alltypes stored as %s location '%s';"""
        % (unique_database, FILE_FORMAT_TO_STORED_AS_MAP[file_format], tbl_loc))
    # Set the block size to 1024 so the target file occupies multiple blocks.
    check_call(['hdfs', 'dfs', '-Ddfs.block.size=1024', '-cp', '-f', '-d',
                source_file, file_path])
    self.client.execute("alter table %s.alltypes_multiblocks recover partitions"
                        % (unique_database))

  @SkipIfFS.hdfs_small_block
  def test_partition_key_scans_with_multiple_blocks_table(self, vector, unique_database):
    self._build_alltypes_multiblocks_table(vector, unique_database)
    result = self.execute_query_expect_success(
        self.client,
        "SELECT max(year) FROM %s.alltypes_multiblocks" % (unique_database))
    assert int(result.get_data()) == 2010
class TestTopNReclaimQuery(ImpalaTestSuite):
  """Test class to validate that TopN periodically reclaims tuple pool memory
  and runs with a lower memory footprint."""

  QUERY = "select * from tpch.lineitem order by l_orderkey desc limit 10;"
  # Mem limit empirically selected so that the query fails if tuple pool
  # reclamation is not implemented for TopN.
  MEM_LIMIT = "60m"

  @classmethod
  def get_workload(cls):
    # Fixed: this classmethod's first parameter was previously named 'self'
    # even though it receives the class object; renamed to 'cls' per
    # convention (flake8 N804). Callers are unaffected.
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    super(TestTopNReclaimQuery, cls).add_test_dimensions()
    # The tpch tests take a long time to execute so restrict the combinations
    # they execute over.
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_top_n_reclaim(self, vector):
    """Run QUERY under a tight mem limit and verify the profile reports at
    least one tuple pool reclamation per counter occurrence."""
    exec_options = vector.get_value('exec_option')
    exec_options['mem_limit'] = self.MEM_LIMIT
    exec_options['num_scanner_threads'] = 1
    result = self.execute_query(self.QUERY, exec_options)
    runtime_profile = str(result.runtime_profile)
    num_of_times_tuple_pool_reclaimed = re.findall(
        'TuplePoolReclamations: ([0-9]*)', runtime_profile)
    # Confirm the newly added counter is visible in the profile.
    assert len(num_of_times_tuple_pool_reclaimed) > 0
    # The tuple pool is expected to be reclaimed for this query.
    for n in num_of_times_tuple_pool_reclaimed:
      assert int(n) > 0
class TestAnalyticFnsTpch(ImpalaTestSuite):
  """Analytic function tests run against the tpch workload on Parquet."""

  @classmethod
  def get_workload(cls):
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    super(TestAnalyticFnsTpch, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format in ['parquet'])

  def test_analytic_predicate(self, vector):
    self.run_test_case('analytic-fns', vector)
class TestTopNHighNdv(ImpalaTestSuite):
  """Runs the partitioned top-n high-NDV test file against uncompressed text
  with a single exec option vector."""

  @classmethod
  def add_test_dimensions(cls):
    super(TestTopNHighNdv, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_topn_high_ndv(self, vector, unique_database):
    self.run_test_case(
        'QueryTest/partitioned-top-n-high-ndv', vector, use_db=unique_database)