IMPALA-13958: Revisit hs2_parquet_constraint and hs2_text_constraint

hs2_parquet_constraint and hs2_text_constraint are meant to extend the
test vector dimensions to also cover non-default test protocols (other
than beeswax), while limiting those protocols to running only against
the 'parquet/none' or 'text/none' table format, respectively.

This patch renames these constraints to
default_protocol_or_parquet_constraint and
default_protocol_or_text_constraint, respectively, and makes them
compare against the configured default_test_protocol instead of
beeswax, so that full file format coverage happens for the
default_test_protocol configuration and is limited for the other
protocols. Drop hs2_parquet_constraint entirely from
test_utf8_strings.py because that test is already constrained to the
single 'parquet/none' file format.

The num-modified-rows validation in date-fileformat-support.test and
date-partitioning.test is changed to check the NumModifiedRows counter
from the runtime profile instead of the RESULTS section.
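
For example, after the change an insert check in date-partitioning.test
reads roughly like this (see the hunks below), matching NumModifiedRows
against the query's runtime profile rather than a RESULTS row count:

  ---- QUERY
  insert into $DATABASE.dtbl partition (p='1300-1-01') values ('1300-1-1');
  ---- RUNTIME_PROFILE
  NumModifiedRows: 1
  ====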

Fix TestQueriesJsonTables to always run with the beeswax protocol
because its assertions rely on beeswax-specific return values.
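
Concretely (see the test_queries.py hunk below), the suite now pins its
protocol by overriding default_test_protocol(), roughly:

  class TestQueriesJsonTables(ImpalaTestSuite):
    @classmethod
    def default_test_protocol(cls):
      # Assertions here rely on beeswax-specific formatting (Infinity, NaN,
      # false, true); HS2 would return inf, nan, False and True instead.
      return BEESWAX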

Run impala-isort and fix a few flake8 issues in the modified test
files.

Testing:
Ran and passed the affected test files using exhaustive exploration and
the env var DEFAULT_TEST_PROTOCOL=hs2. Confirmed that full file format
coverage happens for the hs2 protocol. Note that
DEFAULT_TEST_PROTOCOL=beeswax is still the default.

Change-Id: I8be0a628842e29a8fcc036180654cd159f6a23c8
Reviewed-on: http://gerrit.cloudera.org:8080/22775
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Author: Riza Suminto
Date: 2025-04-12 19:54:01 -07:00
Committed by: Impala Public Jenkins
Parent: e9a706723d
Commit: 182aa5066e
8 changed files with 124 additions and 75 deletions

View File

@@ -45,18 +45,18 @@ NULL
# Inserting text partitions to $DATABASE.date_tbl is OK.
insert into $DATABASE.date_tbl partition (date_part)
select date_col, date_part from functional.date_tbl;
---- RESULTS
date_part=0001-01-01: 7
date_part=1399-06-27: 3
date_part=2017-11-27: 10
date_part=9999-12-31: 2
---- RUNTIME_PROFILE
NumModifiedRows: 7
NumModifiedRows: 3
NumModifiedRows: 10
NumModifiedRows: 2
====
---- QUERY
# Inserting into parquet partition is supported.
insert into $DATABASE.date_tbl partition(date_part='1899-12-31')
select date_col from functional_parquet.date_tbl where date_part = '1399-06-27';
---- RESULTS
date_part=1899-12-31: 3
---- RUNTIME_PROFILE
NumModifiedRows: 3
====
---- QUERY
# Adding ORC partition works even though Impala cannot write ORC format.

View File

@@ -25,29 +25,29 @@ AnalysisException: Partition spec already exists: (p=DATE '1300-01-01').
---- QUERY
# Date partition formatted differently in insert
insert into $DATABASE.dtbl partition (p='1300-1-01') values ('1300-1-1');
---- RESULTS
p=1300-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
====
---- QUERY
insert into $DATABASE.dtbl partition (p='1300-01-1') values ('1300-1-02');
---- RESULTS
p=1300-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
====
---- QUERY
insert into $DATABASE.dtbl partition (p=DATE '1300-1-1') values ('1300-1-03');
---- RESULTS
p=1300-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
====
---- QUERY
# Insert into a new partition
insert into $DATABASE.dtbl partition (p=DATE '1400-01-1') values ('1400-1-1');
---- RESULTS
p=1400-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
====
---- QUERY
insert into $DATABASE.dtbl partition (p='1400-1-01') values ('1400-1-2');
---- RESULTS
p=1400-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
====
---- QUERY
select p, c from $DATABASE.dtbl;
@@ -86,9 +86,9 @@ UDF ERROR: String to Date parse failed. Invalid string val: '1400-01-'
# Test that STRING is implicitly cast to DATE.
insert into $DATABASE.dtbl partition(p) select * from $DATABASE.stbl
where p in ('1400-1-1', '1400-1-01', '1500-01-1');
---- RESULTS
p=1400-01-01: 2
p=1500-01-01: 1
---- RUNTIME_PROFILE
NumModifiedRows: 2
NumModifiedRows: 1
====
---- QUERY
select p, c from $DATABASE.dtbl;

View File

@@ -18,18 +18,25 @@
# Common test dimensions and associated utility functions.
from __future__ import absolute_import, division, print_function
from builtins import range
import copy
import os
import pytest
from itertools import product
import os
from builtins import range
import pytest
from tests.common.test_vector import (
EXEC_OPTION, PROTOCOL, TABLE_FORMAT,
BEESWAX, HS2, HS2_HTTP,
ImpalaTestDimension, ImpalaTestVector, assert_exec_option_key)
from tests.util.filesystem_utils import (
IS_HDFS)
assert_exec_option_key,
BEESWAX,
EXEC_OPTION,
HS2,
HS2_HTTP,
ImpalaTestDimension,
ImpalaTestVector,
PROTOCOL,
TABLE_FORMAT,
)
from tests.util.filesystem_utils import IS_HDFS
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
@@ -190,18 +197,18 @@ def create_client_protocol_no_strict_dimension():
return ImpalaTestDimension('strict_hs2_protocol', False)
def hs2_parquet_constraint(v):
"""Constraint function, used to only run HS2 against Parquet format, because file format
and the client protocol are orthogonal."""
return (v.get_protocol() == BEESWAX
def default_protocol_or_parquet_constraint(v):
"""Constraint function, used to limit non-default test protocol against uncompressed
parquet format, because file format and the client protocol are orthogonal."""
return (v.get_protocol() == pytest.config.option.default_test_protocol
or (v.get_table_format().file_format == 'parquet'
and v.get_table_format().compression_codec == 'none'))
def hs2_text_constraint(v):
"""Constraint function, used to only run HS2 against uncompressed text, because file
format and the client protocol are orthogonal."""
return (v.get_protocol() == BEESWAX
def default_protocol_or_text_constraint(v):
"""Constraint function, used to limit non-default test protocol against uncompressed
text format, because file format and the client protocol are orthogonal."""
return (v.get_protocol() == pytest.config.option.default_test_protocol
or (v.get_table_format().file_format == 'text'
and v.get_table_format().compression_codec == 'none'))

View File

@@ -19,11 +19,16 @@ from __future__ import absolute_import, division, print_function
from copy import deepcopy
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (create_exec_option_dimension,
create_client_protocol_dimension, hs2_parquet_constraint, hs2_text_constraint)
from tests.common.test_dimensions import (
create_client_protocol_dimension,
create_exec_option_dimension,
default_protocol_or_parquet_constraint,
default_protocol_or_text_constraint,
)
class TestStringQueries(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestStringQueries, cls).add_test_dimensions()
@@ -35,7 +40,7 @@ class TestStringQueries(ImpalaTestSuite):
# Run these queries through both beeswax and HS2 to get coverage of CHAR/VARCHAR
# returned via both protocols.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_text_constraint)
cls.ImpalaTestMatrix.add_constraint(default_protocol_or_text_constraint)
def test_chars(self, vector):
self.run_test_case('QueryTest/chars', vector)
@@ -57,6 +62,7 @@ class TestStringQueries(ImpalaTestSuite):
class TestCharFormats(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestCharFormats, cls).add_test_dimensions()
@@ -72,7 +78,7 @@ class TestCharFormats(ImpalaTestSuite):
# Run these queries through both beeswax and HS2 to get coverage of CHAR/VARCHAR
# returned via both protocols.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_parquet_constraint)
cls.ImpalaTestMatrix.add_constraint(default_protocol_or_parquet_constraint)
def test_char_format(self, vector):
self.run_test_case('QueryTest/chars-formats', vector)

View File

@@ -18,11 +18,16 @@
# Targeted tests for date type.
from __future__ import absolute_import, division, print_function
from tests.common.file_utils import create_table_and_copy_files
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfFS
from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
create_client_protocol_dimension, hs2_parquet_constraint)
from tests.common.test_dimensions import (
create_client_protocol_dimension,
create_exec_option_dimension_from_dict,
create_uncompressed_text_dimension,
default_protocol_or_parquet_constraint,
)
from tests.shell.util import create_impala_shell_executable_dimension
@@ -46,12 +51,17 @@ class TestDateQueriesBase(ImpalaTestSuite):
# Run these queries through both beeswax and HS2 to get coverage of date returned
# via both protocols.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_parquet_constraint)
cls.ImpalaTestMatrix.add_dimension(create_impala_shell_executable_dimension())
class TestDateQueriesAllFormat(TestDateQueriesBase):
@classmethod
def add_test_dimensions(cls):
super(TestDateQueriesAllFormat, cls).add_test_dimensions()
# Limit to 'parquet/none' for non-default test protocol.
cls.ImpalaTestMatrix.add_constraint(default_protocol_or_parquet_constraint)
def test_queries(self, vector):
if vector.get_value('table_format').file_format == 'avro':
# Avro date test queries are in a separate test file.
@@ -69,9 +79,9 @@ class TestDateQueriesTextFormat(TestDateQueriesBase):
@classmethod
def add_test_dimensions(cls):
super(TestDateQueriesTextFormat, cls).add_test_dimensions()
# Only run this test class with 'text' table_format.
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format == 'text')
# Only run this test class with 'text/none' table_format.
cls.ImpalaTestMatrix.add_dimension(
create_uncompressed_text_dimension(cls.get_workload()))
def test_partitioning(self, vector, unique_database):
""" Test partitioning by DATE. """

View File

@@ -18,12 +18,16 @@
# Targeted tests for decimal type.
from __future__ import absolute_import, division, print_function
import pytest
from tests.common.impala_connection import IMPALA_CONNECTION_EXCEPTION
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (create_exec_option_dimension_from_dict,
create_client_protocol_dimension, hs2_parquet_constraint)
from tests.common.test_dimensions import (
create_client_protocol_dimension,
create_exec_option_dimension_from_dict,
default_protocol_or_parquet_constraint,
)
from tests.util.filesystem_utils import IS_S3
@@ -33,22 +37,22 @@ class TestDecimalQueries(ImpalaTestSuite):
super(TestDecimalQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension_from_dict({
'decimal_v2' : ['false', 'true'],
'batch_size' : [0, 1],
'disable_codegen' : ['false', 'true'],
'disable_codegen_rows_threshold' : [0]}))
'decimal_v2': ['false', 'true'],
'batch_size': [0, 1],
'disable_codegen': ['false', 'true'],
'disable_codegen_rows_threshold': [0]}))
# Hive < 0.11 does not support decimal so we can't run these tests against the other
# file formats.
# TODO: Enable them on Hive >= 0.11.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec == 'none') or
v.get_value('table_format').file_format in ['parquet', 'orc', 'kudu', 'json'])
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ['parquet', 'orc', 'kudu', 'json']
or (v.get_value('table_format').file_format == 'text'
and v.get_value('table_format').compression_codec == 'none'))
# Run these queries through both beeswax and HS2 to get coverage of decimals returned
# via both protocols.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_parquet_constraint)
cls.ImpalaTestMatrix.add_constraint(default_protocol_or_parquet_constraint)
def test_queries(self, vector):
self.run_test_case('QueryTest/decimal', vector)
@@ -75,8 +79,8 @@ class TestAvroDecimalQueries(ImpalaTestSuite):
def add_test_dimensions(cls):
super(TestAvroDecimalQueries, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_constraint(lambda v:
(v.get_value('table_format').file_format == 'avro' and
v.get_value('table_format').compression_codec == 'snap'))
v.get_value('table_format').file_format == 'avro'
and v.get_value('table_format').compression_codec == 'snap')
def test_avro_queries(self, vector):
self.run_test_case('QueryTest/decimal_avro', vector)
@@ -91,7 +95,7 @@ class TestDecimalOverflowExprs(ImpalaTestSuite):
def add_test_dimensions(cls):
super(TestDecimalOverflowExprs, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_constraint(lambda v:
(v.get_value('table_format').file_format in ['kudu', 'parquet', 'text']))
v.get_value('table_format').file_format in ['kudu', 'parquet', 'text'])
def test_insert_select_exprs(self, vector, unique_database):
TBL_NAME_1 = '`{0}`.`overflowed_decimal_tbl_1`'.format(unique_database)

View File

@@ -18,20 +18,28 @@
# General Impala query tests
from __future__ import absolute_import, division, print_function
import pytest
import re
from copy import deepcopy
import re
from subprocess import check_call
import pytest
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import (
SkipIfEC, SkipIfCatalogV2, SkipIfNotHdfsMinicluster, SkipIfFS)
from tests.common.skip import SkipIfFS, SkipIfNotHdfsMinicluster
from tests.common.test_dimensions import (
create_uncompressed_text_dimension, create_uncompressed_json_dimension,
create_exec_option_dimension_from_dict, create_client_protocol_dimension,
hs2_parquet_constraint, extend_exec_option_dimension, FILE_FORMAT_TO_STORED_AS_MAP,
add_exec_option_dimension, create_exec_option_dimension)
add_exec_option_dimension,
create_client_protocol_dimension,
create_exec_option_dimension,
create_exec_option_dimension_from_dict,
create_uncompressed_json_dimension,
create_uncompressed_text_dimension,
default_protocol_or_parquet_constraint,
extend_exec_option_dimension,
FILE_FORMAT_TO_STORED_AS_MAP,
)
from tests.common.test_vector import BEESWAX
from tests.util.filesystem_utils import get_fs_path
from subprocess import check_call
class TestQueries(ImpalaTestSuite):
@@ -54,7 +62,7 @@ class TestQueries(ImpalaTestSuite):
# Don't run all combinations of table format and protocol - the dimensions should
# be orthogonal.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_parquet_constraint)
cls.ImpalaTestMatrix.add_constraint(default_protocol_or_parquet_constraint)
# Adding a test dimension here to test the small query opt in exhaustive.
if cls.exploration_strategy() == 'exhaustive':
@@ -212,6 +220,7 @@ class TestQueries(ImpalaTestSuite):
pytest.xfail("null data does not appear to work in hbase")
self.run_test_case('QueryTest/null_data', vector)
# Tests in this class are only run against text/none either because that's the only
# format that is supported, or the tests don't exercise the file format.
class TestQueriesTextTables(ImpalaTestSuite):
@@ -254,6 +263,13 @@ class TestQueriesTextTables(ImpalaTestSuite):
# Tests in this class are only run against json/none either because that's the only
# format that is supported, or the tests don't exercise the file format.
class TestQueriesJsonTables(ImpalaTestSuite):
@classmethod
def default_test_protocol(cls):
# Some assertions in this test relies on beeswax-specific return values such as
# Infinity, NaN, false, and true. HS2 returns inf, nan, False, and True instead.
return BEESWAX
@classmethod
def add_test_dimensions(cls):
super(TestQueriesJsonTables, cls).add_test_dimensions()
@@ -277,6 +293,7 @@ class TestQueriesJsonTables(ImpalaTestSuite):
vector.get_value('exec_option')['abort_on_error'] = 0
self.run_test_case('QueryTest/overflow_json', vector)
# Tests in this class are only run against Parquet because the tests don't exercise the
# file format.
class TestQueriesParquetTables(ImpalaTestSuite):
@@ -304,6 +321,7 @@ class TestQueriesParquetTables(ImpalaTestSuite):
vector.get_value('exec_option')['num_nodes'] = 1
self.run_test_case('QueryTest/single-node-large-sorts', vector)
# Tests for queries in HDFS-specific tables, e.g. AllTypesAggMultiFilesNoPart.
class TestHdfsQueries(ImpalaTestSuite):
@classmethod
@@ -387,6 +405,7 @@ class TestPartitionKeyScansWithMultipleBlocks(ImpalaTestSuite):
"SELECT max(year) FROM %s.alltypes_multiblocks" % (unique_database))
assert int(result.get_data()) == 2010
class TestTopNReclaimQuery(ImpalaTestSuite):
"""Test class to validate that TopN periodically reclaims tuple pool memory
and runs with a lower memory footprint."""

View File

@@ -16,24 +16,27 @@
# under the License.
from __future__ import absolute_import, division, print_function
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.test_dimensions import (create_exec_option_dimension,
create_client_protocol_dimension, hs2_parquet_constraint)
from tests.common.test_dimensions import (
create_client_protocol_dimension,
create_exec_option_dimension,
)
class TestUtf8StringFunctions(ImpalaTestSuite):
@classmethod
def add_test_dimensions(cls):
super(TestUtf8StringFunctions, cls).add_test_dimensions()
cls.ImpalaTestMatrix.add_dimension(
create_exec_option_dimension(disable_codegen_options=[False, True]))
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ['parquet'] and
v.get_value('table_format').compression_codec in ['none'])
v.get_value('table_format').file_format in ['parquet']
and v.get_value('table_format').compression_codec in ['none'])
# Run these queries through both beeswax and HS2 to get coverage of CHAR/VARCHAR
# returned via both protocols.
cls.ImpalaTestMatrix.add_dimension(create_client_protocol_dimension())
cls.ImpalaTestMatrix.add_constraint(hs2_parquet_constraint)
def test_string_functions(self, vector):
self.run_test_case('QueryTest/utf8-string-functions', vector)