IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding

This change adds end-to-end (EE) tests in
test_parquet_byte_stream_split_encoding.py that check that Impala returns
the correct error message when it encounters a table that contains a
Parquet file with Byte Stream Split encoding.

To regenerate the test files, run the parquet_files_generator.py
script in the testdata/parquet_byte_stream_split_encoding/ folder.

Change-Id: If5eff8bf51fe246a9d0250e38c470b821fec75d9
Reviewed-on: http://gerrit.cloudera.org:8080/22124
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Gabriella Gyorgyevics
2024-11-26 15:15:33 +01:00
committed by Impala Public Jenkins
parent 19110b490d
commit 8aea57fc77
5 changed files with 103 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
The doubles_byte_stream_split.parquet and floats_byte_stream_split.parquet files were
generated with the parquet_files_generator.py script. The script uses PyArrow
(https://arrow.apache.org/docs/python).
To regenerate the files, run the following from the directory that contains the
testdata/ folder:
python3 testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py

View File

@@ -0,0 +1,40 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import os
import pyarrow as pa
import pyarrow.parquet as pq
# Generates the Parquet files used by the Byte Stream Split negative tests: one file
# with a single FLOAT column and one with a single DOUBLE column, both written by
# PyArrow with byte-stream-split encoding requested.

# Write the files next to this script so the result does not depend on the directory
# the script is launched from. A path relative to the current working directory fails
# when the script is started from inside its own folder.
test_file_dir = os.path.dirname(os.path.abspath(__file__))

# The same sample values are stored once as 32-bit floats and once as 64-bit doubles.
nums_to_encode = [1.45, 4.256, 6.3573, 4.235, 7.5198463, 10.57956, 100.68491,
    0.54987623514, 1.0]

# Dictionary encoding is switched off explicitly; PyArrow prefers dictionary encoding
# when both options are enabled, which would defeat the purpose of these files.
floats = pa.array(nums_to_encode, type=pa.float32())
floats_table = pa.table([floats], names=["floats"])
pq.write_table(floats_table,
    os.path.join(test_file_dir, 'floats_byte_stream_split.parquet'),
    use_dictionary=False, use_byte_stream_split=True)

doubles = pa.array(nums_to_encode, type=pa.float64())
doubles_table = pa.table([doubles], names=["doubles"])
pq.write_table(doubles_table,
    os.path.join(test_file_dir, 'doubles_byte_stream_split.parquet'),
    use_dictionary=False, use_byte_stream_split=True)

View File

@@ -0,0 +1,57 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import
import os
from tests.common.file_utils import create_table_and_copy_files
from tests.common.impala_test_suite import ImpalaTestSuite
class TestParquetEncodings(ImpalaTestSuite):
  """Negative tests for Parquet files written with Byte Stream Split encoding.

  Each test copies a pre-generated Parquet file into a freshly created table and
  checks that querying the table fails with an "unsupported encoding" error.
  """

  # Directory that holds the pre-generated Parquet test files.
  TEST_FILE_DIRECTORY = "testdata/parquet_byte_stream_split_encoding"

  @classmethod
  def get_workload(cls):
    """Return the name of the workload these tests belong to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Add the inherited test dimensions, then keep only the parquet table format."""
    super(TestParquetEncodings, cls).add_test_dimensions()

    def is_parquet(v):
      return v.get_value('table_format').file_format == 'parquet'

    cls.ImpalaTestMatrix.add_constraint(is_parquet)

  def test_parquet_byte_stream_split_encoding_float(self, vector, unique_database):
    """Check the error for a FLOAT column backed by the byte-stream-split file."""
    parquet_file = os.path.join(self.TEST_FILE_DIRECTORY,
        "floats_byte_stream_split.parquet")
    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "float",
        parquet_file)

  def test_parquet_byte_stream_split_encoding_double(self, vector, unique_database):
    """Check the error for a DOUBLE column backed by the byte-stream-split file."""
    parquet_file = os.path.join(self.TEST_FILE_DIRECTORY,
        "doubles_byte_stream_split.parquet")
    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "double",
        parquet_file)

  def _parquet_byte_stream_split_encoding_helper(self, vector, unique_database, col_type,
      filename):
    """Create a one-column table over 'filename' and assert that reading it fails.

    'vector' is accepted for symmetry with the test methods but is not used here.
    'unique_database' is the database the table is created in, 'col_type' is the SQL
    type of the single 'numbers' column, and 'filename' is the Parquet file copied into
    the table.
    """
    tbl = "parquet_byte_stream_split_negative_test"
    ddl = "create table {}.{} (numbers {}) stored as parquet".format(
        unique_database, tbl, col_type)
    create_table_and_copy_files(self.client, ddl, unique_database, tbl, [filename])

    # The scan is expected to fail; the failure text must name the encoding.
    failure = self.execute_query_expect_failure(
        self.client, "select * from {}.{}".format(unique_database, tbl))
    assert "unsupported encoding: BYTE_STREAM_SPLIT" in str(failure)