IMPALA-13211: Add negative test for Parquet Byte Stream Split encoding

This change adds end-to-end (EE) tests in test_parquet_byte_stream_split_encoding.py that check that Impala returns the correct error message when it encounters a table that contains a Parquet file with Byte Stream Split encoding. To regenerate the test files, run the parquet_files_generator.py script located in the testdata/parquet_byte_stream_split_encoding/ folder.

Change-Id: If5eff8bf51fe246a9d0250e38c470b821fec75d9
Reviewed-on: http://gerrit.cloudera.org:8080/22124
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2024-11-26 15:15:33 +01:00
parent 19110b490d
commit 8aea57fc77
5 changed files with 103 additions and 0 deletions
--- a/testdata/parquet_byte_stream_split_encoding/README
+++ b/testdata/parquet_byte_stream_split_encoding/README
@@ -0,0 +1,6 @@
+The doubles_byte_stream_split.parquet and floats_byte_stream_split.parquet files were
+generated with the parquet_files_generator.py script. The script uses PyArrow
+(https://arrow.apache.org/docs/python).
+
+To regenerate the files, run the following from the root of the Impala repository:
+python3 testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
--- a/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet
+++ b/testdata/parquet_byte_stream_split_encoding/doubles_byte_stream_split.parquet
--- a/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet
+++ b/testdata/parquet_byte_stream_split_encoding/floats_byte_stream_split.parquet
--- a/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
+++ b/testdata/parquet_byte_stream_split_encoding/parquet_files_generator.py
@@ -0,0 +1,40 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+import os
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+test_file_dir = "testdata/parquet_byte_stream_split_encoding"
+
+nums_to_encode = [1.45, 4.256, 6.3573, 4.235, 7.5198463, 10.57956, 100.68491,
+    0.54987623514, 1.0]
+
+floats = pa.array(nums_to_encode, type=pa.float32())
+floats_table = pa.table([floats], names=["floats"])
+pq.write_table(floats_table, os.path.join(test_file_dir,
+    'floats_byte_stream_split.parquet'), use_dictionary=False,
+    use_byte_stream_split=True)
+
+doubles = pa.array(nums_to_encode, type=pa.float64())
+doubles_table = pa.table([doubles], names=["doubles"])
+pq.write_table(doubles_table, os.path.join(test_file_dir,
+    'doubles_byte_stream_split.parquet'), use_dictionary=False,
+    use_byte_stream_split=True)
--- a/tests/query_test/test_parquet_byte_stream_split_encoding.py
+++ b/tests/query_test/test_parquet_byte_stream_split_encoding.py
@@ -0,0 +1,57 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+
+import os
+
+from tests.common.file_utils import create_table_and_copy_files
+from tests.common.impala_test_suite import ImpalaTestSuite
+
+
+class TestParquetEncodings(ImpalaTestSuite):
+
+  TEST_FILE_DIRECTORY = "testdata/parquet_byte_stream_split_encoding"
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestParquetEncodings, cls).add_test_dimensions()
+    cls.ImpalaTestMatrix.add_constraint(
+        lambda v: v.get_value('table_format').file_format == 'parquet')
+
+  def test_parquet_byte_stream_split_encoding_float(self, vector, unique_database):
+    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "float",
+        os.path.join(self.TEST_FILE_DIRECTORY, "floats_byte_stream_split.parquet"))
+
+  def test_parquet_byte_stream_split_encoding_double(self, vector, unique_database):
+    self._parquet_byte_stream_split_encoding_helper(vector, unique_database, "double",
+        os.path.join(self.TEST_FILE_DIRECTORY, "doubles_byte_stream_split.parquet"))
+
+  def _parquet_byte_stream_split_encoding_helper(self, vector, unique_database, col_type,
+      filename):
+    table_name = "parquet_byte_stream_split_negative_test"
+    create_stmt = "create table {}.{} (numbers {}) stored as parquet".format(
+        unique_database, table_name, col_type)
+    create_table_and_copy_files(self.client, create_stmt, unique_database, table_name,
+                                [filename])
+    query_stmt = "select * from {}.{}".format(unique_database, table_name)
+    result = self.execute_query_expect_failure(self.client, query_stmt)
+    assert "unsupported encoding: BYTE_STREAM_SPLIT" in str(result)