Mirror of https://github.com/apache/impala.git (synced 2025-12-19)
This removes Impala-lzo from the Impala development environment. Impala-lzo is
not built as part of the Impala build, the LZO plugin is no longer loaded, LZO
tables are not loaded during dataload, and LZO is no longer tested. This also
removes some obsolete scan APIs that were only used by Impala-lzo; with this
commit, Impala-lzo would require code changes to build against Impala. The
plugin infrastructure is not removed, and some LZO support code is left in
place, so if someone were to revive Impala-lzo, they would still be able to
load it as a plugin and get the same functionality as before. This plugin
support may be removed later.

Testing:
- Dryrun of GVO
- Modified TestPartitionMetadataUncompressedTextOnly's
  test_unsupported_text_compression() to add an LZO case

Change-Id: I3a4f12247d8872b7e14c9feb4b2c58cfd60d4c0e
Reviewed-on: http://gerrit.cloudera.org:8080/15814
Reviewed-by: Bikramjeet Vig <bikramjeet.vig@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
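The modified test itself is not shown on this page. As a rough, hypothetical
sketch of the kind of case the commit describes (the helper and table names
below are invented for illustration; the real change lives in
TestPartitionMetadataUncompressedTextOnly):

  # Hypothetical sketch only, not the actual diff from this commit.
  def test_unsupported_text_compression(self, vector, unique_database):
    fq_table = "{0}.lzo_tbl".format(unique_database)
    # Point a text partition at an .lzo file (this helper name is invented) and
    # expect the scan to fail now that the LZO plugin is no longer loaded.
    self._add_text_partition_with_file(fq_table, "000000_0.lzo")
    err = self.execute_query_expect_failure(
        self.client, "select count(*) from {0}".format(fq_table))
    assert "compression" in str(err).lower()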
115 lines | 5.2 KiB | Python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Tests for Hive-Impala text compression codec interoperability.

import pytest

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.environ import HIVE_MAJOR_VERSION
from tests.common.skip import SkipIfS3
from tests.common.test_dimensions import create_exec_option_dimension
from tests.common.test_result_verifier import verify_query_result_is_equal

# Compression codecs that Impala supports reading in the text file format.
TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'bzip2', 'deflate', 'default']
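# Note that LZO is deliberately absent: per the commit above, the LZO plugin is
# no longer loaded or tested. 'default' selects Hadoop's zlib-based DefaultCodec.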

class TestTextInterop(CustomClusterTestSuite):

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestTextInterop, cls).setup_class()

  @classmethod
  def add_test_dimensions(cls):
    # Deliberately bypass CustomClusterTestSuite's add_test_dimensions() so the
    # base test matrix from ImpalaTestSuite is used.
    super(CustomClusterTestSuite, cls).add_test_dimensions()
    # Fix the exec_option vector to have a single value.
    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
        cluster_sizes=[0], disable_codegen_options=[False], batch_sizes=[0],
        sync_ddl=[1]))
    cls.ImpalaTestMatrix.add_constraint(
        lambda v: v.get_value('table_format').file_format == 'textfile')

  @SkipIfS3.hive
  @pytest.mark.execute_serially
  def test_hive_impala_interop(self, unique_database, cluster_properties):
    """Tests that compressed text files written by Hive with different codecs
    can be read from Impala, and verifies the results."""
    # Set up the source table.
    source_table = "{0}.{1}".format(unique_database, "t1_source")
    # TODO: Once IMPALA-8721 is fixed, add coverage for the TIMESTAMP data type.
    self.execute_query_expect_success(self.client,
        "create table {0} stored as textfile as select id, bool_col, tinyint_col, "
        "smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, "
        "string_col, year, month from functional_parquet.alltypes".format(source_table))
    self.execute_query_expect_success(self.client,
        "insert into {0}(id) values (7777), (8888), (9999), (11111), (22222), (33333)"
        .format(source_table))

    # For Hive 3+, work around HIVE-22371 (CTAS puts files in the wrong place) by
    # explicitly creating an external table so that the files land in the external
    # warehouse directory. Use external.table.purge=true so that the table behaves
    # like a Hive 2 managed table. Hive 2 stays the same.
    external = ""
    tblproperties = ""
    if HIVE_MAJOR_VERSION >= 3:
      external = "external"
      tblproperties = "TBLPROPERTIES('external.table.purge'='TRUE')"
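    # For illustration: with HIVE_MAJOR_VERSION >= 3 and codec 'gzip', the Hive
    # statement assembled below expands to roughly
    #   set hive.exec.compress.output=true;
    #   set mapreduce.output.fileoutputformat.compress.codec=
    #     org.apache.hadoop.io.compress.GzipCodec;
    #   create external table <db>.t1_hive stored as textfile
    #     TBLPROPERTIES('external.table.purge'='TRUE')
    #     as select * from <db>.t1_source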
    # Loop through the compression codecs and run the interop tests.
    for codec in TEXT_CODECS:
      # Write data in Hive and read it back in Impala.
      # Map the codec name to a codec class name that Hive accepts.
      switcher = {
          'snappy': 'org.apache.hadoop.io.compress.SnappyCodec',
          'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
          'zstd': 'org.apache.hadoop.io.compress.ZStandardCodec',
          'bzip2': 'org.apache.hadoop.io.compress.BZip2Codec',
          'deflate': 'org.apache.hadoop.io.compress.DeflateCodec',
          'default': 'org.apache.hadoop.io.compress.DefaultCodec'
      }
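      # e.g. switcher.get('zstd', 'Invalid codec') yields
      # 'org.apache.hadoop.io.compress.ZStandardCodec'; an unknown codec falls
      # back to the 'Invalid codec' sentinel, which would fail the Hive statement
      # below.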
      hive_table = "{0}.{1}".format(unique_database, "t1_hive")
      self.run_stmt_in_hive("drop table if exists {0}".format(hive_table))
      self.run_stmt_in_hive("set hive.exec.compress.output=true;\
          set mapreduce.output.fileoutputformat.compress.codec={0};\
          create {1} table {2} stored as textfile {3} as select * from {4}"
          .format(switcher.get(codec, 'Invalid codec'), external, hive_table,
                  tblproperties, source_table))

      # Make sure the Hive CTAS table is not empty.
      assert self.run_stmt_in_hive("select count(*) from {0}".format(
          hive_table)).split("\n")[1] != "0", "CTAS created Hive table is empty."

      # Make sure Impala's metadata is in sync: with the v2 (local) catalog the
      # new table shows up via HMS event processing; otherwise invalidate the
      # table's metadata explicitly.
      if cluster_properties.is_catalog_v2_cluster():
        self.wait_for_table_to_appear(unique_database, hive_table, timeout_s=10)
      else:
        self.client.execute("invalidate metadata {0}".format(hive_table))

      # Read the Hive data in Impala and verify the results.
      base_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(source_table))
      test_result = self.execute_query_expect_success(self.client,
          "select * from {0} order by id".format(hive_table))
      verify_query_result_is_equal(test_result.data, base_result.data)
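
# A typical way to run just this suite (assuming a standard Impala development
# environment; the script name, file path, and flag below follow Impala's test
# framework conventions and may differ in your checkout) would be:
#   bin/impala-py.test tests/custom_cluster/test_hive_text_codec_interop.py \
#     --exploration_strategy=exhaustive
# Note that setup_class() skips the whole class unless the exploration strategy
# is 'exhaustive'.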