This patch leverages the AdlFileSystem in Hadoop to allow Impala to talk to the Azure Data Lake Store (ADLS). It contains functional changes and also adds test infrastructure for running Impala tests over ADLS. We do not support ACLs on ADLS, since the Hadoop ADLS connector does not integrate ADLS ACLs with Hadoop users/groups.

For testing, we use the azure-data-lake-store-python client from Microsoft. This client appears to have some consistency issues. For example, a DROP TABLE through Impala deletes the files in ADLS, yet listing that directory through the Python client immediately after the drop can still show the files. This behavior is unexpected, since ADLS claims to be strongly consistent. Some tests have been skipped due to this limitation with the tag SkipIfADLS.slow_client; this is tracked by IMPALA-5335.

The azure-data-lake-store-python client also only works on CentOS 6.6 and above, so the Python dependencies for Azure are not downloaded when TARGET_FILESYSTEM is not "adls". When running the ADLS tests, the expectation is that they run on a machine with at least CentOS 6.6. Note that this is only a test limitation, not a functional one: clusters with older OSes such as CentOS 6.4 will still work with ADLS.

Added another dependency to bootstrap_build.sh for the ADLS Python client.

Testing: Ran core tests with and without TARGET_FILESYSTEM set to 'adls' to make sure that all tests pass and that nothing breaks.

Change-Id: Ic56b9988b32a330443f24c44f9cb2c80842f7542
Reviewed-on: http://gerrit.cloudera.org:8080/6910
Tested-by: Impala Public Jenkins
Reviewed-by: Sailesh Mukil <sailesh@cloudera.com>
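The SkipIfADLS.slow_client tag mentioned above is an ordinary pytest skipif marker. Below is a minimal sketch of how such a marker can be defined; it assumes the test framework exposes the target filesystem through an IS_ADLS flag, and it is illustrative rather than a copy of the actual tests/common/skip.py:

import pytest
from tests.util.filesystem_utils import IS_ADLS  # assumed flag, set when TARGET_FILESYSTEM is "adls"

class SkipIfADLS:
  # Skip tests that are sensitive to the stale-listing behavior observed with
  # the Python ADLS client (IMPALA-5335).
  slow_client = pytest.mark.skipif(IS_ADLS,
      reason="IMPALA-5335: ADLS Python client can return stale directory listings")

A marker like this is then stacked on a test method in the same way as the @SkipIfADLS.hive decorators in the file below.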
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Impala tests for queries that query metadata and set session settings

import pytest
import re

from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfIsilon, SkipIfS3, SkipIfADLS, SkipIfLocal
from tests.common.test_dimensions import ALL_NODES_ONLY
from tests.common.test_dimensions import create_exec_option_dimension
from tests.common.test_dimensions import create_uncompressed_text_dimension
from tests.util.filesystem_utils import get_fs_path

# TODO: For these tests to pass, all table metadata must be created exhaustively.
# The tests should be modified to remove that requirement.
class TestMetadataQueryStatements(ImpalaTestSuite):

  CREATE_DATA_SRC_STMT = ("CREATE DATA SOURCE %s LOCATION '" +
      get_fs_path("/test-warehouse/data-sources/test-data-source.jar") +
      "' CLASS 'org.apache.impala.extdatasource.AllTypesDataSource' API_VERSION 'V1'")
  DROP_DATA_SRC_STMT = "DROP DATA SOURCE IF EXISTS %s"
  TEST_DATA_SRC_NAMES = ["show_test_ds1", "show_test_ds2"]
  AVRO_SCHEMA_LOC = get_fs_path("/test-warehouse/avro_schemas/functional/alltypes.json")

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestMetadataQueryStatements, cls).add_test_dimensions()
    sync_ddl_opts = [0, 1]
    if cls.exploration_strategy() != 'exhaustive':
      # Cut down on test runtime by only running with SYNC_DDL=0.
      sync_ddl_opts = [0]

    cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
        cluster_sizes=ALL_NODES_ONLY,
        disable_codegen_options=[False],
        batch_sizes=[0],
        sync_ddl=sync_ddl_opts))
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_use(self, vector):
    self.run_test_case('QueryTest/use', vector)

  def test_show(self, vector):
    self.run_test_case('QueryTest/show', vector)

  def test_show_stats(self, vector):
    self.run_test_case('QueryTest/show-stats', vector, "functional")

  def test_describe_path(self, vector, unique_database):
    self.run_test_case('QueryTest/describe-path', vector, unique_database)

  # Missing coverage: DESCRIBE FORMATTED compatibility between Impala and Hive when the
  # data does not reside in HDFS.
  @SkipIfIsilon.hive
  @SkipIfS3.hive
  @SkipIfADLS.hive
  @SkipIfLocal.hive
  def test_describe_formatted(self, vector, unique_database):
    # For DESCRIBE FORMATTED, we try to match Hive's output as closely as possible.
    # However, we're inconsistent with our handling of NULLs vs. theirs: Impala
    # sometimes specifies 'NULL' where Hive uses an empty string, and Hive sometimes
    # specifies 'null' with padding where Impala uses a sequence of blank spaces. For
    # now we want to leave it that way so as not to affect users who rely on this
    # output.
    def compare_describe_formatted(impala_results, hive_results):
      for impala, hive in zip(re.split(',|\n', impala_results),
                              re.split(',|\n', hive_results)):
        if impala != hive:
          # If they don't match, check if it's because of the inconsistent null handling.
          impala = impala.replace(' ', '').lower()
          hive = hive.replace(' ', '').lower()
          if not ((impala == "'null'" and hive == "''") or
                  (impala == "''" and hive == "'null'")):
            return False
      return True
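    # A quick illustration of the comparator's tolerance, using hypothetical inputs:
    #   compare_describe_formatted("'a','NULL'", "'a',''") returns True, because the
    #   only mismatch is the documented NULL-vs-empty inconsistency, whereas
    #   compare_describe_formatted("'a','x'", "'a','y'") returns False.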

    # Describe a partitioned table.
    self.exec_and_compare_hive_and_impala_hs2("describe formatted functional.alltypes",
        compare=compare_describe_formatted)
    self.exec_and_compare_hive_and_impala_hs2(
        "describe formatted functional_text_lzo.alltypes",
        compare=compare_describe_formatted)

    # Describe an unpartitioned table.
    self.exec_and_compare_hive_and_impala_hs2("describe formatted tpch.lineitem",
        compare=compare_describe_formatted)
    self.exec_and_compare_hive_and_impala_hs2("describe formatted functional.jointbl",
        compare=compare_describe_formatted)

    # Create and describe an unpartitioned and a partitioned Avro table created by
    # Impala without any column definitions.
    # TODO: Instead of creating new tables here, change one of the existing Avro
    # tables to be created without any column definitions.
    self.client.execute("create database if not exists %s" % unique_database)
    self.client.execute((
        "create table %s.%s with serdeproperties ('avro.schema.url'='%s') stored as avro"
        % (unique_database, "avro_alltypes_nopart", self.AVRO_SCHEMA_LOC)))
    self.exec_and_compare_hive_and_impala_hs2("describe formatted avro_alltypes_nopart",
        compare=compare_describe_formatted)

    self.client.execute((
        "create table %s.%s partitioned by (year int, month int) "
        "with serdeproperties ('avro.schema.url'='%s') stored as avro"
        % (unique_database, "avro_alltypes_part", self.AVRO_SCHEMA_LOC)))
    self.exec_and_compare_hive_and_impala_hs2("describe formatted avro_alltypes_part",
        compare=compare_describe_formatted)

    self.exec_and_compare_hive_and_impala_hs2(
        "describe formatted functional.alltypes_view_sub",
        compare=compare_describe_formatted)

  @pytest.mark.execute_serially  # due to data source setup/teardown
  def test_show_data_sources(self, vector):
    try:
      self.__create_data_sources()
      self.run_test_case('QueryTest/show-data-sources', vector)
    finally:
      self.__drop_data_sources()

  def __drop_data_sources(self):
    for name in self.TEST_DATA_SRC_NAMES:
      self.client.execute(self.DROP_DATA_SRC_STMT % (name,))

  def __create_data_sources(self):
    self.__drop_data_sources()
    for name in self.TEST_DATA_SRC_NAMES:
      self.client.execute(self.CREATE_DATA_SRC_STMT % (name,))

  @SkipIfS3.hive
  @SkipIfADLS.hive
  @SkipIfIsilon.hive
  @SkipIfLocal.hive
  @pytest.mark.execute_serially  # because of invalidate metadata
  def test_describe_db(self, vector):
    self.__test_describe_db_cleanup()
    try:
      self.client.execute("create database impala_test_desc_db1")
      self.client.execute("create database impala_test_desc_db2 "
                          "comment 'test comment'")
      self.client.execute("create database impala_test_desc_db3 "
                          "location '" + get_fs_path("/testdb") + "'")
      self.client.execute("create database impala_test_desc_db4 comment 'test comment' "
                          "location \"" + get_fs_path("/test2.db") + "\"")
      self.run_stmt_in_hive("create database hive_test_desc_db comment 'test comment' "
                            "with dbproperties('pi' = '3.14', 'e' = '2.82')")
      self.run_stmt_in_hive("alter database hive_test_desc_db set owner user test")
      self.client.execute("invalidate metadata")
      self.run_test_case('QueryTest/describe-db', vector)
    finally:
      self.__test_describe_db_cleanup()

  def __test_describe_db_cleanup(self):
    self.cleanup_db('hive_test_desc_db')
    self.cleanup_db('impala_test_desc_db1')
    self.cleanup_db('impala_test_desc_db2')
    self.cleanup_db('impala_test_desc_db3')
    self.cleanup_db('impala_test_desc_db4')