mirror of
https://github.com/apache/impala.git
synced 2025-12-20 02:20:11 -05:00
This patch adds support for COS (Cloud Object Storage). Using hadoop-cos, the implementation is similar to that of other remote FileSystems. New flags for COS: - num_cos_io_threads: Number of COS I/O threads. Defaults to 16. Follow-up: - Support for caching COS file handles will be addressed in IMPALA-10772. - test_concurrent_inserts and test_failing_inserts in test_acid_stress.py are skipped due to slow file listing on COS (IMPALA-10773). Tests: - Upload hdfs test data to a COS bucket. Modify all locations in HMS DB to point to the COS bucket. Remove some hdfs caching params. Run CORE tests. Change-Id: Idce135a7591d1b4c74425e365525be3086a39821 Reviewed-on: http://gerrit.cloudera.org:8080/17503 Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
128 lines
5.2 KiB
Python
128 lines
5.2 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
import re
|
|
from time import sleep
|
|
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
|
from tests.common.skip import (
|
|
SkipIfS3,
|
|
SkipIfABFS,
|
|
SkipIfADLS,
|
|
SkipIfGCS,
|
|
SkipIfCOS,
|
|
SkipIfIsilon,
|
|
SkipIfLocal)
|
|
from tests.util.hive_utils import HiveDbWrapper
|
|
|
|
@SkipIfS3.hive
@SkipIfGCS.hive
@SkipIfCOS.hive
@SkipIfABFS.hive
@SkipIfADLS.hive
@SkipIfIsilon.hive
@SkipIfLocal.hive
class TestMetadataReplicas(CustomClusterTestSuite):
  """Checks that catalogd and every impalad report the same catalog objects.

  Each test ends by comparing the result of get_catalog_objects() on catalogd
  with the result of the same call on every impalad in the cluster."""

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload used by this suite."""
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    """Skips the whole class unless the exploration strategy is 'exhaustive'.

    The check runs before the parent setup_class() so that nothing is set up
    when the suite is going to be skipped anyway."""
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestMetadataReplicas, cls).setup_class()

  @pytest.mark.execute_serially
  def test_start(self):
    """Baseline check that the initial state is identical everywhere. No
    DDL/DML is processed, so no objects are fully loaded."""
    self.__validate_metadata()

  @pytest.mark.execute_serially
  def test_catalog_restart(self, testid_checksum):
    """IMPALA-6948: reproduces the issue by dropping a table from Hive while
    catalogd is down. If the regression is present, the dropped table is still
    present at the impalads after catalogd has been restarted.

    'testid_checksum' is only used to build a database name for this test."""
    db_name = "test_catalog_restart_%s" % testid_checksum
    try:
      with HiveDbWrapper(self, db_name):
        # Issue several invalidates to boost the version for the current
        # incarnation of the catalog. As a result, the table we'll add to Hive
        # gets a version that is easy to recognise as higher than the highest
        # version of the restarted catalogd incarnation.
        for _ in range(50):
          self.client.execute("invalidate metadata functional.alltypes")
        assert self.cluster.catalogd.service.get_catalog_version() >= 50

        # Create a table through Hive and make it visible to Impala.
        self.run_stmt_in_hive("create table %s.x (a string)" % db_name)
        self.client.execute("invalidate metadata %s.x" % db_name)
        assert "x" in self.client.execute("show tables in %s" % db_name).data

        # Stop the catalog, drop the table behind its back using Hive, then
        # bring the catalog up again.
        self.cluster.catalogd.kill()
        self.run_stmt_in_hive("drop table %s.x" % db_name)
        self.cluster.catalogd.start()
        # Refresh the state of the catalogd process.
        self.cluster.refresh()

        # Wait until the impalad catalog versions agree with catalogd's version.
        catalogd_version = self.cluster.catalogd.service.get_catalog_version()
        for impalad in self.cluster.impalads:
          impalad.service.wait_for_metric_value("catalog.curr-version", catalogd_version)

        self.__validate_metadata()
    except Exception as e:
      # AssertionError is an Exception too, so failed checks above are also
      # reported through this message.
      assert False, "Unexpected exception: " + str(e)
    finally:
      # Hack to work-around IMPALA-5695.
      self.cluster.catalogd.kill()

  def __validate_metadata(self):
    """Diffs the catalog objects of catalogd against those of each impalad and
    asserts that there is no difference of any kind.

    The impalad index in each assertion message is the position of that
    impalad in self.cluster.impalads."""
    c_objects = self.cluster.catalogd.service.get_catalog_objects()
    i_objects = [proc.service.get_catalog_objects() for proc in self.cluster.impalads]

    for idx, i_obj in enumerate(i_objects):
      diff = self.__diff_catalog_objects(c_objects, i_obj)
      assert diff[0] == {},\
          'catalogd has objects not in impalad(%d): %s ' % (idx, diff[0])
      assert diff[1] == {}, 'impalad(%d) has objects not in catalogd: %s' % (idx, diff[1])
      assert diff[2] is None,\
          'impalad(%d) and catalogd version for objects differs: %s' % (idx, diff[2])

  def __diff_catalog_objects(self, a, b):
    """Computes the diff between the input 'a' and 'b' dictionaries.

    Returns a list of length 3:
      position 0: dict of the entries whose key is in a, but not in b;
      position 1: dict of the entries whose key is in b, but not in a;
      position 2: dict mapping each key that is in both a and b, but whose
                  values differ, to the tuple (a[key], b[key]); None when no
                  such key exists.

    A key whose value in b is None is never reported in position 2."""
    only_in_a = dict((k, a[k]) for k in set(a) - set(b))
    only_in_b = dict((k, b[k]) for k in set(b) - set(a))

    mismatched = {}
    for k, v_a in a.items():
      # Keys missing from b are already reported in position 0; indexing b
      # with them would raise KeyError.
      if k not in b:
        continue
      v_b = b[k]
      if v_b is not None and v_b != v_a:
        mismatched[k] = (v_a, v_b)

    # Position 2 stays None when nothing differs, which is what callers test for.
    return [only_in_a, only_in_b, mismatched or None]
|