Mirror of https://github.com/apache/impala.git, synced 2025-12-20 02:20:11 -05:00
This patch adds support for COS (Cloud Object Storage). Using hadoop-cos,
the implementation is similar to that of other remote FileSystems.

New flags for COS:
- num_cos_io_threads: Number of COS I/O threads. Defaults to 16.

Follow-up:
- Support for caching COS file handles will be addressed in IMPALA-10772.
- test_concurrent_inserts and test_failing_inserts in test_acid_stress.py
  are skipped due to slow file listing on COS (IMPALA-10773).

Tests:
- Uploaded HDFS test data to a COS bucket, modified all locations in the
  HMS DB to point to the COS bucket, removed some HDFS caching params, and
  ran CORE tests.

Change-Id: Idce135a7591d1b4c74425e365525be3086a39821
Reviewed-on: http://gerrit.cloudera.org:8080/17503
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
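Since --num_cos_io_threads is a regular impalad startup flag, it can be
exercised with the same with_args pattern used throughout the test file
below. A minimal sketch, not part of the patch: the test class, the thread
count, and the query target (functional.alltypes) are illustrative
assumptions.

import pytest

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite


class TestCosIoThreads(CustomClusterTestSuite):

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(impalad_args="--num_cos_io_threads=32")
  def test_custom_cos_io_thread_count(self, vector):
    # Starts the cluster with a non-default COS I/O thread pool size (the
    # flag defaults to 16) and checks that a simple scan still works.
    self.execute_query("select count(*) from functional.alltypes", vector=vector)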
223 lines · 9.0 KiB · Python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIfLocal
from tests.util.filesystem_utils import (
    IS_ISILON,
    IS_ADLS,
    IS_GCS,
    IS_COS)
from time import sleep


@SkipIfLocal.hdfs_fd_caching
class TestHdfsFdCaching(CustomClusterTestSuite):
  """Tests that if HDFS file handle caching is enabled, file handles are actually cached
  and the associated metrics return valid results. In addition, tests that the upper
  bound on the number of cached file handles is respected."""

  NUM_ROWS = 100
  INSERT_TPL = "insert into cachefd.simple values"

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def create_n_files(self, n):
    """Creates 'n' files by performing 'n' inserts of NUM_ROWS rows each."""
    values = ", ".join(["({0},{0},{0})".format(x) for x in range(self.NUM_ROWS)])
    for _ in range(n):
      self.client.execute(self.INSERT_TPL + values)

  def setup_method(self, method):
    super(TestHdfsFdCaching, self).setup_method(method)
    impalad = self.cluster.impalads[0]
    client = impalad.service.create_beeswax_client()

    self.client = client
    client.execute("drop database if exists cachefd cascade")
    client.execute("create database cachefd")
    client.execute("create table cachefd.simple(id int, col1 int, col2 int) "
                   "stored as parquet")
    self.create_n_files(1)

  def teardown_method(self, method):
    super(TestHdfsFdCaching, self).teardown_method(method)
    self.client.execute("drop database if exists cachefd cascade")

  def run_fd_caching_test(self, vector, caching_expected, cache_capacity,
                          eviction_timeout_secs):
    """
    Tests that HDFS file handles are cached as expected. This is used both
    for the positive and negative test cases. If caching_expected is true,
    this verifies that the cache adheres to the specified capacity and that
    repeated queries across the same files reuse the cached file handles.
    If caching_expected is false, it verifies that the cache does not
    change in size while running queries.
    """

    # Maximum number of file handles cached (applies whether or not caching
    # is expected).
    assert self.max_cached_handles() <= cache_capacity

    num_handles_start = self.cached_handles()
    # The table has one file. If caching is expected, there should be one more
    # handle cached after the first select. If caching is not expected, the
    # number of handles should not change from the initial number.
    self.execute_query("select * from cachefd.simple", vector=vector)
    num_handles_after = self.cached_handles()
    assert self.max_cached_handles() <= cache_capacity

    if caching_expected:
      assert num_handles_after == (num_handles_start + 1)
    else:
      assert num_handles_after == num_handles_start

    # No open handles once scanning is finished.
    assert self.outstanding_handles() == 0

    # No change when reading the table again.
    for x in range(10):
      self.execute_query("select * from cachefd.simple", vector=vector)
      assert self.cached_handles() == num_handles_after
      assert self.max_cached_handles() <= cache_capacity
      assert self.outstanding_handles() == 0

    # Create more files than the cache capacity. The capacity should still be
    # enforced.
    self.create_n_files(cache_capacity + 100)

    # Read all the files of the table and make sure there is no FD leak.
    for x in range(10):
      self.execute_query("select count(*) from cachefd.simple;", vector=vector)
      assert self.max_cached_handles() <= cache_capacity
      if not caching_expected:
        assert self.cached_handles() == num_handles_start
    assert self.outstanding_handles() == 0

    if caching_expected and eviction_timeout_secs is not None:
      # To test unused file handle eviction, sleep for longer than the timeout.
      # All the cached handles should be evicted.
      sleep(eviction_timeout_secs + 5)
      assert self.cached_handles() == 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--max_cached_file_handles=16 "
                   "--unused_file_handle_timeout_sec=18446744073709551600",
      catalogd_args="--load_catalog_in_background=false")
  def test_caching_enabled(self, vector):
    """
    Test of the HDFS file handle cache with caching enabled and a very large
    unused file handle timeout.
    """
    cache_capacity = 16

    # File handle caching applies to HDFS, S3, and ABFS files. On those
    # filesystems, verify that caching works. Otherwise, verify that file
    # handles are not cached.
    if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
      caching_expected = False
    else:
      caching_expected = True
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--max_cached_file_handles=16 --unused_file_handle_timeout_sec=5",
      catalogd_args="--load_catalog_in_background=false")
  def test_caching_with_eviction(self, vector):
    """Test of the HDFS file handle cache with unused file handle eviction enabled."""
    cache_capacity = 16
    handle_timeout = 5

    # Only test eviction on platforms where caching is enabled.
    if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
      return
    caching_expected = True
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, handle_timeout)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--max_cached_file_handles=0",
      catalogd_args="--load_catalog_in_background=false")
  def test_caching_disabled_by_param(self, vector):
    """Test that the HDFS file handle cache is disabled when the parameter is zero."""
    cache_capacity = 0
    caching_expected = False
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args="--max_cached_file_handles=16 --unused_file_handle_timeout_sec=5 "
                   "--always_use_data_cache=true",
      start_args="--data_cache_dir=/tmp --data_cache_size=500MB",
      catalogd_args="--load_catalog_in_background=false")
  def test_no_fd_caching_on_cached_data(self, vector):
    """IMPALA-10147: Tests that no file handle is opened or cached again once data
    is being read from the data cache."""
    cache_capacity = 16
    eviction_timeout_secs = 5

    # Only run on platforms where file handle caching is enabled.
    if IS_ADLS or IS_ISILON or IS_GCS or IS_COS:
      return

    # Maximum number of file handles cached.
    assert self.max_cached_handles() <= cache_capacity

    num_handles_start = self.cached_handles()
    # The table has one file, so there should be one more handle cached after
    # the first select. Read 5 times to make sure the data cache is fully
    # warmed up.
    for x in range(5):
      self.execute_query("select * from cachefd.simple", vector=vector)
    num_handles_after = self.cached_handles()
    assert self.max_cached_handles() <= cache_capacity
    assert num_handles_after == (num_handles_start + 1)

    # No open handles once scanning is finished.
    assert self.outstanding_handles() == 0

    # To test unused file handle eviction, sleep for longer than the timeout.
    # All the cached handles should be evicted.
    sleep(eviction_timeout_secs + 5)
    assert self.cached_handles() == 0

    # Reread from the data cache. No handle should be opened or cached again.
    for x in range(10):
      self.execute_query("select * from cachefd.simple", vector=vector)
      assert self.cached_handles() == 0
      assert self.max_cached_handles() <= cache_capacity
      assert self.outstanding_handles() == 0
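  # Metric helpers: each one reads an I/O-manager metric from every impalad in
  # the minicluster and aggregates it. Totals use sum; max_cached_handles()
  # uses max because --max_cached_file_handles is enforced per daemon, so the
  # capacity assertions above compare against the busiest single impalad.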
  def cached_handles(self):
    return self.get_agg_metric("impala-server.io.mgr.num-cached-file-handles")

  def outstanding_handles(self):
    return self.get_agg_metric("impala-server.io.mgr.num-file-handles-outstanding")

  def max_cached_handles(self):
    return self.get_agg_metric("impala-server.io.mgr.num-cached-file-handles", max)

  def get_agg_metric(self, key, fun=sum):
    cluster = self.cluster
    return fun([s.service.get_metric_value(key) for s in cluster.impalads])
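For intuition on why the helpers aggregate differently, a small worked
example with hypothetical per-impalad readings: if three impalads report 4,
0, and 2 cached handles, cached_handles() sums to 6, while
max_cached_handles() returns 4, which is the value the per-daemon capacity
assertions compare against cache_capacity.

readings = [4, 0, 2]       # hypothetical per-impalad metric values
assert sum(readings) == 6  # cluster-wide total, as in cached_handles()
assert max(readings) == 4  # busiest daemon, as in max_cached_handles()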