impala/tests/custom_cluster/test_hdfs_fd_caching.py
Michael Smith 5e694568d5 IMPALA-11966: Enable cache_ozone_file_handles by default
Updates Ozone dependency to 1.3.0 to address HDDS-7135 and enables
cache_ozone_file_handles by default for a ~10% improvement on TPC-DS
query time.

Updates the Ozone CDP dependency for HDDS-8095. The fix will be
available in Ozone 1.4.0, so testing with TDE (transparent data
encryption) currently requires the CDP build.

Testing:
- ran backend, e2e, and custom cluster test suites with Ozone

Change-Id: Icc66551f9b87af785a1c30b516ac39f4640638fe
Reviewed-on: http://gerrit.cloudera.org:8080/19573
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2023-03-24 07:59:21 +00:00
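For illustration only (an editor's sketch, not part of the commit): now that
cache_ozone_file_handles defaults to true, a custom cluster test could pin the
old behavior by passing the flag explicitly, following the with_args pattern
used throughout the file below. The test name is hypothetical.

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
    impalad_args="--cache_ozone_file_handles=false",
    catalogd_args="--load_catalog_in_background=false")
  def test_ozone_caching_disabled(self, vector):
    """Hypothetical: with the flag off, Ozone file handles should not be cached."""
    # Would live inside TestHdfsFdCaching below; expects an empty handle cache.
    self.run_fd_caching_test(vector, False, 0, None)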


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function
from builtins import range
import pytest
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.network import get_external_ip
from tests.common.skip import SkipIfLocal
from tests.util.filesystem_utils import (
IS_ISILON,
IS_ADLS,
IS_GCS,
IS_COS,
IS_OSS)
from time import sleep


@SkipIfLocal.hdfs_fd_caching
class TestHdfsFdCaching(CustomClusterTestSuite):
"""Tests that if HDFS file handle caching is enabled, file handles are actually cached
and the associated metrics return valid results. In addition, tests that the upper bound
  of cached file handles is respected."""

  NUM_ROWS = 100
  INSERT_TPL = "insert into cachefd.simple values"

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def create_n_files(self, n):
"""Creates 'n' files by performing 'n' inserts with NUM_ROWS rows."""
values = ", ".join(["({0},{0},{0})".format(x) for x in range(self.NUM_ROWS)])
for _ in range(n):
      self.client.execute(self.INSERT_TPL + values)

  def setup_method(self, method):
super(TestHdfsFdCaching, self).setup_method(method)
impalad = self.cluster.impalads[0]
client = impalad.service.create_beeswax_client()
self.client = client
client.execute("drop database if exists cachefd cascade")
client.execute("create database cachefd")
client.execute("create table cachefd.simple(id int, col1 int, col2 int) "
"stored as parquet")
    self.create_n_files(1)

  def teardown_method(self, method):
super(TestHdfsFdCaching, self).teardown_method(method)
self.client.execute("drop database if exists cachefd cascade")
def run_fd_caching_test(self, vector, caching_expected, cache_capacity,
eviction_timeout_secs):
"""
Tests that HDFS file handles are cached as expected. This is used both
for the positive and negative test cases. If caching_expected is true,
    this verifies that the cache adheres to the specified capacity and that
    repeated queries over the same files reuse cached file handles.
If caching_expected is false, it verifies that the cache does not
change in size while running queries.
"""
# Maximum number of file handles cached (applies whether caching expected
# or not)
assert self.max_cached_handles() <= cache_capacity
num_handles_start = self.cached_handles()
# The table has one file. If caching is expected, there should be one more
# handle cached after the first select. If caching is not expected, the
# number of handles should not change from the initial number.
self.execute_query("select * from cachefd.simple", vector=vector)
num_handles_after = self.cached_handles()
assert self.max_cached_handles() <= cache_capacity
if caching_expected:
assert num_handles_after == (num_handles_start + 1)
else:
assert num_handles_after == num_handles_start
# No open handles if scanning is finished
assert self.outstanding_handles() == 0
# No change when reading the table again
for x in range(10):
self.execute_query("select * from cachefd.simple", vector=vector)
assert self.cached_handles() == num_handles_after
assert self.max_cached_handles() <= cache_capacity
assert self.outstanding_handles() == 0
# Create more files. This means there are more files than the cache size.
# The cache size should still be enforced.
self.create_n_files(cache_capacity + 100)
# Read all the files of the table and make sure no FD leak
for x in range(10):
self.execute_query("select count(*) from cachefd.simple;", vector=vector)
assert self.max_cached_handles() <= cache_capacity
if not caching_expected:
assert self.cached_handles() == num_handles_start
assert self.outstanding_handles() == 0
if caching_expected and eviction_timeout_secs is not None:
# To test unused file handle eviction, sleep for longer than the timeout.
# All the cached handles should be evicted.
sleep(eviction_timeout_secs + 5)
      assert self.cached_handles() == 0

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--max_cached_file_handles=16"
" --unused_file_handle_timeout_sec=18446744073709551600",
catalogd_args="--load_catalog_in_background=false")
def test_caching_enabled(self, vector):
"""
Test of the HDFS file handle cache with the parameter specified and a very
large file handle timeout
"""
cache_capacity = 16
    # File handle caching applies to HDFS, Ozone, S3, and ABFS files, so on those
    # filesystems verify that caching works. On all other filesystems, verify that
    # file handles are not cached.
if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
caching_expected = False
else:
caching_expected = True
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--max_cached_file_handles=16 --unused_file_handle_timeout_sec=5",
catalogd_args="--load_catalog_in_background=false")
def test_caching_with_eviction(self, vector):
"""Test of the HDFS file handle cache with unused file handle eviction enabled"""
cache_capacity = 16
handle_timeout = 5
# Only test eviction on platforms where caching is enabled.
if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
return
caching_expected = True
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, handle_timeout)

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--max_cached_file_handles=0",
catalogd_args="--load_catalog_in_background=false")
def test_caching_disabled_by_param(self, vector):
"""Test that the HDFS file handle cache is disabled when the parameter is zero"""
cache_capacity = 0
caching_expected = False
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--cache_remote_file_handles=false --cache_s3_file_handles=false "
"--cache_abfs_file_handles=false --cache_ozone_file_handles=false "
"--hostname=" + get_external_ip(),
catalogd_args="--load_catalog_in_background=false")
def test_remote_caching_disabled_by_param(self, vector):
"""Test that the file handle cache is disabled for remote files when disabled"""
cache_capacity = 0
caching_expected = False
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--max_cached_file_handles=0 --hostname=" + get_external_ip(),
catalogd_args="--load_catalog_in_background=false")
def test_remote_caching_disabled_by_global_param(self, vector):
"""Test that the file handle cache is disabled for remote files when all caching is
disabled"""
cache_capacity = 0
caching_expected = False
    self.run_fd_caching_test(vector, caching_expected, cache_capacity, None)

  @pytest.mark.execute_serially
@CustomClusterTestSuite.with_args(
impalad_args="--max_cached_file_handles=16 --unused_file_handle_timeout_sec=5 "
"--always_use_data_cache=true",
start_args="--data_cache_dir=/tmp --data_cache_size=500MB",
catalogd_args="--load_catalog_in_background=false")
def test_no_fd_caching_on_cached_data(self, vector):
"""IMPALA-10147: Test that no file handle should be opened nor cached again if data
is being read from data cache."""
cache_capacity = 16
eviction_timeout_secs = 5
    # Only run on filesystems where file handle caching is enabled.
if IS_ADLS or IS_ISILON or IS_GCS or IS_COS or IS_OSS:
return
# Maximum number of file handles cached.
assert self.max_cached_handles() <= cache_capacity
num_handles_start = self.cached_handles()
# The table has one file. If caching is expected, there should be one more
# handle cached after the first select. If caching is not expected, the
# number of handles should not change from the initial number.
# Read 5 times to make sure the data cache is fully warmed up.
for x in range(5):
self.execute_query("select * from cachefd.simple", vector=vector)
num_handles_after = self.cached_handles()
assert self.max_cached_handles() <= cache_capacity
assert num_handles_after == (num_handles_start + 1)
# No open handles if scanning is finished.
assert self.outstanding_handles() == 0
# To test unused file handle eviction, sleep for longer than the timeout.
# All the cached handles should be evicted.
sleep(eviction_timeout_secs + 5)
assert self.cached_handles() == 0
# Reread from data cache. Expect that no handle should be opened nor cached again.
for x in range(10):
self.execute_query("select * from cachefd.simple", vector=vector)
assert self.cached_handles() == 0
assert self.max_cached_handles() <= cache_capacity
      assert self.outstanding_handles() == 0

  def cached_handles(self):
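    """Total number of cached file handles, summed across all impalads."""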
    return self.get_agg_metric("impala-server.io.mgr.num-cached-file-handles")

  def outstanding_handles(self):
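    """Number of currently open (outstanding) file handles, summed across all
    impalads."""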
    return self.get_agg_metric("impala-server.io.mgr.num-file-handles-outstanding")

  def max_cached_handles(self):
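    """Largest number of cached file handles reported by any single impalad."""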
    return self.get_agg_metric("impala-server.io.mgr.num-cached-file-handles", max)

  def get_agg_metric(self, key, fun=sum):
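    """Aggregates metric 'key' across all impalads using 'fun' (sum by default)."""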
cluster = self.cluster
return fun([s.service.get_metric_value(key) for s in cluster.impalads])
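
# Usage note (editor's assumption, not in the original file): with a dev
# minicluster running and test data loaded, this suite can typically be run as
#   ${IMPALA_HOME}/bin/impala-py.test tests/custom_cluster/test_hdfs_fd_caching.py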