Files
impala/tests/custom_cluster/test_data_cache.py
Joe McDonnell 5acce4d200 IMPALA-9472,IMPALA-9473: Add per-partition metrics for data cache
This adds two sets of metrics. The first is per-partition metrics
to track the performance of the underlying filesystem for the
data cache. It keeps histograms of read, write, and eviction
latency for each data cache partition along with another metric
recording the path for the partition. These are exposed as the
following metrics:
impala-server.io-mgr.remote-data-cache-partition-$0.path
impala-server.io-mgr.remote-data-cache-partition-$0.read-latency
impala-server.io-mgr.remote-data-cache-partition-$0.write-latency
impala-server.io-mgr.remote-data-cache-partition-$0.eviction-latency

This also adds metrics to keep counts of hits, misses, and entries
in the data cache. Since reducing the latency of IO is an important
feature of the data cache, the absolute count of hits and misses
is as important as the hit bytes and miss bytes. This adds the
following metrics:
impala-server.io-mgr.remote-data-cache-hit-count
impala-server.io-mgr.remote-data-cache-miss-count
impala-server.io-mgr.remote-data-cache-num-entries

To track metrics around inserts, this also adds the following
metrics:
impala-server.io-mgr.remote-data-cache-num-inserts
impala-server.io-mgr.remote-data-cache-dropped-entries
impala-server.io-mgr.remote-data-cache-instant-evictions
An instant eviction happens when inserting an entry into the cache
fails and the entry is immediately evicted during insert. This is
currently only possible for LIRS when the entry's size is larger
than the unprotected capacity. This manifests when the cache
size is very small. For example, for an 8MB entry, this would
manifest when a cache shard is smaller than 160MB. This metric
is primarily for debugging.

Testing:
 - Hand testing to verify the per-partition latency histograms
 - Modified custom_cluster/test_data_cache.py to also test
   the counts.

Change-Id: I56a57d75ff11f00ebc85b85bcaf104fb8108c478
Reviewed-on: http://gerrit.cloudera.org:8080/15382
Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2020-04-01 14:37:58 +00:00

166 lines
8.2 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster
@SkipIf.not_hdfs
@SkipIf.is_buggy_el6_kernel
@SkipIfNotHdfsMinicluster.scheduling
class TestDataCache(CustomClusterTestSuite):
  """Enables the data cache and verifies that the cache hit and miss counts in the
  runtime profile and metrics are as expected. Runs on non-EC HDFS only, as this test
  checks the number of data cache hits, which implicitly relies on the scheduler's
  behavior and the number of HDFS blocks.
  """

  # Metrics counting reads that were served from the cache.
  _HIT_METRICS = (
      'impala-server.io-mgr.remote-data-cache-hit-bytes',
      'impala-server.io-mgr.remote-data-cache-hit-count')

  # Metrics that become non-zero once a scan has missed the cache and populated it.
  _FILL_METRICS = (
      'impala-server.io-mgr.remote-data-cache-miss-bytes',
      'impala-server.io-mgr.remote-data-cache-miss-count',
      'impala-server.io-mgr.remote-data-cache-total-bytes',
      'impala-server.io-mgr.remote-data-cache-num-entries',
      'impala-server.io-mgr.remote-data-cache-num-writes')

  # Metrics about entries that could not be kept. The tests only require that these
  # are readable and non-negative.
  _DROP_METRICS = (
      'impala-server.io-mgr.remote-data-cache-dropped-bytes',
      'impala-server.io-mgr.remote-data-cache-dropped-entries',
      'impala-server.io-mgr.remote-data-cache-instant-evictions')

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this test class runs against."""
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    """Skips the whole class unless the exploration strategy is 'exhaustive', then
    defers to the base class set-up."""
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestDataCache, cls).setup_class()

  # NOTE: deliberately a plain function without 'self' and without @staticmethod. In
  # this file it is only called while the class body is being executed, to build the
  # 'impalad_args' handed to the with_args() decorators below, and a staticmethod
  # object is not directly callable from the class body before Python 3.10.
  def get_impalad_args(eviction_policy, high_write_concurrency=True,
                       force_single_shard=True):
    """Builds the space-separated impalad flag string for a data cache test.

    'eviction_policy' is passed through to --data_cache_eviction_policy.
    'high_write_concurrency' adds --data_cache_write_concurrency=64 and
    'force_single_shard' adds --cache_force_single_shard. The data cache is always
    forced on with --always_use_data_cache=true.
    """
    impalad_args = ["--always_use_data_cache=true"]
    if high_write_concurrency:
      impalad_args.append("--data_cache_write_concurrency=64")
    if force_single_shard:
      impalad_args.append("--cache_force_single_shard")
    impalad_args.append("--data_cache_eviction_policy={0}".format(eviction_policy))
    return " ".join(impalad_args)

  # Cluster start arguments shared by all tests: cache directory and cache size.
  CACHE_START_ARGS = "--data_cache_dir=/tmp --data_cache_size=500MB"

  def _assert_metrics_positive(self, metric_names):
    """Asserts that each metric in 'metric_names' currently has a value > 0."""
    for name in metric_names:
      value = self.get_metric(name)
      assert value > 0, "{0} is {1}, expected > 0".format(name, value)

  def _assert_metrics_non_negative(self, metric_names):
    """Asserts that each metric in 'metric_names' currently has a value >= 0."""
    for name in metric_names:
      value = self.get_metric(name)
      assert value >= 0, "{0} is {1}, expected >= 0".format(name, value)

  def _assert_metrics_zero(self, metric_names, expect_zero=True):
    """Asserts that each metric in 'metric_names' is zero. If 'expect_zero' is False,
    asserts instead that each of them is non-zero."""
    expected = "0" if expect_zero else "non-zero"
    for name in metric_names:
      value = self.get_metric(name)
      assert expect_zero == (value == 0), \
          "{0} is {1}, expected {2}".format(name, value, expected)

  def __test_data_cache_deterministic(self, vector, unique_database):
    """ This test creates a temporary table from another table, overwrites it with
    some other data and verifies that no stale data is read from the cache. Runs with
    a single node to make it easier to verify the runtime profile. Also enables higher
    write concurrency and uses a single shard to avoid non-determinism.
    """
    self.run_test_case('QueryTest/data-cache', vector, unique_database)
    self._assert_metrics_non_negative(self._DROP_METRICS)
    self._assert_metrics_positive(self._HIT_METRICS)
    self._assert_metrics_positive(self._FILL_METRICS)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lru(self, vector, unique_database):
    """Runs the deterministic data cache test with the LRU eviction policy."""
    self.__test_data_cache_deterministic(vector, unique_database)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lirs(self, vector, unique_database):
    """Runs the deterministic data cache test with the LIRS eviction policy."""
    self.__test_data_cache_deterministic(vector, unique_database)

  def __test_data_cache(self, vector):
    """ This test scans the same table twice and verifies the cache hit count metrics
    are correct. The exact number of bytes hit is non-deterministic between runs due
    to different mtime of files and multiple shards in the cache.
    """
    query = "select * from tpch_parquet.lineitem"
    # Do a first run to warm up the cache. Expect no hits.
    self.execute_query(query)
    self._assert_metrics_zero(self._HIT_METRICS)
    self._assert_metrics_positive(self._FILL_METRICS)

    # Do a second run. Expect some hits.
    self.execute_query(query)
    self._assert_metrics_positive(self._HIT_METRICS)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU", high_write_concurrency=False,
                                    force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lru(self, vector):
    """Runs the two-scan data cache test with the LRU eviction policy."""
    self.__test_data_cache(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS", high_write_concurrency=False,
                                    force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lirs(self, vector):
    """Runs the two-scan data cache test with the LIRS eviction policy."""
    self.__test_data_cache(vector)

  def __test_data_cache_disablement(self, vector):
    """Verifies that the 'disable_data_cache' query option keeps the cache untouched.

    Scans several file formats first with the option set to 1 and then with it set
    to 0, checking after every query that the fill metrics are zero exactly when the
    cache is disabled.
    """
    # Before this test runs any query, all the cache metrics are expected to be zero.
    self._assert_metrics_zero(self._HIT_METRICS + self._FILL_METRICS)

    # Runs a query with the cache disabled and then enabled against multiple file
    # formats. Verifies that the metrics stay at zero when the cache is disabled.
    for disable_cache in [True, False]:
      vector.get_value('exec_option')['disable_data_cache'] = int(disable_cache)
      for file_format in ['text_gzip', 'parquet', 'avro', 'seq', 'rc']:
        query = "select * from functional_{0}.alltypes".format(file_format)
        self.execute_query(query, vector.get_value('exec_option'))
        self._assert_metrics_zero(self._FILL_METRICS, expect_zero=disable_cache)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lru(self, vector):
    """Runs the cache disablement test with the LRU eviction policy."""
    self.__test_data_cache_disablement(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lirs(self, vector):
    """Runs the cache disablement test with the LIRS eviction policy."""
    self.__test_data_cache_disablement(vector)