mirror of
https://github.com/apache/impala.git
synced 2025-12-20 18:37:21 -05:00
This adds two sets of metrics. The first is per-partition metrics to track the performance of the underlying filesystem for the data cache. It keeps histograms of read, write, and eviction latency for each data cache partition along with another metric recording the path for the partition. These are exposed as the following metrics: impala-server.io-mgr.remote-data-cache-partition-$0.path impala-server.io-mgr.remote-data-cache-partition-$0.read-latency impala-server.io-mgr.remote-data-cache-partition-$0.write-latency impala-server.io-mgr.remote-data-cache-partition-$0.eviction-latency This also adds metrics to keep counts of hits, misses, and entries in the data cache. Since reducing the latency of IO is an important feature of the data cache, the absolute count of hits and misses is as important as the hit bytes and miss bytes. This adds the following metrics: impala-server.io-mgr.remote-data-cache-hit-count impala-server.io-mgr.remote-data-cache-miss-count impala-server.io-mgr.remote-data-cache-num-entries To track metrics around inserts, this also adds the following metrics: impala-server.io-mgr.remote-data-cache-num-inserts impala-server.io-mgr.remote-data-cache-dropped-entries impala-server.io-mgr.remote-data-cache-instant-evictions An instant eviction happens when inserting an entry into the cache fails and the entry is immediately evicted during insert. This is currently only possible for LIRS when the entry's size is larger than the unprotected capacity. This manifests when the cache size is very small. For example, for an 8MB entry, this would manifest when a cache shard is smaller than 160MB. This metric is primarily for debugging. Testing: - Hand testing to verify the per-partition latency histograms - Modified custom_cluster/test_data_cache.py to also test the counts. 
Change-Id: I56a57d75ff11f00ebc85b85bcaf104fb8108c478 Reviewed-on: http://gerrit.cloudera.org:8080/15382 Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
166 lines
8.2 KiB
Python
166 lines
8.2 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import pytest
|
|
|
|
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
|
from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster
|
|
|
|
|
|
@SkipIf.not_hdfs
@SkipIf.is_buggy_el6_kernel
@SkipIfNotHdfsMinicluster.scheduling
class TestDataCache(CustomClusterTestSuite):
  """This test enables the data cache and verifies that cache hit and miss counts
  in the runtime profile and metrics are as expected. Run on non-EC HDFS only as
  this test checks the number of data cache hit counts, which implicitly relies
  on the scheduler's behavior and number of HDFS blocks.
  """
  @classmethod
  def get_workload(cls):
    # Workload whose test vectors drive these tests.
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    # These tests restart the cluster repeatedly, so only run them in exhaustive mode.
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestDataCache, cls).setup_class()

  # NOTE: deliberately a plain function (no 'self', no @staticmethod) so it is
  # directly callable from the class body below when building decorator arguments.
  def get_impalad_args(eviction_policy, high_write_concurrency=True,
      force_single_shard=True):
    """Builds the impalad startup flags for a data cache test.

    'eviction_policy' selects the cache eviction algorithm (e.g. "LRU" or "LIRS").
    'high_write_concurrency' raises the number of concurrent cache writes allowed, and
    'force_single_shard' limits the cache to one shard; both reduce non-determinism in
    the hit/miss counts verified by the deterministic tests.
    """
    impalad_args = ["--always_use_data_cache=true"]
    if high_write_concurrency:
      impalad_args.append("--data_cache_write_concurrency=64")
    if force_single_shard:
      impalad_args.append("--cache_force_single_shard")
    impalad_args.append("--data_cache_eviction_policy={0}".format(eviction_policy))
    return " ".join(impalad_args)

  # Common startup flags: data cache backed by /tmp with a 500MB capacity.
  CACHE_START_ARGS = "--data_cache_dir=/tmp --data_cache_size=500MB"

  def __test_data_cache_deterministic(self, vector, unique_database):
    """This test creates a temporary table from another table, overwrites it with
    some other data and verifies that no stale data is read from the cache. Runs with
    a single node to make it easier to verify the runtime profile. Also enables higher
    write concurrency and uses a single shard to avoid non-determinism.
    """
    self.run_test_case('QueryTest/data-cache', vector, unique_database)
    # Dropped entries and instant evictions are timing-dependent, so only assert that
    # these metrics exist and are non-negative.
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-dropped-bytes') >= 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-dropped-entries') >= 0
    assert \
        self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') >= 0
    # The workload must have produced both hits and misses, and left entries cached.
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') > 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lru(self, vector, unique_database):
    self.__test_data_cache_deterministic(vector, unique_database)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lirs(self, vector, unique_database):
    self.__test_data_cache_deterministic(vector, unique_database)

  def __test_data_cache(self, vector):
    """This test scans the same table twice and verifies the cache hit count metrics
    are correct. The exact number of bytes hit is non-deterministic between runs due
    to different mtime of files and multiple shards in the cache.
    """
    QUERY = "select * from tpch_parquet.lineitem"
    # Do a first run to warm up the cache. Expect no hits.
    self.execute_query(QUERY)
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') > 0

    # Do a second run. Expect some hits.
    self.execute_query(QUERY)
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') > 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU", high_write_concurrency=False,
          force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lru(self, vector):
    self.__test_data_cache(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS", high_write_concurrency=False,
          force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lirs(self, vector):
    self.__test_data_cache(vector)

  def __test_data_cache_disablement(self, vector):
    # Verifies that the cache metrics are all zero.
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') == 0

    # Runs a query with the cache disabled and then enabled against multiple file formats.
    # Verifies that the metrics stay at zero when the cache is disabled.
    for disable_cache in [True, False]:
      vector.get_value('exec_option')['disable_data_cache'] = int(disable_cache)
      for file_format in ['text_gzip', 'parquet', 'avro', 'seq', 'rc']:
        QUERY = "select * from functional_{0}.alltypes".format(file_format)
        self.execute_query(QUERY, vector.get_value('exec_option'))
        # Metrics must remain zero exactly while the cache is disabled.
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') == 0)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lru(self, vector):
    self.__test_data_cache_disablement(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lirs(self, vector):
    self.__test_data_cache_disablement(vector)