impala/tests/custom_cluster/test_data_cache.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import pytest

from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster


@SkipIf.not_hdfs
@SkipIf.is_buggy_el6_kernel
@SkipIfNotHdfsMinicluster.scheduling
class TestDataCache(CustomClusterTestSuite):
  """ This test enables the data cache and verfies that cache hit and miss counts
  in the runtime profile and metrics are as expected. Run on non-EC HDFS only as
  this test checks the number of data cache hit counts, which implicitly relies
  on the scheduler's behavior and number of HDFS blocks.
  """
  @classmethod
  def get_workload(self):
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestDataCache, cls).setup_class()

  def get_impalad_args(eviction_policy, high_write_concurrency=True,
                       force_single_shard=True):
    impalad_args = ["--always_use_data_cache=true"]
    if (high_write_concurrency):
      impalad_args.append("--data_cache_write_concurrency=64")
    if (force_single_shard):
      impalad_args.append("--cache_force_single_shard")
    impalad_args.append("--data_cache_eviction_policy={0}".format(eviction_policy))
    return " ".join(impalad_args)

  CACHE_START_ARGS = "--data_cache_dir=/tmp --data_cache_size=500MB"

  def __test_data_cache_deterministic(self, vector, unique_database):
    """ This test creates a temporary table from another table, overwrites it with
    some other data and verifies that no stale data is read from the cache. Runs with
    a single node to make it easier to verify the runtime profile. Also enables higher
    write concurrency and uses a single shard to avoid non-determinism.
    """
    self.run_test_case('QueryTest/data-cache', vector, unique_database)
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-dropped-bytes') >= 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-dropped-entries') >= 0
    assert \
        self.get_metric('impala-server.io-mgr.remote-data-cache-instant-evictions') >= 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') > 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lru(self, vector, unique_database):
    self.__test_data_cache_deterministic(vector, unique_database)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_deterministic_lirs(self, vector, unique_database):
    self.__test_data_cache_deterministic(vector, unique_database)

  def __test_data_cache(self, vector):
    """ This test scans the same table twice and verifies the cache hit count metrics
    are correct. The exact number of bytes hit is non-deterministic between runs due
    to different mtime of files and multiple shards in the cache.
    """
    QUERY = "select * from tpch_parquet.lineitem"
    # Do a first run to warm up the cache. Expect no hits.
    self.execute_query(QUERY)
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') > 0

    # Do a second run. Expect some hits.
    self.execute_query(QUERY)
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') > 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') > 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU", high_write_concurrency=False,
                                    force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lru(self, vector):
    self.__test_data_cache(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS", high_write_concurrency=False,
                                    force_single_shard=False),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_lirs(self, vector):
    self.__test_data_cache(vector)

  def __test_data_cache_disablement(self, vector):
    # Verifies that the cache metrics are all zero.
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-hit-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') == 0
    assert self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') == 0

    # Runs a query with the cache disabled and then enabled against multiple file formats.
    # Verifies that the metrics stay at zero when the cache is disabled.
    for disable_cache in [True, False]:
      vector.get_value('exec_option')['disable_data_cache'] = int(disable_cache)
      for file_format in ['text_gzip', 'parquet', 'avro', 'seq', 'rc']:
        QUERY = "select * from functional_{0}.alltypes".format(file_format)
        self.execute_query(QUERY, vector.get_value('exec_option'))
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-miss-bytes') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-miss-count') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-total-bytes') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-num-entries') == 0)
        assert disable_cache ==\
            (self.get_metric('impala-server.io-mgr.remote-data-cache-num-writes') == 0)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LRU"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lru(self, vector):
    self.__test_data_cache_disablement(vector)

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
      impalad_args=get_impalad_args("LIRS"),
      start_args=CACHE_START_ARGS, cluster_size=1)
  def test_data_cache_disablement_lirs(self, vector):
    self.__test_data_cache_disablement(vector)