Files
impala/tests/query_test/test_hdfs_caching.py
Nong Li ad534429df [CDH5] Disable flaky hdfs caching test.
Change-Id: I19900ae029876d8f74169eda0f08f5be3509fbaf
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2946
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: jenkins
2014-06-10 18:24:42 -07:00

126 lines
5.1 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# End-to-end tests that validate HDFS caching.
#
import logging
import pytest
from copy import copy
from subprocess import call
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import *
from tests.common.test_vector import *
from tests.common.impala_cluster import ImpalaCluster
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.util.shell_util import exec_process
# End-to-end test that HDFS caching is working.
class TestHdfsCaching(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    super(TestHdfsCaching, cls).add_test_dimensions()
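    # A batch_size of 0 selects the default batch size; restricting the matrix to
    # text presumably matches how the cached tpch data is loaded.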
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('exec_option')['batch_size'] == 0)
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == "text")

  # The tpch nation table is cached as part of data loading. We'll issue a query
  # against this table and verify the metric is updated correctly.
  @pytest.mark.execute_serially
  def test_table_is_cached(self, vector):
    cached_read_metric = "impala-server.io-mgr.cached-bytes-read"
    query_string = "select count(*) from tpch.nation"
    expected_bytes_delta = 2199

    impala_cluster = ImpalaCluster()

    # Collect the cached read metric on all the impalads before running the query.
    cached_bytes_before = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_before.append(impalad.service.get_metric_value(cached_read_metric))

    # Execute the query. tpch.nation contains exactly 25 rows.
    result = self.execute_query(query_string)
    assert(len(result.data) == 1)
    assert(result.data[0] == '25')

    # Read the metrics again.
    cached_bytes_after = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_after.append(impalad.service.get_metric_value(cached_read_metric))

    # Verify that the cached bytes increased by the expected number on exactly one
    # of the impalads.
    num_metrics_increased = 0
    assert(len(cached_bytes_before) == len(cached_bytes_after))
    for i in range(0, len(cached_bytes_before)):
      assert(cached_bytes_before[i] == cached_bytes_after[i] or\
             cached_bytes_before[i] + expected_bytes_delta == cached_bytes_after[i])
      if cached_bytes_after[i] > cached_bytes_before[i]:
        num_metrics_increased += 1

    if num_metrics_increased != 1:
      # Test failed; print the before/after metric values for debugging.
      for i in range(0, len(cached_bytes_before)):
        print "%d %d" % (cached_bytes_before[i], cached_bytes_after[i])
      assert(False)

  def test_cache_cancellation(self, vector):
    """This query runs on a mix of cached and uncached tables. The query has a
    limit, so it exercises the cancellation paths. Regression test for
    IMPALA-1019."""
    num_iters = 100
    query_string = """
      with t1 as (select int_col x, bigint_col y from functional.alltypes limit 2),
           t2 as (select int_col x, bigint_col y from functional.alltypestiny limit 2),
           t3 as (select int_col x, bigint_col y from functional.alltypessmall limit 2)
      select * from t1, t2, t3 where t1.x = t2.x and t2.x = t3.x """
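
    # The per-view limits let plan fragments finish before their scans complete,
    # which can cancel in-flight (possibly cached) reads and exercise the
    # IMPALA-1019 code paths.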
    # Run this query for some iterations since it is timing dependent.
    for x in xrange(num_iters):
      result = self.execute_query(query_string)
      assert(len(result.data) == 2)

class TestHdfsCachingDdl(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestHdfsCachingDdl, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == 'text' and \
        v.get_value('table_format').compression_codec == 'none')

  @pytest.mark.execute_serially
  @pytest.mark.xfail(run=False, reason="IMPALA-1037. This test is flaky")
  def test_caching_ddl(self, vector):
    self.client.execute("drop table if exists functional.cached_tbl_part")
    self.client.execute("drop table if exists functional.cached_tbl_nopart")

    # Get the number of cache requests before starting the test.
    num_entries_pre = get_num_cache_requests()
    self.run_test_case('QueryTest/hdfs-caching', vector)

    # Running the test case should leave exactly 6 new cache requests: 1 for each
    # of the two tables plus 4 more, one for each cached partition.
    assert num_entries_pre == get_num_cache_requests() - 6

    self.client.execute("drop table functional.cached_tbl_part")
    self.client.execute("drop table functional.cached_tbl_nopart")

    # Dropping the tables should clean up their cache entries, leaving us with the
    # same total number of entries as before the test.
    assert num_entries_pre == get_num_cache_requests()

def get_num_cache_requests():
  """Returns the number of outstanding cache requests."""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
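  # Note: this counts every line of the cacheadmin output (header, directive
  # lines, trailing newline), so it is the number of directives plus a constant.
  # The tests above only compare values from this helper against each other, so
  # the constant offset cancels out.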
  return len(stdout.split('\n'))