Files
impala/tests/query_test/test_hdfs_caching.py
Nong Li ad534429df [CDH5] Disable flaky hdfs caching test.
Change-Id: I19900ae029876d8f74169eda0f08f5be3509fbaf
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2946
Reviewed-by: Nong Li <nong@cloudera.com>
Tested-by: jenkins
2014-06-10 18:24:42 -07:00

126 lines
5.1 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# End-to-end tests that validate HDFS caching.
#
import logging
import pytest
from copy import copy
from subprocess import call
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import *
from tests.common.test_vector import *
from tests.common.impala_cluster import ImpalaCluster
from tests.common.test_dimensions import create_single_exec_option_dimension
from tests.util.shell_util import exec_process
# End-to-end test that HDFS caching is working.
class TestHdfsCaching(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'tpch'

  @classmethod
  def add_test_dimensions(cls):
    super(TestHdfsCaching, cls).add_test_dimensions()
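    # A batch_size of 0 selects the default batch size; restricting the matrix to
    # text presumably matches how the cached tpch data is loaded.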
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('exec_option')['batch_size'] == 0)
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == "text")

  # The tpch nation table is cached as part of data loading. We'll issue a query
  # against this table and verify the metric is updated correctly.
  @pytest.mark.execute_serially
  def test_table_is_cached(self, vector):
    cached_read_metric = "impala-server.io-mgr.cached-bytes-read"
    query_string = "select count(*) from tpch.nation"
    expected_bytes_delta = 2199

    impala_cluster = ImpalaCluster()

    # Collect the cached read metric on all the impalads before running the query.
    cached_bytes_before = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_before.append(impalad.service.get_metric_value(cached_read_metric))

    # Execute the query. tpch.nation contains exactly 25 rows.
    result = self.execute_query(query_string)
    assert(len(result.data) == 1)
    assert(result.data[0] == '25')

    # Read the metrics again.
    cached_bytes_after = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_after.append(impalad.service.get_metric_value(cached_read_metric))

    # Verify that the cached bytes increased by the expected number on exactly one
    # of the impalads.
    num_metrics_increased = 0
    assert(len(cached_bytes_before) == len(cached_bytes_after))
    for i in range(0, len(cached_bytes_before)):
      assert(cached_bytes_before[i] == cached_bytes_after[i] or\
             cached_bytes_before[i] + expected_bytes_delta == cached_bytes_after[i])
      if cached_bytes_after[i] > cached_bytes_before[i]:
        num_metrics_increased += 1

    if num_metrics_increased != 1:
      # Test failed; print the before/after metric values for debugging.
      for i in range(0, len(cached_bytes_before)):
        print "%d %d" % (cached_bytes_before[i], cached_bytes_after[i])
      assert(False)

  def test_cache_cancellation(self, vector):
    """This query runs on a mix of cached and uncached tables. The query has a
    limit, so it exercises the cancellation paths. Regression test for
    IMPALA-1019."""
    num_iters = 100
    query_string = """
      with t1 as (select int_col x, bigint_col y from functional.alltypes limit 2),
           t2 as (select int_col x, bigint_col y from functional.alltypestiny limit 2),
           t3 as (select int_col x, bigint_col y from functional.alltypessmall limit 2)
      select * from t1, t2, t3 where t1.x = t2.x and t2.x = t3.x """
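
    # The per-view limits let plan fragments finish before their scans complete,
    # which can cancel in-flight (possibly cached) reads and exercise the
    # IMPALA-1019 code paths.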
    # Run this query for some iterations since it is timing dependent.
    for x in xrange(num_iters):
      result = self.execute_query(query_string)
      assert(len(result.data) == 2)

class TestHdfsCachingDdl(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestHdfsCachingDdl, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('table_format').file_format == 'text' and \
        v.get_value('table_format').compression_codec == 'none')

  @pytest.mark.execute_serially
  @pytest.mark.xfail(run=False, reason="IMPALA-1037. This test is flaky")
  def test_caching_ddl(self, vector):
    self.client.execute("drop table if exists functional.cached_tbl_part")
    self.client.execute("drop table if exists functional.cached_tbl_nopart")

    # Get the number of cache requests before starting the test.
    num_entries_pre = get_num_cache_requests()
    self.run_test_case('QueryTest/hdfs-caching', vector)

    # Running the test case should leave exactly 6 new cache requests: 1 for each
    # of the two tables plus 4 more, one for each cached partition.
    assert num_entries_pre == get_num_cache_requests() - 6

    self.client.execute("drop table functional.cached_tbl_part")
    self.client.execute("drop table functional.cached_tbl_nopart")

    # Dropping the tables should clean up their cache entries, leaving us with the
    # same total number of entries as before the test.
    assert num_entries_pre == get_num_cache_requests()

def get_num_cache_requests():
  """Returns the number of outstanding cache requests."""
  rc, stdout, stderr = exec_process("hdfs cacheadmin -listDirectives -stats")
  assert rc == 0, 'Error executing hdfs cacheadmin: %s %s' % (stdout, stderr)
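  # Note: this counts every line of the cacheadmin output (header, directive
  # lines, trailing newline), so it is the number of directives plus a constant.
  # The tests above only compare values from this helper against each other, so
  # the constant offset cancels out.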
  return len(stdout.split('\n'))