test_data_cache.py was added as part of IMPALA-8341 to verify that the DataCache hit / miss counts and the DataCache metrics work as expected. The test fails intermittently due to unexpected cache misses.

Part of the test creates a temp table from tpch_parquet.lineitem and joins it against tpch_parquet.lineitem itself on the l_orderkey column. The test expects a complete cache hit for tpch_parquet.lineitem when joining against the temp table, since the table should have been cached in its entirety by the CTAS statement. However, this doesn't always hold. Internally, the data cache divides its key space into multiple shards, and each key is hashed to determine the shard it belongs to. By default, the number of shards equals the number of CPU cores (e.g. 16 on an AWS m5-4xlarge instance). Since the cache size is set to 500MB, each shard has a capacity of only about 31MB, so some rows of l_orderkey can be evicted whenever the shard they hash to grows beyond that limit. The problem is not deterministic because part of the cache key is the modification time of the file, which changes from run to run as it's essentially determined by the data loading time of the job. This makes the test flaky.

To fix this, this patch forces the data cache to use a single shard for determinism. In addition, the test is now skipped for non-HDFS and HDFS erasure coding builds, as it depends on the scan range assignment. To exercise the cache more extensively, the plan is to enable it by default for S3 builds instead of relying on BE and E2E tests only.

Testing done:
- Ran test_data_cache.py 10+ times, each with a different mtime for tpch_parquet.lineitem; it used to fail 2 out of 3 runs.

Change-Id: I98d5b8fa1d3fb25682a64bffaf56d751a140e4c9
Reviewed-on: http://gerrit.cloudera.org:8080/13242
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
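For intuition on the sharding arithmetic above, here is a minimal, hypothetical Python sketch. The real cache is C++ in the Impala backend; the hash function, key layout, and file path below are illustrative assumptions, not Impala's actual implementation:

import hashlib

CACHE_CAPACITY_MB = 500

def shard_for_key(path: str, mtime: int, offset: int, num_shards: int) -> int:
    # The cache key covers (path, mtime, offset); hashing it picks a shard.
    key = f"{path}:{mtime}:{offset}".encode()
    return int.from_bytes(hashlib.md5(key).digest()[:8], "little") % num_shards

# With 16 shards, each shard is an independent ~31MB eviction domain:
for num_shards in (16, 1):
    print(num_shards, "shard(s) ->", CACHE_CAPACITY_MB / num_shards, "MB per shard")

# Because mtime is part of the key, the same block of the same file can
# land on a different shard after every data load:
print(shard_for_key("/tpch/lineitem/000.parq", 1000, 0, 16))
print(shard_for_key("/tpch/lineitem/000.parq", 2000, 0, 16))  # may differ
print(shard_for_key("/tpch/lineitem/000.parq", 2000, 0, 1))   # always shard 0

With a single shard, the full 500MB behaves as one eviction domain, so whether the working set fits no longer depends on how keys happen to hash in a given run.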
====
---- QUERY
create table test_parquet stored as parquet as select * from tpch_parquet.lineitem;
---- RUNTIME_PROFILE
# Expect all cache misses for tpch_parquet.lineitem.
row_regex: .*DataCacheHitBytes: 0.*
row_regex: .*DataCacheHitCount: 0 \(0\).*
row_regex: .*DataCacheMissCount: 64 \(64\).*
====
---- QUERY
select count(*) from tpch_parquet.lineitem t1, test_parquet t2 where t1.l_orderkey = t2.l_orderkey;
---- RESULTS
30012985
---- RUNTIME_PROFILE
# Expect cache hits for t1 and cache misses for t2.
row_regex: .*DataCacheHitCount: 6 \(6\).*
row_regex: .*DataCacheMissBytes: 0.*
row_regex: .*DataCacheMissCount: 0 \(0\).*
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
row_regex: .*DataCacheHitBytes: 0.*
row_regex: .*DataCacheHitCount: 0 \(0\).*
row_regex: .*DataCacheMissCount: 3 \(3\).*
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
====
---- QUERY
select count(distinct l_orderkey) from test_parquet;
---- RESULTS
1500000
---- RUNTIME_PROFILE
# Expect all cache hits.
row_regex: .*DataCacheHitCount: 3 \(3\).*
row_regex: .*DataCacheMissBytes: 0.*
row_regex: .*DataCacheMissCount: 0 \(0\).*
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
====
---- QUERY
# Overwrite temp table with subset of data.
insert overwrite test_parquet select * from tpch_parquet.lineitem where l_shipmode = 'AIR';
====
---- QUERY
# Verifies that stale data from the cache is not used.
select count(distinct l_orderkey) from test_parquet;
---- RESULTS
652393
---- RUNTIME_PROFILE
# Expect all cache misses due to change in mtime.
row_regex: .*DataCacheHitBytes: 0.*
row_regex: .*DataCacheHitCount: 0 \(0\).*
row_regex: .*DataCacheMissCount: 2 \(2\).*
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
====
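The final query block relies on the file's mtime being part of the cache key: after the INSERT OVERWRITE rewrites the files, their mtime changes, so every lookup misses instead of returning stale rows. A minimal sketch of that keying idea, under the same hypothetical assumptions as the earlier example:

# A plain dict stands in for the cache; keys include the file's mtime.
cache = {}

def cache_key(path: str, mtime: int, offset: int) -> tuple:
    # Rewritten files get a new mtime, so old entries become unreachable.
    return (path, mtime, offset)

# Populate the cache while the file has mtime=1000.
cache[cache_key("/tpch/test_parquet/000.parq", 1000, 0)] = b"<old bytes>"

# After INSERT OVERWRITE the file is rewritten with mtime=2000: the lookup
# key no longer matches, so this is a clean miss, never stale data.
assert cache.get(cache_key("/tpch/test_parquet/000.parq", 2000, 0)) is None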