mirror of
https://github.com/apache/impala.git
synced 2026-01-15 15:00:36 -05:00
When trying to read from HDFS cache, ReadFromCache calls FileReader::Open(false) to force the file to open. The prior commit for IMPALA-11704 didn't allow for that case when using a data cache, as the data cache check would always happen. This resulted in a crash calling CachedFile as exclusive_hdfs_fh_ was nullptr. Tests only catch this when reading from HDFS cache with data cache enabled. Replaces explicit arguments to override FileReader behavior with a flag to communicate whether FileReader supports delayed open. Then the caller can choose whether to call Open before read. Also simplifies calls to ReadFromPos as it already has a pointer to ScanRange and can check whether file handle caching is enabled directly. The Open call in DoInternalRead uses a slightly wider net by only checking UseDataCache. If the data cache is unavailable or a miss the file will then be opened. Adds a select from tpch.nation to the query for test_data_cache.py as something that triggers checking the HDFS cache. Change-Id: I741488d6195e586917de220a39090895886a2dc5 Reviewed-on: http://gerrit.cloudera.org:8080/19228 Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
58 lines
1.7 KiB
Plaintext
58 lines
1.7 KiB
Plaintext
====
|
|
---- QUERY
|
|
create table test_parquet stored as parquet as select * from tpch_parquet.lineitem;
|
|
---- RUNTIME_PROFILE
|
|
# Expect all cache misses for tpch_parquet.lineitem.
|
|
row_regex: .*DataCacheHitBytes: 0.*
|
|
row_regex: .*DataCacheHitCount: 0 \(0\).*
|
|
row_regex: .*DataCacheMissCount: 64 \(64\).*
|
|
====
|
|
---- QUERY
|
|
select count(*) from tpch_parquet.lineitem t1, test_parquet t2 where t1.l_orderkey = t2.l_orderkey;
|
|
---- RESULTS
|
|
30012985
|
|
---- RUNTIME_PROFILE
|
|
# Expect cache hits for t1 and cache misses for t2.
|
|
row_regex: .*DataCacheHitCount: 6 \(6\).*
|
|
row_regex: .*DataCacheMissBytes: 0.*
|
|
row_regex: .*DataCacheMissCount: 0 \(0\).*
|
|
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
|
|
row_regex: .*DataCacheHitBytes: 0.*
|
|
row_regex: .*DataCacheHitCount: 0 \(0\).*
|
|
row_regex: .*DataCacheMissCount: 3 \(3\).*
|
|
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
|
|
====
|
|
---- QUERY
|
|
select count(distinct l_orderkey) from test_parquet;
|
|
---- RESULTS
|
|
1500000
|
|
---- RUNTIME_PROFILE
|
|
# Expect all cache hits.
|
|
row_regex: .*DataCacheHitCount: 3 \(3\).*
|
|
row_regex: .*DataCacheMissBytes: 0.*
|
|
row_regex: .*DataCacheMissCount: 0 \(0\).*
|
|
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
|
|
====
|
|
---- QUERY
|
|
# Overwrite temp table with subset of data.
|
|
insert overwrite test_parquet select * from tpch_parquet.lineitem where l_shipmode = 'AIR';
|
|
====
|
|
---- QUERY
|
|
# Verifies that stale data from the cache is not used.
|
|
select count(distinct l_orderkey) from test_parquet;
|
|
---- RESULTS
|
|
652393
|
|
---- RUNTIME_PROFILE
|
|
# Expect all cache misses due to change in mtime.
|
|
row_regex: .*DataCacheHitBytes: 0.*
|
|
row_regex: .*DataCacheHitCount: 0 \(0\).*
|
|
row_regex: .*DataCacheMissCount: 2 \(2\).*
|
|
row_regex: .*DataCachePartialHitCount: 0 \(0\).*
|
|
====
|
|
---- QUERY
|
|
# Exercise HDFS cache
|
|
select count(*) from tpch.nation;
|
|
---- RESULTS
|
|
25
|
|
====
|