This adds a test that performs some simple fuzz testing of HDFS scanners. It creates a copy of a given HDFS table, with each file in the table corrupted in a random way: either a single byte is set to a random value, or the file is truncated to a random length. It then runs a query that scans the whole table with several different batch_size settings. I made some effort to make failures reproducible by explicitly seeding the random number generator and by providing a mechanism to override the seed.

So far the fuzzer has found crashes resulting from corrupted or truncated input files for RCFile, SequenceFile, Parquet, and Text LZO; Avro only had a small buffer read overrun detected by ASAN. This change includes fixes for the Parquet crashes found by the fuzzer, the small buffer overrun in Avro, and a DCHECK in MemPool.

Initially the test is only enabled for Avro, Parquet, and uncompressed text. As follow-up work we should fix the bugs in the other scanners and enable the test for them. We also don't implement abort_on_error=0 correctly in Parquet: for some file formats, corrupt headers result in the query being aborted, so an expected exception will xfail the test.

Testing: Ran the test with exploration_strategy=exhaustive in a loop locally with both DEBUG and ASAN builds for a couple of days over a weekend. Also ran an exhaustive private build.

Change-Id: I50cf43195a7c582caa02c85ae400ea2256fa3a3b
Reviewed-on: http://gerrit.cloudera.org:8080/3833
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Internal Jenkins
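The corruption step described above amounts to very little code. The following is a minimal, illustrative Python sketch of that strategy, not the actual test code; the corrupt_file helper and the FUZZ_SEED override variable are made-up names used only for illustration.

    import os
    import random

    def corrupt_file(src_path, dst_path, rng):
        # Copy one data file, corrupting the copy in one of two ways:
        # set a single byte to a random value, or truncate the file to
        # a random length.
        with open(src_path, 'rb') as f:
            data = bytearray(f.read())
        if data and rng.random() < 0.5:
            # Overwrite one byte at a random offset with a random value.
            data[rng.randrange(len(data))] = rng.randrange(256)
        else:
            # Truncate to a random length (possibly zero).
            data = data[:rng.randrange(len(data) + 1)]
        with open(dst_path, 'wb') as f:
            f.write(bytes(data))

    # Seed explicitly so failures are reproducible, and allow the seed to be
    # overridden (here via a hypothetical environment variable) so a reported
    # failure can be replayed.
    seed = int(os.environ.get('FUZZ_SEED', random.randrange(2**31)))
    print('Corrupting files with random seed %d' % seed)
    rng = random.Random(seed)

Each corrupted copy of the table is then scanned end to end at several batch_size settings, as described above, and the query either succeeds, returns an error, or exposes a crash or memory error in the scanner.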
====
---- QUERY
# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
# IMPALA-720: data file with multiple row groups
SELECT * from bad_parquet where field = "parquet"
---- TYPES
string
---- RESULTS
'parquet'
'parquet'
'parquet'
'parquet'
====
---- QUERY
SELECT count(distinct field) from bad_parquet
---- TYPES
bigint
---- RESULTS
1005
====
---- QUERY
# Parquet file with invalid metadata size in the file footer.
SELECT * from bad_metadata_len
---- CATCH
Invalid metadata size in file footer
====
---- QUERY
# Parquet file with invalid column dict_page_offset.
SELECT * from bad_dict_page_offset
---- CATCH
Column 0 has invalid data page offset (offset=100001 file_size=249)
====
---- QUERY
# Parquet file with invalid column total_compressed_size.
SELECT * from bad_compressed_size
---- CATCH
Column 0 has invalid column offsets (offset=4, size=1000000, file_size=245)
====
---- QUERY
# Parquet file with required fields.
select * from kite_required_fields
---- TYPES
bigint,bigint,string,string,boolean,boolean,bigint,bigint,bigint,bigint
---- RESULTS
1,2,'foo','bar',true,false,1,2,3,4
1,NULL,'foo','NULL',true,NULL,NULL,NULL,3,4
100,NULL,'foooo','NULL',false,NULL,NULL,NULL,300,400
====
---- QUERY
# Parquet file with invalid magic number
SELECT * from bad_magic_number
---- CATCH
File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid version number: XXXX
====
---- QUERY
# count(*) query on parquet file with multiple blocks (one block per node)
SELECT count(*) from lineitem_multiblock
---- TYPES
bigint
---- RESULTS
20000
====
---- QUERY
# count(*) query on parquet file with more than one block per node
SELECT count(*) from lineitem_sixblocks
---- TYPES
bigint
---- RESULTS
40000
====
---- QUERY
# Select multiple columns from parquet file with multiple blocks (one block per node)
SELECT count(l_comment), min(l_partkey), max(l_linenumber) from lineitem_multiblock;
---- TYPES
bigint, bigint, int
---- RESULTS
20000,2,7
====
---- QUERY
# Select multiple columns from parquet file with more than one block per node
SELECT count(l_comment), min(l_partkey), max(l_linenumber) from lineitem_sixblocks;
---- TYPES
bigint, bigint, int
---- RESULTS
40000,2,7
====
---- QUERY
# Test limit queries on parquet with multiple blocks (one block per node)
select distinct l_orderkey from lineitem_multiblock where
l_orderkey < 5 or l_orderkey > 15000 order by l_orderkey limit 20;
---- TYPES
bigint
---- RESULTS
1
2
3
4
15008
15009
15010
15011
15012
15013
15014
15015
15040
15041
15042
15043
15044
15045
15046
15047
====
---- QUERY
# Test limit queries on parquet with more than one block per node
select distinct l_orderkey from lineitem_sixblocks where
l_orderkey < 5 or l_orderkey > 15000 order by l_orderkey limit 20;
---- TYPES
bigint
---- RESULTS
1
2
3
4
15008
15009
15010
15011
15012
15013
15014
15015
15040
15041
15042
15043
15044
15045
15046
15047
====