mirror of
https://github.com/apache/impala.git
synced 2026-01-30 15:00:18 -05:00
This patch adds error checking to the Avro scanner (both the codegen'd and interpreted paths), including out-of-bounds checks and data validity checks. I ran a local benchmark using the following queries: set num_scanner_threads=1; select count(i) from default.avro_bigints_big; # file contains only longs select max(l_orderkey) from biglineitem_avro; # file has tpch.lineitem schema Both benchmark queries see negligible or no performance impact. This patch adds a new Avro scanner unit test and an end-to-end test that queries several corrupted files, as well as updates the zig-zag varlen int unit test. Change-Id: I801a11c496a128e02c564c2a9c44baa5a97be132 Reviewed-on: http://gerrit.cloudera.org:8080/3072 Reviewed-by: Dan Hecht <dhecht@cloudera.com> Tested-by: Internal Jenkins
25 lines
1009 B
Plaintext
25 lines
1009 B
Plaintext
====
|
|
---- QUERY
|
|
# Read from the corrupt files. We may get partial results.
|
|
select * from bad_avro_snap_strings
|
|
---- RESULTS
|
|
---- TYPES
|
|
string
|
|
---- ERRORS
|
|
row_regex: .*Problem parsing file.*
|
|
row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/truncated_string.avro' is corrupt: truncated data block at offset 155.*
|
|
row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/negative_string_len.avro' is corrupt: invalid length -7 at offset 164.*
|
|
row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid union value 4 at offset 174.*
|
|
row_regex: .*File '.*/bad_avro_snap_strings_avro_snap/invalid_union.avro' is corrupt: invalid encoded integer at offset 191.*
|
|
====
|
|
---- QUERY
|
|
# Read from the corrupt files. We may get partial results.
|
|
select * from bad_avro_snap_floats
|
|
---- RESULTS
|
|
---- TYPES
|
|
float
|
|
---- ERRORS
|
|
row_regex: .*Problem parsing file.*
|
|
row_regex: .*File '.*/bad_avro_snap_floats_avro_snap/truncated_float.avro' is corrupt: truncated data block at offset 159.*
|
|
====
|