====
---- QUERY
# Returns all results despite a discrepancy between the number of values
# scanned and the number of values stored in the file metadata.
# Set a single node and scanner thread to make this test deterministic.
set num_nodes=1;
set num_scanner_threads=1;
select id, cnt from bad_column_metadata t, (select count(*) cnt from t.int_array) v
---- TYPES
bigint,bigint
---- RESULTS
1,10
2,10
3,10
4,10
5,10
6,10
7,10
8,10
9,10
10,10
11,10
12,10
13,10
14,10
15,10
16,10
17,10
18,10
19,10
20,10
21,10
22,10
23,10
24,10
25,10
26,10
27,10
28,10
29,10
30,10
---- ERRORS
Column metadata states there are 50 values, but read 100 values from column element. file: hdfs://regex:.$
====
---- QUERY
# Same as above but only selecting a single scalar column.
set num_nodes=1;
set num_scanner_threads=1;
select id from bad_column_metadata
---- TYPES
bigint
---- RESULTS
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
---- ERRORS
Column metadata states there are 11 values, but read 10 values from column id. file: hdfs://regex:.$
====
---- QUERY
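# Scanning Parquet files that contain negative string lengths (one
# plain-encoded, one dict-encoded) should return no rows and report
# both files as corrupt.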
select * from bad_parquet_strings_negative_len
---- TYPES
STRING
---- RESULTS
---- ERRORS
row_regex: .*File '.*/plain-encoded-negative-len.parq' is corrupt: error decoding value of type STRING at offset 58.*
row_regex: .*File '.*/dict-encoded-negative-len.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary.*
====
---- QUERY
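# Scanning Parquet files whose string data overruns the data page buffer
# (one plain-encoded, one dict-encoded) should likewise return no rows
# and report both files as corrupt.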
select * from bad_parquet_strings_out_of_bounds
---- TYPES
STRING
---- RESULTS
---- ERRORS
row_regex: .*File '.*/plain-encoded-out-of-bounds.parq' is corrupt: error decoding value of type STRING at offset 58.*
row_regex: .*File '.*/dict-encoded-out-of-bounds.parq' is corrupt: error reading dictionary for data of type STRING: could not decode dictionary.*
====