mirror of
https://github.com/apache/impala.git
synced 2025-12-30 12:02:10 -05:00
This patch modifies the Parquet scanner to resolve nested schemas, and read and materialize collection types. The high-level modification is to create a CollectionColumnReader that recursively materializes map- and array-type slots. This patch also adds many tests, most of which query a new table called complextypestbl. This table contains hand-generated data that is meant to expose edge cases in the scanner. The tests mostly test the scanner, with a few tests of other functionality (e.g. array serialization). I ran a local benchmark comparing this scanner code to the original scanner code on an expanded version of tpch_parquet.lineitem with 48009720 rows. My benchmark involved selecting different numbers of columns with a single scanner thread, and I looked at the HDFS scan node time in the query profiles. This code introduces a 10%-20% regression in single-threaded scan time. Change-Id: Id27fb728934e8346444f61752c9278d8010e5f3a Reviewed-on: http://gerrit.cloudera.org:8080/576 Reviewed-by: Alex Behm <alex.behm@cloudera.com> Tested-by: Internal Jenkins
89 lines
1.8 KiB
JSON
89 lines
1.8 KiB
JSON
[
|
|
{"id": 1,
|
|
"int_array": [1,2,3],
|
|
"int_array_array": [[1,2],[3,4]],
|
|
"int_map": {"k1": 1, "k2": 100},
|
|
"int_map_array": [{"k1": 1}],
|
|
"nested_struct": {
|
|
"a": 1,
|
|
"b": [1],
|
|
"c": {
|
|
"d": [
|
|
[{"e": 10, "f": "aaa"},
|
|
{"e": -10, "f": "bbb"}],
|
|
[{"e": 11, "f": "c"}]]},
|
|
"g": {"foo": {"h": {"i": [1.1]}}}}},
|
|
{"id": 2,
|
|
"int_array": [null,1,2,null,3,null],
|
|
"int_array_array": [[null,1,2,null],[3,null,4],[], null],
|
|
"int_map": {"k1": 2, "k2": null},
|
|
"int_map_array": [{"k3": null, "k1": 1}, null, {}],
|
|
"nested_struct": {
|
|
"a": null,
|
|
"b": [null],
|
|
"c": {"d": [
|
|
[{"e": null, "f": null},
|
|
{"e": 10, "f": "aaa"},
|
|
{"e": null, "f": null},
|
|
{"e": -10, "f": "bbb"},
|
|
{"e": null, "f": null}],
|
|
[{"e": 11, "f": "c"},
|
|
null],
|
|
[],
|
|
null]},
|
|
"g": {
|
|
"g1": {"h": {"i": [2.2, null]}},
|
|
"g2": {"h": {"i": []}},
|
|
"g3": null,
|
|
"g4": {"h": {"i": null}},
|
|
"g5": {"h": null}}}},
|
|
{"id": 3,
|
|
"int_array": [],
|
|
"int_array_array": [null],
|
|
"int_map": {},
|
|
"int_map_array": [null, null],
|
|
"nested_struct": {
|
|
"a": null,
|
|
"b": null,
|
|
"c": {"d": []},
|
|
"g": {}}},
|
|
{"id": 4,
|
|
"int_array": null,
|
|
"int_array_array": [],
|
|
"int_map": {},
|
|
"int_map_array": [],
|
|
"nested_struct": {
|
|
"a": null,
|
|
"b": null,
|
|
"c": {"d": null},
|
|
"g": null}},
|
|
{"id": 5,
|
|
"int_array": null,
|
|
"int_array_array": null,
|
|
"int_map": {},
|
|
"nested_struct": {
|
|
"a": null,
|
|
"b": null,
|
|
"c": null,
|
|
"g": {"foo": {"h": {"i": [2.2, 3.3]}}}}},
|
|
{"id": 6,
|
|
"int_array": null,
|
|
"int_array_array": null,
|
|
"int_map": null,
|
|
"int_map_array": null,
|
|
"nested_struct": null},
|
|
{"id": 7,
|
|
"int_array": null,
|
|
"int_array_array": [null,[5,6]],
|
|
"int_map": {"k1": null, "k3": null},
|
|
"int_map_array": null,
|
|
"nested_struct": {
|
|
"a": 7,
|
|
"b": [2,3,null],
|
|
"c": {"d": [
|
|
[],
|
|
[null],
|
|
null]},
|
|
"g": null}}
|
|
]
|