Files
impala/testdata/datasets/functional/schema_constraints.csv
Skye Wanderman-Milne bcc73a36da Nested types: read and materialize nested types in Parquet scanner
This patch modifies the Parquet scanner to resolve nested schemas, and
read and materialize collection types. The high-level modification is
to create a CollectionColumnReader that recursively materializes map-
and array-type slots.

This patch also adds many tests, most of which query a new table
called complextypestbl. This table contains hand-generated data that
is meant to expose edge cases in the scanner. The tests mostly test
the scanner, with a few tests of other functionality (e.g. array
serialization).

I ran a local benchmark comparing this scanner code to the original
scanner code on an expanded version of tpch_parquet.lineitem with
48009720 rows. My benchmark involved selecting different numbers of
columns with a single scanner thread, and I looked at the HDFS scan
node time in the query profiles. This code introduces a 10%-20%
regression in single-threaded scan time.

Change-Id: Id27fb728934e8346444f61752c9278d8010e5f3a
Reviewed-on: http://gerrit.cloudera.org:8080/576
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
2015-09-02 19:23:54 +00:00

9.5 KiB

1# Table level constraints:
2# Allows for defining constraints on which file formats to generate for an individual
3# table. The table name should match the base table name defined in the schema template
4# file.
5table_name:stringids, constraint:restrict_to, table_format:hbase/none/none
6table_name:hbasecolumnfamilies, constraint:restrict_to, table_format:hbase/none/none
7table_name:insertalltypesagg, constraint:restrict_to, table_format:hbase/none/none
8table_name:alltypessmallbinary, constraint:restrict_to, table_format:hbase/none/none
9table_name:insertalltypesaggbinary, constraint:restrict_to, table_format:hbase/none/none
10table_name:hbasealltypeserror, constraint:restrict_to, table_format:hbase/none/none
11table_name:hbasealltypeserrornonulls, constraint:restrict_to, table_format:hbase/none/none
12table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none
13table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none
14table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none
15table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none
16table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none
17table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none
18table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none
19table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none
20table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none
21table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none
22table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none
23table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none
24table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none
25table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none
26table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:parquet/none/none
27table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:parquet/none/none
28table_name:insert_string_partitioned, constraint:restrict_to, table_format:parquet/none/none
29table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none
30table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block
31table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block
32table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block
33table_name:bad_parquet, constraint:restrict_to, table_format:parquet/none/none
34table_name:bad_magic_number, constraint:restrict_to, table_format:parquet/none/none
35table_name:bad_metadata_len, constraint:restrict_to, table_format:parquet/none/none
36table_name:bad_dict_page_offset, constraint:restrict_to, table_format:parquet/none/none
37table_name:bad_compressed_size, constraint:restrict_to, table_format:parquet/none/none
38table_name:alltypesagg_hive_13_1, constraint:restrict_to, table_format:parquet/none/none
39table_name:kite_required_fields, constraint:restrict_to, table_format:parquet/none/none
40# TODO: Support Avro. Data loading currently fails for Avro because complex types
41# cannot be converted to the corresponding Avro types yet.
42table_name:allcomplextypes, constraint:restrict_to, table_format:text/none/none
43table_name:allcomplextypes, constraint:restrict_to, table_format:parquet/none/none
44table_name:allcomplextypes, constraint:restrict_to, table_format:hbase/none/none
45table_name:functional, constraint:restrict_to, table_format:text/none/none
46table_name:complextypes_fileformat, constraint:restrict_to, table_format:text/none/none
47table_name:complextypes_fileformat, constraint:restrict_to, table_format:parquet/none/none
48table_name:complextypes_fileformat, constraint:restrict_to, table_format:avro/snap/block
49table_name:complextypes_fileformat, constraint:restrict_to, table_format:rc/snap/block
50table_name:complextypes_fileformat, constraint:restrict_to, table_format:seq/snap/block
51table_name:complextypes_multifileformat, constraint:restrict_to, table_format:text/none/none
52# TODO: Avro
53table_name:complextypestbl, constraint:restrict_to, table_format:parquet/none/none
54table_name:alltypeserror, constraint:exclude, table_format:parquet/none/none
55table_name:alltypeserrornonulls, constraint:exclude, table_format:parquet/none/none
56table_name:unsupported_types, constraint:exclude, table_format:parquet/none/none
57table_name:escapechartesttable, constraint:exclude, table_format:parquet/none/none
58table_name:TblWithRaggedColumns, constraint:exclude, table_format:parquet/none/none
59# The text_ tables are for testing delimiters and escape chars in text files
60table_name:text_comma_backslash_newline, constraint:restrict_to, table_format:text/none/none
61table_name:text_dollar_hash_pipe, constraint:restrict_to, table_format:text/none/none
62table_name:text_thorn_ecirc_newline, constraint:restrict_to, table_format:text/none/none
63table_name:bad_serde, constraint:restrict_to, table_format:text/none/none
64table_name:rcfile_lazy_binary_serde, constraint:restrict_to, table_format:rc/none/none
65table_name:unsupported_partition_types, constraint:restrict_to, table_format:text/none/none
66table_name:nullformat_custom, constraint:exclude, table_format:parquet/none/none
67table_name:alltypes_view, constraint:restrict_to, table_format:text/none/none
68table_name:allcomplextypes_view, constraint:restrict_to, table_format:text/none/none
69table_name:alltypes_view, constraint:restrict_to, table_format:seq/snap/block
70table_name:alltypes_hive_view, constraint:restrict_to, table_format:text/none/none
71table_name:alltypes_view_sub, constraint:restrict_to, table_format:text/none/none
72table_name:alltypes_view_sub, constraint:restrict_to, table_format:seq/snap/block
73table_name:alltypes_parens, constraint:restrict_to, table_format:text/none/none
74table_name:complex_view, constraint:restrict_to, table_format:text/none/none
75table_name:complex_view, constraint:restrict_to, table_format:seq/snap/block
76table_name:view_view, constraint:restrict_to, table_format:text/none/none
77table_name:view_view, constraint:restrict_to, table_format:seq/snap/block
78# liketbl and tblwithraggedcolumns both have
79# NULLs in primary key columns. hbase does not support
80# writing NULLs to primary key columns.
81table_name:liketbl, constraint:exclude, table_format:hbase/none/none
82table_name:tblwithraggedcolumns, constraint:exclude, table_format:hbase/none/none
83# Tables with only one column are not supported in hbase.
84table_name:greptiny, constraint:exclude, table_format:hbase/none/none
85table_name:tinyinttable, constraint:exclude, table_format:hbase/none/none
86# overflow has a bigint that's too big. hbase may lose precision, hence this
87# table cannot be loaded.
88table_name:overflow, constraint:exclude, table_format:hbase/none/none
89# widerow has a single column with a single row containing a 10MB string. hbase doesn't
90# seem to like this.
91table_name:widerow, constraint:exclude, table_format:hbase/none/none
92# nullformat_custom is used in null-insert tests, which use insert overwrite,
93# which is not supported in hbase. The schema is also specified in HIVE_CREATE
94# with no corresponding LOAD statement.
95table_name:nullformat_custom, constraint:exclude, table_format:hbase/none/none
96table_name:unsupported_types, constraint:exclude, table_format:hbase/none/none
97# On CDH4, decimal can only be tested on formats Impala can write to (text and parquet)
98# TODO: add Avro once Hive or Impala can write Avro decimals
99table_name:decimal_tbl, constraint:restrict_to, table_format:text/none/none
100table_name:decimal_tiny, constraint:restrict_to, table_format:text/none/none
101table_name:decimal_tbl, constraint:restrict_to, table_format:parquet/none/none
102table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none
103table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
104# TODO: the first set of tests is for text/none/none only
105table_name:chars_tiny, constraint:restrict_to, table_format:text/none/none
106# invalid_decimal_part_tbl[1,2,3] tables are used for testing invalid decimal
107# partition key values (see IMPALA-1040)
108table_name:invalid_decimal_part_tbl1, constraint:restrict_to, table_format:text/none/none
109table_name:invalid_decimal_part_tbl2, constraint:restrict_to, table_format:text/none/none
110table_name:invalid_decimal_part_tbl3, constraint:restrict_to, table_format:text/none/none
111table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
112# testescape tables are used for testing text scanner delimiter handling
113table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none
114table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none
115table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none
116table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none
117table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none
118table_name:testescape_17_crlf, constraint:restrict_to, table_format:text/none/none
119table_name:testescape_32_lf, constraint:restrict_to, table_format:text/none/none
120table_name:testescape_32_crlf, constraint:restrict_to, table_format:text/none/none
121# alltimezones is used to verify that impala properly deals with timezones
122table_name:alltimezones, constraint:restrict_to, table_format:text/none/none
123# Avro schema is inferred from the column definitions (IMPALA-1136)
124table_name:no_avro_schema, constraint:restrict_to, table_format:avro/snap/block
125table_name:avro_unicode_nulls, constraint:restrict_to, table_format:avro/snap/block