mirror of
https://github.com/apache/impala.git
synced 2026-01-21 15:03:35 -05:00
Currently, top-level scalar columns in parquet files can be used at runtime to prune row-groups by evaluating certain conjuncts over the column's dictionary (if available). This change extends such pruning to scalar values that are stored in collection type columns. Currently, dictionary pruning works by finding eligible conjuncts for top-level slots. Since only top-level slots are supported, the slots are implicitly part of the scan node's tuple descriptor. With this change, we track eligible conjuncts by slot as well as the tuple that contains the slot (either top-level or nested collection). Since collection conjuncts are already managed by a map that associates tuple descriptors to a list of their conjuncts, this extension follows the existing representation. The frontend builds the mapping of SlotId to conjuncts that are dictionary filterable. This mapping now includes SlotId's that reference nested tuples. The backend is adjusted to use the same representation. In addition, collection readers are decomposed into scalar filterable columns and other, non-dictionary filterable readers. When filtering a row group using a conjunct associated to a (possibly) nested collection type, an additional tuple buffer is allocated per tuple descriptor. Testing: - e2e test extended to illustrate row-groups that are pruned by nested collection dictionary filters. Change-Id: If3a2abcfc3d0f7d18756816659fed77ce12668dd Reviewed-on: http://gerrit.cloudera.org:8080/8775 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins
13 KiB
13 KiB
| 1 | # Table level constraints: |
|---|---|
| 2 | # Allows for defining constraints on which file formats to generate for an individual |
| 3 | # table. The table name should match the base table name defined in the schema template |
| 4 | # file. |
| 5 | table_name:stringids, constraint:restrict_to, table_format:hbase/none/none |
| 6 | table_name:hbasecolumnfamilies, constraint:restrict_to, table_format:hbase/none/none |
| 7 | table_name:insertalltypesagg, constraint:restrict_to, table_format:hbase/none/none |
| 8 | table_name:alltypessmallbinary, constraint:restrict_to, table_format:hbase/none/none |
| 9 | table_name:insertalltypesaggbinary, constraint:restrict_to, table_format:hbase/none/none |
| 10 | table_name:hbasealltypeserror, constraint:restrict_to, table_format:hbase/none/none |
| 11 | table_name:hbasealltypeserrornonulls, constraint:restrict_to, table_format:hbase/none/none |
| 12 | table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none |
| 13 | table_name:stringpartitionkey, constraint:restrict_to, table_format:text/none/none |
| 14 | table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none |
| 15 | table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none |
| 16 | table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none |
| 17 | table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none |
| 18 | table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none |
| 19 | table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none |
| 20 | table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none |
| 21 | table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none |
| 22 | table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none |
| 23 | table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none |
| 24 | table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none |
| 25 | table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none |
| 26 | table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none |
| 27 | table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:parquet/none/none |
| 28 | table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:parquet/none/none |
| 29 | table_name:insert_string_partitioned, constraint:restrict_to, table_format:parquet/none/none |
| 30 | table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none |
| 31 | table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block |
| 32 | table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block |
| 33 | table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block |
| 34 | table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block |
| 35 | table_name:bad_avro_snap_floats, constraint:restrict_to, table_format:avro/snap/block |
| 36 | table_name:bad_avro_decimal_schema, constraint:restrict_to, table_format:avro/snap/block |
| 37 | table_name:bad_parquet, constraint:restrict_to, table_format:parquet/none/none |
| 38 | table_name:bad_parquet_strings_negative_len, constraint:restrict_to, table_format:parquet/none/none |
| 39 | table_name:bad_parquet_strings_out_of_bounds, constraint:restrict_to, table_format:parquet/none/none |
| 40 | table_name:bad_magic_number, constraint:restrict_to, table_format:parquet/none/none |
| 41 | table_name:bad_metadata_len, constraint:restrict_to, table_format:parquet/none/none |
| 42 | table_name:bad_dict_page_offset, constraint:restrict_to, table_format:parquet/none/none |
| 43 | table_name:bad_compressed_size, constraint:restrict_to, table_format:parquet/none/none |
| 44 | table_name:alltypesagg_hive_13_1, constraint:restrict_to, table_format:parquet/none/none |
| 45 | table_name:kite_required_fields, constraint:restrict_to, table_format:parquet/none/none |
| 46 | table_name:bad_column_metadata, constraint:restrict_to, table_format:parquet/none/none |
| 47 | table_name:lineitem_multiblock, constraint:restrict_to, table_format:parquet/none/none |
| 48 | table_name:lineitem_sixblocks, constraint:restrict_to, table_format:parquet/none/none |
| 49 | table_name:lineitem_multiblock_one_row_group, constraint:restrict_to, table_format:parquet/none/none |
| 50 | table_name:customer_multiblock, constraint:restrict_to, table_format:parquet/none/none |
| 51 | # TODO: Support Avro. Data loading currently fails for Avro because complex types |
| 52 | # cannot be converted to the corresponding Avro types yet. |
| 53 | table_name:allcomplextypes, constraint:restrict_to, table_format:text/none/none |
| 54 | table_name:allcomplextypes, constraint:restrict_to, table_format:parquet/none/none |
| 55 | table_name:allcomplextypes, constraint:restrict_to, table_format:hbase/none/none |
| 56 | table_name:functional, constraint:restrict_to, table_format:text/none/none |
| 57 | table_name:complextypes_fileformat, constraint:restrict_to, table_format:text/none/none |
| 58 | table_name:complextypes_fileformat, constraint:restrict_to, table_format:parquet/none/none |
| 59 | table_name:complextypes_fileformat, constraint:restrict_to, table_format:avro/snap/block |
| 60 | table_name:complextypes_fileformat, constraint:restrict_to, table_format:rc/snap/block |
| 61 | table_name:complextypes_fileformat, constraint:restrict_to, table_format:seq/snap/block |
| 62 | table_name:complextypes_multifileformat, constraint:restrict_to, table_format:text/none/none |
| 63 | # TODO: Avro |
| 64 | table_name:complextypestbl, constraint:restrict_to, table_format:parquet/none/none |
| 65 | table_name:alltypeserror, constraint:exclude, table_format:parquet/none/none |
| 66 | table_name:alltypeserrornonulls, constraint:exclude, table_format:parquet/none/none |
| 67 | table_name:unsupported_types, constraint:exclude, table_format:parquet/none/none |
| 68 | table_name:escapechartesttable, constraint:exclude, table_format:parquet/none/none |
| 69 | table_name:TblWithRaggedColumns, constraint:exclude, table_format:parquet/none/none |
| 70 | # the text_ tables are for testing delimiters and escape chars in text files |
| 71 | table_name:text_comma_backslash_newline, constraint:restrict_to, table_format:text/none/none |
| 72 | table_name:text_dollar_hash_pipe, constraint:restrict_to, table_format:text/none/none |
| 73 | table_name:text_thorn_ecirc_newline, constraint:restrict_to, table_format:text/none/none |
| 74 | table_name:bad_serde, constraint:restrict_to, table_format:text/none/none |
| 75 | table_name:rcfile_lazy_binary_serde, constraint:restrict_to, table_format:rc/none/none |
| 76 | table_name:unsupported_partition_types, constraint:restrict_to, table_format:text/none/none |
| 77 | table_name:nullformat_custom, constraint:exclude, table_format:parquet/none/none |
| 78 | table_name:alltypes_view, constraint:restrict_to, table_format:text/none/none |
| 79 | table_name:allcomplextypes_view, constraint:restrict_to, table_format:text/none/none |
| 80 | table_name:alltypes_view, constraint:restrict_to, table_format:seq/snap/block |
| 81 | table_name:alltypes_hive_view, constraint:restrict_to, table_format:text/none/none |
| 82 | table_name:alltypes_view_sub, constraint:restrict_to, table_format:text/none/none |
| 83 | table_name:alltypes_view_sub, constraint:restrict_to, table_format:seq/snap/block |
| 84 | table_name:alltypes_parens, constraint:restrict_to, table_format:text/none/none |
| 85 | table_name:complex_view, constraint:restrict_to, table_format:text/none/none |
| 86 | table_name:complex_view, constraint:restrict_to, table_format:seq/snap/block |
| 87 | table_name:view_view, constraint:restrict_to, table_format:text/none/none |
| 88 | table_name:view_view, constraint:restrict_to, table_format:seq/snap/block |
| 89 | table_name:subquery_view, constraint:restrict_to, table_format:seq/snap/block |
| 90 | table_name:subquery_view, constraint:restrict_to, table_format:rc/none/none |
| 91 | # liketbl and tblwithraggedcolumns both have |
| 92 | # NULLs in primary key columns. HBase does not support |
| 93 | # writing NULLs to primary key columns. |
| 94 | table_name:liketbl, constraint:exclude, table_format:hbase/none/none |
| 95 | table_name:tblwithraggedcolumns, constraint:exclude, table_format:hbase/none/none |
| 96 | # Tables with only one column are not supported in hbase. |
| 97 | table_name:greptiny, constraint:exclude, table_format:hbase/none/none |
| 98 | table_name:tinyinttable, constraint:exclude, table_format:hbase/none/none |
| 99 | # overflow uses a manually constructed text file which doesn't make sense to write to |
| 100 | # other table formats since the values that would be written are different (e.g. already |
| 101 | # truncated.) |
| 102 | table_name:overflow, constraint:restrict_to, table_format:text/none/none |
| 103 | # widerow has a single column with a single row containing a 10MB string. hbase doesn't |
| 104 | # seem to like this. |
| 105 | table_name:widerow, constraint:exclude, table_format:hbase/none/none |
| 106 | # nullformat_custom is used in null-insert tests, which use insert overwrite, |
| 107 | # which is not supported in hbase. The schema is also specified in HIVE_CREATE |
| 108 | # with no corresponding LOAD statement. |
| 109 | table_name:nullformat_custom, constraint:exclude, table_format:hbase/none/none |
| 110 | table_name:unsupported_types, constraint:exclude, table_format:hbase/none/none |
| 111 | # Decimal can only be tested on formats Impala can write to (text and parquet). |
| 112 | # TODO: add Avro once Hive or Impala can write Avro decimals |
| 113 | table_name:decimal_tbl, constraint:restrict_to, table_format:text/none/none |
| 114 | table_name:decimal_tiny, constraint:restrict_to, table_format:text/none/none |
| 115 | table_name:decimal_tbl, constraint:restrict_to, table_format:parquet/none/none |
| 116 | table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none |
| 117 | table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block |
| 118 | # TODO: the first set of tests is for text/none/none |
| 119 | table_name:chars_tiny, constraint:restrict_to, table_format:text/none/none |
| 120 | # invalid_decimal_part_tbl[1,2,3] tables are used for testing invalid decimal |
| 121 | # partition key values (see IMPALA-1040) |
| 122 | table_name:invalid_decimal_part_tbl1, constraint:restrict_to, table_format:text/none/none |
| 123 | table_name:invalid_decimal_part_tbl2, constraint:restrict_to, table_format:text/none/none |
| 124 | table_name:invalid_decimal_part_tbl3, constraint:restrict_to, table_format:text/none/none |
| 125 | table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block |
| 126 | # testescape tables are used for testing text scanner delimiter handling |
| 127 | table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none |
| 128 | table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none |
| 129 | table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none |
| 130 | table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none |
| 131 | table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none |
| 132 | table_name:testescape_17_crlf, constraint:restrict_to, table_format:text/none/none |
| 133 | table_name:testescape_32_lf, constraint:restrict_to, table_format:text/none/none |
| 134 | table_name:testescape_32_crlf, constraint:restrict_to, table_format:text/none/none |
| 135 | # alltimezones is used to verify that impala properly deals with timezones |
| 136 | table_name:alltimezones, constraint:restrict_to, table_format:text/none/none |
| 137 | # Avro schema is inferred from the column definitions (IMPALA-1136) |
| 138 | table_name:no_avro_schema, constraint:restrict_to, table_format:avro/snap/block |
| 139 | table_name:avro_unicode_nulls, constraint:restrict_to, table_format:avro/snap/block |
| 140 | # test single and multi stream bz2 files |
| 141 | table_name:bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block |
| 142 | table_name:large_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block |
| 143 | table_name:multistream_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block |
| 144 | table_name:large_multistream_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block |
| 145 | # Kudu can't handle certain types such as timestamp so we pick and choose the tables |
| 146 | # we actually use for Kudu related tests. |
| 147 | table_name:alltypes, constraint:only, table_format:kudu/none/none |
| 148 | table_name:alltypessmall, constraint:only, table_format:kudu/none/none |
| 149 | table_name:alltypestiny, constraint:only, table_format:kudu/none/none |
| 150 | table_name:alltypesagg, constraint:only, table_format:kudu/none/none |
| 151 | table_name:alltypesaggnonulls, constraint:only, table_format:kudu/none/none |
| 152 | table_name:testtbl, constraint:only, table_format:kudu/none/none |
| 153 | table_name:jointbl, constraint:only, table_format:kudu/none/none |
| 154 | table_name:emptytable, constraint:only, table_format:kudu/none/none |
| 155 | table_name:dimtbl, constraint:only, table_format:kudu/none/none |
| 156 | table_name:tinytable, constraint:only, table_format:kudu/none/none |
| 157 | table_name:tinyinttable, constraint:only, table_format:kudu/none/none |
| 158 | table_name:zipcode_incomes, constraint:only, table_format:kudu/none/none |
| 159 | table_name:nulltable, constraint:only, table_format:kudu/none/none |
| 160 | table_name:nullescapedtable, constraint:only, table_format:kudu/none/none |
| 161 | # Skipping header lines is only effective with text tables |
| 162 | table_name:table_with_header, constraint:restrict_to, table_format:text/none/none |
| 163 | table_name:table_with_header_2, constraint:restrict_to, table_format:text/none/none |
| 164 | table_name:table_with_header_insert, constraint:restrict_to, table_format:text/none/none |
| 165 | # We also test that skipping header lines works on compressed tables (IMPALA-5287) |
| 166 | table_name:table_with_header, constraint:restrict_to, table_format:text/gzip/block |
| 167 | table_name:table_with_header_2, constraint:restrict_to, table_format:text/gzip/block |
| 168 | table_name:table_with_header_insert, constraint:restrict_to, table_format:text/gzip/block |
| 169 | # Inserting into parquet tables should not be affected by the 'skip.header.line.count' |
| 170 | # property, so we test parquet format as well. |
| 171 | table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none |