Files
impala/testdata/datasets/functional/schema_constraints.csv
stiga-huang 818cd8fa27 IMPALA-5717: Support for reading ORC data files
This patch integrates the orc library into Impala and implements
HdfsOrcScanner as a middle layer between them. The HdfsOrcScanner
supplies input needed from the orc-reader, tracks memory consumption of
the reader and transfers the reader's output (orc::ColumnVectorBatch)
into impala::RowBatch. The ORC version we used is release-1.4.3.

A startup option --enable_orc_scanner is added for this feature. It's
set to true by default. Setting it to false will fail queries on ORC
tables.

Currently, we only support reading primitive types. Writing into ORC
tables is not supported yet either.

Tests
 - Most of the end-to-end tests can run on ORC format.
 - Add tpcds, tpch tests for ORC.
 - Add some ORC specific tests.
 - Haven't enabled test_scanner_fuzz for ORC yet, since the ORC library
   is not robust against corrupt files (ORC-315).

Change-Id: Ia7b6ae4ce3b9ee8125b21993702faa87537790a4
Reviewed-on: http://gerrit.cloudera.org:8080/9134
Reviewed-by: Quanlong Huang <huangquanlong@gmail.com>
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-04-11 05:13:02 +00:00

208 lines
14 KiB
CSV

# Table level constraints:
# Allows for defining constraints on which file formats to generate for an individual
# table. The table name should match the base table name defined in the schema template
# file.
table_name:stringids, constraint:restrict_to, table_format:hbase/none/none
table_name:hbasecolumnfamilies, constraint:restrict_to, table_format:hbase/none/none
table_name:insertalltypesagg, constraint:restrict_to, table_format:hbase/none/none
table_name:alltypessmallbinary, constraint:restrict_to, table_format:hbase/none/none
table_name:insertalltypesaggbinary, constraint:restrict_to, table_format:hbase/none/none
table_name:hbasealltypeserror, constraint:restrict_to, table_format:hbase/none/none
table_name:hbasealltypeserrornonulls, constraint:restrict_to, table_format:hbase/none/none
table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none
table_name:stringpartitionkey, constraint:restrict_to, table_format:text/none/none
table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none
table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none
table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none
table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none
table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none
table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none
table_name:alltypesinsert, constraint:restrict_to, table_format:text/none/none
table_name:alltypesnopart_insert, constraint:restrict_to, table_format:text/none/none
table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:text/none/none
table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:text/none/none
table_name:insert_string_partitioned, constraint:restrict_to, table_format:text/none/none
table_name:alltypesinsert, constraint:restrict_to, table_format:parquet/none/none
table_name:alltypesnopart_insert, constraint:restrict_to, table_format:parquet/none/none
table_name:insert_overwrite_nopart, constraint:restrict_to, table_format:parquet/none/none
table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:parquet/none/none
table_name:insert_string_partitioned, constraint:restrict_to, table_format:parquet/none/none
table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none
table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block
table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block
table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block
table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block
table_name:bad_avro_snap_floats, constraint:restrict_to, table_format:avro/snap/block
table_name:bad_avro_decimal_schema, constraint:restrict_to, table_format:avro/snap/block
table_name:bad_parquet, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_parquet_strings_negative_len, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_parquet_strings_out_of_bounds, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_magic_number, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_metadata_len, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_dict_page_offset, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_compressed_size, constraint:restrict_to, table_format:parquet/none/none
table_name:alltypesagg_hive_13_1, constraint:restrict_to, table_format:parquet/none/none
table_name:kite_required_fields, constraint:restrict_to, table_format:parquet/none/none
table_name:bad_column_metadata, constraint:restrict_to, table_format:parquet/none/none
table_name:lineitem_multiblock, constraint:restrict_to, table_format:parquet/none/none
table_name:lineitem_sixblocks, constraint:restrict_to, table_format:parquet/none/none
table_name:lineitem_multiblock_one_row_group, constraint:restrict_to, table_format:parquet/none/none
table_name:customer_multiblock, constraint:restrict_to, table_format:parquet/none/none
# TODO: Support Avro. Data loading currently fails for Avro because complex types
# cannot be converted to the corresponding Avro types yet.
table_name:allcomplextypes, constraint:restrict_to, table_format:text/none/none
table_name:allcomplextypes, constraint:restrict_to, table_format:parquet/none/none
table_name:allcomplextypes, constraint:restrict_to, table_format:hbase/none/none
table_name:functional, constraint:restrict_to, table_format:text/none/none
table_name:complextypes_fileformat, constraint:restrict_to, table_format:text/none/none
table_name:complextypes_fileformat, constraint:restrict_to, table_format:parquet/none/none
table_name:complextypes_fileformat, constraint:restrict_to, table_format:avro/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:rc/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:seq/snap/block
table_name:complextypes_fileformat, constraint:restrict_to, table_format:orc/def/block
table_name:complextypes_multifileformat, constraint:restrict_to, table_format:text/none/none
# TODO: Avro
table_name:complextypestbl, constraint:restrict_to, table_format:parquet/none/none
table_name:alltypeserror, constraint:exclude, table_format:parquet/none/none
table_name:alltypeserrornonulls, constraint:exclude, table_format:parquet/none/none
table_name:unsupported_types, constraint:exclude, table_format:parquet/none/none
table_name:escapechartesttable, constraint:exclude, table_format:parquet/none/none
table_name:TblWithRaggedColumns, constraint:exclude, table_format:parquet/none/none
# the text_ tables are for testing delimiters and escape chars in text files
table_name:text_comma_backslash_newline, constraint:restrict_to, table_format:text/none/none
table_name:text_dollar_hash_pipe, constraint:restrict_to, table_format:text/none/none
table_name:text_thorn_ecirc_newline, constraint:restrict_to, table_format:text/none/none
table_name:bad_serde, constraint:restrict_to, table_format:text/none/none
table_name:rcfile_lazy_binary_serde, constraint:restrict_to, table_format:rc/none/none
table_name:unsupported_partition_types, constraint:restrict_to, table_format:text/none/none
table_name:nullformat_custom, constraint:exclude, table_format:parquet/none/none
table_name:alltypes_view, constraint:restrict_to, table_format:text/none/none
table_name:allcomplextypes_view, constraint:restrict_to, table_format:text/none/none
table_name:alltypes_view, constraint:restrict_to, table_format:seq/snap/block
table_name:alltypes_hive_view, constraint:restrict_to, table_format:text/none/none
table_name:alltypes_view_sub, constraint:restrict_to, table_format:text/none/none
table_name:alltypes_view_sub, constraint:restrict_to, table_format:seq/snap/block
table_name:alltypes_parens, constraint:restrict_to, table_format:text/none/none
table_name:complex_view, constraint:restrict_to, table_format:text/none/none
table_name:complex_view, constraint:restrict_to, table_format:seq/snap/block
table_name:view_view, constraint:restrict_to, table_format:text/none/none
table_name:view_view, constraint:restrict_to, table_format:seq/snap/block
table_name:subquery_view, constraint:restrict_to, table_format:seq/snap/block
table_name:subquery_view, constraint:restrict_to, table_format:rc/none/none
# liketbl and tblwithraggedcolumns both have
# NULLs in primary key columns. hbase does not support
# writing NULLs to primary key columns.
table_name:liketbl, constraint:exclude, table_format:hbase/none/none
table_name:tblwithraggedcolumns, constraint:exclude, table_format:hbase/none/none
# Tables with only one column are not supported in hbase.
table_name:greptiny, constraint:exclude, table_format:hbase/none/none
table_name:tinyinttable, constraint:exclude, table_format:hbase/none/none
# overflow uses a manually constructed text file which doesn't make sense to write to
# other table formats since the values that would be written are different (e.g. already
# truncated).
table_name:overflow, constraint:restrict_to, table_format:text/none/none
# widerow has a single column with a single row containing a 10MB string. hbase doesn't
# seem to like this.
table_name:widerow, constraint:exclude, table_format:hbase/none/none
# nullformat_custom is used in null-insert tests, which use insert overwrite,
# which is not supported in hbase. The schema is also specified in HIVE_CREATE
# with no corresponding LOAD statement.
table_name:nullformat_custom, constraint:exclude, table_format:hbase/none/none
table_name:unsupported_types, constraint:exclude, table_format:hbase/none/none
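# Decimal can only be tested on formats Impala can write to (text and parquet).
# NOTE(review): kudu and orc are also listed for the decimal tables below, so the
# "text and parquet" list above looks out of date -- confirm and update.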
# TODO: add Avro once Hive or Impala can write Avro decimals
table_name:decimal_tbl, constraint:restrict_to, table_format:text/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:text/none/none
table_name:decimal_tbl, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:parquet/none/none
table_name:decimal_tbl, constraint:restrict_to, table_format:kudu/none/none
table_name:decimal_tiny, constraint:restrict_to, table_format:kudu/none/none
table_name:decimal_tbl, constraint:restrict_to, table_format:orc/def/block
table_name:decimal_tiny, constraint:restrict_to, table_format:orc/def/block
table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
# TODO: the first set of tests is for text/none/none only
table_name:chars_tiny, constraint:restrict_to, table_format:text/none/none
# invalid_decimal_part_tbl[1,2,3] tables are used for testing invalid decimal
# partition key values (see IMPALA-1040)
table_name:invalid_decimal_part_tbl1, constraint:restrict_to, table_format:text/none/none
table_name:invalid_decimal_part_tbl2, constraint:restrict_to, table_format:text/none/none
table_name:invalid_decimal_part_tbl3, constraint:restrict_to, table_format:text/none/none
table_name:avro_decimal_tbl, constraint:restrict_to, table_format:avro/snap/block
# testescape tables are used for testing text scanner delimiter handling
table_name:table_no_newline, constraint:restrict_to, table_format:text/none/none
table_name:table_no_newline_part, constraint:restrict_to, table_format:text/none/none
table_name:testescape_16_lf, constraint:restrict_to, table_format:text/none/none
table_name:testescape_16_crlf, constraint:restrict_to, table_format:text/none/none
table_name:testescape_17_lf, constraint:restrict_to, table_format:text/none/none
table_name:testescape_17_crlf, constraint:restrict_to, table_format:text/none/none
table_name:testescape_32_lf, constraint:restrict_to, table_format:text/none/none
table_name:testescape_32_crlf, constraint:restrict_to, table_format:text/none/none
# alltimezones is used to verify that impala properly deals with timezones
table_name:alltimezones, constraint:restrict_to, table_format:text/none/none
# Avro schema is inferred from the column definitions (IMPALA-1136)
table_name:no_avro_schema, constraint:restrict_to, table_format:avro/snap/block
table_name:avro_unicode_nulls, constraint:restrict_to, table_format:avro/snap/block
# test single and multi stream bz2 files
table_name:bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block
table_name:large_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block
table_name:multistream_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block
table_name:large_multistream_bzip2_tbl, constraint:restrict_to, table_format:text/bzip/block
# Kudu can't handle certain types such as timestamp so we pick and choose the tables
# we actually use for Kudu related tests.
table_name:alltypes, constraint:only, table_format:kudu/none/none
table_name:alltypessmall, constraint:only, table_format:kudu/none/none
table_name:alltypestiny, constraint:only, table_format:kudu/none/none
table_name:alltypesagg, constraint:only, table_format:kudu/none/none
table_name:alltypesaggnonulls, constraint:only, table_format:kudu/none/none
table_name:testtbl, constraint:only, table_format:kudu/none/none
table_name:jointbl, constraint:only, table_format:kudu/none/none
table_name:emptytable, constraint:only, table_format:kudu/none/none
table_name:dimtbl, constraint:only, table_format:kudu/none/none
table_name:tinytable, constraint:only, table_format:kudu/none/none
table_name:tinyinttable, constraint:only, table_format:kudu/none/none
table_name:zipcode_incomes, constraint:only, table_format:kudu/none/none
table_name:nulltable, constraint:only, table_format:kudu/none/none
table_name:nullescapedtable, constraint:only, table_format:kudu/none/none
table_name:decimal_tbl, constraint:only, table_format:kudu/none/none
table_name:decimal_tiny, constraint:only, table_format:kudu/none/none
# Skipping header lines is only effective with text tables
table_name:table_with_header, constraint:restrict_to, table_format:text/none/none
table_name:table_with_header_2, constraint:restrict_to, table_format:text/none/none
table_name:table_with_header_insert, constraint:restrict_to, table_format:text/none/none
# We also test that skipping header lines works on compressed tables (IMPALA-5287)
table_name:table_with_header, constraint:restrict_to, table_format:text/gzip/block
table_name:table_with_header_2, constraint:restrict_to, table_format:text/gzip/block
table_name:table_with_header_insert, constraint:restrict_to, table_format:text/gzip/block
# Inserting into parquet tables should not be affected by the 'skip.header.line.count'
# property, so we test parquet format as well.
table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none