Files
impala/testdata/workloads/functional-query/queries/QueryTest/seq-writer.test
Attila Jeges 59b2db6ba7 IMPALA-3079: Fix sequence file writer
This change fixes the following issues in the Sequence File Writer:
1. ReadWriteUtil::VLongRequiredBytes() and ReadWriteUtil::PutVLong()
   were broken. As a result, Impala created corrupt uncompressed
   sequence files.

2. KEY_CLASS_NAME was missing from the sequence file header. As a
   result, Hive could not read back uncompressed sequence files
   created by Impala.

3. Impala created record-compressed sequence files with an empty keys
   block. As a result, Hive could not read back record-compressed
   sequence files created by Impala.

4. Impala created block-compressed files with:
   - empty key-lengths block
   - empty keys block
   - empty value-lengths block
   This resulted in invalid block-compressed sequence files that Hive could
   not read back.

5. In some cases the wrong Record-compression flag was written to the
   sequence file header. As a result, Hive could not read back record-
   compressed sequence files created by Impala.

6. Impala added 'sync_marker' instead of 'neg1_sync_marker' to the
   beginning of blocks in block-compressed sequence files. Hive could
   not read these files back.

7. The calculation of block sizes in SnappyBlockCompressor class was
   incorrect for odd-length buffers.

Change-Id: I0db642ad35132a9a5a6611810a6cafbbe26e7487
Reviewed-on: http://gerrit.cloudera.org:8080/6107
Reviewed-by: Michael Ho <kwho@cloudera.com>
Reviewed-by: Attila Jeges <attilaj@cloudera.com>
Reviewed-by: Dan Hecht <dhecht@cloudera.com>
Tested-by: Impala Public Jenkins
2017-04-25 21:07:53 +00:00

281 lines
7.6 KiB
Plaintext

====
---- QUERY
# Create the three-column sequence-file table that the single-row insert sections
# below write into. The SET options are listed in the same order (codec, mode,
# allow-unsupported) as in the sections that follow.
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table __seq_write (i int, s string, d double)
stored as SEQUENCEFILE;
====
---- QUERY
# Write row 0 using COMPRESSION_CODEC=NONE with SEQ_COMPRESSION_MODE=BLOCK.
# This section inserts through INSERT ... SELECT of constants rather than VALUES.
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write select 0, "a", 1.1;
====
---- QUERY
# Write row 1 using COMPRESSION_CODEC=DEFAULT with SEQ_COMPRESSION_MODE=BLOCK.
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (1, "b", 2.2);
====
---- QUERY
# Write row 2 using COMPRESSION_CODEC=SNAPPY with SEQ_COMPRESSION_MODE=BLOCK.
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (2, "c", 3.3);
====
---- QUERY
# Write row 3 using COMPRESSION_CODEC=SNAPPY_BLOCKED with SEQ_COMPRESSION_MODE=BLOCK.
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (3, "d", 4.4);
====
---- QUERY
# Write row 4 using COMPRESSION_CODEC=GZIP with SEQ_COMPRESSION_MODE=BLOCK.
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (4, "e", 5.5);
====
---- QUERY
# Write row 5 using COMPRESSION_CODEC=NONE with SEQ_COMPRESSION_MODE=RECORD.
# This section inserts through INSERT ... SELECT of constants rather than VALUES.
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write select 5, "a", 1.1;
====
---- QUERY
# Write row 6 using COMPRESSION_CODEC=DEFAULT with SEQ_COMPRESSION_MODE=RECORD.
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (6, "b", 2.2);
====
---- QUERY
# Write row 7 using COMPRESSION_CODEC=SNAPPY with SEQ_COMPRESSION_MODE=RECORD.
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (7, "c", 3.3);
====
---- QUERY
# Write row 8 using COMPRESSION_CODEC=SNAPPY_BLOCKED with SEQ_COMPRESSION_MODE=RECORD.
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (8, "d", 4.4);
====
---- QUERY
# Write row 9 using COMPRESSION_CODEC=GZIP with SEQ_COMPRESSION_MODE=RECORD.
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (9, "e", 5.5);
====
---- QUERY
# Negative test: with ALLOW_UNSUPPORTED_FORMATS=0 the insert is expected to fail with
# the error text listed under CATCH.
# NOTE(review): the CATCH text ends mid-sentence, so it is presumably matched as a
# substring of the actual error message - confirm against the test harness.
SET ALLOW_UNSUPPORTED_FORMATS=0;
insert into __seq_write values (4, "e", 5.5);
---- CATCH
Writing to table format SEQUENCE_FILE is not supported. Use query option
====
---- QUERY
# Read back every row written by the single-row insert sections above: rows 0-4 were
# written in BLOCK mode and rows 5-9 in RECORD mode, one per compression codec.
# Row 4 is expected exactly once, so the rejected insert above must not have added
# a second copy.
# NOTE(review): the query has no ORDER BY. Presumably the harness compares the rows
# order-insensitively - confirm before relying on the listed order.
select * from __seq_write;
---- RESULTS
0,'a',1.1
1,'b',2.2
2,'c',3.3
3,'d',4.4
4,'e',5.5
5,'a',1.1
6,'b',2.2
7,'c',3.3
8,'d',4.4
9,'e',5.5
---- TYPES
INT,STRING,DOUBLE
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with NONE+RECORD and then read
# it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_none_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_none_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the NONE+RECORD table: all 37999 rows must be readable.
select count(*) from store_sales_seq_none_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with DEFAULT+RECORD and then
# read it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_def_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_def_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the DEFAULT+RECORD table: all 37999 rows must be readable.
select count(*) from store_sales_seq_def_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY_BLOCKED+RECORD and
# then read it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snapb_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snapb_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the SNAPPY_BLOCKED+RECORD table: all 37999 rows must be readable.
select count(*) from store_sales_seq_snapb_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY+RECORD and then read
# it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snap_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snap_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the SNAPPY+RECORD table: all 37999 rows must be readable.
select count(*) from store_sales_seq_snap_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with GZIP+RECORD and then read
# it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_gzip_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_gzip_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the GZIP+RECORD table: all 37999 rows must be readable.
select count(*) from store_sales_seq_gzip_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with NONE+BLOCK and then read it
# back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_none_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_none_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the NONE+BLOCK table: all 37999 rows must be readable.
select count(*) from store_sales_seq_none_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with DEFAULT+BLOCK and then read
# it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_def_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_def_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the DEFAULT+BLOCK table: all 37999 rows must be readable.
select count(*) from store_sales_seq_def_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY_BLOCKED+BLOCK and
# then read it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snapb_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snapb_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the SNAPPY_BLOCKED+BLOCK table: all 37999 rows must be readable.
select count(*) from store_sales_seq_snapb_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY+BLOCK and then read
# it back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snap_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snap_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the SNAPPY+BLOCK table: all 37999 rows must be readable.
select count(*) from store_sales_seq_snap_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with GZIP+BLOCK and then read it
# back
# The new table copies the schema of tpcds_parquet.store_sales and is filled by a
# dynamic-partition insert of the source rows whose ss_sold_date_sk lies in
# [2450816, 2451200].
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_gzip_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_gzip_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read-back check for the GZIP+BLOCK table: all 37999 rows must be readable.
select count(*) from store_sales_seq_gzip_block;
---- RESULTS
37999
---- TYPES
BIGINT
====