mirror of
https://github.com/apache/impala.git
synced 2026-01-07 00:02:28 -05:00
This change fixes the following issues in the Sequence File Writer:
1. ReadWriteUtil::VLongRequiredBytes() and ReadWriteUtil::PutVLong() were broken. As a result, Impala created corrupt uncompressed sequence files.
2. KEY_CLASS_NAME was missing from the sequence file header. As a result, Hive could not read back uncompressed sequence files created by Impala.
3. Impala created record-compressed sequence files with empty keys block. As a result, Hive could not read back record-compressed sequence files created by Impala.
4. Impala created block-compressed files with:
   - empty key-lengths block
   - empty keys block
   - empty value-lengths block
   This resulted in invalid block-compressed sequence files that Hive could not read back.
5. In some cases the wrong Record-compression flag was written to the sequence file header. As a result, Hive could not read back record-compressed sequence files created by Impala.
6. Impala added 'sync_marker' instead of 'neg1_sync_marker' to the beginning of blocks in block-compressed sequence files. Hive could not read these files back.
7. The calculation of block sizes in SnappyBlockCompressor class was incorrect for odd-length buffers.
Change-Id: I0db642ad35132a9a5a6611810a6cafbbe26e7487
Reviewed-on: http://gerrit.cloudera.org:8080/6107
Reviewed-by: Michael Ho <kwho@cloudera.com>
Reviewed-by: Attila Jeges <attilaj@cloudera.com>
Reviewed-by: Dan Hecht <dhecht@cloudera.com>
Tested-by: Impala Public Jenkins
281 lines
7.6 KiB
Plaintext
281 lines
7.6 KiB
Plaintext
====
---- QUERY
# Create the table that the single-row sequence file writes below insert into.
SET COMPRESSION_CODEC=NONE;
SET ALLOW_UNSUPPORTED_FORMATS=1;
SET SEQ_COMPRESSION_MODE=BLOCK;
create table __seq_write (i int, s string, d double)
stored as SEQUENCEFILE;
====
---- QUERY
# BLOCK mode, codec NONE: write row 0.
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write select 0, "a", 1.1;
====
---- QUERY
# BLOCK mode, codec DEFAULT: write row 1.
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (1, "b", 2.2);
====
---- QUERY
# BLOCK mode, codec SNAPPY: write row 2.
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (2, "c", 3.3);
====
---- QUERY
# BLOCK mode, codec SNAPPY_BLOCKED: write row 3.
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (3, "d", 4.4);
====
---- QUERY
# BLOCK mode, codec GZIP: write row 4.
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (4, "e", 5.5);
====
---- QUERY
# RECORD mode, codec NONE: write row 5.
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write select 5, "a", 1.1;
====
---- QUERY
# RECORD mode, codec DEFAULT: write row 6.
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (6, "b", 2.2);
====
---- QUERY
# RECORD mode, codec SNAPPY: write row 7.
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (7, "c", 3.3);
====
---- QUERY
# RECORD mode, codec SNAPPY_BLOCKED: write row 8.
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (8, "d", 4.4);
====
---- QUERY
# RECORD mode, codec GZIP: write row 9.
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
insert into __seq_write values (9, "e", 5.5);
====
---- QUERY
# Negative test: with ALLOW_UNSUPPORTED_FORMATS=0 the insert is expected to fail with
# the error listed in the CATCH section, so no second row with i=4 is written.
SET ALLOW_UNSUPPORTED_FORMATS=0;
insert into __seq_write values (4, "e", 5.5);
---- CATCH
Writing to table format SEQUENCE_FILE is not supported. Use query option
====
---- QUERY
# Read back every row written above: exactly one row per successful insert (i = 0..9).
select * from __seq_write;
---- RESULTS
0,'a',1.1
1,'b',2.2
2,'c',3.3
3,'d',4.4
4,'e',5.5
5,'a',1.1
6,'b',2.2
7,'c',3.3
8,'d',4.4
9,'e',5.5
---- TYPES
INT,STRING,DOUBLE
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with NONE+RECORD and then read
# it back
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_none_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_none_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the NONE+RECORD table written above and verify its row count.
select count(*) from store_sales_seq_none_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with DEFAULT+RECORD and then
# read it back
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_def_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_def_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the DEFAULT+RECORD table written above and verify its row count.
select count(*) from store_sales_seq_def_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY_BLOCKED+RECORD and
# then read it back
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snapb_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snapb_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the SNAPPY_BLOCKED+RECORD table written above and verify its row count.
select count(*) from store_sales_seq_snapb_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY+RECORD and then read
# it back
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snap_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snap_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the SNAPPY+RECORD table written above and verify its row count.
select count(*) from store_sales_seq_snap_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with GZIP+RECORD and then read
# it back
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=RECORD;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_gzip_rec like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_gzip_rec partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the GZIP+RECORD table written above and verify its row count.
select count(*) from store_sales_seq_gzip_rec;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with NONE+BLOCK and then read it
# back
SET COMPRESSION_CODEC=NONE;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_none_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_none_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the NONE+BLOCK table written above and verify its row count.
select count(*) from store_sales_seq_none_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with DEFAULT+BLOCK and then read
# it back
SET COMPRESSION_CODEC=DEFAULT;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_def_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_def_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the DEFAULT+BLOCK table written above and verify its row count.
select count(*) from store_sales_seq_def_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY_BLOCKED+BLOCK and
# then read it back
SET COMPRESSION_CODEC=SNAPPY_BLOCKED;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snapb_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snapb_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the SNAPPY_BLOCKED+BLOCK table written above and verify its row count.
select count(*) from store_sales_seq_snapb_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with SNAPPY+BLOCK and then read
# it back
SET COMPRESSION_CODEC=SNAPPY;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_snap_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_snap_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the SNAPPY+BLOCK table written above and verify its row count.
select count(*) from store_sales_seq_snap_block;
---- RESULTS
37999
---- TYPES
BIGINT
====
---- QUERY
# IMPALA-3079: Create a table containing larger seq files with GZIP+BLOCK and then read it
# back
SET COMPRESSION_CODEC=GZIP;
SET SEQ_COMPRESSION_MODE=BLOCK;
SET ALLOW_UNSUPPORTED_FORMATS=1;
create table store_sales_seq_gzip_block like tpcds_parquet.store_sales
stored as SEQUENCEFILE;
insert into store_sales_seq_gzip_block partition(ss_sold_date_sk)
select * from tpcds_parquet.store_sales
where ss_sold_date_sk between 2450816 and 2451200;
====
---- QUERY
# Read back the GZIP+BLOCK table written above and verify its row count.
select count(*) from store_sales_seq_gzip_block;
---- RESULTS
37999
---- TYPES
BIGINT
====