mirror of
https://github.com/apache/impala.git
synced 2026-01-02 12:00:33 -05:00
Split out the encoder/type for parquet reader/writer. I think this puts us
in a better place to support future encodings.
On the tpch lineitem table, the results are:
Before:
BytesWritten: 236.45 MB
Per Column Sizes:
l_comment: 75.71 MB
l_commitdate: 8.64 MB
l_discount: 11.19 MB
l_extendedprice: 33.02 MB
l_linenumber: 4.56 MB
l_linestatus: 869.98 KB
l_orderkey: 8.99 MB
l_partkey: 27.02 MB
l_quantity: 11.58 MB
l_receiptdate: 8.65 MB
l_returnflag: 1.40 MB
l_shipdate: 8.65 MB
l_shipinstruct: 1.45 MB
l_shipmode: 2.17 MB
l_suppkey: 21.91 MB
l_tax: 10.68 MB
After:
BytesWritten: 198.63 MB (84%)
Per Column Sizes:
l_comment: 75.71 MB (100%)
l_commitdate: 8.64 MB (100%)
l_discount: 2.89 MB (25.8%)
l_extendedprice: 33.13 MB (100.33%)
l_linenumber: 1.50 MB (32.89%)
l_linestatus: 870.26 KB (100.032%)
l_orderkey: 9.18 MB (102.11%)
l_partkey: 27.10 MB (100.29%)
l_quantity: 4.32 MB (37.31%)
l_receiptdate: 8.65 MB (100%)
l_returnflag: 1.40 MB (100%)
l_shipdate: 8.65 MB (100%)
l_shipinstruct: 1.45 MB (100%)
l_shipmode: 2.17 MB (100%)
l_suppkey: 10.11 MB (46.14%)
l_tax: 2.89 MB (27.06%)
The table is overall 84% as big (i.e. 16% smaller). A few columns got marginally
bigger. If the file filled the 1 GB, I'd expect the overhead to decrease even
more.
The restructuring to use a virtual call doesn't seem to change performance much, and
the virtual call will go away when we codegen the scanner.
Here are the query times before and after this patch (note these were measured on the
"before" data files, so only string cols are dictionary encoded).
Before query times:
Insert Time: 8.5 sec
select *: 2.3 sec
select avg(l_orderkey): .33 sec
After query times:
Insert Time: 9.5 sec <-- Longer due to doing dictionary encoding
select *: 2.4 sec <-- kind of noisy, possibly a slight slow down
select avg(l_orderkey): .33 sec
Change-Id: I213fdca1bb972cc200dc0cd9fb14b77a8d36d9e6
Reviewed-on: http://gerrit.ent.cloudera.com:8080/238
Tested-by: jenkins <kitchen-build@cloudera.com>
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
173 lines
4.6 KiB
Plaintext
173 lines
4.6 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Test that we properly write null values to text tables.
# The single inserted row mixes real NULLs (first and last columns) with
# look-alike strings: the empty string, the literal text "NULL", and "\\N".
# NOTE(review): the expected ': 1' presumably means one row written under an
# empty partition path (the table is addressed without a partition clause) -
# confirm against the test harness.
|
|
insert overwrite table nullinsert
|
|
select NULL, "", "NULL", "\\N", NULL from alltypes limit 1
|
|
---- SETUP
|
|
RESET nullinsert
|
|
---- RESULTS
|
|
: 1
|
|
====
|
|
---- QUERY
|
|
# Read back every column of nullinsert and compare against one expected row
# of four strings and an int.
# In the expected row the first column appears as 'NULL' (quoted) while the
# last, int-typed column appears as an unquoted NULL.
# NOTE(review): presumably the harness renders a NULL string as 'NULL', which
# makes it look identical to the literal text "NULL" in the third column -
# confirm.
select * from nullinsert
|
|
---- SETUP
|
|
RELOAD nullinsert
|
|
---- TYPES
|
|
string, string, string, string, int
|
|
---- RESULTS
|
|
'NULL','','NULL','\N',NULL
|
|
====
|
|
---- QUERY
|
|
# Read nullinsert_alt, which is expected to return a single string column
# holding one comma-separated line.
# NOTE(review): presumably nullinsert_alt exposes the raw text written for
# nullinsert, so that the on-disk NULL encoding (\N) can be checked directly -
# confirm against the table definition.
select * from nullinsert_alt
|
|
---- SETUP
|
|
RELOAD nullinsert_alt
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'\N,,NULL,\\N,\N'
|
|
====
|
|
---- QUERY
|
|
# Test NULL partition keys using static partition insert. Both partition keys are NULL.
# The expected result reports 25 rows under a partition path in which each
# NULL key is spelled __HIVE_DEFAULT_PARTITION__.
|
|
insert overwrite table alltypesinsert
|
|
partition(year=NULL, month=NULL)
|
|
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
|
|
float_col, double_col, date_string_col, string_col, timestamp_col
|
|
from alltypessmall
|
|
where year=2009 and month=4
|
|
---- SETUP
|
|
DROP PARTITIONS alltypesinsert
|
|
---- RESULTS
|
|
year=__HIVE_DEFAULT_PARTITION__/month=__HIVE_DEFAULT_PARTITION__/: 25
|
|
====
|
|
---- QUERY
|
|
# Verify contents of alltypesinsert: 25 rows are expected where both
# partition keys read back as NULL (matched with IS NULL predicates).
|
|
select count(*) from alltypesinsert where year is null and month is null
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
25
|
|
====
|
|
---- QUERY
|
|
# Verify that dropping NULL partitions works in the SETUP section.
# The RESULTS section is intentionally empty: no rows are expected from the
# table once its partitions have been dropped.
|
|
select * from alltypesinsert
|
|
---- SETUP
|
|
DROP PARTITIONS alltypesinsert
|
|
---- TYPES
|
|
int, int, int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp
|
|
---- RESULTS
|
|
====
|
|
---- QUERY
|
|
# Test NULL partition keys using static partition insert. Year partition key is NULL.
# Only year is NULL here; month is the constant 10, so the expected partition
# path mixes __HIVE_DEFAULT_PARTITION__ with a regular value.
|
|
insert overwrite table alltypesinsert
|
|
partition(year=NULL, month=10)
|
|
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
|
|
float_col, double_col, date_string_col, string_col, timestamp_col
|
|
from alltypessmall
|
|
where year=2009 and month=4
|
|
---- SETUP
|
|
DROP PARTITIONS alltypesinsert
|
|
---- RESULTS
|
|
year=__HIVE_DEFAULT_PARTITION__/month=10/: 25
|
|
====
|
|
---- QUERY
|
|
# Verify contents of alltypesinsert: 25 rows are expected in the partition
# with a NULL year and month=10.
|
|
select count(*) from alltypesinsert where year is null and month=10
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
25
|
|
====
|
|
---- QUERY
|
|
# Test NULL partition keys using static partition insert. Month partition key is NULL.
# (Both keys are given as constants in the partition clause, so this is a
# static insert, not a dynamic one.)
|
|
insert overwrite table alltypesinsert
|
|
partition(year=2008, month=NULL)
|
|
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
|
|
float_col, double_col, date_string_col, string_col, timestamp_col
|
|
from alltypessmall
|
|
where year=2009 and month=4
|
|
---- SETUP
|
|
DROP PARTITIONS alltypesinsert
|
|
---- RESULTS
|
|
year=2008/month=__HIVE_DEFAULT_PARTITION__/: 25
|
|
====
|
|
---- QUERY
|
|
# Verify contents of alltypesinsert: 25 rows are expected in the partition
# with year=2008 and a NULL month.
|
|
select count(*) from alltypesinsert where year=2008 and month is null
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
25
|
|
====
|
|
---- QUERY
|
|
# Test NULL partition keys using dynamic partition insert.
# Both partition keys come from the select list: year is NULL when bool_col
# is true and 2007 otherwise; month is NULL when tinyint_col % 3 = 0 and 6
# otherwise. That yields the four partitions listed in RESULTS (8+5+7+5 = 25).
# Unlike the static-insert cases, this case has no SETUP section.
# NOTE(review): VERIFY_IS_EQUAL_SORTED presumably compares the result lines
# after sorting, so partition order does not matter - confirm in the harness.
|
|
insert overwrite table alltypesinsert
|
|
partition(year, month)
|
|
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
|
|
float_col, double_col, date_string_col, string_col, timestamp_col,
|
|
cast(if(bool_col, NULL, 2007) as int) as year, cast(if(tinyint_col % 3 = 0, NULL, 6) as int) as month
|
|
from alltypessmall
|
|
where year=2009 and month=4
|
|
---- RESULTS: VERIFY_IS_EQUAL_SORTED
|
|
year=2007/month=6/: 8
|
|
year=2007/month=__HIVE_DEFAULT_PARTITION__/: 5
|
|
year=__HIVE_DEFAULT_PARTITION__/month=6/: 7
|
|
year=__HIVE_DEFAULT_PARTITION__/month=__HIVE_DEFAULT_PARTITION__/: 5
|
|
====
|
|
---- QUERY
|
|
# Verify contents of the year=2007/month=6 partition in alltypesinsert
# (neither key NULL): 8 rows expected.
|
|
select count(*) from alltypesinsert where year=2007 and month=6
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
8
|
|
====
|
|
---- QUERY
|
|
# Verify contents of the year=2007 / NULL-month partition in alltypesinsert:
# 5 rows expected.
|
|
select count(*) from alltypesinsert where year=2007 and month is null
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
5
|
|
====
|
|
---- QUERY
|
|
# Verify contents of the NULL-year / month=6 partition in alltypesinsert:
# 7 rows expected.
|
|
select count(*) from alltypesinsert where year is null and month=6
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
7
|
|
====
|
|
---- QUERY
|
|
# Verify contents of the partition in alltypesinsert where both keys are
# NULL: 5 rows expected.
|
|
select count(*) from alltypesinsert where year is null and month is null
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
5
|
|
====
|
|
---- QUERY
|
|
# Insert nulls and non-null values into table with
|
|
# custom table property serialization.null.format='xyz'
# Five rows are written (ids 1-5). Id 1 carries real NULLs; the string column
# of the other rows probes the empty string, the text "NULL", the text "xyz"
# (identical to the custom null format) and "xyzbar" (merely starts with it).
|
|
insert overwrite nullformat_custom
|
|
select 1, NULL, NULL, NULL, NULL union all
|
|
select 2, true, "", 1, 1 union all
|
|
select 3, false, "NULL", 2, 2 union all
|
|
select 4, false, "xyz", 3, 3 union all
|
|
select 5, false, "xyzbar", 4, 4
|
|
---- RESULTS
|
|
: 5
|
|
====
|
|
---- QUERY
|
|
# Test correct interpretation of NULLs with custom
|
|
# table property serialization.null.format='xyz'
# The extra "b is null" column separates a real NULL from the text "NULL":
# ids 1 and 4 are expected to have b NULL (id 4 was inserted as "xyz", the
# null format itself), while id 3 (text "NULL"), id 2 (empty string) and
# id 5 ("xyzbar") are expected to stay non-NULL strings.
# Ordering by id keeps the expected row order deterministic.
|
|
select id, a, b, b is null, c, d from nullformat_custom order by id limit 10
|
|
---- TYPES
|
|
int, boolean, string, boolean, int, double
|
|
---- RESULTS
|
|
1,NULL,'NULL',true,NULL,NULL
|
|
2,true,'',false,1,1
|
|
3,false,'NULL',false,2,2
|
|
4,false,'NULL',true,3,3
|
|
5,false,'xyzbar',false,4,4
|
|
====
|