Files
impala/testdata/workloads/functional-query/queries/QueryTest/insert_null.test
Nong Li a3bc1ce133 Some parquet encoder/decoder refactoring. Added dictionary to other types.
Split out the encoder/type for parquet reader/writer. I think this puts us
in a better place to support future encodings.

On the tpch lineitem table, the results are:
Before:
  BytesWritten: 236.45 MB
  Per Column Sizes:
    l_comment: 75.71 MB
    l_commitdate: 8.64 MB
    l_discount: 11.19 MB
    l_extendedprice: 33.02 MB
    l_linenumber: 4.56 MB
    l_linestatus: 869.98 KB
    l_orderkey: 8.99 MB
    l_partkey: 27.02 MB
    l_quantity: 11.58 MB
    l_receiptdate: 8.65 MB
    l_returnflag: 1.40 MB
    l_shipdate: 8.65 MB
    l_shipinstruct: 1.45 MB
    l_shipmode: 2.17 MB
    l_suppkey: 21.91 MB
    l_tax: 10.68 MB
After:
 BytesWritten: 198.63 MB            (84%)
  Per Column Sizes:
    l_comment: 75.71 MB             (100%)
    l_commitdate: 8.64 MB           (100%)
    l_discount: 2.89 MB             (25.8%)
    l_extendedprice: 33.13 MB       (100.33%)
    l_linenumber: 1.50 MB           (32.89%)
    l_linestatus: 870.26 KB         (100.032%)
    l_orderkey: 9.18 MB             (102.11%)
    l_partkey: 27.10 MB             (100.29%)
    l_quantity: 4.32 MB             (37.31%)
    l_receiptdate: 8.65 MB          (100%)
    l_returnflag: 1.40 MB           (100%)
    l_shipdate: 8.65 MB             (100%)
    l_shipinstruct: 1.45 MB         (100%)
    l_shipmode: 2.17 MB             (100%)
    l_suppkey: 10.11 MB             (46.14%)
    l_tax: 2.89 MB                  (27.06%)

The table is overall 84% as big (i.e. 16% smaller). A few columns got marginally
bigger. If the file filled the 1 GB, I'd expect the overhead to decrease even
more.

The restructuring to use a virtual call doesn't seem to change things much and
will go away when we codegen the scanner.

Here's what the query times look like with this patch (note: this was measured on
the "before" data files, so only string cols are dictionary encoded).

Before query times:
  Insert Time: 8.5 sec
  select *: 2.3 sec
  select avg(l_orderkey): .33 sec

After query times:
  Insert Time: 9.5 sec                  <-- Longer due to doing dictionary encoding
  select *: 2.4 sec                     <-- kind of noisy, possibly a slight slow down
  select avg(l_orderkey): .33 sec

Change-Id: I213fdca1bb972cc200dc0cd9fb14b77a8d36d9e6
Reviewed-on: http://gerrit.ent.cloudera.com:8080/238
Tested-by: jenkins <kitchen-build@cloudera.com>
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
2014-01-08 10:52:16 -08:00

173 lines
4.6 KiB
Plaintext

====
---- QUERY
# Test that we properly write null values to text tables.
# Writes a single row that mixes real NULLs (first and last columns) with strings
# that only look like NULL markers: the empty string, "NULL" and a literal \N.
# The next query reads this row back from nullinsert.
insert overwrite table nullinsert
select NULL, "", "NULL", "\\N", NULL from alltypes limit 1
---- SETUP
RESET nullinsert
---- RESULTS
: 1
====
---- QUERY
# Read back the row written to nullinsert by the preceding insert.
# NOTE(review): the first column was inserted as NULL but is expected as 'NULL',
# the same text as the third column's "NULL" string -- presumably the test
# framework renders NULL strings as 'NULL'; confirm. The int column stays NULL.
select * from nullinsert
---- SETUP
RELOAD nullinsert
---- TYPES
string, string, string, string, int
---- RESULTS
'NULL','','NULL','\N',NULL
====
---- QUERY
# Read the row through nullinsert_alt, which returns it as a single string column.
# NOTE(review): presumably nullinsert_alt is defined over the same data as
# nullinsert so that this exposes the raw text written by the insert -- confirm
# against the table definitions. In the expected line, the fields inserted as NULL
# (first and last) appear as \N, while the inserted "\\N" string appears as \\N.
select * from nullinsert_alt
---- SETUP
RELOAD nullinsert_alt
---- TYPES
string
---- RESULTS
'\N,,NULL,\\N,\N'
====
---- QUERY
# Test NULL partition keys using static partition insert. Both partition keys are NULL.
# All 25 selected rows are expected in the partition reported with
# __HIVE_DEFAULT_PARTITION__ for both year and month.
insert overwrite table alltypesinsert
partition(year=NULL, month=NULL)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from alltypessmall
where year=2009 and month=4
---- SETUP
DROP PARTITIONS alltypesinsert
---- RESULTS
year=__HIVE_DEFAULT_PARTITION__/month=__HIVE_DEFAULT_PARTITION__/: 25
====
---- QUERY
# Verify contents of alltypesinsert: the 25 rows from the preceding insert must be
# found when both partition keys are filtered with IS NULL.
select count(*) from alltypesinsert where year is null and month is null
---- TYPES
bigint
---- RESULTS
25
====
---- QUERY
# Verify that dropping NULL partitions works in the SETUP section.
# The RESULTS section is empty: no rows may remain after DROP PARTITIONS.
select * from alltypesinsert
---- SETUP
DROP PARTITIONS alltypesinsert
---- TYPES
int, int, int, boolean, tinyint, smallint, int, bigint, float, double, string, string, timestamp
---- RESULTS
====
---- QUERY
# Test NULL partition keys using static partition insert. Year partition key is NULL.
# Only the year key is reported as __HIVE_DEFAULT_PARTITION__; month keeps its value 10.
insert overwrite table alltypesinsert
partition(year=NULL, month=10)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from alltypessmall
where year=2009 and month=4
---- SETUP
DROP PARTITIONS alltypesinsert
---- RESULTS
year=__HIVE_DEFAULT_PARTITION__/month=10/: 25
====
---- QUERY
# Verify contents of alltypesinsert: the 25 inserted rows must be found with
# year IS NULL combined with an equality predicate on month.
select count(*) from alltypesinsert where year is null and month=10
---- TYPES
bigint
---- RESULTS
25
====
---- QUERY
# Test NULL partition keys using static partition insert. Month partition key is NULL.
# (Both keys are given constant values in the PARTITION clause, so this is a static
# insert like the two cases above.)
insert overwrite table alltypesinsert
partition(year=2008, month=NULL)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from alltypessmall
where year=2009 and month=4
---- SETUP
DROP PARTITIONS alltypesinsert
---- RESULTS
year=2008/month=__HIVE_DEFAULT_PARTITION__/: 25
====
---- QUERY
# Verify contents of alltypesinsert: the 25 inserted rows must be found with
# an equality predicate on year combined with month IS NULL.
select count(*) from alltypesinsert where year=2008 and month is null
---- TYPES
bigint
---- RESULTS
25
====
---- QUERY
# Test NULL partition keys using dynamic partition insert.
# The partition keys come from the last two select expressions:
#   year  is NULL when bool_col is true, otherwise 2007
#   month is NULL when tinyint_col % 3 = 0, otherwise 6
# The 25 source rows are expected to split 8/5/7/5 across the four resulting
# partitions. The partition lines are checked with VERIFY_IS_EQUAL_SORTED
# (presumably so that the order they are reported in does not matter -- confirm).
insert overwrite table alltypesinsert
partition(year, month)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col,
cast(if(bool_col, NULL, 2007) as int) as year, cast(if(tinyint_col % 3 = 0, NULL, 6) as int) as month
from alltypessmall
where year=2009 and month=4
---- RESULTS: VERIFY_IS_EQUAL_SORTED
year=2007/month=6/: 8
year=2007/month=__HIVE_DEFAULT_PARTITION__/: 5
year=__HIVE_DEFAULT_PARTITION__/month=6/: 7
year=__HIVE_DEFAULT_PARTITION__/month=__HIVE_DEFAULT_PARTITION__/: 5
====
---- QUERY
# Verify contents of each new partition in alltypesinsert.
# Partition with both keys non-NULL (year=2007, month=6): 8 rows.
select count(*) from alltypesinsert where year=2007 and month=6
---- TYPES
bigint
---- RESULTS
8
====
---- QUERY
# Verify contents of each new partition in alltypesinsert.
# Partition with year=2007 and a NULL month: 5 rows.
select count(*) from alltypesinsert where year=2007 and month is null
---- TYPES
bigint
---- RESULTS
5
====
---- QUERY
# Verify contents of each new partition in alltypesinsert.
# Partition with a NULL year and month=6: 7 rows.
select count(*) from alltypesinsert where year is null and month=6
---- TYPES
bigint
---- RESULTS
7
====
---- QUERY
# Verify contents of each new partition in alltypesinsert.
# Partition with both keys NULL: 5 rows.
select count(*) from alltypesinsert where year is null and month is null
---- TYPES
bigint
---- RESULTS
5
====
---- QUERY
# Insert nulls and non-null values into table with
# custom table property serialization.null.format='xyz'
# Row 1 holds real NULLs; rows 2-5 hold strings chosen to probe the null format:
# the empty string, "NULL", "xyz" (identical to the null format) and "xyzbar"
# (which merely starts with it).
insert overwrite nullformat_custom
select 1, NULL, NULL, NULL, NULL union all
select 2, true, "", 1, 1 union all
select 3, false, "NULL", 2, 2 union all
select 4, false, "xyz", 3, 3 union all
select 5, false, "xyzbar", 4, 4
---- RESULTS
: 5
====
---- QUERY
# Test correct interpretation of NULLs with custom
# table property serialization.null.format='xyz'
# The "b is null" column makes the expectation explicit per row:
#   id 1: inserted NULL           -> read back as NULL (true)
#   id 2: empty string            -> stays a string (false)
#   id 3: the string "NULL"       -> stays a string (false)
#   id 4: "xyz" = the null format -> read back as NULL (true)
#   id 5: "xyzbar"                -> stays a string (false)
# ORDER BY id keeps the row order deterministic.
select id, a, b, b is null, c, d from nullformat_custom order by id limit 10
---- TYPES
int, boolean, string, boolean, int, double
---- RESULTS
1,NULL,'NULL',true,NULL,NULL
2,true,'',false,1,1
3,false,'NULL',false,2,2
4,false,'NULL',true,3,3
5,false,'xyzbar',false,4,4
====