Files
impala/testdata/workloads/functional-query/queries/QueryTest/kudu_insert.test
Thomas Tauber-Marshall ad0c6e7499 IMPALA-5498: Support for partial sorts in Kudu INSERTs
Impala currently supports total sorts (the entire set of data
is sorted) and top-n sorts (only the highest/lowest n elements
are sorted). This patch adds the ability to do partial sorts,
where the data is divided up into some number of subsets, each
of which is sorted individually.

It accomplishes this by adding a new exec node, PartialSortNode.
When PartialSortNode::GetNext() is called, it retrieves input
up to the query memory limit, uses the existing Sorter class to sort
it, and outputs it. This is faster than a total sort with SortNode
as it avoids the need to spill if the input is larger than the
memory limit.

Future work will look into setting a more restrictive memory limit
on the PartialSortNode. (IMPALA-5669)

In the planner, the SortNode plan node is used, with an enum value
indicating if it is a total or partial sort.

This also adds a new counter 'RunSize' to the runtime profile which
tracks the min, max, and avg size of the generated runs, in tuples.

As a first use case, partial sort is used where a total sort was
used previously for inserts/upserts into Kudu tables only. Future
work can extend this to other table sinks. (IMPALA-5649)

Testing:
- E2E test with a large INSERT into a Kudu table with a mem limit.
  Checks that no spills occurred.
- Updated planner tests.
- Existing E2E tests and stress test verify correctness of INSERT.
- Perf tests on the 10 node cluster: inserting tpch_100.lineitem
  into a Kudu table with mem_limit=3gb:
  Previously: 5 runs were spilled, sort took 7m33s
  Now: no spills, sort took 6m19s, for ~18% speedup

Change-Id: Ieec2a15a0cc5240b1c13682067ab64670d1e0a38
Reviewed-on: http://gerrit.cloudera.org:8080/7267
Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com>
Tested-by: Impala Public Jenkins
2017-07-22 00:28:36 +00:00

441 lines
14 KiB
Plaintext

====
---- QUERY
# Create the range-partitioned Kudu table that the INSERT test cases on 'tdata' write
# to. 'id' is the only primary key column; the value columns cover float, bigint,
# string, boolean, tinyint, smallint, double and timestamp. The three range partitions
# are id < 10, 10 <= id < 30 and id >= 30.
create table tdata
(id int primary key, valf float null, vali bigint null, valv string null,
valb boolean null, valt tinyint null, vals smallint null, vald double null,
ts timestamp)
PARTITION BY RANGE (PARTITION VALUES < 10, PARTITION 10 <= VALUES < 30,
PARTITION 30 <= VALUES) STORED AS KUDU
---- RESULTS
====
---- QUERY
# VALUES, single row, all target cols, no errors
# Expects exactly one modified row, no row errors, and the table to contain just that
# row afterwards.
insert into tdata values (1, 1, 1, 'one', true, 1, 1, 1,
cast('1987-05-19 00:00:00' as timestamp))
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# VALUES, single row, all target cols, NULL
# Every non-key column is NULL. In the expected rows a NULL string column is written
# as 'NULL'. DML_RESULTS lists all rows expected in tdata so far, including the row
# from the previous test case.
insert into tdata values (2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL)
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# VALUES, single row, all target cols, boundary values. The timestamp value is the max
# possible value that Impala can represent; it gets truncated rather than rounded up to
# the nearest microsecond. If it were rounded up, it wouldn't be readable by Impala.
# Also covers NaN, -inf, the empty string and the min/max integer helper functions.
# The expected timestamp ends in .999999000, i.e. the last three nanosecond digits of
# the inserted .999999999 are dropped.
insert into tdata values
(3, cast('nan' as float), max_bigint(), '', true, min_tinyint(), max_smallint(),
cast('-inf' as double),
nanoseconds_add(cast('9999-12-31 23:59:59' as timestamp), 999999999))
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# VALUES, single row, subset of target cols
# The column list is in a different order than the table definition and puts the
# primary key last; columns left out of the list are expected to be NULL.
insert into tdata (valb, vald, id) values (true, 0, 4)
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
4,NULL,NULL,'NULL',true,NULL,NULL,0,NULL
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# VALUES, multiple rows, all target cols
# Mixes NULLs, negative values, an empty string and timestamps in one multi-row VALUES
# clause; all three rows are expected to be written.
insert into tdata values
(5, 5.0, 5, 'five', false, NULL, NULL, NULL, NULL),
(6, 16, 60, '', true, 0, -1, -6, cast('2010-12-31 23:59:59' as timestamp)),
(7, NULL, 10, NULL, false, max_tinyint(), -7, 2, cast('1400-01-01 00:00:00' as timestamp))
---- RUNTIME_PROFILE
NumModifiedRows: 3
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
4,NULL,NULL,'NULL',true,NULL,NULL,0,NULL
5,5.0,5,'five',false,NULL,NULL,NULL,NULL
6,16,60,'',true,0,-1,-6,2010-12-31 23:59:59
7,NULL,10,'NULL',false,127,-7,2,1400-01-01 00:00:00
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# VALUES, multiple rows, subset of cols
# The second row passes NULL for the string column; columns missing from the column
# list are expected to be NULL in both rows.
insert into tdata (valv, valf, vali, id) values
('eight', 88, 888, 8),
(NULL, -9, -99, 9)
---- RUNTIME_PROFILE
NumModifiedRows: 2
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
4,NULL,NULL,'NULL',true,NULL,NULL,0,NULL
5,5.0,5,'five',false,NULL,NULL,NULL,NULL
6,16,60,'',true,0,-1,-6,2010-12-31 23:59:59
7,NULL,10,'NULL',false,127,-7,2,1400-01-01 00:00:00
8,88,888,'eight',NULL,NULL,NULL,NULL,NULL
9,-9,-99,'NULL',NULL,NULL,NULL,NULL,NULL
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# SELECT, single row, all target cols
# The source row is the one with id = 10 in functional.alltypes; its expected values
# are the last row of DML_RESULTS.
insert into tdata
select id, float_col, bigint_col, string_col, bool_col, tinyint_col, smallint_col,
double_col, timestamp_col
from functional.alltypes where id = 10
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
4,NULL,NULL,'NULL',true,NULL,NULL,0,NULL
5,5.0,5,'five',false,NULL,NULL,NULL,NULL
6,16,60,'',true,0,-1,-6,2010-12-31 23:59:59
7,NULL,10,'NULL',false,127,-7,2,1400-01-01 00:00:00
8,88,888,'eight',NULL,NULL,NULL,NULL,NULL
9,-9,-99,'NULL',NULL,NULL,NULL,NULL,NULL
10,0,0,'0',true,0,0,0,2009-01-02 00:10:00.450000000
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# SELECT, single row, subset of cols
# Only id, vald, valb, vali and ts are populated from the source row with id = 11; the
# remaining columns are expected to be NULL.
insert into tdata (id, vald, valb, vali, ts)
select id, double_col, bool_col, bigint_col, timestamp_col
from functional.alltypes where id = 11
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
1,1,1,'one',true,1,1,1,1987-05-19 00:00:00
2,NULL,NULL,'NULL',NULL,NULL,NULL,NULL,NULL
3,NaN,9223372036854775807,'',true,-128,32767,-Infinity,9999-12-31 23:59:59.999999000
4,NULL,NULL,'NULL',true,NULL,NULL,0,NULL
5,5.0,5,'five',false,NULL,NULL,NULL,NULL
6,16,60,'',true,0,-1,-6,2010-12-31 23:59:59
7,NULL,10,'NULL',false,127,-7,2,1400-01-01 00:00:00
8,88,888,'eight',NULL,NULL,NULL,NULL,NULL
9,-9,-99,'NULL',NULL,NULL,NULL,NULL,NULL
10,0,0,'0',true,0,0,0,2009-01-02 00:10:00.450000000
11,NULL,10,'NULL',false,NULL,NULL,10.1,2009-01-02 00:11:00.450000000
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# Intentionally unfiltered delete: empties tdata so that the following SELECT-based
# test cases start from an empty table. The empty DML_RESULTS section asserts that no
# rows remain.
delete tdata
---- DML_RESULTS: tdata
====
---- QUERY
# SELECT, multiple rows, all target cols
# Runs against the table emptied by the preceding delete, so the two source rows with
# id < 2 are the only rows expected afterwards.
insert into tdata
select id, float_col, bigint_col, string_col, bool_col, tinyint_col, smallint_col,
double_col, timestamp_col
from functional.alltypes where id < 2
---- RUNTIME_PROFILE
NumModifiedRows: 2
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
0,0,0,'0',true,0,0,0,2009-01-01 00:00:00
1,1.100000023841858,10,'1',false,1,1,10.1,2009-01-01 00:01:00
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# SELECT, multiple rows, subset of cols
# Inserts the source rows with id 3, 4 and 5. The column list is not in table order
# and the primary key is not the first listed column; unlisted columns are expected to
# be NULL.
insert into tdata (vals, id, valt, vald, ts)
select smallint_col, id, tinyint_col, double_col, timestamp_col
from functional.alltypes where id > 2 and id < 6
---- RUNTIME_PROFILE
NumModifiedRows: 3
NumRowErrors: 0
---- LABELS
ID, VALF, VALI, VALV, VALB, VALT, VALS, VALD, TS
---- DML_RESULTS: tdata
0,0,0,'0',true,0,0,0,2009-01-01 00:00:00
1,1.100000023841858,10,'1',false,1,1,10.1,2009-01-01 00:01:00
3,NULL,NULL,'NULL',NULL,3,3,30.3,2009-01-01 00:03:00.300000000
4,NULL,NULL,'NULL',NULL,4,4,40.4,2009-01-01 00:04:00.600000000
5,NULL,NULL,'NULL',NULL,5,5,50.5,2009-01-01 00:05:00.100000000
---- TYPES
INT,FLOAT,BIGINT,STRING,BOOLEAN,TINYINT,SMALLINT,DOUBLE,TIMESTAMP
====
---- QUERY
# Make sure we can insert empty strings into string columns and that we can scan them
# back.
# Only the modified-row count is checked here; the stored values are read back by the
# select on id = 320 in the next test case.
insert into tdata values (320, 2.0, 932, cast('' as string), false, 0, 0, 0, NULL)
---- RESULTS
: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
====
---- QUERY
# Read back the row with id = 320 and check that the string column comes back as an
# empty string ('') rather than NULL.
select id, valv, valb from tdata where id = 320;
---- RESULTS
320,'',false
---- TYPES
INT,STRING,BOOLEAN
====
---- QUERY
# Insert a new row with primary key 666, using explicit casts for the float and string
# values. The following test case re-inserts the same key to provoke a duplicate-key
# row error.
insert into tdata values
(666, cast(1.2 as float), 43, cast('z' as string), true, 0, 0, 0, NULL)
---- RESULTS
: 1
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
====
---- QUERY
# insert row with primary key that already exists
# The duplicate is expected to be counted as a row error rather than failing the
# statement: 0 rows modified, 1 row error. The string value is cast to VARCHAR(20)
# here instead of STRING.
insert into tdata values
(666, cast(1.2 as float), 43, cast('z' as VARCHAR(20)), true, 0, 0, 0, NULL)
---- RESULTS
: 0
---- RUNTIME_PROFILE
NumModifiedRows: 0
NumRowErrors: 1
====
---- QUERY
# CREATE TABLE AS SELECT into a Kudu table with two range partitions on id
# (id < 100 and 100 <= id <= 10000), populated with the rows of
# functional_kudu.alltypes that have id < 100. 100 inserted rows are expected.
create table kudu_test_tbl primary key(id)
partition by range(id) (partition values < 100, partition 100 <= values <= 10000)
stored as kudu as
select * from functional_kudu.alltypes where id < 100;
---- RESULTS
'Inserted 100 row(s)'
---- RUNTIME_PROFILE
NumModifiedRows: 100
NumRowErrors: 0
====
---- QUERY
# Re-insert the same 100 rows that the CTAS above already wrote: every row collides
# with an existing primary key, so all 100 are expected to be reported as row errors
# and none as modified.
insert into kudu_test_tbl
select * from functional_kudu.alltypes where id < 100;
---- RESULTS
: 0
---- RUNTIME_PROFILE
NumModifiedRows: 0
NumRowErrors: 100
====
---- QUERY
# large insert - 100 rows were already inserted above and result in errors
# The statement is expected to succeed despite the row errors and to write the
# remaining 7200 rows of the source table.
insert into kudu_test_tbl
select * from functional_kudu.alltypes;
---- RESULTS
: 7200
---- RUNTIME_PROFILE
NumModifiedRows: 7200
NumRowErrors: 100
====
---- QUERY
# Insert rows that are not covered by any of the existing range partitions
# Only the row at 10000 is inserted.
# Shifting id by 10000 puts every row except the one with id = 0 above the inclusive
# upper bound (10000) of the last range partition of kudu_test_tbl, so those rows are
# expected to be counted as row errors.
insert into kudu_test_tbl SELECT cast(id + 10000 as int), bool_col, tinyint_col,
smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col,
timestamp_col, year, month
from functional_kudu.alltypes
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 7299
====
---- QUERY
# Table with all supported types as primary key and distribution columns
# The composite key is (tinyint, smallint, int, bigint, string). Range partitions are
# declared only for the three single key tuples (1,1,1,1,'1'), (2,2,2,2,'2') and
# (3,3,3,3,'3'), combined with 3 hash partitions.
create table allkeytypes (i1 tinyint, i2 smallint, i3 int, i4 bigint, name string,
valf float, vald double, primary key (i1, i2, i3, i4, name)) partition by
hash partitions 3, range (partition value = (1,1,1,1,'1'),
partition value = (2,2,2,2,'2'), partition value = (3,3,3,3,'3')) stored as kudu
---- RESULTS
====
---- QUERY
# Insert the source rows with 0 < id < 10 into allkeytypes; 3 rows are expected to be
# written and 6 to be rejected as row errors.
# NOTE(review): presumably the rejected rows are those whose key tuple does not match
# one of the three single-value range partitions - confirm against the alltypes data.
insert into allkeytypes select cast(id as tinyint), smallint_col, int_col,
cast (bigint_col/10 as bigint), string_col, float_col, double_col
from functional.alltypes where id > 0 and id < 10
---- RESULTS
: 3
---- RUNTIME_PROFILE
NumModifiedRows: 3
NumRowErrors: 6
====
---- QUERY
# Table with default values
# Mixes nullable and non-nullable columns, with and without defaults: 'e' is nullable
# without a default and 'f' is NOT NULL without a default, so 'f' has to be supplied
# by the inserts below.
create table tbl_with_defaults (a int primary key, b int null default 10,
c int not null default 100, d int default 1000, e int null, f int not null,
g string default 'test', h boolean default true) partition by hash (a)
partitions 3 stored as kudu
---- RESULTS
====
---- QUERY
# Only 'a' and 'f' are supplied. Columns declared with a default are expected to take
# it (b=10, c=100, d=1000, g='test', h=true), and 'e', which has no default, is
# expected to be NULL.
insert into tbl_with_defaults (a, f) values (1, 1), (2, 2), (3, 3), (4, 4)
---- RUNTIME_PROFILE
NumModifiedRows: 4
NumRowErrors: 0
---- LABELS
A, B, C, D, E, F, G, H
---- DML_RESULTS: tbl_with_defaults
1,10,100,1000,NULL,1,'test',true
2,10,100,1000,NULL,2,'test',true
3,10,100,1000,NULL,3,'test',true
4,10,100,1000,NULL,4,'test',true
---- TYPES
INT,INT,INT,INT,INT,INT,STRING,BOOLEAN
====
---- QUERY
# All columns are supplied explicitly, so none of the declared defaults is expected to
# show up in the new row.
insert into tbl_with_defaults values (5, 5, 5, 5, 5, 5, 'row', false)
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
A, B, C, D, E, F, G, H
---- DML_RESULTS: tbl_with_defaults
1,10,100,1000,NULL,1,'test',true
2,10,100,1000,NULL,2,'test',true
3,10,100,1000,NULL,3,'test',true
4,10,100,1000,NULL,4,'test',true
5,5,5,5,5,5,'row',false
---- TYPES
INT,INT,INT,INT,INT,INT,STRING,BOOLEAN
====
---- QUERY
# Add two columns to a table that already contains rows: a nullable column without a
# default ('i') and a NOT NULL column with default 10000 ('j').
alter table tbl_with_defaults add columns (i int null, j int not null default 10000)
---- RESULTS
====
---- QUERY
# Rows written before the ALTER TABLE are expected to read NULL for the new column 'i'
# and the default 10000 for the new column 'j'.
select * from tbl_with_defaults
---- RESULTS
1,10,100,1000,NULL,1,'test',true,NULL,10000
2,10,100,1000,NULL,2,'test',true,NULL,10000
3,10,100,1000,NULL,3,'test',true,NULL,10000
4,10,100,1000,NULL,4,'test',true,NULL,10000
5,5,5,5,5,5,'row',false,NULL,10000
---- TYPES
INT,INT,INT,INT,INT,INT,STRING,BOOLEAN,INT,INT
====
---- QUERY
# Insert a full row that also provides explicit values for the two added columns 'i'
# and 'j', overriding j's default.
insert into tbl_with_defaults values (6,6,6,6,6,6,'another row',false,6,6)
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
A, B, C, D, E, F, G, H, I, J
---- DML_RESULTS: tbl_with_defaults
1,10,100,1000,NULL,1,'test',true,NULL,10000
2,10,100,1000,NULL,2,'test',true,NULL,10000
3,10,100,1000,NULL,3,'test',true,NULL,10000
4,10,100,1000,NULL,4,'test',true,NULL,10000
5,5,5,5,5,5,'row',false,NULL,10000
6,6,6,6,6,6,'another row',false,6,6
---- TYPES
INT,INT,INT,INT,INT,INT,STRING,BOOLEAN,INT,INT
====
---- QUERY
# IMPALA-5217: Try to insert NULL to a 'NOT NULL' col with a target col list that leaves
# out some cols.
# 'c' is declared NOT NULL, so the row is expected to be rejected as a row error and
# nothing is written.
insert into tbl_with_defaults (a, c, f) values (0, null, 1)
---- RUNTIME_PROFILE
NumModifiedRows: 0
NumRowErrors: 1
====
---- QUERY
# IMPALA-5217: Insert NULL into a nullable col when a non-nullable col has been left out
# of the target col list.
# The omitted NOT NULL column 'c' is expected to take its default (100), while the
# explicit NULL for 'd' is expected to be stored as NULL instead of d's default (1000).
insert into tbl_with_defaults (a, b, d, f) values (0, 0, null, 0)
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
A, B, C, D, E, F, G, H, I, J
---- DML_RESULTS: tbl_with_defaults
0,0,100,NULL,NULL,0,'test',true,NULL,10000
1,10,100,1000,NULL,1,'test',true,NULL,10000
2,10,100,1000,NULL,2,'test',true,NULL,10000
3,10,100,1000,NULL,3,'test',true,NULL,10000
4,10,100,1000,NULL,4,'test',true,NULL,10000
5,5,5,5,5,5,'row',false,NULL,10000
6,6,6,6,6,6,'another row',false,6,6
---- TYPES
INT,INT,INT,INT,INT,INT,STRING,BOOLEAN,INT,INT
====
---- QUERY
# Table whose primary key and hash partitioning both span the two columns (x, y).
create table multiple_partition_cols (x bigint, y bigint, z string, primary key(x, y))
partition by hash(x, y) partitions 8 stored as kudu
---- RESULTS
====
---- QUERY
# SELECT with constant
# The constant 0 feeds the primary key / partition column 'x'; the other two columns
# come from the source row with id = 0.
insert into multiple_partition_cols select 0, bigint_col, string_col
from functional.alltypes where id = 0
---- RUNTIME_PROFILE
NumModifiedRows: 1
NumRowErrors: 0
---- LABELS
X,Y,Z
---- DML_RESULTS: multiple_partition_cols
0,0,'0'
---- TYPES
BIGINT,BIGINT,STRING
====
---- QUERY
# SELECT with constant NULL
# The NULL lands in primary key column 'y', so the row is expected to be reported as a
# row error and nothing is written.
insert into multiple_partition_cols select bigint_col, null, string_col
from functional.alltypes where id = 1
---- RESULTS
: 0
---- RUNTIME_PROFILE
NumModifiedRows: 0
NumRowErrors: 1
====
---- QUERY
# IMPALA-5611 - test a large insert with a memory limit
# CTAS from tpch.lineitem with each comment string tripled to make the rows wider, run
# under mem_limit=400m. The profile check requires 'SpilledRuns: 0 (0)', i.e. no sort
# run may be spilled during the insert.
set mem_limit=400m;
create table kudu_test primary key(a, b) partition by hash(a, b) partitions 8 stored as kudu as
select l_orderkey a, concat(l_comment, l_comment, l_comment) b from tpch.lineitem
---- RUNTIME_PROFILE
row_regex: .*SpilledRuns: 0 \(0\)
====