impala/testdata/workloads/functional-planner/queries/PlannerTest/insert.test
Commit 90f3b2f491 by Zoltan Borok-Nagy
IMPALA-10432: INSERT INTO Iceberg tables with partition transforms
This change implements INSERT INTO for Iceberg tables that use
partition transforms. Partition transforms are functions that calculate
partition data from row data.

Iceberg defines the following partition transforms (see the sketch after this list):
https://iceberg.apache.org/spec/#partition-transforms

 * IDENTITY
 * BUCKET
 * TRUNCATE
 * YEAR
 * MONTH
 * DAY
 * HOUR
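
For illustration, a minimal sketch of a table that uses several of
these transforms (table and column names are hypothetical; the spec is
expressed with Impala's PARTITIONED BY SPEC clause):

create table ice_t (id int, s string, ts timestamp)
partitioned by spec (bucket(5, s), year(ts))
stored as iceberg;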

INSERT INTO identity-partitioned Iceberg tables is already supported.
This patch adds support for the rest of the transforms.
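
A hedged example of the kind of statement this patch enables, against
the hypothetical table sketched above (rows are routed to partitions
via bucket(5, s) and year(ts)):

insert into ice_t
values (1, "alpha", cast("2010-03-14 00:00:00" as timestamp));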

We create the partitioning expressions in InsertStmt. Based on these
expressions, data is automatically shuffled and sorted by the backend
executors before rows are handed to the table sink operators. The table
sink operator writes the partitions one by one and creates a
human-readable partition path for each of them.
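
As a schematic illustration (not a plan generated by Impala), a spec
like (year(ts), bucket(5, s)) yields partitioning expressions along the
lines of year(ts) and iceberg_bucket_transform(s, 5), which then appear
as the exchange and sort keys; compare the Iceberg test cases at the
end of this file:

02:SORT
| order by: year(ts) ASC NULLS LAST, iceberg_bucket_transform(s, 5) ASC NULLS LAST
|
01:EXCHANGE [HASH(year(ts),iceberg_bucket_transform(s, 5))]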

Finally, we convert the partition path back to partition data and
create Iceberg DataFiles that record information about the files
written.
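
For example (illustrative values only), a data file for the spec above
might land under a human-readable path such as

  .../data/ts_year=2010/s_bucket=3/<data-file>.parq

and on commit that path is parsed back into the partition data recorded
in the corresponding DataFile entry.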

Testing:
 * added planner test
 * added e2e tests

Change-Id: I3edf02048cea78703837b248c55219c22d512b78
Reviewed-on: http://gerrit.cloudera.org:8080/16939
Reviewed-by: wangsheng <skyyws@163.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2021-01-18 18:46:42 +00:00

# insert into an unpartitioned table
insert into table functional.alltypesnopart
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes
where year=2009 and month=05
---- PLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=5/090501.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
====
# insert into a static partition
insert into table functional.alltypessmall
partition (year=2009, month=04)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes
where year=2009 and month=05
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,4)]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=5/090501.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,4)]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
====
# overwrite a static partition
insert overwrite table functional.alltypessmall
partition (year=2009, month=04)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes
where year=2009 and month=05
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=true, PARTITION-KEYS=(2009,4)]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=5/090501.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=true, PARTITION-KEYS=(2009,4)]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 5
HDFS partitions=1/24 files=1 size=20.36KB
row-size=81B cardinality=310
====
# insert into fully dynamic partitions
insert into table functional.alltypessmall
partition (year, month)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, year, month
from functional.alltypes
where year=2009 and month>10
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=610
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
02:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=610
|
01:EXCHANGE [HASH(`year`,`month`)]
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
====
# IMPALA-5293: noclustered hint prevents adding sort node
insert into table functional.alltypessmall
partition (year, month) /* +noclustered */
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, year, month
from functional.alltypes
where year=2009 and month>10
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(`year`,`month`)]
| partitions=24
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(`year`,`month`)]
| partitions=24
|
01:EXCHANGE [HASH(`year`,`month`)]
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
====
# insert into fully dynamic partitions. The source table has no stats and the insert
# statement has a partition clause, so hash partition before the sink.
insert into table functional.alltypessmall
partition (year, month)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, int_col, int_col
from functional_seq_snap.alltypes
where year=2009 and month>10
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(int_col,int_col)]
| partitions=unavailable
|
01:SORT
| order by: int_col ASC NULLS LAST, int_col ASC NULLS LAST
| row-size=72B cardinality=520
|
00:SCAN HDFS [functional_seq_snap.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=11.34KB
row-size=72B cardinality=520
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(int_col,int_col)]
| partitions=unavailable
|
02:SORT
| order by: int_col ASC NULLS LAST, int_col ASC NULLS LAST
| row-size=72B cardinality=520
|
01:EXCHANGE [HASH(int_col,int_col)]
|
00:SCAN HDFS [functional_seq_snap.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=11.34KB
row-size=72B cardinality=520
====
# insert into fully dynamic partitions;
# partitioned output doesn't require repartitioning
insert into table functional.alltypessmall
partition (year, month)
select min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col),
min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col),
min(timestamp_col), year, month
from functional.alltypes
where year=2009 and month>10
group by year, month
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(`year`,`month`)]
| partitions=24
|
02:SORT
| order by: `year` ASC NULLS LAST, `month` ASC NULLS LAST
| row-size=80B cardinality=24
|
01:AGGREGATE [FINALIZE]
| output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
| group by: `year`, `month`
| row-size=80B cardinality=24
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(`year`,`month`)]
| partitions=24
|
04:SORT
| order by: `year` ASC NULLS LAST, `month` ASC NULLS LAST
| row-size=80B cardinality=24
|
03:AGGREGATE [FINALIZE]
| output: min:merge(id), min:merge(bool_col), min:merge(tinyint_col), min:merge(smallint_col), min:merge(int_col), min:merge(bigint_col), min:merge(float_col), min:merge(double_col), min:merge(date_string_col), min:merge(string_col), min:merge(timestamp_col)
| group by: `year`, `month`
| row-size=80B cardinality=24
|
02:EXCHANGE [HASH(`year`,`month`)]
|
01:AGGREGATE [STREAMING]
| output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
| group by: `year`, `month`
| row-size=80B cardinality=24
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=89B cardinality=610
====
# insert into a partially dynamic partition
insert into table functional.alltypessmall
partition (year=2009, month)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, month
from functional.alltypes
where year=2009 and month>10
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
| partitions=12
|
01:SORT
| order by: month ASC NULLS LAST
| row-size=85B cardinality=610
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=85B cardinality=610
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,month)]
| partitions=12
|
02:SORT
| order by: month ASC NULLS LAST
| row-size=85B cardinality=610
|
01:EXCHANGE [HASH(`month`)]
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=85B cardinality=610
====
# insert into a partially dynamic partition
# partitioned output doesn't require repartitioning
insert into table functional.alltypessmall
partition (year=2009, month)
select min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col),
min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col),
min(timestamp_col), month
from functional.alltypes
where year=2009 and month>10
group by month
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,`month`)]
| partitions=12
|
02:SORT
| order by: `month` ASC NULLS LAST
| row-size=76B cardinality=12
|
01:AGGREGATE [FINALIZE]
| output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
| group by: `month`
| row-size=76B cardinality=12
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=85B cardinality=610
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=11/091101.txt 0:20179
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=12/091201.txt 0:20853
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2009,`month`)]
| partitions=12
|
04:SORT
| order by: `month` ASC NULLS LAST
| row-size=76B cardinality=12
|
03:AGGREGATE [FINALIZE]
| output: min:merge(id), min:merge(bool_col), min:merge(tinyint_col), min:merge(smallint_col), min:merge(int_col), min:merge(bigint_col), min:merge(float_col), min:merge(double_col), min:merge(date_string_col), min:merge(string_col), min:merge(timestamp_col)
| group by: `month`
| row-size=76B cardinality=12
|
02:EXCHANGE [HASH(`month`)]
|
01:AGGREGATE [STREAMING]
| output: min(id), min(bool_col), min(tinyint_col), min(smallint_col), min(int_col), min(bigint_col), min(float_col), min(double_col), min(date_string_col), min(string_col), min(timestamp_col)
| group by: `month`
| row-size=76B cardinality=12
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` > 10
HDFS partitions=2/24 files=2 size=40.07KB
row-size=85B cardinality=610
====
# insert into a partially dynamic partition
insert into table functional.alltypessmall
partition (year, month=4)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, year
from functional.alltypes
where year>2009 and month=4
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,4)]
| partitions=2
|
01:SORT
| order by: year ASC NULLS LAST
| row-size=85B cardinality=300
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` > 2009, `month` = 4
HDFS partitions=1/24 files=1 size=19.71KB
row-size=85B cardinality=300
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2010/month=4/100401.txt 0:20179
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(year,4)]
| partitions=2
|
01:SORT
| order by: year ASC NULLS LAST
| row-size=85B cardinality=300
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` > 2009, `month` = 4
HDFS partitions=1/24 files=1 size=19.71KB
row-size=85B cardinality=300
====
# insert with limit from partitioned table.
insert into table functional.alltypesnopart
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes where year=2009 and month=1 limit 10
---- PLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 1
HDFS partitions=1/24 files=1 size=19.95KB
limit: 10
row-size=81B cardinality=10
---- SCANRANGELOCATIONS
NODE 0:
HDFS SPLIT hdfs://localhost:20500/test-warehouse/alltypes/year=2009/month=1/090101.txt 0:20433
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
| limit: 10
|
00:SCAN HDFS [functional.alltypes]
partition predicates: `year` = 2009, `month` = 1
HDFS partitions=1/24 files=1 size=19.95KB
limit: 10
row-size=81B cardinality=10
====
# static partition insert from a constant select
insert into table functional.alltypessmall
partition (year=2010, month=4)
select 100, false, 1, 1, 1, 10,
10.0, 10.0, "02/01/09", "1", cast("2009-02-01 00:01:00" as timestamp)
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=1
row-size=54B cardinality=1
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=1
row-size=54B cardinality=1
====
# dynamic partition insert from a constant select
insert into table functional.alltypessmall
partition (year, month)
select 100, false, 1, 1, 1, 10,
10.0, 10.0, "02/01/09", "1", cast("2009-02-01 00:01:00" as timestamp), 2010, 4
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=1
row-size=57B cardinality=1
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=1
row-size=57B cardinality=1
====
# static partition insert from values statement
insert into table functional.alltypessmall
partition (year=2010, month=4) values
(100, false, 1, 1, 1, 10, 10.0, 10.0, "02/01/09", "1", cast("2009-02-01 00:01:00" as timestamp)),
(200, true, 2, 2, 2, 20, 20.0, 20.0, "02/02/09", "2", cast("2009-02-02 00:01:00" as timestamp)),
(300, false, 3, 3, 3, 30, 30.0, 30.0, "02/03/09", "3", cast("2009-02-03 00:01:00" as timestamp))
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=3
row-size=55B cardinality=3
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=1
|
00:UNION
constant-operands=3
row-size=55B cardinality=3
====
# dynamic partition insert from values statement
insert into table functional.alltypessmall
partition (year, month) values
(100, false, 1, 1, 1, 10, 10.0, 10.0, "02/01/09", "1", cast("2009-02-01 00:01:00" as timestamp), 2010, 4),
(200, true, 2, 2, 2, 20, 20.0, 20.0, "02/02/09", "2", cast("2009-02-02 00:01:00" as timestamp), 2010, 5),
(300, false, 3, 3, 3, 30, 30.0, 30.0, "02/03/09", "3", cast("2009-02-03 00:01:00" as timestamp), 2010, 6)
---- PLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=9
|
01:SORT
| order by: 2010 ASC NULLS LAST, 4 ASC NULLS LAST
| row-size=58B cardinality=3
|
00:UNION
constant-operands=3
row-size=58B cardinality=3
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypessmall, OVERWRITE=false, PARTITION-KEYS=(2010,4)]
| partitions=9
|
01:SORT
| order by: 2010 ASC NULLS LAST, 4 ASC NULLS LAST
| row-size=58B cardinality=3
|
00:UNION
constant-operands=3
row-size=58B cardinality=3
====
# test static partition insert from a query with grouped aggregation
# we expect the insert fragment to be partitioned by the grouping exprs of the query stmt
# and not by the partition exprs of the insert stmt
insert into functional.alltypes(bigint_col, string_col) partition (year=2010, month=10)
select count(int_col), string_col from functional.alltypes
group by string_col
---- PLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2010,10)]
| partitions=1
|
01:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=17B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2010,10)]
| partitions=1
|
03:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
02:EXCHANGE [HASH(string_col)]
|
01:AGGREGATE [STREAMING]
| output: count(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=17B cardinality=7.30K
====
# test static partition insert from a query with distinct grouped aggregation
# we expect the insert fragment to be partitioned by the grouping exprs of the query stmt
# and not by the partition exprs of the insert stmt
insert into functional.alltypes(bigint_col, string_col) partition (year=2010, month=10)
select count(distinct int_col), string_col from functional.alltypes
group by string_col
---- PLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2010,10)]
| partitions=1
|
02:AGGREGATE [FINALIZE]
| output: count(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
01:AGGREGATE
| group by: string_col, int_col
| row-size=17B cardinality=100
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=17B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2010,10)]
| partitions=1
|
06:AGGREGATE [FINALIZE]
| output: count:merge(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
05:EXCHANGE [HASH(string_col)]
|
02:AGGREGATE [STREAMING]
| output: count(int_col)
| group by: string_col
| row-size=21B cardinality=10
|
04:AGGREGATE
| group by: string_col, int_col
| row-size=17B cardinality=100
|
03:EXCHANGE [HASH(string_col,int_col)]
|
01:AGGREGATE [STREAMING]
| group by: string_col, int_col
| row-size=17B cardinality=100
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=17B cardinality=7.30K
====
# test that the planner chooses to repartition before the table sink
# alltypes has column stats and based on the product of the NDVs of year and month
# the planner should choose to repartition before the table sink
insert into table functional.alltypes partition(year, month)
select * from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
02:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# test noshuffle hint to prevent repartitioning (same query as above with hint)
insert into table functional.alltypes partition(year, month) [noshuffle]
select * from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# same as above but with traditional commented hint at default hint location
insert into table functional.alltypes partition(year, month) /* +noshuffle */
select * from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# same as above but with traditional commented hint at Oracle hint location
insert /* +noshuffle */ into table functional.alltypes partition(year, month)
select * from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# same as above but with end-of-line commented hint
insert into table functional.alltypes partition(year, month)
-- +noshuffle
select * from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# test that the planner does not repartition before the table sink
# alltypes has column stats and since year only has 2 distinct values the planner
# should choose not to repartition before the table sink
insert into table functional.alltypes partition(year, month=1)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, year
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,1)]
| partitions=2
|
01:SORT
| order by: year ASC NULLS LAST
| row-size=85B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=85B cardinality=7.30K
====
# test shuffle hint to force repartitioning (same query as above with hint)
insert into table functional.alltypes partition(year, month=1) [shuffle]
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, year
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,1)]
| partitions=2
|
02:SORT
| order by: year ASC NULLS LAST
| row-size=85B cardinality=7.30K
|
01:EXCHANGE [HASH(`year`)]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=85B cardinality=7.30K
====
# test insert/select stmt that contains an analytic function (IMPALA-1400)
insert into table functional.alltypestiny partition(year=2009, month=1)
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col,
lag(timestamp_col, 1) over (partition by id order by id) as timestamp_col
from functional.alltypestiny
---- PLAN
WRITE TO HDFS [functional.alltypestiny, OVERWRITE=false, PARTITION-KEYS=(2009,1)]
| partitions=1
|
02:ANALYTIC
| functions: lag(timestamp_col, 1, NULL)
| partition by: id
| order by: id ASC
| window: ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
| row-size=97B cardinality=8
|
01:SORT
| order by: id ASC NULLS LAST
| row-size=81B cardinality=8
|
00:SCAN HDFS [functional.alltypestiny]
HDFS partitions=4/4 files=4 size=460B
row-size=81B cardinality=8
====
# IMPALA-3930: Test insert with shuffle hint on constant partition exprs. The table sink
# is executed at the coordinator.
insert into table functional.alltypes partition(year=2009, month=1) /* +shuffle */
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2009,1)]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=81B cardinality=7.30K
====
# IMPALA-3930: Same as above but with a dynamic partition insert.
insert into table functional.alltypes partition(year, month) /* +shuffle */
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, 2009, 1
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2009,1)]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=81B cardinality=7.30K
====
# IMPALA-3930: Same as above but with a mix of static/dynamic partition exprs, and
# with more complex constant exprs.
insert into table functional.alltypes partition(year, month=cast(10/2 as int)) /* +shuffle */
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col, cast(concat("2", "010") as smallint) - 1
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(2009,5)]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=81B cardinality=7.30K
====
# Test insert into an unpartitioned table with shuffle hint.
insert into table functional.alltypesnopart /* +shuffle */
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col,
float_col, double_col, date_string_col, string_col, timestamp_col
from functional.alltypes
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=81B cardinality=7.30K
====
# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
insert into table functional.alltypes partition(year, month)
select * from functional.alltypes
---- PLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
02:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
01:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
insert into table functional.alltypes partition(year, month) /*+ noshuffle */
select * from functional.alltypes
---- PLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
01:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=89B cardinality=7.30K
====
# IMPALA-5293: ensure insert into partitioned table adds sort node without clustered hint.
# Subquery in WHERE-clause exercises the reset() + analyze() path during rewrite.
insert into table functional.alltypes partition(year, month)
select * from functional.alltypes
where int_col = (select max(int_col) from functional.alltypes)
---- PLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
04:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=730
|
03:HASH JOIN [LEFT SEMI JOIN]
| hash predicates: int_col = max(int_col)
| runtime filters: RF000 <- max(int_col)
| row-size=89B cardinality=730
|
|--02:AGGREGATE [FINALIZE]
| | output: max(int_col)
| | row-size=4B cardinality=1
| |
| 01:SCAN HDFS [functional.alltypes]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=4B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> int_col
row-size=89B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypes, OVERWRITE=false, PARTITION-KEYS=(year,month)]
| partitions=24
|
08:SORT
| order by: year ASC NULLS LAST, month ASC NULLS LAST
| row-size=89B cardinality=730
|
07:EXCHANGE [HASH(functional.alltypes.year,functional.alltypes.month)]
|
03:HASH JOIN [LEFT SEMI JOIN, BROADCAST]
| hash predicates: int_col = max(int_col)
| runtime filters: RF000 <- max(int_col)
| row-size=89B cardinality=730
|
|--06:EXCHANGE [BROADCAST]
| |
| 05:AGGREGATE [FINALIZE]
| | output: max:merge(int_col)
| | row-size=4B cardinality=1
| |
| 04:EXCHANGE [UNPARTITIONED]
| |
| 02:AGGREGATE
| | output: max(int_col)
| | row-size=4B cardinality=1
| |
| 01:SCAN HDFS [functional.alltypes]
| HDFS partitions=24/24 files=24 size=478.45KB
| row-size=4B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> int_col
row-size=89B cardinality=7.30K
====
# IMPALA-5293: ensure insert into non-partitioned table does not add sort node.
insert into table functional.alltypesnopart
select * from functional.alltypesnopart
---- PLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypesnopart]
HDFS partitions=1/1 files=0 size=0B
row-size=72B cardinality=0
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypesnopart]
HDFS partitions=1/1 files=0 size=0B
row-size=72B cardinality=0
====
# IMPALA-5293: ensure insert into non-partitioned table does not add sort node.
insert into table functional.alltypesnopart /*+ shuffle */
select * from functional.alltypesnopart
---- PLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
00:SCAN HDFS [functional.alltypesnopart]
HDFS partitions=1/1 files=0 size=0B
row-size=72B cardinality=0
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional.alltypesnopart, OVERWRITE=false]
| partitions=1
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypesnopart]
HDFS partitions=1/1 files=0 size=0B
row-size=72B cardinality=0
====
# Test that Iceberg partitioned inserts shuffle and sort data based on the
# partitioning columns.
insert into functional_parquet.iceberg_int_partitioned
select id % 3, id % 2, id from functional.alltypes
---- PLAN
WRITE TO HDFS [functional_parquet.iceberg_int_partitioned, OVERWRITE=false, PARTITION-KEYS=(id % 3,id % 2)]
| partitions=53290000
|
01:SORT
| order by: id % 3 ASC NULLS LAST, id % 2 ASC NULLS LAST
| row-size=4B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional_parquet.iceberg_int_partitioned, OVERWRITE=false, PARTITION-KEYS=(id % 3,id % 2)]
| partitions=53290000
|
02:SORT
| order by: id % 3 ASC NULLS LAST, id % 2 ASC NULLS LAST
| row-size=4B cardinality=7.30K
|
01:EXCHANGE [HASH(id % 3,id % 2)]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=4B cardinality=7.30K
====
# Test that Iceberg partition transform inserts shuffle and sort data based on the
# transforms. The table also uses ZORDER, which should be reflected in the SORT node.
insert into functional_parquet.iceberg_partition_transforms_zorder
select years_add(timestamp_col, id % 3),
concat(string_col, date_string_col),
cast(id * 3 as int),
cast(10000 - id as int)
from functional.alltypes
---- PLAN
WRITE TO HDFS [functional_parquet.iceberg_partition_transforms_zorder, OVERWRITE=false, PARTITION-KEYS=(year(years_add(timestamp_col, id % 3)),iceberg_bucket_transform(concat(string_col, date_string_col), 5))]
| partitions=5372800
|
01:SORT
| order by: LEXICAL: year(years_add(timestamp_col, id % 3)) ASC NULLS LAST, iceberg_bucket_transform(concat(string_col, date_string_col), 5) ASC NULLS LAST, ZORDER: CAST(id * 3 AS INT), CAST(10000 - id AS INT)
| row-size=61B cardinality=7.30K
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=53B cardinality=7.30K
---- DISTRIBUTEDPLAN
WRITE TO HDFS [functional_parquet.iceberg_partition_transforms_zorder, OVERWRITE=false, PARTITION-KEYS=(year(years_add(timestamp_col, id % 3)),iceberg_bucket_transform(concat(string_col, date_string_col), 5))]
| partitions=5372800
|
02:SORT
| order by: LEXICAL: year(years_add(timestamp_col, id % 3)) ASC NULLS LAST, iceberg_bucket_transform(concat(string_col, date_string_col), 5) ASC NULLS LAST, ZORDER: CAST(id * 3 AS INT), CAST(10000 - id AS INT)
| row-size=61B cardinality=7.30K
|
01:EXCHANGE [HASH(year(years_add(timestamp_col, id % 3)),iceberg_bucket_transform(concat(string_col, date_string_col), 5))]
|
00:SCAN HDFS [functional.alltypes]
HDFS partitions=24/24 files=24 size=478.45KB
row-size=53B cardinality=7.30K
====