Files
impala/testdata/workloads/functional-query/queries/QueryTest/iceberg-optimize.test
Zoltan Borok-Nagy d086babdbd IMPALA-13598: OPTIMIZE redundantly accumulates memory in HDFS WRITER
When OptimizeStmt created the table sink it didn't set
'inputIsClustered' to true. Therefore HdfsTableSink expected
random input and kept the output writers open for every partition,
which resulted in high memory consumption and potentially an
OOM error when the number of partitions are high.

Since we actually sort the rows before the sink we can set
'inputIsClustered' to true, which means HdfsTableSink can write
files one by one, because whenever it gets a row that belongs
to a new partition it knows that it can close the current output
writer, and open a new one.

Testing:
 * added e2e test

Change-Id: I8d451c50c4b6dff9433ab105493051bee106bc63
Reviewed-on: http://gerrit.cloudera.org:8080/22192
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2024-12-11 22:19:10 +00:00

307 lines
11 KiB
Plaintext

====
---- QUERY
CREATE TABLE ice_optimize_part (int_col int, string_col string, bool_col boolean)
PARTITIONED BY SPEC(int_col)
STORED BY ICEBERG
TBLPROPERTIES ('format-version'='2');
====
---- QUERY
# Insert a value then delete everything from the table.
INSERT INTO ice_optimize_part VALUES(1, 'one', true);
DELETE FROM ice_optimize_part WHERE bool_col=true;
SHOW FILES IN ice_optimize_part;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/int_col=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/int_col=1/delete-.*parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Testing that OPTIMIZE merged the data and delete files and the result is an empty snapshot.
OPTIMIZE TABLE ice_optimize_part;
SHOW FILES IN ice_optimize_part;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Testing that OPTIMIZE empty table is no-op.
DESCRIBE HISTORY ice_optimize_part;
---- LABELS
creation_time,snapshot_id,parent_id,is_current_ancestor
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'.*','.*','NULL','TRUE'
row_regex:'.*','.*','.*','TRUE'
row_regex:'.*','.*','.*','TRUE'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
OPTIMIZE TABLE ice_optimize_part;
DESCRIBE HISTORY ice_optimize_part;
---- LABELS
creation_time,snapshot_id,parent_id,is_current_ancestor
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'.*','.*','NULL','TRUE'
row_regex:'.*','.*','.*','TRUE'
row_regex:'.*','.*','.*','TRUE'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
CREATE TABLE ice_optimize (int_col int, string_col string, bool_col boolean)
STORED BY ICEBERG
TBLPROPERTIES ('format-version'='2');
====
---- QUERY
# Insert a value then delete everything from the table.
INSERT INTO ice_optimize VALUES(1, 'one', true);
DELETE FROM ice_optimize WHERE bool_col=true;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Testing that OPTIMIZE merged the data and delete files and the result is an empty snapshot.
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Testing that OPTIMIZE empty table is no-op.
DESCRIBE HISTORY ice_optimize;
---- LABELS
creation_time,snapshot_id,parent_id,is_current_ancestor
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'.*','.*','NULL','TRUE'
row_regex:'.*','.*','.*','TRUE'
row_regex:'.*','.*','.*','TRUE'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
OPTIMIZE TABLE ice_optimize;
DESCRIBE HISTORY ice_optimize;
---- LABELS
creation_time,snapshot_id,parent_id,is_current_ancestor
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'.*','.*','NULL','TRUE'
row_regex:'.*','.*','.*','TRUE'
row_regex:'.*','.*','.*','TRUE'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Insert rows one by one to write multiple small files.
INSERT INTO ice_optimize VALUES(1, 'one', true);
INSERT INTO ice_optimize VALUES(2, 'two', false);
INSERT INTO ice_optimize VALUES(2, 'two', true);
INSERT INTO ice_optimize VALUES(3, 'three', true);
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# OPTIMIZE TABLE should create 1 data file per executor, resulting in a total of 3 files.
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
SELECT * FROM ice_optimize;
---- RESULTS
1,'one',true
2,'two',false
2,'two',true
3,'three',true
---- TYPES
INT,STRING,BOOLEAN
====
---- QUERY
DELETE FROM ice_optimize WHERE int_col = 2;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Checking that MAX_FS_WRITERS has an effect on the number of files written.
# Also checking that the delete file was merged and there is no delete file in the table.
SET MAX_FS_WRITERS=1;
OPTIMIZE TABLE ice_optimize;
SET MAX_FS_WRITERS=0;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- RESULTS: VERIFY_IS_NOT_IN
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
SELECT * FROM ice_optimize;
---- RESULTS
1,'one',true
3,'three',true
---- TYPES
INT,STRING,BOOLEAN
====
---- QUERY
# Schema evolution should work and return correct results according to the latest schema.
ALTER TABLE ice_optimize DROP COLUMN string_col;
ALTER TABLE ice_optimize ADD COLUMN date_col DATE;
ALTER TABLE ice_optimize ADD COLUMN double_col DOUBLE;
INSERT INTO ice_optimize VALUES((4, false, '2024-01-22', 4.444), (1, false, '2024-02-02', 1.101));
OPTIMIZE TABLE ice_optimize;
SELECT * FROM ice_optimize;
---- RESULTS
1,true,NULL,NULL
3,true,NULL,NULL
4,false,2024-01-22,4.444
1,false,2024-02-02,1.101
---- TYPES
INT,BOOLEAN,DATE,DOUBLE
====
---- QUERY
# OPTIMIZE TABLE should use the latest partition spec and create 1 file per partition.
ALTER TABLE ice_optimize SET PARTITION SPEC(int_col);
INSERT INTO ice_optimize VALUES((2, false, '2024-01-22', 2.2), (3, false, '2024-02-22', 3.3));
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=1.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=2.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=3.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=4.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
SELECT * FROM ice_optimize;
---- RESULTS
1,true,NULL,NULL
3,true,NULL,NULL
4,false,2024-01-22,4.444
1,false,2024-02-02,1.101
2,false,2024-01-22,2.2
3,false,2024-02-22,3.3
---- TYPES
INT,BOOLEAN,DATE,DOUBLE
====
---- QUERY
# OPTIMIZE TABLE should handle partition evolution and merge update deltas as well.
ALTER TABLE ice_optimize SET PARTITION SPEC(month(date_col));
UPDATE ice_optimize SET date_col='2023-01-07' WHERE bool_col=true;
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2023-01/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2024-01/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2024-02/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
ALTER TABLE ice_optimize SET PARTITION SPEC(bucket(2,int_col));
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col_bucket_2=0/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col_bucket_2=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
ALTER TABLE ice_optimize ADD COLUMN string_col STRING;
INSERT INTO ice_optimize VALUES((10, true, '2024-01-01', 10.1010, 'impala'), (11, true, '2024-01-01', 11.11, 'iceberg'));
ALTER TABLE ice_optimize SET PARTITION SPEC(truncate(2, string_col));
OPTIMIZE TABLE ice_optimize;
SHOW FILES IN ice_optimize;
---- LABELS
Path,Size,Partition,EC Policy
---- RESULTS: VERIFY_IS_EQUAL
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=__HIVE_DEFAULT_PARTITION__/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=ic/.*.0.parq','.*','','$ERASURECODE_POLICY'
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=im/.*.0.parq','.*','','$ERASURECODE_POLICY'
---- TYPES
STRING, STRING, STRING, STRING
====
---- QUERY
# Make sure that the table content is not modified by OPTIMIZE TABLE.
SELECT * FROM ice_optimize;
---- RESULTS
1,true,2023-01-07,NULL,'NULL'
3,true,2023-01-07,NULL,'NULL'
4,false,2024-01-22,4.444,'NULL'
1,false,2024-02-02,1.101,'NULL'
2,false,2024-01-22,2.2,'NULL'
3,false,2024-02-22,3.3,'NULL'
10,true,2024-01-01,10.1010,'impala'
11,true,2024-01-01,11.11,'iceberg'
---- TYPES
INT,BOOLEAN,DATE,DOUBLE,STRING
====
---- QUERY
# Regression test for IMPALA-13598.
# OPTIMIZE table with many partitions (with low mem_limit).
# Then we also check the number of files to verify that
# the inputs of the writers are actually sorted(/clustered).
CREATE TABLE ice_tpch_many_parts
PARTITIONED BY SPEC(truncate(1000, l_orderkey))
STORED BY ICEBERG
AS SELECT * FROM tpch_parquet.lineitem
WHERE l_orderkey % 2 = 0;
INSERT INTO ice_tpch_many_parts
SELECT * FROM tpch_parquet.lineitem WHERE l_orderkey % 2 = 1;
SET MEM_LIMIT=600m;
OPTIMIZE TABLE ice_tpch_many_parts;
SELECT count(*) FROM $DATABASE.ice_tpch_many_parts.`files`;
---- RESULTS
6001
---- TYPES
BIGINT
====