mirror of
https://github.com/apache/impala.git
synced 2026-02-02 15:00:38 -05:00
When OptimizeStmt created the table sink it didn't set 'inputIsClustered' to true. Therefore HdfsTableSink expected random input and kept the output writers open for every partition, which resulted in high memory consumption and potentially an OOM error when the number of partitions are high. Since we actually sort the rows before the sink we can set 'inputIsClustered' to true, which means HdfsTableSink can write files one by one, because whenever it gets a row that belongs to a new partition it knows that it can close the current output writer, and open a new one. Testing: * added e2e test Change-Id: I8d451c50c4b6dff9433ab105493051bee106bc63 Reviewed-on: http://gerrit.cloudera.org:8080/22192 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
307 lines
11 KiB
Plaintext
307 lines
11 KiB
Plaintext
====
|
|
---- QUERY
|
|
CREATE TABLE ice_optimize_part (int_col int, string_col string, bool_col boolean)
|
|
PARTITIONED BY SPEC(int_col)
|
|
STORED BY ICEBERG
|
|
TBLPROPERTIES ('format-version'='2');
|
|
====
|
|
---- QUERY
|
|
# Insert a value then delete everything from the table.
|
|
INSERT INTO ice_optimize_part VALUES(1, 'one', true);
|
|
DELETE FROM ice_optimize_part WHERE bool_col=true;
|
|
SHOW FILES IN ice_optimize_part;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/int_col=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/int_col=1/delete-.*parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Testing that OPTIMIZE merged the data and delete files and the result is an empty snapshot.
|
|
OPTIMIZE TABLE ice_optimize_part;
|
|
SHOW FILES IN ice_optimize_part;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Testing that OPTIMIZE empty table is no-op.
|
|
DESCRIBE HISTORY ice_optimize_part;
|
|
---- LABELS
|
|
creation_time,snapshot_id,parent_id,is_current_ancestor
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'.*','.*','NULL','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
OPTIMIZE TABLE ice_optimize_part;
|
|
DESCRIBE HISTORY ice_optimize_part;
|
|
---- LABELS
|
|
creation_time,snapshot_id,parent_id,is_current_ancestor
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'.*','.*','NULL','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
CREATE TABLE ice_optimize (int_col int, string_col string, bool_col boolean)
|
|
STORED BY ICEBERG
|
|
TBLPROPERTIES ('format-version'='2');
|
|
====
|
|
---- QUERY
|
|
# Insert a value then delete everything from the table.
|
|
INSERT INTO ice_optimize VALUES(1, 'one', true);
|
|
DELETE FROM ice_optimize WHERE bool_col=true;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Testing that OPTIMIZE merged the data and delete files and the result is an empty snapshot.
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Testing that OPTIMIZE empty table is no-op.
|
|
DESCRIBE HISTORY ice_optimize;
|
|
---- LABELS
|
|
creation_time,snapshot_id,parent_id,is_current_ancestor
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'.*','.*','NULL','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
OPTIMIZE TABLE ice_optimize;
|
|
DESCRIBE HISTORY ice_optimize;
|
|
---- LABELS
|
|
creation_time,snapshot_id,parent_id,is_current_ancestor
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'.*','.*','NULL','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
row_regex:'.*','.*','.*','TRUE'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Insert rows one by one to write multiple small files.
|
|
INSERT INTO ice_optimize VALUES(1, 'one', true);
|
|
INSERT INTO ice_optimize VALUES(2, 'two', false);
|
|
INSERT INTO ice_optimize VALUES(2, 'two', true);
|
|
INSERT INTO ice_optimize VALUES(3, 'three', true);
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# OPTIMIZE TABLE should create 1 data file per executor, resulting in a total of 3 files.
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,'one',true
|
|
2,'two',false
|
|
2,'two',true
|
|
3,'three',true
|
|
---- TYPES
|
|
INT,STRING,BOOLEAN
|
|
====
|
|
---- QUERY
|
|
DELETE FROM ice_optimize WHERE int_col = 2;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Checking that MAX_FS_WRITERS has an effect on the number of files written.
|
|
# Also checking that the delete file was merged and there is no delete file in the table.
|
|
SET MAX_FS_WRITERS=1;
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SET MAX_FS_WRITERS=0;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- RESULTS: VERIFY_IS_NOT_IN
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,'one',true
|
|
3,'three',true
|
|
---- TYPES
|
|
INT,STRING,BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Schema evolution should work and return correct results according to the latest schema.
|
|
ALTER TABLE ice_optimize DROP COLUMN string_col;
|
|
ALTER TABLE ice_optimize ADD COLUMN date_col DATE;
|
|
ALTER TABLE ice_optimize ADD COLUMN double_col DOUBLE;
|
|
INSERT INTO ice_optimize VALUES((4, false, '2024-01-22', 4.444), (1, false, '2024-02-02', 1.101));
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,true,NULL,NULL
|
|
3,true,NULL,NULL
|
|
4,false,2024-01-22,4.444
|
|
1,false,2024-02-02,1.101
|
|
---- TYPES
|
|
INT,BOOLEAN,DATE,DOUBLE
|
|
====
|
|
---- QUERY
|
|
# OPTIMIZE TABLE should use the latest partition spec and create 1 file per partition.
|
|
ALTER TABLE ice_optimize SET PARTITION SPEC(int_col);
|
|
INSERT INTO ice_optimize VALUES((2, false, '2024-01-22', 2.2), (3, false, '2024-02-22', 3.3));
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=1.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=2.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=3.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col=4.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,true,NULL,NULL
|
|
3,true,NULL,NULL
|
|
4,false,2024-01-22,4.444
|
|
1,false,2024-02-02,1.101
|
|
2,false,2024-01-22,2.2
|
|
3,false,2024-02-22,3.3
|
|
---- TYPES
|
|
INT,BOOLEAN,DATE,DOUBLE
|
|
====
|
|
---- QUERY
|
|
# OPTIMIZE TABLE should handle partition evolution and merge update deltas as well.
|
|
ALTER TABLE ice_optimize SET PARTITION SPEC(month(date_col));
|
|
UPDATE ice_optimize SET date_col='2023-01-07' WHERE bool_col=true;
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2023-01/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2024-01/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/date_col_month=2024-02/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
ALTER TABLE ice_optimize SET PARTITION SPEC(bucket(2,int_col));
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col_bucket_2=0/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/int_col_bucket_2=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
ALTER TABLE ice_optimize ADD COLUMN string_col STRING;
|
|
INSERT INTO ice_optimize VALUES((10, true, '2024-01-01', 10.1010, 'impala'), (11, true, '2024-01-01', 11.11, 'iceberg'));
|
|
ALTER TABLE ice_optimize SET PARTITION SPEC(truncate(2, string_col));
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=__HIVE_DEFAULT_PARTITION__/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=ic/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/string_col_trunc_2=im/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Make sure that the table content is not modified by OPTIMIZE TABLE.
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,true,2023-01-07,NULL,'NULL'
|
|
3,true,2023-01-07,NULL,'NULL'
|
|
4,false,2024-01-22,4.444,'NULL'
|
|
1,false,2024-02-02,1.101,'NULL'
|
|
2,false,2024-01-22,2.2,'NULL'
|
|
3,false,2024-02-22,3.3,'NULL'
|
|
10,true,2024-01-01,10.1010,'impala'
|
|
11,true,2024-01-01,11.11,'iceberg'
|
|
---- TYPES
|
|
INT,BOOLEAN,DATE,DOUBLE,STRING
|
|
====
|
|
---- QUERY
|
|
# Regression test for IMPALA-13598.
|
|
# OPTIMIZE table with many partitions (with low mem_limit).
|
|
# Then we also check the number of files to verify that
|
|
# the inputs of the writers are actually sorted(/clustered).
|
|
CREATE TABLE ice_tpch_many_parts
|
|
PARTITIONED BY SPEC(truncate(1000, l_orderkey))
|
|
STORED BY ICEBERG
|
|
AS SELECT * FROM tpch_parquet.lineitem
|
|
WHERE l_orderkey % 2 = 0;
|
|
|
|
INSERT INTO ice_tpch_many_parts
|
|
SELECT * FROM tpch_parquet.lineitem WHERE l_orderkey % 2 = 1;
|
|
|
|
SET MEM_LIMIT=600m;
|
|
OPTIMIZE TABLE ice_tpch_many_parts;
|
|
SELECT count(*) FROM $DATABASE.ice_tpch_many_parts.`files`;
|
|
---- RESULTS
|
|
6001
|
|
---- TYPES
|
|
BIGINT
|
|
====
|