mirror of
https://github.com/apache/impala.git
synced 2026-02-01 12:00:22 -05:00
If an Iceberg table is frequently updated/written to in small batches, a lot of small files are created. This decreases read performance. Similarly, frequent row-level deletes contribute to this problem by creating delete files, which have to be merged on read. So far INSERT OVERWRITE (rewriting the table with itself) has been used to compact Iceberg tables. However, it comes with some restrictions: - The table should not have multiple partition specs/partition evolution. - The table should not contain complex types. The OPTIMIZE statement offers a new syntax and a solution limited to Iceberg tables to enhance read performance for subsequent operations. See IMPALA-12293 for details. Syntax: OPTIMIZE TABLE <table_name>; This first patch introduces the new syntax, temporarily as an alias for INSERT OVERWRITE. Note that executing OPTIMIZE TABLE requires ALL privileges. Testing: - negative tests - FE planner test - Ranger test - E2E tests Change-Id: Ief42537499ffe64fafdefe25c8d175539234c4e7 Reviewed-on: http://gerrit.cloudera.org:8080/20405 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
123 lines
4.3 KiB
Plaintext
123 lines
4.3 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Create the unpartitioned Iceberg table (format version 2) that the test cases below
# write to, compact and query.
CREATE TABLE ice_optimize (i int, s string)
|
|
STORED BY ICEBERG
|
|
TBLPROPERTIES ('format-version'='2');
|
|
====
|
|
---- QUERY
|
|
# Insert rows one by one to write multiple small files.
# Each single-row INSERT is expected to leave its own data file behind, so SHOW FILES
# must list three data files.
|
|
INSERT INTO ice_optimize VALUES(1, 'one');
|
|
INSERT INTO ice_optimize VALUES(2, 'two');
|
|
INSERT INTO ice_optimize VALUES(3, 'three');
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# OPTIMIZE TABLE should create 1 data file.
# The small files written by the earlier single-row INSERTs are expected to be
# rewritten into it, so SHOW FILES must now list a single file.
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Compaction must not change the table contents: all three inserted rows are
# still expected.
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,'one'
|
|
2,'two'
|
|
3,'three'
|
|
---- TYPES
|
|
INT,STRING
|
|
====
|
|
---- QUERY
|
|
# Delete one row from the format-version 2 table. The expected listing names a data
# file and a 'delete-' file, i.e. the row is expected to be removed via a delete file
# rather than by rewriting the data file.
DELETE FROM ice_optimize WHERE i = 2;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/delete-.*parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# Check that OPTIMIZE TABLE merged the delete file into the data: afterwards the table
# should consist of a single file and no separate delete file should remain.
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SHOW FILES IN ice_optimize;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize/data/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# The deleted row (i = 2) must stay deleted after compaction; the other two rows
# are unchanged.
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,'one'
|
|
3,'three'
|
|
---- TYPES
|
|
INT,STRING
|
|
====
|
|
---- QUERY
|
|
# Schema evolution should work and return correct results according to the latest schema.
# Column s is dropped and column b is added before compacting: the result must no
# longer contain s, and rows written before b existed are expected to read b as NULL.
|
|
ALTER TABLE ice_optimize DROP COLUMN s;
|
|
ALTER TABLE ice_optimize ADD COLUMN b BOOLEAN;
|
|
INSERT INTO ice_optimize VALUES(4, true);
|
|
OPTIMIZE TABLE ice_optimize;
|
|
SELECT * FROM ice_optimize;
|
|
---- RESULTS
|
|
1,NULL
|
|
3,NULL
|
|
4,true
|
|
---- TYPES
|
|
INT,BOOLEAN
|
|
====
|
|
---- QUERY
|
|
# Create an Iceberg table partitioned by i for the per-partition compaction checks.
# NOTE(review): this table uses format-version 1 while ice_optimize uses 2 - confirm
# that the difference is intentional.
CREATE TABLE ice_optimize_part
|
|
PARTITIONED BY(i int)
|
|
STORED BY ICEBERG
|
|
TBLPROPERTIES ('format-version'='1');
|
|
====
|
|
---- QUERY
|
|
# Insert values into each partition to write multiple small files in each.
# Across the three INSERTs, i=1 and i=2 each receive two rows and i=3 receives three;
# one file per inserted row is expected, i.e. 2, 2 and 3 files respectively.
|
|
INSERT INTO ice_optimize_part VALUES(1), (2), (3);
|
|
INSERT INTO ice_optimize_part VALUES(2), (3);
|
|
INSERT INTO ice_optimize_part VALUES(1), (3);
|
|
SHOW FILES IN ice_optimize_part;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=2/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=2/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=3/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=3/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=3/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
====
|
|
---- QUERY
|
|
# OPTIMIZE TABLE should create 1 data file per partition.
# SHOW FILES must therefore list one file under each of i=1, i=2 and i=3.
|
|
OPTIMIZE TABLE ice_optimize_part;
|
|
SHOW FILES IN ice_optimize_part;
|
|
---- LABELS
|
|
Path,Size,Partition,EC Policy
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=1/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=2/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_optimize_part/data/i=3/.*.0.parq','.*','','$ERASURECODE_POLICY'
|
|
---- TYPES
|
|
STRING, STRING, STRING, STRING
|
|
==== |