Files
impala/testdata/workloads/functional-query/queries/QueryTest/iceberg-plain-count-star-optimization.test
LPL f38c53235f IMPALA-11279: Optimize plain count(*) queries for Iceberg tables
This commit optimizes plain count(*) queries on Iceberg tables.
When `org.apache.iceberg.SnapshotSummary#TOTAL_RECORDS_PROP` can be
retrieved from the current `org.apache.iceberg.BaseSnapshot#summary` of
the Iceberg table, this kind of query can be very fast. If the property
cannot be retrieved, the query aggregates the `num_rows` of the parquet
`file_metadata_` as usual.

Queries must meet all of the following requirements to be optimized:
 - The SelectStmt does not have a WHERE clause
 - The SelectStmt does not have a GROUP BY clause
 - The SelectStmt does not have a HAVING clause
 - The TableRefs of the FROM clause contain only one BaseTableRef
 - The table is an Iceberg table
 - The SelectList must contain 'count(*)' or 'count(constant)'
 - The SelectList can contain other aggregate functions, e.g. min, sum, etc.
 - The SelectList can contain constants

Testing:
 - Added end-to-end test
 - Existing tests
 - Tested in a real cluster

Change-Id: I8e9c48bbba7ab2320fa80915e7001ce54f1ef6d9
Reviewed-on: http://gerrit.cloudera.org:8080/18574
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2022-06-27 15:29:11 +00:00

236 lines
5.1 KiB
Plaintext

====
---- QUERY
# Partitioned Iceberg table (parquet data files) used by the count(*) tests below.
create table ice_tbl (
  col_i INT,
  col_s STRING
)
partitioned by spec (col_s)
stored as iceberg
tblproperties ('write.format.default' = 'parquet');
---- RESULTS
'Table has been created.'
====
---- QUERY
# count(*) on the still-empty table must return 0; the profile must report
# zero row groups and zero file-metadata reads.
select count(*)
from ice_tbl;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# First insert: three rows, all with col_s = "odd"; a single data file under
# col_s=odd is expected.
insert into ice_tbl
values (1, "odd"), (3, "odd"), (5, "odd");
show files in ice_tbl;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=odd/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# After the first insert count(*) must be 3, still with both profile counters at 0.
select count(*)
from ice_tbl;
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Iceberg copy of ice_tbl (no partition spec given) holding the 3 rows present so far.
create table ice_tbl_u1
stored as iceberg
as select * from ice_tbl;
---- RESULTS
'Inserted 3 row(s)'
====
---- QUERY
# count(*) on the CTAS copy: 3 rows, both profile counters must be 0.
select count(*)
from ice_tbl_u1;
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Second insert: three rows with col_s = "even"; one file per partition value
# is now expected.
insert into ice_tbl
values (2, "even"), (4, "even"), (6, "even");
show files in ice_tbl;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=even/.*.0.parq','.*',''
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=odd/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# After the second insert count(*) must be 6, both profile counters still 0.
select count(*)
from ice_tbl;
---- RESULTS
6
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Second Iceberg copy of ice_tbl (no partition spec given), now with 6 rows.
create table ice_tbl_u2
stored as iceberg
as select * from ice_tbl;
---- RESULTS
'Inserted 6 row(s)'
====
---- QUERY
# count(*) on the second CTAS copy: 6 rows, both profile counters must be 0.
select count(*)
from ice_tbl_u2;
---- RESULTS
6
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Third insert adds one row to each partition value; two files per partition
# value (four in total) are expected afterwards.
insert into ice_tbl
values (1, "odd"), (2, "even");
show files in ice_tbl;
---- RESULTS
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=even/.*.0.parq','.*',''
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=even/.*.0.parq','.*',''
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=odd/.*.0.parq','.*',''
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/ice_tbl/data/col_s=odd/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# After the third insert count(*) must be 8, both profile counters still 0.
select count(*)
from ice_tbl;
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Time-travel form (FOR SYSTEM_TIME AS OF now()): same count of 8 and the same
# zero-valued profile counters are expected.
select count(*)
from ice_tbl for system_time as of now();
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# count(*) next to count(col_i): the analyzed query must show count(*) replaced
# by the constant 8 while count(col_i) is kept.
set explain_level=3;
explain
select count(col_i), count(*)
from ice_tbl;
---- RESULTS: VERIFY_IS_SUBSET
'Analyzed query: SELECT count(col_i), CAST(8 AS BIGINT) FROM'
'$DATABASE.ice_tbl'
====
---- QUERY
# count(*) next to count(distinct col_i): only count(*) must be replaced by the
# constant 8 in the analyzed query.
set explain_level=3;
explain
select count(distinct col_i), count(*)
from ice_tbl;
---- RESULTS: VERIFY_IS_SUBSET
'Analyzed query: SELECT count(DISTINCT col_i), CAST(8 AS BIGINT) FROM'
'$DATABASE.ice_tbl'
====
---- QUERY
# count(*) between min() and max(): the analyzed query must keep both
# aggregates and show the constant 8 in place of count(*).
set explain_level=3;
explain
select min(col_i), count(*), max(col_i)
from ice_tbl;
---- RESULTS: VERIFY_IS_SUBSET
'Analyzed query: SELECT min(col_i), CAST(8 AS BIGINT), max(col_i) FROM'
'$DATABASE.ice_tbl'
====
---- QUERY
# count(*) between two literal constants: the analyzed select list must consist
# of three casts, with 8 standing in for count(*).
set explain_level=3;
explain
select 123, count(*), 321
from ice_tbl;
---- RESULTS: VERIFY_IS_SUBSET
'Analyzed query: SELECT CAST(123 AS TINYINT), CAST(8 AS BIGINT), CAST(321 AS'
'SMALLINT)'
====
---- QUERY
# With a WHERE clause: 4 rows have col_s = 'odd', and the profile must report
# 2 row groups (NumFileMetadataRead stays 0).
select count(*)
from ice_tbl
where col_s = 'odd';
---- RESULTS
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# With a HAVING clause: avg(col_i) is never negative for the inserted data, so
# no row is returned; the profile must report 4 row groups.
select count(*)
from ice_tbl
having avg(col_i) < 0;
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# With a GROUP BY clause: 4 rows per col_s value; the profile must report
# 4 row groups.
select count(*)
from ice_tbl
group by col_s;
---- RESULTS
4
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# count(distinct col_i) without count(*): six distinct values (1 through 6);
# the profile must report 4 row groups.
select count(distinct col_i)
from ice_tbl;
---- RESULTS
6
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Empty the table again so that count(*) can be re-checked on a truncated table.
truncate table ice_tbl;
---- RESULTS
'Table has been truncated.'
====
---- QUERY
# After truncate count(*) must be back to 0, with both profile counters at 0.
select count(*)
from ice_tbl;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====
---- QUERY
# Plain (non-Iceberg) partitioned Parquet table used as a contrast case.
create table parq_tbl (
  col_i INT,
  col_s STRING
)
partitioned by (x INT)
stored as parquet;
---- RESULTS
'Table has been created.'
====
---- QUERY
# Three single-row inserts into three different partitions of the Parquet
# table; count(*) must be 3 and the profile must report 3 file-metadata reads
# and no row groups.
insert into parq_tbl partition (x = 12340) values (0, "a");
insert into parq_tbl partition (x = 12341) values (1, "b");
insert into parq_tbl partition (x = 12342) values (2, "c");
select count(*)
from parq_tbl;
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 3
====
---- QUERY
# count(*) in both operands of a UNION ALL over the two CTAS copies; the
# ORDER BY fixes the row order, and both profile counters must stay 0.
select count(*) as c
from ice_tbl_u1
union all
(select count(*) as c from ice_tbl_u2)
order by c;
---- RESULTS
3
6
---- TYPES
BIGINT
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 0
aggregation(SUM, NumFileMetadataRead): 0
====