mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
Commit 7ca20b3c94 reverted the original optimized count(star) for ORC scan from commit f932d78ad0 (gerrit review http://gerrit.cloudera.org:8080/18327). The revert was necessary because the unification of the count star and zero slot functions into HdfsColumnarScanner caused a significant regression for non-optimized count star queries in parquet format (over 15% slower MaterializeTupleTime). This patch reimplements optimized count(star) for the ORC scan code path while minimizing the code changes needed for the parquet scan code path. After this patch, the ORC and parquet code paths will have only the following new things in common: - THdfsScanNode.count_star_slot_offset renamed to THdfsScanNode.star_slot_offset - HdfsScanner::IssueFooterRanges will only issue footer ranges if IsZeroSlotTableScan() or optimize_count_star() is true (made possible for parquet by IMPALA-12631). The structure of HdfsParquetScanner::GetNextInternal() remains unchanged. Its zero scan slot code path is still served through num_rows metadata from the parquet footer, while the optimized count star code path still loops over row groups metadata (also from the parquet footer). The following table shows single-node benchmark results of 3 count query variants on TPC-DS scale 10, both in ORC and parquet format, looped 9 times. 
+-----------+---------------------------+---------+--------+-------------+------------+ | Workload | Query | Format | Avg(s) | Base Avg(s) | Delta(Avg) | +-----------+---------------------------+---------+--------+-------------+------------+ | TPCDS(10) | TPCDS-Q_COUNT_UNOPTIMIZED | orc | 0.30 | 0.28 | +6.50% | | TPCDS(10) | TPCDS-Q_COUNT_OPTIMIZED | parquet | 0.14 | 0.14 | +1.56% | | TPCDS(10) | TPCDS-Q_COUNT_ZERO_SLOT | parquet | 0.27 | 0.27 | +1.42% | | TPCDS(10) | TPCDS-Q_COUNT_ZERO_SLOT | orc | 0.28 | 0.29 | -3.03% | | TPCDS(10) | TPCDS-Q_COUNT_UNOPTIMIZED | parquet | 0.21 | 0.22 | -4.45% | | TPCDS(10) | TPCDS-Q_COUNT_OPTIMIZED | orc | 0.14 | 0.21 | I -35.92% | +-----------+---------------------------+---------+--------+-------------+------------+ Testing: - Restore PlannerTest.testOrcStatsAgg - Restore TestAggregationQueriesRunOnce and TestAggregationQueriesRunOnce::test_orc_count_star_optimization - Exercise count(star) in TestOrc::test_misaligned_orc_stripes - Pass core tests Change-Id: I5971c8f278e1dee44e2a8dd4d2f043d22ebf5d17 Reviewed-on: http://gerrit.cloudera.org:8080/19927 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
169 lines
3.1 KiB
Plaintext
169 lines
3.1 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Basic partition key scan.
|
|
select distinct year
|
|
from alltypes
|
|
---- RESULTS
|
|
2009
|
|
2010
|
|
---- TYPES
|
|
INT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Test with more complex multiple distinct aggregation.
|
|
select count(distinct year), count(distinct month)
|
|
from alltypes
|
|
---- RESULTS
|
|
2,12
|
|
---- TYPES
|
|
BIGINT,BIGINT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Distinct aggregation with multiple columns.
|
|
select distinct year, month
|
|
from alltypes
|
|
---- RESULTS
|
|
2009,1
|
|
2009,2
|
|
2009,3
|
|
2009,4
|
|
2009,5
|
|
2009,6
|
|
2009,7
|
|
2009,8
|
|
2009,9
|
|
2009,10
|
|
2009,11
|
|
2009,12
|
|
2010,1
|
|
2010,2
|
|
2010,3
|
|
2010,4
|
|
2010,5
|
|
2010,6
|
|
2010,7
|
|
2010,8
|
|
2010,9
|
|
2010,10
|
|
2010,11
|
|
2010,12
|
|
---- TYPES
|
|
INT,INT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Partition key scan combined with analytic function.
|
|
select year, row_number() over (order by year)
|
|
from alltypes group by year;
|
|
---- RESULTS
|
|
2009,1
|
|
2010,2
|
|
---- TYPES
|
|
INT,BIGINT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Partition key scan combined with sort.
|
|
select distinct year, month
|
|
from alltypes
|
|
order by year, month
|
|
---- RESULTS
|
|
2009,1
|
|
2009,2
|
|
2009,3
|
|
2009,4
|
|
2009,5
|
|
2009,6
|
|
2009,7
|
|
2009,8
|
|
2009,9
|
|
2009,10
|
|
2009,11
|
|
2009,12
|
|
2010,1
|
|
2010,2
|
|
2010,3
|
|
2010,4
|
|
2010,5
|
|
2010,6
|
|
2010,7
|
|
2010,8
|
|
2010,9
|
|
2010,10
|
|
2010,11
|
|
2010,12
|
|
---- TYPES
|
|
INT,INT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Partition key scan combined with predicate on partition columns.
|
|
select distinct year, month
|
|
from alltypes
|
|
where year - 2000 = month;
|
|
---- RESULTS
|
|
2009,9
|
|
2010,10
|
|
---- TYPES
|
|
INT,INT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 2
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 2
|
|
aggregation(SUM, NumFileMetadataRead): 2
|
|
====
|
|
---- QUERY
|
|
# Partition key scan combined with having predicate.
|
|
select year, min(month)
|
|
from alltypes
|
|
group by year
|
|
having min(month) = 1
|
|
---- RESULTS
|
|
2009,1
|
|
2010,1
|
|
---- TYPES
|
|
INT,INT
|
|
---- RUNTIME_PROFILE
|
|
# Confirm that only one row per file is read.
|
|
aggregation(SUM, RowsRead): 24
|
|
---- RUNTIME_PROFILE: table_format=parquet,orc
|
|
aggregation(SUM, RowsRead): 24
|
|
aggregation(SUM, NumFileMetadataRead): 24
|
|
====
|
|
---- QUERY
|
|
# Empty table should not return any rows
|
|
select distinct 'test'
|
|
from emptytable
|
|
---- RESULTS
|
|
---- TYPES
|
|
STRING
|
|
====
|