mirror of
https://github.com/apache/impala.git
synced 2026-01-10 09:00:16 -05:00
This change codegens the hash partitioning logic of
KrpcDataStreamSender::Send() when the partitioning strategy is
HASH_PARTITIONED. It does so by unrolling the loop which evaluates
each row against the partitioning expressions and hashes the result.
It also replaces the number of channels of that sender with a
constant at runtime.

With this change, we get reasonable speedup with some benchmarks:

+------------+-----------------------+---------+------------+------------+----------------+
| Workload   | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+------------+-----------------------+---------+------------+------------+----------------+
| TPCH(_300) | parquet / none / none | 20.03   | -6.44%     | 13.56      | -7.15%         |
+------------+-----------------------+---------+------------+------------+----------------+

+---------------------+-----------------------+---------+------------+------------+----------------+
| Workload            | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+---------------------+-----------------------+---------+------------+------------+----------------+
| TARGETED-PERF(_300) | parquet / none / none | 58.59   | -5.56%     | 12.28      | -5.30%         |
+---------------------+-----------------------+---------+------------+------------+----------------+

+-------------------------+-----------------------+---------+------------+------------+----------------+
| Workload                | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+-------------------------+-----------------------+---------+------------+------------+----------------+
| TPCDS-UNMODIFIED(_1000) | parquet / none / none | 15.60   | -3.10%     | 7.16       | -4.33%         |
+-------------------------+-----------------------+---------+------------+------------+----------------+

+-------------------+-----------------------+---------+------------+------------+----------------+
| Workload          | File Format           | Avg (s) | Delta(Avg) | GeoMean(s) | Delta(GeoMean) |
+-------------------+-----------------------+---------+------------+------------+----------------+
| TPCH_NESTED(_300) | parquet / none / none | 30.93   | -3.02%     | 17.46      | -4.71%         |
+-------------------+-----------------------+---------+------------+------------+----------------+

Change-Id: I1c44cc9312c062cc7a5a4ac9156ceaa31fb887ff
Reviewed-on: http://gerrit.cloudera.org:8080/10421
Reviewed-by: Michael Ho <kwho@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
42 lines
1018 B
Plaintext
42 lines
1018 B
Plaintext
====
|
|
---- QUERY
|
|
set disable_codegen_rows_threshold=0;
|
|
select count(*) from alltypes t1
|
|
join /* +SHUFFLE */ alltypes t2
|
|
on t1.int_col= t2.int_col and
|
|
t1.string_col = t2.string_col
|
|
---- RESULTS
|
|
5329000
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
# Self-join of alltypes on (int_col, string_col) with a SHUFFLE hint.
# Verify that the runtime profile reports codegen as enabled for the
# hash partitioned sender.
# NOTE(review): disable_codegen_rows_threshold=0 is presumably set so that
# codegen is not skipped for this small input -- confirm against the option docs.
|
|
row_regex: .*Hash Partitioned Sender Codegen Enabled.*
|
|
====
|
|
---- QUERY
|
|
set disable_codegen_rows_threshold=0;
|
|
select count(*) from alltypes t1
|
|
join /* +BROADCAST */ alltypes t2
|
|
on t1.int_col= t2.int_col and
|
|
t1.string_col = t2.string_col
|
|
---- RESULTS
|
|
5329000
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
# Same self-join as the SHUFFLE case, but with a BROADCAST hint.
# Verify that the runtime profile reports sender codegen as disabled
# ("not needed") for the unpartitioned sender.
|
|
row_regex: .*Unpartitioned Sender Codegen Disabled: not needed.*
|
|
====
|
|
---- QUERY
|
|
set disable_codegen_rows_threshold=0;
|
|
select count(*) from chars_tiny t1
|
|
join /* +SHUFFLE */ chars_tiny t2 on t1.cs=t2.cs;
|
|
---- RESULTS
|
|
10
|
|
---- TYPES
|
|
bigint
|
|
---- RUNTIME_PROFILE
|
|
# Verify that codegen was disabled for the hash partitioned sender, with the
# profile giving "Codegen for Char not supported" as the reason.
# NOTE(review): presumably chars_tiny.cs is a CHAR column, which is what makes
# the SHUFFLE-hinted join key unsupported -- confirm against the table schema.
|
|
row_regex: .*Hash Partitioned Sender Codegen Disabled: Codegen for Char not supported.*
|
|
====
|