mirror of
https://github.com/apache/impala.git
synced 2026-02-02 06:00:36 -05:00
PlanNode does not consider some factors when estimating memory,
which causes a large error in the resulting estimates.
AggregationNode
1.MemoryEstimate = Ndv * (AvgRowSize + SizeOfBucket)
2.When estimating the Ndv of merge aggregation, Ndv should be
divided only once.
3.If there is no grouping exprs, MemoryEstimate =
MIN_PLAIN_AGG_MEM
SortNode
1. MemoryEstimate = Cardinality * AvgRowSize, i.e. the memory used when
there is enough memory available.
HashJoinNode
1. MemoryEstimate = DataRows + Buckets + DuplicateNodes, where
DataRows = RightTableCardinality * AvgRowSize,
Buckets = roundUpToPowerOf2(RightTableCardinality) *
SizeOfBucket,
DuplicateNodes = (RightTableCardinality - RightNdv) *
SizeOfDuplicateNode
KuduScanNode
1. MemoryEstimate = Columns * BytesPerColumn * MaxScannerThreads,
where Columns counts only the columns scanned by the query, not all the columns of the table.
UnitTest
1.CardinalityTest adds test cases to test memory estimation.
Modify existing test cases related to memory estimation
Change-Id: Ic01db168ff2c6d6de33ee553a8175599f035d7a1
Reviewed-on: http://gerrit.cloudera.org:8080/16842
Reviewed-by: Zoltan Borok-Nagy <boroknagyz@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
328 lines
8.6 KiB
Plaintext
328 lines
8.6 KiB
Plaintext
# Rows per node is < 3000: codegen should be disabled.
|
|
select count(*) from functional.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=32.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=128MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=0B cardinality=7.30K
|
|
====
|
|
# Rows per node is > 3000: codegen should be enabled.
|
|
select count(*) from functional.alltypesagg
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=128.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=80MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
HDFS partitions=11/11 files=11 size=814.73KB
|
|
row-size=0B cardinality=11.00K
|
|
====
|
|
# No stats on functional_parquet: codegen should be disabled.
|
|
select count(*) from functional_parquet.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=16.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=10MB
|
|
WARNING: The following tables are missing relevant table and/or column statistics.
|
|
functional_parquet.alltypes
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: sum_init_zero(functional_parquet.alltypes.stats: num_rows)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
00:SCAN HDFS [functional_parquet.alltypes]
|
|
HDFS partitions=24/24 files=24 size=202.42KB
|
|
row-size=8B cardinality=12.88K
|
|
====
|
|
# > 3000 rows returned to coordinator: codegen should be enabled
|
|
select * from functional_parquet.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=88.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=129MB
|
|
WARNING: The following tables are missing relevant table and/or column statistics.
|
|
functional_parquet.alltypes
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
00:SCAN HDFS [functional_parquet.alltypes]
|
|
HDFS partitions=24/24 files=24 size=202.42KB
|
|
row-size=80B cardinality=12.88K
|
|
====
|
|
# Optimisation is enabled for join producing < 3000 rows
|
|
select count(*)
|
|
from functional.alltypes t1
|
|
join functional.alltypestiny t2 on t1.id = t2.id
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=2.98MB Threads=5
|
|
Per-Host Resource Estimates: Memory=163MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
06:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
05:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:HASH JOIN [INNER JOIN, BROADCAST]
|
|
| hash predicates: t1.id = t2.id
|
|
| runtime filters: RF000 <- t2.id
|
|
| row-size=8B cardinality=8
|
|
|
|
|
|--04:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 01:SCAN HDFS [functional.alltypestiny t2]
|
|
| HDFS partitions=4/4 files=4 size=460B
|
|
| row-size=4B cardinality=8
|
|
|
|
|
00:SCAN HDFS [functional.alltypes t1]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
runtime filters: RF000 -> t1.id
|
|
row-size=4B cardinality=7.30K
|
|
====
|
|
# Optimisation is disabled by cross join producing > 3000 rows
|
|
select count(*) from functional.alltypes t1, functional.alltypes t2
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=64.00KB Threads=5
|
|
Per-Host Resource Estimates: Memory=256MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
06:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
05:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
02:NESTED LOOP JOIN [CROSS JOIN, BROADCAST]
|
|
| row-size=0B cardinality=53.29M
|
|
|
|
|
|--04:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 01:SCAN HDFS [functional.alltypes t2]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=0B cardinality=7.30K
|
|
|
|
|
00:SCAN HDFS [functional.alltypes t1]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=0B cardinality=7.30K
|
|
====
|
|
# Optimisation is enabled for union producing < 3000 rows
|
|
select count(*) from (
|
|
select * from functional.alltypes
|
|
union all
|
|
select * from functional.alltypestiny) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=32.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=128MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
04:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
00:UNION
|
|
| pass-through-operands: all
|
|
| row-size=0B cardinality=7.31K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypestiny]
|
|
| HDFS partitions=4/4 files=4 size=460B
|
|
| row-size=0B cardinality=8
|
|
|
|
|
01:SCAN HDFS [functional.alltypes]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=0B cardinality=7.30K
|
|
====
|
|
# Optimisation is disabled by union producing > 3000 rows
|
|
select count(*) from (
|
|
select * from functional.alltypes
|
|
union all
|
|
select * from functional.alltypes) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=32.00KB Threads=3
|
|
Per-Host Resource Estimates: Memory=128MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
04:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
| row-size=8B cardinality=1
|
|
|
|
|
00:UNION
|
|
| pass-through-operands: all
|
|
| row-size=0B cardinality=14.60K
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypes]
|
|
| HDFS partitions=24/24 files=24 size=478.45KB
|
|
| row-size=0B cardinality=7.30K
|
|
|
|
|
01:SCAN HDFS [functional.alltypes]
|
|
HDFS partitions=24/24 files=24 size=478.45KB
|
|
row-size=0B cardinality=7.30K
|
|
====
|
|
# Scan with limit on large table: the number of rows scanned is bounded,
|
|
# codegen should be disabled
|
|
select sum(l_discount)
|
|
from (select * from tpch.lineitem limit 1000) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=8.00MB Threads=3
|
|
Per-Host Resource Estimates: Memory=264MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:AGGREGATE [FINALIZE]
|
|
| output: sum(tpch.lineitem.l_discount)
|
|
| row-size=16B cardinality=1
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
| limit: 1000
|
|
|
|
|
00:SCAN HDFS [tpch.lineitem]
|
|
HDFS partitions=1/1 files=1 size=718.94MB
|
|
limit: 1000
|
|
row-size=8B cardinality=1.00K
|
|
====
|
|
# Scan with limit and predicates on large table: any number of rows could be scanned:
|
|
# codegen should be enabled
|
|
select sum(l_discount)
|
|
from (select * from tpch.lineitem where l_orderkey > 100 limit 1000) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=8.00MB Threads=3
|
|
Per-Host Resource Estimates: Memory=264MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:AGGREGATE [FINALIZE]
|
|
| output: sum(tpch.lineitem.l_discount)
|
|
| row-size=16B cardinality=1
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
| limit: 1000
|
|
|
|
|
00:SCAN HDFS [tpch.lineitem]
|
|
HDFS partitions=1/1 files=1 size=718.94MB
|
|
predicates: l_orderkey > 100
|
|
limit: 1000
|
|
row-size=16B cardinality=1.00K
|
|
====
|
|
# Test query on large Kudu table with all Kudu primary key columns in equivalence
|
|
# predicates: not more than 1 row could be returned from Kudu,
|
|
# Codegen should be disabled.
|
|
select * from tpch_kudu.partsupp where ps_partkey=2 and ps_suppkey=5003
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B Threads=3
|
|
Per-Host Resource Estimates: Memory=10MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
00:SCAN KUDU [tpch_kudu.partsupp]
|
|
kudu predicates: ps_partkey = 2, ps_suppkey = 5003
|
|
row-size=172B cardinality=1
|
|
====
|
|
# Test query on large Kudu table with partial Kudu primary key columns in equivalence
|
|
# predicates: any number of rows could be returned from Kudu,
|
|
# Codegen should be enabled
|
|
select * from tpch_kudu.partsupp where ps_partkey=2
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B Threads=3
|
|
Per-Host Resource Estimates: Memory=10MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
00:SCAN KUDU [tpch_kudu.partsupp]
|
|
kudu predicates: ps_partkey = 2
|
|
row-size=172B cardinality=4
|
|
====
|
|
# Test that codegen is disabled for a query over a small number of large files when the
|
|
# partition key scan optimisation kicks in. This is desirable because most of the rows
|
|
# in the file will not be processed.
|
|
select distinct 'const' from tpch_parquet.lineitem;
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=4.06MB Threads=4
|
|
Per-Host Resource Estimates: Memory=21MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
04:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| group by: 'const'
|
|
| row-size=12B cardinality=1
|
|
|
|
|
02:EXCHANGE [HASH('const')]
|
|
|
|
|
01:AGGREGATE [STREAMING]
|
|
| group by: 'const'
|
|
| row-size=12B cardinality=1
|
|
|
|
|
00:SCAN HDFS [tpch_parquet.lineitem]
|
|
HDFS partitions=1/1 files=3 size=193.99MB
|
|
partition key scan
|
|
row-size=0B cardinality=3
|
|
====
|