mirror of
https://github.com/apache/impala.git
synced 2026-01-10 18:00:14 -05:00
The metadata-only scan doesn't allocate I/O buffers, contrary to an assumption of the memory estimation code in the planner. This fix also sets a floor on the memory estimate, to avoid estimating 0 bytes. 1MB seems like a reasonable approximation: I ran metadata-only scans on a few different data sizes and saw numbers from 128kb to 1mb. The estimate is now much closer to actual consumption (it was 80MB before): [localhost:21000] > select count(*) from tpch_parquet.lineitem; summary; Query: select count(*) from tpch_parquet.lineitem Query submitted at: 2017-08-23 11:58:29 (Coordinator: http://tarmstrong-box:25000) Query progress can be monitored at: http://tarmstrong-box:25000/query_plan?query_id=cb4b8d41fc838c9a:c5496ff300000000 +----------+ | count(*) | +----------+ | 6001215 | +----------+ Fetched 1 row(s) in 0.13s +--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+ | Operator | #Hosts | Avg Time | Max Time | #Rows | Est. #Rows | Peak Mem | Est. Peak Mem | Detail | +--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+ | 03:AGGREGATE | 1 | 168.49us | 168.49us | 1 | 1 | 28.00 KB | 10.00 MB | FINALIZE | | 02:EXCHANGE | 1 | 30.11ms | 30.11ms | 3 | 1 | 0 B | 0 B | UNPARTITIONED | | 01:AGGREGATE | 3 | 2.05us | 6.14us | 3 | 1 | 20.00 KB | 10.00 MB | | | 00:SCAN HDFS | 3 | 4.58ms | 4.72ms | 3 | 6.00M | 128.00 KB | 1.00 MB | tpch_parquet.lineitem | +--------------+--------+----------+----------+-------+------------+-----------+---------------+-----------------------+ Testing: Updated affected planner tests. Change-Id: Iaf5c2316bef2afae54a94245c715534ed294f286 Reviewed-on: http://gerrit.cloudera.org:8080/7783 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins
233 lines
5.6 KiB
Plaintext
233 lines
5.6 KiB
Plaintext
# Rows per node is < 3000: codegen should be disabled.
|
|
select count(*) from functional.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=148.00MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
00:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 files=24 size=478.45KB
|
|
====
|
|
# Rows per node is > 3000: codegen should be enabled.
|
|
select count(*) from functional.alltypesagg
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=100.00MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
00:SCAN HDFS [functional.alltypesagg]
|
|
partitions=11/11 files=11 size=814.73KB
|
|
====
|
|
# No stats on functional_parquet: codegen should be enabled.
|
|
select count(*) from functional_parquet.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=21.00MB
|
|
WARNING: The following tables are missing relevant table and/or column statistics.
|
|
functional_parquet.alltypes
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
03:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
01:AGGREGATE
|
|
| output: sum_init_zero(functional_parquet.alltypes.parquet-stats: num_rows)
|
|
|
|
|
00:SCAN HDFS [functional_parquet.alltypes]
|
|
partitions=24/24 files=24 size=178.13KB
|
|
====
|
|
# > 3000 rows returned to coordinator: codegen should be enabled
|
|
select * from functional_parquet.alltypes
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=128.00MB
|
|
WARNING: The following tables are missing relevant table and/or column statistics.
|
|
functional_parquet.alltypes
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
00:SCAN HDFS [functional_parquet.alltypes]
|
|
partitions=24/24 files=24 size=178.13KB
|
|
====
|
|
# Optimisation (disabling codegen) is enabled for join producing < 3000 rows
|
|
select count(*)
|
|
from functional.alltypes t1
|
|
join functional.alltypestiny t2 on t1.id = t2.id
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=1.94MB
|
|
Per-Host Resource Estimates: Memory=181.94MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
06:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
05:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
02:HASH JOIN [INNER JOIN, BROADCAST]
|
|
| hash predicates: t1.id = t2.id
|
|
| runtime filters: RF000 <- t2.id
|
|
|
|
|
|--04:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 01:SCAN HDFS [functional.alltypestiny t2]
|
|
| partitions=4/4 files=4 size=460B
|
|
|
|
|
00:SCAN HDFS [functional.alltypes t1]
|
|
partitions=24/24 files=24 size=478.45KB
|
|
runtime filters: RF000 -> t1.id
|
|
====
|
|
# Optimisation (disabling codegen) is disabled by cross join producing > 3000 rows
|
|
select count(*) from functional.alltypes t1, functional.alltypes t2
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=276.00MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
06:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
05:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
02:NESTED LOOP JOIN [CROSS JOIN, BROADCAST]
|
|
|
|
|
|--04:EXCHANGE [BROADCAST]
|
|
| |
|
|
| 01:SCAN HDFS [functional.alltypes t2]
|
|
| partitions=24/24 files=24 size=478.45KB
|
|
|
|
|
00:SCAN HDFS [functional.alltypes t1]
|
|
partitions=24/24 files=24 size=478.45KB
|
|
====
|
|
# Optimisation (disabling codegen) is enabled for union producing < 3000 rows
|
|
select count(*) from (
|
|
select * from functional.alltypes
|
|
union all
|
|
select * from functional.alltypestiny) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=148.00MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
04:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
00:UNION
|
|
| pass-through-operands: all
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypestiny]
|
|
| partitions=4/4 files=4 size=460B
|
|
|
|
|
01:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 files=24 size=478.45KB
|
|
====
|
|
# Optimisation (disabling codegen) is disabled by union producing > 3000 rows
|
|
select count(*) from (
|
|
select * from functional.alltypes
|
|
union all
|
|
select * from functional.alltypes) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=148.00MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
05:AGGREGATE [FINALIZE]
|
|
| output: count:merge(*)
|
|
|
|
|
04:EXCHANGE [UNPARTITIONED]
|
|
|
|
|
03:AGGREGATE
|
|
| output: count(*)
|
|
|
|
|
00:UNION
|
|
| pass-through-operands: all
|
|
|
|
|
|--02:SCAN HDFS [functional.alltypes]
|
|
| partitions=24/24 files=24 size=478.45KB
|
|
|
|
|
01:SCAN HDFS [functional.alltypes]
|
|
partitions=24/24 files=24 size=478.45KB
|
|
====
|
|
# Scan with limit on large table: the number of rows scanned is bounded,
|
|
# codegen should be disabled
|
|
select sum(l_discount)
|
|
from (select * from tpch.lineitem limit 1000) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=274.00MB
|
|
Codegen disabled by planner
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:AGGREGATE [FINALIZE]
|
|
| output: sum(tpch.lineitem.l_discount)
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
| limit: 1000
|
|
|
|
|
00:SCAN HDFS [tpch.lineitem]
|
|
partitions=1/1 files=1 size=718.94MB
|
|
limit: 1000
|
|
====
|
|
# Scan with limit and predicates on large table: any number of rows could be scanned:
|
|
# codegen should be enabled
|
|
select sum(l_discount)
|
|
from (select * from tpch.lineitem where l_orderkey > 100 limit 1000) v
|
|
---- DISTRIBUTEDPLAN
|
|
Max Per-Host Resource Reservation: Memory=0B
|
|
Per-Host Resource Estimates: Memory=274.00MB
|
|
|
|
PLAN-ROOT SINK
|
|
|
|
|
01:AGGREGATE [FINALIZE]
|
|
| output: sum(tpch.lineitem.l_discount)
|
|
|
|
|
02:EXCHANGE [UNPARTITIONED]
|
|
| limit: 1000
|
|
|
|
|
00:SCAN HDFS [tpch.lineitem]
|
|
partitions=1/1 files=1 size=718.94MB
|
|
predicates: l_orderkey > 100
|
|
limit: 1000
|
|
====
|