Files
impala/testdata/workloads/functional-planner/queries/PlannerTest/disable-codegen.test
Taras Bobrovytsky 57d7c614bc IMPALA-5036: Parquet count star optimization
Instead of materializing empty rows when computing count star, we use
the data stored in the Parquet RowGroup.num_rows field. The Parquet
scanner tuple is modified to have one slot into which we will write the
num rows statistic. The aggregate function is changed from count to a
special sum function that gets initialized to 0. We also add a rewrite
rule so that count(<literal>) is rewritten to count(*) in order to make
sure that this optimization is applied in all cases.

Testing:
- Added functional and planner tests

Change-Id: I536b85c014821296aed68a0c68faadae96005e62
Reviewed-on: http://gerrit.cloudera.org:8080/6812
Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com>
Tested-by: Impala Public Jenkins
2017-07-06 01:26:44 +00:00

233 lines
5.6 KiB
Plaintext

# Rows per node is < 3000: codegen should be disabled.
select count(*) from functional.alltypes
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=138.00MB
Codegen disabled by planner
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
02:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
| output: count(*)
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# Rows per node is > 3000: codegen should be enabled.
select count(*) from functional.alltypesagg
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=90.00MB
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
02:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
| output: count(*)
|
00:SCAN HDFS [functional.alltypesagg]
partitions=11/11 files=11 size=814.73KB
====
# No stats on functional_parquet: codegen should be enabled.
select count(*) from functional_parquet.alltypes
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=26.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypes
PLAN-ROOT SINK
|
03:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
02:EXCHANGE [UNPARTITIONED]
|
01:AGGREGATE
| output: sum_init_zero(functional_parquet.alltypes.parquet-stats: num_rows)
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=179.68KB
====
# > 3000 rows returned to coordinator: codegen should be enabled
select * from functional_parquet.alltypes
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=128.00MB
WARNING: The following tables are missing relevant table and/or column statistics.
functional_parquet.alltypes
PLAN-ROOT SINK
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional_parquet.alltypes]
partitions=24/24 files=24 size=179.68KB
====
# Optimisation is enabled for join producing < 3000 rows: codegen should be disabled.
select count(*)
from functional.alltypes t1
join functional.alltypestiny t2 on t1.id = t2.id
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=136.00MB
Per-Host Resource Estimates: Memory=138.00MB
Codegen disabled by planner
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
05:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
| output: count(*)
|
02:HASH JOIN [INNER JOIN, BROADCAST]
| hash predicates: t1.id = t2.id
| runtime filters: RF000 <- t2.id
|
|--04:EXCHANGE [BROADCAST]
| |
| 01:SCAN HDFS [functional.alltypestiny t2]
| partitions=4/4 files=4 size=460B
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
runtime filters: RF000 -> t1.id
====
# Optimisation is disabled by cross join producing > 3000 rows: codegen should be enabled.
select count(*) from functional.alltypes t1, functional.alltypes t2
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=138.00MB
PLAN-ROOT SINK
|
06:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
05:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
| output: count(*)
|
02:NESTED LOOP JOIN [CROSS JOIN, BROADCAST]
|
|--04:EXCHANGE [BROADCAST]
| |
| 01:SCAN HDFS [functional.alltypes t2]
| partitions=24/24 files=24 size=478.45KB
|
00:SCAN HDFS [functional.alltypes t1]
partitions=24/24 files=24 size=478.45KB
====
# Optimisation is enabled for union producing < 3000 rows: codegen should be disabled.
select count(*) from (
select * from functional.alltypes
union all
select * from functional.alltypestiny) v
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=170.00MB
Codegen disabled by planner
PLAN-ROOT SINK
|
05:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
| output: count(*)
|
00:UNION
| pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypestiny]
| partitions=4/4 files=4 size=460B
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# Optimisation is disabled by union producing > 3000 rows: codegen should be enabled.
select count(*) from (
select * from functional.alltypes
union all
select * from functional.alltypes) v
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=266.00MB
PLAN-ROOT SINK
|
05:AGGREGATE [FINALIZE]
| output: count:merge(*)
|
04:EXCHANGE [UNPARTITIONED]
|
03:AGGREGATE
| output: count(*)
|
00:UNION
| pass-through-operands: all
|
|--02:SCAN HDFS [functional.alltypes]
| partitions=24/24 files=24 size=478.45KB
|
01:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
====
# Scan with limit on large table: the number of rows scanned is bounded,
# codegen should be disabled
select sum(l_discount)
from (select * from tpch.lineitem limit 1000) v
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=274.00MB
Codegen disabled by planner
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum(tpch.lineitem.l_discount)
|
02:EXCHANGE [UNPARTITIONED]
| limit: 1000
|
00:SCAN HDFS [tpch.lineitem]
partitions=1/1 files=1 size=718.94MB
limit: 1000
====
# Scan with limit and predicates on large table: any number of rows could be scanned,
# codegen should be enabled
select sum(l_discount)
from (select * from tpch.lineitem where l_orderkey > 100 limit 1000) v
---- DISTRIBUTEDPLAN
Per-Host Resource Reservation: Memory=0B
Per-Host Resource Estimates: Memory=274.00MB
PLAN-ROOT SINK
|
01:AGGREGATE [FINALIZE]
| output: sum(tpch.lineitem.l_discount)
|
02:EXCHANGE [UNPARTITIONED]
| limit: 1000
|
00:SCAN HDFS [tpch.lineitem]
partitions=1/1 files=1 size=718.94MB
predicates: l_orderkey > 100
limit: 1000
====