impala/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
liuyao 1a01bfe831 IMPALA-10377: Improve the accuracy of resource estimation
PlanNode does not consider some factors when estimating memory,
which leads to large estimation errors.

AggregationNode
1.MemoryEstimate = Ndv * (AvgRowSize + SizeOfBucket)
2.When estimating the Ndv of a merge aggregation, the Ndv should
  be divided only once.
3.If there are no grouping exprs, MemoryEstimate =
  MIN_PLAIN_AGG_MEM
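
For illustration, a minimal Java sketch of the AggregationNode
estimate above (SIZE_OF_BUCKET and MIN_PLAIN_AGG_MEM are assumed
values here, not the planner's actual constants):

  static long estimateAggMem(long ndv, double avgRowSize,
      boolean hasGroupingExprs) {
    final long SIZE_OF_BUCKET = 16;            // assumed per-bucket bytes
    final long MIN_PLAIN_AGG_MEM = 16 * 1024;  // assumed floor, no grouping
    // No grouping exprs: a single in-memory row, so use the fixed minimum.
    if (!hasGroupingExprs) return MIN_PLAIN_AGG_MEM;
    // Grouping aggregation: one hash-table entry per distinct group.
    return (long) Math.ceil(ndv * (avgRowSize + SIZE_OF_BUCKET));
  }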

SortNode
1.MemoryEstimate = Cardinality * AvgRowSize, i.e. the memory used
  when the whole input fits in memory (no spilling)
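
A minimal Java sketch of the SortNode estimate above, assuming the
whole input is sorted in memory:

  static long estimateSortMem(long cardinality, double avgRowSize) {
    // Memory needed to hold every input row at once (no spilling).
    return (long) Math.ceil(cardinality * avgRowSize);
  }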

HashJoinNode
1.MemoryEstimate = DataRows + Buckets + DuplicateNodes,
  DataRows = RightTableCardinality * AvgRowSize,
  Buckets = roundUpToPowerOf2(RightTableCardinality) *
            SizeOfBucket,
  DuplicateNodes = (RightTableCardinality - RightNdv) *
                    SizeOfDuplicateNode
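
A minimal Java sketch of the HashJoinNode (build-side) estimate
above; the per-bucket and per-duplicate-node sizes are assumptions
for illustration, not the planner's actual constants:

  static long estimateHashJoinMem(long rightCardinality, double avgRowSize,
      long rightNdv) {
    final long SIZE_OF_BUCKET = 16;          // assumed bytes per hash bucket
    final long SIZE_OF_DUPLICATE_NODE = 16;  // assumed bytes per duplicate
    long dataRows = (long) Math.ceil(rightCardinality * avgRowSize);
    long numBuckets = 1;                     // roundUpToPowerOf2(cardinality)
    while (numBuckets < rightCardinality) numBuckets <<= 1;
    long buckets = numBuckets * SIZE_OF_BUCKET;
    long duplicateNodes =
        Math.max(0, rightCardinality - rightNdv) * SIZE_OF_DUPLICATE_NODE;
    return dataRows + buckets + duplicateNodes;
  }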

KuduScanNode
1.MemoryEstimate = Columns * BytesPerColumn * MaxScannerThreads,
  where Columns counts only the columns scanned by the query, not
  all the columns of the table
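
A minimal Java sketch of the KuduScanNode estimate above;
BYTES_PER_COLUMN is an assumed per-column buffer size, not
necessarily the planner's constant:

  static long estimateKuduScanMem(int numScannedColumns,
      int maxScannerThreads) {
    final long BYTES_PER_COLUMN = 384 * 1024;  // assumption for illustration
    return numScannedColumns * BYTES_PER_COLUMN * maxScannerThreads;
  }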

UnitTest
1.CardinalityTest adds test cases covering memory estimation and
  updates the existing test cases that depend on memory estimates

Change-Id: Ic01db168ff2c6d6de33ee553a8175599f035d7a1
Reviewed-on: http://gerrit.cloudera.org:8080/16842
Reviewed-by: Zoltan Borok-Nagy <boroknagyz@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2021-03-12 14:23:04 +00:00


====
---- QUERY
# Explain a simple hash join query.
explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=[0-9.]*MB Threads=[0-9]*.*
row_regex:.*Per-Host Resource Estimates: Memory=[0-9.]*MB.*
'Analyzed query: SELECT * FROM tpch.lineitem INNER JOIN tpch.orders ON l_orderkey'
'= o_orderkey'
''
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1
'PLAN-ROOT SINK'
'| output exprs: tpch.lineitem.l_orderkey, tpch.lineitem.l_partkey, tpch.lineitem.l_suppkey, tpch.lineitem.l_linenumber, tpch.lineitem.l_quantity, tpch.lineitem.l_extendedprice, tpch.lineitem.l_discount, tpch.lineitem.l_tax, tpch.lineitem.l_returnflag, tpch.lineitem.l_linestatus, tpch.lineitem.l_shipdate, tpch.lineitem.l_commitdate, tpch.lineitem.l_receiptdate, tpch.lineitem.l_shipinstruct, tpch.lineitem.l_shipmode, tpch.lineitem.l_comment, tpch.orders.o_orderkey, tpch.orders.o_custkey, tpch.orders.o_orderstatus, tpch.orders.o_totalprice, tpch.orders.o_orderdate, tpch.orders.o_orderpriority, tpch.orders.o_clerk, tpch.orders.o_shippriority, tpch.orders.o_comment'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B spill-buffer=[0-9.]*MB thread-reservation=0
'|'
'04:EXCHANGE [UNPARTITIONED]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
'| tuple-ids=0,1 row-size=402B cardinality=5.76M'
'| in pipelines: 00(GETNEXT)'
'|'
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
'02:HASH JOIN [INNER JOIN, BROADCAST]'
'| hash predicates: l_orderkey = o_orderkey'
'| fk/pk conjuncts: l_orderkey = o_orderkey'
'| runtime filters: RF000[bloom] <- o_orderkey'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B spill-buffer=[0-9.]*MB thread-reservation=0.*
'| tuple-ids=0,1 row-size=402B cardinality=5.76M'
'| in pipelines: 00(GETNEXT), 01(OPEN)'
'|'
'|--03:EXCHANGE [BROADCAST]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
'| | tuple-ids=1 row-size=171B cardinality=1.50M'
'| | in pipelines: 01(GETNEXT)'
'| |'
'| F01:PLAN FRAGMENT [RANDOM] hosts=2 instances=2'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
'| 01:SCAN HDFS [tpch.orders, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
'| stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
'| columns: all'
row_regex:.*extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1.*
'| tuple-ids=1 row-size=171B cardinality=1.50M'
'| in pipelines: 01(GETNEXT)'
'|'
'00:SCAN HDFS [tpch.lineitem, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
' runtime filters: RF000[bloom] -> l_orderkey'
' stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
' columns: all'
row_regex:.*extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1.*
' tuple-ids=0 row-size=231B cardinality=6.00M'
' in pipelines: 00(GETNEXT)'
====
---- QUERY
# Tests the warning about missing table stats in the explain header.
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
from functional_avro.alltypes t1
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
---- RESULTS: VERIFY_IS_SUBSET
'Per-Host Resource Estimates: Memory=58MB'
'WARNING: The following tables are missing relevant table and/or column statistics.'
'functional_avro.alltypes, functional_parquet.alltypessmall'
====
---- QUERY
# Tests the warning about missing table stats in the explain header.
# Disable the estimation of cardinality for an HDFS table without stats.
set DISABLE_HDFS_NUM_ROWS_ESTIMATE=1;
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
from functional_avro.alltypes t1
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
---- RESULTS: VERIFY_IS_SUBSET
'Per-Host Resource Estimates: Memory=4.05GB'
'WARNING: The following tables are missing relevant table and/or column statistics.'
'functional_avro.alltypes, functional_parquet.alltypessmall'
====