Files
impala/testdata/workloads/functional-query/queries/QueryTest/explain-level2.test
Fang-Yu Rao b3b00da1a1 IMPALA-7608: Estimate row count from file size when no stats available
Added a feature that estimates the number of rows in an HDFS table when
cardinality statistics for that table are not available.

Also added an additional query option to revert the change in case of regression.

Testing:
(1) In CardinalityTest.java, replaced the original statement
"verifyCardinality("SELECT a FROM functional.tinytable", -1);" in
the method testBasicsWithoutStats() with
"verifyCardinality("SELECT a FROM functional.tinytable", 2);".
(2) In CardinalityTest.java, added more tests to check the cardinality
of most PlanNode implementations. For each tested PlanNode, the behavior is
tested both with the feature enabled and with it disabled.
(3) In set.test, modified three related test cases to make sure that
the added query option is included after executing "set all" in various
scenarios.
(4) There are 8 JUnit tests in PlannerTest.java that would produce different
distributed query plans when this feature is enabled. Added an additional
JUnit test for 6 of those 8 affected JUnit tests when this feature is
enabled. Specifically, each tested query in the newly added test files involves
at least one HDFS table without available statistics.
We do not add test cases for 2 of the affected JUnit tests when this feature
is enabled since it results in flaky tests. These two JUnit tests are
testResourceRequirements() and testSpillableBufferSizing(). In this patch
we only test them when the feature is disabled.
(5) There are 5 Python end to end tests that consist of queries that would
produce different results. Added an additional query for each affected query
when this feature is disabled.

Change-Id: Ic414121c8df0d5222e4aeea096b5365beb04568a
Reviewed-on: http://gerrit.cloudera.org:8080/12974
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2019-06-21 03:28:43 +00:00

87 lines
4.0 KiB
Plaintext

====
---- QUERY
# Explain a simple hash join query.
# The expected output that follows pins the plan shape: a broadcast inner hash join of
# tpch.lineitem with tpch.orders on l_orderkey = o_orderkey, with bloom runtime filter
# RF000 built from o_orderkey and applied to the lineitem scan. Memory estimates and
# reservations are matched loosely via row_regex; row sizes and cardinalities are
# matched as exact strings.
# NOTE(review): presumably run with EXPLAIN_LEVEL=2 set by the test harness (per the
# file name) -- confirm.
explain
select *
from tpch.lineitem join tpch.orders on l_orderkey = o_orderkey;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=[0-9.]*MB Threads=[0-9]*.*
row_regex:.*Per-Host Resource Estimates: Memory=[0-9.]*MB.*
'Analyzed query: SELECT * FROM tpch.lineitem INNER JOIN tpch.orders ON l_orderkey'
'= o_orderkey'
''
'F02:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
'|'
'04:EXCHANGE [UNPARTITIONED]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
'| tuple-ids=0,1 row-size=402B cardinality=5.76M'
'| in pipelines: 00(GETNEXT)'
'|'
'F00:PLAN FRAGMENT [RANDOM] hosts=3 instances=3'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
'02:HASH JOIN [INNER JOIN, BROADCAST]'
'| hash predicates: l_orderkey = o_orderkey'
'| fk/pk conjuncts: l_orderkey = o_orderkey'
'| runtime filters: RF000[bloom] <- o_orderkey'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B spill-buffer=[0-9.]*MB thread-reservation=0.*
'| tuple-ids=0,1 row-size=402B cardinality=5.76M'
'| in pipelines: 00(GETNEXT), 01(OPEN)'
'|'
'|--03:EXCHANGE [BROADCAST]'
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=0
'| | tuple-ids=1 row-size=171B cardinality=1.50M'
'| | in pipelines: 01(GETNEXT)'
'| |'
'| F01:PLAN FRAGMENT [RANDOM] hosts=2 instances=2'
row_regex:.*Per-Host Resources: mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=.*
'| 01:SCAN HDFS [tpch.orders, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
'| stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
'| columns: all'
row_regex:.*extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1.*
'| tuple-ids=1 row-size=171B cardinality=1.50M'
'| in pipelines: 01(GETNEXT)'
'|'
'00:SCAN HDFS [tpch.lineitem, RANDOM]'
row_regex:.*partitions=1/1 files=1 size=.*
' runtime filters: RF000[bloom] -> l_orderkey'
' stored statistics:'
row_regex:.*table: rows=[0-9.]*[A-Z]* size=.*
' columns: all'
row_regex:.*extrapolated-rows=disabled max-scan-range-rows=[0-9]*.*
row_regex:.*mem-estimate=[0-9.]*[A-Z]*B mem-reservation=[0-9.]*[A-Z]*B thread-reservation=1.*
' tuple-ids=0 row-size=231B cardinality=6.00M'
' in pipelines: 00(GETNEXT)'
====
---- QUERY
# Tests the warning about missing table stats in the explain header.
# The results are checked with VERIFY_IS_SUBSET, so only the per-host memory estimate
# line and the two warning lines need to appear in the explain output.
# NOTE(review): the expected 74MB estimate presumably relies on row counts being
# estimated from file sizes (DISABLE_HDFS_NUM_ROWS_ESTIMATE left at its default) --
# confirm.
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
from functional_avro.alltypes t1
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
---- RESULTS: VERIFY_IS_SUBSET
'Per-Host Resource Estimates: Memory=74MB'
'WARNING: The following tables are missing relevant table and/or column statistics.'
'functional_avro.alltypes, functional_parquet.alltypessmall'
====
---- QUERY
# Tests the warning about missing table stats in the explain header.
# Disable the estimation of cardinality for an HDFS table without stats.
# Same query as the preceding test case; with the estimation disabled the expected
# per-host memory estimate is 4.07GB instead of 74MB, while the expected warning
# lines are identical.
set DISABLE_HDFS_NUM_ROWS_ESTIMATE=1;
explain select count(t1.int_col), avg(t2.float_col), sum(t3.bigint_col)
from functional_avro.alltypes t1
inner join functional_parquet.alltypessmall t2 on (t1.id = t2.id)
left outer join functional_avro.alltypes t3 on (t2.id = t3.id)
where t1.month = 1 and t2.year = 2009 and t3.bool_col = false
---- RESULTS: VERIFY_IS_SUBSET
'Per-Host Resource Estimates: Memory=4.07GB'
'WARNING: The following tables are missing relevant table and/or column statistics.'
'functional_avro.alltypes, functional_parquet.alltypessmall'
====