mirror of
https://github.com/apache/impala.git
synced 2026-01-30 06:00:18 -05:00
Cardinality is vital to understanding why a plan has the form it does, yet the planner normally emits cardinality information only for the detailed levels. Unfortunately, most query profiles we see are at the standard level without this information (except in the summary table), making it hard to understand what happened. This patch adds cardinality to the standard EXPLAIN output. It also changes the displayed cardinality value to be in abbreviated "metric" form: 1.23K instead of 1234, etc. Changing the EXPLAIN output has a huge impact on PlannerTest: all the "golden" test files must change. To avoid doing this twice, this patch also includes: IMPALA-7919: Add predicates line in plan output for partition key predicates. This is also a good time to include: IMPALA-8022: Add cardinality checks to PlannerTest. The comparison code was changed to allow a set of validators, one of which compares cardinality to ensure it is within 5% of the expected value. This should ensure we don't change estimates unintentionally. While many planner tests are concerned with cardinality, many others are not. Testing showed that the cardinality is actually unstable within tests. For such tests, this patch adds filters to ignore cardinality. The filter is enabled by default (for backward compatibility) but disabled (to allow cardinality verification) for the critical tests. Rebasing the tests was complicated by a bug in the error-matching code, so this patch also fixes: IMPALA-8023: Fix PlannerTest to handle error lines consistently. Now, the error output written to the output "save results" file matches that expected in the "golden" file -- no more handling these specially. Testing: * Added cardinality verification. * Reran all FE tests. * Rebased all PlannerTest .test files. * Adjusted the metadata/test_explain.py test to handle the changed EXPLAIN output.
Change-Id: Ie9aa2d715b04cbb279aaffec8c5692686562d986 Reviewed-on: http://gerrit.cloudera.org:8080/12136 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
211 lines
8.3 KiB
Plaintext
211 lines
8.3 KiB
Plaintext
====
|
|
---- QUERY
# Set up the table under test: a copy of functional_parquet.alltypes that is
# sorted by id (the test depends on a deterministic row order), has stats
# extrapolation switched on, and is loaded with the year=2009 rows.
create table alltypes
sort by (id)
like functional_parquet.alltypes;
alter table alltypes
set tblproperties("impala.enable.stats.extrapolation"="true");
insert into alltypes partition(year, month)
select *
from functional_parquet.alltypes
where year = 2009;
|
|
====
|
|
---- QUERY
# Nothing has been computed yet, so every statistic in the plan (table, partition
# and column stats, the extrapolated row count and the scan cardinality) is
# expected to be reported as unavailable.
explain select id
from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/12 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# First stats computation on the table. The expected summary line reports one
# partition and eleven columns updated.
compute stats alltypes
---- RESULTS
'Updated 1 partition(s) and 11 column(s).'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# The stored row count exists only at the table level: each partition row shows
# #ROWS = -1 and gets its value in the EXTRAP #ROWS column instead, while the
# Total row carries the stored count of 3650.
show table stats alltypes
---- LABELS
YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
---- RESULTS
'2009','1',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=1'
'2009','2',-1,288,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
'2009','3',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
'2009','4',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=4'
'2009','5',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
'2009','6',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=6'
'2009','7',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
'2009','8',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
'2009','9',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=9'
'2009','10',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
'2009','11',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=11'
'2009','12',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
'Total','',3650,3650,12,regex:.*B,'0B','','','',''
---- TYPES
STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
|
|
====
|
|
---- QUERY
# With stats computed, a full scan of all 12 partitions should report the stored
# table row count (3650) as the extrapolated row count and as the scan cardinality
# (printed in abbreviated form, 3.65K). Per-partition row counts stay unavailable.
explain select id
from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3650.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# Select a subset of partitions (months 1-3). The extrapolated row count and the
# scan cardinality are expected to cover only the 3 selected partitions (904 rows),
# while the stored table-level row count stays at 3650.
explain select id from alltypes where month in (1, 2, 3);
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'month IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: month IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
row_regex:.*partitions=3/12 files=3 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/3 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=904.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=904'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# Load the year=2009 rows a second time so each existing partition holds twice the
# data (24 files instead of 12). The stored row count is still 3650, but the
# extrapolated row count and the cardinality are expected to double to 7300 / 7.30K.
# Unlike the other full-plan checks in this file, no 'Codegen disabled by planner'
# line is expected here (presumably because of the larger estimate -- confirm).
insert into alltypes partition(year, month)
select *
from functional_parquet.alltypes
where year = 2009;
explain select id
from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=24 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=7300.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=7.30K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# Create new partitions (year=2010) and extrapolate their row count. The stored
# table row count is still the 3650 recorded by the earlier compute stats, so the
# estimate for the 12 new partitions comes purely from extrapolation.
# The memory estimate is matched loosely, as in the other sections of this file,
# because this test is about row-count extrapolation, not resource estimates.
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2010;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'year = CAST(2010 AS INT)'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: year = CAST(2010 AS INT)'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# Compute stats and run the same query again. The stored table row count is
# expected to be refreshed to 10950, while the extrapolated row count and the
# cardinality for the year=2010 partitions stay at 3651 / 3.65K.
# The memory estimate and the extrapolated-rows line are matched with the same
# loose patterns used by the other sections of this file.
compute stats alltypes;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
'year = CAST(2010 AS INT)'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
' partition predicates: year = CAST(2010 AS INT)'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=10950 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3.65K'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|
|
---- QUERY
# After dropping stats the plan should look like the no-stats case again: table,
# partition (now 0/24) and column stats, the extrapolated row count and the
# cardinality are all expected to be unavailable.
drop stats alltypes;
explain select id
from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/24 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
|
|
====
|