mirror of
https://github.com/apache/impala.git
synced 2026-02-01 03:00:22 -05:00
This work addresses the current limitation in computing the total row count for a Hive table in a scan. The row count can be incorrectly computed as 0, even though data exists in the Hive table. This is stats corruption at the table level. Similar stats corruption can exist for a partition. The row count of a table or a partition can sometimes also be -1, which indicates missing stats. In the fix, as long as no partition in a Hive table exhibits any missing or corrupt stats, the total row count for the table is computed from the row counts in all partitions. Otherwise, Impala looks at the table-level stats, particularly the table row count. In addition, if the table stats are missing or corrupted, Impala estimates a row count for the table, if feasible. This row count is the sum of the row counts from the partitions with good stats, and an estimation of the number of rows in the partitions with missing or corrupt stats. Such estimation also applies when some partition has corrupt stats. One way to observe the fix is through the explain output of queries scanning Hive tables with missing or corrupted stats. The cardinality for any full scan should be a positive value (i.e. the estimated row count), instead of 'unavailable'. At the beginning of the explain output, that table is still listed in the WARNING section for potentially corrupt table statistics. Testing: 1. Ran unit tests with queries documented in the case against Hive tables with the following configurations: a. No stats corruption in any partitions b. Stats corruption in some partitions c. Stats corruption in all partitions 2. Added two new tests in test_compute_stats.py: a. test_corrupted_stats_in_partitioned_Hive_tables b. test_corrupted_stats_in_unpartitioned_Hive_tables 3. Fixed failures in corrupt-stats.test 4.
Ran "core" test Change-Id: I9f4c64616ff7c0b6d5a48f2b5331325feeff3576 Reviewed-on: http://gerrit.cloudera.org:8080/16098 Reviewed-by: Sahil Takiar <stakiar@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
245 lines
9.4 KiB
Plaintext
245 lines
9.4 KiB
Plaintext
====
|
|
---- QUERY
|
|
# This test relies on a deterministic row order so we use "sort by (id)".
|
|
create table alltypes sort by (id) like functional_parquet.alltypes;
|
|
alter table alltypes set tblproperties("impala.enable.stats.extrapolation"="true");
|
|
insert into alltypes partition(year, month)
|
|
select * from functional_parquet.alltypes where year = 2009;
|
|
====
|
|
---- QUERY
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' stored statistics:'
|
|
' table: rows=unavailable size=unavailable'
|
|
' partitions: 0/12 rows=5.97K'
|
|
' columns: unavailable'
|
|
row_regex:.* extrapolated-rows=unavailable.*
|
|
' tuple-ids=0 row-size=4B cardinality=5.97K'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Disable the estimation of cardinality for an HDFS table without stats.
|
|
SET DISABLE_HDFS_NUM_ROWS_ESTIMATE=1;
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' stored statistics:'
|
|
' table: rows=unavailable size=unavailable'
|
|
' partitions: 0/12 rows=unavailable'
|
|
' columns: unavailable'
|
|
row_regex:.* extrapolated-rows=unavailable.*
|
|
' tuple-ids=0 row-size=4B cardinality=unavailable'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
compute stats alltypes
|
|
---- RESULTS
|
|
'Updated 1 partition(s) and 11 column(s).'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Only the table-level row count is stored. The partition row counts are extrapolated.
|
|
show table stats alltypes
|
|
---- LABELS
|
|
YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
|
|
---- RESULTS
|
|
'2009','1',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=1'
|
|
'2009','10',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
|
|
'2009','11',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=11'
|
|
'2009','12',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
|
|
'2009','2',-1,290,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
|
|
'2009','3',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
|
|
'2009','4',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=4'
|
|
'2009','5',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
|
|
'2009','6',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=6'
|
|
'2009','7',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
|
|
'2009','8',-1,307,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
|
|
'2009','9',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=9'
|
|
'Total','',3650,3650,12,regex:.*B,'0B','','','',''
|
|
---- TYPES
|
|
STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
|
|
====
|
|
---- QUERY
|
|
# Stats are available now.
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
|
|
row_regex:.*Per-Host Resource Estimates: Memory=.*
|
|
'Codegen disabled by planner'
|
|
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
|
|
''
|
|
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
|
|
'PLAN-ROOT SINK'
|
|
'| output exprs: id'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'|'
|
|
'00:SCAN $FILESYSTEM_NAME [$DATABASE.alltypes]'
|
|
row_regex:.*partitions=12/12 files=12 size=.*
|
|
' stored statistics:'
|
|
row_regex:.*table: rows=3.65K size=.*
|
|
' partitions: 0/12 rows=unavailable'
|
|
' columns: all'
|
|
row_regex:.* extrapolated-rows=3.65K .*
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
' tuple-ids=0 row-size=4B cardinality=3.65K'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Select a subset of partitions.
|
|
explain select id from alltypes where month in (1, 2, 3);
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
|
|
row_regex:.*Per-Host Resource Estimates: Memory=.*
|
|
'Codegen disabled by planner'
|
|
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
|
|
'`month` IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
|
|
''
|
|
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'PLAN-ROOT SINK'
|
|
'| output exprs: id'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'|'
|
|
'00:SCAN $FILESYSTEM_NAME [$DATABASE.alltypes]'
|
|
' partition predicates: `month` IN (CAST(1 AS INT), CAST(2 AS INT), CAST(3 AS INT))'
|
|
row_regex:.*partitions=3/12 files=3 size=.*
|
|
' stored statistics:'
|
|
row_regex:.*table: rows=3.65K size=.*
|
|
' partitions: 0/3 rows=unavailable'
|
|
' columns: all'
|
|
row_regex:.* extrapolated-rows=904.*
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
' tuple-ids=0 row-size=4B cardinality=904'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Double the data in existing partitions.
|
|
insert into alltypes partition(year, month)
|
|
select * from functional_parquet.alltypes where year = 2009;
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
|
|
row_regex:.*Per-Host Resource Estimates: Memory=.*
|
|
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes.*
|
|
''
|
|
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
|
|
'PLAN-ROOT SINK'
|
|
'| output exprs: id'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'|'
|
|
'00:SCAN $FILESYSTEM_NAME [$DATABASE.alltypes]'
|
|
row_regex:.*partitions=12/12 files=24 size=.*
|
|
' stored statistics:'
|
|
row_regex:.*table: rows=3.65K size=.*
|
|
' partitions: 0/12 rows=unavailable'
|
|
' columns: all'
|
|
row_regex:.* extrapolated-rows=7.30K .*
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
' tuple-ids=0 row-size=4B cardinality=7.30K'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Create new partitions and extrapolate their row count.
|
|
insert into alltypes partition(year, month)
|
|
select * from functional_parquet.alltypes where year = 2010;
|
|
explain select id from alltypes where year = 2010;
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
|
|
row_regex:.*Per-Host Resource Estimates: Memory=16MB.*
|
|
'Codegen disabled by planner'
|
|
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
|
|
'`year` = CAST(2010 AS INT)'
|
|
''
|
|
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
|
|
'PLAN-ROOT SINK'
|
|
'| output exprs: id'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'|'
|
|
'00:SCAN $FILESYSTEM_NAME [$DATABASE.alltypes]'
|
|
' partition predicates: `year` = CAST(2010 AS INT)'
|
|
row_regex:.*partitions=12/24 files=12 size=.*
|
|
' stored statistics:'
|
|
row_regex:.*table: rows=3.65K size=.*
|
|
' partitions: 0/12 rows=unavailable'
|
|
' columns: all'
|
|
row_regex:.* extrapolated-rows=3.65K .*
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
' tuple-ids=0 row-size=4B cardinality=3.65K'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Compute stats and run the same query again.
|
|
compute stats alltypes;
|
|
explain select id from alltypes where year = 2010;
|
|
---- RESULTS: VERIFY_IS_EQUAL
|
|
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
|
|
row_regex:.*Per-Host Resource Estimates: Memory=16MB.*
|
|
'Codegen disabled by planner'
|
|
row_regex:.*Analyzed query: SELECT id FROM test_stats_extrapolation_.*.alltypes WHERE.*
|
|
'`year` = CAST(2010 AS INT)'
|
|
''
|
|
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
|
|
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
|
|
'PLAN-ROOT SINK'
|
|
'| output exprs: id'
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
'|'
|
|
'00:SCAN $FILESYSTEM_NAME [$DATABASE.alltypes]'
|
|
' partition predicates: `year` = CAST(2010 AS INT)'
|
|
row_regex:.*partitions=12/24 files=12 size=.*
|
|
' stored statistics:'
|
|
row_regex:.*table: rows=10.95K size=.*
|
|
' partitions: 0/12 rows=unavailable'
|
|
' columns: all'
|
|
row_regex:.* extrapolated-rows=3.65K .*
|
|
row_regex:.*mem-estimate=.* mem-reservation=.*
|
|
' tuple-ids=0 row-size=4B cardinality=3.65K'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Test that dropping stats resets everything.
|
|
drop stats alltypes;
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' stored statistics:'
|
|
' table: rows=unavailable size=unavailable'
|
|
' partitions: 0/24 rows=17.91K'
|
|
' columns: unavailable'
|
|
row_regex:.* extrapolated-rows=unavailable.*
|
|
row_regex:.* tuple-ids=0 row-size=4B cardinality=17\.9.*K
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|
|
---- QUERY
|
|
# Test that dropping stats resets everything, this time with row count estimation disabled.
|
|
SET DISABLE_HDFS_NUM_ROWS_ESTIMATE=1;
|
|
drop stats alltypes;
|
|
explain select id from alltypes;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' stored statistics:'
|
|
' table: rows=unavailable size=unavailable'
|
|
' partitions: 0/24 rows=unavailable'
|
|
' columns: unavailable'
|
|
row_regex:.* extrapolated-rows=unavailable.*
|
|
' tuple-ids=0 row-size=4B cardinality=unavailable'
|
|
' in pipelines: 00(GETNEXT)'
|
|
---- TYPES
|
|
STRING
|
|
====
|