Files
impala/testdata/workloads/functional-query/queries/QueryTest/stats-extrapolation.test
Zoltan Borok-Nagy de7f09d726 IMPALA-7644: Hide Parquet page index writing with feature flag
This commit adds the command line flag enable_parquet_page_index_writing
to the Impala daemon that switches Impala's ability of writing the
Parquet page index. By default the flag is false, i.e. Impala doesn't
write the page index.

This flag is only temporary, we plan to remove it once Impala is able to
read the page index and has better testing around it.

Because of this change I had to move test_parquet_page_index.py to the
custom_cluster test suite since I need to set this command line flag
in order to test the functionality. I also merged most of the test cases
because we don't want to restart the cluster too many times.

I removed 'num_data_pages_' from BaseColumnWriter since it was rather
confusing and didn't provide any measurable performance improvement.

This commit fixes the ASAN error produced by the first IMPALA-7644
commit which was reverted later.

Change-Id: Ib4a9098a2085a385351477c715ae245d83bf1c72
Reviewed-on: http://gerrit.cloudera.org:8080/11694
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-10-17 19:57:17 +00:00

200 lines
7.6 KiB
Plaintext

====
---- QUERY
# This test relies on a deterministic row order so we use "sort by (id)".
create table alltypes sort by (id) like functional_parquet.alltypes;
alter table alltypes set tblproperties("impala.enable.stats.extrapolation"="true");
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2009;
====
---- QUERY
# No stats are available.
explain select id from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/12 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
---- TYPES
STRING
====
---- QUERY
compute stats alltypes
---- RESULTS
'Updated 1 partition(s) and 11 column(s).'
---- TYPES
STRING
====
---- QUERY
# Only the table-level row count is stored. The partition row counts are extrapolated.
show table stats alltypes
---- LABELS
YEAR, MONTH, #ROWS, EXTRAP #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS, LOCATION
---- RESULTS
'2009','1',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=1'
'2009','2',-1,288,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=2'
'2009','3',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=3'
'2009','4',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=4'
'2009','5',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=5'
'2009','6',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=6'
'2009','7',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=7'
'2009','8',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=8'
'2009','9',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=9'
'2009','10',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=10'
'2009','11',-1,302,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=11'
'2009','12',-1,308,1,regex:.*B,'NOT CACHED','NOT CACHED','PARQUET','false','$NAMENODE/test-warehouse/$DATABASE.db/alltypes/year=2009/month=12'
'Total','',3650,3650,12,regex:.*B,'0B','','','',''
---- TYPES
STRING,STRING,BIGINT,BIGINT,BIGINT,STRING,STRING,STRING,STRING,STRING,STRING
====
---- QUERY
# Stats are available now.
explain select id from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3650.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3650'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Select a subset of partitions.
explain select id from alltypes where month in (1, 2, 3);
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
'Codegen disabled by planner'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=3/12 files=3 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/3 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=904.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=904'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Double the data in existing partitions.
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2009;
explain select id from alltypes;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=.*
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/12 files=24 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=7300.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=7300'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Create new partitions and extrapolate their row count.
insert into alltypes partition(year, month)
select * from functional_parquet.alltypes where year = 2010;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=16MB'
'Codegen disabled by planner'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=3650 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651.*
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3651'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Compute stats and run the same query again.
compute stats alltypes;
explain select id from alltypes where year = 2010;
---- RESULTS: VERIFY_IS_EQUAL
row_regex:.*Max Per-Host Resource Reservation: Memory=.*
row_regex:.*Per-Host Resource Estimates: Memory=16MB'
'Codegen disabled by planner'
''
'F00:PLAN FRAGMENT [UNPARTITIONED] hosts=1 instances=1'
row_regex:.*Per-Host Resources: mem-estimate=.* mem-reservation=.*
'PLAN-ROOT SINK'
row_regex:.*mem-estimate=.* mem-reservation=.*
'|'
'00:SCAN HDFS [$DATABASE.alltypes]'
row_regex:.*partitions=12/24 files=12 size=.*
' stored statistics:'
row_regex:.*table: rows=10950 size=.*
' partitions: 0/12 rows=unavailable'
' columns: all'
row_regex:.* extrapolated-rows=3651
row_regex:.*mem-estimate=.* mem-reservation=.*
' tuple-ids=0 row-size=4B cardinality=3651'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====
---- QUERY
# Test that dropping stats resets everything.
drop stats alltypes;
explain select id from alltypes;
---- RESULTS: VERIFY_IS_SUBSET
' stored statistics:'
' table: rows=unavailable size=unavailable'
' partitions: 0/24 rows=unavailable'
' columns: unavailable'
row_regex:.* extrapolated-rows=unavailable.*
' tuple-ids=0 row-size=4B cardinality=unavailable'
' in pipelines: 00(GETNEXT)'
---- TYPES
STRING
====