Files
impala/testdata/workloads/functional-query/queries/QueryTest/alter-table-set-column-stats.test
Bharath Vissapragada 04d027df13 IMPALA-7659: Populate NULL count while computing column stats
It was disabled for performance reasons (IMPALA-1003) and this patch
re-enables it since a lot of codegen improvements have happened since
then.

This patch switches the aggregation to use the CASE conditional instead
of IF since the former has proper codegen support (IMPALA-7655).

Tests:
=====

- Updated the affected tests to include the null counts.
- Added unit tests that verify IS [NOT] NULL predicates' cardinality
  estimation.

Perf note:
=========

I reran the compute stats child query with null counts included on the
store_sales table from 1000 SF (1TB) tpcds dataset. The table had 22
non-partitioned columns (on which null counts were computed) and ~2.8B
rows. This experiment showed around 7-8% perf drop compared to the same
child query without null counts for these columns.

Change-Id: Ic68f8b4c3756eb1980ce299a602a7d56db1e507a
Reviewed-on: http://gerrit.cloudera.org:8080/11565
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-12-08 00:21:55 +00:00

152 lines
5.4 KiB
Plaintext

====
---- QUERY
create external table alltypes_clone like functional_parquet.alltypes
location '$FILESYSTEM_PREFIX/test-warehouse/alltypes_parquet';
alter table alltypes_clone recover partitions;
====
---- QUERY
# Set various column stats.
alter table alltypes_clone set column stats double_col ('numDVs'='2');
alter table alltypes_clone set column stats timestamp_col ('numNulls'='9');
alter table alltypes_clone set column stats int_col ('numDVs'='100','numNulls'='20');
alter table alltypes_clone set column stats string_col ('maxSize'='555','avgSize'='60');
====
---- QUERY
show column stats alltypes_clone
---- LABELS
COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
---- RESULTS
'id','INT',-1,-1,4,4
'bool_col','BOOLEAN',-1,-1,1,1
'tinyint_col','TINYINT',-1,-1,1,1
'smallint_col','SMALLINT',-1,-1,2,2
'int_col','INT',100,20,4,4
'bigint_col','BIGINT',-1,-1,8,8
'float_col','FLOAT',-1,-1,4,4
'double_col','DOUBLE',2,-1,8,8
'date_string_col','STRING',-1,-1,-1,-1
'string_col','STRING',-1,-1,555,60
'timestamp_col','TIMESTAMP',-1,9,16,16
'year','INT',2,0,4,4
'month','INT',12,0,4,4
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE
====
---- QUERY
# Make sure compute stats still works.
compute stats alltypes_clone
---- RESULTS
'Updated 24 partition(s) and 11 column(s).'
---- TYPES
STRING
====
---- QUERY
# Reset the column stats to an unknown state by setting the values to -1
alter table alltypes_clone set column stats double_col ('numDVs'='-1');
alter table alltypes_clone set column stats timestamp_col ('numNulls'='-1');
alter table alltypes_clone set column stats int_col ('numDVs'='-1','numNulls'='-1');
alter table alltypes_clone set column stats string_col ('maxSize'='-1','avgSize'='-1');
====
---- QUERY
show column stats alltypes_clone
---- LABELS
COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
---- RESULTS
'id','INT',7300,0,4,4
'bool_col','BOOLEAN',2,0,1,1
'tinyint_col','TINYINT',10,0,1,1
'smallint_col','SMALLINT',10,0,2,2
'int_col','INT',-1,-1,4,4
'bigint_col','BIGINT',10,0,8,8
'float_col','FLOAT',10,0,4,4
'double_col','DOUBLE',-1,0,8,8
'date_string_col','STRING',736,0,8,8
'string_col','STRING',10,0,-1,-1
'timestamp_col','TIMESTAMP',7300,-1,16,16
'year','INT',2,0,4,4
'month','INT',12,0,4,4
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE
====
---- QUERY
# Also alter a few 'numRows' parameters to make sure manually setting all stats works.
alter table alltypes_clone partition(year=2009,month=2) set tblproperties('numRows'='280');
alter table alltypes_clone set tblproperties('numRows'='7300');
====
---- QUERY
# Check that we can query the table.
select id, int_col, double_col, string_col, timestamp_col from alltypes_clone
where year = 2009 and month between 2 and 3 and int_col = 9 and id between 300 and 400
---- RESULTS
319,9,90.89999999999999,'9',2009-02-01 00:09:00.360000000
329,9,90.89999999999999,'9',2009-02-02 00:19:00.810000000
339,9,90.89999999999999,'9',2009-02-03 00:29:01.260000000
349,9,90.89999999999999,'9',2009-02-04 00:39:01.710000000
359,9,90.89999999999999,'9',2009-02-05 00:49:02.160000000
369,9,90.89999999999999,'9',2009-02-06 00:59:02.610000000
379,9,90.89999999999999,'9',2009-02-07 01:09:03.600000000
389,9,90.89999999999999,'9',2009-02-08 01:19:03.510000000
399,9,90.89999999999999,'9',2009-02-09 01:29:03.960000000
---- TYPES
INT, INT, DOUBLE, STRING, TIMESTAMP
====
---- QUERY
# Similar test on an HBase table.
create external table alltypes_hbase_clone like functional_hbase.alltypes
====
---- QUERY
alter table alltypes_hbase_clone set column stats double_col ('numDVs'='2');
alter table alltypes_hbase_clone set column stats timestamp_col ('numNulls'='9');
alter table alltypes_hbase_clone set column stats int_col ('numDVs'='100','numNulls'='20');
alter table alltypes_hbase_clone set column stats string_col ('maxSize'='555','avgSize'='60');
====
---- QUERY
show column stats alltypes_hbase_clone
---- LABELS
COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
---- RESULTS
'id','INT',-1,-1,4,4
'bool_col','BOOLEAN',-1,-1,1,1
'tinyint_col','TINYINT',-1,-1,1,1
'smallint_col','SMALLINT',-1,-1,2,2
'int_col','INT',100,20,4,4
'bigint_col','BIGINT',-1,-1,8,8
'float_col','FLOAT',-1,-1,4,4
'double_col','DOUBLE',2,-1,8,8
'date_string_col','STRING',-1,-1,-1,-1
'string_col','STRING',-1,-1,555,60
'timestamp_col','TIMESTAMP',-1,9,16,16
'year','INT',-1,-1,4,4
'month','INT',-1,-1,4,4
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE
====
---- QUERY
# Reset the column stats to an unknown state by setting the values to -1
alter table alltypes_hbase_clone set column stats double_col ('numDVs'='-1');
alter table alltypes_hbase_clone set column stats timestamp_col ('numNulls'='-1');
alter table alltypes_hbase_clone set column stats int_col ('numDVs'='-1','numNulls'='-1');
alter table alltypes_hbase_clone set column stats string_col ('maxSize'='-1','avgSize'='-1');
====
---- QUERY
show column stats alltypes_hbase_clone
---- LABELS
COLUMN, TYPE, #DISTINCT VALUES, #NULLS, MAX SIZE, AVG SIZE
---- RESULTS
'id','INT',-1,-1,4,4
'bool_col','BOOLEAN',-1,-1,1,1
'tinyint_col','TINYINT',-1,-1,1,1
'smallint_col','SMALLINT',-1,-1,2,2
'int_col','INT',-1,-1,4,4
'bigint_col','BIGINT',-1,-1,8,8
'float_col','FLOAT',-1,-1,4,4
'double_col','DOUBLE',-1,-1,8,8
'date_string_col','STRING',-1,-1,-1,-1
'string_col','STRING',-1,-1,-1,-1
'timestamp_col','TIMESTAMP',-1,-1,16,16
'year','INT',-1,-1,4,4
'month','INT',-1,-1,4,4
---- TYPES
STRING, STRING, BIGINT, BIGINT, BIGINT, DOUBLE
====