Files
impala/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test
Lars Volker 9270346825 IMPALA-4815, IMPALA-4817, IMPALA-4819: Write and Read Parquet Statistics for remaining types
This change adds functionality to write and read parquet::Statistics for
Decimal, String, and Timestamp values. As an exception, we don't read
statistics for CHAR columns, since CHAR support is broken in Impala
(IMPALA-1652).

This change also switches from using the deprecated fields 'min' and
'max' to populate the new fields 'min_value' and 'max_value' in
parquet::Statistics, that were added in parquet-format pull request #46.

The HdfsParquetScanner will preferably read the new fields if they are
populated and if the column order 'TypeDefinedOrder' has been used to
compute the statistics. For columns without a column order set or with
only the deprecated fields populated, the scanner will read them only if
they are of simple numeric type, i.e. boolean, integer, or floating
point.

This change removes the validation of the Parquet Statistics we write to
Hive from the tests, since Hive does not write the new fields. Instead
it adds a parquet file written by Hive that uses the deprecated fields
for its statistics. It uses that file to exercise the fallback logic for
supported types in a test.

This change also cleans up the interface of ParquetPlainEncoder in
parquet-common.h.

Change-Id: I3ef4a5d25a57c82577fd498d6d1c4297ecf39312
Reviewed-on: http://gerrit.cloudera.org:8080/6563
Reviewed-by: Lars Volker <lv@cloudera.com>
Tested-by: Lars Volker <lv@cloudera.com>
2017-05-09 15:47:21 +00:00

450 lines
12 KiB
Plaintext

====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
set explain_level=2;
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: int_col < 0'
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test with inverted predicate
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col > 99
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
set explain_level=2;
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: tinyint_col = 10'
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
---- RESULTS
51
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Mix with partitioning columns
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test name based column resolution
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
set parquet_fallback_schema_resolution=NAME;
# If this picks up the stats from int_col, then it will filter all row groups and return
# an incorrect result.
select count(*) from name_resolve where id > 10;
---- RESULTS
89
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Query that has an implicit cast to a larger integer type
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Explicit casts between numerical types can violate the transitivity of "min()", so they
# are not supported when evaluating parquet::Statistics.
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
---- RESULTS
3878
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
---- RESULTS
9
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Test the conversion of constant IN lists to min/max predicats
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 24
====
---- QUERY
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 23
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col < "0"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col < "1"
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col > ":"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
---- RESULTS
5
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Test that stats are disabled for CHAR type columns.
create table chars (id int, c char(4)) stored as parquet;
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
select count(*) from chars;
---- RESULTS
2
====
---- QUERY
select count(*) from chars where c <= "aaaa"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====