Files
impala/testdata/workloads/functional-query/queries/QueryTest/parquet_stats.test
Lars Volker f7f8c4518a IMPALA-4988: Add query option read_parquet_statistics
This change adds a query option to disable reading Parquet statistics.
It provides a workaround when dealing with files that have corrupt
parquet statistics.

Note that Impala handles Parquet files affected by PARQUET-251 correctly
by ignoring statistics for anything but plain numeric types. This query
option is supposed to help with files affected by unknown or errors or
by errors that are yet to be made.

Change-Id: I427f7fde40d0f4b703751e40f3c2109a850643f7
Reviewed-on: http://gerrit.cloudera.org:8080/7001
Reviewed-by: Dan Hecht <dhecht@cloudera.com>
Tested-by: Impala Public Jenkins
2017-06-01 01:00:13 +00:00

461 lines
13 KiB
Plaintext

====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
set explain_level=2;
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: int_col < 0'
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where int_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col < 0
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test with inverted predicate
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where double_col > 99
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
set explain_level=2;
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
---- RESULTS: VERIFY_IS_SUBSET
' parquet statistics predicates: tinyint_col = 10'
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
---- RESULTS
51
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Mix with partitioning columns
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test that without expr rewrite and thus without constant folding, constant exprs still
# can be used to prune row groups.
set enable_expr_rewrites=0;
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Test name based column resolution
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
set parquet_fallback_schema_resolution=NAME;
# If this picks up the stats from int_col, then it will filter all row groups and return
# an incorrect result.
select count(*) from name_resolve where id > 10;
---- RESULTS
89
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Query that has an implicit cast to a larger integer type
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Explicit casts between numerical types can violate the transitivity of "min()", so they
# are not supported when evaluating parquet::Statistics.
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
---- RESULTS
3878
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
---- RESULTS
9
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# Test the conversion of constant IN lists to min/max predicats
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 24
====
---- QUERY
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
---- RESULTS
3
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 23
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col < "0"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col < "1"
---- RESULTS
12
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
---- RESULTS
8
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where string_col > ":"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 4
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumStatsFilteredRowGroups): 3
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
---- RESULTS
1
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
---- RESULTS
5
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 1
====
---- QUERY
# Test that stats are disabled for CHAR type columns.
create table chars (id int, c char(4)) stored as parquet;
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
select count(*) from chars;
---- RESULTS
2
====
---- QUERY
select count(*) from chars where c <= "aaaa"
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumStatsFilteredRowGroups): 0
====
---- QUERY
# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics
# query option.
set parquet_read_statistics=0;
select count(*) from functional_parquet.alltypes where id < 0;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumStatsFilteredRowGroups): 0
====