mirror of
https://github.com/apache/impala.git
synced 2026-01-06 06:01:03 -05:00
This change adds a query option to disable reading Parquet statistics. It provides a workaround when dealing with files that have corrupt parquet statistics. Note that Impala handles Parquet files affected by PARQUET-251 correctly by ignoring statistics for anything but plain numeric types. This query option is supposed to help with files affected by unknown or errors or by errors that are yet to be made. Change-Id: I427f7fde40d0f4b703751e40f3c2109a850643f7 Reviewed-on: http://gerrit.cloudera.org:8080/7001 Reviewed-by: Dan Hecht <dhecht@cloudera.com> Tested-by: Impala Public Jenkins
461 lines
13 KiB
Plaintext
461 lines
13 KiB
Plaintext
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: int_col < 0'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test with inverted predicate
|
|
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col > 99
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: tinyint_col = 10'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
|
|
---- RESULTS
|
|
51
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Mix with partitioning columns
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test name based column resolution
|
|
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
|
|
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
|
|
set parquet_fallback_schema_resolution=NAME;
|
|
# If this picks up the stats from int_col, then it will filter all row groups and return
|
|
# an incorrect result.
|
|
select count(*) from name_resolve where id > 10;
|
|
---- RESULTS
|
|
89
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Query that has an implicit cast to a larger integer type
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Explicit casts between numerical types can violate the transitivity of "min()", so they
|
|
# are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
|
|
---- RESULTS
|
|
3878
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
|
|
---- RESULTS
|
|
9
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Test the conversion of constant IN lists to min/max predicats
|
|
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 24
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
|
|
---- RESULTS
|
|
3
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 23
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "0"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "1"
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col > ":"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
|
|
---- RESULTS
|
|
5
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Test that stats are disabled for CHAR type columns.
|
|
create table chars (id int, c char(4)) stored as parquet;
|
|
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
|
|
select count(*) from chars;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from chars where c <= "aaaa"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# IMPALA-4988: Test that stats support can be disabled using the parquet_read_statistics
|
|
# query option.
|
|
set parquet_read_statistics=0;
|
|
select count(*) from functional_parquet.alltypes where id < 0;
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|