mirror of
https://github.com/apache/impala.git
synced 2026-01-07 18:02:33 -05:00
This change adds functionality to write and read parquet::Statistics for Decimal, String, and Timestamp values. As an exception, we don't read statistics for CHAR columns, since CHAR support is broken in Impala (IMPALA-1652). This change also switches from using the deprecated fields 'min' and 'max' to populate the new fields 'min_value' and 'max_value' in parquet::Statistics, that were added in parquet-format pull request #46. The HdfsParquetScanner will preferably read the new fields if they are populated and if the column order 'TypeDefinedOrder' has been used to compute the statistics. For columns without a column order set or with only the deprecated fields populated, the scanner will read them only if they are of simple numeric type, i.e. boolean, integer, or floating point. This change removes the validation of the Parquet Statistics we write to Hive from the tests, since Hive does not write the new fields. Instead it adds a parquet file written by Hive that uses the deprecated fields for its statistics. It uses that file to exercise the fallback logic for supported types in a test. This change also cleans up the interface of ParquetPlainEncoder in parquet-common.h. Change-Id: I3ef4a5d25a57c82577fd498d6d1c4297ecf39312 Reviewed-on: http://gerrit.cloudera.org:8080/6563 Reviewed-by: Lars Volker <lv@cloudera.com> Tested-by: Lars Volker <lv@cloudera.com>
450 lines
12 KiB
Plaintext
450 lines
12 KiB
Plaintext
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select id, bool_col from functional_parquet.alltypessmall where int_col < 0;
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: int_col < 0'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col < 0
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test with inverted predicate
|
|
select id, bool_col from functional_parquet.alltypessmall where -1 > int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where smallint_col > 9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col > 9
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where bigint_col > 90
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where float_col > 9.9
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where double_col > 99
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col <= 0
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col >= 9
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = -1
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
set explain_level=2;
|
|
explain select count(*) from functional_parquet.alltypessmall where tinyint_col = 10
|
|
---- RESULTS: VERIFY_IS_SUBSET
|
|
' parquet statistics predicates: tinyint_col = 10'
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where id >= 30 and id <= 80
|
|
---- RESULTS
|
|
51
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Mix with partitioning columns
|
|
select count(*) from functional_parquet.alltypessmall where int_col < 0 and year < 2012
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where int_col < 3 - 3
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test that without expr rewrite and thus without constant folding, constant exprs still
|
|
# can be used to prune row groups.
|
|
set enable_expr_rewrites=0;
|
|
select id, bool_col from functional_parquet.alltypessmall where 5 + 5 < int_col
|
|
---- RESULTS
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Test name based column resolution
|
|
create table name_resolve stored as parquet as select * from functional_parquet.alltypessmall;
|
|
alter table name_resolve replace columns (int_col int, bool_col boolean, tinyint_col tinyint, smallint_col smallint, id int);
|
|
set parquet_fallback_schema_resolution=NAME;
|
|
# If this picks up the stats from int_col, then it will filter all row groups and return
|
|
# an incorrect result.
|
|
select count(*) from name_resolve where id > 10;
|
|
---- RESULTS
|
|
89
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Query that has an implicit cast to a larger integer type
|
|
select count(*) from functional_parquet.alltypessmall where tinyint_col > 1000000000000
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
# Predicates with explicit casts are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypessmall where '0' > cast(tinyint_col as string)
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Explicit casts between numerical types can violate the transitivity of "min()", so they
|
|
# are not supported when evaluating parquet::Statistics.
|
|
select count(*) from functional_parquet.alltypes where cast(id as tinyint) < 10;
|
|
---- RESULTS
|
|
3878
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.complextypestbl.int_array where pos < 5;
|
|
---- RESULTS
|
|
9
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 2
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
# Test the conversion of constant IN lists to min/max predicats
|
|
select count(*) from functional_parquet.alltypes where int_col in (-1,-2,-3,-4);
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 24
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypes where id IN (1,25,49);
|
|
---- RESULTS
|
|
3
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 24
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 23
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "0"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col <= "/"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col < "1"
|
|
---- RESULTS
|
|
12
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col >= "9"
|
|
---- RESULTS
|
|
8
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where string_col > ":"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col < "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col <= "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-01-01 00:00:00"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col > "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 4
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col >= "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.alltypessmall where timestamp_col = "2009-04-03 00:24:00.96"
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 4
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 3
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d1 < 1234
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 < 1.23456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456788
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d3 = 1.23456789
|
|
---- RESULTS
|
|
1
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 > 0.123456789
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345678
|
|
---- RESULTS
|
|
5
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|
|
---- QUERY
|
|
select count(*) from functional_parquet.decimal_tbl where d4 >= 0.12345679
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 1
|
|
====
|
|
---- QUERY
|
|
# Test that stats are disabled for CHAR type columns.
|
|
create table chars (id int, c char(4)) stored as parquet;
|
|
insert into chars values (1, cast("abaa" as char(4))), (2, cast("abab" as char(4)));
|
|
select count(*) from chars;
|
|
---- RESULTS
|
|
2
|
|
====
|
|
---- QUERY
|
|
select count(*) from chars where c <= "aaaa"
|
|
---- RESULTS
|
|
0
|
|
---- RUNTIME_PROFILE
|
|
aggregation(SUM, NumRowGroups): 1
|
|
aggregation(SUM, NumStatsFilteredRowGroups): 0
|
|
====
|