mirror of
https://github.com/apache/impala.git
synced 2026-01-07 09:02:19 -05:00
IMPALA-5448: fix invalid number of splits reported in Parquet scan node
Parquet splits with multiple columns are marked as completed by using HdfsScanNodeBase::RangeComplete(). That call counts the file type once for every column codec type, so the reported number of Parquet splits is the real count multiplied by the number of materialized columns. Furthermore, the Parquet format allows different columns to use different compression codecs; this patch handles that case as well. A Parquet file using both the gzip and snappy compression codecs will be reported as: File Formats: PARQUET/(GZIP,SNAPPY):1 This patch introduces a set of compression types to cover the above cases. Testing: Added end-to-end tests covering Parquet files with all columns compressed in snappy, and Parquet files with multiple compression codecs. Change-Id: Iaacc2d775032f5707061e704f12e0a63cde695d1 Reviewed-on: http://gerrit.cloudera.org:8080/8147 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins
This commit is contained in:
committed by
Impala Public Jenkins
parent
adb92d3397
commit
192cd96d9e
20
testdata/workloads/functional-query/queries/QueryTest/hdfs_parquet_scan_node_profile.test
vendored
Normal file
20
testdata/workloads/functional-query/queries/QueryTest/hdfs_parquet_scan_node_profile.test
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
# Regression test for IMPALA-5448
|
||||
# This query will do a full scan on a parquet file
|
||||
select * from functional_parquet.alltypestiny where year=2009 and month=1
|
||||
---- RUNTIME_PROFILE
|
||||
row_regex: .*File Formats: PARQUET/SNAPPY:1
|
||||
====
|
||||
---- QUERY
|
||||
# This query will do a full scan on a parquet table with two partitions.
|
||||
# Each partition uses a different compression codec.
|
||||
select * from alltypes_multi_compression
|
||||
---- RUNTIME_PROFILE
|
||||
row_regex: .*File Formats: PARQUET/GZIP:1 PARQUET/SNAPPY:1
|
||||
====
|
||||
---- QUERY
|
||||
# This query will do a full scan on a parquet table with multiple
|
||||
# compression codecs mixed within each file
|
||||
select * from multi_compression
|
||||
---- RUNTIME_PROFILE
|
||||
row_regex: .*File Formats: PARQUET/\(GZIP,SNAPPY\):2
|
||||
====
|
||||
Reference in New Issue
Block a user