mirror of
https://github.com/apache/impala.git
synced 2026-01-03 15:00:52 -05:00
Impala supports reading Parquet files with multiple row groups but with possible performance degradation due to remote reads. This patch maximizes scan locality by allowing multiple impalads to scan the rowgroups in their local splits. Each impalad starts a new scan range for each split local to it if that split contains row group(s) that need to be scanned. Change-Id: Iaecc5fb8e89364780bc59dbfa9ae51d0d124d16e Reviewed-on: http://gerrit.cloudera.org:8080/908 Reviewed-by: Sailesh Mukil <sailesh@cloudera.com> Tested-by: Internal Jenkins
99 lines
2.1 KiB
Plaintext
99 lines
2.1 KiB
Plaintext
====
|
|
---- QUERY
|
|
# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
|
|
# IMPALA-720: data file with multiple row groups
|
|
SELECT *
FROM bad_parquet
WHERE field = "parquet"
|
|
---- TYPES
|
|
string
|
|
---- RESULTS
|
|
'parquet'
|
|
'parquet'
|
|
'parquet'
|
|
'parquet'
|
|
====
|
|
---- QUERY
|
|
# Number of distinct values in the `field` column of bad_parquet.
SELECT count(DISTINCT field)
FROM bad_parquet
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
1005
|
|
====
|
|
---- QUERY
|
|
# Parquet file with invalid metadata size in the file footer.
|
|
SELECT *
FROM bad_metadata_len
|
|
---- CATCH
|
|
Invalid metadata size in file footer
|
|
====
|
|
---- QUERY
|
|
# Parquet file with invalid column dict_page_offset.
|
|
SELECT *
FROM bad_dict_page_offset
|
|
---- CATCH
|
|
Column 0 has invalid column offsets (offset=10000, size=47, file_size=249)
|
|
====
|
|
---- QUERY
|
|
# Parquet file with invalid column total_compressed_size.
|
|
SELECT *
FROM bad_compressed_size
|
|
---- CATCH
|
|
Column 0 has invalid column offsets (offset=4, size=1000000, file_size=245)
|
|
====
|
|
---- QUERY
|
|
# Parquet file with required fields.
|
|
SELECT *
FROM kite_required_fields
|
|
---- TYPES
|
|
bigint,bigint,string,string,boolean,boolean,bigint,bigint,bigint,bigint
|
|
---- RESULTS
|
|
1,2,'foo','bar',true,false,1,2,3,4
|
|
1,NULL,'foo','NULL',true,NULL,NULL,NULL,3,4
|
|
100,NULL,'foooo','NULL',false,NULL,NULL,NULL,300,400
|
|
====
|
|
---- QUERY
|
|
# Parquet file with an invalid magic number; the expected error reports it as an invalid version number.
|
|
SELECT *
FROM bad_magic_number
|
|
---- CATCH
|
|
File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid version number: XXXX
|
|
====
|
|
---- QUERY
|
|
# count(*) query on a Parquet file with multiple blocks.
|
|
SELECT count(*)
FROM lineitem_multiblock
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
20000
|
|
====
|
|
---- QUERY
|
|
# Aggregate over multiple columns of a Parquet file with multiple blocks.
|
|
SELECT
  count(l_comment),
  min(l_partkey),
  max(l_linenumber)
FROM lineitem_multiblock;
|
|
---- TYPES
|
|
bigint, bigint, int
|
|
---- RESULTS
|
|
20000,2,7
|
|
====
|
|
---- QUERY
|
|
# Test a DISTINCT ... ORDER BY ... LIMIT query on a Parquet file with multiple blocks.
|
|
SELECT DISTINCT l_orderkey
FROM functional_parquet.lineitem_multiblock
WHERE l_orderkey < 5
  OR l_orderkey > 15000
ORDER BY l_orderkey
LIMIT 20;
|
|
---- TYPES
|
|
bigint
|
|
---- RESULTS
|
|
1
|
|
2
|
|
3
|
|
4
|
|
15008
|
|
15009
|
|
15010
|
|
15011
|
|
15012
|
|
15013
|
|
15014
|
|
15015
|
|
15040
|
|
15041
|
|
15042
|
|
15043
|
|
15044
|
|
15045
|
|
15046
|
|
15047
|
|
====
|