mirror of
https://github.com/apache/impala.git
synced 2026-01-22 09:01:58 -05:00
IMPALA-7543 introduced sub-ranges in scan ranges. These are the smaller parts of a scan range that actually need to be read; the other parts of the scan range can be skipped. Currently sub-ranges are only used in the Parquet scanner during page filtering. With sub-ranges the scan range has a new field, 'bytes_to_read_', which is the sum of the lengths of the sub-ranges. If there are no sub-ranges, 'bytes_to_read_' equals the field 'len_', which is the length of the whole scan range. In some parts of Impala ScanRange::len() is being used instead of ScanRange::bytes_to_read(). It doesn't cause a bug because only the Parquet scanner uses sub-ranges, i.e. bytes_to_read() usually equals len(). The Parquet scanner also doesn't hit the bug because it tracks which pages it reads. However, leaving the invocations of len() in place of bytes_to_read() is a potential source of bugs in the future. Also, the scanners might allocate more memory than needed. In a couple of places we still need to invoke len(), e.g. when we test scan-range containment (for local splits), or when we test whether a scan range contains the mid-point of a Parquet row group. Testing: Added a scanner reservation test. Ran the exhaustive tests. Change-Id: Ie896db3f4b5f3e2272d81c2d360049af09c41d9c Reviewed-on: http://gerrit.cloudera.org:8080/14348 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
64 lines
2.0 KiB
Plaintext
64 lines
2.0 KiB
Plaintext
====
|
|
---- QUERY
|
|
# Scan moderately large file - scanner should try to increase reservation and succeed.
# The profile checks below expect the ideal and the actual initial-range reservation
# to match (both average 24.00 MB, one sample each).
|
|
select count(*)
|
|
from tpch.customer
|
|
---- TYPES
|
|
BIGINT
|
|
---- RESULTS
|
|
150000
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.*InitialRangeIdealReservation.*Avg: 24.00 MB.*Number of samples: 1.*
|
|
row_regex:.*InitialRangeActualReservation.*Avg: 24.00 MB.*Number of samples: 1.*
|
|
====
|
|
---- QUERY
|
|
# Scan moderately large file - scanner should try to increase reservation and fail.
# The debug action sets SET_DENY_RESERVATION_PROBABILITY to 1.0 (presumably every
# reservation increase is denied - TODO confirm). The query result is unchanged, but
# the expected actual reservation (8.00 MB) is below the ideal one (24.00 MB).
|
|
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
|
|
select count(*)
|
|
from tpch.customer
|
|
---- TYPES
|
|
BIGINT
|
|
---- RESULTS
|
|
150000
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.*InitialRangeIdealReservation.*Avg: 24.00 MB.*Number of samples: 1.*
|
|
row_regex:.*InitialRangeActualReservation.*Avg: 8.00 MB.*Number of samples: 1.*
|
|
====
|
|
---- QUERY
|
|
# Scan large Parquet column - scanner should try to increase reservation and succeed.
# The profile checks below expect the row group's actual reservation to reach its
# ideal value (both average 24.00 MB); the initial range only has a 128.00 KB ideal
# reservation.
|
|
select min(l_comment)
|
|
from tpch_parquet.lineitem
|
|
---- TYPES
|
|
STRING
|
|
---- RESULTS
|
|
' Tiresias '
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.*InitialRangeIdealReservation.*Avg: 128.00 KB.*
|
|
row_regex:.*InitialRangeActualReservation.*Avg: 4.00 MB.*
|
|
row_regex:.*ParquetRowGroupIdealReservation.*Avg: 24.00 MB.*
|
|
row_regex:.*ParquetRowGroupActualReservation.*Avg: 24.00 MB.*
|
|
====
|
|
---- QUERY
|
|
# Scan large Parquet column - scanner should try to increase reservation and fail.
# The debug action sets SET_DENY_RESERVATION_PROBABILITY to 1.0 (presumably every
# reservation increase is denied - TODO confirm). The query result is unchanged, but
# the expected row-group actual reservation (4.00 MB) is below the ideal (24.00 MB).
|
|
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
|
|
select min(l_comment)
|
|
from tpch_parquet.lineitem
|
|
---- TYPES
|
|
STRING
|
|
---- RESULTS
|
|
' Tiresias '
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.*InitialRangeIdealReservation.*Avg: 128.00 KB.*
|
|
row_regex:.*InitialRangeActualReservation.*Avg: 4.00 MB.*
|
|
row_regex:.*ParquetRowGroupIdealReservation.*Avg: 24.00 MB.*
|
|
row_regex:.*ParquetRowGroupActualReservation.*Avg: 4.00 MB.*
|
|
====
|
|
---- QUERY
|
|
# IMPALA-8742: Use ScanRange::bytes_to_read() instead of len(); the choice affects
|
|
# the calculated ideal reservation.
# NOTE(review): the selective l_orderkey predicate is presumably what lets the scanner
# skip pages and read only sub-ranges - confirm. Only the profile is checked here
# (expected row-group ideal reservation: 3.50 MB); no TYPES/RESULTS are asserted.
|
|
select * from tpch_parquet.lineitem
|
|
where l_orderkey < 10;
|
|
---- RUNTIME_PROFILE
|
|
row_regex:.*ParquetRowGroupIdealReservation.*Avg: 3.50 MB.*
|
|
====
|