mirror of
https://github.com/apache/impala.git
synced 2026-01-31 09:00:19 -05:00
The minimum requirement for a spillable operator is ((min_buffers - 2) * default_buffer_size) + 2 * max_row_size. In the min reservation, we only reserve space for two large pages: one for reading, the other for writing. However, to make the non-streaming GroupingAggregator work correctly, we have to manage these extra reservations carefully, so that it won't run out of the min reservation when it actually needs to spill a large page, or when it actually needs to read a large page. To be specific, how the large write page reservation is managed depends on whether needs_serialize is true or false: - If the aggregator needs to serialize the intermediate results when spilling a partition, we have to save a large page worth of reservation for the serialize stream, in case it needs to write large rows. This space can be restored when all the partitions are spilled, since the serialize stream is not needed until we build/repartition a spilled partition and thus have pinned partitions again. If the large write page reservation is used, we save it back whenever possible after we spill or close a partition. - If the aggregator doesn't need the serialize stream at all, we can restore the large write page reservation whenever we fail to add a large row, before spilling any partitions. Reclaim it whenever possible after we spill or close a partition. A special case is when we are processing a large row that is the last row in building/repartitioning a spilled partition: the large write page reservation can be restored for it regardless of whether we need the serialize stream, because the partitions will be read out after this, so there is no need for spilling. The large read page reservation is transferred to the spilled BufferedTupleStream that we are reading in building/repartitioning a spilled partition. The stream will restore some of it when reading a large page, and reclaim it when the output row batch is reset.
Note that the stream is read in attach_on_read mode, the large page will be attached to the row batch's buffers and only get freed when the row batch is reset. Tests: - Add tests in test_spilling_large_rows (test_spilling.py) with different row sizes to reproduce the issue. - One test in test_spilling_no_debug_action becomes flaky after this patch. Revise the query to make the udf allocate larger strings so it can consistently pass. - Run CORE tests. Change-Id: I3d9c3a2e7f0da60071b920dec979729e86459775 Reviewed-on: http://gerrit.cloudera.org:8080/16240 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
128 lines
3.8 KiB
Plaintext
# This file contains tests where we don't want the python test framework to supply the
# debug_action value because the test won't succeed with all possible debug_action values.
====
---- QUERY
# Tests for the case where a spilled partition has 0 probe rows and so we don't build the
# hash table in a partitioned hash join. Always runs with the minimum reservation to force
# spilling.
# INNER JOIN
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
select straight_join count(*)
from
  lineitem a, lineitem b
where
  a.l_partkey = 1 and
  a.l_orderkey = b.l_orderkey;
---- TYPES
BIGINT
---- RESULTS
173
---- RUNTIME_PROFILE
row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, NULL AWARE LEFT ANTI JOIN
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
select straight_join count(*)
from
  lineitem a
where
  a.l_partkey not in (select l_partkey from lineitem where l_partkey > 10)
  and a.l_partkey < 1000;
---- TYPES
BIGINT
---- RESULTS
287
---- RUNTIME_PROFILE
row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, RIGHT OUTER JOIN
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
select straight_join count(*)
from
  supplier right outer join lineitem on s_suppkey = l_suppkey
where s_acctbal > 0 and s_acctbal < 10;
---- TYPES
BIGINT
---- RESULTS
12138
---- RUNTIME_PROFILE
row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, RIGHT OUTER JOIN
# Setting max_row_size == default_spillable_buffer_size was sufficient to trigger
# IMPALA-9349, because it means there is no surplus reservation during repartitioning.
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
set max_row_size=64k;
select straight_join count(*)
from
  supplier right outer join lineitem on s_suppkey = l_suppkey
where s_acctbal > 0 and s_acctbal < 10;
---- TYPES
BIGINT
---- RESULTS
12138
---- RUNTIME_PROFILE
row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# spilled partition with 0 probe rows, RIGHT ANTI JOIN
set debug_action="-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0";
with x as (select * from supplier limit 10)
select straight_join count(*)
from
  x right anti join lineitem on s_suppkey + 100 = l_suppkey;
---- TYPES
BIGINT
---- RESULTS
5995258
---- RUNTIME_PROFILE
row_regex: .*NumHashTableBuildsSkipped: .* \([1-9][0-9]*\)
====
---- QUERY
# Aggregation query that will OOM and fail to spill because of IMPALA-3304 without
# any help from DEBUG_ACTION.
set mem_limit=75m;
select l_orderkey, group_concat(repeat(l_comment, 10)) comments
from lineitem
group by l_orderkey
order by comments desc
limit 5
---- CATCH
Memory limit exceeded
====
---- QUERY
# Top-N query with large limit that will OOM because spilling is not implemented:
# IMPALA-3471. It does not need any help from DEBUG_ACTION.
set topn_bytes_limit=-1;
set mem_limit=100m;
select *
from lineitem
order by l_orderkey desc
limit 6000000
---- CATCH
Memory limit exceeded
====
---- QUERY
# Hash join that will fail to repartition and therefore fail from out-of-memory because
# of a large number of duplicate keys on the build side: IMPALA-4857. It does not need
# any help from DEBUG_ACTION.
set mem_limit=250m;
select straight_join *
from supplier join /* +broadcast */ lineitem on s_suppkey = l_linenumber
order by l_tax desc
limit 5
---- CATCH
row_regex:.*Cannot perform hash join at node with id .*. Repartitioning did not reduce the size of a spilled partition.*
====
---- QUERY
# Analytic query with certain kinds of large windows can't be spilled: IMPALA-5738. It
# does not need any help from DEBUG_ACTION.
set mem_limit=100m;
select avg(l_tax) over (order by l_orderkey rows between 100000000 preceding and 10000000 following)
from lineitem
---- CATCH
Memory limit exceeded
====