mirror of
https://github.com/apache/impala.git
synced 2026-01-01 09:00:42 -05:00
Before this change, ReservoirSample functions (such as APPX_MEDIAN()) allocated memory for 20,000 elements up front per grouping key. This caused inefficient memory usage for aggregations with many grouping keys. This patch fixes this by initially allocating memory for 16 elements. Once the buffer becomes full, we reallocate a new buffer with double capacity and copy the original buffer into the new one. We continue doubling the buffer size until the buffer has room for 20,000 elements as before. Testing: Added some EE APPX_MEDIAN() tests on larger datasets that exercise the resize code path. Perf Benchmark (about 35,000 elements per bucket): SELECT MAX(a) from ( SELECT c1, appx_median(c2) as a FROM benchmark GROUP BY c1) t BEFORE: 11s067ms Operator #Hosts Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail ------------------------------------------------------------------------------------------------------------------------- 06:AGGREGATE 1 124.726us 124.726us 1 1 28.00 KB -1.00 B FINALIZE 05:EXCHANGE 1 29.544us 29.544us 3 1 0 -1.00 B UNPARTITIONED 02:AGGREGATE 3 86.406us 120.372us 3 1 44.00 KB 10.00 MB 04:AGGREGATE 3 1s840ms 2s824ms 2.00K -1 1.02 GB 128.00 MB FINALIZE 03:EXCHANGE 3 1s163ms 1s989ms 6.00K -1 0 0 HASH(c1) 01:AGGREGATE 3 3s356ms 3s416ms 6.00K -1 1.95 GB 128.00 MB STREAMING 00:SCAN HDFS 3 64.962ms 65.490ms 65.54M -1 25.97 MB 64.00 MB tpcds_10_parquet.benchmark AFTER: 9s465ms Operator #Hosts Avg Time Max Time #Rows Est. #Rows Peak Mem Est. 
Peak Mem Detail ------------------------------------------------------------------------------------------------------------------------ 06:AGGREGATE 1 73.961us 73.961us 1 1 28.00 KB -1.00 B FINALIZE 05:EXCHANGE 1 18.101us 18.101us 3 1 0 -1.00 B UNPARTITIONED 02:AGGREGATE 3 75.795us 83.969us 3 1 44.00 KB 10.00 MB 04:AGGREGATE 3 1s608ms 2s683ms 2.00K -1 1.02 GB 128.00 MB FINALIZE 03:EXCHANGE 3 826.683ms 1s322ms 6.00K -1 0 0 HASH(c1) 01:AGGREGATE 3 2s457ms 2s672ms 6.00K -1 3.14 GB 128.00 MB STREAMING 00:SCAN HDFS 3 81.514ms 89.056ms 65.54M -1 25.94 MB 64.00 MB tpcds_10_parquet.benchmark Memory Benchmark (about 12 elements per bucket): SELECT MAX(a) FROM ( SELECT ss_customer_sk, APPX_MEDIAN(ss_sold_date_sk) as a FROM tpcds_parquet.store_sales GROUP BY ss_customer_sk) t BEFORE: 7s477ms Operator #Hosts Avg Time Max Time #Rows Est. #Rows Peak Mem Est. Peak Mem Detail --------------------------------------------------------------------------------------------------------------------- 06:AGGREGATE 1 114.686us 114.686us 1 1 28.00 KB -1.00 B FINALIZE 05:EXCHANGE 1 18.214us 18.214us 3 1 0 -1.00 B UNPARTITIONED 02:AGGREGATE 3 147.055us 165.464us 3 1 28.00 KB 10.00 MB 04:AGGREGATE 3 2s043ms 2s147ms 14.82K -1 4.94 GB 128.00 MB FINALIZE 03:EXCHANGE 3 840.528ms 943.254ms 15.61K -1 0 0 HASH(ss_customer_sk) 01:AGGREGATE 3 1s769ms 1s869ms 15.61K -1 5.32 GB 128.00 MB STREAMING 00:SCAN HDFS 3 17.941ms 37.109ms 183.59K -1 1.94 MB 16.00 MB tpcds_parquet.store_sales AFTER: 434ms Operator #Hosts Avg Time Max Time #Rows Est. #Rows Peak Mem Est. 
Peak Mem Detail --------------------------------------------------------------------------------------------------------------------- 06:AGGREGATE 1 125.915us 125.915us 1 1 28.00 KB -1.00 B FINALIZE 05:EXCHANGE 1 72.179us 72.179us 3 1 0 -1.00 B UNPARTITIONED 02:AGGREGATE 3 79.054us 83.385us 3 1 28.00 KB 10.00 MB 04:AGGREGATE 3 6.559ms 7.669ms 14.82K -1 17.32 MB 128.00 MB FINALIZE 03:EXCHANGE 3 67.370us 85.068us 15.60K -1 0 0 HASH(ss_customer_sk) 01:AGGREGATE 3 19.245ms 24.472ms 15.60K -1 9.48 MB 128.00 MB STREAMING 00:SCAN HDFS 3 53.173ms 55.844ms 183.59K -1 1.18 MB 16.00 MB tpcds_parquet.store_sales Change-Id: I99adaad574d4fb0a3cf38c6cbad8b2a23df12968 Reviewed-on: http://gerrit.cloudera.org:8080/6025 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: Impala Public Jenkins
98 lines
2.8 KiB
Plaintext
98 lines
2.8 KiB
Plaintext
====
|
|
---- QUERY
|
|
# TODO: IMPALA-3350: Add 'group by' to these tests to exercise different code paths.
|
|
select ndv(string_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 1024 bytes.
|
|
====
|
|
---- QUERY
|
|
select min(string_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 1 bytes.
|
|
====
|
|
---- QUERY
|
|
select max(string_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 1 bytes.
|
|
====
|
|
---- QUERY
|
|
select avg(d1) from functional.decimal_tbl
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 48 bytes.
|
|
====
|
|
---- QUERY
|
|
select avg(double_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 16 bytes.
|
|
====
|
|
---- QUERY
|
|
select avg(timestamp_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 16 bytes.
|
|
====
|
|
---- QUERY
|
|
select sample(timestamp_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 248 bytes.
|
|
====
|
|
---- QUERY
|
|
select distinctpc(int_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 256 bytes.
|
|
====
|
|
---- QUERY
|
|
select distinctpcsa(string_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 256 bytes.
|
|
====
|
|
---- QUERY
|
|
select group_concat(string_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 4 bytes.
|
|
====
|
|
---- QUERY
|
|
select rank() over (partition by month order by year) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 16 bytes.
|
|
====
|
|
---- QUERY
|
|
select extract(year from timestamp_col) from functional.alltypes limit 10
|
|
---- CATCH
|
|
FunctionContextImpl::AllocateLocal() failed to allocate 4 bytes.
|
|
====
|
|
---- QUERY
|
|
select trunc(timestamp_col, 'YEAR') from functional.alltypes limit 10
|
|
---- CATCH
|
|
FunctionContextImpl::AllocateLocal() failed to allocate 4 bytes.
|
|
====
|
|
---- QUERY
|
|
select first_value(string_col) over (partition by month order by year) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 1 bytes.
|
|
====
|
|
---- QUERY
|
|
select last_value(string_col) over (partition by month order by year) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 1 bytes.
|
|
====
|
|
---- QUERY
|
|
select rand() from functional.alltypes;
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 4 bytes.
|
|
====
|
|
---- QUERY
|
|
select case when min(int_col) = 0 then 0 end from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 16 bytes.
|
|
====
|
|
---- QUERY
|
|
select cast(string_col as char(120)) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContextImpl::AllocateLocal() failed to allocate 120 bytes.
|
|
====
|
|
---- QUERY
|
|
select appx_median(int_col) from functional.alltypes
|
|
---- CATCH
|
|
FunctionContext::Allocate() failed to allocate 248 bytes.
|
|
====
|