IMPALA-13963: Crash when setting 'write.parquet.page-size-bytes' to a higher value

When setting the Iceberg table property 'write.parquet.page-size-bytes'
to a value higher than the default, inserting into the table crashes
Impala:

  create table lineitem_iceberg_comment
  stored as iceberg
  tblproperties("write.parquet.page-size-bytes"="1048576")
  as select l_comment from tpch_parquet.lineitem;

The Impala executors crash because of memory corruption caused by a
buffer overflow in HdfsParquetTableWriter::ColumnWriter::ProcessValue().
Before attempting to write the next value, ProcessValue() checks
whether the total byte size would exceed 'plain_page_size_', but the
buffer it writes into ('values_buffer_') only has length
'values_buffer_len_'. 'values_buffer_len_' is initialised in the
constructor to 'DEFAULT_DATA_PAGE_SIZE', irrespective of the value of
'plain_page_size_', even though it is intended to be at least as large
as 'plain_page_size_', as both the check in ProcessValue() and the
GrowPageSize() method assume. The error does not usually surface
because 'plain_page_size_' also defaults to 'DEFAULT_DATA_PAGE_SIZE';
a larger 'write.parquet.page-size-bytes' breaks the assumption.

'values_buffer_' is also used for DICTIONARY encoding, but that code
path takes care of growing the buffer as necessary.
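
For illustration, a minimal sketch of the mismatch, using simplified,
hypothetical names rather than the real Impala members:

  #include <cstdint>
  #include <cstring>

  constexpr int64_t DEFAULT_DATA_PAGE_SIZE = 64 * 1024;

  // Simplified stand-in for the PLAIN-encoding path of ProcessValue().
  struct ColumnWriterSketch {
    int64_t plain_page_size_;  // e.g. 1048576 from the table property
    // Bug: sized from the default, ignoring plain_page_size_.
    int64_t values_buffer_len_ = DEFAULT_DATA_PAGE_SIZE;
    uint8_t* values_buffer_ = nullptr;
    int64_t bytes_used_ = 0;

    bool ProcessValue(const uint8_t* val, int64_t len) {
      // The check only guards against exceeding the page size...
      if (bytes_used_ + len > plain_page_size_) return false;
      // ...so once plain_page_size_ > values_buffer_len_, this write
      // can run past the end of values_buffer_ (heap corruption).
      std::memcpy(values_buffer_ + bytes_used_, val, len);
      bytes_used_ += len;
      return true;
    }
  };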

This change fixes the problem by initialising 'values_buffer_len_' to
the value of 'plain_page_size_' in the constructor.

Fixing this exposes another bug: in BitWriter::PutValue(), when we
check whether the next element fits in the buffer, we multiply
'max_bytes_' by 8, which overflows because 'max_bytes_' is a 32-bit
int. The overflow is triggered by page-size values that our tests
already use.

This change therefore also widens 'max_bytes_' to int64_t, so
multiplying it by 8 (converting from bytes to bits) no longer
overflows.
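
As a minimal, self-contained illustration of the overflow (512 MiB is
one of the page sizes the tests already set):

  #include <cstdint>
  #include <iostream>

  int main() {
    // A page size that fits comfortably in a 32-bit int...
    int max_bytes = 536870912;  // 512 MiB
    // ...but max_bytes * 8 would be 2^32, which does not fit: signed
    // overflow, i.e. undefined behaviour. Widening to int64_t first
    // makes the bytes-to-bits conversion safe:
    int64_t max_bits = static_cast<int64_t>(max_bytes) * 8;
    std::cout << max_bits << '\n';  // prints 4294967296
  }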

Testing:
  - Added an EE test in iceberg-insert.test that reproduced the error.

Change-Id: Icb94df8ac3087476ddf1613a1285297f23a54c76
Reviewed-on: http://gerrit.cloudera.org:8080/22777
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Noemi Pap-Takacs <npaptakacs@cloudera.com>
Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com>

@@ -131,7 +131,7 @@ class HdfsParquetTableWriter::BaseColumnWriter {
     total_uncompressed_byte_size_(0),
     dict_encoder_base_(nullptr),
     def_levels_(nullptr),
-    values_buffer_len_(DEFAULT_DATA_PAGE_SIZE),
+    values_buffer_len_(plain_page_size_),
     page_stats_base_(nullptr),
     row_group_stats_base_(nullptr),
     table_sink_mem_tracker_(parent_->parent_->mem_tracker()),
@@ -539,6 +539,8 @@ class HdfsParquetTableWriter::ColumnWriter :
       DCHECK_GT(current_page_->header.uncompressed_page_size, 0);
       return false;
     }
+    DCHECK_LE(current_page_->header.uncompressed_page_size + *bytes_needed,
+        values_buffer_len_);
     uint8_t* dst_ptr = values_buffer_ + current_page_->header.uncompressed_page_size;
     int64_t written_len =
         ParquetPlainEncoder::Encode(*val, plain_encoded_value_size_, dst_ptr);

@@ -50,7 +50,7 @@ class BitWriter {
   /// fraction of a byte). Includes buffered values.
   int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }
   uint8_t* buffer() const { return buffer_; }
-  int buffer_len() const { return max_bytes_; }
+  int64_t buffer_len() const { return max_bytes_; }

   /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
   /// packed. Returns false if there was not enough space. num_bits must be <= 64.
@@ -99,7 +99,7 @@ class BitWriter {
  private:
   uint8_t* buffer_;
-  int max_bytes_;
+  int64_t max_bytes_;

   /// Bit-packed values are initially written to this variable before being memcpy'd to
   /// buffer_. This is faster than writing values byte by byte directly to buffer_.

@@ -234,7 +234,7 @@ NumModifiedRows: 7300
 alter table iceberg_alltypes_parq_tblprop set tblproperties (
 'write.parquet.row-group-size-bytes'='536870912',
 'write.parquet.compression-codec'='none',
-'write.parquet.page-size-bytes'='134217728',
+'write.parquet.page-size-bytes'='131072',
 'write.parquet.dict-size-bytes'='805306368');
 ====
 ---- QUERY
---- QUERY
@@ -250,7 +250,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties (
 'write.parquet.row-group-size-bytes'='1073741824',
 'write.parquet.compression-codec'='zstd',
 'write.parquet.compression-level'='1',
-'write.parquet.page-size-bytes'='402653184',
+'write.parquet.page-size-bytes'='196608',
 'write.parquet.dict-size-bytes'='536870912');
 ====
 ---- QUERY
@@ -266,7 +266,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties (
 'write.parquet.row-group-size-bytes'='1610612736',
 'write.parquet.compression-codec'='zstd',
 'write.parquet.compression-level'='13',
-'write.parquet.page-size-bytes'='536870912',
+'write.parquet.page-size-bytes'='262144',
 'write.parquet.dict-size-bytes'='402653184');
 ====
 ---- QUERY
@@ -282,7 +282,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties (
 'write.parquet.row-group-size-bytes'='1879048192',
 'write.parquet.compression-codec'='zstd',
 'write.parquet.compression-level'='18',
-'write.parquet.page-size-bytes'='805306368',
+'write.parquet.page-size-bytes'='327680',
 'write.parquet.dict-size-bytes'='134217728');
 ====
 ---- QUERY
@@ -298,7 +298,7 @@ alter table iceberg_alltypes_parq_tblprop set tblproperties (
 'write.parquet.row-group-size-bytes'='2146435072',
 'write.parquet.compression-codec'='zstd',
 'write.parquet.compression-level'='22',
-'write.parquet.page-size-bytes'='1073741824',
+'write.parquet.page-size-bytes'='1048576',
 'write.parquet.dict-size-bytes'='65536');
 ====
 ---- QUERY
@@ -332,3 +332,12 @@ select count(*) from iceberg_alltypes_parq_tblprop;
 ---- TYPES
 BIGINT
 ====
+---- QUERY
+set MT_DOP=1;
+create table lineitem_iceberg_comment
+stored as iceberg
+tblproperties("write.parquet.page-size-bytes"="1048576")
+as select l_comment from tpch_parquet.lineitem;
+---- RESULTS
+row_regex:.*6001215.*
+====