IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types

This patch allows reading columns with integer logical type as decimals.
This can occur when we're trying to read files that were written as INT but
the column was altered to a suitable DECIMAL. In this case the precision
is based on the physical type and equals 10 and 19, for int32 and int64
respectively.

Test:
* add new e2e tests

Change-Id: I56006eb3cca28c81ec8467d77b35005fbf669680
Reviewed-on: http://gerrit.cloudera.org:8080/22922
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Daniel Vanko
2025-06-25 11:42:51 +02:00
committed by Zoltan Borok-Nagy
parent 8a691a3507
commit ee69ed1d03
3 changed files with 123 additions and 3 deletions

View File

@@ -70,6 +70,21 @@ class ParquetDataConverter {
}
int32_t GetPrecision() const {
// If logical type is INTEGER, the precision is determined by the physical type.
if (parquet_element_->__isset.logicalType
&& UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) {
switch (parquet_element_->type) {
case parquet::Type::INT32:
return ColumnType::MAX_DECIMAL4_PRECISION + 1;
case parquet::Type::INT64:
return ColumnType::MAX_DECIMAL8_PRECISION + 1;
default:
DCHECK(false) << "Unexpected physical type for INTEGER logical type: "
<< to_string(parquet_element_->type);
break;
}
}
if (parquet_element_->__isset.logicalType
&& parquet_element_->logicalType.__isset.DECIMAL) {
return parquet_element_->logicalType.DECIMAL.precision;
@@ -77,6 +92,7 @@ class ParquetDataConverter {
return parquet_element_->precision;
}
/// Returns true if we need to do a conversion from the Parquet type to the slot type.
bool CheckIfNeedsConversion() {
if (!MATERIALIZED) return false;
@@ -87,6 +103,11 @@ class ParquetDataConverter {
return true;
}
if (col_type_->type == TYPE_DECIMAL) {
// If the logical type is INTEGER for a Decimal slot, conversion is needed.
if (parquet_element_->__isset.logicalType
&& UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) {
return true;
}
if (col_type_->precision != GetPrecision()) {
// Decimal values can be stored by Decimal4Value (4 bytes), Decimal8Value, and
// Decimal16Value. We only need to do a conversion for different precision if

View File

@@ -89,6 +89,11 @@ bool IsSupportedType(PrimitiveType impala_type,
return encodings->second.find(parquet_type) != encodings->second.end();
}
/// Reports whether 'element' is annotated with the Parquet INTEGER logical type.
/// Returns false when the element has no logical type set at all.
bool IsIntLogicalType(const parquet::SchemaElement& element) {
  // Only look at the INTEGER flag if a logical type is actually set.
  if (!element.__isset.logicalType) return false;
  return element.logicalType.__isset.INTEGER;
}
/// Returns true if encoding 'e' is supported by Impala, false otherwise.
static bool IsEncodingSupported(parquet::Encoding::type e) {
switch (e) {
@@ -208,7 +213,21 @@ int32_t GetScale(const parquet::SchemaElement& schema_element) {
}
// Precision is required, so this should only be called after checking IsPrecisionSet(),
// unless the logical type is INTEGER, in which case the precision is derived from the
// physical type.
int32_t GetPrecision(const parquet::SchemaElement& schema_element) {
if (UNLIKELY(IsIntLogicalType(schema_element))) {
switch (schema_element.type) {
case parquet::Type::INT32:
return ColumnType::MAX_DECIMAL4_PRECISION + 1;
case parquet::Type::INT64:
return ColumnType::MAX_DECIMAL8_PRECISION + 1;
default:
DCHECK(false) << "Unexpected physical type for INTEGER logical type: "
<< to_string(schema_element.type);
break;
}
}
DCHECK(IsPrecisionSet(schema_element));
if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
return schema_element.logicalType.DECIMAL.precision;
@@ -370,7 +389,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
// We require that the precision be a positive value, and not larger than the
// precision in table schema.
if (!IsPrecisionSet(schema_element)) {
if (!IsPrecisionSet(schema_element) && !IsIntLogicalType(schema_element)) {
ErrorMsg msg(TErrorCode::PARQUET_MISSING_PRECISION, filename, schema_element.name);
return Status(msg);
} else {
@@ -388,7 +407,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
}
}
if (!is_converted_type_decimal) {
if (!is_converted_type_decimal && !IsIntLogicalType(schema_element)) {
// TODO: is this validation useful? It is not required at all to read the data and
// might only serve to reject otherwise perfectly readable files.
ErrorMsg msg(TErrorCode::PARQUET_BAD_CONVERTED_TYPE, filename,

View File

@@ -6,4 +6,84 @@ select * from primitive_type_widening;
10,20,30,40,50,60,70,80,90,1230.4560546875
---- TYPES
SMALLINT,INT,BIGINT,DOUBLE,INT,BIGINT,DOUBLE,INT,DOUBLE,DOUBLE
====
====
---- QUERY
# IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types
create table read_int_as_decimal (c1 int, c2 bigint) stored as parquet;
insert into read_int_as_decimal
values
(1, cast(10000000000 as bigint)),
(MIN_INT(), MIN_BIGINT()),
(MAX_INT(), MAX_BIGINT());
select * from read_int_as_decimal order by c1;
---- RESULTS
-2147483648,-9223372036854775808
1,10000000000
2147483647,9223372036854775807
---- TYPES
INT, BIGINT
====
---- QUERY
alter table read_int_as_decimal change c1 c1 decimal(10,0);
select c1 from read_int_as_decimal order by c1;
---- RESULTS
-2147483648
1
2147483647
---- TYPES
DECIMAL
====
---- QUERY
alter table read_int_as_decimal change c1 c1 decimal(8,0);
select c1 from read_int_as_decimal order by c1;
---- CATCH
column 'c1' has a precision that does not match the table metadata precision. File metadata precision: 10, table metadata precision: 8.
====
---- QUERY
alter table read_int_as_decimal change c1 c1 decimal(20,4);
select c1 from read_int_as_decimal order by c1;
---- RESULTS
-2147483648.0000
1.0000
2147483647.0000
---- TYPES
DECIMAL
====
---- QUERY
alter table read_int_as_decimal change c1 c1 decimal(12,4);
select c1 from read_int_as_decimal order by c1;
---- RESULTS
1.0000
NULL
NULL
====
---- QUERY
alter table read_int_as_decimal change c2 c2 decimal(19,0);
select c2 from read_int_as_decimal order by c2;
---- RESULTS
-9223372036854775808
10000000000
9223372036854775807
====
---- QUERY
alter table read_int_as_decimal change c2 c2 decimal(2,0);
select c2 from read_int_as_decimal order by c2;
---- CATCH
column 'c2' has a precision that does not match the table metadata precision. File metadata precision: 19, table metadata precision: 2.
====
---- QUERY
alter table read_int_as_decimal change c2 c2 decimal(25,5);
select c2 from read_int_as_decimal order by c2;
---- RESULTS
-9223372036854775808.00000
10000000000.00000
9223372036854775807.00000
====
---- QUERY
alter table read_int_as_decimal change c2 c2 decimal(20,5);
select c2 from read_int_as_decimal order by c2;
---- RESULTS
10000000000.00000
NULL
NULL
====