mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types
This patch allows reading columns with integer logical type as decimals. This can occur when we're trying to read files that were written as INT but the column was altered to a suitable DECIMAL. In this case the precision is based on the physical type and equals 10 and 19 for int32 and int64 respectively (ColumnType::MAX_DECIMAL4_PRECISION + 1 and ColumnType::MAX_DECIMAL8_PRECISION + 1), i.e. enough decimal digits to hold any value of that integer type. Test: * add new e2e tests Change-Id: I56006eb3cca28c81ec8467d77b35005fbf669680 Reviewed-on: http://gerrit.cloudera.org:8080/22922 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Zoltan Borok-Nagy
parent
8a691a3507
commit
ee69ed1d03
@@ -70,6 +70,21 @@ class ParquetDataConverter {
|
||||
}
|
||||
|
||||
int32_t GetPrecision() const {
|
||||
// If logical type is INTEGER, the precision is determined by the physical type.
|
||||
if (parquet_element_->__isset.logicalType
|
||||
&& UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) {
|
||||
switch (parquet_element_->type) {
|
||||
case parquet::Type::INT32:
|
||||
return ColumnType::MAX_DECIMAL4_PRECISION + 1;
|
||||
case parquet::Type::INT64:
|
||||
return ColumnType::MAX_DECIMAL8_PRECISION + 1;
|
||||
default:
|
||||
DCHECK(false) << "Unexpected physical type for INTEGER logical type: "
|
||||
<< to_string(parquet_element_->type);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (parquet_element_->__isset.logicalType
|
||||
&& parquet_element_->logicalType.__isset.DECIMAL) {
|
||||
return parquet_element_->logicalType.DECIMAL.precision;
|
||||
@@ -77,6 +92,7 @@ class ParquetDataConverter {
|
||||
|
||||
return parquet_element_->precision;
|
||||
}
|
||||
|
||||
/// Returns true if we need to do a conversion from the Parquet type to the slot type.
|
||||
bool CheckIfNeedsConversion() {
|
||||
if (!MATERIALIZED) return false;
|
||||
@@ -87,6 +103,11 @@ class ParquetDataConverter {
|
||||
return true;
|
||||
}
|
||||
if (col_type_->type == TYPE_DECIMAL) {
|
||||
// If the logical type is INTEGER for a Decimal slot, conversion is needed.
|
||||
if (parquet_element_->__isset.logicalType
|
||||
&& UNLIKELY(parquet_element_->logicalType.__isset.INTEGER)) {
|
||||
return true;
|
||||
}
|
||||
if (col_type_->precision != GetPrecision()) {
|
||||
// Decimal values can be stored by Decimal4Value (4 bytes), Decimal8Value, and
|
||||
// Decimal16Value. We only need to do a conversion for different precision if
|
||||
|
||||
@@ -89,6 +89,11 @@ bool IsSupportedType(PrimitiveType impala_type,
|
||||
return encodings->second.find(parquet_type) != encodings->second.end();
|
||||
}
|
||||
|
||||
/// Returns true if Parquet's logical type is INTEGER.
|
||||
bool IsIntLogicalType(const parquet::SchemaElement& element) {
|
||||
return element.__isset.logicalType && element.logicalType.__isset.INTEGER;
|
||||
}
|
||||
|
||||
/// Returns true if encoding 'e' is supported by Impala, false otherwise.
|
||||
static bool IsEncodingSupported(parquet::Encoding::type e) {
|
||||
switch (e) {
|
||||
@@ -208,7 +213,21 @@ int32_t GetScale(const parquet::SchemaElement& schema_element) {
|
||||
}
|
||||
|
||||
// Precision is required, this should be called after checking IsPrecisionSet()
|
||||
// unless logical type is INTEGER, in which case the precision is based on physical type.
|
||||
int32_t GetPrecision(const parquet::SchemaElement& schema_element) {
|
||||
if (UNLIKELY(IsIntLogicalType(schema_element))) {
|
||||
switch (schema_element.type) {
|
||||
case parquet::Type::INT32:
|
||||
return ColumnType::MAX_DECIMAL4_PRECISION + 1;
|
||||
case parquet::Type::INT64:
|
||||
return ColumnType::MAX_DECIMAL8_PRECISION + 1;
|
||||
default:
|
||||
DCHECK(false) << "Unexpected physical type for INTEGER logical type: "
|
||||
<< to_string(schema_element.type);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
DCHECK(IsPrecisionSet(schema_element));
|
||||
if (schema_element.__isset.logicalType && schema_element.logicalType.__isset.DECIMAL) {
|
||||
return schema_element.logicalType.DECIMAL.precision;
|
||||
@@ -370,7 +389,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
|
||||
|
||||
// We require that the precision be a positive value, and not larger than the
|
||||
// precision in table schema.
|
||||
if (!IsPrecisionSet(schema_element)) {
|
||||
if (!IsPrecisionSet(schema_element) && !IsIntLogicalType(schema_element)) {
|
||||
ErrorMsg msg(TErrorCode::PARQUET_MISSING_PRECISION, filename, schema_element.name);
|
||||
return Status(msg);
|
||||
} else {
|
||||
@@ -388,7 +407,7 @@ Status ParquetMetadataUtils::ValidateColumn(const char* filename,
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_converted_type_decimal) {
|
||||
if (!is_converted_type_decimal && !IsIntLogicalType(schema_element)) {
|
||||
// TODO: is this validation useful? It is not required at all to read the data and
|
||||
// might only serve to reject otherwise perfectly readable files.
|
||||
ErrorMsg msg(TErrorCode::PARQUET_BAD_CONVERTED_TYPE, filename,
|
||||
|
||||
@@ -6,4 +6,84 @@ select * from primitive_type_widening;
|
||||
10,20,30,40,50,60,70,80,90,1230.4560546875
|
||||
---- TYPES
|
||||
SMALLINT,INT,BIGINT,DOUBLE,INT,BIGINT,DOUBLE,INT,DOUBLE,DOUBLE
|
||||
====
|
||||
====
|
||||
---- QUERY
|
||||
# IMPALA-13625: Allow reading Parquet int32/int64 as decimal without logical types
|
||||
create table read_int_as_decimal (c1 int, c2 bigint) stored as parquet;
|
||||
insert into read_int_as_decimal
|
||||
values
|
||||
(1, cast(10000000000 as bigint)),
|
||||
(MIN_INT(), MIN_BIGINT()),
|
||||
(MAX_INT(), MAX_BIGINT());
|
||||
select * from read_int_as_decimal order by c1;
|
||||
---- RESULTS
|
||||
-2147483648,-9223372036854775808
|
||||
1,10000000000
|
||||
2147483647,9223372036854775807
|
||||
---- TYPES
|
||||
INT, BIGINT
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c1 c1 decimal(10,0);
|
||||
select c1 from read_int_as_decimal order by c1;
|
||||
---- RESULTS
|
||||
-2147483648
|
||||
1
|
||||
2147483647
|
||||
---- TYPES
|
||||
DECIMAL
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c1 c1 decimal(8,0);
|
||||
select c1 from read_int_as_decimal order by c1;
|
||||
---- CATCH
|
||||
column 'c1' has a precision that does not match the table metadata precision. File metadata precision: 10, table metadata precision: 8.
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c1 c1 decimal(20,4);
|
||||
select c1 from read_int_as_decimal order by c1;
|
||||
---- RESULTS
|
||||
-2147483648.0000
|
||||
1.0000
|
||||
2147483647.0000
|
||||
---- TYPES
|
||||
DECIMAL
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c1 c1 decimal(12,4);
|
||||
select c1 from read_int_as_decimal order by c1;
|
||||
---- RESULTS
|
||||
1.0000
|
||||
NULL
|
||||
NULL
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c2 c2 decimal(19,0);
|
||||
select c2 from read_int_as_decimal order by c2;
|
||||
---- RESULTS
|
||||
-9223372036854775808
|
||||
10000000000
|
||||
9223372036854775807
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c2 c2 decimal(2,0);
|
||||
select c2 from read_int_as_decimal order by c2;
|
||||
---- CATCH
|
||||
column 'c2' has a precision that does not match the table metadata precision. File metadata precision: 19, table metadata precision: 2.
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c2 c2 decimal(25,5);
|
||||
select c2 from read_int_as_decimal order by c2;
|
||||
---- RESULTS
|
||||
-9223372036854775808.00000
|
||||
10000000000.00000
|
||||
9223372036854775807.00000
|
||||
====
|
||||
---- QUERY
|
||||
alter table read_int_as_decimal change c2 c2 decimal(20,5);
|
||||
select c2 from read_int_as_decimal order by c2;
|
||||
---- RESULTS
|
||||
10000000000.00000
|
||||
NULL
|
||||
NULL
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user