mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
IMPALA-11346: Migrated partitioned Iceberg tables might return ERROR when WHERE condition is used on partition column
Identity-partitioned columns are not necessarily stored in the data files. E.g. when we migrate a legacy partitioned table to Iceberg without rewriting the data files, the partition columns won't be present in the files. The Parquet scanner does a few optimizations to eliminate row groups, i.e. filtering based on stats, bloom filters, etc. When a column is not present in the data file that has some predicate on, then it is assumed that the whole row group doesn't pass the filtering criteria. But for Iceberg some files might contain partition columns, while other files doesn't, so we need to prepare the scanners to handle such cases. The ORC scanner doesn't have that many optimizations so it didn't ran into this issue. Testing: * e2e tests Change-Id: Ie706317888981f634d792fb570f3eab1ec11a4f4 Reviewed-on: http://gerrit.cloudera.org:8080/18605 Reviewed-by: Csaba Ringhofer <csringhofer@cloudera.com> Reviewed-by: Tamas Mate <tmater@apache.org> Reviewed-by: <lipenglin@sensorsdata.cn> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
23d09638de
commit
1a1536bd1d
@@ -139,7 +139,6 @@ void FileMetadataUtils::AddIcebergColumns(MemPool* mem_pool, Tuple** template_tu
|
||||
bool FileMetadataUtils::IsValuePartitionCol(const SlotDescriptor* slot_desc) {
|
||||
DCHECK(context_ != nullptr);
|
||||
DCHECK(file_desc_ != nullptr);
|
||||
if (slot_desc->parent() != scan_node_->tuple_desc()) return false;
|
||||
if (slot_desc->col_pos() < scan_node_->num_partition_keys() &&
|
||||
!slot_desc->IsVirtual()) {
|
||||
return true;
|
||||
|
||||
@@ -566,6 +566,7 @@ Status HdfsParquetScanner::EvaluateStatsConjuncts(
|
||||
RETURN_IF_ERROR(ResolveSchemaForStatFiltering(slot_desc, &missing_field, &node));
|
||||
|
||||
if (missing_field) {
|
||||
if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
|
||||
// We are selecting a column that is not in the file. We would set its slot to NULL
|
||||
// during the scan, so any predicate would evaluate to false. Return early. NULL
|
||||
// comparisons cannot happen here, since predicates with NULL literals are filtered
|
||||
@@ -707,6 +708,7 @@ Status HdfsParquetScanner::EvaluateOverlapForRowGroup(
|
||||
RETURN_IF_ERROR(ResolveSchemaForStatFiltering(slot_desc, &missing_field, &node));
|
||||
|
||||
if (missing_field) {
|
||||
if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
|
||||
// We are selecting a column that is not in the file. We would set its slot to NULL
|
||||
// during the scan, so any predicate would evaluate to false. Return early. NULL
|
||||
// comparisons cannot happen here, since predicates with NULL literals are filtered
|
||||
@@ -1981,6 +1983,8 @@ Status HdfsParquetScanner::CreateColIdx2EqConjunctMap() {
|
||||
}
|
||||
|
||||
if (missing_field) {
|
||||
if (!file_metadata_utils_.NeedDataInFile(slot_desc)) continue;
|
||||
|
||||
return Status(Substitute(
|
||||
"Unable to find SchemaNode for path '$0' in the schema of file '$1'.",
|
||||
PrintPath(*scan_node_->hdfs_table(), slot_desc->col_path()), filename()));
|
||||
|
||||
@@ -11,6 +11,54 @@ select * from functional_parquet.iceberg_alltypes_part
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
# Queries with WHERE clauses
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where i = 1;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where i = 3;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where p_int = 1;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where p_int = 2;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where p_bool = true;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part
|
||||
where p_bool = false;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
# Read only the partition columns.
|
||||
select p_bool, p_int, p_bigint, p_float,
|
||||
p_double, p_decimal, p_date, p_string
|
||||
@@ -33,6 +81,54 @@ select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
# Queries with WHERE clauses
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where i = 1;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where i = 3;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where p_int = 1;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where p_int = 2;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where p_bool = true;
|
||||
---- RESULTS
|
||||
1,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
2,true,1,11,1.100000023841858,2.222,123.321,2022-02-22,'impala'
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_alltypes_part_orc
|
||||
where p_bool = false;
|
||||
---- RESULTS
|
||||
---- TYPES
|
||||
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DECIMAL, DATE, STRING
|
||||
====
|
||||
---- QUERY
|
||||
# Read only the partition columns.
|
||||
select p_bool, p_int, p_bigint, p_float,
|
||||
p_double, p_decimal, p_date, p_string
|
||||
@@ -58,6 +154,23 @@ select * from functional_parquet.iceberg_legacy_partition_schema_evolution
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_legacy_partition_schema_evolution
|
||||
where p_int_long = 1;
|
||||
---- RESULTS
|
||||
1,1.100000023841858,2.718,2
|
||||
1,1.100000023841858,3.141,1
|
||||
---- TYPES
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_legacy_partition_schema_evolution
|
||||
where p_dec_dec = 2.718;
|
||||
---- RESULTS
|
||||
1,1.100000023841858,2.718,2
|
||||
---- TYPES
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
# Read only the partition columns.
|
||||
select p_int_long, p_float_double, p_dec_dec
|
||||
from functional_parquet.iceberg_legacy_partition_schema_evolution;
|
||||
@@ -82,6 +195,23 @@ select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
|
||||
where p_int_long = 1;
|
||||
---- RESULTS
|
||||
1,1.100000023841858,2.718,2
|
||||
1,1.100000023841858,3.141,1
|
||||
---- TYPES
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
select * from functional_parquet.iceberg_legacy_partition_schema_evolution_orc
|
||||
where p_dec_dec = 2.718;
|
||||
---- RESULTS
|
||||
1,1.100000023841858,2.718,2
|
||||
---- TYPES
|
||||
BIGINT, DOUBLE, DECIMAL, INT
|
||||
====
|
||||
---- QUERY
|
||||
# Read only the partition columns.
|
||||
select p_int_long, p_float_double, p_dec_dec
|
||||
from functional_parquet.iceberg_legacy_partition_schema_evolution_orc;
|
||||
@@ -106,3 +236,11 @@ select * from only_part_cols;
|
||||
---- TYPES
|
||||
INT, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select * from only_part_cols
|
||||
where i = 2 and s = 's'
|
||||
---- RESULTS
|
||||
2,'s'
|
||||
---- TYPES
|
||||
INT, STRING
|
||||
====
|
||||
|
||||
Reference in New Issue
Block a user