IMPALA-14185: Error unnesting nested array from Iceberg with DELETE files

When trying to doubly unnest a 2D array from an Iceberg table that has
delete files but not for every data file, we run into an error:

  Filtering an unnested collection that comes from a UNION [ALL] is not
  supported yet.

This happens because the Iceberg delete files introduce a UNION node
into the plan, and a "not-empty" conjunct is added on the collections.

IMPALA-12753 describes a bug where a conjunct on an unnested collection
coming from a UNION ALL is only applied to the first UNION operand. To
avoid incorrect results, we disabled this case in the commit for
IMPALA-12695, but an unintended consequence of that change is this
error with Iceberg tables.

However, in this case with Iceberg deletes, the bug described in
IMPALA-12753 is not present because both sides of the UNION have the
same tuple id, so conjuncts are naturally applied to both sides.

This commit relaxes the check, which now does not fire if all UNION
operands have the same tuple ids.

Testing:
 - existing tests related to IMPALA-12753 pass
 - added a regression test with an Iceberg table with DELETE files

Change-Id: Ifbc6f580586d4b337f33a2f32052aa07f6fca828
Reviewed-on: http://gerrit.cloudera.org:8080/23107
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Daniel Becker
2025-06-27 14:43:50 +02:00
committed by Impala Public Jenkins
parent bff3561573
commit 191aec5298
2 changed files with 37 additions and 0 deletions

View File

@@ -90,6 +90,9 @@ public class UnnestNode extends PlanNode {
// Filtering an unnested collection that comes from a UNION [ALL] is not supported, see
// IMPALA-12753.
// The exception is if all children of the UNION node have the same tuple id(s), because
// then the conjuncts are naturally applied to all UNION operands. This is the case for
// UNION nodes inserted because of Iceberg delete operations. See IMPALA-14185.
private void checkUnnestFromUnionWithPredicate(Analyzer analyzer)
throws AnalysisException {
PlanNode subplanInputNode = containingSubplanNode_.getChild(0);
@@ -97,6 +100,8 @@ public class UnnestNode extends PlanNode {
UnionNode union = (UnionNode) subplanInputNode;
if (allUnionChildrenHaveSameTupleIds(union)) return;
// Tuple descriptors of the UNION and their descendants (for complex types).
List<TupleDescriptor> unionDescs = new ArrayList<>();
for (TupleId tid : union.getTupleIds()) {
@@ -125,6 +130,15 @@ public class UnnestNode extends PlanNode {
}
}
// Returns true if 'union' has fewer than two children, or if the tuple id list of
// every child is equal to the tuple id list of the first child. Returns false as soon
// as a child with a differing tuple id list is found.
private static boolean allUnionChildrenHaveSameTupleIds(UnionNode union) {
  // With zero or one operand there is nothing to compare against.
  if (union.getChildren().size() < 2) return true;

  final List<TupleId> referenceTupleIds = union.getChild(0).getTupleIds();
  for (PlanNode operand : union.getChildren()) {
    if (!operand.getTupleIds().equals(referenceTupleIds)) return false;
  }
  return true;
}
// Returns the TupleDescriptors contained by 'tuple' (includes item tuple descs of
// collections).
private void getCollTupleDescs(TupleDescriptor tuple,

View File

@@ -194,6 +194,29 @@ class TestNestedCollectionsInSelectList(ImpalaTestSuite):
"""Queries where a map column is in the select list"""
self.run_test_case('QueryTest/nested-map-in-select-list', vector)
@SkipIfFS.hive
def test_nested_array_from_iceberg_with_delete(self, unique_database):
"""Tests that a 2D array can be unnested from an Iceberg table that has delete files
but not for all data files. In this case there is a UNION in the plan.
Regression test for IMPALA-14185.
"""
tbl_name = unique_database + ".nested_arr_in_iceberg_with_delete"
# The table is created with 'format-version'='2'; presumably this is what allows the
# row-level DELETE further down - TODO confirm.
self.execute_query("create table {} (id INT, arr ARRAY<ARRAY<int>>) stored by \
iceberg tblproperties('format-version'='2')".format(tbl_name))
# INSERTs are done in Hive as Impala cannot write complex types.
# Two separate INSERT statements are issued; presumably each one writes its own data
# file, so that the DELETE below leaves one data file without a delete file - TODO
# confirm.
self.run_stmt_in_hive("insert into {} values ( \
1, array(array(1), array(2), array(3), array(4), array(5)))".format(tbl_name))
self.run_stmt_in_hive("insert into {} values ( \
2, array(array(1), array(2), array(3), array(4), array(5)))".format(tbl_name))
# Impala can delete rows containing complex types.
self.execute_query("delete from {} where id=2".format(tbl_name))
# Doubly unnest 'arr': 'a1' ranges over the outer array and 'a1.item' over each inner
# array, so 'a1.item.item' is a single int element.
result = self.execute_query_expect_success(self.client,
"select id, a1.item.item unnested_item from {0}, {0}.arr a1, a1.item \
order by id, unnested_item".format(tbl_name))
# Only the row with id=1 is expected after the delete. Each of its five inner arrays
# holds one element, so five tab-separated (id, unnested_item) rows are returned.
assert result.data == ['1\t1', '1\t2', '1\t3', '1\t4', '1\t5']
class TestMixedCollectionsAndStructsInSelectList(ImpalaTestSuite):
"""Functional tests for the case where collections and structs are embedded into one