From e8f604a2139be2ee3f011e6f2ce71fa0dde26492 Mon Sep 17 00:00:00 2001 From: Csaba Ringhofer Date: Mon, 30 Mar 2020 20:15:09 +0200 Subject: [PATCH] IMPALA-9572: Fix DCHECK in nested Parquet scanning The issue occurred when there were skipped pages and a column inside a collection was scanned, but its position was not needed. The repetition level still needs to be read in this case, as the skipped ranges are set in top level rows, so collection items need to know which top level row do they belong to. A DCHECK in StrideWriter's constructor was hit, otherwise the code ran correctly in release mode. The DCHECK is moved to functions where the condition would actually cause problems. Testing: - added and ran a regression test Change-Id: I5e8ef514ead71f732c73f910af7fd1aecd37bb81 Reviewed-on: http://gerrit.cloudera.org:8080/15598 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/parquet-column-readers.cc | 2 +- be/src/util/mem-util.h | 9 ++++++++- .../QueryTest/nested-types-parquet-page-index.test | 9 +++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc index 512632be2..58cc12f21 100644 --- a/be/src/exec/parquet/parquet-column-readers.cc +++ b/be/src/exec/parquet/parquet-column-readers.cc @@ -1323,7 +1323,7 @@ int BaseScalarColumnReader::FillPositionsInCandidateRange(int rows_remaining, int rep_level = rep_levels_.CacheGetNext(); if (rep_level == 0) ++row_count; ++val_count; - if (pos_slot_desc_ != nullptr) { + if (pos_writer.IsValid()) { if (rep_level <= max_rep_level() - 1) pos_current_value_ = 0; *pos_writer.Advance() = pos_current_value_++; } diff --git a/be/src/util/mem-util.h b/be/src/util/mem-util.h index a7125a984..16e808574 100644 --- a/be/src/util/mem-util.h +++ b/be/src/util/mem-util.h @@ -31,17 +31,21 @@ namespace impala { template struct StrideWriter { // The next element to write to. May not be aligned to the natural alignment of T. + // Can be null for convenience, but the StrideWriter cannot be used in that case. T* current; // The stride in bytes between subsequent values. int64_t stride; explicit StrideWriter(T* current, int64_t stride) : current(current), stride(stride) { - DCHECK(current != nullptr); } + /// No other functions can be called if false. + ALWAYS_INLINE bool IsValid() const { return current != nullptr; } + /// Set the next element to 'val' and advance 'current' to the next element. ALWAYS_INLINE void SetNext(T& val) { + DCHECK(IsValid()); // memcpy() is necessary because 'current' may not be aligned. memcpy(current, &val, sizeof(T)); SkipNext(1); @@ -49,6 +53,7 @@ struct StrideWriter { /// Return a pointer to the current element and advance 'current' to the next element. ALWAYS_INLINE T* Advance() { + DCHECK(IsValid()); T* curr = current; SkipNext(1); return curr; @@ -56,11 +61,13 @@ struct StrideWriter { /// Set the next 'count' elements to 'val', advancing 'current' by 'count' values. void SetNext(T& val, int64_t count) { + DCHECK(IsValid()); for (int64_t i = 0; i < count; ++i) SetNext(val); } /// Advance 'current' by 'count' values. ALWAYS_INLINE void SkipNext(int64_t count) { + DCHECK(IsValid()); DCHECK_GE(count, 0); current = reinterpret_cast(reinterpret_cast(current) + stride * count); } diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test index 8c85b575d..8a372f782 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test +++ b/testdata/workloads/functional-query/queries/QueryTest/nested-types-parquet-page-index.test @@ -702,3 +702,12 @@ BIGINT, DECIMAL ---- RUNTIME_PROFILE aggregation(SUM, NumStatsFilteredPages): 6 ==== +---- QUERY +# Regression test for IMPALA-9572. +select count(l_partkey) from tpch_nested_parquet.customer.c_orders.o_lineitems +where l_partkey < 10 +---- RESULTS +263 +---- TYPES +BIGINT +====