diff --git a/be/src/exec/parquet/CMakeLists.txt b/be/src/exec/parquet/CMakeLists.txt
index b121fe5f4..af405d2fa 100644
--- a/be/src/exec/parquet/CMakeLists.txt
+++ b/be/src/exec/parquet/CMakeLists.txt
@@ -31,12 +31,14 @@ add_library(Parquet
   parquet-collection-column-reader.cc
   parquet-column-readers.cc
   parquet-column-stats.cc
+  parquet-complex-column-reader.cc
   parquet-level-decoder.cc
   parquet-metadata-utils.cc
   parquet-column-chunk-reader.cc
   parquet-page-reader.cc
   parquet-common.cc
   parquet-page-index.cc
+  parquet-struct-column-reader.cc
 )
 
 add_dependencies(Parquet gen-deps)
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc
index a32294c73..592fae0b3 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.cc
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc
@@ -30,6 +30,7 @@
 #include "exec/parquet/parquet-bloom-filter-util.h"
 #include "exec/parquet/parquet-collection-column-reader.h"
 #include "exec/parquet/parquet-column-readers.h"
+#include "exec/parquet/parquet-struct-column-reader.h"
 #include "exec/scanner-context.inline.h"
 #include "exec/scratch-tuple-batch.h"
 #include "exprs/literal.h"
@@ -305,9 +306,9 @@ void HdfsParquetScanner::Close(RowBatch* row_batch) {
   while (!readers.empty()) {
     ParquetColumnReader* reader = readers.top();
     readers.pop();
-    if (reader->IsCollectionReader()) {
-      CollectionColumnReader* coll_reader = static_cast<CollectionColumnReader*>(reader);
-      for (ParquetColumnReader* r: *coll_reader->children()) readers.push(r);
+    if (reader->IsComplexReader()) {
+      ComplexColumnReader* complex_reader = static_cast<ComplexColumnReader*>(reader);
+      for (ParquetColumnReader* r: *complex_reader->children()) readers.push(r);
       continue;
     }
     BaseScalarColumnReader* scalar_reader = static_cast<BaseScalarColumnReader*>(reader);
@@ -385,10 +386,10 @@ int HdfsParquetScanner::CountScalarColumns(
   while (!readers.empty()) {
     ParquetColumnReader* col_reader = readers.top();
     readers.pop();
-    if (col_reader->IsCollectionReader()) {
-      CollectionColumnReader* collection_reader =
-          static_cast<CollectionColumnReader*>(col_reader);
-      for (ParquetColumnReader* r: *collection_reader->children()) readers.push(r);
+    if (col_reader->IsComplexReader()) {
+      ComplexColumnReader* complex_reader =
+          static_cast<ComplexColumnReader*>(col_reader);
+      for (ParquetColumnReader* r: *complex_reader->children()) readers.push(r);
       continue;
     }
     ++num_columns;
@@ -918,7 +919,7 @@ Status HdfsParquetScanner::NextRowGroup() {
     }
   }
 
-  InitCollectionColumns();
+  InitComplexColumns();
   RETURN_IF_ERROR(InitScalarColumns());
 
   // Start scanning dictionary filtering column readers, so we can read the dictionary
@@ -1656,9 +1657,9 @@ bool HdfsParquetScanner::IsDictFilterable(BaseScalarColumnReader* col_reader) {
 void HdfsParquetScanner::PartitionReaders(
     const vector<ParquetColumnReader*>& readers, bool can_eval_dict_filters) {
   for (auto* reader : readers) {
-    if (reader->IsCollectionReader()) {
-      CollectionColumnReader* col_reader = static_cast<CollectionColumnReader*>(reader);
-      collection_readers_.push_back(col_reader);
+    if (reader->IsComplexReader()) {
+      ComplexColumnReader* col_reader = static_cast<ComplexColumnReader*>(reader);
+      complex_readers_.push_back(col_reader);
       PartitionReaders(*col_reader->children(), can_eval_dict_filters);
     } else {
       BaseScalarColumnReader* scalar_reader =
@@ -2234,7 +2235,8 @@ Status HdfsParquetScanner::AssembleRows(RowBatch* row_batch, bool* skip_row_group
   DCHECK(scratch_batch_ != nullptr);
 
   if (filter_readers_.empty() || non_filter_readers_.empty() ||
-      late_materialization_threshold_ < 0 || filter_readers_[0]->max_rep_level() > 0) {
+      late_materialization_threshold_ < 0 || filter_readers_[0]->max_rep_level() > 0 ||
+      HasStructColumnReader(non_filter_readers_)) {
     // Late Materialization is either disabled or not applicable for assembling rows here.
     return AssembleRowsWithoutLateMaterialization(column_readers_, row_batch,
         skip_row_group);
@@ -2326,6 +2328,14 @@ Status HdfsParquetScanner::AssembleRows(RowBatch* row_batch, bool* skip_row_group
   return Status::OK();
 }
 
+bool HdfsParquetScanner::HasStructColumnReader(
+    const std::vector<ParquetColumnReader*>& column_readers) const {
+  for (const ParquetColumnReader* col_reader : column_readers) {
+    if (col_reader->HasStructReader()) return true;
+  }
+  return false;
+}
+
 Status HdfsParquetScanner::SkipRowsForColumns(
     const vector<BaseScalarColumnReader*>& column_readers, int64_t* num_rows_to_skip,
     int64_t* skip_to_row) {
@@ -2755,14 +2765,14 @@ Status HdfsParquetScanner::CreateColumnReaders(const TupleDescriptor& tuple_desc,
         *node, slot_desc->type().IsCollectionType(), slot_desc, this);
     column_readers->push_back(col_reader);
 
-    if (col_reader->IsCollectionReader()) {
+    if (col_reader->IsComplexReader()) {
       // Recursively populate col_reader's children
       DCHECK(slot_desc->children_tuple_descriptor() != nullptr);
       const TupleDescriptor* item_tuple_desc = slot_desc->children_tuple_descriptor();
-      CollectionColumnReader* collection_reader =
-          static_cast<CollectionColumnReader*>(col_reader);
+      ComplexColumnReader* complex_reader =
+          static_cast<ComplexColumnReader*>(col_reader);
       RETURN_IF_ERROR(CreateColumnReaders(
-          *item_tuple_desc, schema_resolver, collection_reader->children()));
+          *item_tuple_desc, schema_resolver, complex_reader->children()));
     } else {
       scalar_reader_map_[node->col_idx] = static_cast<BaseScalarColumnReader*>(
           col_reader);
@@ -2850,8 +2860,8 @@ Status HdfsParquetScanner::CreateCountingReader(const SchemaPath& parent_path,
   return Status::OK();
 }
 
-void HdfsParquetScanner::InitCollectionColumns() {
-  for (CollectionColumnReader* col_reader: collection_readers_) {
+void HdfsParquetScanner::InitComplexColumns() {
+  for (ComplexColumnReader* col_reader: complex_readers_) {
     col_reader->Reset();
   }
 }
@@ -2940,7 +2950,7 @@ Status HdfsParquetScanner::ValidateEndOfRowGroup(
   // Validate scalar column readers' state
   int num_values_read = -1;
   for (int c = 0; c < column_readers.size(); ++c) {
-    if (column_readers[c]->IsCollectionReader()) continue;
+    if (column_readers[c]->IsComplexReader()) continue;
     BaseScalarColumnReader* reader =
         static_cast<BaseScalarColumnReader*>(column_readers[c]);
     // All readers should have exhausted the final data page. This could fail if one
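The Close() and CountScalarColumns() hunks above keep the scanner's iterative stack walk but branch on IsComplexReader(), so struct children are now expanded exactly like collection children. A minimal standalone sketch of that traversal pattern, using simplified hypothetical types rather than the real reader classes:

#include <iostream>
#include <stack>
#include <vector>

// Hypothetical stand-in for the reader tree: a complex reader (collection or
// struct) owns children, a scalar reader is a leaf.
struct Reader {
  bool is_complex = false;
  std::vector<Reader*> children;
};

// Counts scalar leaves the way CountScalarColumns() does: push every reader on
// a stack, expand complex readers into their children, count the rest.
int CountScalarReaders(const std::vector<Reader*>& roots) {
  int num_scalars = 0;
  std::stack<Reader*> pending;
  for (Reader* r : roots) pending.push(r);
  while (!pending.empty()) {
    Reader* r = pending.top();
    pending.pop();
    if (r->is_complex) {
      for (Reader* child : r->children) pending.push(child);
      continue;
    }
    ++num_scalars;
  }
  return num_scalars;
}

int main() {
  Reader leaf1, leaf2, top_leaf;
  Reader strct{true, {&leaf1, &leaf2}};
  std::cout << CountScalarReaders({&strct, &top_leaf}) << std::endl;  // prints 3
}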
diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.h b/be/src/exec/parquet/hdfs-parquet-scanner.h
index df58c4985..c78609e66 100644
--- a/be/src/exec/parquet/hdfs-parquet-scanner.h
+++ b/be/src/exec/parquet/hdfs-parquet-scanner.h
@@ -42,13 +42,15 @@ struct SchemaNode;
 class ParquetLevelDecoder;
 
 /// Per column reader.
-class ParquetColumnReader;
+class BaseScalarColumnReader;
 class CollectionColumnReader;
 class ColumnStatsReader;
-class BaseScalarColumnReader;
+class ComplexColumnReader;
+class ParquetColumnReader;
+class ParquetPageReader;
 template <typename InternalType, parquet::Type::type PARQUET_TYPE, bool MATERIALIZED>
 class ScalarColumnReader;
-class ParquetPageReader;
+
 
 /// This scanner parses Parquet files located in HDFS, and writes the content as tuples in
 /// the Impala in-memory representation of data, e.g. (tuples, rows, row batches).
@@ -474,7 +476,7 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
   std::vector<BaseScalarColumnReader*> scalar_readers_;
 
-  /// Flattened collection column readers that point to readers in column_readers_.
-  std::vector<CollectionColumnReader*> collection_readers_;
+  /// Flattened complex column readers that point to readers in column_readers_.
+  std::vector<ComplexColumnReader*> complex_readers_;
 
   /// Mapping from Parquet column indexes to scalar readers.
   std::unordered_map<int, BaseScalarColumnReader*> scalar_reader_map_;
@@ -741,6 +743,11 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
       const vector<ParquetColumnReader*>& column_readers, RowBatch* row_batch,
       bool* skip_row_group) WARN_UNUSED_RESULT;
 
+  /// Returns true if any of the 'column_readers' or their children (recursively) is a
+  /// struct column reader.
+  bool HasStructColumnReader(
+      const std::vector<ParquetColumnReader*>& column_readers) const;
+
   /// Commit num_rows to the given row batch.
   /// Returns OK if the query is not cancelled and hasn't exceeded any mem limits.
   /// Scanner can call this with 0 rows to flush any pending resources (attached pools
@@ -786,7 +793,7 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
       std::vector<ParquetColumnReader*>* column_readers) WARN_UNUSED_RESULT;
 
   /// Returns the total number of scalar column readers in 'column_readers', including
-  /// the children of collection readers.
+  /// the children of complex readers.
   int CountScalarColumns(const std::vector<ParquetColumnReader*>& column_readers);
 
   /// Creates a column reader that reads one value for each item in the table or
@@ -808,8 +815,8 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
   /// does not start any scan ranges.
   Status InitScalarColumns() WARN_UNUSED_RESULT;
 
-  /// Initializes the column readers in collection_readers_.
-  void InitCollectionColumns();
+  /// Initializes the column readers in complex_readers_.
+  void InitComplexColumns();
 
   /// Initialize dictionaries for all column readers
   Status InitDictionaries(const std::vector<BaseScalarColumnReader*>& column_readers)
@@ -840,7 +847,7 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
   bool IsDictFilterable(BaseScalarColumnReader* col_reader);
 
-  /// Partitions the readers into scalar and collection readers. The collection readers
-  /// are flattened into collection_readers_. The scalar readers are partitioned into
+  /// Partitions the readers into scalar and complex readers. The complex readers
+  /// are flattened into complex_readers_. The scalar readers are partitioned into
   /// dict_filterable_readers_ and non_dict_filterable_readers_ depending on whether
   /// dictionary filtering is enabled and the reader can be dictionary filtered. All
   /// scalar readers are also flattened into scalar_readers_.
@@ -848,7 +855,7 @@ class HdfsParquetScanner : public HdfsColumnarScanner {
       bool can_eval_dict_filters);
 
   /// Divides the column readers into dict_filterable_readers_,
-  /// non_dict_filterable_readers_ and collection_readers_. Allocates memory for
+  /// non_dict_filterable_readers_ and complex_readers_. Allocates memory for
   /// dict_filter_tuple_map_.
   Status InitDictFilterStructures() WARN_UNUSED_RESULT;
diff --git a/be/src/exec/parquet/parquet-collection-column-reader.cc b/be/src/exec/parquet/parquet-collection-column-reader.cc
index 66ba96492..1499a1073 100644
--- a/be/src/exec/parquet/parquet-collection-column-reader.cc
+++ b/be/src/exec/parquet/parquet-collection-column-reader.cc
@@ -21,12 +21,6 @@
 
 namespace impala {
 
-void CollectionColumnReader::Close(RowBatch* row_batch) {
-  for (ParquetColumnReader* child_reader : children_) {
-    child_reader->Close(row_batch);
-  }
-}
-
 bool CollectionColumnReader::NextLevels() {
   DCHECK(!children_.empty());
   DCHECK_LE(rep_level_, new_collection_rep_level());
@@ -58,7 +52,7 @@ bool CollectionColumnReader::ReadValue(MemPool* pool, Tuple* tuple) {
     CollectionValue* slot = tuple->GetCollectionSlot(tuple_offset_);
     *slot = CollectionValue();
   } else {
-    tuple->SetNull(null_indicator_offset_);
+    SetNullSlot(tuple);
   }
   return CollectionColumnReader::NextLevels();
 }
diff --git a/be/src/exec/parquet/parquet-collection-column-reader.h b/be/src/exec/parquet/parquet-collection-column-reader.h
index c8f7ba2ad..ee41f8cdc 100644
--- a/be/src/exec/parquet/parquet-collection-column-reader.h
+++ b/be/src/exec/parquet/parquet-collection-column-reader.h
@@ -17,27 +17,25 @@
 
 #pragma once
 
-#include <vector>
-
-#include "exec/parquet/parquet-column-readers.h"
+#include "exec/parquet/parquet-complex-column-reader.h"
 
 namespace impala {
 
 /// Collections are not materialized directly in parquet files; only scalar values appear
 /// in the file. CollectionColumnReader uses the definition and repetition levels of child
 /// column readers to figure out the boundaries of each collection in this column.
-class CollectionColumnReader : public ParquetColumnReader {
+class CollectionColumnReader : public ComplexColumnReader {
  public:
   CollectionColumnReader(
       HdfsParquetScanner* parent, const SchemaNode& node, const SlotDescriptor* slot_desc)
-    : ParquetColumnReader(parent, node, slot_desc) {
+    : ComplexColumnReader(parent, node, slot_desc) {
     DCHECK(node_.is_repeated());
     if (slot_desc != nullptr) DCHECK(slot_desc->type().IsCollectionType());
   }
 
   virtual ~CollectionColumnReader() {}
 
-  vector<ParquetColumnReader*>* children() { return &children_; }
+  virtual bool IsStructReader() const override { return false; }
 
   virtual bool IsCollectionReader() const override { return true; }
@@ -66,41 +64,12 @@ class CollectionColumnReader : public ComplexColumnReader {
   /// reader's state.
   virtual bool NextLevels() override;
 
-  /// This is called once for each row group in the file.
-  void Reset() {
-    def_level_ = ParquetLevel::INVALID_LEVEL;
-    rep_level_ = ParquetLevel::INVALID_LEVEL;
-    pos_current_value_ = ParquetLevel::INVALID_POS;
-  }
-
-  virtual void Close(RowBatch* row_batch) override;
-
   /// Skips the number of encoded values specified by 'num_rows', without materilizing or
   /// decoding them.
   /// Returns true on success, false otherwise.
   virtual bool SkipRows(int64_t num_rows, int64_t skip_row_id) override;
 
-  virtual bool SetRowGroupAtEnd() override {
-    DCHECK(!children_.empty());
-    for (int c = 0; c < children_.size(); ++c) {
-      if (!children_[c]->SetRowGroupAtEnd()) return false;
-    }
-    return true;
-  }
-
-  /// Returns the index of the row that was processed most recently.
-  int64_t LastProcessedRow() const override {
-    DCHECK(!children_.empty());
-    return children_[0]->LastProcessedRow();
-  }
-
  private:
-  /// Column readers of fields contained within this collection. There is at least one
-  /// child reader per collection reader. Child readers either materialize slots in the
-  /// collection item tuples, or there is a single child reader that does not materialize
-  /// any slot and is only used by this reader to read def and rep levels.
-  vector<ParquetColumnReader*> children_;
-
   /// Updates this reader's def_level_, rep_level_, and pos_current_value_ based on child
   /// reader's state.
   void UpdateDerivedState();
diff --git a/be/src/exec/parquet/parquet-column-readers.cc b/be/src/exec/parquet/parquet-column-readers.cc
index ec84f08f5..c85671ebf 100644
--- a/be/src/exec/parquet/parquet-column-readers.cc
+++ b/be/src/exec/parquet/parquet-column-readers.cc
@@ -25,6 +25,7 @@
 #include "exec/parquet/parquet-data-converter.h"
 #include "exec/parquet/parquet-level-decoder.h"
 #include "exec/parquet/parquet-metadata-utils.h"
+#include "exec/parquet/parquet-struct-column-reader.h"
 #include "exec/scratch-tuple-batch.h"
 #include "parquet-collection-column-reader.h"
 #include "runtime/runtime-state.h"
@@ -85,10 +86,10 @@ class ScalarColumnReader : public BaseScalarColumnReader {
   virtual bool NeedsConversion() override { return NeedsConversionInline(); }
   virtual bool NeedsValidation() override { return NeedsValidationInline(); }
 
- protected:
   template <bool IN_COLLECTION>
   inline bool ReadValue(Tuple* tuple);
 
+ protected:
   /// Implementation of the ReadValueBatch() functions specialized for this
   /// column reader type. This function drives the reading of data pages and
   /// caching of rep/def levels. Once a data page and cached levels are available,
@@ -426,7 +427,7 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ReadValue(
       }
       if (!continue_execution) return false;
     } else {
-      tuple->SetNull(null_indicator_offset_);
+      SetNullSlot(tuple);
     }
   }
   return NextLevels();
@@ -569,7 +570,7 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::MaterializeValueBatch(
       bool continue_execution = ReadSlot(tuple);
       if (UNLIKELY(!continue_execution)) return false;
     } else {
-      tuple->SetNull(null_indicator_offset_);
+      SetNullSlot(tuple);
     }
     curr_tuple += tuple_size;
@@ -705,7 +706,7 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ReadSlot(
     if (UNLIKELY(!parent_->parse_status_.ok())) return false;
     // The value is invalid but execution should continue - set the null indicator and
     // skip conversion.
-    tuple->SetNull(null_indicator_offset_);
+    SetNullSlot(tuple);
     return true;
   }
   return true;
@@ -739,7 +740,7 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ReadAndConvertSlots(
       if (UNLIKELY(!parent_->parse_status_.ok())) return false;
       // The value or the conversion is invalid but execution should continue - set the
       // null indicator.
-      tuple->SetNull(null_indicator_offset_);
+      SetNullSlot(tuple);
       continue;
     }
   }
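After this split, CollectionColumnReader keeps only what is collection-specific (repetition handling and CollectionValue slots), while child management moves into the new ComplexColumnReader base introduced below. A compact sketch of the resulting class shape, with hypothetical simplified stand-ins rather than the real declarations:

#include <vector>

// Simplified shape of the refactored hierarchy: the shared child bookkeeping
// lives in the complex base, and the two concrete complex readers only answer
// the type predicates that the scanner dispatches on.
struct ParquetColumnReaderSketch {
  virtual ~ParquetColumnReaderSketch() {}
  virtual bool IsComplexReader() const = 0;
  virtual bool IsCollectionReader() const = 0;
  virtual bool IsStructReader() const = 0;
};

struct ComplexColumnReaderSketch : ParquetColumnReaderSketch {
  bool IsComplexReader() const override { return true; }
  std::vector<ParquetColumnReaderSketch*> children;
};

struct CollectionColumnReaderSketch : ComplexColumnReaderSketch {
  bool IsCollectionReader() const override { return true; }
  bool IsStructReader() const override { return false; }
};

struct StructColumnReaderSketch : ComplexColumnReaderSketch {
  bool IsCollectionReader() const override { return false; }
  bool IsStructReader() const override { return true; }
};

int main() {
  CollectionColumnReaderSketch coll;
  StructColumnReaderSketch strct;
  coll.children.push_back(&strct);  // a struct nested inside a collection
  return coll.IsComplexReader() && strct.IsStructReader() ? 0 : 1;
}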
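The repeated tuple->SetNull(...) to SetNullSlot(tuple) substitution in the hunks above is what lets a struct reader null out an entire subtree: the base implementation only sets the reader's own null bit, while the struct override (shown later in parquet-struct-column-reader.h) recurses into its children first. A minimal sketch of that dispatch, with hypothetical simplified types:

#include <iostream>
#include <set>
#include <vector>

// Hypothetical stand-in for Tuple: records which null bits were set.
struct FakeTuple {
  std::set<int> null_bits;
  void SetNull(int offset) { null_bits.insert(offset); }
};

struct ReaderBase {
  explicit ReaderBase(int null_offset) : null_offset_(null_offset) {}
  virtual ~ReaderBase() {}
  // Base behavior: null only this reader's own slot.
  virtual void SetNullSlot(FakeTuple* t) { t->SetNull(null_offset_); }
  int null_offset_;
};

// Struct behavior: null every child slot, then the struct's own slot, so a
// NULL struct leaves no stale child values behind.
struct StructReader : ReaderBase {
  StructReader(int null_offset, std::vector<ReaderBase*> children)
    : ReaderBase(null_offset), children_(std::move(children)) {}
  void SetNullSlot(FakeTuple* t) override {
    for (ReaderBase* child : children_) child->SetNullSlot(t);
    ReaderBase::SetNullSlot(t);
  }
  std::vector<ReaderBase*> children_;
};

int main() {
  ReaderBase a(1), b(2);
  StructReader s(0, {&a, &b});
  FakeTuple t;
  s.SetNullSlot(&t);
  std::cout << t.null_bits.size() << std::endl;  // prints 3: two children + the struct
}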
@@ -763,7 +764,7 @@ bool ScalarColumnReader<InternalType, PARQUET_TYPE, MATERIALIZED>::ReadSlotsNoConversion(
       if (UNLIKELY(!parent_->parse_status_.ok())) return false;
       // The value is invalid but execution should continue - set the null indicator and
       // skip conversion.
-      tuple->SetNull(null_indicator_offset_);
+      SetNullSlot(tuple);
     }
   }
 }
@@ -1730,6 +1731,8 @@ ParquetColumnReader* ParquetColumnReader::Create(const SchemaNode& node,
           parent, node, slot_desc);
     case TYPE_DECIMAL:
       return CreateDecimalColumnReader(node, slot_desc, parent);
+    case TYPE_STRUCT:
+      return new StructColumnReader(parent, node, slot_desc);
     default:
       DCHECK(false) << slot_desc->type().DebugString();
       return nullptr;
diff --git a/be/src/exec/parquet/parquet-column-readers.h b/be/src/exec/parquet/parquet-column-readers.h
index cacda51ed..706b975c5 100644
--- a/be/src/exec/parquet/parquet-column-readers.h
+++ b/be/src/exec/parquet/parquet-column-readers.h
@@ -27,9 +27,10 @@
 
 namespace impala {
 
+class ComplexColumnReader;
 class DictDecoderBase;
-class Tuple;
 class MemPool;
+class Tuple;
 
 /// Base class for reading a Parquet column. Reads a logical column, not necessarily a
 /// column materialized in the file (e.g. collections). The two subclasses are
@@ -86,6 +87,17 @@ class ParquetColumnReader {
   /// Returns true if this reader materializes collections (i.e. CollectionValues).
   virtual bool IsCollectionReader() const = 0;
 
+  /// Returns true if this reader materializes structs.
+  virtual bool IsStructReader() const = 0;
+
+  /// Returns true if this reader materializes a struct or has a child (recursively)
+  /// that does so.
+  virtual bool HasStructReader() const = 0;
+
+  /// Returns true if this reader materializes nested types such as collections or
+  /// structs.
+  virtual bool IsComplexReader() const = 0;
+
   const char* filename() const { return parent_->filename(); }
 
   /// Read the current value (or null) into 'tuple' for this column. This should only be
@@ -163,7 +175,7 @@ class ParquetColumnReader {
   /// and frees up other resources. If 'row_batch' is NULL frees all resources instead.
   virtual void Close(RowBatch* row_batch) = 0;
 
-  /// Skips the number of encoded values specified by 'num_rows', without materilizing or
+  /// Skips the number of encoded values specified by 'num_rows', without materializing or
   /// decoding them across pages. If page filtering is enabled, then it directly skips to
   /// row after 'skip_row_id' and ignores 'num_rows'.
   /// It invokes 'SkipToLevelRows' for all 'children_'.
@@ -181,6 +193,11 @@ class ParquetColumnReader {
   // Returns 'true' if the reader supports page index.
   virtual bool DoesPageFiltering() const { return false; }
 
+  /// Sets the reader's slot in the given 'tuple' to NULL.
+  virtual void SetNullSlot(Tuple* tuple) {
+    tuple->SetNull(DCHECK_NOTNULL(slot_desc_)->null_indicator_offset());
+  }
+
  protected:
   HdfsParquetScanner* parent_;
   const SchemaNode& node_;
@@ -270,6 +287,12 @@ class BaseScalarColumnReader : public ParquetColumnReader {
 
   virtual bool IsCollectionReader() const override { return false; }
 
+  virtual bool IsStructReader() const override { return false; }
+
+  virtual bool HasStructReader() const override { return false; }
+
+  virtual bool IsComplexReader() const override { return false; }
+
   /// Resets the reader for each row group in the file and creates the scan
   /// range for the column, but does not start it. To start scanning,
   /// set_io_reservation() must be called to assign reservation to this
@@ -330,6 +353,10 @@ class BaseScalarColumnReader : public ParquetColumnReader {
   // we know this row can be skipped. This could be very useful with stats and big
   // sections can be skipped. Implement that when we can benefit from it.
 
+  /// Implementation for NextLevels().
+  template <bool ADVANCE_REP_LEVEL>
+  bool NextLevels();
+
  protected:
   // Friend parent scanner so it can perform validation (e.g. ValidateEndOfRowGroup())
   friend class HdfsParquetScanner;
@@ -446,10 +473,6 @@ class BaseScalarColumnReader : public ParquetColumnReader {
   /// if page has rows of interest to actually buffer the values.
   bool AdvanceNextPageHeader();
 
-  /// Implementation for NextLevels().
-  template <bool ADVANCE_REP_LEVEL>
-  bool NextLevels();
-
   /// Creates a dictionary decoder from values/size. 'decoder' is set to point to a
   /// dictionary decoder stored in this object. Subclass must implement this. Returns
   /// an error status if the dictionary values could not be decoded successfully.
diff --git a/be/src/exec/parquet/parquet-complex-column-reader.cc b/be/src/exec/parquet/parquet-complex-column-reader.cc
new file mode 100644
index 000000000..041de1e9a
--- /dev/null
+++ b/be/src/exec/parquet/parquet-complex-column-reader.cc
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exec/parquet/parquet-complex-column-reader.h"
+
+namespace impala {
+
+void ComplexColumnReader::Close(RowBatch* row_batch) {
+  for (ParquetColumnReader* child_reader : children_) {
+    child_reader->Close(row_batch);
+  }
+}
+
+bool ComplexColumnReader::HasStructReader() const {
+  if (IsStructReader()) return true;
+  for (const ParquetColumnReader* child : children_) {
+    if (child->HasStructReader()) return true;
+  }
+  return false;
+}
+
+} // namespace impala
\ No newline at end of file
diff --git a/be/src/exec/parquet/parquet-complex-column-reader.h b/be/src/exec/parquet/parquet-complex-column-reader.h
new file mode 100644
index 000000000..39c5c1731
--- /dev/null
+++ b/be/src/exec/parquet/parquet-complex-column-reader.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "exec/parquet/parquet-column-readers.h"
+
+namespace impala {
+
+/// Abstract class holding the functionality shared by the complex column readers,
+/// CollectionColumnReader and StructColumnReader.
+class ComplexColumnReader : public ParquetColumnReader {
+ public:
+  vector<ParquetColumnReader*>* children() { return &children_; }
+
+  virtual void Close(RowBatch* row_batch) override;
+
+  virtual bool SetRowGroupAtEnd() override {
+    DCHECK(!children_.empty());
+    for (int c = 0; c < children_.size(); ++c) {
+      if (!children_[c]->SetRowGroupAtEnd()) return false;
+    }
+    return true;
+  }
+
+  /// Returns the index of the row that was processed most recently.
+  virtual int64_t LastProcessedRow() const override {
+    DCHECK(!children_.empty());
+    return children_[0]->LastProcessedRow();
+  }
+
+  virtual bool IsComplexReader() const override { return true; }
+
+  virtual bool HasStructReader() const override;
+
+  /// This is called once for each row group in the file.
+  void Reset() {
+    def_level_ = ParquetLevel::INVALID_LEVEL;
+    rep_level_ = ParquetLevel::INVALID_LEVEL;
+    pos_current_value_ = ParquetLevel::INVALID_POS;
+  }
+
+ protected:
+  ComplexColumnReader(HdfsParquetScanner* parent, const SchemaNode& node,
+      const SlotDescriptor* slot_desc)
+    : ParquetColumnReader(parent, node, slot_desc) {
+    if (slot_desc != nullptr) DCHECK(slot_desc->type().IsComplexType());
+  }
+
+  /// Column readers of fields contained within this complex column. There is at least
+  /// one child reader per complex reader. Child readers either materialize slots in the
+  /// complex item tuples, or there is a single child reader that does not materialize
+  /// any slot and is only used by this reader to read def and rep levels.
+  std::vector<ParquetColumnReader*> children_;
+};
+} // namespace impala
diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc
index bf905b789..3f61464bb 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet/parquet-metadata-utils.cc
@@ -303,6 +303,7 @@ Status ParquetMetadataUtils::ValidateOffsetInFile(const string& filename, int col_idx,
 Status ParquetMetadataUtils::ValidateRowGroupColumn(
     const parquet::FileMetaData& file_metadata, const char* filename, int row_group_idx,
     int col_idx, const parquet::SchemaElement& schema_element, RuntimeState* state) {
+  DCHECK_GE(col_idx, 0);
   const parquet::ColumnMetaData& col_chunk_metadata =
       file_metadata.row_groups[row_group_idx].columns[col_idx].meta_data;
 
@@ -709,7 +710,7 @@ Status ParquetSchemaResolver::ResolvePathHelper(ArrayEncoding array_encoding,
     if (*missing_field) return Status::OK();
   } else if (col_type->type == TYPE_STRUCT) {
     DCHECK_GT(col_type->children.size(), 0);
-    // Nothing to do for structs
+    RETURN_IF_ERROR(ResolveStruct(**node, *col_type, path, i));
   } else {
     DCHECK(!col_type->IsComplexType());
     DCHECK_EQ(i, path.size() - 1);
@@ -935,6 +936,16 @@ Status ParquetSchemaResolver::ResolveMap(const SchemaPath& path, int idx,
   return Status::OK();
 }
 
+Status ParquetSchemaResolver::ResolveStruct(const SchemaNode& node,
+    const ColumnType& col_type, const SchemaPath& path, int idx) const {
+  if (node.children.size() < 1) {
+    ErrorMsg msg(TErrorCode::PARQUET_UNRECOGNIZED_SCHEMA, filename_,
+        PrintSubPath(tbl_desc_, path, idx), "struct", node.DebugString());
+    return Status::Expected(msg);
+  }
+  return Status::OK();
+}
+
 Status ParquetSchemaResolver::ValidateScalarNode(const SchemaNode& node,
     const ColumnType& col_type, const SchemaPath& path, int idx) const {
   if (!node.children.empty()) {
diff --git a/be/src/exec/parquet/parquet-metadata-utils.h b/be/src/exec/parquet/parquet-metadata-utils.h
index df9656d16..14950ada7 100644
--- a/be/src/exec/parquet/parquet-metadata-utils.h
+++ b/be/src/exec/parquet/parquet-metadata-utils.h
@@ -239,6 +239,10 @@ class ParquetSchemaResolver {
   Status ResolveMap(const SchemaPath& path, int idx, SchemaNode** node,
       bool* missing_field) const;
 
+  /// The ResolvePathHelper() logic for structs.
+  Status ResolveStruct(const SchemaNode& node, const ColumnType& col_type,
+      const SchemaPath& path, int idx) const;
+
   /// The ResolvePathHelper() logic for scalars (just does validation since there's no
   /// more actual work to be done).
   Status ValidateScalarNode(const SchemaNode& node, const ColumnType& col_type,
diff --git a/be/src/exec/parquet/parquet-struct-column-reader.cc b/be/src/exec/parquet/parquet-struct-column-reader.cc
new file mode 100644
index 000000000..c58420d7c
--- /dev/null
+++ b/be/src/exec/parquet/parquet-struct-column-reader.cc
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "parquet-struct-column-reader.h"
+
+namespace impala {
+
+bool StructColumnReader::NextLevels() {
+  DCHECK(!children_.empty());
+  bool result = true;
+  for (ParquetColumnReader* child_reader : children_) {
+    result &= child_reader->NextLevels();
+  }
+  def_level_ = children_[0]->def_level();
+  rep_level_ = children_[0]->rep_level();
+  return result;
+}
+
+template <bool IN_COLLECTION>
+bool StructColumnReader::ReadValue(MemPool* pool, Tuple* tuple, bool* read_row) {
+  DCHECK(!children_.empty());
+  DCHECK(!*read_row);
+  bool should_abort = true;
+  if (def_level_ >= max_def_level()) {
+    for (ParquetColumnReader* child_col_reader : children_) {
+      if (IN_COLLECTION) {
+        should_abort &= child_col_reader->ReadValue(pool, tuple);
+      } else {
+        should_abort &= child_col_reader->ReadNonRepeatedValue(pool, tuple);
+      }
+    }
+    *read_row = true;
+  } else {
+    if (!HasNullCollectionAncestor<IN_COLLECTION>()) {
+      SetNullSlot(tuple);
+      *read_row = true;
+    }
+    should_abort = NextLevels();
+  }
+
+  def_level_ = children_[0]->def_level();
+  rep_level_ = children_[0]->rep_level();
+  return should_abort;
+}
+
+template <bool IN_COLLECTION>
+bool StructColumnReader::HasNullCollectionAncestor() const {
+  if (!IN_COLLECTION) return false;
+  // If none of the parents are NULL there is no null collection ancestor.
+  if (def_level_ >= max_def_level() - 1) return false;
+  // There is a null ancestor. Have to check if there is a null collection
+  // in the chain between this column reader and the topmost null ancestor.
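ResolveStruct() above is deliberately thin: the only file-level invariant it can check is that the resolved schema node is a group with at least one child. A hedged sketch of the same check, with simplified hypothetical types in place of SchemaNode and Status:

#include <string>
#include <vector>

// Simplified stand-ins: a Parquet schema node and a status-like result.
struct Node { std::vector<Node> children; std::string debug; };
struct Result { bool ok; std::string msg; };

// Mirrors the patch's ResolveStruct(): a struct column must map onto a group
// node with at least one child, otherwise the file schema is incompatible.
Result ResolveStructSketch(const Node& node, const std::string& col_path) {
  if (node.children.empty()) {
    return {false,
        "incompatible Parquet schema for column " + col_path + ": " + node.debug};
  }
  return {true, ""};
}

int main() {
  Node leaf{{}, "optional int32 f1"};
  Node group{{leaf}, "group s"};
  return ResolveStructSketch(group, ".tbl.s").ok ? 0 : 1;  // returns 0
}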
+  if (def_level_ < def_level_of_immediate_repeated_ancestor()) return true;
+  return false;
+}
+
+bool StructColumnReader::ReadValue(MemPool* pool, Tuple* tuple) {
+  bool dummy = false;
+  return ReadValue<true>(pool, tuple, &dummy);
+}
+
+bool StructColumnReader::ReadNonRepeatedValue(MemPool* pool, Tuple* tuple) {
+  bool dummy = false;
+  return ReadValue<false>(pool, tuple, &dummy);
+}
+
+bool StructColumnReader::ReadValueBatch(MemPool* pool, int max_values,
+    int tuple_size, uint8_t* tuple_mem, int* num_values) {
+  return ReadValueBatch<true>(pool, max_values, tuple_size, tuple_mem, num_values);
+}
+
+bool StructColumnReader::ReadNonRepeatedValueBatch(MemPool* pool, int max_values,
+    int tuple_size, uint8_t* tuple_mem, int* num_values) {
+  return ReadValueBatch<false>(pool, max_values, tuple_size, tuple_mem, num_values);
+}
+
+template <bool IN_COLLECTION>
+bool StructColumnReader::ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
+    uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT {
+  if (def_level_ == ParquetLevel::INVALID_LEVEL && !NextLevels()) return false;
+
+  int val_count = 0;
+  bool continue_execution = true;
+  while (val_count < max_values && !RowGroupAtEnd() && continue_execution) {
+    Tuple* tuple = reinterpret_cast<Tuple*>(tuple_mem + val_count * tuple_size);
+    bool read_row = false;
+    continue_execution = ReadValue<IN_COLLECTION>(pool, tuple, &read_row);
+    if (read_row) ++val_count;
+    if (SHOULD_TRIGGER_COL_READER_DEBUG_ACTION(val_count)) {
+      continue_execution &= ColReaderDebugAction(&val_count);
+    }
+  }
+  *num_values = val_count;
+  return continue_execution;
+}
+
+bool StructColumnReader::SkipRows(int64_t num_rows, int64_t skip_row_id) {
+  // Structs are excluded from late materialization, so there is no need to implement
+  // SkipRows().
+  DCHECK(false);
+  return true;
+}
+
+} // namespace impala
diff --git a/be/src/exec/parquet/parquet-struct-column-reader.h b/be/src/exec/parquet/parquet-struct-column-reader.h
new file mode 100644
index 000000000..3221c127f
--- /dev/null
+++ b/be/src/exec/parquet/parquet-struct-column-reader.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "exec/parquet/parquet-complex-column-reader.h"
+
+namespace impala {
+
+/// A struct is not directly represented in a Parquet file, hence a StructColumnReader
+/// delegates the actual reading of values to its children.
+class StructColumnReader : public ComplexColumnReader {
+ public:
+  StructColumnReader(HdfsParquetScanner* parent, const SchemaNode& node,
+      const SlotDescriptor* slot_desc) : ComplexColumnReader(parent, node, slot_desc) {
+    DCHECK(!node.children.empty());
+    if (slot_desc != nullptr) DCHECK(slot_desc->type().IsStructType());
+  }
+
+  virtual ~StructColumnReader() {}
+
+  /// Calls ReadValue() with 'IN_COLLECTION' = true as template parameter.
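The null handling in ReadValue() above is entirely def-level driven. As a worked example (a sketch under assumed levels, not taken from the patch): take a nullable struct 's' with one nullable field, where the struct's own max_def_level() is 1 and the child's is 2.

#include <iostream>

// Sketch of the def-level decision in StructColumnReader::ReadValue() under the
// assumed levels described above:
//   def_level 0 -> the struct itself is NULL
//   def_level 1 -> struct present, child NULL (children null themselves out)
//   def_level 2 -> struct and child both present
enum class Outcome { kReadChildren, kSetStructNull };

Outcome DecideStruct(int def_level, int max_def_level) {
  // Matches: if (def_level_ >= max_def_level()) read children, else NULL struct.
  // (The IN_COLLECTION case additionally checks for a NULL collection ancestor,
  // in which case nothing is written at all.)
  return def_level >= max_def_level ? Outcome::kReadChildren
                                    : Outcome::kSetStructNull;
}

int main() {
  std::cout << (DecideStruct(2, 1) == Outcome::kReadChildren) << std::endl;   // 1
  std::cout << (DecideStruct(1, 1) == Outcome::kReadChildren) << std::endl;   // 1
  std::cout << (DecideStruct(0, 1) == Outcome::kSetStructNull) << std::endl;  // 1
}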
+  virtual bool ReadValue(MemPool* pool, Tuple* tuple) override;
+
+  /// Calls ReadValue() with 'IN_COLLECTION' = false as template parameter.
+  virtual bool ReadNonRepeatedValue(MemPool* pool, Tuple* tuple) override;
+
+  /// Reads a batch of values and assumes that this column reader has a collection
+  /// column reader parent. Note that this delegates the reading to the children of the
+  /// struct, and their values are read in a non-batched manner by calling ReadValue().
+  virtual bool ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
+      uint8_t* tuple_mem, int* num_values) override;
+
+  /// Similar to the above, but this expects that this column reader doesn't have a
+  /// collection column reader parent.
+  virtual bool ReadNonRepeatedValueBatch(MemPool* pool, int max_values, int tuple_size,
+      uint8_t* tuple_mem, int* num_values) override;
+
+  /// Calls NextLevels() on each child and then sets the def_level_ and rep_level_
+  /// members. Returns false if any of the NextLevels() calls returns false.
+  virtual bool NextLevels() override;
+
+  virtual bool IsStructReader() const override { return true; }
+
+  virtual bool IsCollectionReader() const override { return false; }
+
+  /// Skips the number of encoded values specified by 'num_rows', without materializing
+  /// or decoding them.
+  /// Returns true on success, false otherwise.
+  virtual bool SkipRows(int64_t num_rows, int64_t skip_row_id) override;
+
+  void SetNullSlot(Tuple* tuple) override {
+    for (ParquetColumnReader* child : children_) child->SetNullSlot(tuple);
+    tuple->SetNull(DCHECK_NOTNULL(slot_desc_)->null_indicator_offset());
+  }
+
+ private:
+  /// Returns true if the struct represented by this column reader has a collection
+  /// ancestor that is null.
+  template <bool IN_COLLECTION>
+  bool HasNullCollectionAncestor() const;
+
+  /// Helper function for the ReadValueBatch() and ReadNonRepeatedValueBatch()
+  /// functions.
+  template <bool IN_COLLECTION>
+  bool ReadValueBatch(MemPool* pool, int max_values, int tuple_size,
+      uint8_t* RESTRICT tuple_mem, int* RESTRICT num_values) RESTRICT;
+
+  /// Calls ReadValue() for all the children of this StructColumnReader. NextLevels()
+  /// should be called before the first use of this function to initialize the
+  /// def_level_ and rep_level_ members; after the first call this function keeps them
+  /// up to date itself. Sets the struct to null if the def_level_ of the children says
+  /// so. Returns false if any of the ReadValue() or NextLevels() calls fail.
+  /// 'read_row' is set to true if running this function actually resulted in filling a
+  /// value in 'tuple'. E.g. if there is a collection ancestor that is null then no
+  /// value is written into 'tuple'.
+  template <bool IN_COLLECTION>
+  bool ReadValue(MemPool* pool, Tuple* tuple, bool* read_row);
+};
+} // namespace impala
\ No newline at end of file
diff --git a/fe/src/main/java/org/apache/impala/analysis/SlotRef.java b/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
index 266e4da1c..ac3ceaaf4 100644
--- a/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/SlotRef.java
@@ -185,9 +185,9 @@ public class SlotRef extends Expr {
       }
       FeFsTable feTable = (FeFsTable)rootTable;
       for (HdfsFileFormat format : feTable.getFileFormats()) {
-        if (format != HdfsFileFormat.ORC) {
-          throw new AnalysisException("Querying STRUCT is only supported for ORC file " +
-              "format.");
+        if (format != HdfsFileFormat.ORC && format != HdfsFileFormat.PARQUET) {
+          throw new AnalysisException("Querying STRUCT is only supported for ORC and " +
+              "Parquet file formats.");
         }
       }
     }
diff --git a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
index 78577769d..295925e97 100644
--- a/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/HdfsScanNode.java
@@ -1005,9 +1005,19 @@ public class HdfsScanNode extends ScanNode {
     // at-a-time. Expect a single slot conjunct to be associated with a single tuple-id.
     if (slotIds.size() != 1) return;
 
+    SlotDescriptor firstSlotDesc = analyzer.getSlotDesc(slotIds.get(0));
     // Check to see if this slot is a collection type. Dictionary pruning is applicable
     // to scalar values nested in collection types, not enclosing collection types.
-    if (analyzer.getSlotDesc(slotIds.get(0)).getType().isCollectionType()) return;
+    if (firstSlotDesc.getType().isCollectionType()) return;
+
+    // If any of the slot descriptors affected by 'conjunct' happens to be a scalar
+    // member of a struct, where the struct is also given in the select list, then skip
+    // dictionary filtering, as the slot/tuple IDs in the conjunct would mismatch the
+    // ones in the select list and would result in a Precondition check failure later.
+    for (SlotId slotId : slotIds) {
+      SlotDescriptor slotDesc = analyzer.getSlotDesc(slotId);
+      if (IsMemberOfStructInSelectList(slotDesc)) return;
+    }
 
     // Check to see if this conjunct contains any known randomized function
     if (conjunct.contains(Expr.IS_NONDETERMINISTIC_BUILTIN_FN_PREDICATE)) return;
@@ -1038,6 +1048,27 @@ public class HdfsScanNode extends ScanNode {
     slotList.add(conjunctIdx);
   }
 
+  /**
+   * Checks if 'slotDesc' is a member of a struct slot where the struct slot is given in
+   * the select list.
+   */
+  private boolean IsMemberOfStructInSelectList(SlotDescriptor slotDesc) {
+    SlotDescriptor parentStructSlot = slotDesc.getParent().getParentSlotDesc();
+    // Check if 'slotDesc' is a member of a struct slot descriptor.
+    if (slotDesc.getType().isScalarType() && parentStructSlot == null) return false;
+
+    if (slotDesc.getType().isStructType()) {
+      // Check if the struct is in the select list.
+      for (SlotDescriptor scannedSlots : desc_.getSlots()) {
+        if (scannedSlots.getId() == slotDesc.getId()) return true;
+      }
+    }
+
+    // Recursively check the parent struct if it's given in the select list.
+    if (parentStructSlot != null) return IsMemberOfStructInSelectList(parentStructSlot);
+    return false;
+  }
+
   /**
    * Walks through conjuncts_ and collectionConjuncts_ and populates
    * dictionaryFilterConjuncts_.
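The planner-side guard above walks from a scalar slot up through its enclosing struct slots and rejects dictionary filtering if any of them appears in the select list. A language-agnostic sketch of that walk, rendered in C++ to match the other examples here (all names are hypothetical stand-ins for the Java planner types):

#include <set>

// Hypothetical model: a slot is either a plain scalar, or hangs off an
// enclosing struct slot via 'parent_struct'.
struct SlotModel {
  int id = 0;
  bool is_struct = false;
  const SlotModel* parent_struct = nullptr;  // enclosing struct slot, if any
};

bool IsMemberOfStructInSelectList(const SlotModel& slot,
    const std::set<int>& select_list_slot_ids) {
  // Plain scalar with no enclosing struct: dictionary filtering stays allowed.
  if (!slot.is_struct && slot.parent_struct == nullptr) return false;
  // A struct slot that is itself selected disqualifies the conjunct.
  if (slot.is_struct && select_list_slot_ids.count(slot.id) > 0) return true;
  // Otherwise keep climbing the chain of enclosing structs.
  if (slot.parent_struct != nullptr) {
    return IsMemberOfStructInSelectList(*slot.parent_struct, select_list_slot_ids);
  }
  return false;
}

int main() {
  SlotModel outer{7, true, nullptr};
  SlotModel member{9, false, &outer};
  std::set<int> selected{7};  // the struct itself is in the select list
  return IsMemberOfStructInSelectList(member, selected) ? 0 : 1;  // returns 0
}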
@@ -1992,6 +2023,7 @@ public class HdfsScanNode extends ScanNode {
       List<Expr> conjuncts;
       TupleDescriptor tupleDescriptor = entry.getKey();
       String tupleName = "";
+
       if (tupleDescriptor == getTupleDesc()) {
         conjuncts = conjuncts_;
       } else {
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
index 2a585c371..43c925057 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeStmtsTest.java
@@ -782,8 +782,8 @@ public class AnalyzeStmtsTest extends AnalyzerTest {
     // Check the support of struct in the select list for different file formats.
     AnalysisContext ctx = createAnalysisCtx();
     ctx.getQueryOptions().setDisable_codegen(true);
-    AnalysisError("select alltypes from functional_parquet.complextypes_structs", ctx,
-        "Querying STRUCT is only supported for ORC file format.");
+    AnalysisError("select int_struct_col from functional.allcomplextypes", ctx,
+        "Querying STRUCT is only supported for ORC and Parquet file formats.");
     AnalyzesOk("select alltypes from functional_orc_def.complextypes_structs", ctx);
 
     // Check if a struct in the select list raises an error if it contains collections.
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index 996591da2..012783f37 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -811,7 +811,7 @@ year int
 month int
---- COLUMNS
 id int
-struct_val struct<bool_col:boolean,tinyint_col:tinyint,smallint_col:smallint,int_col:int,bigint_col:bigint,float_col:float,double_col:double,date_string_col:string,string_col:string,timestamp_col:timestamp>
+struct_val struct<bool_col:boolean,tinyint_col:tinyint,smallint_col:smallint,int_col:int,bigint_col:bigint,float_col:float,double_col:double,date_string_col:string,string_col:string>
---- DEPENDENT_LOAD_HIVE
 INSERT INTO {db_name}{db_suffix}.{table_name}
 PARTITION (year, month)
 SELECT id, named_struct(
   'bool_col', bool_col,
   'tinyint_col', tinyint_col,
   'smallint_col', smallint_col,
   'int_col', int_col,
   'bigint_col', bigint_col,
   'float_col', float_col,
   'double_col', double_col,
   'date_string_col', date_string_col,
-  'string_col', string_col,
-  'timestamp_col', timestamp_col),
+  'string_col', string_col),
 year, month
-FROM {db_name}{db_suffix}.alltypes;
+FROM {db_name}.alltypes;
---- LOAD
====
---- DATASET
diff --git a/testdata/workloads/functional-query/queries/QueryTest/nested-struct-in-select-list.test b/testdata/workloads/functional-query/queries/QueryTest/nested-struct-in-select-list.test
index 95676c5e3..9063d0e92 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/nested-struct-in-select-list.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/nested-struct-in-select-list.test
@@ -2,7 +2,7 @@
 ---- QUERY
 # Select a struct that contains multiple structs.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs;
+from complextypes_nested_structs;
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
 2,'{"str":"str","inner_struct1":null,"inner_struct2":{"i":100,"str":"str3"},"inner_struct3":{"s":{"i":321,"s":"dfgs"}}}'
@@ -15,7 +15,7 @@ INT,STRING
 ---- QUERY
 # Select a struct that contains multiple structs using a filter on a non-struct field.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 where id > 2;
 ---- RESULTS
 3,'NULL'
@@ -27,7 +27,7 @@ INT,STRING
 ---- QUERY
 # Select a struct that contains multiple structs using a filter on a struct field.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 where length(outer_struct.str) > 3;
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
@@ -37,7 +37,7 @@ INT,STRING
 ---- QUERY
 # Select a nested struct with an order by.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 order by id;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
@@ -51,7 +51,7 @@ INT,STRING
 ---- QUERY
 # Select a nested struct with an order by.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 order by id desc;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 5,'{"str":null,"inner_struct1":null,"inner_struct2":null,"inner_struct3":null}'
@@ -65,7 +65,7 @@ INT,STRING
 ---- QUERY
 # Select the same nested struct multiple times in one query.
 select id, outer_struct, outer_struct
-from functional_orc_def.complextypes_nested_structs;
+from complextypes_nested_structs;
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}','{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
 2,'{"str":"str","inner_struct1":null,"inner_struct2":{"i":100,"str":"str3"},"inner_struct3":{"s":{"i":321,"s":"dfgs"}}}','{"str":"str","inner_struct1":null,"inner_struct2":{"i":100,"str":"str3"},"inner_struct3":{"s":{"i":321,"s":"dfgs"}}}'
@@ -78,7 +78,7 @@ INT,STRING,STRING
 ---- QUERY
 # Select the same nested struct multiple times in one query and order the results.
 select id, outer_struct, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 order by id desc;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 5,'{"str":null,"inner_struct1":null,"inner_struct2":null,"inner_struct3":null}','{"str":null,"inner_struct1":null,"inner_struct2":null,"inner_struct3":null}'
@@ -93,7 +93,7 @@ INT,STRING,STRING
 # Similar to the above query but here the 'id' field is not in the select list but still
 # used in the order by.
 select outer_struct, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 order by id desc;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 '{"str":null,"inner_struct1":null,"inner_struct2":null,"inner_struct3":null}','{"str":null,"inner_struct1":null,"inner_struct2":null,"inner_struct3":null}'
@@ -105,9 +105,18 @@ STRING,STRING
 ====
 ---- QUERY
+# Select an inner struct where the outer struct is null.
+select outer_struct.inner_struct1 from complextypes_nested_structs
+where id = 3;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
 # WITH clause creates an inline view containing a nested struct.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct from sub;
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
@@ -122,7 +131,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct and we query a nested
 # field.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.inner_struct2 from sub;
 ---- RESULTS
 1,'{"i":333222111,"str":"somestr3"}'
@@ -137,7 +146,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct and we query a doubly
 # nested field.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.inner_struct2.i from sub;
 ---- RESULTS
 1,333222111
@@ -153,7 +162,7 @@ INT,INT
 # that is a struct itself; query both in the main select statement.
 with sub as (
   select id, outer_struct, outer_struct.inner_struct3 inner3
-  from functional_orc_def.complextypes_nested_structs)
+  from complextypes_nested_structs)
 select id, outer_struct, inner3 from sub;
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}','{"s":{"i":112288,"s":null}}'
@@ -170,7 +179,7 @@ INT,STRING,STRING
 # 'outer_struct' as well as 'inner3' in the main select statement.
 with sub as (
   select id, outer_struct, outer_struct.inner_struct3 inner3
-  from functional_orc_def.complextypes_nested_structs)
+  from complextypes_nested_structs)
 select id, outer_struct.inner_struct2, inner3 from sub;
 ---- RESULTS
 1,'{"i":333222111,"str":"somestr3"}','{"s":{"i":112288,"s":null}}'
@@ -187,7 +196,7 @@ INT,STRING,STRING
 # 'outer_struct' as well as a member of 'inner3' in the main select statement.
 with sub as (
   select id, outer_struct, outer_struct.inner_struct3 inner3
-  from functional_orc_def.complextypes_nested_structs)
+  from complextypes_nested_structs)
 select id, outer_struct.inner_struct2, inner3.s from sub;
 ---- RESULTS
 1,'{"i":333222111,"str":"somestr3"}','{"i":112288,"s":null}'
@@ -201,7 +210,7 @@ INT,STRING,STRING
 ---- QUERY
 # WITH clause creates an inline view containing a nested struct; we order by id.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.inner_struct2 from sub order by sub.id desc;
 ---- RESULTS
 5,'NULL'
@@ -216,7 +225,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct; we order by a nested
 # field.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.inner_struct2 from sub
 order by sub.outer_struct.inner_struct2.i, sub.id;
 ---- RESULTS
@@ -232,7 +241,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct; we order by a nested
 # field that is not present in the select list.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.inner_struct1 from sub
 order by sub.outer_struct.inner_struct2.i, sub.id;
 ---- RESULTS
@@ -248,7 +257,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct; filter by a struct field
 # from the inline view.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select sub.id, sub.outer_struct.str
 from sub
 where length(sub.outer_struct.str) < 4;
@@ -263,7 +272,7 @@ INT,STRING
 # filter by one of its fields.
 with sub as (
   select id, outer_struct.inner_struct1 s1, outer_struct.inner_struct2 s2
-  from functional_orc_def.complextypes_nested_structs)
+  from complextypes_nested_structs)
 select sub.id, s2
 from sub
 where length(s2.str) < 8;
@@ -277,7 +286,7 @@ INT,STRING
 # WITH clause creates an inline view containing a nested struct; filter by a struct field
 # from the inline view but do not select anything from it.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select 1
 from sub
 where length(sub.outer_struct.str) < 4;
@@ -292,7 +301,7 @@ TINYINT
 # the inline view and ordering by a non-complex item from the view.
 with sub as (
   select id, outer_struct
-  from functional_orc_def.complextypes_nested_structs
+  from complextypes_nested_structs
   where length(outer_struct.str) > 3)
 select sub.id, sub.outer_struct from sub order by sub.id desc;
 ---- RESULTS
@@ -303,7 +312,7 @@ INT,STRING
 ---- QUERY
 # Two-level inline view, querying a struct filed.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select id, s from (select id, outer_struct.inner_struct1 as s from sub) v
 order by id;
 ---- RESULTS
@@ -318,7 +327,7 @@ INT,STRING
 ---- QUERY
 # Two-level inline view, querying the top level struct.
 with sub as (
-  select id, outer_struct from functional_orc_def.complextypes_nested_structs)
+  select id, outer_struct from complextypes_nested_structs)
 select id, s from (select id, outer_struct as s from sub) v
 order by id;
 ---- RESULTS
@@ -335,7 +344,7 @@ INT,STRING
 # when the struct fields in the predicate are itemTupleDescriptors within the struct(s),
 # not in the main tuple.
 select id, outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 where outer_struct.inner_struct2.i > length(outer_struct.str)
 ---- RESULTS
 1,'{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}'
@@ -349,7 +358,7 @@ INT,STRING
 # enclosing struct is embedded in another one, but the top level struct is not present in
 # the query.
 select id, outer_struct.inner_struct2.i, outer_struct.inner_struct2
-from functional_orc_def.complextypes_nested_structs;
+from complextypes_nested_structs;
 ---- RESULTS
 1,333222111,'{"i":333222111,"str":"somestr3"}'
 2,100,'{"i":100,"str":"str3"}'
@@ -363,8 +372,8 @@ INT,INT,STRING
 # An inner join where struct fields are in the join condition and their parent struct is
 # in the select list.
 select a.outer_struct, b.small_struct
-from functional_orc_def.complextypes_nested_structs a
-  inner join functional_orc_def.complextypes_structs b
+from complextypes_nested_structs a
+  inner join complextypes_structs b
   on b.small_struct.i = a.outer_struct.inner_struct2.i + 19091;
 ---- RESULTS
 '{"str":"str","inner_struct1":null,"inner_struct2":{"i":100,"str":"str3"},"inner_struct3":{"s":{"i":321,"s":"dfgs"}}}','{"i":19191,"s":"small_struct_str"}'
@@ -375,8 +384,8 @@ STRING,STRING
 # An outer join where struct fields are in the join condition and their parent struct is
 # in the select list.
 select a.outer_struct, b.small_struct
-from functional_orc_def.complextypes_nested_structs a
-  full outer join functional_orc_def.complextypes_structs b
+from complextypes_nested_structs a
+  full outer join complextypes_structs b
   on b.small_struct.i = a.outer_struct.inner_struct2.i + 19091;
 ---- RESULTS
 '{"str":"somestr1","inner_struct1":{"str":"somestr2","de":12345.12},"inner_struct2":{"i":333222111,"str":"somestr3"},"inner_struct3":{"s":{"i":112288,"s":null}}}','NULL'
@@ -394,7 +403,7 @@ STRING,STRING
 ====
 ---- QUERY
 # Checks that "SELECT nested_struct.* ..." omits the nested structs from the output.
-select id, outer_struct.* from functional_orc_def.complextypes_nested_structs;
+select id, outer_struct.* from complextypes_nested_structs;
 ---- RESULTS
 1,'somestr1'
 2,'str'
@@ -406,8 +415,8 @@ INT,STRING
 ====
 ---- QUERY
 # IMPALA-10839: Display nulls at the correct level.
-select id, outer_struct.inner_struct3 from
-functional_orc_def.complextypes_nested_structs;
+select id, outer_struct.inner_struct3
+from complextypes_nested_structs;
 ---- RESULTS
 1,'{"s":{"i":112288,"s":null}}'
 2,'{"s":{"i":321,"s":"dfgs"}}'
@@ -419,8 +428,8 @@ INT,STRING
 ====
 ---- QUERY
 # IMPALA-10839: Display nulls at the correct level.
-select id, outer_struct.inner_struct3.s from
-functional_orc_def.complextypes_nested_structs;
+select id, outer_struct.inner_struct3.s
+from complextypes_nested_structs;
 ---- RESULTS
 1,'{"i":112288,"s":null}'
 2,'{"i":321,"s":"dfgs"}'
@@ -434,10 +443,10 @@ INT,STRING
 ====
 ---- QUERY
 # Subquery that returns a complex type is not supported.
 # IMPALA-9500
 select outer_struct
-from functional_orc_def.complextypes_nested_structs
+from complextypes_nested_structs
 where outer_struct in
-(select outer_struct from functional_orc_def.complextypes_nested_structs);
+(select outer_struct from functional_parquet.complextypes_nested_structs);
 ---- CATCH
-AnalysisException: A subquery can't return complex types. (SELECT outer_struct FROM functional_orc_def.complex
+AnalysisException: A subquery can't return complex types. (SELECT outer_struct FROM functional_parquet.complex
 types_nested_structs)
 ====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-ambiguous-list-legacy.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-ambiguous-list-legacy.test
index 2ae67bc43..22089e616 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-ambiguous-list-legacy.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-ambiguous-list-legacy.test
@@ -48,15 +48,16 @@ int,int,int,int
 ====
 ---- QUERY
 # Incorrect field resolutions with THREE_LEVEL and POSITION
-# because the data file uses the 2-level encoding.
+# because the data file uses the 2-level encoding. We now get an error because the
+# struct part cannot be resolved.
+# The full error message is the following:
+# File '/ambig_legacy/AmbiguousList_Legacy.parquet' has an incompatible
+# Parquet schema for column '.ambig_legacy.ambigarray.item.s2'. Column type:
+# struct<f21:int,f22:int>, Parquet schema: optional int32 f21 [i:0 d:2 r:1]
 set parquet_fallback_schema_resolution=position;
 set parquet_array_resolution=three_level;
 select f11, f12, s2.f21, s2.f22 from ambig_legacy.ambigarray;
----- RESULTS
-22,NULL,NULL,NULL
-220,NULL,NULL,NULL
----- TYPES
-int,int,int,int
+---- CATCH
+has an incompatible Parquet schema for column
 ====
 ---- QUERY
 # All fields are interpreted as missing with THREE_LEVEL and NAME.
diff --git a/testdata/workloads/functional-query/queries/QueryTest/struct-in-select-list.test b/testdata/workloads/functional-query/queries/QueryTest/struct-in-select-list.test
index 74a369784..deddd5159 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/struct-in-select-list.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/struct-in-select-list.test
@@ -1,7 +1,7 @@
 ====
 ---- QUERY
 # Select a simple struct with one bool member.
-select id, tiny_struct from functional_orc_def.complextypes_structs;
+select id, tiny_struct from complextypes_structs;
 ---- RESULTS
 1,'{"b":true}'
 2,'{"b":false}'
@@ -14,7 +14,7 @@ INT,STRING
 ====
 ---- QUERY
 # Similar query as above but with an order by.
-select id, tiny_struct from functional_orc_def.complextypes_structs order by id;
+select id, tiny_struct from complextypes_structs order by id;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 1,'{"b":true}'
 2,'{"b":false}'
@@ -29,7 +29,7 @@ INT,STRING
 # Ordering by a member of the struct.
 # Forced to use a SORT node instead of a TOPN.
 set disable_outermost_topn = 1;
-select id, alltypes from functional_orc_def.complextypes_structs
+select id, alltypes from complextypes_structs
 order by alltypes.ti;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 4,'{"ti":90,"si":30482,"i":1664336,"bi":23567459873,"b":true,"f":0.5600000023841858,"do":NaN,"da":"2000-12-31","ts":"2024-01-01 00:00:00.123400000","s1":"random string","s2":"","c1":"c","c2":"d  ","vc":"addsdrr","de1":33357,"de2":null}'
@@ -44,7 +44,7 @@ INT,STRING
 ---- QUERY
 # Querying two simple structs. There is a string in one of them and also a non-struct
 # string in the select list.
-select id, str, tiny_struct, small_struct from functional_orc_def.complextypes_structs;
+select id, str, tiny_struct, small_struct from complextypes_structs;
 ---- RESULTS
 1,'first item','{"b":true}','NULL'
 2,'second item','{"b":false}','{"i":19191,"s":"small_struct_str"}'
@@ -58,7 +58,7 @@ INT,STRING,STRING,STRING
 ---- QUERY
 # Similar query as above but with an order by.
 select id, str, tiny_struct, small_struct
-from functional_orc_def.complextypes_structs
+from complextypes_structs
 order by id;
 ---- RESULTS: VERIFY_IS_EQUAL_SORTED
 1,'first item','{"b":true}','NULL'
@@ -72,7 +72,7 @@ INT,STRING,STRING,STRING
 ====
 ---- QUERY
 # Querying the same struct multiple times in one query.
-select id, small_struct, small_struct from functional_orc_def.complextypes_structs;
+select id, small_struct, small_struct from complextypes_structs;
 ---- RESULTS
 1,'NULL','NULL'
 2,'{"i":19191,"s":"small_struct_str"}','{"i":19191,"s":"small_struct_str"}'
@@ -87,7 +87,7 @@ INT,STRING,STRING
 # The same struct multiple times in the select list where there is an ordering in the
 # results.
select id, tiny_struct, tiny_struct
-from functional_orc_def.complextypes_structs
+from complextypes_structs
order by id desc;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
6,'NULL','NULL'
@@ -103,7 +103,7 @@ INT,STRING,STRING
# Similar to the above query but here the 'id' field is not in the select list but still
# used in the order by.
select tiny_struct, tiny_struct
-from functional_orc_def.complextypes_structs
+from complextypes_structs
order by id desc;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
'NULL','NULL'
@@ -120,7 +120,7 @@ STRING,STRING
# There are multiple string columns to check that none of them overwrites the other.
# There is a row where all the children of the struct are null but the struct is non
# null. Another row holds a struct that is itself null.
-select id, str, alltypes from functional_orc_def.complextypes_structs;
+select id, str, alltypes from complextypes_structs;
---- RESULTS
1,'first item','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -133,7 +133,7 @@ INT,STRING,STRING
====
---- QUERY
# Similar query as above but with an order by.
-select id, str, alltypes from functional_orc_def.complextypes_structs order by id;
+select id, str, alltypes from complextypes_structs order by id;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
1,'first item','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -146,7 +146,7 @@ INT,STRING,STRING
====
---- QUERY
# Similar query as above but with an order by desc.
-select id, str, alltypes from functional_orc_def.complextypes_structs order by id desc;
+select id, str, alltypes from complextypes_structs order by id desc;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
6,'sixth item','{"ti":127,"si":100,"i":234732212,"bi":664233223342,"b":true,"f":34.56000137329102,"do":99523423.33,"da":"1985-11-19","ts":"2020-09-15 03:11:22","s1":"string1","s2":"string2","c1":"z","c2":" ","vc":"cv","de1":346,"de2":6235.600}'
5,'fifth item','NULL'
@@ -160,7 +160,7 @@ INT,STRING,STRING
---- QUERY
# Setting BATCH_SIZE to force the results to fit in multiple row batches.
set BATCH_SIZE=2;
-select id, str, alltypes from functional_orc_def.complextypes_structs;
+select id, str, alltypes from complextypes_structs;
---- RESULTS
1,'first item','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -175,7 +175,7 @@ INT,STRING,STRING
# Querying struct in the select list and filter on one member of the struct.
set BATCH_SIZE=0;
select id, str, alltypes
-from functional_orc_def.complextypes_structs
+from complextypes_structs
where alltypes.b = true;
---- RESULTS
1,'first item','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
@@ -185,8 +185,19 @@ where alltypes.b = true;
INT,STRING,STRING
====
---- QUERY
+# Filter on a column that is not a member of a struct. This triggers late materialization
+# where the SkipRows() functions of the struct readers could be exercised.
+select id, alltypes
+from complextypes_structs
+where id = 1;
+---- RESULTS
+1,'{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
+---- TYPES
+INT,STRING
+====
+---- QUERY
# Query a single struct slot.
-select alltypes from functional_orc_def.complextypes_structs;
+select alltypes from complextypes_structs;
---- RESULTS
'{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
'{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -199,7 +210,7 @@ STRING
====
---- QUERY
# Query a single struct slot and order by a member of the struct.
-select alltypes from functional_orc_def.complextypes_structs order by alltypes.si;
+select alltypes from complextypes_structs order by alltypes.si;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
'{"ti":127,"si":100,"i":234732212,"bi":664233223342,"b":true,"f":34.56000137329102,"do":99523423.33,"da":"1985-11-19","ts":"2020-09-15 03:11:22","s1":"string1","s2":"string2","c1":"z","c2":" ","vc":"cv","de1":346,"de2":6235.600}'
'{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -212,7 +223,7 @@ STRING
====
---- QUERY
# Query struct slots only.
-select small_struct, alltypes from functional_orc_def.complextypes_structs;
+select small_struct, alltypes from complextypes_structs;
---- RESULTS
'NULL','{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
'{"i":19191,"s":"small_struct_str"}','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -226,8 +237,8 @@ STRING,STRING
---- QUERY
# Query struct slot in a join query.
select allt.id, comt.alltypes
-from functional_orc_def.alltypes allt
-join functional_orc_def.complextypes_structs comt on allt.id = comt.id;
+from alltypes allt
+join complextypes_structs comt on allt.id = comt.id;
---- RESULTS
1,'{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
2,'{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -241,8 +252,8 @@ INT,STRING
---- QUERY
# Similar join query as above but with different join order.
select allt.id, comt.alltypes
-from functional_orc_def.complextypes_structs comt
-join functional_orc_def.alltypes allt on comt.id = allt.id;
+from complextypes_structs comt
+join alltypes allt on comt.id = allt.id;
---- RESULTS
1,'{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}'
2,'{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -256,7 +267,7 @@ INT,STRING
---- QUERY
# Querying IS NULL on a member of a struct.
select id, str, alltypes
-from functional_orc_def.complextypes_structs
+from complextypes_structs
where alltypes.da is null;
---- RESULTS
2,'second item','{"ti":123,"si":4567,"i":1562322212,"bi":334333345342,"b":false,"f":NaN,"do":23233423.099,"da":null,"ts":"2020-06-11 12:10:04","s1":null,"s2":"NULL","c1":"a","c2":"ab ","vc":"varchar","de1":11223,"de2":null}'
@@ -270,7 +281,7 @@ INT,STRING,STRING
# in the FROM clause. This also triggers a re-analysis of the statement as the table is
# full ACID.
select inner_arr.ITEM, inner_arr.ITEM.e, inner_arr.ITEM.f
-from functional_orc_def.complextypestbl.nested_struct.c.d.ITEM as inner_arr;
+from complextypestbl.nested_struct.c.d.ITEM as inner_arr;
---- RESULTS
'{"e":-1,"f":"nonnullable"}',-1,'nonnullable'
'{"e":10,"f":"aaa"}',10,'aaa'
@@ -288,7 +299,7 @@ from functional_orc_def.complextypestbl.nested_struct.c.d.ITEM as inner_arr;
STRING,INT,STRING
====
---- QUERY
-# Similar to the above, but on a non-transactional version of the table.
+# Similar to the above, but on a non-transactional ORC version of the table.
# Regression test for IMPALA-11011.
select inner_arr.ITEM
from functional_orc_def.complextypestbl_non_transactional.nested_struct.c.d.ITEM as inner_arr;
@@ -312,7 +323,7 @@ STRING
# Querying a struct that is inside a nested array. Referencing the inner array through a
# join with the base table.
select tbl.id, inner_arr.ITEM
-from functional_orc_def.complextypestbl tbl, tbl.nested_struct.c.d.ITEM as inner_arr;
+from complextypestbl tbl, tbl.nested_struct.c.d.ITEM as inner_arr;
---- RESULTS
8,'{"e":-1,"f":"nonnullable"}'
1,'{"e":10,"f":"aaa"}'
@@ -333,7 +344,7 @@ BIGINT,STRING
# Querying a struct that is inside a nested array. Used 2 joins to reference the inner
# array from the FROM clause.
select tbl.id, inner_arr.ITEM
-from functional_orc_def.complextypestbl tbl,
+from complextypestbl tbl,
tbl.nested_struct.c.d as outer_arr, outer_arr.ITEM as inner_arr;
---- RESULTS
8,'{"e":-1,"f":"nonnullable"}'
@@ -355,7 +366,7 @@ BIGINT,STRING
# Querying a struct that is inside a nested array. Used different kind of joins to
# reference the inner array from the FROM clause.
select tbl.id, inner_arr.ITEM
-from functional_orc_def.complextypestbl tbl left join
+from complextypestbl tbl left join
tbl.nested_struct.c.d as outer_arr inner join outer_arr.ITEM as inner_arr;
---- RESULTS
8,'{"e":-1,"f":"nonnullable"}'
@@ -376,7 +387,7 @@ BIGINT,STRING
---- QUERY
# Similar query as above but with an order by.
select tbl.id, inner_arr.ITEM
-from functional_orc_def.complextypestbl tbl,
+from complextypestbl tbl,
tbl.nested_struct.c.d as outer_arr, outer_arr.ITEM as inner_arr
order by tbl.id;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
@@ -398,7 +409,7 @@ BIGINT,STRING
---- QUERY
# Structs are allowed in an inline view.
select v.ts from
- (select tiny_struct as ts from functional_orc_def.complextypes_structs) v
+ (select tiny_struct as ts from complextypes_structs) v
---- RESULTS
'{"b":true}'
'{"b":false}'
@@ -414,13 +425,13 @@ STRING
select v.ts from
(select int_struct_col as ts from functional.allcomplextypes) v
---- CATCH
-AnalysisException: Querying STRUCT is only supported for ORC file format.
+AnalysisException: Querying STRUCT is only supported for ORC and Parquet file formats.
====
---- QUERY
# Structs in an inline view with order by.
select v.id, v.ts from
(select id, tiny_struct as ts
- from functional_orc_def.complextypes_structs
+ from complextypes_structs
order by id
limit 3) v
---- RESULTS
@@ -433,7 +444,7 @@ INT,STRING
---- QUERY
select v.id, v.ts from
(select id, tiny_struct as ts
- from functional_orc_def.complextypes_structs
+ from complextypes_structs
order by id
limit 3) v
order by id desc
@@ -447,7 +458,7 @@ INT,STRING
---- QUERY
select v.id, v.ts from
(select id, tiny_struct as ts
- from functional_orc_def.complextypes_structs) v
+ from complextypes_structs) v
order by id desc
---- RESULTS: VERIFY_IS_EQUAL_SORTED
6,'NULL'
@@ -461,9 +472,11 @@ INT,STRING
====
---- QUERY
# CREATE VIEW AS SELECT where the select returns struct.
-create view $DATABASE.struct_view as select id, small_struct
-from functional_orc_def.complextypes_structs;
-select id, small_struct from $DATABASE.struct_view;
+drop view if exists struct_view;
+create view struct_view as
+ select id, small_struct
+ from complextypes_structs;
+select id, small_struct from struct_view;
---- RESULTS
1,'NULL'
2,'{"i":19191,"s":"small_struct_str"}'
@@ -476,7 +489,7 @@ INT,STRING
====
---- QUERY
# WITH clause creates an inline view containing a struct.
-with sub as (select id, small_struct from functional_orc_def.complextypes_structs)
+with sub as (select id, small_struct from complextypes_structs)
select sub.id, sub.small_struct from sub;
---- RESULTS
1,'NULL'
@@ -493,7 +506,7 @@ INT,STRING
# view and ordering by a non-complex item from the view.
with sub as (
select id, small_struct
- from functional_orc_def.complextypes_structs
+ from complextypes_structs
where small_struct.i > 19200)
select sub.id, sub.small_struct from sub order by sub.id desc;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
@@ -504,8 +517,9 @@ INT,STRING
====
---- QUERY
# Create a view containing structs and query the view.
+drop view if exists tmp_view;
create view tmp_view as
- select id, str, tiny_struct, alltypes from functional_orc_def.complextypes_structs;
+ select id, str, tiny_struct, alltypes from complextypes_structs;
select id, alltypes, tiny_struct from tmp_view;
---- RESULTS
1,'{"ti":100,"si":12348,"i":156789012,"bi":163234345342,"b":true,"f":1234.56005859375,"do":65323423.33,"da":"2021-05-30","ts":"2021-06-01 10:19:04","s1":"some string","s2":"another str","c1":"x","c2":"xyz","vc":"somevarcha","de1":12345,"de2":null}','{"b":true}'
@@ -520,33 +534,33 @@ INT,STRING,STRING
---- QUERY
# Query a struct from a partitioned table to check multi-fragment execution.
set disable_outermost_topn = 1;
-select id, struct_val from functional_orc_def.alltypes_structs order by id desc limit 5;
+select id, struct_val from alltypes_structs order by id desc limit 5;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
-7299,'{"bool_col":false,"tinyint_col":9,"smallint_col":9,"int_col":9,"bigint_col":90,"float_col":9.899999618530273,"double_col":90.89999999999999,"date_string_col":"12/31/10","string_col":"9","timestamp_col":"2010-12-31 05:09:13.860000000"}'
-7298,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/31/10","string_col":"8","timestamp_col":"2010-12-31 05:08:13.780000000"}'
-7297,'{"bool_col":false,"tinyint_col":7,"smallint_col":7,"int_col":7,"bigint_col":70,"float_col":7.699999809265137,"double_col":70.7,"date_string_col":"12/31/10","string_col":"7","timestamp_col":"2010-12-31 05:07:13.710000000"}'
-7296,'{"bool_col":true,"tinyint_col":6,"smallint_col":6,"int_col":6,"bigint_col":60,"float_col":6.599999904632568,"double_col":60.59999999999999,"date_string_col":"12/31/10","string_col":"6","timestamp_col":"2010-12-31 05:06:13.650000000"}'
-7295,'{"bool_col":false,"tinyint_col":5,"smallint_col":5,"int_col":5,"bigint_col":50,"float_col":5.5,"double_col":50.5,"date_string_col":"12/31/10","string_col":"5","timestamp_col":"2010-12-31 05:05:13.600000000"}'
+7299,'{"bool_col":false,"tinyint_col":9,"smallint_col":9,"int_col":9,"bigint_col":90,"float_col":9.899999618530273,"double_col":90.89999999999999,"date_string_col":"12/31/10","string_col":"9"}'
+7298,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/31/10","string_col":"8"}'
+7297,'{"bool_col":false,"tinyint_col":7,"smallint_col":7,"int_col":7,"bigint_col":70,"float_col":7.699999809265137,"double_col":70.7,"date_string_col":"12/31/10","string_col":"7"}'
+7296,'{"bool_col":true,"tinyint_col":6,"smallint_col":6,"int_col":6,"bigint_col":60,"float_col":6.599999904632568,"double_col":60.59999999999999,"date_string_col":"12/31/10","string_col":"6"}'
+7295,'{"bool_col":false,"tinyint_col":5,"smallint_col":5,"int_col":5,"bigint_col":50,"float_col":5.5,"double_col":50.5,"date_string_col":"12/31/10","string_col":"5"}'
---- TYPES
INT,STRING
====
---- QUERY
# Query the same struct multiple times from a partitioned table.
-select id, struct_val, struct_val from functional_orc_def.alltypes_structs order by id limit 2;
+select id, struct_val, struct_val from alltypes_structs order by id limit 2;
---- RESULTS: VERIFY_IS_EQUAL_SORTED
-0,'{"bool_col":true,"tinyint_col":0,"smallint_col":0,"int_col":0,"bigint_col":0,"float_col":0,"double_col":0,"date_string_col":"01/01/09","string_col":"0","timestamp_col":"2009-01-01 00:00:00"}','{"bool_col":true,"tinyint_col":0,"smallint_col":0,"int_col":0,"bigint_col":0,"float_col":0,"double_col":0,"date_string_col":"01/01/09","string_col":"0","timestamp_col":"2009-01-01 00:00:00"}'
-1,'{"bool_col":false,"tinyint_col":1,"smallint_col":1,"int_col":1,"bigint_col":10,"float_col":1.100000023841858,"double_col":10.1,"date_string_col":"01/01/09","string_col":"1","timestamp_col":"2009-01-01 00:01:00"}','{"bool_col":false,"tinyint_col":1,"smallint_col":1,"int_col":1,"bigint_col":10,"float_col":1.100000023841858,"double_col":10.1,"date_string_col":"01/01/09","string_col":"1","timestamp_col":"2009-01-01 00:01:00"}'
+0,'{"bool_col":true,"tinyint_col":0,"smallint_col":0,"int_col":0,"bigint_col":0,"float_col":0,"double_col":0,"date_string_col":"01/01/09","string_col":"0"}','{"bool_col":true,"tinyint_col":0,"smallint_col":0,"int_col":0,"bigint_col":0,"float_col":0,"double_col":0,"date_string_col":"01/01/09","string_col":"0"}'
+1,'{"bool_col":false,"tinyint_col":1,"smallint_col":1,"int_col":1,"bigint_col":10,"float_col":1.100000023841858,"double_col":10.1,"date_string_col":"01/01/09","string_col":"1"}','{"bool_col":false,"tinyint_col":1,"smallint_col":1,"int_col":1,"bigint_col":10,"float_col":1.100000023841858,"double_col":10.1,"date_string_col":"01/01/09","string_col":"1"}'
---- TYPES
INT,STRING,STRING
====
---- QUERY
# Query struct from a partitioned table with where clause on the struct's members.
select id, struct_val
-from functional_orc_def.alltypes_structs
-where struct_val.tinyint_col=8 and struct_val.timestamp_col > "2010-12-30";
+from alltypes_structs
+where struct_val.tinyint_col=8 and strleft(struct_val.date_string_col, 5) = "12/30";
---- RESULTS
-7288,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/30/10","string_col":"8","timestamp_col":"2010-12-30 04:58:13.330000000"}'
-7298,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/31/10","string_col":"8","timestamp_col":"2010-12-31 05:08:13.780000000"}'
+7288,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/30/10","string_col":"8"}'
+3638,'{"bool_col":true,"tinyint_col":8,"smallint_col":8,"int_col":8,"bigint_col":80,"float_col":8.800000190734863,"double_col":80.8,"date_string_col":"12/30/09","string_col":"8"}'
---- TYPES
INT,STRING
====
@@ -562,13 +576,13 @@ is not supported when querying STRUCT type STRUCT
# support selecting structs.
create view tmp_view as select id, int_struct_col from functional.allcomplextypes;
---- CATCH
-AnalysisException: Querying STRUCT is only supported for ORC file format.
+AnalysisException: Querying STRUCT is only supported for ORC and Parquet file formats.
====
---- QUERY
# Querying IS NULL on a struct is not supported.
# IMPALA-3060
select id, str, alltypes
-from functional_orc_def.complextypes_structs
+from complextypes_structs
where alltypes is null;
---- CATCH
AnalysisException: IS NULL predicate does not support complex types: alltypes IS NULL
@@ -577,54 +591,54 @@ AnalysisException: IS NULL predicate does not support complex types: alltypes IS
# Subquery that returns a complex type is not supported.
# IMPALA-9500
select alltypes
-from functional_orc_def.complextypes_structs
-where alltypes in (select alltypes from functional_orc_def.complextypes_structs);
+from complextypes_structs
+where alltypes in (select alltypes from functional_parquet.complextypes_structs);
---- CATCH
-AnalysisException: A subquery can't return complex types. (SELECT alltypes FROM functional_orc_def.complextypes_structs)
+AnalysisException: A subquery can't return complex types. (SELECT alltypes FROM functional_parquet.complextypes_structs)
====
---- QUERY
-select tbl.nested_struct from functional_orc_def.complextypestbl tbl;
+select tbl.nested_struct from complextypestbl tbl;
---- CATCH
AnalysisException: Struct containing a collection type is not allowed in the select list.
====
---- QUERY
-select tbl.nested_struct.c from functional_orc_def.complextypestbl tbl;
+select tbl.nested_struct.c from complextypestbl tbl;
---- CATCH
AnalysisException: Struct containing a collection type is not allowed in the select list.
====
---- QUERY
# Unioning structs is not supported.
# IMPALA-10752
-select id, tiny_struct from functional_orc_def.complextypes_structs
+select id, tiny_struct from complextypes_structs
union all
-select id, tiny_struct from functional_orc_def.complextypes_structs;
+select id, tiny_struct from complextypes_structs;
---- CATCH
AnalysisException: Set operations don't support STRUCT type. STRUCT<b:BOOLEAN> in tiny_struct
====
---- QUERY
# Ordering by struct column is not supported.
-select id, tiny_struct from functional_orc_def.complextypes_structs
+select id, tiny_struct from complextypes_structs
order by tiny_struct
---- CATCH
AnalysisException: ORDER BY expression 'tiny_struct' with complex type 'STRUCT<b:BOOLEAN>' is not supported.
====
---- QUERY
# Ordering by struct column (using the index of the column) is not supported.
-select id, tiny_struct from functional_orc_def.complextypes_structs
+select id, tiny_struct from complextypes_structs
order by 2
---- CATCH
AnalysisException: ORDER BY expression 'tiny_struct' with complex type 'STRUCT<b:BOOLEAN>' is not supported.
====
---- QUERY
# Check that the order by doesn't confuse the 3rd column with a member of the struct.
-select id, tiny_struct from functional_orc_def.complextypes_structs
+select id, tiny_struct from complextypes_structs
order by 3
---- CATCH
AnalysisException: ORDER BY: ordinal exceeds the number of items in the SELECT list: 3
====
---- QUERY
# Structs inside arrays are not yet supported.
-select nested_struct.c.d from functional_orc_def.complextypestbl;
+select nested_struct.c.d from complextypestbl;
---- CATCH
AnalysisException: STRUCT type inside collection types is not supported.
====
diff --git a/tests/query_test/test_nested_types.py b/tests/query_test/test_nested_types.py
index 23e46e6e5..196b7c69a 100644
--- a/tests/query_test/test_nested_types.py
+++ b/tests/query_test/test_nested_types.py
@@ -147,21 +147,22 @@ class TestNestedStructsInSelectList(ImpalaTestSuite):
    cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('orc_schema_resolution', 0, 1))
    cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint)

-  def test_struct_in_select_list(self, vector, unique_database):
+  def test_struct_in_select_list(self, vector):
    """Queries where a struct column is in the select list"""
-    if vector.get_value('table_format').file_format == 'parquet':
-      pytest.skip()
    if vector.get_value('exec_option')['disable_codegen'] == 'False':
      pytest.skip()
-    self.run_test_case('QueryTest/struct-in-select-list', vector, unique_database)
+    new_vector = deepcopy(vector)
+    new_vector.get_value('exec_option')['convert_legacy_hive_parquet_utc_timestamps'] = 1
+    new_vector.get_value('exec_option')['TIMEZONE'] = '"Europe/Budapest"'
+    self.run_test_case('QueryTest/struct-in-select-list', new_vector)

-  def test_nested_struct_in_select_list(self, vector, unique_database):
+  def test_nested_struct_in_select_list(self, vector):
    """Queries where a nested struct column is in the select list"""
-    if vector.get_value('table_format').file_format == 'parquet':
-      pytest.skip()
    if vector.get_value('exec_option')['disable_codegen'] == 'False':
      pytest.skip()
-    self.run_test_case('QueryTest/nested-struct-in-select-list', vector, unique_database)
+    new_vector = deepcopy(vector)
+    new_vector.get_value('exec_option')['convert_legacy_hive_parquet_utc_timestamps'] = 1
+    self.run_test_case('QueryTest/nested-struct-in-select-list', new_vector)


class TestNestedArraysInSelectList(ImpalaTestSuite):
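
A note on the parquet-ambiguous-list-legacy.test change above: Parquet has two ways to encode lists, and AmbiguousList_Legacy.parquet uses the older one. A minimal sketch of the two shapes, with the types assumed for illustration (only the field names f11, f12, s2, f21 and f22 come from the test itself):

Modern 3-level encoding, where a repeated wrapper group contains the list element:

  optional group ambigarray (LIST) {
    repeated group list {
      optional group element {
        optional int32 f11;
        optional int32 f12;
        optional group s2 {
          optional int32 f21;
          optional int32 f22;
        }
      }
    }
  }

Legacy 2-level encoding, where the repeated group itself is the element:

  optional group ambigarray (LIST) {
    repeated group array {
      optional int32 f11;
      optional int32 f12;
      optional group s2 {
        optional int32 f21;
        optional int32 f22;
      }
    }
  }

With PARQUET_ARRAY_RESOLUTION=THREE_LEVEL against a 2-level file, the resolver descends one level too deep, so the positional lookup for the struct column lands on a scalar node ('optional int32 f21' in the quoted error). That is why the test now expects the incompatible-schema error rather than the misresolved values in the old expected output.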
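The new "where id = 1" test in struct-in-select-list.test exists to exercise late materialization: the filter column is read first, and the struct column is only materialized for rows that survive the predicate, the rest being skipped through the struct readers' SkipRows() path named in the test comment. A toy Python sketch of the idea (illustrative only; the real scanner operates on row ranges within row batches):

  # Filter column is read eagerly; the expensive struct column lazily.
  ids = [1, 2, 3, 4, 5, 6]
  structs = ['{"b":true}', '{"b":false}', 'NULL', '{"b":true}', 'NULL', 'NULL']

  # Evaluate the predicate (id = 1) on the filter column alone.
  surviving = [i for i, v in enumerate(ids) if v == 1]

  # Materialize the struct column only for surviving rows; the skipped rows
  # are what drive the SkipRows() calls in the real implementation.
  rows = [(ids[i], structs[i]) for i in surviving]
  print(rows)  # [(1, '{"b":true}')]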
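On the Python side, both tests now tweak exec options before running, and the deepcopy is what makes that safe: the vector object is shared between parametrized test instances, so options must be set on a copy. A self-contained sketch of the pattern, with a plain dict standing in for the real test vector object:

  from copy import deepcopy

  # Stand-in for the shared test vector produced by the test matrix.
  vector = {'exec_option': {'disable_codegen': 'False'}}

  new_vector = deepcopy(vector)
  new_vector['exec_option']['convert_legacy_hive_parquet_utc_timestamps'] = 1
  # The inner quotes keep the value quoted when it is later spliced into a
  # SET TIMEZONE=... statement, which is presumably why the diff double-quotes it.
  new_vector['exec_option']['TIMEZONE'] = '"Europe/Budapest"'

  # The shared vector is untouched, so sibling test instances are unaffected.
  assert 'TIMEZONE' not in vector['exec_option']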