// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//
// This file contains all structs, enums, etc., that together make up
// a plan tree. All information recorded in struct TPlan and below is independent
// of the execution parameters of any one of the backends on which it is running
// (those are recorded in TPlanFragmentInstanceCtx).

namespace py impala_thrift_gen.PlanNodes
namespace cpp impala
namespace java org.apache.impala.thrift

include "CatalogObjects.thrift"
include "Data.thrift"
include "ExecStats.thrift"
include "Exprs.thrift"
include "Types.thrift"
include "ExternalDataSource.thrift"
include "ResourceProfile.thrift"

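// Note on generated code (illustrative, not part of the IDL itself): with the
// namespaces above, the Thrift compiler places the Python classes for this file under
// the impala_thrift_gen.PlanNodes package (e.g.
// "from impala_thrift_gen.PlanNodes.ttypes import TPlanNode"), the C++ types in the
// 'impala' namespace, and the Java types in the org.apache.impala.thrift package.
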
enum TPlanNodeType {
  HDFS_SCAN_NODE = 0
  HBASE_SCAN_NODE = 1
  HASH_JOIN_NODE = 2
  AGGREGATION_NODE = 3
  SORT_NODE = 4
  EMPTY_SET_NODE = 5
  EXCHANGE_NODE = 6
  UNION_NODE = 7
  SELECT_NODE = 8
  NESTED_LOOP_JOIN_NODE = 9
  DATA_SOURCE_NODE = 10
  ANALYTIC_EVAL_NODE = 11
  SINGULAR_ROW_SRC_NODE = 12
  UNNEST_NODE = 13
  SUBPLAN_NODE = 14
  KUDU_SCAN_NODE = 15
  CARDINALITY_CHECK_NODE = 16
  MULTI_AGGREGATION_NODE = 17
  ICEBERG_DELETE_NODE = 18
  ICEBERG_METADATA_SCAN_NODE = 19
  TUPLE_CACHE_NODE = 20
  SYSTEM_TABLE_SCAN_NODE = 21
  ICEBERG_MERGE_NODE = 22
}

// phases of an execution node
// must be kept in sync with tests/failure/test_failpoints.py
enum TExecNodePhase {
  PREPARE = 0
  PREPARE_SCANNER = 1
  OPEN = 2
  GETNEXT = 3
  GETNEXT_SCANNER = 4
  CLOSE = 5
  // After a scanner thread completes a range with an error but before it propagates the
  // error.
  SCANNER_ERROR = 6
  INVALID = 7
}

// what to do when hitting a debug point (TImpalaQueryOptions.DEBUG_ACTION)
enum TDebugAction {
  WAIT = 0
  FAIL = 1
  INJECT_ERROR_LOG = 2
  MEM_LIMIT_EXCEEDED = 3
  // A floating point number in range [0.0, 1.0] that gives the probability of denying
  // each reservation increase request after the initial reservation.
  SET_DENY_RESERVATION_PROBABILITY = 4
  // Delay for a short amount of time: 100ms or the specified number of seconds in the
  // optional parameter.
  DELAY = 5
}

// Preference for replica selection
enum TReplicaPreference {
  CACHE_LOCAL = 0
  CACHE_RACK = 1
  DISK_LOCAL = 2
  DISK_RACK = 3
  REMOTE = 4
}

// Specification of a runtime filter target.
struct TRuntimeFilterTargetDesc {
  // Target node id
  1: Types.TPlanNodeId node_id

  // Expr on which the filter is applied
  2: required Exprs.TExpr target_expr

  // Indicates if 'target_expr' is bound only by partition columns
  3: required bool is_bound_by_partition_columns

  // Slot ids on which 'target_expr' is bound
  4: required list<Types.TSlotId> target_expr_slotids

  // Indicates if this target is on the same fragment as the join that
  // produced the runtime filter
  5: required bool is_local_target

  // If the target node is a Kudu scan node, the name (as it appears in Kudu) and type
  // of the targeted column.
  6: optional string kudu_col_name
  7: optional Types.TColumnType kudu_col_type;

  // The low and high values as seen in the column stats of the targeted column.
  8: optional Data.TColumnValue low_value
  9: optional Data.TColumnValue high_value

  // Indicates if the low and high values in column stats are present
  10: optional bool is_min_max_value_present

  // Indicates if the column is actually stored in the data files (unlike partition
  // columns in regular Hive tables)
  11: optional bool is_column_in_data_file
}

enum TRuntimeFilterType {
  BLOOM = 0
  MIN_MAX = 1
  IN_LIST = 2
}

// The level of filtering of enabled min/max filters to be applied to Parquet scan nodes.
enum TMinmaxFilteringLevel {
  ROW_GROUP = 1
  PAGE = 2
  ROW = 3
}

// Specification of a runtime filter.
struct TRuntimeFilterDesc {
  // Filter unique id (within a query)
  1: required i32 filter_id

  // Expr on which the filter is built on a hash or nested loop join.
  2: required Exprs.TExpr src_expr

  // List of targets for this runtime filter
  3: required list<TRuntimeFilterTargetDesc> targets

  // Map of target node id to the corresponding index in 'targets'
  4: required map<Types.TPlanNodeId, i32> planid_to_target_ndx

  // Indicates if the source join node of this filter is a broadcast or
  // a partitioned join.
  5: required bool is_broadcast_join

  // Indicates if there is at least one target scan node that is in the
  // same fragment as the broadcast join that produced the runtime filter
  6: required bool has_local_targets

  // Indicates if there is at least one target scan node that is not in the same
  // fragment as the broadcast join that produced the runtime filter
  7: required bool has_remote_targets

  // Indicates if this filter is applied only on partition columns
  8: required bool applied_on_partition_columns

  // The estimated number of distinct values that the planner expects the filter to hold.
  // Used to compute the size of the filter.
  9: optional i64 ndv_estimate

  // The type of runtime filter to build.
  10: required TRuntimeFilterType type

  // The comparison operator between targets and src_expr
  11: required ExternalDataSource.TComparisonOp compareOp

  // The size of the filter based on the ndv estimate and the min/max limit specified in
  // the query options. Should be greater than zero for bloom filters, zero otherwise.
  12: optional i64 filter_size_bytes

  // The ID of the plan node that produces this filter.
  13: optional Types.TPlanNodeId src_node_id
}

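// Illustrative lookup (a sketch, not generated code): given a TRuntimeFilterDesc
// 'desc' and the plan node id 'node_id' of one of its targets, the matching target
// descriptor can be found as
//   desc.targets[desc.planid_to_target_ndx[node_id]]
// whose is_local_target flag then tells whether the filter can be applied within the
// producing fragment or has to be routed to a remote one.
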
// The information contained in subclasses of ScanNode is captured in two separate
// Thrift structs:
// - TScanRange: the data range that's covered by the scan (which varies with the
//   particular partition of the plan fragment of which the scan node is a part)
// - T<subclass>: all other operational parameters that are the same across
//   all plan fragments

// Specification of a subsection of a single HDFS file. Corresponds to HdfsFileSplitPB
// and should be kept in sync with it.
struct THdfsFileSplit {
  // File name (not the full path). The path is assumed to be relative to the
  // 'location' of the THdfsPartition referenced by partition_id.
  1: required string relative_path

  // starting offset
  2: required i64 offset

  // length of split
  3: required i64 length

  // ID of partition within the THdfsTable associated with this scan node.
  4: required i64 partition_id

  // total size of the hdfs file
  5: required i64 file_length

  // compression type of the hdfs file
  6: required CatalogObjects.THdfsCompression file_compression

  // last modified time of the file
  7: required i64 mtime

  // Whether the HDFS file is stored with erasure coding.
  8: optional bool is_erasure_coded

  // Hash of the partition's path. This must be hashed with a hash algorithm that is
  // consistent across different processes and machines. This is currently using
  // Java's String.hashCode(), which is consistent. For testing purposes, this can use
  // any consistent hash.
  9: required i32 partition_path_hash

  // The absolute path of the file; it's used only when data files are outside of
  // the Iceberg table location (IMPALA-11507).
  10: optional string absolute_path

  // Whether the HDFS file is stored with transparent data encryption.
  11: optional bool is_encrypted
}

// Key range for a single THBaseScanNode. Corresponds to HBaseKeyRangePB and should be
// kept in sync with it.
// TODO: does 'binary' have an advantage over string? strings can
// already store binary data
struct THBaseKeyRange {
  // inclusive
  1: optional string startKey

  // exclusive
  2: optional string stopKey
}

// Specifies how THdfsFileSplits can be generated from HDFS files.
// Currently used for files that do not have block locations,
// such as S3, ADLS, and Local. The Frontend creates these and the
// coordinator's scheduler expands them into THdfsFileSplits.
// The plan is to use TFileSplitGeneratorSpec as well for HDFS
// files with block information. Doing so will permit the FlatBuffer
// representation used to represent block information to pass from the
// FrontEnd to the Coordinator without transforming to a heavier-weight
// Thrift representation. See IMPALA-6458.
struct TFileSplitGeneratorSpec {
  1: required CatalogObjects.THdfsFileDesc file_desc

  // Maximum length of a file split to generate.
  2: required i64 max_block_size

  3: required bool is_splittable

  // ID of partition within the THdfsTable associated with this scan node.
  4: required i64 partition_id

  // Hash of the partition path
  5: required i32 partition_path_hash

  // True if only the footer range (the last block in the file) is needed.
  // If true, is_splittable must also be true.
  6: required bool is_footer_only
}

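// Illustrative expansion (a sketch of the scheduler behaviour described above, under
// the assumption that splits are cut at max_block_size boundaries; not a normative
// algorithm): for a splittable file of size file_length,
//   offset = 0
//   while offset < file_length:
//     length = min(max_block_size, file_length - offset)
//     emit a THdfsFileSplit covering [offset, offset + length)
//     offset += length
// A non-splittable file yields a single split covering the whole file, and
// is_footer_only restricts generation to the final (footer) range.
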
// Specification of an individual data range which is held in its entirety
// by a storage server. Corresponds to ScanRangePB and should be kept in sync with it.
struct TScanRange {
  // one of these must be set for every TScanRange
  1: optional THdfsFileSplit hdfs_file_split
  2: optional THBaseKeyRange hbase_key_range
  3: optional binary kudu_scan_token
  4: optional binary file_metadata
  5: optional bool is_system_scan
}

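// Illustrative construction (a sketch using the Python classes Thrift generates from
// this file; the field values below are made up for the example):
//   from impala_thrift_gen.PlanNodes.ttypes import TScanRange, THdfsFileSplit
//   split = THdfsFileSplit(relative_path="data_0.parq", offset=0, length=64 * 1024,
//                          partition_id=1, file_length=64 * 1024,
//                          file_compression=0, mtime=0, partition_path_hash=0)
//   scan_range = TScanRange(hdfs_file_split=split)
// Per the comment above, exactly one of the optional members is populated, so the
// struct behaves like a tagged union.
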
// Specification of an overlap predicate desc.
// filter_id: id of the filter
// slot_index: the index of a slot descriptor in minMaxTuple where the min value is
//   stored. The slot next to it (at index +1) stores the max value.
struct TOverlapPredicateDesc {
  1: required i32 filter_id
  2: required i32 slot_index
}

struct THdfsScanNode {
  1: required Types.TTupleId tuple_id

  // Conjuncts that can be evaluated while materializing the items (tuples) of
  // collection-typed slots. Maps from item tuple id to the list of conjuncts
  // to be evaluated.
  2: optional map<Types.TTupleId, list<Exprs.TExpr>> collection_conjuncts

  // Option to control replica selection during scan scheduling.
  3: optional TReplicaPreference replica_preference

  // Option to control tie breaks during scan scheduling.
  4: optional bool random_replica

  // Number of header lines to skip at the beginning of each file of this table. Only set
  // for hdfs text files.
  5: optional i32 skip_header_line_count

  // Indicates whether the MT scan node implementation should be used.
  // If this is true then the MT_DOP query option must be > 0.
  // TODO: Remove this option when the MT scan node supports all file formats.
  6: optional bool use_mt_scan_node

  // Conjuncts that can be pushed down to the ORC reader, or be evaluated against
  // parquet::Statistics using the tuple referenced by 'stats_tuple_id'.
  7: optional list<Exprs.TExpr> stats_conjuncts

  // Tuple to evaluate 'stats_conjuncts' against.
  8: optional Types.TTupleId stats_tuple_id

  // The conjuncts that are eligible for dictionary filtering.
  9: optional map<Types.TSlotId, list<i32>> dictionary_filter_conjuncts

  // The byte offset of the slot for the counter if the count-star optimization is
  // enabled.
  10: optional i32 count_star_slot_offset

  // If true, the backend only needs to return one row per partition.
  11: optional bool is_partition_key_scan

  // The list of file formats that this scan node may need to process. It is possible
  // that in some fragment instances not all will actually occur. This list can be used
  // to codegen code remotely for other impalads as it contains all file formats that
  // may be needed for this scan node.
  12: required set<CatalogObjects.THdfsFileFormat> file_formats;

  // The overlap predicates
  13: optional list<TOverlapPredicateDesc> overlap_predicate_descs

  // For IcebergScanNodes that are the left children of an IcebergDeleteNode this stores
  // the node ID of the right child.
  14: optional Types.TPlanNodeId deleteFileScanNodeId

  // Whether mt_dop should use deterministic scan range assignment. If true,
  // each fragment instance has its own list of scan ranges. If false,
  // the fragment instances use a shared queue.
  15: optional bool deterministic_scanrange_assignment
}

struct TDataSourceScanNode {
  1: required Types.TTupleId tuple_id

  // The external data source to scan
  2: required CatalogObjects.TDataSource data_source

  // Init string for the table passed to the data source. May be an empty string.
  3: required string init_string

  // Scan predicates in conjunctive normal form that were accepted by the data source.
  4: required list<list<ExternalDataSource.TBinaryPredicate>> accepted_predicates
}

struct THBaseFilter {
  1: required string family
  // The qualifier for the HBase key column can be null, so the field is optional here.
  2: optional string qualifier
  // Ordinal number into the HBase enum CompareFilter.CompareOp.
  // We don't use TExprOperator because the op is interpreted by an HBase Filter, and
  // not the c++ expr eval.
  3: required i32 op_ordinal
  4: required string filter_constant
}

struct THBaseScanNode {
  1: required Types.TTupleId tuple_id

  // TODO: remove this, we already have THBaseTable.tableName
  2: required string table_name

  3: optional list<THBaseFilter> filters

  // Suggested max value for "hbase.client.scan.setCaching"
  4: optional i32 suggested_max_caching
}

struct TKuduScanNode {
  1: required Types.TTupleId tuple_id

  // Indicates whether the MT scan node implementation should be used.
  // If this is true, then the MT_DOP query option must be > 0.
  2: optional bool use_mt_scan_node

  // The byte offset of the slot for Kudu metadata if the count-star optimization is
  // enabled.
  3: optional i32 count_star_slot_offset
}

struct TSystemTableScanNode {
  1: required Types.TTupleId tuple_id
  2: required CatalogObjects.TSystemTableName table_name
}

struct TEqJoinCondition {
  // left-hand side of "<a> = <b>"
  1: required Exprs.TExpr left;
  // right-hand side of "<a> = <b>"
  2: required Exprs.TExpr right;
  // In SQL, NULL values aren't equal to each other; in other words, NULL == NULL is
  // false. However, there are some cases when joining tables where we'd like the
  // NULL == NULL comparison to return true, and this flag is true in those cases.
  // One example is when we join Iceberg equality delete files to the data files, where
  // we want the NULLs in the delete files to match the NULLs in the data files.
  // Another example is when this operator is a "<=>", also known as "IS NOT DISTINCT
  // FROM".
  3: required bool is_not_distinct_from;
}

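// Illustrative example: for "a.x <=> b.x" (IS NOT DISTINCT FROM), left is a.x, right
// is b.x and is_not_distinct_from is true, so rows where both sides are NULL are
// treated as a match; for a plain "a.x = b.x" the flag is false and NULLs never match.
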
// Different join operations supported by the backend join operators.
enum TJoinOp {
  INNER_JOIN = 0
  LEFT_OUTER_JOIN = 1
  LEFT_SEMI_JOIN = 2
  LEFT_ANTI_JOIN = 3

  // Similar to LEFT_ANTI_JOIN with special handling for NULLs for the join conjuncts
  // on the build side. Those NULLs are considered candidate matches, and therefore could
  // be rejected (ANTI-join), based on the other join conjuncts. This is in contrast
  // to LEFT_ANTI_JOIN where NULLs are not matches and therefore always returned.
  NULL_AWARE_LEFT_ANTI_JOIN = 4

  RIGHT_OUTER_JOIN = 5
  RIGHT_SEMI_JOIN = 6
  RIGHT_ANTI_JOIN = 7
  FULL_OUTER_JOIN = 8
  CROSS_JOIN = 9

  // The Iceberg delete operator is based on a join operation
  ICEBERG_DELETE_JOIN = 10
}

struct THashJoinNode {
  // equi-join predicates
  1: required list<TEqJoinCondition> eq_join_conjuncts

  // non equi-join predicates
  2: optional list<Exprs.TExpr> other_join_conjuncts

  // Hash seed to use. Must be the same as the join builder's hash seed, if there is
  // a separate join build. Must be positive.
  3: optional i32 hash_seed
}

struct TNestedLoopJoinNode {
  // Join conjuncts (both equi-join and non equi-join). All other conjuncts that are
  // evaluated at the join node are stored in TPlanNode.conjuncts.
  1: optional list<Exprs.TExpr> join_conjuncts
}

struct TIcebergDeleteNode {
  // equi-join predicates
  1: required list<TEqJoinCondition> eq_join_conjuncts
}

// Top-level struct for a join node. Elements that are shared between the different
// join implementations are top-level fields, and elements that are specific to a
// join implementation live in a specialized struct.
struct TJoinNode {
  1: required TJoinOp join_op

  // Tuples in build row.
  2: optional list<Types.TTupleId> build_tuples

  // Nullable tuples in build row.
  3: optional list<bool> nullable_build_tuples

  // One of these must be set.
  4: optional THashJoinNode hash_join_node
  5: optional TNestedLoopJoinNode nested_loop_join_node
  6: optional TIcebergDeleteNode iceberg_delete_node
}

struct TAggregator {
  1: optional list<Exprs.TExpr> grouping_exprs
  // aggregate exprs. The root of each expr is the aggregate function. The
  // other exprs are the inputs to the aggregate function.
  2: required list<Exprs.TExpr> aggregate_functions

  // Tuple id used for intermediate aggregations (with slots of agg intermediate types)
  3: required Types.TTupleId intermediate_tuple_id

  // Tuple id used for the aggregation output (with slots of agg output types)
  // Equal to intermediate_tuple_id if intermediate type == output type for all
  // aggregate functions.
  4: required Types.TTupleId output_tuple_id

  // Set to true if this aggregator needs to run the finalization step.
  5: required bool need_finalize

  // Set to true to use the streaming preagg algorithm. Node must be a preaggregation.
  6: required bool use_streaming_preaggregation

  7: required ResourceProfile.TBackendResourceProfile resource_profile
}

struct TAggregationNode {
  // Aggregators for this node, each with a unique set of grouping exprs.
  1: required list<TAggregator> aggregators

  // Used in streaming aggregations to determine how much memory to use.
  2: required i64 estimated_input_cardinality

  // If true, this is the first AggregationNode in an aggregation plan with multiple
  // Aggregators and the entire input to this node should be passed to each Aggregator.
  3: required bool replicate_input

  // Set to true if this aggregation can complete early
  4: required bool fast_limit_check
}

struct TSortInfo {
  1: required list<Exprs.TExpr> ordering_exprs
  2: required list<bool> is_asc_order
  // Indicates, for each expr, if nulls should be listed first or last. This is
  // independent of is_asc_order.
  3: required list<bool> nulls_first
  // Expressions evaluated over the input row that materialize the tuple to be sorted.
  // Contains one expr per slot in the materialized tuple.
  4: optional list<Exprs.TExpr> sort_tuple_slot_exprs
  // The sorting order used in SORT BY clauses.
  5: required Types.TSortingOrder sorting_order
  // Number of the leading lexical keys in the ordering exprs. Only used in Z-order
  // for the pre-insert sort node to sort rows lexically on partition keys first, and
  // sort the remaining columns in Z-order.
  6: optional i32 num_lexical_keys_in_zorder
}

enum TSortType {
  // Sort the entire input.
  TOTAL = 0

  // Return the first N sorted elements.
  TOPN = 1

  // Divide the input into batches, each of which is sorted individually.
  PARTIAL = 2

  // Return the first N sorted elements from each partition.
  PARTITIONED_TOPN = 3
}

struct TSortNode {
  1: required TSortInfo sort_info
  2: required TSortType type
  // This is the number of rows to skip before returning results.
  // Not used with TSortType::PARTIAL or TSortType::PARTITIONED_TOPN.
  3: optional i64 offset

  // Estimated bytes of input that will go into this sort node across all backends.
  // -1 if such an estimate is unavailable.
  4: optional i64 estimated_full_input_size

  // Whether to include ties for the last place in the Top-N
  5: optional bool include_ties

  // If include_ties is true, the limit including ties.
  6: optional i64 limit_with_ties

  // Max number of rows to return per partition.
  // Used only with TSortType::PARTITIONED_TOPN
  7: optional i64 per_partition_limit
  // Used only with TSortType::PARTITIONED_TOPN - expressions over
  // the sort tuple to use as the partition key.
  8: optional list<Exprs.TExpr> partition_exprs
  // Used only with TSortType::PARTITIONED_TOPN - sort info for ordering
  // within a partition.
  9: optional TSortInfo intra_partition_sort_info
}

enum TAnalyticWindowType {
  // Specifies the window as a logical offset
  RANGE = 0

  // Specifies the window in physical units
  ROWS = 1
}

enum TAnalyticWindowBoundaryType {
  // The window starts/ends at the current row.
  CURRENT_ROW = 0

  // The window starts/ends at an offset preceding the current row.
  PRECEDING = 1

  // The window starts/ends at an offset following the current row.
  FOLLOWING = 2
}

struct TAnalyticWindowBoundary {
  1: required TAnalyticWindowBoundaryType type

  // Predicate that checks: child tuple '<=' buffered tuple + offset for the order-by
  // expr
  2: optional Exprs.TExpr range_offset_predicate

  // Offset from the current row for ROWS windows.
  3: optional i64 rows_offset_value
}

struct TAnalyticWindow {
  // Specifies the window type for the start and end bounds.
  1: required TAnalyticWindowType type

  // Absence indicates window start is UNBOUNDED PRECEDING.
  2: optional TAnalyticWindowBoundary window_start

  // Absence indicates window end is UNBOUNDED FOLLOWING.
  3: optional TAnalyticWindowBoundary window_end
}

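// Illustrative mapping (an informal example, not additional IDL semantics): the SQL
// window "ROWS BETWEEN 1 PRECEDING AND CURRENT ROW" corresponds to a TAnalyticWindow
// with type = ROWS, window_start = {type: PRECEDING, rows_offset_value: 1} and
// window_end = {type: CURRENT_ROW}; leaving window_start unset instead expresses
// UNBOUNDED PRECEDING, as noted above.
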
// Defines a group of one or more analytic functions that share the same window,
// partitioning expressions and order-by expressions and are evaluated by a single
// ExecNode.
struct TAnalyticNode {
  // Exprs on which the analytic function input is partitioned. Input is already sorted
  // on partitions and order by clauses; partition_exprs is used to identify partition
  // boundaries. Empty if no partition clause is specified.
  1: required list<Exprs.TExpr> partition_exprs

  // Exprs specified by an order-by clause for RANGE windows. Used to evaluate RANGE
  // window boundaries. Empty if no order-by clause is specified or for windows
  // specifying ROWS.
  2: required list<Exprs.TExpr> order_by_exprs

  // Functions evaluated over the window for each input row. The root of each expr is
  // the aggregate function. Child exprs are the inputs to the function.
  3: required list<Exprs.TExpr> analytic_functions

  // Window specification
  4: optional TAnalyticWindow window

  // Tuple used for intermediate results of analytic function evaluations
  // (with slots of analytic intermediate types)
  5: required Types.TTupleId intermediate_tuple_id

  // Tuple used for the analytic function output (with slots of analytic output types)
  // Equal to intermediate_tuple_id if intermediate type == output type for all
  // analytic functions.
  6: required Types.TTupleId output_tuple_id

  // id of the buffered tuple (identical to the input tuple, which is assumed
  // to come from a single SortNode); not set if both partition_exprs and
  // order_by_exprs are empty
  7: optional Types.TTupleId buffered_tuple_id

  // predicate that checks: child tuple is in the same partition as the buffered tuple,
  // i.e. each partition expr is equal or both are not null. Only set if
  // buffered_tuple_id is set; should be evaluated over a row that is composed of the
  // child tuple and the buffered tuple
  8: optional Exprs.TExpr partition_by_eq

  // predicate that checks: the order_by_exprs are equal or both NULL when evaluated
  // over the child tuple and the buffered tuple. Only set if buffered_tuple_id is set;
  // should be evaluated over a row that is composed of the child tuple and the buffered
  // tuple
  9: optional Exprs.TExpr order_by_eq
}

struct TUnionNode {
  // A UnionNode materializes all const/result exprs into this tuple.
  1: required Types.TTupleId tuple_id
  // List of expr lists materialized by this node.
  // There is one list of exprs per query stmt feeding into this union node.
  2: required list<list<Exprs.TExpr>> result_expr_lists
  // Separate list of expr lists coming from constant select stmts.
  3: required list<list<Exprs.TExpr>> const_expr_lists
  // Index of the first child that needs to be materialized.
  4: required i64 first_materialized_child_idx
}

struct TExchangeNode {
  // The ExchangeNode's input rows form a prefix of the output rows it produces;
  // this describes the composition of that prefix
  1: required list<Types.TTupleId> input_row_tuples
  // For a merging exchange, the sort information.
  2: optional TSortInfo sort_info
  // This is the number of rows to skip before returning results
  3: optional i64 offset
}

struct TUnnestNode {
  // Exprs that return the in-memory collections to be scanned.
  // Currently always SlotRefs into array-typed slots.
  1: required list<Exprs.TExpr> collection_exprs
}

struct TCardinalityCheckNode {
  // Associated statement of child
  1: required string display_statement
}

struct TIcebergMetadataScanNode {
  // Tuple ID of the tuple this scanner should scan.
  1: required Types.TTupleId tuple_id
  // Name of the FeIcebergTable that will be used for the metadata table scan.
  2: required CatalogObjects.TTableName table_name
  // The metadata table name specifies which metadata table has to be scanned.
  3: required string metadata_table_name;
}

struct TTupleCacheNode {
  // Compile-time cache key that includes a hashed representation of the entire subtree
  // below this point in the plan.
  1: required string compile_time_key
  // To calculate the per-fragment key, this keeps track of scan nodes that feed
  // into this node. The TupleCacheNode will hash the scan ranges for its fragment
  // at runtime.
  2: required list<i32> input_scan_node_ids;
}

enum TMergeCaseType {
  UPDATE = 0
  INSERT = 1
  DELETE = 2
}

enum TMergeMatchType {
  MATCHED = 0
  NOT_MATCHED_BY_TARGET = 1
  NOT_MATCHED_BY_SOURCE = 2
}

enum TIcebergMergeRowPresent {
  BOTH = 0
  SOURCE = 1
  TARGET = 2
}

struct TIcebergMergeCase {
  // Output expressions that will transform the values of the incoming
  // merge result set.
  1: required list<Exprs.TExpr> output_expressions
  // Filter expressions that are evaluated after matching the case.
  2: optional list<Exprs.TExpr> filter_conjuncts
  // Type of the merge case that reflects the operation in it.
  3: required TMergeCaseType type
  4: required TMergeMatchType match_type
}

struct TIcebergMergeNode {
  1: required list<TIcebergMergeCase> cases
  2: required Exprs.TExpr row_present
  3: required list<Exprs.TExpr> delete_meta_exprs
  4: required list<Exprs.TExpr> partition_meta_exprs
  5: required Types.TTupleId merge_action_tuple_id
  6: required Types.TTupleId target_tuple_id
}

// See PipelineMembership in the frontend for details.
struct TPipelineMembership {
  1: required Types.TPlanNodeId pipe_id
  2: required i32 height
  3: required TExecNodePhase phase
}

// This is essentially a union of all messages corresponding to subclasses
// of PlanNode.
struct TPlanNode {
  // node id, needed to reassemble tree structure
  1: required Types.TPlanNodeId node_id
  2: required TPlanNodeType node_type
  3: required i32 num_children
  4: required i64 limit
  // Tuples in row produced by node. Must be non-empty.
  5: required list<Types.TTupleId> row_tuples

  // nullable_tuples[i] is true if row_tuples[i] is nullable
  6: required list<bool> nullable_tuples
  7: optional list<Exprs.TExpr> conjuncts

  // Set to true if codegen should be disabled for this plan node. Otherwise the plan
  // node is codegen'd if the backend supports it.
  8: required bool disable_codegen

  9: required list<TPipelineMembership> pipelines

  // one field per PlanNode subclass
  10: optional THdfsScanNode hdfs_scan_node
  11: optional THBaseScanNode hbase_scan_node
  12: optional TKuduScanNode kudu_scan_node
  13: optional TDataSourceScanNode data_source_node
  14: optional TJoinNode join_node
  15: optional TAggregationNode agg_node
  16: optional TSortNode sort_node
  17: optional TUnionNode union_node
  18: optional TExchangeNode exchange_node
  19: optional TAnalyticNode analytic_node
  20: optional TUnnestNode unnest_node
  21: optional TIcebergMetadataScanNode iceberg_scan_metadata_node
  30: optional TIcebergMergeNode merge_node

  // Label that should be used to print this node to the user.
  22: optional string label

  // Additional details that should be printed to the user. This is node-specific,
  // e.g. table name, join strategy, etc.
  23: optional string label_detail

  // Estimated execution stats generated by the planner.
  24: optional ExecStats.TExecStats estimated_stats

  // Runtime filters assigned to this plan node
  25: optional list<TRuntimeFilterDesc> runtime_filters

  // Resource profile for this plan node.
  26: required ResourceProfile.TBackendResourceProfile resource_profile

  27: optional TCardinalityCheckNode cardinality_check_node

  28: optional TTupleCacheNode tuple_cache_node

  29: optional TSystemTableScanNode system_table_scan_node
}

// A flattened representation of a tree of PlanNodes, obtained by depth-first
// traversal.
struct TPlan {
  1: required list<TPlanNode> nodes
}
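
// Illustrative reconstruction (a sketch, not part of Impala; assumes the flattening
// above is a preorder, depth-first traversal, with each node's num_children giving
// its arity):
//   def build_tree(nodes, idx=0):
//       node = nodes[idx]
//       children = []
//       next_idx = idx + 1
//       for _ in range(node.num_children):
//           child, next_idx = build_tree(nodes, next_idx)
//           children.append(child)
//       return (node, children), next_idx
// Each TPlanNode's num_children is what makes the flat list in TPlan unambiguous.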