When Hive (and probably other engines as well) converts a legacy Hive table to Iceberg, it doesn't rewrite the data files. This means the data files contain neither write IDs nor partition column data. Impala currently expects the partition columns to be present in the data files, so it cannot read converted partitioned tables.

With this patch Impala loads partition values from the Iceberg metadata. The extra metadata is attached to the file descriptor objects and propagated to the scanners. It contains the Iceberg data file format (which could later be used to handle mixed-format tables) and the partition data. The HdfsScanner uses the partition data to create the template tuple that holds the partition values of identity-partitioned columns. This applies not only to migrated tables but to all Iceberg tables with identity partitions, so we also save some IO and CPU time for such columns. The partition information could also be used for Dynamic Partition Pruning later.

We store the (human-readable) string representation of the partition data in the flat buffers. This helps debugging and provides the flexibility needed when the partition columns evolve (e.g. INT -> BIGINT, DECIMAL(4,2) -> DECIMAL(6,2)).

Testing:
 * e2e test for all data types that can be used to partition a table
 * e2e test for a migrated partitioned table + schema evolution (without renaming columns)
 * e2e test for a table where all columns are used as identity partitions

Change-Id: Iac11a02de709d43532056f71359c49d20c1be2b8
Reviewed-on: http://gerrit.cloudera.org:8080/18240
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
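
To illustrate the type-evolution point above: a minimal C++ sketch of why re-parsing the stored string form works across widening changes such as INT -> BIGINT. The helper name is hypothetical and is not Impala's actual scanner code.

#include <cstdint>
#include <string>

// Hypothetical helper: parse the stored string form of an identity-partition
// value into the column's *current* logical type. A value serialized while
// the column was INT ("42") parses identically after the column evolves to
// BIGINT, so the stored metadata never has to be rewritten.
int64_t ParsePartitionValueAsBigInt(const std::string& stored) {
  return std::stoll(stored);
}

int main() {
  const std::string stored = "42";  // written when the column was still INT
  return ParsePartitionValueAsBigInt(stored) == 42 ? 0 : 1;
}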

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

syntax="proto2";

package impala;

import "common.proto";

// Specification of a subsection of a single HDFS file. Corresponds to THdfsFileSplit and
// should be kept in sync with it.
message HdfsFileSplitPB {
  // File name (not the full path). The path is assumed to be relative to the
  // 'location' of the THdfsPartition referenced by partition_id.
  optional string relative_path = 1;

  // Starting offset.
  optional int64 offset = 2;

  // Length of split.
  optional int64 length = 3;

  // ID of partition within the THdfsTable associated with this scan node.
  optional int64 partition_id = 4;

  // Total size of the hdfs file.
  optional int64 file_length = 5;

  // Compression type of the hdfs file.
  optional CompressionTypePB file_compression = 6;

  // Last modified time of the file.
  optional int64 mtime = 7;

  // Whether this file is erasure-coded.
  optional bool is_erasure_coded = 8;

  // Hash of the partition's path. This must be hashed with a hash algorithm that is
  // consistent across different processes and machines. This is currently using
  // Java's String.hashCode(), which is consistent. For testing purposes, this can use
  // any consistent hash.
  optional int32 partition_path_hash = 9;
}
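
The partition_path_hash contract above (equal to Java's String.hashCode()) can be reproduced outside the JVM. A minimal sketch, assuming ASCII partition paths (Java hashes UTF-16 code units, which match the bytes for ASCII input); this helper is illustrative, not Impala's backend code:

#include <cstdint>
#include <string>

// Java's String.hashCode(): h = s[0]*31^(n-1) + s[1]*31^(n-2) + ... + s[n-1],
// computed with 32-bit wrapping arithmetic.
int32_t JavaStringHashCode(const std::string& s) {
  uint32_t h = 0;  // unsigned keeps the wraparound well-defined in C++
  for (unsigned char c : s) h = 31 * h + c;
  return static_cast<int32_t>(h);
}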

// Key range for single THBaseScanNode. Corresponds to THBaseKeyRange and should be kept
// in sync with it.
message HBaseKeyRangePB {
  // Inclusive
  optional string startKey = 1;

  // Exclusive
  optional string stopKey = 2;
}

// Specification of an individual data range which is held in its entirety by a storage
// server. Corresponds to TScanRange and should be kept in sync with it.
message ScanRangePB {
  // One of these must be set for every ScanRangePB.
  optional HdfsFileSplitPB hdfs_file_split = 1;
  optional HBaseKeyRangePB hbase_key_range = 2;
  optional bytes kudu_scan_token = 3;
  optional bytes file_metadata = 4;
}
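
A short sketch of how a backend might dispatch on a ScanRangePB, using the has_*/getter accessors protoc generates for proto2 fields. The accessor names follow mechanically from the field names above; the generated header name is an assumption:

#include "planner.pb.h"  // assumed name of the header protoc generates here

void DispatchScanRange(const impala::ScanRangePB& range) {
  if (range.has_hdfs_file_split()) {
    const impala::HdfsFileSplitPB& split = range.hdfs_file_split();
    // Hand split.relative_path(), split.offset() and split.length() to the
    // HDFS scanner; per the commit message, file_metadata (serialized
    // Iceberg file metadata) travels with the file descriptor when present.
  } else if (range.has_hbase_key_range()) {
    // range.hbase_key_range().startkey() is inclusive, stopkey() exclusive.
  } else if (range.has_kudu_scan_token()) {
    // Opaque scan token consumed by the Kudu client.
  }
}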