mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
This patch mainly implements the creation and dropping of Paimon tables
through Impala.
Supported impala data types:
- BOOLEAN
- TINYINT
- SMALLINT
- INTEGER
- BIGINT
- FLOAT
- DOUBLE
- STRING
- DECIMAL(P,S)
- TIMESTAMP
- CHAR(N)
- VARCHAR(N)
- BINARY
- DATE
Syntax for creating paimon table:
CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
(
[col_name data_type ,...]
[PRIMARY KEY (col1,col2)]
)
[PARTITIONED BY (col_name data_type [COMMENT 'col_comment'], ...)]
STORED AS PAIMON
[LOCATION 'hdfs_path']
[TBLPROPERTIES (
'primary-key'='col1,col2',
'file.format' = 'orc/parquet',
'bucket' = '2',
'bucket-key' = 'col3')
];
Two types of paimon catalogs are supported.
(1) Create table with hive catalog:
CREATE TABLE paimon_hive_cat(userid INT,movieId INT)
STORED AS PAIMON;
(2) Create table with hadoop catalog:
CREATE [EXTERNAL] TABLE paimon_hadoop_cat
STORED AS PAIMON
TBLPROPERTIES('paimon.catalog'='hadoop',
'paimon.catalog_location'='/path/to/paimon_hadoop_catalog',
'paimon.table_identifier'='paimondb.paimontable');
SHOW TABLE STATS/SHOW COLUMN STATS/SHOW PARTITIONS/SHOW FILES
statements are also supported.
TODO:
- Patches pending submission:
- Query support for paimon data files.
- Partition pruning and predicate push down.
- Query support with time travel.
- Query support for paimon meta tables.
- WIP:
- Complex type query support.
- Virtual Column query support for querying
paimon data table.
- Native paimon table scanner, instead of
jni based.
Testing:
- Add unit test for paimon impala type conversion.
- Add unit test for ToSqlTest.java.
- Add unit test for AnalyzeDDLTest.java.
- Update default_file_format TestEnumCase in
be/src/service/query-options-test.cc.
- Update test case in
testdata/workloads/functional-query/queries/QueryTest/set.test.
- Add test cases in metadata/test_show_create_table.py.
- Add custom test test_paimon.py.
Change-Id: I57e77f28151e4a91353ef77050f9f0cd7d9d05ef
Reviewed-on: http://gerrit.cloudera.org:8080/22914
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
130 lines
5.2 KiB
Thrift
130 lines
5.2 KiB
Thrift
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
namespace py impala_thrift_gen.Descriptors
|
|
namespace cpp impala
|
|
namespace java org.apache.impala.thrift
|
|
|
|
include "CatalogObjects.thrift"
|
|
include "Types.thrift"
|
|
include "Exprs.thrift"
|
|
|
|
// Describes a single slot: its id, the tuple it belongs to ('parent'), its type, and
// its position inside that tuple.
//
// Thrift identifies fields on the wire by their numeric id, so the ids below must not
// be renumbered or reused.
struct TSlotDescriptor {
  1: required Types.TSlotId id
  2: required Types.TTupleId parent
  // Only set for collection slots. The tuple ID of the item tuple for the collection.
  3: optional Types.TTupleId itemTupleId
  4: required Types.TColumnType slotType

  // Absolute path into the table schema pointing to the column/field materialized into
  // this slot. Empty for slots that do not materialize a table column/field, e.g., slots
  // materializing an aggregation result.
  //
  // materializedPath[i] is the ordinal position of the column/field of the table schema
  // at level i. For example, materializedPath[0] is an ordinal into the list of table
  // columns, materializedPath[1] is an ordinal into the list of fields of the
  // complex-typed column at position materializedPath[0], etc.
  //
  // The materialized path is used to determine when a new tuple (containing a new
  // instance of this slot) should be created. A tuple is emitted for every data item
  // pointed to by the materialized path. For scalar slots this trivially means that every
  // data item goes into a different tuple. For collection slots, the materialized path
  // determines how many data items go into a single collection value.
  5: required list<i32> materializedPath

  6: required i32 byteOffset  // into tuple
  // NOTE(review): presumably the byte and the bit within that byte that hold this
  // slot's null indicator -- confirm against the code that consumes this struct.
  7: required i32 nullIndicatorByte
  8: required i32 nullIndicatorBit
  9: required i32 slotIdx
  // Defaults to NONE when the sender does not set it explicitly.
  10: required CatalogObjects.TVirtualColumnType virtual_col_type =
      CatalogObjects.TVirtualColumnType.NONE
  // The path includes column / field names materialized by a scan. This is set for
  // producing the tuple cache key, because the names of columns / fields determine
  // behavior when resolving Parquet columns/fields by name. This information is
  // provided by other structures for the executor, so it only needs to be set for
  // the tuple cache.
  11: optional string path
  // If this is in a struct, this is the index of the field within that struct. This
  // corresponds to the final entry in the absolute path. The materialized path is
  // sometimes truncated, so it may not contain this information. This value is not
  // interesting if this slot is not inside a struct.
  12: optional i32 structFieldIdx
}
|
|
|
|
// Describes one column by name and type. The Iceberg field ids are optional and are
// left unset when not applicable.
struct TColumnDescriptor {
  1: required string name
  2: required Types.TColumnType type

  // Field id of an iceberg column.
  3: optional i32 icebergFieldId
  // Key and value field id for Iceberg column with Map type.
  4: optional i32 icebergFieldMapKeyId
  5: optional i32 icebergFieldMapValueId
}
|
|
|
|
// "Union" of all table types.
//
// The common fields (id, type, columns, names) are required; each table-type-specific
// payload is optional. NOTE(review): presumably only the payload matching 'tableType'
// is populated -- confirm against the code that fills in this struct.
//
// Field ids are intentionally not in declaration order (7 and 8 follow 13). Thrift
// identifies fields by id, not by position, so the ids must be kept exactly as they are.
struct TTableDescriptor {
  // Query local id assigned in DescriptorTable:toThrift()
  1: required Types.TTableId id
  2: required CatalogObjects.TTableType tableType
  // Clustering/partition columns come first.
  3: required list<TColumnDescriptor> columnDescriptors
  4: required i32 numClusteringCols

  // Table-type-specific payloads.
  5: optional CatalogObjects.THdfsTable hdfsTable
  6: optional CatalogObjects.THBaseTable hbaseTable
  9: optional CatalogObjects.TDataSourceTable dataSourceTable
  10: optional CatalogObjects.TKuduTable kuduTable
  11: optional CatalogObjects.TIcebergTable icebergTable
  12: optional CatalogObjects.TSystemTable systemTable
  13: optional CatalogObjects.TPaimonTable paimonTable

  // Unqualified name of table
  7: required string tableName

  // Name of the database that the table belongs to
  8: required string dbName
}
|
|
|
|
// Describes one tuple: its id, its size in bytes, and optionally the table it is
// associated with.
struct TTupleDescriptor {
  1: required Types.TTupleId id
  2: required i32 byteSize
  // NOTE(review): presumably the number of bytes holding the null indicators referenced
  // by TSlotDescriptor.nullIndicatorByte -- confirm against the consumers.
  3: required i32 numNullBytes
  // Id of a TTableDescriptor; unset when the tuple has no associated table.
  4: optional Types.TTableId tableId

  // Absolute path into the table schema pointing to the collection whose fields
  // are materialized into this tuple. Non-empty if this tuple belongs to a
  // nested collection, empty otherwise.
  5: optional list<i32> tuplePath
}
|
|
|
|
// Bundles the slot, tuple and table descriptors together. Only the tuple descriptors
// are required; the slot and table lists may be left unset.
struct TDescriptorTable {
  1: optional list<TSlotDescriptor> slotDescriptors
  2: required list<TTupleDescriptor> tupleDescriptors

  // TTableDescriptor(s) referenced by tupleDescriptors and scan nodes in
  // the fragment.
  3: optional list<TTableDescriptor> tableDescriptors
}
|
|
|
|
// Binary blob containing a serialized TDescriptorTable. See desc_tbl_* fields on
// TQueryCtx for more context on when this is used.
//
// NOTE(review): the cpp.customostream annotation presumably makes the C++ generator
// declare, but not define, the stream-output routine so that a hand-written one can be
// supplied -- confirm against the Thrift C++ generator documentation.
struct TDescriptorTableSerialized {
  // TDescriptorTable serialized
  1: required binary thrift_desc_tbl
} (cpp.customostream)
|