The Iceberg format spec defines what types to use for different file formats,
e.g.: https://iceberg.apache.org/spec/#parquet

Impala should follow the specification, so this patch
 * annotates strings with UTF8 in Parquet metadata
 * removes the fixed(L) <-> CHAR(L) mapping
 * forbids INSERTs when the Iceberg schema has a TIMESTAMPTZ column

This patch also refactors the type/schema conversions, as the Impala => Iceberg
conversions were duplicated in IcebergCatalogOpExecutor and IcebergUtil.
I introduced the class 'IcebergSchemaConverter' to contain the code for
conversions.

Testing:
 * added test to check that CHAR and VARCHAR types are not allowed
 * added test that INSERTs are not allowed when the table has TIMESTAMPTZ
 * added test to check that strings are annotated with UTF8

Change-Id: I652565f82708824f5cf7497139153b06f116ccd3
Reviewed-on: http://gerrit.cloudera.org:8080/16851
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
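As a quick illustration of the restriction described above (a sketch only, not
part of the test file that follows; the table name is made up and the exact
error text may differ), DDL like this should now be rejected at analysis time,
since CHAR(L)/VARCHAR(L) no longer map to any Iceberg type:

  create table iceberg_char_tbl (c CHAR(5))
  stored as iceberg
  tblproperties('iceberg.catalog'='hadoop.tables');
  -- Expected to fail: CHAR and VARCHAR columns are not allowed in Iceberg tables.

STRING columns remain supported and are now written to Parquet with the UTF8
annotation, as the tests below exercise.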
====
---- QUERY
# Create a table that is a subset of 'alltypes' table, i.e. it only
# contains the data types supported by Iceberg.
create table iceberg_alltypes(
id INT COMMENT 'Add a comment',
bool_col BOOLEAN,
int_col INT,
bigint_col BIGINT,
float_col FLOAT,
double_col DOUBLE,
date_col DATE,
string_col STRING,
timestamp_col TIMESTAMP
)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.tables');
---- RESULTS
'Table has been created.'
====
---- QUERY
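# Cast date_string_col to DATE: functional.alltypes stores dates as STRINGs
# in 'MM/DD/YY' format, while the Iceberg table expects a DATE column.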
insert into iceberg_alltypes
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes
order by id
limit 5;
---- RESULTS
: 5
====
---- QUERY
select * from iceberg_alltypes;
---- RESULTS
0,true,0,0,0,0,2009-01-01,'0',2009-01-01 00:00:00
1,false,1,10,1.100000023841858,10.1,2009-01-01,'1',2009-01-01 00:01:00
2,true,2,20,2.200000047683716,20.2,2009-01-01,'2',2009-01-01 00:02:00.100000000
3,false,3,30,3.299999952316284,30.3,2009-01-01,'3',2009-01-01 00:03:00.300000000
4,true,4,40,4.400000095367432,40.4,2009-01-01,'4',2009-01-01 00:04:00.600000000
---- TYPES
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DATE, STRING, TIMESTAMP
====
---- QUERY
# Create table with decimal types
CREATE TABLE decimal_tbl (
d1 DECIMAL(9,0),
d2 DECIMAL(10,0),
d3 DECIMAL(20,10),
d4 DECIMAL(38,38),
d5 DECIMAL(10,5),
d6 DECIMAL(9,0)
)
STORED AS iceberg
TBLPROPERTIES('iceberg.catalog'='hadoop.tables');
---- RESULTS
'Table has been created.'
====
---- QUERY
insert into decimal_tbl select * from functional_parquet.decimal_tbl;
select * from decimal_tbl;
---- RESULTS
1234,2222,1.2345678900,0.12345678900000000000000000000000000000,12345.78900,1
2345,111,12.3456789000,0.12345678900000000000000000000000000000,3.14100,1
12345,333,123.4567890000,0.12345678900000000000000000000000000000,11.22000,1
12345,333,1234.5678900000,0.12345678900000000000000000000000000000,0.10000,1
132842,333,12345.6789000000,0.12345678900000000000000000000000000000,0.77889,1
---- TYPES
DECIMAL, DECIMAL, DECIMAL, DECIMAL, DECIMAL, DECIMAL
====
---- QUERY
# Create non-Iceberg table with INT96 nanos.
create table int96_nanos (ts timestamp) stored as parquet;
====
---- QUERY
# Insert edge values as "normal" int96 timestamps that can represent all values.
set parquet_timestamp_type=INT96_NANOS;
insert into int96_nanos values
("1400-01-01"),
("2019-01-18 00:00:00.000000001"),
("2019-01-18 00:00:00.000001"),
("2019-01-18 00:00:00.001"),
("2019-01-18 23:59:59.999"),
("2019-01-18 23:59:59.999999"),
("2019-01-18 23:59:59.999999999")
====
---- QUERY
# Iceberg should write timestamps as INT64 micros.
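# Microseconds cannot represent all of the INT96 nano values above, so the
# sub-microsecond digits are truncated and the last two rows become identical.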
create table ts_iceberg (ts timestamp) stored as iceberg
tblproperties('iceberg.catalog'='hadoop.tables');
insert into ts_iceberg select * from int96_nanos;
select * from ts_iceberg;
---- RESULTS
1400-01-01 00:00:00
2019-01-18 00:00:00
2019-01-18 00:00:00.000001000
2019-01-18 00:00:00.001000000
2019-01-18 23:59:59.999000000
2019-01-18 23:59:59.999999000
2019-01-18 23:59:59.999999000
====
---- QUERY
# Insert into hadoop catalog.
create table iceberg_hadoop_cat (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.catalog',
'iceberg.catalog_location'='/test-warehouse/$DATABASE.db/hadoop_catalog_test');
insert into iceberg_hadoop_cat values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hadoop_cat;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
show files in iceberg_hadoop_cat;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/hadoop_catalog_test/$DATABASE/iceberg_hadoop_cat/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Insert into hadoop catalog with custom table identifier.
create table iceberg_hadoop_cat_ti (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.catalog',
'iceberg.catalog_location'='/test-warehouse/$DATABASE.db/hadoop_catalog_test',
'iceberg.table_identifier'='test.custom_db.int_table');
insert into iceberg_hadoop_cat_ti values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hadoop_cat_ti;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
show files in iceberg_hadoop_cat_ti;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/hadoop_catalog_test/test/custom_db/int_table/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Insert into table stored in Iceberg's HiveCatalog
create table iceberg_hive_cat (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hive.catalog');
---- RESULTS
'Table has been created.'
====
---- QUERY
insert into iceberg_hive_cat values (7);
select * from iceberg_hive_cat;
---- RESULTS
7
---- TYPES
INT
====
---- QUERY
# Query external Iceberg table
create external table iceberg_hive_cat_ext (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/iceberg_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog',
'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
---- RESULTS
'Table has been created.'
====
---- QUERY
select * from iceberg_hive_cat_ext;
---- RESULTS
7
---- TYPES
INT
====
---- QUERY
# INSERT INTO external Iceberg table stored in HiveCatalog.
insert into iceberg_hive_cat_ext values (8);
select * from iceberg_hive_cat_ext;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# Query original table
refresh iceberg_hive_cat;
select * from iceberg_hive_cat;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# DROP external Iceberg table
drop table iceberg_hive_cat_ext
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Original table is not affected after external table drop.
refresh iceberg_hive_cat;
select * from iceberg_hive_cat;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# Create another external Iceberg table
create external table iceberg_hive_cat_ext_2 (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/iceberg_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog',
'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
select * from iceberg_hive_cat_ext_2
---- RESULTS
7
8
====
---- QUERY
# DROP the synchronized Iceberg table (data is purged).
drop table iceberg_hive_cat
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# The data has been purged, so querying the external table fails.
select * from iceberg_hive_cat_ext_2
---- CATCH
Table does not exist
====
---- QUERY
# Insert into hive catalog with custom location.
create table iceberg_hive_cat_custom_loc (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/custom_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog');
insert into iceberg_hive_cat_custom_loc values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hive_cat_custom_loc;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
show files in iceberg_hive_cat_custom_loc;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/custom_hive_cat/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====