impala/testdata/workloads/functional-query/queries/QueryTest/iceberg-insert.test

commit fabe994d1f
Author: Attila Jeges

IMPALA-10627: Use standard parquet-related Iceberg table properties
This patch adds support for the following standard Iceberg properties:

write.parquet.compression-codec:
  Parquet compression codec. Supported values are: NONE, GZIP, SNAPPY
  (default value), LZ4, ZSTD. The table property will be ignored if
  the COMPRESSION_CODEC query option is set.

write.parquet.compression-level:
  Parquet compression level. Used with ZSTD compression only.
  Supported range is [1, 22]. Default value is 3. The table property
  will be ignored if the COMPRESSION_CODEC query option is set.

write.parquet.row-group-size-bytes:
  Parquet row group size in bytes. Supported range is [8388608,
  2146435072] (8MB - 2047MB). The table property will be ignored if
  the PARQUET_FILE_SIZE query option is set.
  If neither the table property nor the PARQUET_FILE_SIZE query option
  is set, the way Impala calculates row group size will remain
  unchanged.

write.parquet.page-size-bytes:
  Parquet page size in bytes. Used for PLAIN encoding. Supported range
  is [65536, 1073741824] (64KB - 1GB).
  If the table property is unset, the way Impala calculates page size
  will remain unchanged.

write.parquet.dict-size-bytes:
  Parquet dictionary page size in bytes. Used for dictionary encoding.
  Supported range is [65536, 1073741824] (64KB - 1GB).
  If the table property is unset, the way Impala calculates dictionary
  page size will remain unchanged.
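
For illustration, these properties could be combined as follows (a
minimal sketch; 'events' is a hypothetical table name):

  create table events (id int, msg string)
  stored as iceberg
  tblproperties('write.parquet.compression-codec'='zstd',
                'write.parquet.compression-level'='5',
                'write.parquet.row-group-size-bytes'='134217728');

  -- A query option, when set, takes precedence over the table property:
  set COMPRESSION_CODEC=snappy;
  insert into events values (1, 'a');  -- written with SNAPPY, not ZSTD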

This patch also renames the 'iceberg.file_format' table property to
'write.format.default', which is the standard Iceberg name for the
table property.
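
For example (a sketch; 't' is a hypothetical table name):

  -- Before this patch:
  --   create table t (i int) stored as iceberg
  --   tblproperties('iceberg.file_format'='parquet');
  -- After this patch:
  create table t (i int) stored as iceberg
  tblproperties('write.format.default'='parquet');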

Change-Id: I3b8aa9a52c13c41b48310d2f7c9c7426e1ff5f23
Reviewed-on: http://gerrit.cloudera.org:8080/17654
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Date: 2021-07-20 23:58:06 +00:00

====
---- QUERY
# Create a table that is a subset of 'alltypes' table, i.e. it only
# contains the data types supported by Iceberg.
create table iceberg_alltypes(
id INT COMMENT 'Add a comment',
bool_col BOOLEAN,
int_col INT,
bigint_col BIGINT,
float_col FLOAT,
double_col DOUBLE,
date_col DATE,
string_col STRING,
timestamp_col TIMESTAMP
)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.tables');
---- RESULTS
'Table has been created.'
====
---- QUERY
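# Insert the first five rows of functional.alltypes, casting the
# string date column to DATE.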
insert into iceberg_alltypes
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes
order by id
limit 5;
---- RESULTS
: 5
====
---- QUERY
select * from iceberg_alltypes;
---- RESULTS
0,true,0,0,0,0,2009-01-01,'0',2009-01-01 00:00:00
1,false,1,10,1.100000023841858,10.1,2009-01-01,'1',2009-01-01 00:01:00
2,true,2,20,2.200000047683716,20.2,2009-01-01,'2',2009-01-01 00:02:00.100000000
3,false,3,30,3.299999952316284,30.3,2009-01-01,'3',2009-01-01 00:03:00.300000000
4,true,4,40,4.400000095367432,40.4,2009-01-01,'4',2009-01-01 00:04:00.600000000
---- TYPES
INT, BOOLEAN, INT, BIGINT, FLOAT, DOUBLE, DATE, STRING, TIMESTAMP
====
---- QUERY
# Create table with decimal types
CREATE TABLE decimal_tbl (
d1 DECIMAL(9,0),
d2 DECIMAL(10,0),
d3 DECIMAL(20,10),
d4 DECIMAL(38,38),
d5 DECIMAL(10,5),
d6 DECIMAL(9,0)
)
STORED AS iceberg
TBLPROPERTIES('iceberg.catalog'='hadoop.tables');
---- RESULTS
'Table has been created.'
====
---- QUERY
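# Insert decimal values from the functional Parquet table and read
# them back.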
insert into decimal_tbl select * from functional_parquet.decimal_tbl;
select * from decimal_tbl;
---- RESULTS
1234,2222,1.2345678900,0.12345678900000000000000000000000000000,12345.78900,1
2345,111,12.3456789000,0.12345678900000000000000000000000000000,3.14100,1
12345,333,123.4567890000,0.12345678900000000000000000000000000000,11.22000,1
12345,333,1234.5678900000,0.12345678900000000000000000000000000000,0.10000,1
132842,333,12345.6789000000,0.12345678900000000000000000000000000000,0.77889,1
---- TYPES
DECIMAL, DECIMAL, DECIMAL, DECIMAL, DECIMAL, DECIMAL
====
---- QUERY
# Create non-Iceberg table with INT96 nanos.
create table int96_nanos (ts timestamp) stored as parquet;
====
---- QUERY
# Insert edge values as "normal" INT96 timestamps, which can represent
# all of these values exactly.
set parquet_timestamp_type=INT96_NANOS;
insert into int96_nanos values
("1400-01-01"),
("2019-01-18 00:00:00.000000001"),
("2019-01-18 00:00:00.000001"),
("2019-01-18 00:00:00.001"),
("2019-01-18 23:59:59.999"),
("2019-01-18 23:59:59.999999"),
("2019-01-18 23:59:59.999999999")
====
---- QUERY
# Iceberg should write timestamps as INT64 micros, so nanosecond
# precision is truncated to microseconds in the results below.
create table ts_iceberg (ts timestamp) stored as iceberg
tblproperties('iceberg.catalog'='hadoop.tables');
insert into ts_iceberg select * from int96_nanos;
select * from ts_iceberg;
---- RESULTS
1400-01-01 00:00:00
2019-01-18 00:00:00
2019-01-18 00:00:00.000001000
2019-01-18 00:00:00.001000000
2019-01-18 23:59:59.999000000
2019-01-18 23:59:59.999999000
2019-01-18 23:59:59.999999000
====
---- QUERY
# Insert into hadoop catalog.
create table iceberg_hadoop_cat (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.catalog',
'iceberg.catalog_location'='/test-warehouse/$DATABASE.db/hadoop_catalog_test');
insert into iceberg_hadoop_cat values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hadoop_cat;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
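# Verify that the data file was written under the Hadoop catalog
# location.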
show files in iceberg_hadoop_cat;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/hadoop_catalog_test/$DATABASE/iceberg_hadoop_cat/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Insert into hadoop catalog with custom table identifier.
create table iceberg_hadoop_cat_ti (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hadoop.catalog',
'iceberg.catalog_location'='/test-warehouse/$DATABASE.db/hadoop_catalog_test',
'iceberg.table_identifier'='test.custom_db.int_table');
insert into iceberg_hadoop_cat_ti values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hadoop_cat_ti;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
show files in iceberg_hadoop_cat_ti;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/hadoop_catalog_test/test/custom_db/int_table/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Insert into table stored in Iceberg's HiveCatalog
create table iceberg_hive_cat (i int)
stored as iceberg
tblproperties('iceberg.catalog'='hive.catalog');
---- RESULTS
'Table has been created.'
====
---- QUERY
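# Insert into the HiveCatalog table and read the row back.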
insert into iceberg_hive_cat values (7);
select * from iceberg_hive_cat;
---- RESULTS
7
---- TYPES
INT
====
---- QUERY
# Query external Iceberg table
create external table iceberg_hive_cat_ext (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/iceberg_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog',
'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
---- RESULTS
'Table has been created.'
====
---- QUERY
select * from iceberg_hive_cat_ext;
---- RESULTS
7
---- TYPES
INT
====
---- QUERY
# INSERT INTO external Iceberg table stored in HiveCatalog.
insert into iceberg_hive_cat_ext values (8);
select * from iceberg_hive_cat_ext;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# Query original table
refresh iceberg_hive_cat;
select * from iceberg_hive_cat;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# DROP external Iceberg table
drop table iceberg_hive_cat_ext
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# Original table is not affected after external table drop.
refresh iceberg_hive_cat;
select * from iceberg_hive_cat;
---- RESULTS
7
8
---- TYPES
INT
====
---- QUERY
# Create another external Iceberg table
create external table iceberg_hive_cat_ext_2 (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/iceberg_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog',
'iceberg.table_identifier'='$DATABASE.iceberg_hive_cat');
select * from iceberg_hive_cat_ext_2
---- RESULTS
7
8
====
---- QUERY
# DROP the synchronized Iceberg table (data is purged).
drop table iceberg_hive_cat
---- RESULTS
'Table has been dropped.'
====
---- QUERY
# The data has been purged, so querying the external table fails.
select * from iceberg_hive_cat_ext_2
---- CATCH
Table does not exist
====
---- QUERY
# Insert into hive catalog with custom location.
create table iceberg_hive_cat_custom_loc (i int)
stored as iceberg
location '/test-warehouse/$DATABASE.db/custom_hive_cat'
tblproperties('iceberg.catalog'='hive.catalog');
insert into iceberg_hive_cat_custom_loc values (1), (2), (3);
---- RESULTS
: 3
====
---- QUERY
select * from iceberg_hive_cat_custom_loc;
---- RESULTS
1
2
3
---- TYPES
INT
====
---- QUERY
show files in iceberg_hive_cat_custom_loc;
---- RESULTS: VERIFY_IS_SUBSET
row_regex:'$NAMENODE/test-warehouse/$DATABASE.db/custom_hive_cat/data/.*.0.parq','.*',''
---- TYPES
STRING, STRING, STRING
====
---- QUERY
# Create a table that is a subset of 'alltypes' table, i.e. it only
# contains the data types supported by Iceberg.
create table iceberg_alltypes_parq_tblprop(
id INT COMMENT 'Add a comment',
bool_col BOOLEAN,
int_col INT,
bigint_col BIGINT,
float_col FLOAT,
double_col DOUBLE,
date_col DATE,
string_col STRING,
timestamp_col TIMESTAMP
)
stored as iceberg
tblproperties('write.format.default'='parquet',
'write.parquet.row-group-size-bytes'='8388608',
'write.parquet.compression-codec'='gzip',
'write.parquet.page-size-bytes'='65536',
'write.parquet.dict-size-bytes'='1073741824'
);
---- RESULTS
'Table has been created.'
====
---- QUERY
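# Insert using the Parquet writer settings from the table properties.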
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
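# Change the Parquet writer settings; subsequent inserts should pick
# up the new values.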
alter table iceberg_alltypes_parq_tblprop set tblproperties (
'write.parquet.row-group-size-bytes'='536870912',
'write.parquet.compression-codec'='none',
'write.parquet.page-size-bytes'='134217728',
'write.parquet.dict-size-bytes'='805306368');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
alter table iceberg_alltypes_parq_tblprop set tblproperties (
'write.parquet.row-group-size-bytes'='1073741824',
'write.parquet.compression-codec'='zstd',
'write.parquet.compression-level'='1',
'write.parquet.page-size-bytes'='402653184',
'write.parquet.dict-size-bytes'='536870912');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
alter table iceberg_alltypes_parq_tblprop set tblproperties (
'write.parquet.row-group-size-bytes'='1610612736',
'write.parquet.compression-codec'='zstd',
'write.parquet.compression-level'='13',
'write.parquet.page-size-bytes'='536870912',
'write.parquet.dict-size-bytes'='402653184');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
alter table iceberg_alltypes_parq_tblprop set tblproperties (
'write.parquet.row-group-size-bytes'='1879048192',
'write.parquet.compression-codec'='zstd',
'write.parquet.compression-level'='18',
'write.parquet.page-size-bytes'='805306368',
'write.parquet.dict-size-bytes'='134217728');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
alter table iceberg_alltypes_parq_tblprop set tblproperties (
'write.parquet.row-group-size-bytes'='2146435072',
'write.parquet.compression-codec'='zstd',
'write.parquet.compression-level'='22',
'write.parquet.page-size-bytes'='1073741824',
'write.parquet.dict-size-bytes'='65536');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
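# Unset all Parquet writer properties; Impala falls back to its
# default write behavior.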
alter table iceberg_alltypes_parq_tblprop unset tblproperties (
'write.parquet.row-group-size-bytes',
'write.parquet.compression-codec',
'write.parquet.compression-level',
'write.parquet.page-size-bytes',
'write.parquet.dict-size-bytes');
====
---- QUERY
insert into iceberg_alltypes_parq_tblprop
select id, bool_col, int_col, bigint_col, float_col, double_col,
CAST(date_string_col as date FORMAT 'MM/DD/YY'), string_col, timestamp_col
from functional.alltypes;
---- RESULTS
: 7300
====
---- QUERY
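# Seven inserts of 7300 rows each: 7 * 7300 = 51100.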
select count(*) from iceberg_alltypes_parq_tblprop;
---- RESULTS
51100
---- TYPES
BIGINT
====