mirror of
https://github.com/apache/impala.git
synced 2025-12-30 03:01:44 -05:00
Avro tables that were not created with a column-definition list do not have their columns properly populated in the Metastore backend DB (HIVE-6308). For such tables COMPUTE STATS and Hive's ANALYZE TABLE cannot succeed. This patch fails COMPUTE STATS in analysis for such broken Avro tables and adds tests for Avro tables with a mismatched column-definition list and Avro schema. Change-Id: I561ecea944ae2f83d69950b7a1ab9edaa89bdcea Reviewed-on: http://gerrit.ent.cloudera.com:8080/1892 Reviewed-by: Alex Behm <alex.behm@cloudera.com> Tested-by: jenkins Reviewed-on: http://gerrit.ent.cloudera.com:8080/1920
108 lines
4.6 KiB
SQL
108 lines
4.6 KiB
SQL
USE functional_avro_snap;

-- Remove any previous incarnation of the table before it is re-created below.
DROP TABLE IF EXISTS schema_resolution_test;

-- Fixture for IMPALA-538: the Avro schema is supplied through SERDEPROPERTIES rather
-- than TBLPROPERTIES. The declared column list (col1, col2) deliberately disagrees
-- with the fields of the Avro schema, so that mismatched table/deserializer schemas
-- are exercised as well.
-- NOTE: the lines of the schema literal are part of a string value, which is why they
-- are not indented along with the surrounding clauses.
CREATE EXTERNAL TABLE schema_resolution_test (
  col1 string,
  col2 string
)
  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
  WITH SERDEPROPERTIES ('avro.schema.literal'='{
"name": "a",
"type": "record",
"fields": [
{"name":"boolean1", "type":"boolean", "default": true},
{"name":"int1", "type":"int", "default": 1},
{"name":"long1", "type":"long", "default": 1},
{"name":"float1", "type":"float", "default": 1.0},
{"name":"double1", "type":"double", "default": 1.0},
{"name":"string1", "type":"string", "default": "default string"},
{"name":"string2", "type": ["string", "null"], "default": ""},
{"name":"string3", "type": ["null", "string"], "default": null}
]}')
  STORED AS
    INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
  LOCATION '${hiveconf:hive.metastore.warehouse.dir}/avro_schema_resolution_test/';

-- Populate schema_resolution_test from two local Avro files: the first load replaces
-- whatever the table currently holds (OVERWRITE), the second one adds to it.
LOAD DATA LOCAL INPATH 'records1.avro'
  OVERWRITE INTO TABLE schema_resolution_test;
LOAD DATA LOCAL INPATH 'records2.avro'
  INTO TABLE schema_resolution_test;

-- The following tables are used to test Impala's handling of HIVE-6308 which causes
-- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched
-- column definitions and Avro-schema columns. In such cases, COMPUTE STATS is expected to
-- fail in analysis and not after actually computing stats (IMPALA-867).
-- TODO: The creation of these tables should migrate into our regular data loading
-- framework. There are issues with beeline for these CREATE TABLE stmts (NPEs).

-- No explicit column definitions for an Avro table: no column list is given, only the
-- Avro schema referenced through 'avro.schema.url'.
-- The table is declared as partitioned by (year, month), like the other tables in this
-- file that are defined over the same LOCATION; without the partition columns the
-- table definition would not line up with that shared directory.
DROP TABLE IF EXISTS alltypes_no_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_no_coldef
PARTITIONED BY (year int, month int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');

-- Column definition list has one more column than the Avro schema: 'extra_col' is
-- declared here in addition to the other columns.
-- The AvroSerDe is named explicitly, as for the other Avro tables in this file that
-- carry a ROW FORMAT clause. STORED AS INPUTFORMAT/OUTPUTFORMAT only selects the file
-- format classes; it does not select the SerDe.
DROP TABLE IF EXISTS alltypes_extra_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_extra_coldef (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp,
  extra_col string)
PARTITIONED BY (year int, month int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');

-- Column definition list is missing 'tinyint_col' and 'timestamp_col' from the Avro schema.
-- The AvroSerDe is named explicitly, as for the other Avro tables in this file that
-- carry a ROW FORMAT clause. STORED AS INPUTFORMAT/OUTPUTFORMAT only selects the file
-- format classes; it does not select the SerDe.
DROP TABLE IF EXISTS alltypes_missing_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_missing_coldef (
  id int,
  bool_col boolean,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string)
PARTITIONED BY (year int, month int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');

-- Matching number of columns and column names, but mismatched type (bigint_col is a string).
-- The AvroSerDe is named explicitly, as for the other Avro tables in this file that
-- carry a ROW FORMAT clause. STORED AS INPUTFORMAT/OUTPUTFORMAT only selects the file
-- format classes; it does not select the SerDe.
DROP TABLE IF EXISTS alltypes_type_mismatch;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_type_mismatch (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col string,  -- deliberately declared as string; this is the mismatch under test
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
PARTITIONED BY (year int, month int)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');