Files
impala/testdata/avro_schema_resolution/create_table.sql
ishaan dee6911b20 Enable loading metadata from the hive metastore snapshot and cleanup build scripts.
This patch contains the following changes:
  - Add a metastore_snapshot_file parameter to build.sh
  - Enable skipping loading the metadata.
  - create-load-data.sh is refactored into functions.
  - A lot of scripts source impala-config, which creates a lot of log spew. This has now
    been muted.
  - Unecessary log spew from compute-table-stats has been muted.
  - build_thirdparty.sh determins its parallelism from the system, it was previously hard
    coded to 4
  - Only force load data of the particular dataset if a schema change is detected.

Change-Id: I909336451e5c1ca57d21f040eb94c0e831546837
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/5540
Reviewed-by: Ishaan Joshi <ishaan@cloudera.com>
Tested-by: jenkins
2014-12-19 13:41:00 -08:00

109 lines
4.7 KiB
SQL

USE functional_avro_snap;
DROP TABLE IF EXISTS schema_resolution_test;
-- Specify the Avro schema in SERDEPROPERTIES instead of TBLPROPERTIES to validate
-- IMPALA-538. Also, give the table a different column definition (col1, col2) than what
-- is defined in the Avro schema for testing mismatched table/deserializer schemas.
CREATE EXTERNAL TABLE schema_resolution_test (col1 string, col2 string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES ('avro.schema.literal'='{
"name": "a",
"type": "record",
"fields": [
{"name":"boolean1", "type":"boolean", "default": true},
{"name":"int1", "type":"int", "default": 1},
{"name":"long1", "type":"long", "default": 1},
{"name":"float1", "type":"float", "default": 1.0},
{"name":"double1", "type":"double", "default": 1.0},
{"name":"string1", "type":"string", "default": "default string"},
{"name":"string2", "type": ["string", "null"], "default": ""},
{"name":"string3", "type": ["null", "string"], "default": null}
]}')
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/avro_schema_resolution_test/';
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records1.avro' OVERWRITE INTO TABLE schema_resolution_test;
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records2.avro' INTO TABLE schema_resolution_test;
-- The following tables are used to test Impala's handling of HIVE-6308 which causes
-- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched
-- column definitions and Avro-schema columns. In such cases, COMPUTE STATS is expected to
-- fail in analysis and not after actually computing stats (IMPALA-867).
-- TODO: The creation of these tables should migrate into our regular data loading
-- framework. There are issues with beeline for these CREATE TABLE stmts (NPEs).
-- No explicit column definitions for an Avro table.
DROP TABLE IF EXISTS alltypes_no_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_no_coldef
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');
-- Column definition list has one more column than the Avro schema.
DROP TABLE IF EXISTS alltypes_extra_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_extra_coldef (
id int,
bool_col boolean,
tinyint_col tinyint,
smallint_col smallint,
int_col int,
bigint_col bigint,
float_col float,
double_col double,
date_string_col string,
string_col string,
timestamp_col timestamp,
extra_col string)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');
-- Column definition list is missing 'tinyint_col' and 'timestamp_col' from the Avro schema.
DROP TABLE IF EXISTS alltypes_missing_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_missing_coldef (
id int,
bool_col boolean,
smallint_col smallint,
int_col int,
bigint_col bigint,
float_col float,
double_col double,
date_string_col string,
string_col string)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');
-- Matching number of columns and column names, but mismatched type (bigint_col is a string).
DROP TABLE IF EXISTS alltypes_type_mismatch;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_type_mismatch (
id int,
bool_col boolean,
tinyint_col tinyint,
smallint_col smallint,
int_col int,
bigint_col string,
float_col float,
double_col double,
date_string_col string,
string_col string,
timestamp_col timestamp)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json');