mirror of
https://github.com/apache/impala.git
synced 2026-01-26 12:02:21 -05:00
Hive 3 changed the typical storage model for tables to split them between two directories: - hive.metastore.warehouse.dir stores managed tables (which is now defined to be only transactional tables) - hive.metastore.warehouse.external.dir stores external tables (everything that is not a transactional table) In more recent commits of Hive, there is now validation that the external tables cannot be stored in the managed directory. In order to adopt these newer versions of Hive, we need to use separate directories for external vs managed warehouses. Most of our test tables are not transactional, so they would reside in the external directory. To keep the test changes small, this uses /test-warehouse for the external directory and /test-warehouse/managed for the managed directory. Having the managed directory be a subdirectory of /test-warehouse means that the data snapshot code should not need to change. The Hive 2 configuration doesn't change as it does not have this concept. Since this changes the dataload layout, this also sets the CDH_MAJOR_VERSION to 7 for USE_CDP_HIVE=true. This means that dataload will uses a separate location for data as compared to USE_CDP_HIVE=false. That should reduce conflicts between the two configurations. Testing: - Ran exhaustive tests with USE_CDP_HIVE=false - Ran exhaustive tests with USE_CDP_HIVE=true (with current Hive version) - Verified that dataload succeeds and tests are able to run with a newer Hive version. Change-Id: I3db69f1b8ca07ae98670429954f5f7a1a359eaec Reviewed-on: http://gerrit.cloudera.org:8080/15026 Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
220 lines
8.8 KiB
SQL
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
--   http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.
|
|
-- NOTE: Queries in this file have to be executed against Hive and some won't work with
-- Impala due to different type compatibility rules.

USE functional_avro_snap;

-- Recreate the schema-resolution test table from scratch on every run.
DROP TABLE IF EXISTS schema_resolution_test;
-- Specify the Avro schema in SERDEPROPERTIES instead of TBLPROPERTIES to validate
-- IMPALA-538. Also, give the table a different column definition (col1, col2) than what
-- is defined in the Avro schema for testing mismatched table/deserializer schemas.
CREATE EXTERNAL TABLE schema_resolution_test (col1 string, col2 string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
WITH SERDEPROPERTIES ('avro.schema.literal'='{
"name": "a",
"type": "record",
"fields": [
{"name":"boolean1", "type":"boolean", "default": true},
{"name":"int1", "type":"int", "default": 1},
{"name":"long1", "type":"long", "default": 1},
{"name":"float1", "type":"float", "default": 1.0},
{"name":"double1", "type":"double", "default": 1.0},
{"name":"date1", "type": {"type": "int", "logicalType": "date"}, "default": 1},
{"name":"string1", "type":"string", "default": "default string"},
{"name":"string2", "type": ["string", "null"], "default": ""},
{"name":"string3", "type": ["null", "string"], "default": null}
]}')
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/avro_schema_resolution_test/';
|
-- Load both Avro data files; the first load OVERWRITEs any preexisting files so the
-- table contents are deterministic across reruns.
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records1.avro' OVERWRITE INTO TABLE schema_resolution_test;
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records2.avro' INTO TABLE schema_resolution_test;
|
-- The following tables are used to test Impala's handling of HIVE-6308 which causes
-- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched
-- column definitions and Avro-schema columns. In such cases, COMPUTE STATS is expected to
-- fail in analysis and not after actually computing stats (IMPALA-867).
-- TODO: The creation of these tables should migrate into our regular data loading
-- framework. There are issues with beeline for these CREATE TABLE stmts (NPEs).

-- No explicit column definitions for an Avro table.
DROP TABLE IF EXISTS alltypes_no_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_no_coldef
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');
|
-- Column definition list has one more column (extra_col) than the Avro schema.
DROP TABLE IF EXISTS alltypes_extra_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_extra_coldef (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp,
  extra_col string)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');
|
-- Column definition list is missing 'tinyint_col' and 'timestamp_col' from the Avro schema.
DROP TABLE IF EXISTS alltypes_missing_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_missing_coldef (
  id int,
  bool_col boolean,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');
|
-- Matching number of columns and column names, but mismatched type (bigint_col is a string).
DROP TABLE IF EXISTS alltypes_type_mismatch;
CREATE EXTERNAL TABLE IF NOT EXISTS alltypes_type_mismatch (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col string,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
PARTITIONED BY (year int, month int)
STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
LOCATION '/test-warehouse/alltypes_avro_snap'
TBLPROPERTIES ('avro.schema.url'='${env:DEFAULT_FS}/test-warehouse/avro_schemas/functional/alltypes.json');
|
-- IMPALA-2798, create two avro tables with same underlying data location, one table
-- has an extra column at the end. Validate Impala doesn't use codegen decoding function
-- if table schema is not the same as file schema.
DROP TABLE IF EXISTS avro_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS avro_coldef (
  bool_col boolean,
  tinyint_col int,
  smallint_col int,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
PARTITIONED BY (year int, month int)
STORED AS avro
LOCATION '/test-warehouse/avro_coldef_snap'
TBLPROPERTIES ('avro.schema.literal'='{
"name": "a",
"type": "record",
"fields": [
{"name":"bool_col", "type":"boolean"},
{"name":"tinyint_col", "type":"int"},
{"name":"smallint_col", "type":"int"},
{"name":"int_col", "type":"int"},
{"name":"bigint_col", "type":"long"},
{"name":"float_col", "type":"float"},
{"name":"double_col", "type": "double"},
{"name":"date_string_col", "type": "string"},
{"name":"string_col", "type": "string"},
{"name":"timestamp_col", "type": "long"}
]}');

-- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
-- any preexisting data files, which in turn will fail the query.
MSCK REPAIR TABLE avro_coldef;
|
-- Populate a single partition with a small, deterministic sample (first 5 rows by id).
INSERT OVERWRITE TABLE avro_coldef PARTITION(year=2014, month=1)
SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
       float_col, double_col, date_string_col, string_col, timestamp_col
FROM (select * from functional.alltypes order by id limit 5) a;
|
-- Second table over the same data location as avro_coldef, with one extra trailing
-- column (extra_col) in both the column list and the Avro schema.
DROP TABLE IF EXISTS avro_extra_coldef;
CREATE EXTERNAL TABLE IF NOT EXISTS avro_extra_coldef (
  bool_col boolean,
  tinyint_col int,
  smallint_col int,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp,
  extra_col string)
PARTITIONED BY (year int, month int)
STORED AS avro
LOCATION '/test-warehouse/avro_coldef_snap'
TBLPROPERTIES ('avro.schema.literal'='{
"name": "a",
"type": "record",
"fields": [
{"name":"bool_col", "type":"boolean"},
{"name":"tinyint_col", "type":"int"},
{"name":"smallint_col", "type":"int"},
{"name":"int_col", "type":"int"},
{"name":"bigint_col", "type":"long"},
{"name":"float_col", "type":"float"},
{"name":"double_col", "type": "double"},
{"name":"date_string_col", "type": "string"},
{"name":"string_col", "type": "string"},
{"name":"timestamp_col", "type": "long"},
{"name":"extra_col", "type": "string", "default": "null"}
]}');

-- Reload existing partitions from HDFS. Without this, the overwrite will fail to remove
-- any preexisting data files, which in turn will fail the query.
MSCK REPAIR TABLE avro_extra_coldef;
|
-- Populate a second partition through the wider table, filling extra_col with a constant.
INSERT OVERWRITE TABLE avro_extra_coldef PARTITION(year=2014, month=2)
SELECT bool_col, tinyint_col, smallint_col, int_col, bigint_col,
       float_col, double_col, date_string_col, string_col,
       timestamp_col, "avro" AS extra_col
FROM (select * from functional.alltypes order by id limit 5) a;

-- Reload the partitions for the first table once again. This will make sure that the new
-- partition from the second insert shows up in the first table, too.
MSCK REPAIR TABLE avro_coldef;