IMPALA-13284: Loading test data on Apache Hive3
There are several failures when loading test data on Apache Hive 3.1.3:
- STORED AS JSONFILE is not supported.
- STORED BY ICEBERG is not supported. Similarly, STORED BY ICEBERG
  STORED AS AVRO is not supported.
- The iceberg-hive-runtime jar is missing from the CLASSPATH of HMS and
  Tez jobs.
- Creating a table in Impala is not translated to an EXTERNAL table in
  HMS.
- Hive INSERT on insert-only tables fails when generating InsertEvents
  (HIVE-20067).
This patch fixes the syntax issues by using the older syntax of Apache
Hive 3.1.3 (illustrated below):
- Convert STORED AS JSONFILE to ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.JsonSerDe'
- Convert STORED BY ICEBERG to STORED BY
  'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
- Convert STORED BY ICEBERG STORED AS AVRO to the above, plus
  tblproperties('write.format.default'='avro')
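
As a rough sketch of the rewrites in DDL terms (the table names here
are hypothetical, not from the test schema):

  -- Rejected by Apache Hive 3.1.3; spell out the SerDe instead. The
  -- file format stays text, which is what JSONFILE uses underneath.
  CREATE TABLE demo_json (id INT)
  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe';

  -- STORED BY ICEBERG becomes the storage-handler spelling.
  CREATE TABLE demo_iceberg (id INT)
  STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler';

  -- STORED BY ICEBERG STORED AS AVRO folds the file format into a
  -- table property.
  CREATE TABLE demo_iceberg_avro (id INT)
  STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
  TBLPROPERTIES('write.format.default'='avro');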
Most of the conversions are done in generate-schema-statements.py. The
one exception is testdata/bin/load-dependent-tables.sql, for which we
generate a converted copy of the file before running it.
The missing iceberg-hive-runtime jar is added to HIVE_AUX_JARS_PATH in
bin/impala-config.sh. Note that this is only needed by Apache Hive3,
since CDP Hive3 ships the hive-iceberg-handler jar in its lib folder.
To fix the InsertEvent failures, we add the patch for HIVE-20067 and
modify testdata/bin/patch_hive.sh to also recompile the
standalone-metastore submodule.
Also modified some statements in
testdata/datasets/functional/functional_schema_template.sql to be more
reliable on retry (see the sketch below).
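
A minimal sketch of that retry pattern, with hypothetical view and
table names: IF NOT EXISTS is ignored for CREATE MATERIALIZED VIEW in
Apache Hive3 (HIVE-20462, HIVE-21675), so a bare re-run would fail;
dropping first makes the section idempotent.

  -- Drop unconditionally, then create without IF NOT EXISTS.
  DROP MATERIALIZED VIEW IF EXISTS demo_db.demo_mv;
  CREATE MATERIALIZED VIEW demo_db.demo_mv
  AS SELECT * FROM demo_db.insert_only_source;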
Tests:
- Verified that the test data can be loaded in ubuntu-20.04-from-scratch.
Change-Id: I8f52c91602da8822b0f46f19dc4111c7187ce400
Reviewed-on: http://gerrit.cloudera.org:8080/21657
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
bin/impala-config.sh:
@@ -1025,6 +1025,13 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/src/test/resources"
 
 export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar"
 
 export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER"
+# Add the jar of iceberg-hive-runtime to have HiveIcebergStorageHandler.
+# Only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its
+# lib folder.
+if $USE_APACHE_HIVE; then
+  export HIVE_AUX_JARS_PATH="$HIVE_AUX_JARS_PATH:\
+$IMPALA_HOME/fe/target/dependency/iceberg-hive-runtime-${IMPALA_ICEBERG_VERSION}.jar"
+fi
 export AUX_CLASSPATH=""
 ### Tell hive not to use jline
 export HADOOP_USER_CLASSPATH_FIRST=true
testdata/bin/create-load-data.sh:
@@ -382,8 +382,15 @@ function copy-and-load-dependent-tables {
 
   # For tables that rely on loading data from local fs test-warehouse
   # TODO: Find a good way to integrate this with the normal data loading scripts
-  beeline -n $USER -u "${JDBC_URL}" -f\
-    ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  if $USE_APACHE_HIVE; then
+    # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899)
+    NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql
+    sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'/"\
+      $SQL_FILE > $NEW_SQL_FILE
+    SQL_FILE=$NEW_SQL_FILE
+  fi
+  beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE
 }
 
 function create-internal-hbase-table {
testdata/bin/generate-schema-statements.py:
@@ -232,7 +232,7 @@ HINT_SHUFFLE = "/* +shuffle, clustered */"
 
 def build_create_statement(table_template, table_name, db_name, db_suffix,
                            file_format, compression, hdfs_location,
-                           force_reload):
+                           force_reload, is_hive_stmt):
   create_stmt = ''
   if (force_reload):
     tbl_type = 'TABLE'
@@ -247,12 +247,34 @@ def build_create_statement(table_template, table_name, db_name, db_suffix,
     # Remove location part from the format string
     table_template = table_template.replace("LOCATION '{hdfs_location}'", "")
 
-  create_stmt += table_template.format(
+  stmt = table_template.format(
      db_name=db_name,
      db_suffix=db_suffix,
      table_name=table_name,
      file_format=FILE_FORMAT_TO_STORED_AS_MAP[file_format],
      hdfs_location=hdfs_location)
+  # Apache Hive 3.1 doesn't support "STORED BY ICEBERG STORED AS AVRO" and
+  # "STORED AS JSONFILE" (HIVE-25162, HIVE-19899)
+  if is_hive_stmt and os.environ['USE_APACHE_HIVE'] == "true":
+    if "STORED AS JSONFILE" in stmt:
+      stmt = stmt.replace("STORED AS JSONFILE",
+          "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'")
+    elif "STORED BY ICEBERG" in stmt:
+      if "STORED AS" not in stmt:
+        stmt = stmt.replace(
+            "STORED BY ICEBERG",
+            "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'")
+      else:
+        assert "TBLPROPERTIES" not in stmt,\
+            ("Cannot convert STORED BY ICEBERG STORED AS file_format with TBLPROPERTIES "
+             "also in the statement:\n" + stmt)
+        iceberg_file_format = re.search(r"STORED AS (\w+)", stmt).group(1)
+        stmt = re.sub(r"STORED BY ICEBERG\s+STORED AS \w+",
+            ("STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'"
+             " TBLPROPERTIES('write.format.default'='{}')").format(
+                iceberg_file_format),
+            stmt)
+  create_stmt += stmt
   return create_stmt
@@ -835,7 +857,8 @@ def generate_statements(output_name, test_vectors, sections,
 
     if table_template:
       output.create.append(build_create_statement(table_template, table_name, db_name,
-          db_suffix, create_file_format, create_codec, data_path, force_reload))
+          db_suffix, create_file_format, create_codec, data_path, force_reload,
+          create_hive))
     # HBASE create table
     if file_format == 'hbase':
       # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd'
testdata/bin/load-dependent-tables.sql:
@@ -94,7 +94,6 @@ LOCATION '/test-warehouse/chars_formats_text';
 
 DROP TABLE IF EXISTS functional_json.chars_formats;
 CREATE EXTERNAL TABLE functional_json.chars_formats
 (cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
-ROW FORMAT delimited fields terminated by ',' escaped by '\\'
 STORED AS JSONFILE
 LOCATION '/test-warehouse/chars_formats_json';
testdata/bin/patch_hive.sh:
@@ -75,7 +75,8 @@ done
 # 3. Repackage the hive submodules affected by the patch
 if [[ "${HIVE_REBUILD}" = "true" ]]; then
   echo "Repackage the hive-exec module"
-  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql clean package -Dmaven.test.skip
+  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql,standalone-metastore clean package \
+    -Dmaven.test.skip
   cp $HIVE_SRC_DIR/ql/target/hive-exec-${APACHE_HIVE_VERSION}.jar $HIVE_HOME/lib/
 fi
 popd
testdata/cluster/hive/README:
@@ -10,3 +10,6 @@ Bump guava version to 28.1-jre. Fix HIVE-22717.
 
 patch2-HIVE-20038.diff:
 Update queries on non-bucketed + partitioned tables throws NPE
+
+patch3-HIVE-20067.diff:
+Fix failures in firing InsertEvent on insert-only tables
testdata/cluster/hive/patch3-HIVE-20067.diff (new file):
@@ -0,0 +1,46 @@
diff --git a/ql/src/test/queries/clientpositive/mm_all.q b/ql/src/test/queries/clientpositive/mm_all.q
index 61dd3e7475..a524c29ef5 100644
--- a/ql/src/test/queries/clientpositive/mm_all.q
+++ b/ql/src/test/queries/clientpositive/mm_all.q
@@ -3,6 +3,7 @@
 
 -- MASK_LINEAGE
 
+set hive.metastore.dml.events=true;
 set hive.mapred.mode=nonstrict;
 set hive.explain.user=false;
 set hive.fetch.task.conversion=none;
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
index aa014e9317..60ad7db60e 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
@@ -18,7 +18,9 @@
 
 package org.apache.hadoop.hive.metastore.events;
 
-import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.hive.metastore.IHMSHandler;
@@ -33,8 +35,7 @@
 import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
 import org.apache.thrift.TException;
 
-import java.util.ArrayList;
-import java.util.List;
+import com.google.common.collect.Lists;
 
 @InterfaceAudience.Public
 @InterfaceStability.Stable
@@ -65,7 +66,7 @@ public InsertEvent(String catName, String db, String table, List<String> partVal
       // TODO MS-SPLIT Switch this back once HiveMetaStoreClient is moved.
       //req.setCapabilities(HiveMetaStoreClient.TEST_VERSION);
       req.setCapabilities(new ClientCapabilities(
-          Lists.newArrayList(ClientCapability.TEST_CAPABILITY)));
+          Lists.newArrayList(ClientCapability.TEST_CAPABILITY, ClientCapability.INSERT_ONLY_TABLES)));
       try {
         this.tableObj = handler.get_table_req(req).getTable();
         if (partVals != null) {
testdata/datasets/functional/functional_schema_template.sql:
@@ -2620,7 +2620,10 @@ materialized_view
 -- The create materialized view command is moved down so that the database's
 -- managed directory has been created. Otherwise the command would fail. This
 -- is a bug in Hive.
-CREATE MATERIALIZED VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+-- Always drop the view first since IF NOT EXISTS is ignored in CREATE VIEW
+-- in Apache Hive3 (HIVE-20462, HIVE-21675).
+DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
+CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name}
 AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table;
 ====
 ---- DATASET
@@ -3840,6 +3843,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
 )
 STORED BY ICEBERG STORED AS AVRO
 LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_avro_format';
+---- DEPENDENT_LOAD_HIVE
 INSERT INTO TABLE {db_name}{db_suffix}.{table_name} values(1, 'A', 0.5, true),(2, 'B', 1.5, true),(3, 'C', 2.5, false);
 ====
 ---- DATASET
@@ -3887,7 +3891,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_view
 ---- CREATE
-CREATE VIEW {db_name}{db_suffix}.{table_name} AS
+CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS
 SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata;
 ====
 ---- DATASET
@@ -3995,7 +3999,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_lineitem_sixblocks
 ---- CREATE
-CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
 LIKE PARQUET '/test-warehouse/lineitem_sixblocks_iceberg/lineitem_sixblocks.parquet'
 STORED AS PARQUET
 LOCATION '/test-warehouse/lineitem_sixblocks_iceberg/';