IMPALA-13284: Loading test data on Apache Hive3

Loading test data on Apache Hive 3.1.3 fails in several ways:
 - STORED AS JSONFILE is not supported.
 - STORED BY ICEBERG is not supported, and likewise STORED BY ICEBERG
   STORED AS AVRO.
 - The iceberg-hive-runtime jar is missing from the CLASSPATH of HMS
   and Tez jobs.
 - Creating a table in Impala is not translated to an EXTERNAL table
   in HMS.
 - Hive INSERT on insert-only tables fails to generate InsertEvents
   (HIVE-20067).

This patch fixes the syntax issues by converting the statements to the
older syntax that Apache Hive 3.1.3 accepts:
 - Convert STORED AS JSONFILE to ROW FORMAT SERDE
   'org.apache.hadoop.hive.serde2.JsonSerDe'
 - Convert STORED BY ICEBERG to STORED BY
   'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
 - Convert STORED BY ICEBERG STORED AS AVRO to the above, plus
   TBLPROPERTIES('write.format.default'='avro')
Most of the conversions are done in generate-schema-statements.py. One
exception is testdata/bin/load-dependent-tables.sql, for which we
generate a converted copy of the file when it is used.
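
As an illustration (a sketch with a hypothetical table name t, not a
table from the test data), the conversions rewrite DDL as follows:

  -- Rejected by Apache Hive 3.1.3:
  CREATE TABLE t (id INT, s STRING) STORED AS JSONFILE;
  -- Converted form:
  CREATE TABLE t (id INT, s STRING)
  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe';

  -- Rejected by Apache Hive 3.1.3:
  CREATE EXTERNAL TABLE t (id INT, s STRING)
  STORED BY ICEBERG STORED AS AVRO;
  -- Converted form:
  CREATE EXTERNAL TABLE t (id INT, s STRING)
  STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
  TBLPROPERTIES('write.format.default'='avro');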

The missing iceberg-hive-runtime jar is added to HIVE_AUX_JARS_PATH in
bin/impala-config.sh. Note that this is only needed by Apache Hive3,
since CDP Hive3 ships the hive-iceberg-handler jar in its lib folder.

To fix the failure in firing InsertEvents, we add a patch for
HIVE-20067 and modify testdata/bin/patch_hive.sh to also recompile the
standalone-metastore submodule.

Also modified some statements in
testdata/datasets/functional/functional_schema_template.sql to make
them reliable when data loading is retried, as sketched below.
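
For example, the materialized view is now dropped before it is
recreated (a minimal sketch of the pattern; the full statement is in
the diff below), since Apache Hive3 ignores IF NOT EXISTS for
materialized views (HIVE-20462, HIVE-21675):

  -- Drop first so a retried load can recreate the view cleanly.
  DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
  CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name}
  AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table;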

Tests
 - Verified that the test data can be loaded in ubuntu-20.04-from-scratch

Change-Id: I8f52c91602da8822b0f46f19dc4111c7187ce400
Reviewed-on: http://gerrit.cloudera.org:8080/21657
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>

bin/impala-config.sh:
@@ -1025,6 +1025,13 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources"
 export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar"
 export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER"
+# Add the jar of iceberg-hive-runtime to have HiveIcebergStorageHandler.
+# Only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its
+# lib folder.
+if $USE_APACHE_HIVE; then
+  export HIVE_AUX_JARS_PATH="$HIVE_AUX_JARS_PATH:\
+$IMPALA_HOME/fe/target/dependency/iceberg-hive-runtime-${IMPALA_ICEBERG_VERSION}.jar"
+fi
 export AUX_CLASSPATH=""
 ### Tell hive not to use jline
 export HADOOP_USER_CLASSPATH_FIRST=true

testdata/bin/create-load-data.sh:
@@ -382,8 +382,15 @@ function copy-and-load-dependent-tables {
   # For tables that rely on loading data from local fs test-warehouse
   # TODO: Find a good way to integrate this with the normal data loading scripts
-  beeline -n $USER -u "${JDBC_URL}" -f\
-    ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  if $USE_APACHE_HIVE; then
+    # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899)
+    NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql
+    sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'/"\
+      $SQL_FILE > $NEW_SQL_FILE
+    SQL_FILE=$NEW_SQL_FILE
+  fi
+  beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE
 }
 function create-internal-hbase-table {

testdata/bin/generate-schema-statements.py:
@@ -232,7 +232,7 @@ HINT_SHUFFLE = "/* +shuffle, clustered */"
 def build_create_statement(table_template, table_name, db_name, db_suffix,
                            file_format, compression, hdfs_location,
-                           force_reload):
+                           force_reload, is_hive_stmt):
   create_stmt = ''
   if (force_reload):
     tbl_type = 'TABLE'
@@ -247,12 +247,34 @@ def build_create_statement(table_template, table_name, db_name, db_suffix,
   # Remove location part from the format string
   table_template = table_template.replace("LOCATION '{hdfs_location}'", "")
-  create_stmt += table_template.format(
+  stmt = table_template.format(
     db_name=db_name,
     db_suffix=db_suffix,
     table_name=table_name,
     file_format=FILE_FORMAT_TO_STORED_AS_MAP[file_format],
     hdfs_location=hdfs_location)
+  # Apache Hive 3.1 doesn't support "STORED BY ICEBERG STORED AS AVRO" and
+  # "STORED AS JSONFILE" (HIVE-25162, HIVE-19899)
+  if is_hive_stmt and os.environ['USE_APACHE_HIVE'] == "true":
+    if "STORED AS JSONFILE" in stmt:
+      stmt = stmt.replace("STORED AS JSONFILE",
+          "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'")
+    elif "STORED BY ICEBERG" in stmt:
+      if "STORED AS" not in stmt:
+        stmt = stmt.replace(
+            "STORED BY ICEBERG",
+            "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'")
+      else:
+        assert "TBLPROPERTIES" not in stmt,\
+            ("Cannot convert STORED BY ICEBERG STORED AS file_format with TBLPROPERTIES "
+             "also in the statement:\n" + stmt)
+        iceberg_file_format = re.search(r"STORED AS (\w+)", stmt).group(1)
+        stmt = re.sub(r"STORED BY ICEBERG\s+STORED AS \w+",
+            ("STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'"
+             " TBLPROPERTIES('write.format.default'='{}')").format(iceberg_file_format),
+            stmt)
+  create_stmt += stmt
   return create_stmt
@@ -835,7 +857,8 @@ def generate_statements(output_name, test_vectors, sections,
   if table_template:
     output.create.append(build_create_statement(table_template, table_name, db_name,
-        db_suffix, create_file_format, create_codec, data_path, force_reload))
+        db_suffix, create_file_format, create_codec, data_path, force_reload,
+        create_hive))
   # HBASE create table
   if file_format == 'hbase':
     # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd'

testdata/bin/load-dependent-tables.sql:
@@ -94,7 +94,6 @@ LOCATION '/test-warehouse/chars_formats_text';
 DROP TABLE IF EXISTS functional_json.chars_formats;
 CREATE EXTERNAL TABLE functional_json.chars_formats
 (cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
-ROW FORMAT delimited fields terminated by ',' escaped by '\\'
 STORED AS JSONFILE
 LOCATION '/test-warehouse/chars_formats_json';

testdata/bin/patch_hive.sh:
@@ -75,7 +75,8 @@ done
 # 3. Repackage the hive submodules affected by the patch
 if [[ "${HIVE_REBUILD}" = "true" ]]; then
   echo "Repackage the hive-exec module"
-  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql clean package -Dmaven.test.skip
+  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql,standalone-metastore clean package \
+    -Dmaven.test.skip
   cp $HIVE_SRC_DIR/ql/target/hive-exec-${APACHE_HIVE_VERSION}.jar $HIVE_HOME/lib/
 fi
 popd

README listing the Hive patches:
@@ -10,3 +10,6 @@ Bump guava version to 28.1-jre. Fix HIVE-22717.
 patch2-HIVE-20038.diff:
 Update queries on non-bucketed + partitioned tables throws NPE
+patch3-HIVE-20067.diff:
+Fix failures in firing InsertEvent on insert-only tables

patch3-HIVE-20067.diff (new file):
@@ -0,0 +1,46 @@
diff --git a/ql/src/test/queries/clientpositive/mm_all.q b/ql/src/test/queries/clientpositive/mm_all.q
index 61dd3e7475..a524c29ef5 100644
--- a/ql/src/test/queries/clientpositive/mm_all.q
+++ b/ql/src/test/queries/clientpositive/mm_all.q
@@ -3,6 +3,7 @@
-- MASK_LINEAGE
+set hive.metastore.dml.events=true;
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
set hive.fetch.task.conversion=none;
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
index aa014e9317..60ad7db60e 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
@@ -18,7 +18,9 @@
package org.apache.hadoop.hive.metastore.events;
-import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hive.metastore.IHMSHandler;
@@ -33,8 +35,7 @@
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.thrift.TException;
-import java.util.ArrayList;
-import java.util.List;
+import com.google.common.collect.Lists;
@InterfaceAudience.Public
@InterfaceStability.Stable
@@ -65,7 +66,7 @@ public InsertEvent(String catName, String db, String table, List<String> partVal
// TODO MS-SPLIT Switch this back once HiveMetaStoreClient is moved.
//req.setCapabilities(HiveMetaStoreClient.TEST_VERSION);
req.setCapabilities(new ClientCapabilities(
- Lists.newArrayList(ClientCapability.TEST_CAPABILITY)));
+ Lists.newArrayList(ClientCapability.TEST_CAPABILITY, ClientCapability.INSERT_ONLY_TABLES)));
try {
this.tableObj = handler.get_table_req(req).getTable();
if (partVals != null) {

testdata/datasets/functional/functional_schema_template.sql:
@@ -2620,7 +2620,10 @@ materialized_view
 -- The create materialized view command is moved down so that the database's
 -- managed directory has been created. Otherwise the command would fail. This
 -- is a bug in Hive.
-CREATE MATERIALIZED VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+-- Always drop the view first since IF NOT EXISTS is ignored in CREATE VIEW
+-- in Apache Hive3 (HIVE-20462, HIVE-21675).
+DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
+CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name}
 AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table;
 ====
 ---- DATASET
@@ -3840,6 +3843,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
 )
 STORED BY ICEBERG STORED AS AVRO
 LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_avro_format';
+---- DEPENDENT_LOAD_HIVE
 INSERT INTO TABLE {db_name}{db_suffix}.{table_name} values(1, 'A', 0.5, true),(2, 'B', 1.5, true),(3, 'C', 2.5, false);
 ====
 ---- DATASET
@@ -3887,7 +3891,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_view
 ---- CREATE
-CREATE VIEW {db_name}{db_suffix}.{table_name} AS
+CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS
 SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata;
 ====
 ---- DATASET
@@ -3995,7 +3999,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_lineitem_sixblocks
 ---- CREATE
-CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
 LIKE PARQUET '/test-warehouse/lineitem_sixblocks_iceberg/lineitem_sixblocks.parquet'
 STORED AS PARQUET
 LOCATION '/test-warehouse/lineitem_sixblocks_iceberg/';