IMPALA-13284: Loading test data on Apache Hive3

Loading test data on Apache Hive 3.1.3 fails in several ways:
 - STORED AS JSONFILE is not supported.
 - STORED BY ICEBERG is not supported, and likewise STORED BY ICEBERG
   STORED AS AVRO.
 - The iceberg-hive-runtime jar is missing from the CLASSPATH of HMS
   and Tez jobs.
 - Creating a table in Impala is not translated to an EXTERNAL table
   in HMS.
 - Hive INSERT on insert-only tables fails to generate InsertEvents
   (HIVE-20067).

This patch fixes the syntax issues by converting the statements to the
older syntax that Apache Hive 3.1.3 accepts:
 - Convert STORED AS JSONFILE to ROW FORMAT SERDE
   'org.apache.hadoop.hive.serde2.JsonSerDe'
 - Convert STORED BY ICEBERG to STORED BY
   'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
 - Convert STORED BY ICEBERG STORED AS AVRO to the above, plus
   TBLPROPERTIES('write.format.default'='avro')
Most of the conversions are done in generate-schema-statements.py. One
exception is testdata/bin/load-dependent-tables.sql, for which we
generate a converted copy of the file when it is used.
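
As an illustration (a sketch with a hypothetical table name t, not a
table from the test data), the conversions rewrite DDL as follows:

  -- Rejected by Apache Hive 3.1.3:
  CREATE TABLE t (id INT, s STRING) STORED AS JSONFILE;
  -- Converted form:
  CREATE TABLE t (id INT, s STRING)
  ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe';

  -- Rejected by Apache Hive 3.1.3:
  CREATE EXTERNAL TABLE t (id INT, s STRING)
  STORED BY ICEBERG STORED AS AVRO;
  -- Converted form:
  CREATE EXTERNAL TABLE t (id INT, s STRING)
  STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
  TBLPROPERTIES('write.format.default'='avro');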

The missing iceberg-hive-runtime jar is added to HIVE_AUX_JARS_PATH in
bin/impala-config.sh. Note that this is only needed by Apache Hive3,
since CDP Hive3 ships the hive-iceberg-handler jar in its lib folder.

To fix the failure in firing InsertEvents, we add a patch for
HIVE-20067 and modify testdata/bin/patch_hive.sh to also recompile the
standalone-metastore submodule.

Also modified some statements in
testdata/datasets/functional/functional_schema_template.sql to make
them reliable when data loading is retried, as sketched below.
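
For example, the materialized view is now dropped before it is
recreated (a minimal sketch of the pattern; the full statement is in
the diff below), since Apache Hive3 ignores IF NOT EXISTS for
materialized views (HIVE-20462, HIVE-21675):

  -- Drop first so a retried load can recreate the view cleanly.
  DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
  CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name}
  AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table;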

Tests
 - Verified that the test data can be loaded in ubuntu-20.04-from-scratch

Change-Id: I8f52c91602da8822b0f46f19dc4111c7187ce400
Reviewed-on: http://gerrit.cloudera.org:8080/21657
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>

bin/impala-config.sh:
@@ -1025,6 +1025,13 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources"
 export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar"
 export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER"
+# Add the jar of iceberg-hive-runtime to have HiveIcebergStorageHandler.
+# Only needed by Apache Hive3 since CDP Hive3 has the jar of hive-iceberg-handler in its
+# lib folder.
+if $USE_APACHE_HIVE; then
+  export HIVE_AUX_JARS_PATH="$HIVE_AUX_JARS_PATH:\
+$IMPALA_HOME/fe/target/dependency/iceberg-hive-runtime-${IMPALA_ICEBERG_VERSION}.jar"
+fi
 export AUX_CLASSPATH=""
 ### Tell hive not to use jline
 export HADOOP_USER_CLASSPATH_FIRST=true

testdata/bin/create-load-data.sh:
@@ -382,8 +382,15 @@ function copy-and-load-dependent-tables {
   # For tables that rely on loading data from local fs test-warehouse
   # TODO: Find a good way to integrate this with the normal data loading scripts
-  beeline -n $USER -u "${JDBC_URL}" -f\
-    ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
+  if $USE_APACHE_HIVE; then
+    # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899)
+    NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql
+    sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'/"\
+      $SQL_FILE > $NEW_SQL_FILE
+    SQL_FILE=$NEW_SQL_FILE
+  fi
+  beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE
 }
 function create-internal-hbase-table {

testdata/bin/generate-schema-statements.py:
@@ -232,7 +232,7 @@ HINT_SHUFFLE = "/* +shuffle, clustered */"
 def build_create_statement(table_template, table_name, db_name, db_suffix,
                            file_format, compression, hdfs_location,
-                           force_reload):
+                           force_reload, is_hive_stmt):
   create_stmt = ''
   if (force_reload):
     tbl_type = 'TABLE'
@@ -247,12 +247,34 @@ def build_create_statement(table_template, table_name, db_name, db_suffix,
   # Remove location part from the format string
   table_template = table_template.replace("LOCATION '{hdfs_location}'", "")
-  create_stmt += table_template.format(
+  stmt = table_template.format(
     db_name=db_name,
     db_suffix=db_suffix,
     table_name=table_name,
     file_format=FILE_FORMAT_TO_STORED_AS_MAP[file_format],
     hdfs_location=hdfs_location)
+  # Apache Hive 3.1 doesn't support "STORED BY ICEBERG STORED AS AVRO" and
+  # "STORED AS JSONFILE" (HIVE-25162, HIVE-19899)
+  if is_hive_stmt and os.environ['USE_APACHE_HIVE'] == "true":
+    if "STORED AS JSONFILE" in stmt:
+      stmt = stmt.replace("STORED AS JSONFILE",
+          "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'")
+    elif "STORED BY ICEBERG" in stmt:
+      if "STORED AS" not in stmt:
+        stmt = stmt.replace(
+            "STORED BY ICEBERG",
+            "STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'")
+      else:
+        assert "TBLPROPERTIES" not in stmt,\
+            ("Cannot convert STORED BY ICEBERG STORED AS file_format with TBLPROPERTIES "
+             "also in the statement:\n" + stmt)
+        iceberg_file_format = re.search(r"STORED AS (\w+)", stmt).group(1)
+        stmt = re.sub(r"STORED BY ICEBERG\s+STORED AS \w+",
+            ("STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'"
+             " TBLPROPERTIES('write.format.default'='{}')").format(iceberg_file_format),
+            stmt)
+  create_stmt += stmt
   return create_stmt
@@ -835,7 +857,8 @@ def generate_statements(output_name, test_vectors, sections,
   if table_template:
     output.create.append(build_create_statement(table_template, table_name, db_name,
-        db_suffix, create_file_format, create_codec, data_path, force_reload))
+        db_suffix, create_file_format, create_codec, data_path, force_reload,
+        create_hive))
   # HBASE create table
   if file_format == 'hbase':
     # If the HBASE_COLUMN_FAMILIES section does not exist, default to 'd'

testdata/bin/load-dependent-tables.sql:
@@ -94,7 +94,6 @@ LOCATION '/test-warehouse/chars_formats_text';
 DROP TABLE IF EXISTS functional_json.chars_formats;
 CREATE EXTERNAL TABLE functional_json.chars_formats
 (cs CHAR(5), cl CHAR(140), vc VARCHAR(32))
-ROW FORMAT delimited fields terminated by ',' escaped by '\\'
 STORED AS JSONFILE
 LOCATION '/test-warehouse/chars_formats_json';

testdata/bin/patch_hive.sh:
@@ -75,7 +75,8 @@ done
 # 3. Repackage the hive submodules affected by the patch
 if [[ "${HIVE_REBUILD}" = "true" ]]; then
   echo "Repackage the hive-exec module"
-  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql clean package -Dmaven.test.skip
+  ${IMPALA_HOME}/bin/mvn-quiet.sh -pl ql,standalone-metastore clean package \
+    -Dmaven.test.skip
   cp $HIVE_SRC_DIR/ql/target/hive-exec-${APACHE_HIVE_VERSION}.jar $HIVE_HOME/lib/
 fi
 popd

README listing the Hive patches:
@@ -10,3 +10,6 @@ Bump guava version to 28.1-jre. Fix HIVE-22717.
 patch2-HIVE-20038.diff:
 Update queries on non-bucketed + partitioned tables throws NPE
+patch3-HIVE-20067.diff:
+Fix failures in firing InsertEvent on insert-only tables

patch3-HIVE-20067.diff (new file):
@@ -0,0 +1,46 @@
diff --git a/ql/src/test/queries/clientpositive/mm_all.q b/ql/src/test/queries/clientpositive/mm_all.q
index 61dd3e7475..a524c29ef5 100644
--- a/ql/src/test/queries/clientpositive/mm_all.q
+++ b/ql/src/test/queries/clientpositive/mm_all.q
@@ -3,6 +3,7 @@
-- MASK_LINEAGE
+set hive.metastore.dml.events=true;
set hive.mapred.mode=nonstrict;
set hive.explain.user=false;
set hive.fetch.task.conversion=none;
diff --git a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
index aa014e9317..60ad7db60e 100644
--- a/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
+++ b/standalone-metastore/src/main/java/org/apache/hadoop/hive/metastore/events/InsertEvent.java
@@ -18,7 +18,9 @@
package org.apache.hadoop.hive.metastore.events;
-import com.google.common.collect.Lists;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hive.metastore.IHMSHandler;
@@ -33,8 +35,7 @@
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.thrift.TException;
-import java.util.ArrayList;
-import java.util.List;
+import com.google.common.collect.Lists;
@InterfaceAudience.Public
@InterfaceStability.Stable
@@ -65,7 +66,7 @@ public InsertEvent(String catName, String db, String table, List<String> partVal
// TODO MS-SPLIT Switch this back once HiveMetaStoreClient is moved.
//req.setCapabilities(HiveMetaStoreClient.TEST_VERSION);
req.setCapabilities(new ClientCapabilities(
- Lists.newArrayList(ClientCapability.TEST_CAPABILITY)));
+ Lists.newArrayList(ClientCapability.TEST_CAPABILITY, ClientCapability.INSERT_ONLY_TABLES)));
try {
this.tableObj = handler.get_table_req(req).getTable();
if (partVals != null) {

testdata/datasets/functional/functional_schema_template.sql:
@@ -2620,7 +2620,10 @@ materialized_view
 -- The create materialized view command is moved down so that the database's
 -- managed directory has been created. Otherwise the command would fail. This
 -- is a bug in Hive.
-CREATE MATERIALIZED VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+-- Always drop the view first since IF NOT EXISTS is ignored in CREATE VIEW
+-- in Apache Hive3 (HIVE-20462, HIVE-21675).
+DROP MATERIALIZED VIEW IF EXISTS {db_name}{db_suffix}.{table_name};
+CREATE MATERIALIZED VIEW {db_name}{db_suffix}.{table_name}
 AS SELECT * FROM {db_name}{db_suffix}.insert_only_transactional_table;
 ====
 ---- DATASET
@@ -3840,6 +3843,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} (
 )
 STORED BY ICEBERG STORED AS AVRO
 LOCATION '/test-warehouse/iceberg_test/hadoop_catalog/ice/iceberg_avro_format';
+---- DEPENDENT_LOAD_HIVE
 INSERT INTO TABLE {db_name}{db_suffix}.{table_name} values(1, 'A', 0.5, true),(2, 'B', 1.5, true),(3, 'C', 2.5, false);
 ====
 ---- DATASET
@@ -3887,7 +3891,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_view
 ---- CREATE
-CREATE VIEW {db_name}{db_suffix}.{table_name} AS
+CREATE VIEW IF NOT EXISTS {db_name}{db_suffix}.{table_name} AS
 SELECT * FROM {db_name}{db_suffix}.iceberg_query_metadata;
 ====
 ---- DATASET
@@ -3995,7 +3999,7 @@ functional
 ---- BASE_TABLE_NAME
 iceberg_lineitem_sixblocks
 ---- CREATE
-CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+CREATE EXTERNAL TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
 LIKE PARQUET '/test-warehouse/lineitem_sixblocks_iceberg/lineitem_sixblocks.parquet'
 STORED AS PARQUET
 LOCATION '/test-warehouse/lineitem_sixblocks_iceberg/';