#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
# generated as needed, generally by issuing 'INSERT INTO <table> SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
# case the snapshot file contents will be copied into HDFS prior to calling the data load
# scripts. This speeds up overall data loading time because it usually means only the
# table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py
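#
# Example invocations (illustrative only; the paths are placeholders and the full list of
# supported flags is defined by the option parsing further below):
#   ./create-load-data.sh
#   ./create-load-data.sh -exploration_strategy core -skip_ranger
#   ./create-load-data.sh -snapshot_file /path/to/test-warehouse-SNAPSHOT.tar.gz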

set -euo pipefail
. $IMPALA_HOME/bin/report_build_error.sh
setup_report_build_error

. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
. ${IMPALA_HOME}/testdata/bin/run-step.sh

# Environment variables used to direct the data loading process to an external cluster.
# TODO: We need a better way of managing how these get set. See IMPALA-4346
: ${HS2_HOST_PORT=localhost:11050}
: ${HDFS_NN=${INTERNAL_LISTEN_HOST}:20500}
: ${IMPALAD=localhost}
: ${REMOTE_LOAD=}
: ${CM_HOST=}
: ${IMPALA_SERIAL_DATALOAD=}
# We don't expect dataload to take more than 2.5 hours.
: ${TIMEOUT_FOR_CREATE_LOAD_DATA_MINS:=150}
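# Example (illustrative; host names and ports below are placeholders): the defaults above
# can be overridden in the environment to point the load at a remote cluster, e.g.:
#   HS2_HOST_PORT=remote-hs2:11050 HDFS_NN=remote-nn:8020 IMPALAD=remote-impalad \
#     REMOTE_LOAD=1 ./create-load-data.sh -cm_host remote-cm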

SKIP_METADATA_LOAD=0
SKIP_SNAPSHOT_LOAD=0
SKIP_RANGER=0
SNAPSHOT_FILE=""
LOAD_DATA_ARGS=""
EXPLORATION_STRATEGY="exhaustive"
export JDBC_URL="jdbc:hive2://${HS2_HOST_PORT}/default;"
HIVE_CMD="beeline -n $USER -u $JDBC_URL"

# For logging when using run-step.
LOG_DIR=${IMPALA_DATA_LOADING_LOGS_DIR}

echo "Executing: create-load-data.sh $@"

while [ -n "$*" ]
do
  case $1 in
    -exploration_strategy)
      EXPLORATION_STRATEGY=${2-}
      if [[ -z "$EXPLORATION_STRATEGY" ]]; then
echo "Must provide an exploration strategy from e.g. core, exhaustive"
        exit 1;
      fi
      shift;
      ;;
    -skip_metadata_load)
      SKIP_METADATA_LOAD=1
      ;;
    -skip_snapshot_load)
      SKIP_SNAPSHOT_LOAD=1
      ;;
    -snapshot_file)
      SNAPSHOT_FILE=${2-}
      if [ ! -f $SNAPSHOT_FILE ]; then
        echo "-snapshot_file does not exist: $SNAPSHOT_FILE"
        exit 1;
      fi
      shift;
      ;;
    -cm_host)
      CM_HOST=${2-}
      shift;
      ;;
    -skip_ranger)
      SKIP_RANGER=1
      ;;
    -timeout)
      TIMEOUT_FOR_CREATE_LOAD_DATA_MINS=${2-}
      shift;
      ;;
    -help|-h|*)
      echo "create-load-data.sh : Creates data and loads from scratch"
      echo "[-skip_metadata_load] : Skips loading of metadata"
      echo "[-skip_snapshot_load] : Assumes that the snapshot is already loaded"
      echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs"
      echo "[-cm_host] : Address of the Cloudera Manager host if loading to a remote cluster"
      echo "[-skip_ranger] : Skip the set-up for Ranger."
      echo "[-timeout] : The timeout in minutes for loading data."
      exit 1;
      ;;
  esac
  shift;
done

if [[ -n $REMOTE_LOAD ]]; then
  SKIP_RANGER=1
fi

"${IMPALA_HOME}/bin/script-timeout-check.sh" -timeout $TIMEOUT_FOR_CREATE_LOAD_DATA_MINS \
  -script_name "$(basename $0)" &
TIMEOUT_PID=$!
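# script-timeout-check.sh keeps running in the background as a watchdog for the timeout
# above; its PID is saved so the watchdog can be cleaned up at the end of this script.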

SCHEMA_MISMATCH_ERROR="A schema change has been detected in the metadata, "
SCHEMA_MISMATCH_ERROR+="but it cannot be loaded on Isilon, s3, gcs, cos, oss, obs or "
SCHEMA_MISMATCH_ERROR+="local filesystem, and the filesystem is ${TARGET_FILESYSTEM}."

if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then
  run-step "Generating HBase data" create-hbase.log \
      ${IMPALA_HOME}/testdata/bin/create-hbase.sh
  run-step "Creating /test-warehouse HDFS directory" create-test-warehouse-dir.log \
      hadoop fs -mkdir -p /test-warehouse
elif [ $SKIP_SNAPSHOT_LOAD -eq 0 ]; then
  run-step "Loading HDFS data from snapshot: $SNAPSHOT_FILE" \
      load-test-warehouse-snapshot.log \
      ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$SNAPSHOT_FILE"
  # Don't skip the metadata load if a schema change is detected.
  if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh; then
    if [[ "${TARGET_FILESYSTEM}" == "isilon" || "${TARGET_FILESYSTEM}" == "s3" || \
          "${TARGET_FILESYSTEM}" == "local" || "${TARGET_FILESYSTEM}" == "gs" || \
          "${TARGET_FILESYSTEM}" == "cosn" || "${TARGET_FILESYSTEM}" == "oss" || \
          "${TARGET_FILESYSTEM}" == "obs" || "${TARGET_FILESYSTEM}" == "ozone" ]] ; then
      echo "ERROR in $0 at line $LINENO: A schema change has been detected in the"
      echo "metadata, but it cannot be loaded on isilon, s3, gcs, cos, oss, obs, ozone,"
      echo "or local and the target file system is ${TARGET_FILESYSTEM}. Exiting."
      # Generate an explicit JUnitXML symptom report here for easier triaging
      ${IMPALA_HOME}/bin/generate_junitxml.py --phase=dataload \
          --step=check-schema-diff.sh --error "${SCHEMA_MISMATCH_ERROR}"
      exit 1
    fi
    echo "Schema change detected, metadata will be loaded."
    SKIP_METADATA_LOAD=0
  fi
else
  # hdfs data already exists, don't load it.
  echo Skipping loading data to hdfs.
fi

echo "Derived params for create-load-data.sh:"
echo "EXPLORATION_STRATEGY=${EXPLORATION_STRATEGY:-}"
echo "SKIP_METADATA_LOAD=${SKIP_METADATA_LOAD:-}"
echo "SKIP_SNAPSHOT_LOAD=${SKIP_SNAPSHOT_LOAD:-}"
echo "SNAPSHOT_FILE=${SNAPSHOT_FILE:-}"
echo "CM_HOST=${CM_HOST:-}"
echo "REMOTE_LOAD=${REMOTE_LOAD:-}"

function start-impala {
  : ${START_CLUSTER_ARGS=""}
  # Use a fast statestore update so that DDL operations run faster.
  START_CLUSTER_ARGS_INT="--state_store_args=--statestore_update_frequency_ms=50"
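  # Note: the "+=(...)" appends below promote START_CLUSTER_ARGS_INT from a plain string
  # to a bash array whose first element is the string above; the array is expanded
  # unquoted when start-impala-cluster.py is invoked.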
  # Disable strict datafile location checks for Iceberg tables
  DATAFILE_LOCATION_CHECK="-iceberg_allow_datafiles_in_table_location_only=false"
  START_CLUSTER_ARGS_INT+=("--catalogd_args=$DATAFILE_LOCATION_CHECK")
  if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
    START_CLUSTER_ARGS_INT+=("--impalad_args=--abort_on_config_error=false -s 1")
  else
    START_CLUSTER_ARGS_INT+=("-s 3")
  fi
  START_CLUSTER_ARGS_INT+=("${START_CLUSTER_ARGS}")
  ${IMPALA_HOME}/bin/start-impala-cluster.py --log_dir=${IMPALA_DATA_LOADING_LOGS_DIR} \
      ${START_CLUSTER_ARGS_INT[@]}
}

function restart-cluster {
  # Break out each individual step for clarity
  echo "Shutting down Impala"
  ${IMPALA_HOME}/bin/start-impala-cluster.py --kill
  echo "Shutting down the minicluster"
  ${IMPALA_HOME}/testdata/bin/kill-all.sh
  echo "Starting the minicluster"
  ${IMPALA_HOME}/testdata/bin/run-all.sh
  echo "Starting Impala"
  start-impala
}

function load-custom-schemas {
  # HDFS commandline calls are slow, so consolidate the manipulation into
  # as few calls as possible by populating a temporary directory with the
  # appropriate structure and copying it in a single call.
  TMP_DIR=$(mktemp -d)

  # Cleanup old schemas dir
  hadoop fs -rm -r -f /test-warehouse/schemas
  SCHEMA_SRC_DIR=${IMPALA_HOME}/testdata/data/schemas
  SCHEMA_TMP_DIR="${TMP_DIR}/schemas"
  mkdir ${SCHEMA_TMP_DIR}
  mkdir ${SCHEMA_TMP_DIR}/enum
  ln -s ${SCHEMA_SRC_DIR}/zipcode_incomes.parquet ${SCHEMA_TMP_DIR}
  ln -s ${SCHEMA_SRC_DIR}/alltypestiny.parquet ${SCHEMA_TMP_DIR}
  ln -s ${SCHEMA_SRC_DIR}/enum/* ${SCHEMA_TMP_DIR}/enum
  ln -s ${SCHEMA_SRC_DIR}/malformed_decimal_tiny.parquet ${SCHEMA_TMP_DIR}
  ln -s ${SCHEMA_SRC_DIR}/decimal.parquet ${SCHEMA_TMP_DIR}
  ln -s ${SCHEMA_SRC_DIR}/nested/modern_nested.parquet ${SCHEMA_TMP_DIR}
  ln -s ${SCHEMA_SRC_DIR}/nested/legacy_nested.parquet ${SCHEMA_TMP_DIR}

  # CHAR and VARCHAR tables written by Hive
  mkdir -p ${TMP_DIR}/chars_formats_avro_snap \
      ${TMP_DIR}/chars_formats_parquet \
      ${TMP_DIR}/chars_formats_text \
      ${TMP_DIR}/chars_formats_orc_def \
      ${TMP_DIR}/chars_formats_json

  ln -s ${IMPALA_HOME}/testdata/data/chars-formats.avro ${TMP_DIR}/chars_formats_avro_snap
  ln -s ${IMPALA_HOME}/testdata/data/chars-formats.parquet ${TMP_DIR}/chars_formats_parquet
  ln -s ${IMPALA_HOME}/testdata/data/chars-formats.orc ${TMP_DIR}/chars_formats_orc_def
  ln -s ${IMPALA_HOME}/testdata/data/chars-formats.txt ${TMP_DIR}/chars_formats_text
  ln -s ${IMPALA_HOME}/testdata/data/chars-formats.json ${TMP_DIR}/chars_formats_json

  # File used by CreateTableLikeOrc tests
  ln -s ${IMPALA_HOME}/testdata/data/alltypes_non_acid.orc ${SCHEMA_TMP_DIR}

  ln -s ${IMPALA_HOME}/testdata/data/warmup_table_list.txt \
      ${TMP_DIR}/warmup_table_list.txt

  hadoop fs -put -f ${TMP_DIR}/* /test-warehouse

  rm -r ${TMP_DIR}
}

function load-data {
  WORKLOAD=${1}
  EXPLORATION_STRATEGY=${2:-"core"}
  TABLE_FORMATS=${3:-}
  FORCE_LOAD=${4:-}

  LOAD_MSG="Loading workload '$WORKLOAD'"
  ARGS=("--workloads $WORKLOAD")
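  # Each ARGS element intentionally holds a flag together with its value; ARGS is expanded
  # unquoted when load-data.py is invoked so the shell word-splits the elements back into
  # separate command line arguments.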
  LOAD_MSG+=" using exploration strategy '$EXPLORATION_STRATEGY'"
  ARGS+=("-e $EXPLORATION_STRATEGY")
  if [ $TABLE_FORMATS ]; then
    LOAD_MSG+=" in table formats '$TABLE_FORMATS'"
    ARGS+=("--table_formats $TABLE_FORMATS")
  fi
  if [ $LOAD_DATA_ARGS ]; then
    ARGS+=("$LOAD_DATA_ARGS")
  fi
  # functional-query is unique. The dataset name is not the same as the workload name.
  if [ "${WORKLOAD}" = "functional-query" ]; then
    WORKLOAD="functional"
  fi

  # TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
  #
  # Force load the dataset if we detect a schema change.
  if [[ -z "$REMOTE_LOAD" ]]; then
    if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh $WORKLOAD; then
      ARGS+=("--force")
      echo "Force loading $WORKLOAD because a schema change was detected"
    elif [ "${FORCE_LOAD}" = "force" ]; then
      ARGS+=("--force")
      echo "Force loading."
    fi
  fi

  ARGS+=("--impalad ${IMPALAD}")
  ARGS+=("--hive_hs2_hostport ${HS2_HOST_PORT}")
  ARGS+=("--hdfs_namenode ${HDFS_NN}")

  # Disable parallelism for dataload if IMPALA_SERIAL_DATALOAD is set
  if [[ "${IMPALA_SERIAL_DATALOAD}" -eq 1 ]]; then
    ARGS+=("--num_processes 1")
  fi

  if [[ -n ${TABLE_FORMATS} ]]; then
    # TBL_FMT_STR replaces slashes with underscores,
    # e.g., kudu/none/none -> kudu_none_none
    TBL_FMT_STR=${TABLE_FORMATS//[\/]/_}
    LOG_BASENAME=data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}-${TBL_FMT_STR}.log
  else
    LOG_BASENAME=data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}.log
  fi

  LOG_FILE=${IMPALA_DATA_LOADING_LOGS_DIR}/${LOG_BASENAME}
  echo "$LOAD_MSG. Logging to ${LOG_FILE}"
  # Use unbuffered logging by executing with -u
  if ! impala-python3 -u ${IMPALA_HOME}/bin/load-data.py ${ARGS[@]} &> ${LOG_FILE}; then
    echo Error loading data. The end of the log file is:
    tail -n 50 $LOG_FILE
    return 1
  fi
}

function load-tpcds-data {
  load-data "tpcds" "core"
  load-data "tpcds_partitioned" "core"
}

function cache-test-tables {
  echo CACHING tpch.nation AND functional.alltypestiny
  # uncaching the tables first makes this operation idempotent.
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} \
      -q "alter table functional.alltypestiny set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} \
      -q "alter table tpch.nation set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} \
      -q "alter table tpch.nation set cached in 'testPool'"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} -q \
      "alter table functional.alltypestiny set cached in 'testPool'"
}

function load-aux-workloads {
  LOG_FILE=${IMPALA_DATA_LOADING_LOGS_DIR}/data-load-auxiliary-workloads-core.log
  rm -f $LOG_FILE
  # Load all the auxiliary workloads (if any exist)
  if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
    echo Loading auxiliary workloads. Logging to $LOG_FILE.
    if ! impala-python3 -u ${IMPALA_HOME}/bin/load-data.py --workloads all \
        --impalad=${IMPALAD} \
        --hive_hs2_hostport=${HS2_HOST_PORT} \
        --hdfs_namenode=${HDFS_NN} \
        --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
        --dataset_dir=${IMPALA_AUX_DATASET_DIR} \
        --exploration_strategy=core ${LOAD_DATA_ARGS} >> $LOG_FILE 2>&1; then
      echo Error loading aux workloads. The end of the log file is:
      tail -n 20 $LOG_FILE
      return 1
    fi
  else
echo "Skipping load of auxilary workloads because directories do not exist"
  fi
}

function copy-and-load-dependent-tables {
  # COPY
  # TODO: The multi-format table will move these files. So we need to copy them to a
  # temporary location for that table to use. Should find a better way to handle this.
  echo COPYING AND LOADING DATA FOR DEPENDENT TABLES
  hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat \
      /tmp/alltypes_rc /tmp/alltypes_seq /tmp/alltypes_parquet
  hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009 \
      /tmp/alltypes_rc/year=2009 /tmp/alltypes_parquet/year=2009

  # The file written by hive to /test-warehouse will be strangely replicated rather than
  # erasure coded if EC is not set in /tmp
  if [[ -n "${HDFS_ERASURECODE_POLICY:-}" ]]; then
    hdfs ec -setPolicy -policy "${HDFS_ERASURECODE_POLICY}" -path "/tmp/alltypes_rc"
    hdfs ec -setPolicy -policy "${HDFS_ERASURECODE_POLICY}" -path "/tmp/alltypes_seq"
    hdfs ec -setPolicy -policy "${HDFS_ERASURECODE_POLICY}" -path "/tmp/alltypes_parquet"
  fi

  hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
  hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009
  hadoop fs -cp /test-warehouse/alltypes_parquet/year=2009/month=4/ /tmp/alltypes_parquet/year=2009

  # Create a hidden file in AllTypesSmall
  hadoop fs -cp -f /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
      /test-warehouse/alltypessmall/year=2009/month=1/_hidden
  hadoop fs -cp -f /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
      /test-warehouse/alltypessmall/year=2009/month=1/.hidden

  # In case the data is updated by a non-super user, make sure the user can write
  # by chmoding 777 /tmp/alltypes_rc and /tmp/alltypes_seq. This is needed in order
  # to prevent this error during data load to a remote cluster:
  #
  # ERROR : Failed with exception Unable to move source hdfs://cluster-1.foo.cloudera.com:
  # 8020/tmp/alltypes_seq/year=2009/month=2/000023_0 to destination hdfs://cluster-1.foo.
  # cloudera.com:8020/test-warehouse/alltypesmixedformat/year=2009/month=2/000023_0
  # [...]
  # Caused by: org.apache.hadoop.security.AccessControlException:
  # Permission denied: user=impala, access=WRITE
  # inode="/tmp/alltypes_seq/year=2009/month=2":hdfs:supergroup:drwxr-xr-x
  #
  # The error occurs while loading dependent tables.
  #
  # See: logs/data_loading/copy-and-load-dependent-tables.log
  # See also: IMPALA-4345
  hadoop fs -chmod -R 777 /tmp/alltypes_rc /tmp/alltypes_seq /tmp/alltypes_parquet

  # For tables that rely on loading data from the local fs test-warehouse
  # TODO: Find a good way to integrate this with the normal data loading scripts
  SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
  if $USE_APACHE_HIVE_3; then
    # Apache Hive 3.1 doesn't support "STORED AS JSONFILE" (HIVE-19899)
    NEW_SQL_FILE=${IMPALA_HOME}/testdata/bin/load-dependent-tables-hive3.sql
sed "s/STORED AS JSONFILE/ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.JsonSerDe'"\
        $SQL_FILE > $NEW_SQL_FILE
    SQL_FILE=$NEW_SQL_FILE
  fi
  beeline -n $USER -u "${JDBC_URL}" -f $SQL_FILE
}

function create-internal-hbase-table {
  # TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
  # not exist. To work around this, disable exit on error before executing this command.
  # Need to investigate this more, but this works around the problem to unblock automation.
  set +o errexit
  beeline -n $USER -u "${JDBC_URL}" -e \
      "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table;"
  echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
  echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
  set -e
  # Used by CatalogTest to confirm that non-external HBase tables are identified
  # correctly (IMP-581)
  # Note that the usual 'hbase.table.name' property is not specified to avoid
  # creating tables in HBase as a side-effect.
  cat > /tmp/create-hbase-internal.sql << EOF
CREATE TABLE functional_hbase.internal_hbase_table(key int, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val");
EOF
  beeline -n $USER -u "${JDBC_URL}" -f /tmp/create-hbase-internal.sql
  rm -f /tmp/create-hbase-internal.sql
}

function load-custom-data {
  # Add a sequence file that only contains a header (see IMPALA-362)
  hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
      /test-warehouse/tinytable_seq_snap

  # IMPALA-1619: payload compressed with snappy used for constructing large snappy block
  # compressed file
  hadoop fs -put -f ${IMPALA_HOME}/testdata/compressed_formats/compressed_payload.snap \
      /test-warehouse/compressed_payload.snap

  # Create Avro tables
  beeline -n $USER -u "${JDBC_URL}" -f \
      ${IMPALA_HOME}/testdata/avro_schema_resolution/create_table.sql

  # Delete potentially existing avro data
  hadoop fs -rm -f /test-warehouse/avro_schema_resolution_test/*.avro

  # Upload Avro data to the 'schema_resolution_test' table
  hadoop fs -put ${IMPALA_HOME}/testdata/avro_schema_resolution/records*.avro \
      /test-warehouse/avro_schema_resolution_test
}

function build-and-copy-hive-udfs {
  # Build the test Hive UDFs
  pushd ${IMPALA_HOME}/java/test-hive-udfs
  ${IMPALA_HOME}/bin/mvn-quiet.sh clean
  ${IMPALA_HOME}/bin/mvn-quiet.sh package
  popd
  # Copy the test UDF/UDA libraries into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh -build
}

# Additional data loading actions that must be executed after the main data is loaded.
function custom-post-load-steps {
  # TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
  if [[ -z "$REMOTE_LOAD" ]]; then
    # Configure alltypes_seq as a read-only table. This is required for fe tests.
    # Set both read and execute permissions because accessing the contents of a directory on
    # the local filesystem requires the x permission (while on HDFS it requires the r
    # permission).
    hadoop fs -chmod -R 555 \
        ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=1 \
        ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=3
  fi

  hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_variable_num_rows_parquet

  # IMPALA-1881: data file produced by hive with multiple blocks.
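  # (The 1048576 passed via -Ddfs.block.size below is a 1 MiB HDFS block size, which
  # forces these data files to span multiple blocks.)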
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/000000_0 \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet

  # IMPALA-2466: Add more tests to the HDFS Parquet scanner (Added after IMPALA-1881)
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_sixblocks.parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet

  # IMPALA-2466: Add more tests to the HDFS Parquet scanner (this has only one row group)
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_one_row_group.parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet

  # IMPALA-11350: Add tests for row groups with variable num rows.
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_multiblock_variable_num_rows.parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_variable_num_rows_parquet

  # IMPALA-3307: Upload test time-zone database
  hadoop fs -Ddfs.block.size=1048576 -put -f ${IMPALA_HOME}/testdata/tzdb \
      ${FILESYSTEM_PREFIX}/test-warehouse/
}

function copy-and-load-ext-data-source {
  # Copy the test data source library into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-ext-data-sources.sh
  # Load the underlying data of the data source
  ${IMPALA_HOME}/testdata/bin/load-ext-data-sources.sh
  # Fill in the create-ext-data-source-table.sql.template to replace
  # $WAREHOUSE_LOCATION_PREFIX
  cat ${IMPALA_HOME}/testdata/bin/create-ext-data-source-table.sql.template \
      | sed "s#\$WAREHOUSE_LOCATION_PREFIX#${WAREHOUSE_LOCATION_PREFIX}#" \
      > ${IMPALA_DATA_LOADING_LOGS_DIR}/create-ext-data-source-table.sql
  # Create data sources table.
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} -f \
      ${IMPALA_DATA_LOADING_LOGS_DIR}/create-ext-data-source-table.sql
  # Create external JDBC tables for TPCH/TPCDS queries.
  ${IMPALA_HOME}/testdata/bin/create-tpc-jdbc-tables.py --jdbc_db_name=tpch_jdbc \
      --workload=tpch --database_type=impala --clean
  ${IMPALA_HOME}/testdata/bin/create-tpc-jdbc-tables.py --jdbc_db_name=tpcds_jdbc \
      --workload=tpcds --database_type=impala --clean
}

function check-hdfs-health {
  if [[ -n "${HDFS_ERASURECODE_POLICY:-}" ]]; then
    if ! grep "Replicated Blocks:[[:space:]]*#[[:space:]]*Total size:[[:space:]]*0 B" \
        <<< $(hdfs fsck /test-warehouse | tr '\n' '#'); then
      echo "There are some replicated files even though erasure coding is on"
      echo "Failing the data loading job"
      exit 1
    fi
    return
  fi
  MAX_FSCK=30
  SLEEP_SEC=120
  LAST_NUMBER_UNDER_REPLICATED=-1
  for ((FSCK_COUNT = 0; FSCK_COUNT <= MAX_FSCK; FSCK_COUNT++)); do
    FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
    echo "$FSCK_OUTPUT"
    NUMBER_UNDER_REPLICATED=$(
        grep -oP "Under-replicated blocks:[[:space:]]*\K[[:digit:]]*" <<< "$FSCK_OUTPUT")
    if [[ "$NUMBER_UNDER_REPLICATED" -eq 0 ]] ; then
      # All the blocks are fully-replicated. The data loading can continue.
      return
    fi
    if [[ $(($FSCK_COUNT + 1)) -eq "$MAX_FSCK" ]] ; then
      echo "Some HDFS blocks are still under-replicated after running HDFS fsck" \
          "$MAX_FSCK times."
      echo "Some tests cannot pass without fully-replicated blocks (IMPALA-3887)."
      echo "Failing the data loading."
      exit 1
    fi
    if [[ "$NUMBER_UNDER_REPLICATED" -eq "$LAST_NUMBER_UNDER_REPLICATED" ]] ; then
      echo "There are under-replicated blocks in HDFS and HDFS has not made progress" \
          "in the last $SLEEP_SEC seconds. Attempting to restart HDFS to resolve this issue."
      # IMPALA-7119: Other minicluster components (like HBase) can fail if HDFS is
      # restarted by itself, so restart the whole cluster, including Impala.
      restart-cluster
    fi
    LAST_NUMBER_UNDER_REPLICATED="$NUMBER_UNDER_REPLICATED"
    echo "$NUMBER_UNDER_REPLICATED under-replicated blocks remaining."
    echo "Sleeping for $SLEEP_SEC seconds before rechecking."
    sleep "$SLEEP_SEC"
  done
}

function warm-up-hive {
  echo "Running warm up Hive statements"
  $HIVE_CMD -e "create database if not exists functional;"
  $HIVE_CMD -e "create table if not exists hive_warm_up_tbl (i int);"
  # The insert below starts a Tez session (if Hive uses Tez) and initializes
  # .hiveJars directory in HDFS, see IMPALA-8841.
  $HIVE_CMD -e "insert overwrite table hive_warm_up_tbl values (1);"
}

# IMPALA-13015, IMPALA-13026: This should be called during serial phase of data load.
function create-hadoop-credential {
  rm -f ${IMPALA_HOME}/testdata/jceks/test.jceks
  hadoop credential create "openai-api-key-secret" -value "secret" -provider \
      "localjceks://file/${IMPALA_HOME}/testdata/jceks/test.jceks"
}

run-step "Creating hadoop credential" create-hadoop-credential.log \
    create-hadoop-credential

# For kerberized clusters, use kerberos
if ${CLUSTER_DIR}/admin is_kerberized; then
  LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
fi

# Start Impala
if [[ -z "$REMOTE_LOAD" ]]; then
  run-step "Starting Impala cluster" start-impala-cluster.log start-impala
fi

# The hdfs environment script sets up kms (encryption) and cache pools (hdfs caching).
# On a non-hdfs filesystem, we don't test encryption or hdfs caching, so this setup is not
# needed.
if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
  run-step "Setting up HDFS environment" setup-hdfs-env.log \
      ${IMPALA_HOME}/testdata/bin/setup-hdfs-env.sh
fi

if [ $SKIP_METADATA_LOAD -eq 0 ]; then
  # Using Hive in non-parallel mode before starting parallel execution may help with some
  # flakiness during data load, see IMPALA-8841. The problem only occurs in a Hive 3
  # environment, but always doing the warm up shouldn't hurt much and may make it easier
  # to investigate future issues where Hive doesn't work at all.
  warm-up-hive
  run-step "Loading custom schemas" load-custom-schemas.log load-custom-schemas
  # Run some steps in parallel, with run-step-backgroundable / run-step-wait-all.
  # This is effective on steps that take a long time and don't depend on each
  # other. Functional-query takes about ~35 minutes, and TPC-H and TPC-DS can
  # finish while functional-query is running.
  run-step-backgroundable "Loading functional-query data" load-functional-query.log \
      load-data "functional-query" "exhaustive"
  run-step-backgroundable "Loading TPC-H data" load-tpch.log load-data "tpch" "core"
  run-step-backgroundable "Loading TPC-DS data" load-tpcds.log load-tpcds-data
  run-step-wait-all
  # Load tpch nested data.
  # TODO: Hacky and introduces more complexity into the system, but it is expedient.
  if [[ -n "$CM_HOST" ]]; then
    LOAD_NESTED_ARGS="--cm-host $CM_HOST"
  fi
  run-step "Loading nested parquet data" load-nested.log \
      ${IMPALA_HOME}/testdata/bin/load_nested.py \
      -t tpch_nested_parquet -f parquet/none ${LOAD_NESTED_ARGS:-}
  run-step "Loading nested orc data" load-nested.log \
      ${IMPALA_HOME}/testdata/bin/load_nested.py \
      -t tpch_nested_orc_def -f orc/def ${LOAD_NESTED_ARGS:-}
  run-step "Loading auxiliary workloads" load-aux-workloads.log load-aux-workloads
  run-step "Loading dependent tables" copy-and-load-dependent-tables.log \
      copy-and-load-dependent-tables
  run-step "Loading custom data" load-custom-data.log load-custom-data
  run-step "Creating many block table" create-table-many-blocks.log \
      ${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1
elif [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
  echo "Skipped loading the metadata."
  run-step "Loading HBase data only" load-hbase-only.log \
      load-data "functional-query" "core" "hbase/none"
fi

if [[ $SKIP_METADATA_LOAD -eq 1 ]]; then
  # Tests depend on the kudu data being clean, so load the data from scratch.
  # This is only necessary if this is not a full dataload, because a full dataload
  # already loads Kudu functional and TPC-H tables from scratch.
  run-step-backgroundable "Loading Kudu functional" load-kudu.log \
      load-data "functional-query" "core" "kudu/none/none" force
  run-step-backgroundable "Loading Kudu TPCH" load-kudu-tpch.log \
      load-data "tpch" "core" "kudu/none/none" force
fi
run-step-backgroundable "Loading Hive UDFs" build-and-copy-hive-udfs.log \
    build-and-copy-hive-udfs
run-step-wait-all
run-step "Running custom post-load steps" custom-post-load-steps.log \
    custom-post-load-steps

if [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
  # Caching tables in s3 returns an IllegalArgumentException, see IMPALA-1714
  run-step "Caching test tables" cache-test-tables.log cache-test-tables

  run-step "Creating internal HBase table" create-internal-hbase-table.log \
      create-internal-hbase-table

  run-step "Checking HDFS health" check-hdfs-health.log check-hdfs-health

  # Saving the list of created files can help in debugging missing files.
  run-step "Logging created files" created-files.log hdfs dfs -ls -R /test-warehouse
fi

if [[ "${TARGET_FILESYSTEM}" = "hdfs" || "${TARGET_FILESYSTEM}" = "ozone" || \
      "${TARGET_FILESYSTEM}" = "s3" ]]; then
  # TODO: Modify the .sql file that creates the table to take an alternative location into
  # account.
  run-step "Loading external data sources" load-ext-data-source.log \
      copy-and-load-ext-data-source
fi

# TODO: Investigate why all stats are not preserved. Theoretically, we only need to
# recompute stats for HBase.
run-step "Computing table stats" compute-table-stats.log \
    ${IMPALA_HOME}/testdata/bin/compute-table-stats.sh

# IMPALA-8346: this step only applies if the cluster is the local minicluster
if [[ -z "$REMOTE_LOAD" ]]; then
  run-step "Creating tpcds testcase data" create-tpcds-testcase-data.log \
      ${IMPALA_HOME}/testdata/bin/create-tpcds-testcase-files.sh
fi

if [[ $SKIP_RANGER -eq 0 ]]; then
  run-step "Setting up Ranger" setup-ranger.log \
      ${IMPALA_HOME}/testdata/bin/setup-ranger.sh
fi

# Restart the minicluster. This is strictly to provide a sanity check that
# restarting the minicluster works and doesn't impact the tests. This is a common
# operation for developers, so it is nice to test it.
restart-cluster

# Kill the spawned timeout process and its child sleep process.
# There may not be a sleep process, so ignore failure.
pkill -P $TIMEOUT_PID || true
kill $TIMEOUT_PID