Allow Impala to start with only a running HMS (and no additional services such as
HDFS, HBase, Hive, or YARN) and use the local file system. Skip all tests that need
these services, use HDFS caching, or assume that multiple impalads are running.

To run Impala against the local filesystem, set TARGET_FILESYSTEM to 'local' and
WAREHOUSE_LOCATION_PREFIX to a location on the local filesystem where the current
user has write permission, since this is where the test data will be extracted.

Test coverage (with the core exploration strategy) compared with HDFS and S3:
  HDFS:             1348 tests passed
  S3:               1157 tests passed
  Local filesystem: 1161 tests passed

Change-Id: Ic9718c7e0307273382b1cc6baf203ff2fb2acd03
Reviewed-on: http://gerrit.cloudera.org:8080/1352
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
Readability: Alex Behm <alex.behm@cloudera.com>
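
For example, a local-filesystem run might be configured like this before invoking the
data load (a minimal sketch; the warehouse path is only an illustration, and any local
directory the current user can write to works):

    export TARGET_FILESYSTEM=local
    export WAREHOUSE_LOCATION_PREFIX=/tmp/local-fs-warehouse  # hypothetical path
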
#!/bin/bash
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
# generated as needed, generally by issuing 'INSERT INTO <table> SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
# case the snapshot file contents will be copied into HDFS prior to calling the data load
# scripts. This speeds up overall data loading time because it usually means only the
# table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py

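# Example invocations (illustrative only; assumes this script lives under testdata/bin,
# the snapshot path is hypothetical, and the flags are the ones parsed further below):
#   ./testdata/bin/create-load-data.sh
#   ./testdata/bin/create-load-data.sh -snapshot_file /tmp/test-warehouse-SNAPSHOT.tar.gz
#   ./testdata/bin/create-load-data.sh -skip_snapshot_load -skip_metadata_load
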
# Exit on error.
set -e

. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
SKIP_METADATA_LOAD=0
SKIP_SNAPSHOT_LOAD=0
SNAPSHOT_FILE=""
LOAD_DATA_ARGS=""
JDBC_URL="jdbc:hive2://localhost:11050/default;"
DATA_LOADING_LOG_DIR=${IMPALA_TEST_CLUSTER_LOG_DIR}/data_loading
mkdir -p ${DATA_LOADING_LOG_DIR}

while [ -n "$*" ]
do
  case $1 in
    -skip_metadata_load)
      SKIP_METADATA_LOAD=1
      ;;
    -skip_snapshot_load)
      SKIP_SNAPSHOT_LOAD=1
      ;;
    -snapshot_file)
      SNAPSHOT_FILE=${2-}
      if [ ! -f "$SNAPSHOT_FILE" ]; then
        echo "-snapshot_file does not exist: $SNAPSHOT_FILE"
        exit 1;
      fi
      shift;
      ;;
    -help|-h|*)
      echo "create-load-data.sh : Creates data and loads from scratch"
      echo "[-skip_metadata_load] : Skips loading of metadata"
      echo "[-skip_snapshot_load] : Assumes that the snapshot is already loaded"
      echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs"
      exit 1;
      ;;
  esac
  shift;
done

if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then
  echo "Loading Hive Builtins"
  ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh
  echo "Generating HBase data"
  ${IMPALA_HOME}/testdata/bin/create-hbase.sh &> ${DATA_LOADING_LOG_DIR}/create-hbase.log
  echo "Creating /test-warehouse HDFS directory"
  hadoop fs -mkdir /test-warehouse
elif [ $SKIP_SNAPSHOT_LOAD -eq 0 ]; then
  echo Loading hdfs data from snapshot: $SNAPSHOT_FILE
  ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$SNAPSHOT_FILE"
  # Don't skip the metadata load if a schema change is detected.
  if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh; then
    echo "Schema change detected, metadata will be loaded."
    SKIP_METADATA_LOAD=0
  fi
else
  # hdfs data already exists, don't load it.
  echo Skipping loading data to hdfs.
fi

function load-custom-schemas {
  echo LOADING CUSTOM SCHEMAS
  SCHEMA_SRC_DIR=${IMPALA_HOME}/testdata/data/schemas
  SCHEMA_DEST_DIR=/test-warehouse/schemas
  # clean the old schemas directory.
  hadoop fs -rm -r -f ${SCHEMA_DEST_DIR}
  hadoop fs -mkdir ${SCHEMA_DEST_DIR}
  hadoop fs -put $SCHEMA_SRC_DIR/zipcode_incomes.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/alltypestiny.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/malformed_decimal_tiny.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/decimal.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/nested/modern_nested.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/nested/legacy_nested.parquet ${SCHEMA_DEST_DIR}/

  # CHAR and VARCHAR tables written by Hive
  hadoop fs -mkdir -p /test-warehouse/chars_formats_avro_snap/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.avro \
    /test-warehouse/chars_formats_avro_snap
  hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \
    /test-warehouse/chars_formats_parquet
  hadoop fs -mkdir -p /test-warehouse/chars_formats_text/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \
    /test-warehouse/chars_formats_text
}

function load-data {
  WORKLOAD=${1}
  EXPLORATION_STRATEGY=${2:-"core"}
  TABLE_FORMATS=${3:-}

  MSG="Loading workload '$WORKLOAD'"
  ARGS=("--workloads $WORKLOAD")
  MSG+=" Using exploration strategy '$EXPLORATION_STRATEGY'"
  ARGS+=("-e $EXPLORATION_STRATEGY")
  if [ -n "$TABLE_FORMATS" ]; then
    MSG+=" in table formats '$TABLE_FORMATS'"
    ARGS+=("--table_formats $TABLE_FORMATS")
  fi
  if [ -n "$LOAD_DATA_ARGS" ]; then
    ARGS+=("$LOAD_DATA_ARGS")
  fi
  # functional-query is unique. The dataset name is not the same as the workload name.
  if [ "${WORKLOAD}" = "functional-query" ]; then
    WORKLOAD="functional"
  fi
  # Force load the dataset if we detect a schema change.
  if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh $WORKLOAD; then
    ARGS+=("--force")
    echo "Force loading $WORKLOAD because a schema change was detected"
  fi
  LOG_FILE=${DATA_LOADING_LOG_DIR}/data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}.log
  echo "$MSG. Logging to ${LOG_FILE}"
  # Use unbuffered logging by executing with -u
  impala-python -u ${IMPALA_HOME}/bin/load-data.py ${ARGS[@]} &> ${LOG_FILE}
}
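
# Example calls to load-data (these mirror the call sites near the end of this script;
# the third argument is an optional table-format filter):
#   load-data "tpch" "core"
#   load-data "functional-query" "exhaustive"
#   load-data "functional-query" "core" "hbase/none"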

function cache-test-tables {
  echo CACHING tpch.nation AND functional.alltypestiny
  # uncaching the tables first makes this operation idempotent.
  ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table functional.alltypestiny set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table tpch.nation set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table tpch.nation set cached in 'testPool'"
  ${IMPALA_HOME}/bin/impala-shell.sh -q\
    "alter table functional.alltypestiny set cached in 'testPool'"
}

function load-aux-workloads {
  echo LOADING AUXILIARY WORKLOADS
  LOG_FILE=${DATA_LOADING_LOG_DIR}/data-load-auxiliary-workloads-core.log
  rm -f $LOG_FILE
  # Load all the auxiliary workloads (if any exist)
  if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
    impala-python -u ${IMPALA_HOME}/bin/load-data.py --workloads all\
      --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\
      --dataset_dir=${IMPALA_AUX_DATASET_DIR}\
      --exploration_strategy=core ${LOAD_DATA_ARGS} &>> $LOG_FILE
  else
    echo "Skipping load of auxiliary workloads because directories do not exist"
  fi
}

function copy-auth-policy {
  echo COPYING AUTHORIZATION POLICY FILE
  hadoop fs -rm -f ${FILESYSTEM_PREFIX}/test-warehouse/authz-policy.ini
  hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini \
    ${FILESYSTEM_PREFIX}/test-warehouse/
}

function copy-and-load-dependent-tables {
  # COPY
  # TODO: The multi-format table will move these files. So we need to copy them to a
  # temporary location for that table to use. Should find a better way to handle this.
  echo COPYING AND LOADING DATA FOR DEPENDENT TABLES
  hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat
  hadoop fs -rm -r -f /tmp/alltypes_rc
  hadoop fs -rm -r -f /tmp/alltypes_seq
  hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009
  hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009
  hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
  hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009

  # Create a hidden file in AllTypesSmall
  hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden
  hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden
  hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
    /test-warehouse/alltypessmall/year=2009/month=1/_hidden
  hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
    /test-warehouse/alltypessmall/year=2009/month=1/.hidden

  # For tables that rely on loading data from local fs test-warehouse
  # TODO: Find a good way to integrate this with the normal data loading scripts
  beeline -n $USER -u "${JDBC_URL}" -f\
    ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
}

function create-internal-hbase-table {
  echo CREATING INTERNAL HBASE TABLE
  # TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
  # not exist. To work around this, disable exit on error before executing this command.
  # Need to investigate this more, but this works around the problem to unblock automation.
  set +o errexit
  beeline -n $USER -u "${JDBC_URL}" -e\
    "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table"
  echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
  echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
  set -e
  # Used by CatalogTest to confirm that non-external HBase tables are identified
  # correctly (IMP-581)
  # Note that the usual 'hbase.table.name' property is not specified to avoid
  # creating tables in HBase as a side-effect.
  cat > /tmp/create-hbase-internal.sql << EOF
CREATE TABLE functional_hbase.internal_hbase_table(key int, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val");
EOF
  beeline -n $USER -u "${JDBC_URL}" -f /tmp/create-hbase-internal.sql
  rm -f /tmp/create-hbase-internal.sql
}

function load-custom-data {
  echo LOADING CUSTOM DATA
  # Load the index files for corrupted lzo data.
  hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
  hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
    /test-warehouse/bad_text_lzo_text_lzo/

  hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
  hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
  # Cleanup the old bad_text_lzo files, if they exist.
  hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/

  # Index all lzo files in HDFS under /test-warehouse
  ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse

  hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/

  # IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \
    /test-warehouse/bad_parquet_parquet

  # Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary)
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \
    /test-warehouse/bad_parquet_parquet

  # IMPALA-720: data file produced by parquet-mr with multiple row groups
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \
    /test-warehouse/bad_parquet_parquet

  # IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max
  # string values
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \
    /test-warehouse/bad_parquet_parquet

  # Remove all index files in this partition.
  hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index

  # Add a sequence file that only contains a header (see IMPALA-362)
  hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
    /test-warehouse/tinytable_seq_snap

  beeline -n $USER -u "${JDBC_URL}" -f\
    ${IMPALA_HOME}/testdata/avro_schema_resolution/create_table.sql
}

function build-and-copy-hive-udfs {
  # Build the test Hive UDFs
  pushd ${IMPALA_HOME}/tests/test-hive-udfs
  mvn clean
  # quietly resolve dependencies to avoid log spew in jenkins runs
  if [ "${USER}" == "jenkins" ]; then
    echo "Quietly resolving test UDF dependencies."
    mvn -q dependency:resolve
  fi
  mvn package
  popd
  # Copy the test UDF/UDA libraries into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh -build
}

function copy-and-load-ext-data-source {
  # Copy the test data source library into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-data-sources.sh
  # Create data sources table.
  ${IMPALA_HOME}/bin/impala-shell.sh -f\
    ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql
}

# Enable debug logging.
set -x

# For kerberized clusters, use kerberos
if ${CLUSTER_DIR}/admin is_kerberized; then
  LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
fi

# Start Impala
START_CLUSTER_ARGS=""
if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
  START_CLUSTER_ARGS="--impalad_args=--abort_on_config_error=false -s 1"
else
  START_CLUSTER_ARGS="-s 3"
fi
${IMPALA_HOME}/bin/start-impala-cluster.py --log_dir=${DATA_LOADING_LOG_DIR} \
  ${START_CLUSTER_ARGS}
# The hdfs environment script sets up kms (encryption) and cache pools (hdfs caching).
# On a non-hdfs filesystem, we don't test encryption or hdfs caching, so this setup is not
# needed.
if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
  ${IMPALA_HOME}/testdata/bin/setup-hdfs-env.sh
fi

if [ $SKIP_METADATA_LOAD -eq 0 ]; then
  # Load the custom schemas.
  load-custom-schemas
  # Load functional/tpcds/tpch.
  load-data "functional-query" "exhaustive"
  load-data "tpch" "core"
  # Load tpch nested data.
  # TODO: Hacky and introduces more complexity into the system, but it is expedient.
  ${IMPALA_HOME}/testdata/bin/load_nested.sh
  load-data "tpcds" "core"
  load-aux-workloads
  copy-and-load-dependent-tables
  load-custom-data
  ${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1
elif [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
  echo "Skipped loading the metadata. Loading HBase."
  load-data "functional-query" "core" "hbase/none"
fi

build-and-copy-hive-udfs
# Configure alltypes_seq as a read-only table. This is required for fe tests.
# Set both read and execute permissions because accessing the contents of a directory on
# the local filesystem requires the x permission (while on HDFS it requires the r
# permission).
hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=1
hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=3

# IMPALA-1881: data file produced by hive with multiple blocks.
hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet
hadoop fs -Ddfs.block.size=1048576 -put -f \
  ${IMPALA_HOME}/testdata/LineItemMultiBlock/000000_0 \
  ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet

if [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
|
|
# Caching tables in s3 returns an IllegalArgumentException, see IMPALA-1714
|
|
cache-test-tables
|
|
# TODO: Modify the .sql file that creates the table to take an alternative location into
|
|
# account.
|
|
copy-and-load-ext-data-source
|
|
${IMPALA_HOME}/testdata/bin/split-hbase.sh > /dev/null 2>&1
|
|
create-internal-hbase-table
|
|
fi
|
|
# TODO: Investigate why all stats are not preserved. Theorectically, we only need to
|
|
# recompute stats for HBase.
|
|
${IMPALA_HOME}/testdata/bin/compute-table-stats.sh
|
|
copy-auth-policy
|