#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
#    generated as needed, generally by issuing 'INSERT INTO SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
#    case the snapshot file contents will be copied into HDFS prior to calling the data
#    load scripts. This speeds up overall data loading time because it usually means only
#    the table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py
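#
# For example (the snapshot path below is illustrative only):
#   ./create-load-data.sh
#   ./create-load-data.sh -snapshot_file /tmp/test-warehouse-SNAPSHOT.tar.gz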

set -euo pipefail
trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR

. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
. ${IMPALA_HOME}/testdata/bin/run-step.sh

# Environment variables used to direct the data loading process to an external cluster.
# TODO: We need a better way of managing how these get set. See IMPALA-4346
: ${HS2_HOST_PORT=localhost:11050}
: ${HDFS_NN=localhost:20500}
: ${IMPALAD=localhost:21000}
: ${REMOTE_LOAD=}
: ${CM_HOST=}
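
# As a rough illustration only (hypothetical host names, not a supported recipe), a load
# against an external cluster could override these variables before invoking the script:
#   HS2_HOST_PORT=remote-hs2.example.com:11050 HDFS_NN=remote-nn.example.com:20500 \
#     IMPALAD=remote-impalad.example.com:21000 REMOTE_LOAD=true ./create-load-data.sh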

SKIP_METADATA_LOAD=0
SKIP_SNAPSHOT_LOAD=0
SNAPSHOT_FILE=""
LOAD_DATA_ARGS=""
EXPLORATION_STRATEGY="exhaustive"
export JDBC_URL="jdbc:hive2://${HS2_HOST_PORT}/default;"
# For logging when using run-step.
LOG_DIR=${IMPALA_DATA_LOADING_LOGS_DIR}

echo "Executing: create-load-data.sh $@"

while [ -n "$*" ]
do
  case $1 in
    -exploration_strategy)
      EXPLORATION_STRATEGY=${2-}
      if [[ -z "$EXPLORATION_STRATEGY" ]]; then
        echo "Must provide an exploration strategy, e.g. core or exhaustive"
        exit 1;
      fi
      shift;
      ;;
    -skip_metadata_load)
      SKIP_METADATA_LOAD=1
      ;;
    -skip_snapshot_load)
      SKIP_SNAPSHOT_LOAD=1
      ;;
    -snapshot_file)
      SNAPSHOT_FILE=${2-}
      if [ ! -f $SNAPSHOT_FILE ]; then
        echo "-snapshot_file does not exist: $SNAPSHOT_FILE"
        exit 1;
      fi
      shift;
      ;;
    -cm_host)
      CM_HOST=${2-}
      shift;
      ;;
    -help|-h|*)
      echo "create-load-data.sh : Creates data and loads from scratch"
      echo "[-exploration_strategy] : Exploration strategy to use, e.g. core, exhaustive"
      echo "[-skip_metadata_load] : Skips loading of metadata"
      echo "[-skip_snapshot_load] : Assumes that the snapshot is already loaded"
      echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs"
      echo "[-cm_host] : Address of the Cloudera Manager host if loading to a remote cluster"
      exit 1;
      ;;
  esac
  shift;
done

if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then
  if [[ -z "$REMOTE_LOAD" ]]; then
    run-step "Loading Hive Builtins" load-hive-builtins.log \
        ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh
  fi
  run-step "Generating HBase data" create-hbase.log \
      ${IMPALA_HOME}/testdata/bin/create-hbase.sh
  run-step "Creating /test-warehouse HDFS directory" create-test-warehouse-dir.log \
      hadoop fs -mkdir /test-warehouse
elif [ $SKIP_SNAPSHOT_LOAD -eq 0 ]; then
  run-step "Loading HDFS data from snapshot: $SNAPSHOT_FILE" \
      load-test-warehouse-snapshot.log \
      ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$SNAPSHOT_FILE"
  # Don't skip the metadata load if a schema change is detected.
  if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh; then
    if [[ "${TARGET_FILESYSTEM}" == "isilon" || "${TARGET_FILESYSTEM}" == "s3" || \
          "${TARGET_FILESYSTEM}" == "local" ]] ; then
      echo "ERROR in $0 at line $LINENO: A schema change has been detected in the"
      echo "metadata, but it cannot be loaded on isilon, s3 or local and the"
      echo "target file system is ${TARGET_FILESYSTEM}. Exiting."
      exit 1
    fi
    echo "Schema change detected, metadata will be loaded."
    SKIP_METADATA_LOAD=0
  fi
else
  # hdfs data already exists, don't load it.
  echo Skipping loading data to hdfs.
fi

echo "Derived params for create-load-data.sh:"
echo "EXPLORATION_STRATEGY=${EXPLORATION_STRATEGY:-}"
echo "SKIP_METADATA_LOAD=${SKIP_METADATA_LOAD:-}"
echo "SKIP_SNAPSHOT_LOAD=${SKIP_SNAPSHOT_LOAD:-}"
echo "SNAPSHOT_FILE=${SNAPSHOT_FILE:-}"
echo "CM_HOST=${CM_HOST:-}"
echo "REMOTE_LOAD=${REMOTE_LOAD:-}"

function load-custom-schemas {
  SCHEMA_SRC_DIR=${IMPALA_HOME}/testdata/data/schemas
  SCHEMA_DEST_DIR=/test-warehouse/schemas
  # clean the old schemas directory.
  hadoop fs -rm -r -f ${SCHEMA_DEST_DIR}
  hadoop fs -mkdir ${SCHEMA_DEST_DIR}
  hadoop fs -put $SCHEMA_SRC_DIR/zipcode_incomes.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/alltypestiny.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/enum ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/malformed_decimal_tiny.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/decimal.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/nested/modern_nested.parquet ${SCHEMA_DEST_DIR}/
  hadoop fs -put $SCHEMA_SRC_DIR/nested/legacy_nested.parquet ${SCHEMA_DEST_DIR}/

  # CHAR and VARCHAR tables written by Hive
  hadoop fs -mkdir -p /test-warehouse/chars_formats_avro_snap/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.avro \
      /test-warehouse/chars_formats_avro_snap
  hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \
      /test-warehouse/chars_formats_parquet
  hadoop fs -mkdir -p /test-warehouse/chars_formats_text/
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \
      /test-warehouse/chars_formats_text
}

function load-data {
  WORKLOAD=${1}
  EXPLORATION_STRATEGY=${2:-"core"}
  TABLE_FORMATS=${3:-}
  FORCE_LOAD=${4:-}

  MSG="Loading workload '$WORKLOAD'"
  ARGS=("--workloads $WORKLOAD")
  MSG+=" using exploration strategy '$EXPLORATION_STRATEGY'"
  ARGS+=("-e $EXPLORATION_STRATEGY")
  if [ -n "${TABLE_FORMATS}" ]; then
    MSG+=" in table formats '$TABLE_FORMATS'"
    ARGS+=("--table_formats $TABLE_FORMATS")
  fi
  if [ -n "${LOAD_DATA_ARGS}" ]; then
    ARGS+=("$LOAD_DATA_ARGS")
  fi
  # functional-query is unique. The dataset name is not the same as the workload name.
  if [ "${WORKLOAD}" = "functional-query" ]; then
    WORKLOAD="functional"
  fi
  # TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
  #
  # Force load the dataset if we detect a schema change.
  if [[ -z "$REMOTE_LOAD" ]]; then
    if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh $WORKLOAD; then
      ARGS+=("--force")
      echo "Force loading $WORKLOAD because a schema change was detected"
    elif [ "${FORCE_LOAD}" = "force" ]; then
      ARGS+=("--force")
      echo "Force loading."
    fi
  fi
  ARGS+=("--impalad ${IMPALAD}")
  ARGS+=("--hive_hs2_hostport ${HS2_HOST_PORT}")
  ARGS+=("--hdfs_namenode ${HDFS_NN}")
  if [[ -n ${TABLE_FORMATS} ]]; then
    # TBL_FMT_STR replaces slashes with underscores,
    # e.g., kudu/none/none -> kudu_none_none
    TBL_FMT_STR=${TABLE_FORMATS//[\/]/_}
    LOG_BASENAME=data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}-${TBL_FMT_STR}.log
  else
    LOG_BASENAME=data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}.log
  fi
  LOG_FILE=${IMPALA_DATA_LOADING_LOGS_DIR}/${LOG_BASENAME}
  echo "$MSG. Logging to ${LOG_FILE}"
  # Use unbuffered logging by executing with -u
  if ! impala-python -u ${IMPALA_HOME}/bin/load-data.py ${ARGS[@]} &> ${LOG_FILE}; then
    echo Error loading data. The end of the log file is:
    tail -n 50 $LOG_FILE
    return 1
  fi
}
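
# For reference (hypothetical invocation, mirroring the Kudu load further below), a call
# such as
#   load-data "tpch" "core" "kudu/none/none" force
# replaces the slashes in the table format with underscores and logs to
# ${IMPALA_DATA_LOADING_LOGS_DIR}/data-load-tpch-core-kudu_none_none.log.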

function cache-test-tables {
  echo CACHING tpch.nation AND functional.alltypestiny
  # uncaching the tables first makes this operation idempotent.
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD}\
      -q "alter table functional.alltypestiny set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD}\
      -q "alter table tpch.nation set uncached"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD}\
      -q "alter table tpch.nation set cached in 'testPool'"
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} -q\
      "alter table functional.alltypestiny set cached in 'testPool'"
}
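
# If the caching above needs to be verified by hand, the cache directives in the pool can
# be listed with the stock HDFS tooling, e.g. (illustrative, not run by this script):
#   hdfs cacheadmin -listDirectives -pool testPool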

function load-aux-workloads {
  LOG_FILE=${IMPALA_DATA_LOADING_LOGS_DIR}/data-load-auxiliary-workloads-core.log
  rm -f $LOG_FILE
  # Load all the auxiliary workloads (if any exist)
  if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
    echo Loading auxiliary workloads. Logging to $LOG_FILE.
    if ! impala-python -u ${IMPALA_HOME}/bin/load-data.py --workloads all\
        --impalad=${IMPALAD}\
        --hive_hs2_hostport=${HS2_HOST_PORT}\
        --hdfs_namenode=${HDFS_NN}\
        --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\
        --dataset_dir=${IMPALA_AUX_DATASET_DIR}\
        --exploration_strategy=core ${LOAD_DATA_ARGS} >> $LOG_FILE 2>&1; then
      echo Error loading aux workloads. The end of the log file is:
      tail -n 20 $LOG_FILE
      return 1
    fi
  else
    echo "Skipping load of auxiliary workloads because directories do not exist"
  fi
}

function copy-auth-policy {
  echo COPYING AUTHORIZATION POLICY FILE
  hadoop fs -rm -f ${FILESYSTEM_PREFIX}/test-warehouse/authz-policy.ini
  hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini \
      ${FILESYSTEM_PREFIX}/test-warehouse/
}

function copy-and-load-dependent-tables {
  # COPY
  # TODO: The multi-format table will move these files. So we need to copy them to a
  # temporary location for that table to use. Should find a better way to handle this.
  echo COPYING AND LOADING DATA FOR DEPENDENT TABLES
  hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat
  hadoop fs -rm -r -f /tmp/alltypes_rc
  hadoop fs -rm -r -f /tmp/alltypes_seq
  hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009
  hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009
  hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
  hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009

  # Create a hidden file in AllTypesSmall
  hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden
  hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden
  hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
      /test-warehouse/alltypessmall/year=2009/month=1/_hidden
  hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
      /test-warehouse/alltypessmall/year=2009/month=1/.hidden

  # In case the data is updated by a non-super user, make sure the user can write
  # by chmoding 777 /tmp/alltypes_rc and /tmp/alltypes_seq. This is needed in order
  # to prevent this error during data load to a remote cluster:
  #
  #   ERROR : Failed with exception Unable to move source hdfs://cluster-1.foo.cloudera.com:
  #   8020/tmp/alltypes_seq/year=2009/month=2/000023_0 to destination hdfs://cluster-1.foo.
  #   cloudera.com:8020/test-warehouse/alltypesmixedformat/year=2009/month=2/000023_0
  #   [...]
  #   Caused by: org.apache.hadoop.security.AccessControlException:
  #   Permission denied: user=impala, access=WRITE
  #   inode="/tmp/alltypes_seq/year=2009/month=2":hdfs:supergroup:drwxr-xr-x
  #
  # The error occurs while loading dependent tables.
  #
  # See: logs/data_loading/copy-and-load-dependent-tables.log
  # See also: IMPALA-4345
  hadoop fs -chmod -R 777 /tmp/alltypes_rc
  hadoop fs -chmod -R 777 /tmp/alltypes_seq

  # For tables that rely on loading data from the local fs test-warehouse
  # TODO: Find a good way to integrate this with the normal data loading scripts
  beeline -n $USER -u "${JDBC_URL}" -f\
      ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
}

function create-internal-hbase-table {
  # TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
  # not exist. To work around this, disable exit on error before executing this command.
  # Need to investigate this more, but this works around the problem to unblock automation.
  set +o errexit
  beeline -n $USER -u "${JDBC_URL}" -e\
      "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table;"
  echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
  echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
  set -e
  # Used by CatalogTest to confirm that non-external HBase tables are identified
  # correctly (IMP-581)
  # Note that the usual 'hbase.table.name' property is not specified to avoid
  # creating tables in HBase as a side-effect.
  cat > /tmp/create-hbase-internal.sql << EOF
CREATE TABLE functional_hbase.internal_hbase_table(key int, value string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val");
EOF
  beeline -n $USER -u "${JDBC_URL}" -f /tmp/create-hbase-internal.sql
  rm -f /tmp/create-hbase-internal.sql
}
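
# To double-check by hand that the table was created as a managed (non-external) table,
# something like the following can be run from beeline (illustrative, not run by this
# script); it should report "Table Type: MANAGED_TABLE":
#   beeline -n $USER -u "${JDBC_URL}" -e \
#       "DESCRIBE FORMATTED functional_hbase.internal_hbase_table;"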

function load-custom-data {
  # Load the index files for corrupted lzo data.
  hadoop fs -mkdir -p /test-warehouse/bad_text_lzo_text_lzo
  hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
  hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
      /test-warehouse/bad_text_lzo_text_lzo/

  hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
  hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /

  # Cleanup the old bad_text_lzo files, if they exist.
  hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/

  # TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
  if [[ -z $REMOTE_LOAD ]]; then
    # Index all lzo files in HDFS under /test-warehouse
    ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse
  fi

  hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/

  # IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \
      /test-warehouse/bad_parquet_parquet

  # Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary)
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \
      /test-warehouse/bad_parquet_parquet

  # IMPALA-720: data file produced by parquet-mr with multiple row groups
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \
      /test-warehouse/bad_parquet_parquet

  # IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max
  # string values
  hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \
      /test-warehouse/bad_parquet_parquet

  # IMPALA-3732: parquet files with corrupt strings
  local parq_file
  for parq_file in dict-encoded-negative-len.parq plain-encoded-negative-len.parq; do
    hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \
        /test-warehouse/bad_parquet_strings_negative_len_parquet
  done
  for parq_file in dict-encoded-out-of-bounds.parq plain-encoded-out-of-bounds.parq; do
    hadoop fs -put -f ${IMPALA_HOME}/testdata/bad_parquet_data/$parq_file \
        /test-warehouse/bad_parquet_strings_out_of_bounds_parquet
  done

  # Remove all index files in this partition.
  hadoop fs -rm -f /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index

  # Add a sequence file that only contains a header (see IMPALA-362)
  hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
      /test-warehouse/tinytable_seq_snap

  # IMPALA-1619: payload compressed with snappy used for constructing large snappy block
  # compressed file
  hadoop fs -put -f ${IMPALA_HOME}/testdata/compressed_formats/compressed_payload.snap \
      /test-warehouse/compressed_payload.snap

  # Create Avro tables
  beeline -n $USER -u "${JDBC_URL}" -f\
      ${IMPALA_HOME}/testdata/avro_schema_resolution/create_table.sql
  # Delete potentially existing avro data
  hadoop fs -rm -f /test-warehouse/avro_schema_resolution_test/*.avro
  # Upload Avro data to the 'schema_resolution_test' table
  hadoop fs -put ${IMPALA_HOME}/testdata/avro_schema_resolution/records*.avro \
      /test-warehouse/avro_schema_resolution_test
}

function build-and-copy-hive-udfs {
  # Build the test Hive UDFs
  pushd ${IMPALA_HOME}/tests/test-hive-udfs
  ${IMPALA_HOME}/bin/mvn-quiet.sh clean
  ${IMPALA_HOME}/bin/mvn-quiet.sh package
  popd
  # Copy the test UDF/UDA libraries into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh -build
}

# Additional data loading actions that must be executed after the main data is loaded.
function custom-post-load-steps {
  # TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
  if [[ -z "$REMOTE_LOAD" ]]; then
    # Configure alltypes_seq as a read-only table. This is required for fe tests.
    # Set both read and execute permissions because accessing the contents of a directory
    # on the local filesystem requires the x permission (while on HDFS it requires the r
    # permission).
    hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=1
    hadoop fs -chmod -R 555 ${FILESYSTEM_PREFIX}/test-warehouse/alltypes_seq/year=2009/month=3
  fi

  # IMPALA-1881: data file produced by hive with multiple blocks.
  hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/000000_0 \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_parquet

  # IMPALA-2466: Add more tests to the HDFS Parquet scanner (Added after IMPALA-1881)
  hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet && \
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_sixblocks.parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_sixblocks_parquet

  # IMPALA-2466: Add more tests to the HDFS Parquet scanner (this has only one row group)
  hadoop fs -mkdir -p ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet && \
  hadoop fs -Ddfs.block.size=1048576 -put -f \
      ${IMPALA_HOME}/testdata/LineItemMultiBlock/lineitem_one_row_group.parquet \
      ${FILESYSTEM_PREFIX}/test-warehouse/lineitem_multiblock_one_row_group_parquet
}

function copy-and-load-ext-data-source {
  # Copy the test data source library into HDFS
  ${IMPALA_HOME}/testdata/bin/copy-data-sources.sh
  # Create data sources table.
  ${IMPALA_HOME}/bin/impala-shell.sh -i ${IMPALAD} -f\
      ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql
}

function wait-hdfs-replication {
  # Poll 'hdfs fsck' up to 6 times, 5 seconds apart (roughly 30s in total), until it
  # reports no under-replicated blocks.
  FAIL_COUNT=0
  while [[ "$FAIL_COUNT" -ne "6" ]] ; do
    FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
    echo "$FSCK_OUTPUT"
    if grep "Under-replicated blocks:[[:space:]]*0" <<< "$FSCK_OUTPUT"; then
      return
    fi
    let FAIL_COUNT="$FAIL_COUNT"+1
    sleep 5
  done
  echo "Some HDFS blocks are still under replicated after 30s."
  echo "Some tests cannot pass without fully replicated blocks (IMPALA-3887)."
  echo "Failing the data loading."
  exit 1
}

# For kerberized clusters, use kerberos
if ${CLUSTER_DIR}/admin is_kerberized; then
  LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
fi

# Start Impala
: ${START_CLUSTER_ARGS=""}
if [[ "${TARGET_FILESYSTEM}" == "local" ]]; then
  START_CLUSTER_ARGS="--impalad_args=--abort_on_config_error=false -s 1 ${START_CLUSTER_ARGS}"
else
  START_CLUSTER_ARGS="-s 3 ${START_CLUSTER_ARGS}"
fi
if [[ -z "$REMOTE_LOAD" ]]; then
  run-step "Starting Impala cluster" start-impala-cluster.log \
      ${IMPALA_HOME}/bin/start-impala-cluster.py --log_dir=${IMPALA_DATA_LOADING_LOGS_DIR} \
      ${START_CLUSTER_ARGS}
fi

# The hdfs environment script sets up kms (encryption) and cache pools (hdfs caching).
# On a non-hdfs filesystem, we don't test encryption or hdfs caching, so this setup is
# not needed.
if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
  run-step "Setting up HDFS environment" setup-hdfs-env.log \
      ${IMPALA_HOME}/testdata/bin/setup-hdfs-env.sh
fi

if [ $SKIP_METADATA_LOAD -eq 0 ]; then
  run-step "Loading custom schemas" load-custom-schemas.log load-custom-schemas
  # Run some steps in parallel, with run-step-backgroundable / run-step-wait-all.
  # This is effective on steps that take a long time and don't depend on each
  # other. Functional-query takes about 35 minutes, and TPC-H and TPC-DS can
  # finish while functional-query is running.
  run-step-backgroundable "Loading functional-query data" load-functional-query.log \
      load-data "functional-query" "exhaustive"
  run-step-backgroundable "Loading TPC-H data" load-tpch.log load-data "tpch" "core"
  run-step-backgroundable "Loading TPC-DS data" load-tpcds.log load-data "tpcds" "core"
  run-step-wait-all
  # Load tpch nested data.
  # TODO: Hacky and introduces more complexity into the system, but it is expedient.
  if [[ -n "$CM_HOST" ]]; then
    LOAD_NESTED_ARGS="--cm-host $CM_HOST"
  fi
  run-step "Loading nested data" load-nested.log \
      ${IMPALA_HOME}/testdata/bin/load_nested.py ${LOAD_NESTED_ARGS:-}
  run-step "Loading auxiliary workloads" load-aux-workloads.log load-aux-workloads
  run-step "Loading dependent tables" copy-and-load-dependent-tables.log \
      copy-and-load-dependent-tables
  run-step "Loading custom data" load-custom-data.log load-custom-data
  run-step "Creating many block table" create-table-many-blocks.log \
      ${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1
elif [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
  echo "Skipped loading the metadata."
  run-step "Loading HBase data only" load-hbase-only.log \
      load-data "functional-query" "core" "hbase/none"
fi

if $KUDU_IS_SUPPORTED; then
  # Tests depend on the kudu data being clean, so load the data from scratch.
  run-step-backgroundable "Loading Kudu functional" load-kudu.log \
      load-data "functional-query" "core" "kudu/none/none" force
  run-step-backgroundable "Loading Kudu TPCH" load-kudu-tpch.log \
      load-data "tpch" "core" "kudu/none/none" force
fi
run-step-backgroundable "Loading Hive UDFs" build-and-copy-hive-udfs.log \
    build-and-copy-hive-udfs
run-step-wait-all
run-step "Running custom post-load steps" custom-post-load-steps.log \
    custom-post-load-steps

if [ "${TARGET_FILESYSTEM}" = "hdfs" ]; then
  # Caching tables in s3 returns an IllegalArgumentException, see IMPALA-1714
  run-step "Caching test tables" cache-test-tables.log cache-test-tables

  # TODO: Modify the .sql file that creates the table to take an alternative location
  # into account.
  run-step "Loading external data sources" load-ext-data-source.log \
      copy-and-load-ext-data-source

  # HBase splitting is only relevant for FE tests
  if [[ -z "$REMOTE_LOAD" ]]; then
    run-step "Splitting HBase" create-hbase.log ${IMPALA_HOME}/testdata/bin/split-hbase.sh
  fi

  run-step "Creating internal HBase table" create-internal-hbase-table.log \
      create-internal-hbase-table

  run-step "Waiting for HDFS replication" wait-hdfs-replication.log wait-hdfs-replication
fi

# TODO: Investigate why all stats are not preserved. Theoretically, we only need to
# recompute stats for HBase.
run-step "Computing table stats" compute-table-stats.log \
    ${IMPALA_HOME}/testdata/bin/compute-table-stats.sh

run-step "Copying auth policy file" copy-auth-policy.log copy-auth-policy