mirror of
https://github.com/apache/impala.git
synced 2026-02-03 09:00:39 -05:00
When loading test data for TPC-H/TPC-DS, we first run a preload script to generate the data locally and then upload it to HDFS for use by Hive. The preload script currently always regenerates the data, which is time-consuming at large scale factors. This patch modifies the preload scripts to check whether the last run succeeded and, if it did, to reuse the existing data. Otherwise, the scripts generate the data and leave a success marker in the data directory. Tests: - Verified the scripts locally. Change-Id: Ied40e599cda009ae0ad88ad13385e7bb86428bb4 Reviewed-on: http://gerrit.cloudera.org:8080/18233 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
73 lines
2.0 KiB
Bash
Executable File
73 lines
2.0 KiB
Bash
Executable File
#!/bin/bash
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# Strict mode: abort on any failing command, on expansion of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Source the shared build-error helper and call the setup function it is
# expected to define (NOTE(review): its exact behavior lives in
# report_build_error.sh, not in this script).
# The path is quoted so an IMPALA_HOME containing whitespace or glob
# characters is still sourced as a single path.
. "${IMPALA_HOME}/bin/report_build_error.sh"
setup_report_build_error
|
|
|
|
# Local test-data root; TPC-H data is generated into a "tpch" directory
# underneath it.
IMPALA_DATA="${IMPALA_HOME}/testdata/impala-data"
TPC_H_DATA="${IMPALA_DATA}/tpch"

# Optional single argument: the TPC-H scale factor.
#   - no argument, or a value of 0 or 1: scale factor 1 in ".../tpch"
#   - an integer N > 1: scale factor N in ".../tpchN"
# The argument is validated as an unsigned integer before it is compared.
# Feeding an arbitrary string to an arithmetic comparison would evaluate it as
# an expression (variable names, subscripts, ...) and silently fall back to
# scale factor 1 on a syntax error.
SCALE_FACTOR=1
if [[ $# -eq 1 ]]; then
  if [[ ! $1 =~ ^[0-9]+$ ]]; then
    echo "Invalid TPC-H scale factor (expected a non-negative integer): $1" >&2
    exit 1
  fi
  # Force base 10 so a leading zero is not interpreted as octal.
  if (( 10#$1 > 1 )); then
    SCALE_FACTOR=$1
    TPC_H_DATA="${TPC_H_DATA}${SCALE_FACTOR}"
  fi
fi
|
|
|
|
# Skip generation when a previous run left its SUCCESS marker in the data
# directory (the marker is created by the last command of this script).
# The path is quoted: unquoted, a directory name containing whitespace makes
# the test itself error out, so the marker would never be detected.
if [[ -f "${TPC_H_DATA}/SUCCESS" ]]; then
  echo "Reuse existing TPC-H data in ${TPC_H_DATA}"
  exit 0
fi
|
|
|
|
# Path of the dbgen data generator inside the toolchain's TPC-H package.
TPC_H_HOME="${IMPALA_TOOLCHAIN_PACKAGES_HOME}/tpc-h-${IMPALA_TPC_H_VERSION}"
TPC_H_DBGEN="${TPC_H_HOME}/bin/dbgen"

# Stop before touching any existing data if the generator is missing or not
# executable. The path is quoted so that whitespace in it cannot turn the test
# into a malformed expression that would let the script continue; the
# diagnostic goes to stderr.
if [[ ! -x "${TPC_H_DBGEN}" ]]; then
  echo "Could not find TPC-H data generator executable: ${TPC_H_DBGEN}" >&2
  exit 1
fi
|
|
|
|
echo "Generating TPC-H data into ${TPC_H_DATA}"
# Delete any preexisting data or symlinks.
# Need to change permissions to work around an old TPC-H data tarball that had a
# non-writable top-level directory when extracted. This is best effort only, and
# is skipped when nothing exists at the path so a first run stays quiet.
if [[ -e "${TPC_H_DATA}" ]]; then
  chmod +w -- "${TPC_H_DATA}" || true
fi
# Quote the path and refuse to run with an empty value, so word splitting or a
# missing variable can never redirect the recursive delete.
rm -rf -- "${TPC_H_DATA:?}"
mkdir -p -- "${TPC_H_DATA}"
# The remaining steps operate on relative paths inside the data directory.
cd -- "${TPC_H_DATA}"
|
|
|
|
# Build the dbgen command line as an array so every argument stays a single
# word even if the generator path contains whitespace. The array always holds
# at least the executable, so expanding it is safe under `set -u` on every
# bash version.
DBGEN_CMD=("${TPC_H_DBGEN}")
if [[ -t 1 ]]; then
  # Output is terminal, show progress verbosely
  DBGEN_CMD+=(-v)
fi
# -s passes the requested scale factor; -f is kept from the original
# invocation (presumably "force overwrite" -- confirm against dbgen's usage).
DBGEN_CMD+=(-f -s "${SCALE_FACTOR}")

"${DBGEN_CMD[@]}"
|
|
|
|
# Impala expects each table to be in its own subdirectory: move every
# generated <table>.tbl in the current directory into <table>/<table>.tbl.
MOVED_TABLES=0
for FILE in *.tbl; do
  # When nothing matches, the glob is left as the literal pattern; skip it
  # instead of creating a directory named "*".
  [[ -f "${FILE}" ]] || continue
  FILE_DIR="${FILE%.tbl}"
  mkdir -p -- "${FILE_DIR}"
  mv -- "${FILE}" "${FILE_DIR}"
  MOVED_TABLES=$((MOVED_TABLES + 1))
done

# Never mark an empty data directory as complete.
if [[ "${MOVED_TABLES}" -eq 0 ]]; then
  echo "No TPC-H .tbl files were found in ${PWD}" >&2
  exit 1
fi

# Success marker: its presence lets a later run reuse this data instead of
# regenerating it.
touch SUCCESS
|