mirror of
https://github.com/apache/impala.git
synced 2026-01-07 18:02:33 -05:00
This change moves (almost) all the functional data loading to the new data loading framework. This removes the need for the create.sql, load.sql, and load-raw-data.sql files. Instead we just have the single schema template file: testdata/datasets/functional/functional_schema_template.sql This template can be used to generate the schema for all file formats and compression variations. It also should help make loading data easier. Now you can run: bin/load-impala-data.sh "query-test" "exhaustive" And get all data needed for running the query tests. This change also includes the initial changes for new dataset/workload directory structure. The new structure looks like: testdata/workload <- Will contain query files and test vectors/dimensions testdata/datasets <- Will contain the data files and schema templates Note: This is the first part of the change to this directory structure - it's not yet complete. # Please enter the commit message for your changes. Lines starting
76 lines
2.4 KiB
Bash
Executable File
76 lines
2.4 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script creates schema and loads data into hive for running benchmarks and
# other tests. Using this script requires passing in two parameters:
# The first is the data set type (all, query-test, functional, tpch, or
# hive-benchmark). This will load the appropriate collection of data sets for the
# run type.
# The second is the exploration strategy. This determines the different combinations
# of file format, compression, etc. that will be created and loaded. 'core' defines
# a basic set of combinations. If 'pairwise' is specified the pairwise combinations
# of workload + file format + compression will be loaded. If 'exhaustive' is
# passed as an argument the exhaustive set of combinations will be loaded.
# TODO: Rewrite this script in python and detect and load workloads by enumerating
# the workloads directory.
|
|
|
|
# Validated command-line selections. data_set_type is filled in just below from
# $1; exploration_strategy is filled in by the check of $2.
exploration_strategy=
data_set_type=

# Translate the requested run type ($1) into the space-separated list of data
# sets to load. "${1:-}" keeps a missing argument from producing test-operator
# errors; it simply falls through to the usage message.
case "${1:-}" in
  hive-benchmark | functional | tpch)
    # These run types name a single data set directly.
    data_set_type=$1
    ;;
  query-test)
    data_set_type="tpch functional"
    ;;
  all)
    data_set_type="hive-benchmark tpch functional"
    ;;
  *)
    echo "Invalid run type: ${1:-}. Valid values are 'all, query-test,"\
         "functional, tpch, hive-benchmark'" >&2
    exit 1
    ;;
esac
|
|
|
|
# Validate the exploration strategy ($2). Only the three known strategies are
# accepted; a missing or unknown value prints the usage message and exits 1.
case "${2:-}" in
  core | pairwise | exhaustive)
    exploration_strategy=$2
    ;;
  *)
    echo "Invalid exploration strategy: ${2:-}. Valid values are 'core, pairwise, exhaustive'" >&2
    exit 1
    ;;
esac
|
|
|
|
# Resolve the absolute directory that contains this script and source the
# impala-config.sh that sits next to it. `cd ... && pwd` (rather than `;`)
# ensures a failed cd cannot silently yield the caller's working directory.
bin=$(dirname "$0")
bin=$(cd "$bin" && pwd) || {
  echo "Unable to resolve script directory: $bin" >&2
  exit 1
}
# NOTE(review): impala-config.sh is presumably what defines IMPALA_HOME (used
# below) - confirm against that file.
. "$bin"/impala-config.sh

# From this point on, abort on the first command that fails.
set -e

# Locations under the Impala source tree used by the data loading steps.
SCRIPT_DIR=$IMPALA_HOME/testdata/bin
WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
DATASET_DIR=$IMPALA_HOME/testdata/datasets
|
|
|
|
#######################################
# Runs the Hive CLI against a SQL file and aborts the script if Hive fails.
# Globals:   HIVE_HOME (read) - "$HIVE_HOME/bin/hive" is the binary invoked.
# Arguments: $1 - path of the SQL file handed to hive via -f.
# Outputs:   whatever hive prints; on failure "LOAD OF <file> FAILED" on stderr.
# Returns:   0 when hive succeeds; otherwise exits the shell with status 1.
#######################################
function execute_hive_query_from_file {
  local query_file=$1
  # Kept as an array so each option reaches hive as its own word without
  # relying on word-splitting of a string.
  local -a hive_args=(-hiveconf hive.root.logger=WARN,console -v -f)

  # hive runs inside the `if` condition on purpose: the caller has `set -e`
  # enabled, so a separate `$?` check after the command would never be reached
  # and the failure message would be lost.
  if ! "$HIVE_HOME/bin/hive" "${hive_args[@]}" "$query_file"; then
    echo "LOAD OF $query_file FAILED" >&2
    # Exit statuses must be 0-255; use 1 like the other error paths here.
    exit 1
  fi
}
|
|
|
|
# For every selected data set: generate the schema/load statements for the
# chosen exploration strategy, then run the generated SQL file through Hive.
# The generator is invoked by relative path, hence the pushd into its directory.
pushd "$IMPALA_HOME/testdata/bin"
# $data_set_type is intentionally unquoted: it holds a space-separated list of
# data set names and must be split into one loop iteration per name.
for ds in $data_set_type; do
  ./generate_schema_statements.py --exploration_strategy "${exploration_strategy}" \
      --workload="${ds}" --verbose
  execute_hive_query_from_file \
      "$SCRIPT_DIR/load-${ds}-${exploration_strategy}-generated.sql"
done
popd
|
|
|
|
# TODO: Temporarily disable block id generation for everything except benchmark runs
# due to IMP-134
# NOTE(review): the 'all' run type also loads hive-benchmark data but does not
# reach this branch - confirm that is intended.
if [[ "${1:-}" == "hive-benchmark" ]]; then
  "$IMPALA_HOME/testdata/bin/generate-block-ids.sh"
fi
|