Add support for auxiliary workloads, tests, and datasets

This change adds support for auxiliary workloads, tests, and datasets. This is useful
for augmenting the regular test runs with additional tests that do not belong in the
main Impala repo.
Lenni Kuff authored 2013-05-15 08:56:52 -07:00, committed by Henry Robinson
parent b1de018298
commit 2f7198292a
11 changed files with 42 additions and 120 deletions

.gitignore

@@ -8,7 +8,6 @@ cscope.out
org.eclipse.jdt.core.prefs
org.eclipse.jdt.ui.prefs
*benchmark_results.csv*
load-trevni-*-generated.sh
load-*-generated.sql
bin/version.info

@@ -83,3 +83,22 @@ generate_config core-site.xml.template core-site.xml
popd
echo "Completed config generation"
# Creates a symlink in TARGET_DIR to all subdirectories under SOURCE_DIR
function symlink_subdirs {
  SOURCE_DIR=$1
  TARGET_DIR=$2
  if [ -d "${SOURCE_DIR}" ]; then
    find ${SOURCE_DIR}/ -maxdepth 1 -mindepth 1 -type d -exec ln -f -s {} ${TARGET_DIR} \;
  else
    echo "No auxiliary tests found at: ${SOURCE_DIR}"
  fi
}
# The Impala test framework supports running additional tests outside of the main repo.
# This is an optional feature that can be enabled by setting the IMPALA_AUX_* environment
# variables to valid locations.
echo "Searching for auxiliary tests, workloads, and datasets (if any exist)."
symlink_subdirs ${IMPALA_AUX_WORKLOAD_DIR} ${IMPALA_WORKLOAD_DIR}
symlink_subdirs ${IMPALA_AUX_DATASET_DIR} ${IMPALA_DATASET_DIR}
symlink_subdirs ${IMPALA_AUX_TEST_HOME}/tests ${IMPALA_HOME}/tests
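
To illustrate the helper above, a minimal sketch with made-up paths, assuming the function has been pasted into the current shell: every immediate subdirectory of the source directory becomes a symlink inside the target directory, and a missing source directory is only reported, not treated as an error.
# Hypothetical demonstration of symlink_subdirs (all paths are made up):
mkdir -p /tmp/aux_workloads/my-aux-workload /tmp/main_workloads
symlink_subdirs /tmp/aux_workloads /tmp/main_workloads
ls -l /tmp/main_workloads            # my-aux-workload -> /tmp/aux_workloads/my-aux-workload
symlink_subdirs /tmp/does-not-exist /tmp/main_workloads
# prints: No auxiliary tests found at: /tmp/does-not-exist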

@@ -54,6 +54,7 @@ fi
export HADOOP_LZO=${HADOOP_LZO-~/hadoop-lzo}
export IMPALA_LZO=${IMPALA_LZO-~/Impala-lzo}
export IMPALA_AUX_TEST_HOME=${IMPALA_AUX_TEST_HOME-~/impala-auxiliary-tests}
export IMPALA_GFLAGS_VERSION=2.0
export IMPALA_GPERFTOOLS_VERSION=2.0
@@ -72,7 +73,9 @@ export IMPALA_AVRO_VERSION=1.7.1-cdh4.2.0
export IMPALA_FE_DIR=$IMPALA_HOME/fe
export IMPALA_BE_DIR=$IMPALA_HOME/be
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
export IMPALA_AUX_WORKLOAD_DIR=$IMPALA_AUX_TEST_HOME/testdata/workloads
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
export IMPALA_AUX_DATASET_DIR=$IMPALA_AUX_TEST_HOME/testdata/datasets
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
export PATH=$IMPALA_HOME/bin:$PATH
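
Because the new export uses the ${VAR-default} form, a value set before the config script is sourced takes precedence. A minimal sketch, assuming the hunk above is bin/impala-config.sh and using a hypothetical checkout path:
# Hypothetical override of the auxiliary test location before sourcing the config.
export IMPALA_AUX_TEST_HOME=/data/impala-auxiliary-tests
. "${IMPALA_HOME}/bin/impala-config.sh"
echo "${IMPALA_AUX_WORKLOAD_DIR}"   # /data/impala-auxiliary-tests/testdata/workloads
echo "${IMPALA_AUX_DATASET_DIR}"    # /data/impala-auxiliary-tests/testdata/datasets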

@@ -42,10 +42,16 @@ parser.add_option("--table_formats", dest="table_formats", default=None,
"formats. Ex. --table_formats=seq/snap/block,text/none")
parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500",
help="HDFS name node for Avro schema URLs, default localhost:20500")
(options, args) = parser.parse_args()
parser.add_option("--workload_dir", dest="workload_dir",
default=os.environ['IMPALA_WORKLOAD_DIR'],
help="Directory that contains Impala workloads")
parser.add_option("--dataset_dir", dest="dataset_dir",
default=os.environ['IMPALA_DATASET_DIR'],
help="Directory that contains Impala datasets")
options, args = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
WORKLOAD_DIR = options.workload_dir
DATASET_DIR = options.dataset_dir
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
AVRO_SCHEMA_DIR = "avro_schemas"
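
With the new flags, the generation script can be pointed at an auxiliary checkout instead of relying on the environment defaults. A hedged sketch of such an invocation; the script name is an assumption from context, and only flags visible in this hunk are used:
# Hypothetical invocation against an auxiliary checkout (script name assumed);
# without the two directory flags the defaults come from IMPALA_WORKLOAD_DIR
# and IMPALA_DATASET_DIR.
./generate-schema-statements.py \
    --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
    --dataset_dir=${IMPALA_AUX_DATASET_DIR} \
    --table_formats=text/none \
    --hdfs_namenode=localhost:20500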

@@ -356,15 +356,6 @@
<value>${use_external_impalad}</value>
</property>
</systemProperties>
<excludes>
<exclude>**/DdlQueryTest.java</exclude>
<exclude>**/HBaseQueryTest.java</exclude>
<exclude>**/InsertQueryTest.java</exclude>
<exclude>**/JoinQueryTest.java</exclude>
<exclude>**/MiscQueryTest.java</exclude>
<exclude>**/QueryTest.java</exclude>
<exclude>**/TpchQueryTest.java</exclude>
</excludes>
</configuration>
</plugin>

@@ -486,15 +486,6 @@ public abstract class BaseQueryTest {
   */
  protected void runTestInExecutionMode(TestExecMode executionMode, String testFile,
      boolean abortOnError, int maxErrors) {
    // TPCH currently takes a long time to run so just run it with one setting.
    if (testFile.trim().startsWith("tpch")) {
      List<TestConfiguration> testConfigs = generateAllConfigurationPermutations(
          TEXT_FORMAT_ONLY, UNCOMPRESSED_ONLY,
          ImmutableList.of(1024), ImmutableList.of(2), ImmutableList.of(false));
      runQueryWithTestConfigs(testConfigs, testFile, abortOnError, maxErrors);
      return;
    }
    switch (executionMode) {
      case REDUCED:
        // TODO: Consider running with the fastest format to cut down on execution time

@@ -34,11 +34,19 @@ set -u
# Load the data set
pushd ${IMPALA_HOME}/bin
./start-impala-cluster.py -s 1 --wait_for_cluster
./start-impala-cluster.py -s 3 --wait_for_cluster
# Use unbuffered logging by executing these data loading steps with 'python -u'
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive
python -u ./load-data.py --workloads tpcds --exploration_strategy core
python -u ./load-data.py --workloads tpch --exploration_strategy core
# Load all the auxiliary workloads (if any exist)
if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
  python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
      --dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core
else
  echo "Skipping load of auxiliary workloads because directories do not exist"
fi
./start-impala-cluster.py --kill_only
popd

tests/.gitignore

@@ -1 +1,2 @@
results
aux_*

@@ -61,65 +61,3 @@ CREATE TABLE RunInfo (
run_info char(255),
PRIMARY KEY (run_info_id)
);
-- Populate valid file formats
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('text', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('parquet', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'snap', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'gzip', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'def', 'block');
-- Populate known workloads
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '');
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '1000gb');
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '10000gb');
-- Populate known queries
insert into Query (name, query) values ('TPCH-Q3', '');
insert into Query (name, query) values ('TPCH-Q17_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q17_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q19', '');
insert into Query (name, query) values ('TPCH-Q12', '');
insert into Query (name, query) values ('TPCH-Q11_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q11_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q22_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q22_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q13', '');
insert into Query (name, query) values ('TPCH-Q21', '');
insert into Query (name, query) values ('TPCH-Q9', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_4', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_5', '');
insert into Query (name, query) values ('TPCH-Q4', '');
insert into Query (name, query) values ('TPCH-Q8', '');
insert into Query (name, query) values ('TPCH-Q1', '');
insert into Query (name, query) values ('TPCH-Q18_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q18_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q14', '');
insert into Query (name, query) values ('TPCH-Q10', '');
insert into Query (name, query) values ('TPCH-Q5', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q6', '');

@@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# Targeted Impala insert tests
#
import logging
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
class TestParquetInsertBase(ImpalaTestSuite):
  @classmethod
  def add_test_dimensions(cls):
    super(TestParquetInsertBase, cls).add_test_dimensions()
    # This test only inserts into the parquet format.
    cls.TestMatrix.add_dimension(TestDimension('table_format', \
        *[TableFormatInfo.create_from_string(cls.get_workload(), 'parquet/none')]))
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('exec_option')['batch_size'] == 0)

class TestParquetInsertTpch(TestParquetInsertBase):
  @classmethod
  def get_workload(self):
    return 'tpch'

  def test_insert(self, vector):
    self.run_test_case('insert_parquet', vector)

class TestParquetInsertTpcds(TestParquetInsertBase):
  @classmethod
  def get_workload(self):
    return 'tpcds'

  def test_insert(self, vector):
    self.run_test_case('insert_parquet', vector)

@@ -22,7 +22,7 @@ import pytest
import sys
# We whitelist valid test directories. If a new test directory is added, update this.
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests']
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests', 'aux_query_tests']
TEST_DIR = os.path.join(os.environ['IMPALA_HOME'], 'tests')
TEST_RESULT_DIR = os.path.join(TEST_DIR, 'results')
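
Putting the pieces together, a hypothetical layout for an auxiliary suite: the directory name has to match both the aux_* gitignore entry and the aux_query_tests whitelist entry above, so the symlinked suite is ignored by git but accepted by the test runner.
# Hypothetical auxiliary suite layout (the test module name is made up):
mkdir -p ${IMPALA_AUX_TEST_HOME}/tests/aux_query_tests
touch ${IMPALA_AUX_TEST_HOME}/tests/aux_query_tests/test_example.py
# After the config-generation step above runs symlink_subdirs, the suite shows
# up as ${IMPALA_HOME}/tests/aux_query_tests and passes the whitelist check.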