mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Add support for auxiliary workloads, tests, and datasets
This change adds support for auxiliary workloads, tests, and datasets. This is useful to augment the regular test runs with some additional tests that do not belong in the main Impala repo.
This commit is contained in:
committed by
Henry Robinson
parent
b1de018298
commit
2f7198292a
1
.gitignore
vendored
1
.gitignore
vendored
@@ -8,7 +8,6 @@ cscope.out
|
||||
org.eclipse.jdt.core.prefs
|
||||
org.eclipse.jdt.ui.prefs
|
||||
*benchmark_results.csv*
|
||||
load-trevni-*-generated.sh
|
||||
load-*-generated.sql
|
||||
bin/version.info
|
||||
|
||||
|
||||
@@ -83,3 +83,22 @@ generate_config core-site.xml.template core-site.xml
|
||||
popd
|
||||
|
||||
echo "Completed config generation"
|
||||
|
||||
# Creates a symlink in TARGET_DIR to all subdirectories under SOURCE_DIR
|
||||
function symlink_subdirs {
  # Arguments:
  #   $1 - SOURCE_DIR: directory whose immediate subdirectories are linked.
  #   $2 - TARGET_DIR: directory in which the symlinks are created.
  # If SOURCE_DIR is not an existing directory, a message is printed and nothing
  # is linked.
  #
  # 'local' keeps these names from leaking into (or clobbering) variables of the
  # enclosing script.
  local SOURCE_DIR=$1
  local TARGET_DIR=$2
  if [ -d "${SOURCE_DIR}" ]; then
    # Quote every expansion so that paths containing whitespace or glob
    # characters are passed to find/ln as a single argument.
    # -mindepth 1/-maxdepth 1 restricts the match to direct children, and
    # 'ln -f' replaces a link of the same name left over from an earlier run.
    find "${SOURCE_DIR}/" -maxdepth 1 -mindepth 1 -type d \
        -exec ln -f -s {} "${TARGET_DIR}" \;
  else
    echo "No auxiliary tests found at: ${SOURCE_DIR}"
  fi
}
|
||||
|
||||
# The Impala test framework supports running additional tests outside of the main repo.
|
||||
# This is an optional feature that can be enabled by setting the IMPALA_AUX_* environment
|
||||
# variables to valid locations.
|
||||
echo "Searching for auxiliary tests, workloads, and datasets (if any exist)."
|
||||
symlink_subdirs ${IMPALA_AUX_WORKLOAD_DIR} ${IMPALA_WORKLOAD_DIR}
|
||||
symlink_subdirs ${IMPALA_AUX_DATASET_DIR} ${IMPALA_DATASET_DIR}
|
||||
symlink_subdirs ${IMPALA_AUX_TEST_HOME}/tests ${IMPALA_HOME}/tests
|
||||
|
||||
@@ -54,6 +54,7 @@ fi
|
||||
|
||||
export HADOOP_LZO=${HADOOP_LZO-~/hadoop-lzo}
|
||||
export IMPALA_LZO=${IMPALA_LZO-~/Impala-lzo}
|
||||
export IMPALA_AUX_TEST_HOME=${IMPALA_AUX_TEST_HOME-~/impala-auxiliary-tests}
|
||||
|
||||
export IMPALA_GFLAGS_VERSION=2.0
|
||||
export IMPALA_GPERFTOOLS_VERSION=2.0
|
||||
@@ -72,7 +73,9 @@ export IMPALA_AVRO_VERSION=1.7.1-cdh4.2.0
|
||||
export IMPALA_FE_DIR=$IMPALA_HOME/fe
|
||||
export IMPALA_BE_DIR=$IMPALA_HOME/be
|
||||
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
|
||||
export IMPALA_AUX_WORKLOAD_DIR=$IMPALA_AUX_TEST_HOME/testdata/workloads
|
||||
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
|
||||
export IMPALA_AUX_DATASET_DIR=$IMPALA_AUX_TEST_HOME/testdata/datasets
|
||||
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
|
||||
export PATH=$IMPALA_HOME/bin:$PATH
|
||||
|
||||
|
||||
@@ -42,10 +42,16 @@ parser.add_option("--table_formats", dest="table_formats", default=None,
|
||||
"formats. Ex. --table_formats=seq/snap/block,text/none")
|
||||
parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500",
|
||||
help="HDFS name node for Avro schema URLs, default localhost:20500")
|
||||
(options, args) = parser.parse_args()
|
||||
parser.add_option("--workload_dir", dest="workload_dir",
|
||||
default=os.environ['IMPALA_WORKLOAD_DIR'],
|
||||
help="Directory that contains Impala workloads")
|
||||
parser.add_option("--dataset_dir", dest="dataset_dir",
|
||||
default=os.environ['IMPALA_DATASET_DIR'],
|
||||
help="Directory that contains Impala datasets")
|
||||
options, args = parser.parse_args()
|
||||
|
||||
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
|
||||
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
|
||||
WORKLOAD_DIR = options.workload_dir
|
||||
DATASET_DIR = options.dataset_dir
|
||||
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
|
||||
AVRO_SCHEMA_DIR = "avro_schemas"
|
||||
|
||||
|
||||
@@ -356,15 +356,6 @@
|
||||
<value>${use_external_impalad}</value>
|
||||
</property>
|
||||
</systemProperties>
|
||||
<excludes>
|
||||
<exclude>**/DdlQueryTest.java</exclude>
|
||||
<exclude>**/HBaseQueryTest.java</exclude>
|
||||
<exclude>**/InsertQueryTest.java</exclude>
|
||||
<exclude>**/JoinQueryTest.java</exclude>
|
||||
<exclude>**/MiscQueryTest.java</exclude>
|
||||
<exclude>**/QueryTest.java</exclude>
|
||||
<exclude>**/TpchQueryTest.java</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
|
||||
@@ -486,15 +486,6 @@ public abstract class BaseQueryTest {
|
||||
*/
|
||||
protected void runTestInExecutionMode(TestExecMode executionMode, String testFile,
|
||||
boolean abortOnError, int maxErrors) {
|
||||
// TPCH currently takes a long time to run so just run it with one setting.
|
||||
if (testFile.trim().startsWith("tpch")) {
|
||||
List<TestConfiguration> testConfigs = generateAllConfigurationPermutations(
|
||||
TEXT_FORMAT_ONLY, UNCOMPRESSED_ONLY,
|
||||
ImmutableList.of(1024), ImmutableList.of(2), ImmutableList.of(false));
|
||||
runQueryWithTestConfigs(testConfigs, testFile, abortOnError, maxErrors);
|
||||
return;
|
||||
}
|
||||
|
||||
switch (executionMode) {
|
||||
case REDUCED:
|
||||
// TODO: Consider running with the fastest format to cut down on execution time
|
||||
|
||||
10
testdata/bin/create-load-data.sh
vendored
10
testdata/bin/create-load-data.sh
vendored
@@ -34,11 +34,19 @@ set -u
|
||||
|
||||
# Load the data set
|
||||
pushd ${IMPALA_HOME}/bin
|
||||
./start-impala-cluster.py -s 1 --wait_for_cluster
|
||||
./start-impala-cluster.py -s 3 --wait_for_cluster
|
||||
# Use unbuffered logging by executing these data loading steps with 'python -u'
|
||||
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive
|
||||
python -u ./load-data.py --workloads tpcds --exploration_strategy core
|
||||
python -u ./load-data.py --workloads tpch --exploration_strategy core
|
||||
# Load all the auxiliary workloads (if any exist)
|
||||
if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
|
||||
python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\
|
||||
--dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core
|
||||
else
|
||||
echo "Skipping load of auxilary workloads because directories do not exist"
|
||||
fi
|
||||
|
||||
./start-impala-cluster.py --kill_only
|
||||
popd
|
||||
|
||||
|
||||
1
tests/.gitignore
vendored
1
tests/.gitignore
vendored
@@ -1 +1,2 @@
|
||||
results
|
||||
aux_*
|
||||
|
||||
@@ -61,65 +61,3 @@ CREATE TABLE RunInfo (
|
||||
run_info char(255),
|
||||
PRIMARY KEY (run_info_id)
|
||||
);
|
||||
|
||||
-- Populate valid file formats
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('text', 'none', 'none');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'none', 'none');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'none', 'none');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('parquet', 'none', 'none');
|
||||
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'block');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'block');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'block');
|
||||
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'record');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'record');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'record');
|
||||
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'snap', 'block');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'gzip', 'block');
|
||||
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'def', 'block');
|
||||
|
||||
|
||||
-- Populate known workloads
|
||||
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '');
|
||||
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '1000gb');
|
||||
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '10000gb');
|
||||
|
||||
|
||||
-- Populate known queries
|
||||
insert into Query (name, query) values ('TPCH-Q3', '');
|
||||
insert into Query (name, query) values ('TPCH-Q17_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q17_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q19', '');
|
||||
insert into Query (name, query) values ('TPCH-Q12', '');
|
||||
insert into Query (name, query) values ('TPCH-Q11_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q11_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q22_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q22_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q16_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q16_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q16_QUERY_3', '');
|
||||
insert into Query (name, query) values ('TPCH-Q13', '');
|
||||
insert into Query (name, query) values ('TPCH-Q21', '');
|
||||
insert into Query (name, query) values ('TPCH-Q9', '');
|
||||
insert into Query (name, query) values ('TPCH-Q2_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q2_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q2_QUERY_3', '');
|
||||
insert into Query (name, query) values ('TPCH-Q20_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q20_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q20_QUERY_3', '');
|
||||
insert into Query (name, query) values ('TPCH-Q20_QUERY_4', '');
|
||||
insert into Query (name, query) values ('TPCH-Q20_QUERY_5', '');
|
||||
insert into Query (name, query) values ('TPCH-Q4', '');
|
||||
insert into Query (name, query) values ('TPCH-Q8', '');
|
||||
insert into Query (name, query) values ('TPCH-Q1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q18_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q18_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q14', '');
|
||||
insert into Query (name, query) values ('TPCH-Q10', '');
|
||||
insert into Query (name, query) values ('TPCH-Q5', '');
|
||||
insert into Query (name, query) values ('TPCH-Q15_QUERY_1', '');
|
||||
insert into Query (name, query) values ('TPCH-Q15_QUERY_2', '');
|
||||
insert into Query (name, query) values ('TPCH-Q15_QUERY_3', '');
|
||||
insert into Query (name, query) values ('TPCH-Q6', '');
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
# Targeted Impala insert tests
|
||||
#
|
||||
import logging
|
||||
import pytest
|
||||
from tests.common.test_vector import *
|
||||
from tests.common.impala_test_suite import *
|
||||
|
||||
class TestParquetInsertBase(ImpalaTestSuite):
  """Common test-matrix setup for insert tests that write only parquet tables."""

  @classmethod
  def add_test_dimensions(cls):
    """Limits the matrix to the 'parquet/none' table format and batch_size == 0."""
    super(TestParquetInsertBase, cls).add_test_dimensions()

    # This test only inserts into the parquet format, so 'table_format' gets a
    # single value built for the subclass's workload.
    parquet_format = TableFormatInfo.create_from_string(
        cls.get_workload(), 'parquet/none')
    cls.TestMatrix.add_dimension(TestDimension('table_format', parquet_format))

    def has_zero_batch_size(test_vector):
      # Keep only vectors whose exec_option batch_size is 0.
      return test_vector.get_value('exec_option')['batch_size'] == 0

    cls.TestMatrix.add_constraint(has_zero_batch_size)
|
||||
|
||||
class TestParquetInsertTpch(TestParquetInsertBase):
  """Runs the 'insert_parquet' test case against the 'tpch' workload."""

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite runs against."""
    workload_name = 'tpch'
    return workload_name

  def test_insert(self, vector):
    """Executes the 'insert_parquet' test case with the given test vector."""
    test_case_name = 'insert_parquet'
    self.run_test_case(test_case_name, vector)
|
||||
|
||||
class TestParquetInsertTpcds(TestParquetInsertBase):
  """Runs the 'insert_parquet' test case against the 'tpcds' workload."""

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite runs against."""
    workload_name = 'tpcds'
    return workload_name

  def test_insert(self, vector):
    """Executes the 'insert_parquet' test case with the given test vector."""
    test_case_name = 'insert_parquet'
    self.run_test_case(test_case_name, vector)
|
||||
@@ -22,7 +22,7 @@ import pytest
|
||||
import sys
|
||||
|
||||
# We whitelist valid test directories. If a new test directory is added, update this.
|
||||
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests']
|
||||
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests', 'aux_query_tests']
|
||||
|
||||
TEST_DIR = os.path.join(os.environ['IMPALA_HOME'], 'tests')
|
||||
TEST_RESULT_DIR = os.path.join(TEST_DIR, 'results')
|
||||
|
||||
Reference in New Issue
Block a user