Add support for auxiliary workloads, tests, and datasets

This change adds support for auxiliary workloads, tests, and datasets. This is useful
for augmenting the regular test runs with additional tests that do not belong in the
main Impala repo.
Lenni Kuff authored 2013-05-15 08:56:52 -07:00, committed by Henry Robinson
parent b1de018298
commit 2f7198292a
11 changed files with 42 additions and 120 deletions

.gitignore

@@ -8,7 +8,6 @@ cscope.out
org.eclipse.jdt.core.prefs
org.eclipse.jdt.ui.prefs
*benchmark_results.csv*
load-trevni-*-generated.sh
load-*-generated.sql
bin/version.info

@@ -83,3 +83,22 @@ generate_config core-site.xml.template core-site.xml
popd
echo "Completed config generation"
# Creates a symlink in TARGET_DIR to all subdirectories under SOURCE_DIR
function symlink_subdirs {
  SOURCE_DIR=$1
  TARGET_DIR=$2
  if [ -d "${SOURCE_DIR}" ]; then
    find ${SOURCE_DIR}/ -maxdepth 1 -mindepth 1 -type d -exec ln -f -s {} ${TARGET_DIR} \;
  else
    echo "No auxiliary tests found at: ${SOURCE_DIR}"
  fi
}
# The Impala test framework supports running additional tests outside of the main repo.
# This is an optional feature that can be enabled by setting the IMPALA_AUX_* environment
# variables to valid locations.
echo "Searching for auxiliary tests, workloads, and datasets (if any exist)."
symlink_subdirs ${IMPALA_AUX_WORKLOAD_DIR} ${IMPALA_WORKLOAD_DIR}
symlink_subdirs ${IMPALA_AUX_DATASET_DIR} ${IMPALA_DATASET_DIR}
symlink_subdirs ${IMPALA_AUX_TEST_HOME}/tests ${IMPALA_HOME}/tests
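
To illustrate the helper above, a minimal sketch with made-up paths, assuming the function has been pasted into the current shell: every immediate subdirectory of the source directory becomes a symlink inside the target directory, and a missing source directory is only reported, not treated as an error.
# Hypothetical demonstration of symlink_subdirs (all paths are made up):
mkdir -p /tmp/aux_workloads/my-aux-workload /tmp/main_workloads
symlink_subdirs /tmp/aux_workloads /tmp/main_workloads
ls -l /tmp/main_workloads            # my-aux-workload -> /tmp/aux_workloads/my-aux-workload
symlink_subdirs /tmp/does-not-exist /tmp/main_workloads
# prints: No auxiliary tests found at: /tmp/does-not-exist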

@@ -54,6 +54,7 @@ fi
export HADOOP_LZO=${HADOOP_LZO-~/hadoop-lzo}
export IMPALA_LZO=${IMPALA_LZO-~/Impala-lzo}
export IMPALA_AUX_TEST_HOME=${IMPALA_AUX_TEST_HOME-~/impala-auxiliary-tests}
export IMPALA_GFLAGS_VERSION=2.0
export IMPALA_GPERFTOOLS_VERSION=2.0
@@ -72,7 +73,9 @@ export IMPALA_AVRO_VERSION=1.7.1-cdh4.2.0
export IMPALA_FE_DIR=$IMPALA_HOME/fe
export IMPALA_BE_DIR=$IMPALA_HOME/be
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
export IMPALA_AUX_WORKLOAD_DIR=$IMPALA_AUX_TEST_HOME/testdata/workloads
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
export IMPALA_AUX_DATASET_DIR=$IMPALA_AUX_TEST_HOME/testdata/datasets
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
export PATH=$IMPALA_HOME/bin:$PATH
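
Because the new export uses the ${VAR-default} form, a value set before the config script is sourced takes precedence. A minimal sketch, assuming the hunk above is bin/impala-config.sh and using a hypothetical checkout path:
# Hypothetical override of the auxiliary test location before sourcing the config.
export IMPALA_AUX_TEST_HOME=/data/impala-auxiliary-tests
. "${IMPALA_HOME}/bin/impala-config.sh"
echo "${IMPALA_AUX_WORKLOAD_DIR}"   # /data/impala-auxiliary-tests/testdata/workloads
echo "${IMPALA_AUX_DATASET_DIR}"    # /data/impala-auxiliary-tests/testdata/datasets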

@@ -42,10 +42,16 @@ parser.add_option("--table_formats", dest="table_formats", default=None,
"formats. Ex. --table_formats=seq/snap/block,text/none")
parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500",
help="HDFS name node for Avro schema URLs, default localhost:20500")
(options, args) = parser.parse_args()
parser.add_option("--workload_dir", dest="workload_dir",
default=os.environ['IMPALA_WORKLOAD_DIR'],
help="Directory that contains Impala workloads")
parser.add_option("--dataset_dir", dest="dataset_dir",
default=os.environ['IMPALA_DATASET_DIR'],
help="Directory that contains Impala datasets")
options, args = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
WORKLOAD_DIR = options.workload_dir
DATASET_DIR = options.dataset_dir
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
AVRO_SCHEMA_DIR = "avro_schemas"
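
With the new flags, the generation script can be pointed at an auxiliary checkout instead of relying on the environment defaults. A hedged sketch of such an invocation; the script name is an assumption from context, and only flags visible in this hunk are used:
# Hypothetical invocation against an auxiliary checkout (script name assumed);
# without the two directory flags the defaults come from IMPALA_WORKLOAD_DIR
# and IMPALA_DATASET_DIR.
./generate-schema-statements.py \
    --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
    --dataset_dir=${IMPALA_AUX_DATASET_DIR} \
    --table_formats=text/none \
    --hdfs_namenode=localhost:20500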

@@ -356,15 +356,6 @@
<value>${use_external_impalad}</value>
</property>
</systemProperties>
<excludes>
<exclude>**/DdlQueryTest.java</exclude>
<exclude>**/HBaseQueryTest.java</exclude>
<exclude>**/InsertQueryTest.java</exclude>
<exclude>**/JoinQueryTest.java</exclude>
<exclude>**/MiscQueryTest.java</exclude>
<exclude>**/QueryTest.java</exclude>
<exclude>**/TpchQueryTest.java</exclude>
</excludes>
</configuration>
</plugin>

@@ -486,15 +486,6 @@ public abstract class BaseQueryTest {
   */
  protected void runTestInExecutionMode(TestExecMode executionMode, String testFile,
      boolean abortOnError, int maxErrors) {
    // TPCH currently takes a long time to run so just run it with one setting.
    if (testFile.trim().startsWith("tpch")) {
      List<TestConfiguration> testConfigs = generateAllConfigurationPermutations(
          TEXT_FORMAT_ONLY, UNCOMPRESSED_ONLY,
          ImmutableList.of(1024), ImmutableList.of(2), ImmutableList.of(false));
      runQueryWithTestConfigs(testConfigs, testFile, abortOnError, maxErrors);
      return;
    }
    switch (executionMode) {
      case REDUCED:
        // TODO: Consider running with the fastest format to cut down on execution time

@@ -34,11 +34,19 @@ set -u
# Load the data set
pushd ${IMPALA_HOME}/bin
./start-impala-cluster.py -s 1 --wait_for_cluster
./start-impala-cluster.py -s 3 --wait_for_cluster
# Use unbuffered logging by executing these data loading steps with 'python -u'
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive
python -u ./load-data.py --workloads tpcds --exploration_strategy core
python -u ./load-data.py --workloads tpch --exploration_strategy core
# Load all the auxiliary workloads (if any exist)
if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
  python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
      --dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core
else
  echo "Skipping load of auxiliary workloads because directories do not exist"
fi
./start-impala-cluster.py --kill_only
popd

tests/.gitignore

@@ -1 +1,2 @@
results
aux_*

@@ -61,65 +61,3 @@ CREATE TABLE RunInfo (
run_info char(255),
PRIMARY KEY (run_info_id)
);
-- Populate valid file formats
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('text', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('parquet', 'none', 'none');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'snap', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'gzip', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('seq', 'def', 'record');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'snap', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'gzip', 'block');
INSERT INTO FileType (format, compression_codec, compression_type) VALUES('rc', 'def', 'block');
-- Populate known workloads
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '');
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '1000gb');
INSERT INTO Workload (name, scale_factor) VALUES('tpch', '10000gb');
-- Populate known queries
insert into Query (name, query) values ('TPCH-Q3', '');
insert into Query (name, query) values ('TPCH-Q17_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q17_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q19', '');
insert into Query (name, query) values ('TPCH-Q12', '');
insert into Query (name, query) values ('TPCH-Q11_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q11_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q22_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q22_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q16_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q13', '');
insert into Query (name, query) values ('TPCH-Q21', '');
insert into Query (name, query) values ('TPCH-Q9', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q2_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_4', '');
insert into Query (name, query) values ('TPCH-Q20_QUERY_5', '');
insert into Query (name, query) values ('TPCH-Q4', '');
insert into Query (name, query) values ('TPCH-Q8', '');
insert into Query (name, query) values ('TPCH-Q1', '');
insert into Query (name, query) values ('TPCH-Q18_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q18_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q14', '');
insert into Query (name, query) values ('TPCH-Q10', '');
insert into Query (name, query) values ('TPCH-Q5', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_1', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_2', '');
insert into Query (name, query) values ('TPCH-Q15_QUERY_3', '');
insert into Query (name, query) values ('TPCH-Q6', '');

@@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# Targeted Impala insert tests
#
import logging
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
class TestParquetInsertBase(ImpalaTestSuite):
  @classmethod
  def add_test_dimensions(cls):
    super(TestParquetInsertBase, cls).add_test_dimensions()
    # This test only inserts into the parquet format.
    cls.TestMatrix.add_dimension(TestDimension('table_format', \
        *[TableFormatInfo.create_from_string(cls.get_workload(), 'parquet/none')]))
    cls.TestMatrix.add_constraint(lambda v:\
        v.get_value('exec_option')['batch_size'] == 0)

class TestParquetInsertTpch(TestParquetInsertBase):
  @classmethod
  def get_workload(self):
    return 'tpch'

  def test_insert(self, vector):
    self.run_test_case('insert_parquet', vector)

class TestParquetInsertTpcds(TestParquetInsertBase):
  @classmethod
  def get_workload(self):
    return 'tpcds'

  def test_insert(self, vector):
    self.run_test_case('insert_parquet', vector)

@@ -22,7 +22,7 @@ import pytest
import sys
# We whitelist valid test directories. If a new test directory is added, update this.
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests']
VALID_TEST_DIRS = ['failure', 'query_test', 'stress', 'unittests', 'aux_query_tests']
TEST_DIR = os.path.join(os.environ['IMPALA_HOME'], 'tests')
TEST_RESULT_DIR = os.path.join(TEST_DIR, 'results')
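
Putting the pieces together, a hypothetical layout for an auxiliary suite: the directory name has to match both the aux_* gitignore entry and the aux_query_tests whitelist entry above, so the symlinked suite is ignored by git but accepted by the test runner.
# Hypothetical auxiliary suite layout (the test module name is made up):
mkdir -p ${IMPALA_AUX_TEST_HOME}/tests/aux_query_tests
touch ${IMPALA_AUX_TEST_HOME}/tests/aux_query_tests/test_example.py
# After the config-generation step above runs symlink_subdirs, the suite shows
# up as ${IMPALA_HOME}/tests/aux_query_tests and passes the whitelist check.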