Update benchmark tests to run against generic workloads, add data loading with scale factors, and more

This change updates the run-benchmark script so that it can target one or more
workloads. Benchmarks can now be run like:

./run-benchmark --workloads=hive-benchmark,tpch

We look up each workload in the workloads directory, read the associated query
.test files, and execute the queries they contain.
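
For reference, a minimal sketch of the .test file layout (the query shown is
illustrative): sections are separated by '====' and subsections by '----', and
the text before the first '----' in each section is the query to run, with
$TABLE standing in for the database/table-suffix substitution:

====
select count(*) from grep1gb$TABLE
----
====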

To ensure the queries are not duplicated between benchmark and query tests, I
moved all existing queries (under fe/src/test/resources/*) to the workloads
directory. You do NOT need to look through all the .test files; I've just moved
them. The one new file is 'hive-benchmark.test', which contains the hive
benchmark queries.

Also added support for generating schemas for different scale factors, as well
as executing against them. For example, say we have a dataset with a scale
factor called "SF3". We would first generate the schema using:

./generate_schema_statements --workload=<workload> --scale_factor="SF3"

This creates tables whose names are distinct from those of other scale factors.
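
For the tpch workload, for instance, the generated .sql begins with statements
along the lines of:

CREATE DATABASE IF NOT EXISTS tpchSF3;

and the LOAD statements pull data from
testdata/impala-data/tpch<scale_factor>/<table>/, so multiple scale factors can
coexist side by side.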

Run the generated .sql file to load the data. Alternatively, the data can be
loaded by running a new Python script:

./bin/load-data.py -w <workload1>,<workload2> -e <exploration strategy> -s [scale factor]

For example: ./bin/load-data.py -w tpch -e core -s SF3
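
Under the hood, load-data.py invokes generate-schema-statements.py for each
workload, runs the generated load-<workload>-<exploration strategy>-generated.sql
file through Hive, and then runs the generated
load-trevni-<workload>-<exploration strategy>-generated.sh script to load the
Trevni data via Impala.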

Then run the benchmark against this scale factor:
./run-benchmark --workloads=<workload> --scale_factor=SF3
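
With a scale factor set, the $TABLE substitution also prefixes the database
name, so (illustratively) a test query such as

select count(*) from lineitem$TABLE

runs as 'select count(*) from tpchSF3.lineitem' for the uncompressed text
vector, and picks up the corresponding table suffix for other file format and
compression combinations.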

This changeset also includes a few other minor tweaks to some of the test
scripts.

Change-Id: Ife8a8d91567d75c9612be37bec96c1e7780f50d6
Author: Lenni Kuff
Date: 2012-08-02 23:38:59 -07:00
Committed by: Henry Robinson
Parent: 81d54e85e5
Commit: 04edc8f534
90 changed files with 1707 additions and 1566 deletions

.gitignore

@@ -7,9 +7,8 @@ cscope.out
org.eclipse.jdt.core.prefs
benchmark_results.csv
reference_benchmark_results.csv
testdata/data/test-warehouse
testdata/bin/create-*-generated.sql
testdata/bin/load-*-generated.sql
load-trevni-*-generated.sh
load-*-generated.sql
pprof.out


@@ -1,11 +1,8 @@
#!/usr/bin/env bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
. "$bin"/impala-config.sh
set -e
set -u
echo "Copying data files from the share. If the file already exists locally, the files"\
"will not be copied. It's not check summing the files or anything like that, if"\


@@ -36,6 +36,8 @@ fi
export IMPALA_FE_DIR=$IMPALA_HOME/fe
export IMPALA_BE_DIR=$IMPALA_HOME/be
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
export PATH=$IMPALA_HOME/bin:$PATH

bin/load-data.py (new executable file, 130 lines)

@@ -0,0 +1,130 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to load the proper datasets for the specified workloads. It loads
# all data via Hive except for Trevni data which needs to be loaded via Impala.
import collections
import os
import re
import subprocess
import sys
import tempfile
import time
from itertools import product
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy for schema gen: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workloads", dest="workloads",
help="Comma-separated list of workloads to load data for. If 'all' is "\
"specified then data for all workloads is loaded.")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
default=False, help='Skips HDFS exists check and reloads all tables')
(options, args) = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
"--workload=%s --scale_factor=%s --verbose"
HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/hive')
HIVE_ARGS = "-hiveconf hive.root.logger=WARN,console -v"
def available_workloads(workload_dir):
return [subdir for subdir in os.listdir(workload_dir)
if os.path.isdir(os.path.join(workload_dir, subdir))]
def validate_workloads(all_workloads, workloads):
for workload in workloads:
if workload not in all_workloads:
print 'Workload \'%s\' not found in workload directory' % workload
print 'Available workloads: ' + ', '.join(all_workloads)
sys.exit(1)
def exec_hive_query_from_file(file_name):
hive_cmd = "%s %s -f %s" % (HIVE_CMD, HIVE_ARGS, file_name)
print 'Executing Hive Command: ' + hive_cmd
ret_val = subprocess.call(hive_cmd, shell = True)
if ret_val != 0:
print 'Error executing file from Hive: ' + file_name
sys.exit(ret_val)
def exec_bash_script(file_name):
bash_cmd = "bash %s" % file_name
print 'Executing Bash Command: ' + bash_cmd
ret_val = subprocess.call(bash_cmd, shell = True)
if ret_val != 0:
print 'Error executing bash script: ' + file_name
sys.exit(ret_val)
def generate_schema_statements(workload):
generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
options.scale_factor)
if options.force_reload:
generate_cmd += " --force_reload"
if options.hive_warehouse_dir is not None:
generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir
print 'Executing Generate Schema Command: ' + generate_cmd
ret_val = subprocess.call(os.path.join(TESTDATA_BIN_DIR, generate_cmd), shell = True)
if ret_val != 0:
print 'Error generating schema statements for workload: ' + workload
sys.exit(ret_val)
def get_dataset_for_workload(workload):
dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
'%s_dimensions.csv' % workload)
if not os.path.isfile(dimension_file_name):
print 'Dimension file not found: ' + dimension_file_name
sys.exit(1)
with open(dimension_file_name, 'rb') as input_file:
match = re.search('dataset:\s*(\w+)', input_file.read())
if match:
return match.group(1)
else:
print 'Dimension file does not contain dataset for workload \'%s\'' % (workload)
sys.exit(1)
all_workloads = available_workloads(WORKLOAD_DIR)
workloads = []
if options.workloads is None:
print "At least one workload name must be specified."
parser.print_help()
sys.exit(1)
elif options.workloads == 'all':
print 'Loading data for all workloads.'
workloads = all_workloads
else:
workloads = options.workloads.split(",")
validate_workloads(all_workloads, workloads)
print 'Starting data load for the following workloads: ' + ', '.join(workloads)
loading_time_map = collections.defaultdict(float)
for workload in workloads:
start_time = time.time()
dataset = get_dataset_for_workload(workload)
print "Dataset for workload '%s' is '%s'" % (workload, dataset)
dataset_dir = os.path.join(DATASET_DIR, dataset)
os.chdir(dataset_dir)
generate_schema_statements(workload)
exec_hive_query_from_file(os.path.join(dataset_dir,
'load-%s-%s-generated.sql' % (workload, options.exploration_strategy)))
exec_bash_script(os.path.join(dataset_dir,
'load-trevni-%s-%s-generated.sh' % (workload, options.exploration_strategy)))
loading_time_map[workload] = time.time() - start_time
total_time = 0.0
for workload, load_time in loading_time_map.iteritems():
total_time += load_time
print 'Data loading for workload \'%s\' completed in: %.2fs'\
% (workload, load_time)
print 'Total load time: %.2fs\n' % total_time


@@ -1,77 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script creates schema and loads data into hive for running benchmarks and
# other tests. Using this script requires passing in two parameters:
# The first is the data set type (benchmark, tpch). This will load the appropriate
# collection of data sets for the run type.
# The second is the exploration strategy. This determines the different combinations
# of file format, compression, etc that will be created and loaded. 'Core' defines
# a basic set of combinations. If 'pairwise' is specified the pairwise combinations
# of workload # + file format + compression will be loaded. If 'exhaustive' is
# passed as an argument the exhaustive set of combinations will be loaded.
# TODO: Rewrite this script in python and detect and load workloads by enumerating
# the workloads directory.
exploration_strategy=
data_set_type=
if [ $1 = "hive-benchmark" ]; then
data_set_type=$1
elif [ $1 = "functional" ]; then
data_set_type=$1
elif [ $1 = "tpch" ]; then
data_set_type=$1
elif [ $1 = "query-test" ]; then
data_set_type="tpch functional"
elif [ $1 = "all" ]; then
data_set_type="hive-benchmark tpch functional"
else
echo "Invalid run type: $1. Valid values are 'all, query-test,"\
"functional, tpch, hive-benchmark'"
exit 1
fi
if [ $2 = "core" -o $2 = "pairwise" -o $2 = "exhaustive" ]; then
exploration_strategy=$2
else
echo "Invalid exploration strategy: $2. Valid values are 'core, pairwise, exhaustive'"
exit 1
fi
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
. "$bin"/impala-config.sh
set -e
WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
DATASET_DIR=$IMPALA_HOME/testdata/datasets
BIN_DIR=$IMPALA_HOME/testdata/bin
function execute_hive_query_from_file {
hive_args="-hiveconf hive.root.logger=WARN,console -v -f"
"$HIVE_HOME/bin/hive" $hive_args $1
if [ $? != 0 ]; then
echo LOAD OF $1 FAILED
exit -1
fi
}
for ds in $data_set_type
do
SCRIPT_DIR=$DATASET_DIR/$ds
pushd $SCRIPT_DIR
$BIN_DIR/generate_schema_statements.py --exploration_strategy ${exploration_strategy}\
--workload=${ds} --verbose
execute_hive_query_from_file \
"$SCRIPT_DIR/load-${ds}-${exploration_strategy}-generated.sql"
bash $SCRIPT_DIR/load-trevni-${ds}-${exploration_strategy}-generated.sh
popd
done
# TODO: Temporarily disable block id generation for everything except benchmark runs
# due to IMP-134
if [ $1 = "hive-benchmark" ]; then
$IMPALA_HOME/testdata/bin/generate-block-ids.sh
fi


@@ -47,13 +47,14 @@ COLUMN_WIDTH = 18
TOTAL_WIDTH = 122 if options.verbose else 90
# These are the indexes in the input row for each column value
QUERY_IDX = 0
FILE_FORMAT_IDX = 1
COMPRESSION_IDX = 2
IMPALA_AVG_IDX = 3
IMPALA_STDDEV_IDX = 4
HIVE_AVG_IDX = 5
HIVE_STDDEV_IDX = 6
WORKLOAD_IDX = 0
QUERY_IDX = 1
FILE_FORMAT_IDX = 2
COMPRESSION_IDX = 3
IMPALA_AVG_IDX = 4
IMPALA_STDDEV_IDX = 5
HIVE_AVG_IDX = 6
HIVE_STDDEV_IDX = 7
# Formats a string so that is is wrapped across multiple lines with no single line
# being longer than the given width
@@ -91,7 +92,8 @@ def find_matching_row_in_reference_results(search_row, reference_results):
for row in reference_results:
if (row[QUERY_IDX] == search_row[QUERY_IDX] and
row[FILE_FORMAT_IDX] == search_row[FILE_FORMAT_IDX] and
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX]):
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX] and
row[WORKLOAD_IDX] == search_row[WORKLOAD_IDX]):
return row
return None
@@ -117,7 +119,7 @@ def print_table(results, verbose, reference_results = None):
print build_padded_row_string(table_header, COLUMN_WIDTH)
print "-" * TOTAL_WIDTH
for row in group:
full_row = row[1:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
full_row = row[2:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
if not verbose:
del full_row[HIVE_AVG_IDX - 1]
del full_row[HIVE_STDDEV_IDX - 2]
@@ -193,7 +195,6 @@ def read_csv_result_file(file_name):
results.append(row)
return results
reference_results = []
results = []
if os.path.isfile(options.result_file):


@@ -1,11 +1,10 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script should be used to benchmark queries. It can either run in batch mode, in
# which case it will run the set of hive benchmark queries or to run a single query. In
# either case, it will first try to warm the buffer cache before running the query
# multiple times. There are command line options to control how many times to prerun the
# query for the buffer cache as well as the number of iterations.
# This script is used to run benchmark queries. It runs the set of queries specified in
# the given workload(s) under <workload name>/queries. This script will first try to warm
# the buffer cache before running the query. There is a command line option to control
# how many iterations to run each query.
#
# By default, the script will have minimal output. Verbose output can be turned on with
# the -v option which will output the normal query output. In addition, the -p option
@@ -15,7 +14,7 @@
# The script parses for output in the specific format in the regex below (result_regex).
# This is not very robust but probably okay for this script.
#
# The planservice needs to be running before this script.
# The planservice or ImpalaD needs to be running before executing any workload.
# Run with the --help option to see the arguments.
import collections
import csv
@@ -36,18 +35,17 @@ parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
default = False, help="If set, outputs all benchmark diagnostics.")
parser.add_option("--remote", dest="remote", action="store_true",
default = False, help="Set to true if running on remote cluster.")
parser.add_option("-q", "--query", dest="query", default = "",
help="Query to run. If none specified, runs all queries.")
parser.add_option("--iterations", dest="iterations", default="3",
help="Number of times to run the query. Only to be used with -q")
parser.add_option("--prime_cache", dest="prime_cache", default= True,
help="Whether or not to prime the buffer cache. Only to be "\
"used with -q")
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy to use for running benchmark: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("-w", "--workloads", dest="workloads", default="hive-benchmark",
help="The workload(s) to execute in a comma-separated list format."\
"Some valid workloads: 'hive-benchmark', 'tpch', ...")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="The dataset scale factor to run the workload against.")
parser.add_option("--query_cmd", dest="query_cmd",
default='build/release/service/runquery -profile_output_file=""',
default=os.path.join(os.environ['IMPALA_HOME'],
'be/build/release/service/runquery') + ' -profile_output_file=""',
help="The command to use for executing queries")
parser.add_option("--compare_with_hive", dest="compare_with_hive", action="store_true",
default= False, help="Run all queries using Hive as well as Impala")
@@ -56,12 +54,21 @@ parser.add_option("--results_csv_file", dest="results_csv_file",
help="The output file where benchmark results are saved")
parser.add_option("--hive_cmd", dest="hive_cmd", default="hive -e",
help="The command to use for executing hive queries")
parser.add_option("-i", "--iterations", dest="iterations", default="5",
help="Number of times to run each query.")
parser.add_option("--prime_cache", dest="prime_cache", default= True,
help="Whether or not to prime the buffer cache. Only to be "\
"used with -q")
(options, args) = parser.parse_args()
profile_output_file = 'build/release/service/profile.tmp'
gprof_cmd = 'google-pprof --text build/release/service/runquery %s | head -n 60'
prime_cache_cmd = os.environ['IMPALA_HOME'] + "/testdata/bin/cache_tables.py -q \"%s\""
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
profile_output_file = os.path.join(os.environ['IMPALA_HOME'],
'be/build/release/service/profile.tmp')
gprof_cmd = 'google-pprof --text ' + options.query_cmd + ' %s | head -n 60'
prime_cache_cmd = os.path.join(os.environ['IMPALA_HOME'],
"testdata/bin/cache_tables.py") + " -q \"%s\""
result_single_regex = 'returned (\d*) rows? in (\d*).(\d*) s'
result_multiple_regex = 'returned (\d*) rows? in (\d*).(\d*) s with stddev (\d*).(\d*)'
hive_result_regex = 'Time taken: (\d*).(\d*) seconds'
@@ -80,12 +87,14 @@ class QueryExecutionResult:
self.stddev = stddev
class QueryExecutionDetail:
def __init__(self, file_format, compression, impala_execution_result,
hive_execution_result):
def __init__(self, workload, file_format, compression_codec, compression_type,
impala_execution_result, hive_execution_result):
self.workload = workload
self.file_format = file_format
self.compression_codec = compression_codec
self.compression_type = compression_type
self.impala_execution_result = impala_execution_result
self.hive_execution_result = hive_execution_result
self.file_format = file_format
self.compression = compression
# Parse for the tables used in this query
def parse_tables(query):
@@ -246,8 +255,14 @@ def run_query(query, prime_buffer_cache, iterations):
execution_result = QueryExecutionResult(str(avg_time), str(stddev))
return [output, execution_result]
def choose_input_vector_file_name(exploration_strategy):
return "hive-benchmark_%s.csv" % exploration_strategy
def vector_file_name(workload, exploration_strategy):
return "%s_%s.csv" % (workload, exploration_strategy)
# Gets the name of the database to use for the specified workload and scale factor.
def database_name_to_use(workload, scale_factor):
if workload == 'tpch':
return '%s%s.' % (workload, scale_factor)
return ''
def build_table_suffix(file_format, codec, compression_type):
if file_format == 'text' and codec == 'none':
@@ -259,54 +274,27 @@ def build_table_suffix(file_format, codec, compression_type):
else:
return '_%s_%s' % (file_format, codec)
def build_query(query_format_string, exploration_strategy, data_set,
file_format, codec, compression_type):
def build_query(query_format_string, file_format, codec, compression_type,
workload, scale_factor):
database_name = database_name_to_use(workload, scale_factor)
table_suffix = build_table_suffix(file_format, codec, compression_type)
return query_format_string % {'table_suffix': table_suffix}
# $TABLE is used as a token for table suffix in the queries. Here we insert the proper
# database name based on the workload and query.
return re.sub('(\w+\.){0,1}(?P<table_name>\w+)\$TABLE', '%s%s%s' %\
(database_name, r'\g<table_name>', table_suffix), query_format_string)
def read_vector_file(file_name):
if not os.path.isfile(file_name):
print 'Cannot find vector file: ' + file_name
sys.exit(1)
vector_values = []
with open(file_name, 'rb') as vector_file:
return [line.strip().split(',')
for line in vector_file.readlines() if not line.startswith('#')]
os.chdir(os.environ['IMPALA_BE_DIR'])
# This table contains a hash of dataset -> [query, numbers of times to prime buffer cache,
# number of iterations]. Queries should be grouped by the data they touch. This
# eliminates the need for the buffer cache priming iterations.
# TODO: it would be good if this table also contained the expected numbers and
# automatically flag regressions. How do we reconcile the fact we are running on
# different machines?
queries = {'grep1gb': [
["select count(*) from grep1gb%(table_suffix)s", 1, 5],
["select count(field) from grep1gb%(table_suffix)s", 0, 5],
["select count(field) from grep1gb%(table_suffix)s where field like '%%xyz%%'", 0, 5]
],
'web': [
["select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue "\
"from uservisits%(table_suffix)s uv join rankings%(table_suffix)s r on "\
"(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate "\
"< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1", 1, 5],
["select sourceIP, SUM(adRevenue) FROM uservisits%(table_suffix)s GROUP by sourceIP "\
"order by SUM(adRevenue) desc limit 10", 1, 5],
["select pageRank, pageURL from rankings%(table_suffix)s where pageRank > 10 "\
"order by pageRank limit 100", 1, 5],
["select count(*) from rankings%(table_suffix)s where "\
"pageRank > 10 && pageRank < 25", 1, 5],
["select avg(adRevenue) from uservisits%(table_suffix)s", 1, 5],
["select avg(adRevenue) from uservisits%(table_suffix)s "\
"where visitdate > '1999-07-01' and visitdate < '1999-12-31'", 1, 5],
],
'grep10gb': [
["select count(field) from grep10gb%(table_suffix)s where field like '%%xyz%%'", 0, 1]
]
}
for line in vector_file.readlines():
if line.strip().startswith('#'):
continue
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
return vector_values
# Writes out results to a CSV file. Columns are delimited by '|' characters
def write_to_csv(result_map, output_csv_file):
@@ -316,51 +304,99 @@ def write_to_csv(result_map, output_csv_file):
for query, execution_results in result_map.iteritems():
for result in execution_results:
csv_writer.writerow([query, result.file_format, result.compression,
csv_writer.writerow([result.workload, query, result.file_format,
'%s/%s' % (result.compression_codec, result.compression_type),
result.impala_execution_result.avg_time,
result.impala_execution_result.stddev,
result.hive_execution_result.avg_time,
result.hive_execution_result.stddev])
result.hive_execution_result.stddev,
])
# Run all queries
if (len(options.query) == 0):
vector_file_path = os.path.join(
os.environ['IMPALA_HOME'], 'testdata/workloads/hive-benchmark/',
choose_input_vector_file_name(options.exploration_strategy))
# Recursively scans the given directory for all test query files
def enumerate_query_files(base_directory):
query_files = []
for item in os.listdir(base_directory):
full_path = os.path.join(base_directory, item)
if os.path.isfile(full_path) and item.endswith('.test'):
query_files.append(full_path)
elif os.path.isdir(full_path):
query_files += enumerate_query_files(full_path)
return query_files
vector = read_vector_file(vector_file_path)
output = ""
result_map = collections.defaultdict(list)
# Strips out comments and empty lines from the input query string
def strip_comments(query_string):
query = []
for line in query_string.split('\n'):
if not line or line.strip().startswith('#') or line.strip().startswith('//'):
continue
query.append(line)
return '\n'.join(query).strip()
for row in vector:
file_format, data_set, codec, compression_type = row[:4]
for query in queries[data_set]:
query_string = build_query(query[0], options.exploration_strategy, data_set,
file_format, codec, compression_type)
result = run_query(query_string, query[1], query[2])
# Enumerate all the query files for a workload and extract the actual query
# strings.
def extract_queries_from_test_files(workload):
workload_base_dir = os.path.join(WORKLOAD_DIR, workload)
if not os.path.isdir(workload_base_dir):
print "Workload '%s' not found at path '%s'" % (workload, workload_base_dir)
sys.exit(1)
query_dir = os.path.join(workload_base_dir, 'queries')
if not os.path.isdir(query_dir):
print "Workload query directory not found at path '%s'" % (query_dir)
queries = []
for query_file_name in enumerate_query_files(query_dir):
if options.verbose != 0:
print 'Parsing Query Test File: ' + query_file_name
with open(query_file_name, 'rb') as query_file:
# Query files are split into sections separated by '====', with subsections
# separated by '----'. The first item in each subsection is the actual query
# to execute.
for query_section in query_file.read().split("===="):
formatted_query = strip_comments(query_section.split("----")[0])
if formatted_query:
queries.append(formatted_query.strip())
return queries
result_map = collections.defaultdict(list)
output = ""
# For each workload specified, look up the associated query files. Extract valid
# queries in each file and execute them using the specified number of execution
# iterations. Finally, write results to an output CSV file for reporting.
for workload in options.workloads.split(','):
print 'Starting execution of workload: ' + workload
queries = extract_queries_from_test_files(workload)
vector_file_path = os.path.join(WORKLOAD_DIR, workload,
vector_file_name(workload,
options.exploration_strategy))
test_vector = read_vector_file(vector_file_path)
# Execute the queries for combinations of file format, compression, etc.
for row in test_vector:
file_format, data_group, codec, compression_type = row[:4]
print 'Test Vector Values: ' + ', '.join(row)
for query in queries:
query_string = build_query(query.strip(), file_format, codec, compression_type,
workload, options.scale_factor)
result = run_query(query_string, 1, int(options.iterations))
output += result[0]
print result[0]
execution_result = result[1]
hive_execution_result = QueryExecutionResult("N/A", "N/A")
if options.compare_with_hive:
hive_result = run_query_using_hive(query_string, query[1], query[2])
hive_result = run_query_using_hive(query_string, 1, int(options.iterations))
print "Hive Results:"
print hive_result[0]
hive_execution_result = hive_result[1]
if options.verbose != 0:
print "--------------------------------------------------------------------------"
print "------------------------------------------------------------------------"
execution_detail = QueryExecutionDetail(file_format, codec, execution_result,
execution_detail = QueryExecutionDetail(workload, file_format, codec,
compression_type, execution_result,
hive_execution_result)
result_map[query[0]].append(execution_detail)
result_map[query].append(execution_detail)
print "\nResults saving to: " + options.results_csv_file
write_to_csv(result_map, options.results_csv_file)
print output
# Run query from command line
else:
result = run_query(options.query, int(options.prime_cache),
int(options.iterations))
print result[1] or result[0]


@@ -1,4 +1,4 @@
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
package com.cloudera.impala.dataerror;
@@ -23,7 +23,7 @@ public class DataErrorsTest {
private static Catalog catalog;
private static Executor executor;
private static StringBuilder testErrorLog;
private final String testDir = "DataErrorsTest";
private final String testDir = "functional-query/queries/DataErrorsTest";
private static ArrayList<String> tableList;
@BeforeClass


@@ -1,4 +1,4 @@
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
package com.cloudera.impala.planner;
@@ -33,7 +33,7 @@ public class PlannerTest {
private static Catalog catalog;
private static AnalysisContext analysisCtxt;
private final String testDir = "PlannerTest";
private final String testDir = "functional-planner/queries/PlannerTest";
private final String outDir = "/tmp/PlannerTest/";
private final StringBuilder explainStringBuilder = new StringBuilder();


@@ -4,6 +4,7 @@ package com.cloudera.impala.service;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Set;
@@ -41,7 +42,7 @@ import com.google.common.collect.Sets;
*/
public abstract class BaseQueryTest {
private static final Logger LOG = Logger.getLogger(BaseQueryTest.class);
private static final String TEST_DIR = "QueryTest";
private static final String TEST_DIR = "functional-query/queries/QueryTest";
private static final int DEFAULT_FE_PORT = 21000;
// If set to true, new test results will be generated and saved to the specified
@@ -93,6 +94,9 @@ public abstract class BaseQueryTest {
protected final static TestExecMode EXECUTION_MODE = TestExecMode.valueOf(
System.getProperty("testExecutionMode", "reduced").toUpperCase());
// A relative path from the 'workloads' directory to the base test directory.
private final String testDirName;
/**
* The type of target test environments. Determines whether the front end is running
* in-process or out-of-process (ImpalaD).
@@ -166,6 +170,14 @@ public abstract class BaseQueryTest {
}
}
protected BaseQueryTest() {
this(TEST_DIR);
}
protected BaseQueryTest(String testDirName) {
this.testDirName = testDirName;
}
@BeforeClass
public static void setUp() throws Exception {
String impaladHostname = System.getProperty("impalad");
@@ -418,7 +430,7 @@ public abstract class BaseQueryTest {
private void runQueryWithTestConfigs(List<TestConfiguration> testConfigs,
String testFile, boolean abortOnError, int maxErrors) {
String fileName = TEST_DIR + "/" + testFile + ".test";
String fileName = new File(testDirName, testFile + ".test").getPath();
TestFileParser queryFileParser = new TestFileParser(fileName);
LOG.debug("Running the following configurations over file " + fileName + " : ");


@@ -6,6 +6,10 @@ import org.junit.Test;
public class TpchQueryTest extends BaseQueryTest {
public TpchQueryTest() {
super("tpch/queries");
}
@Test
public void TestTpchQ1() {
runTestInExecutionMode(EXECUTION_MODE, "tpch-q1", false, 1000);


@@ -4,8 +4,10 @@ package com.cloudera.impala.testutil;
import static org.junit.Assert.fail;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
@@ -183,7 +185,7 @@ public class TestFileParser {
private int lineNum = 0;
private final String fileName;
private InputStream stream;
private BufferedReader reader;
private Scanner scanner;
/**
@@ -210,9 +212,9 @@ public class TestFileParser {
*/
private void open(String table) {
try {
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
stream = classLoader.getResourceAsStream(fileName);
scanner = new Scanner(stream);
String fullPath = new File(TestFileUtils.getTestFileBaseDir(), fileName).getPath();
reader = new BufferedReader(new FileReader(fullPath));
scanner = new Scanner(reader);
} catch (Exception e) {
fail(e.getMessage());
}
@@ -293,16 +295,16 @@ public class TestFileParser {
}
private void close() {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
fail(e.getMessage());
}
}
if (scanner != null) {
scanner.close();
}
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
}
}
}
}


@@ -67,4 +67,11 @@ public class TestFileUtils {
fw.close();
}
}
/**
* Returns the base directory for test files.
*/
public static String getTestFileBaseDir() {
return new File(System.getenv("IMPALA_HOME"), "testdata/workloads").getPath();
}
}


@@ -6,22 +6,14 @@ if [ x${JAVA_HOME} == x ]; then
exit 1
fi
set -u
set -e
# Load the data set
pushd ${IMPALA_HOME}/bin
./load-data.sh functional exhaustive
if [ $? != 0 ]; then
echo LOAD OF FUNCTIONAL DATA FAILED
exit 1
fi
./load-data.sh tpch core
if [ $? != 0 ]; then
echo LOAD OF TPCH DATA FAILED
exit 1
fi
./load-data.py --workloads functional-query --exploration_strategy exhaustive
./load-data.py --workloads functional-planner --exploration_strategy exhaustive
./load-data.py --workloads tpch --exploration_strategy core
popd
# TODO: The multi-format table will move these files. So we need to copy them to a


@@ -34,21 +34,28 @@ from itertools import product
from optparse import OptionParser
parser = OptionParser()
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy for schema gen: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
default="/test-warehouse",
help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("--workload", dest="workload", default="functional",
parser.add_option("-w", "--workload", dest="workload",
help="The workload to generate schema for: tpch, hive-benchmark, ...")
parser.add_option("--force_reload", dest="force_reload", action="store_true",
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
default= False, help='Skips HDFS exists check and reloads all tables')
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
default = False, help="If set, outputs additional logging.")
(options, args) = parser.parse_args()
if options.workload is None:
print "A workload name must be specified."
parser.print_help()
sys.exit(1)
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
DATASET_DIR = os.environ['IMPALA_HOME'] + '/testdata/datasets'
@@ -58,8 +65,8 @@ COMPRESSION_CODEC =\
"SET mapred.output.compression.codec=org.apache.hadoop.io.compress.%s;"
SET_DYNAMIC_PARTITION_STATEMENT = "SET hive.exec.dynamic.partition=true;"
SET_PARTITION_MODE_NONSTRICT_STATEMENT = "SET hive.exec.dynamic.partition.mode=nonstrict;"
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;"\
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;"
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;\n"\
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;\n"
FILE_FORMAT_IDX = 0
DATA_SET_IDX = 1
@@ -99,10 +106,12 @@ class SqlGenerationStatement:
self.trevni = trevni.strip()
self.load_local = load_local.strip()
def build_create_statement(table_template, table_name, file_format, compression):
def build_create_statement(table_template, table_name, file_format,
compression, scale_factor):
create_statement = 'DROP TABLE IF EXISTS %s;\n' % table_name
create_statement += table_template % {'table_name': table_name,
'file_format': FILE_FORMAT_MAP[file_format] }
'file_format': FILE_FORMAT_MAP[file_format],
'scale_factor': scale_factor}
if file_format != 'trevni':
return create_statement
@@ -151,9 +160,10 @@ def build_insert(insert, table_name, base_table_name, codec, compression_type):
table_name) + "\n"
return output
def build_load_statement(load_template, table_name):
def build_load_statement(load_template, table_name, scale_factor):
tmp_load_template = load_template.replace(' % ', ' *** ')
return (tmp_load_template % {'table_name': table_name}).replace(' *** ', ' % ')
return (tmp_load_template % {'table_name': table_name,
'scale_factor': scale_factor}).replace(' *** ', ' % ')
def build_trevni(trevni_template, table_name, base_table_name):
return trevni_template % {'table_name': table_name, 'base_table_name': base_table_name}
@@ -171,10 +181,16 @@ def build_table_suffix(file_format, codec, compression_type):
else:
return '_%s_%s' % (file_format, codec)
# Vector files have the format:
# dimension_name1:value1, dimension_name2:value2, ...
def read_vector_file(file_name):
vector_values = []
with open(file_name, 'rb') as vector_file:
return [line.strip().split(',')
for line in vector_file.readlines() if not line.startswith('#')]
for line in vector_file.readlines():
if line.strip().startswith('#'):
continue
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
return vector_values
def write_array_to_file(file_name, array):
with open(file_name, 'w') as f:
@@ -207,32 +223,32 @@ def write_trevni_file(file_name, array):
# Kill off the plan service.
f.write("\nkill -9 $PID\n")
def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
def write_statements_to_file_based_on_input_vector(output_name, test_vectors,
statements):
output_create = []
output_load = []
output_load_base = []
output_trevni = []
results = read_vector_file(input_file_name)
existing_tables = list_hdfs_subdir_names(options.hive_warehouse_dir)
for row in results:
for row in test_vectors:
file_format, data_set, codec, compression_type = row[:4]
for s in statements[data_set.strip()]:
create = s.create
insert = s.insert
trevni = s.trevni
load_local = s.load_local
table_name = s.base_table_name +\
build_table_suffix(file_format, codec, compression_type)
base_table_name = s.base_table_name % {'scale_factor' : options.scale_factor}
table_name = base_table_name + \
build_table_suffix(file_format, codec, compression_type)
# HBase only supports text format and mixed format tables have formats defined.
# TODO: Implement a better way to tag a table as only being generated with a fixed
# set of file formats.
if (("hbase" in table_name or "mixedformat" in table_name) and
"text" not in file_format):
if ("hbase" in table_name and "text" not in file_format):
continue
output_create.append(build_create_statement(create, table_name, file_format, codec))
output_create.append(build_create_statement(create, table_name, file_format, codec,
options.scale_factor))
# If the directory already exists in HDFS, assume that data files already exist
# and skip loading the data. Otherwise, the data is generated using either an
@@ -242,21 +258,22 @@ def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
print 'Path:', data_path, 'already exists in HDFS. Data loading can be skipped.'
else:
print 'Path:', data_path, 'does not exists in HDFS. Data file will be generated.'
if table_name == s.base_table_name:
if table_name == base_table_name:
if load_local:
output_load_base.append(build_load_statement(load_local, table_name))
output_load_base.append(build_load_statement(load_local, table_name,
options.scale_factor))
else:
print 'Empty base table load for %s. Skipping load generation' % table_name
elif file_format == 'trevni':
if trevni:
output_trevni.append(build_trevni(trevni, table_name, s.base_table_name))
output_trevni.append(build_trevni(trevni, table_name, base_table_name))
else:
print \
'Empty trevni load for table %s. Skipping insert generation' % table_name
else:
if insert:
output_load.append(build_insert(insert, table_name, s.base_table_name,
codec, compression_type))
output_load.append(build_insert(insert, table_name, base_table_name,
codec, compression_type))
else:
print 'Empty insert for table %s. Skipping insert generation' % table_name
@@ -285,22 +302,30 @@ if (options.exploration_strategy != 'core' and
print 'Invalid exploration strategy:', options.exploration_strategy
sys.exit(1)
schema_template_file = os.path.join(DATASET_DIR, options.workload,
'%s_schema_template.sql' % (options.workload))
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
'%s_%s.csv' % (options.workload,
options.exploration_strategy))
if not os.path.isfile(test_vector_file):
print 'Vector file not found: ' + test_vector_file
sys.exit(1)
test_vectors = read_vector_file(test_vector_file)
if len(test_vectors) == 0:
print 'No test vectors found in file: ' + test_vector_file
sys.exit(1)
target_dataset = test_vectors[0][DATA_SET_IDX]
print 'Target Dataset: ' + target_dataset
schema_template_file = os.path.join(DATASET_DIR, target_dataset,
'%s_schema_template.sql' % target_dataset)
if not os.path.isfile(schema_template_file):
print 'Schema file not found: ' + schema_template_file
sys.exit(1)
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
'%s_%s.csv' % (options.workload,
options.exploration_strategy))
if not os.path.isfile(schema_template_file):
print 'Vector file not found: ' + schema_template_file
sys.exit(1)
statements = parse_benchmark_file(schema_template_file)
write_statements_to_file_based_on_input_vector(
'%s-%s' % (options.workload, options.exploration_strategy),
test_vector_file, statements)
test_vectors, statements)


@@ -35,23 +35,28 @@ import metacomm.combinatorics.all_pairs2
all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2
parser = OptionParser()
parser.add_option("--dimension_file", dest="dimension_file",
default = "hive-benchmark_dimensions.csv",
help="The file containing the list of dimensions.")
parser.add_option("--workload", dest="workload", default = "hive-benchmark",
parser.add_option("-w", "--workload", dest="workload",
help="The workload to generate test vectors for")
(options, args) = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
if options.workload is None:
print "A workload name must be specified."
parser.print_help()
sys.exit(1)
FILE_FORMAT_IDX = 0
DATA_SET_IDX = 1
COMPRESSION_IDX = 2
COMPRESSION_TYPE_IDX = 3
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
KNOWN_DIMENSION_NAMES = ['file_format', 'data_group', 'compression_codec',
# This array also defines the order of the dimension values. This ordering
# is important because it is used to apply constraints. Add new items to the
# end of the list.
KNOWN_DIMENSION_NAMES = ['file_format', 'dataset', 'compression_codec',
'compression_type']
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')
class VectorGenerator:
def __init__(self, input_vectors):
self.input_vectors = input_vectors
@@ -76,13 +81,15 @@ def is_valid_combination(vector):
(vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
(vector[FILE_FORMAT_IDX] == 'trevni' and
(vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip')) or
(vector[DATA_SET_IDX] == 'tpch' and vector[FILE_FORMAT_IDX] != 'text'))
(vector[DATASET_IDX] == 'tpch' and
(vector[FILE_FORMAT_IDX] != 'text' and vector[FILE_FORMAT_IDX] != 'trevni')))
# The pairwise generator may call this with different vector lengths. In that case this
# should always return true.
return True
# Vector files have the format: <dimension name>: value1, value2, ...
# Vector files have the format: <dimension name>: value1, value2, ... this function
# adds all specified dimensions to a map of dimension name-to-value
def read_dimension_file(file_name):
dimension_map = collections.defaultdict(list)
with open(file_name, 'rb') as input_file:
@@ -98,16 +105,18 @@ def read_dimension_file(file_name):
print 'Unknown dimension name: ' + values[0]
print 'Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES)
sys.exit(1)
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
return dimension_map
def write_vectors_to_csv(output_dir, output_file, matrix):
output_text = "# Generated File. The vector value order is: file format, data_group, "\
"compression codec, compression type"
output_text = "# Generated File."
for row in matrix:
output_text += '\n' + ','.join(row)
row = ['%s: %s' % (KNOWN_DIMENSION_NAMES[i], row[i]) for i in range(0, len(row))]
output_text += '\n' + ', '.join(row)
with open(os.path.join(output_dir, output_file), 'wb') as output_file:
output_path = os.path.join(output_dir, output_file)
print 'Writing test vectors to: ' + output_path
with open(output_path, 'wb') as output_file:
output_file.write(output_text)
output_file.write('\n')
@@ -120,11 +129,10 @@ if not os.path.isfile(dimension_file):
print 'Reading dimension file: ' + dimension_file
vector_map = read_dimension_file(dimension_file)
vectors = []
# This ordering matters! We need to know the order to apply the proper constraints.
vectors.append(vector_map['file_format'])
vectors.append(vector_map['data_group'])
vectors.append(vector_map['compression_codec'])
vectors.append(vector_map['compression_type'])
for dimension_name in KNOWN_DIMENSION_NAMES:
vectors.append(vector_map[dimension_name])
vg = VectorGenerator(vectors)
output_dir = os.path.join(WORKLOAD_DIR, options.workload)


@@ -24,7 +24,7 @@
# ---- <- End sub-section
# LOAD from LOCAL - How to load data for the the base table
====
grep1gb
hive-benchmark
----
grep1gb
----
@@ -49,7 +49,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-000
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
====
grep10gb
hive-benchmark
----
grep10gb
----
@@ -75,7 +75,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
====
web
hive-benchmark
----
rankings
----
@@ -94,7 +94,7 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/html1GB/Rankings.dat' OVERWRITE INTO TABLE %(table_name)s;
====
web
hive-benchmark
----
uservisits
----


@@ -1,10 +1,11 @@
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# For details on this file format please see benchmark_schema_template.sql
# For details on this file format please see hive-benchmark_schema_template.sql
====
tpch
----
lineitem
tpch%(scale_factor)s.lineitem
----
CREATE DATABASE IF NOT EXISTS tpch%(scale_factor)s;
CREATE EXTERNAL TABLE %(table_name)s (
L_ORDERKEY INT,
L_PARTKEY INT,
@@ -24,7 +25,7 @@ L_SHIPMODE STRING,
L_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -32,12 +33,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/lineitem.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/lineitem/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
part
tpch%(scale_factor)s.part
----
CREATE EXTERNAL TABLE %(table_name)s (
P_PARTKEY INT,
@@ -52,7 +53,7 @@ P_RETAILPRICE DOUBLE,
P_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -60,12 +61,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/part.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/part/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
partsupp
tpch%(scale_factor)s.partsupp
----
CREATE EXTERNAL TABLE %(table_name)s (
PS_PARTKEY INT,
@@ -75,7 +76,7 @@ PS_SUPPLYCOST DOUBLE,
PS_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -83,12 +84,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/partsupp.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/partsupp/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
supplier
tpch%(scale_factor)s.supplier
----
CREATE EXTERNAL TABLE %(table_name)s (
S_SUPPKEY INT,
@@ -100,7 +101,7 @@ S_ACCTBAL DOUBLE,
S_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -108,12 +109,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/supplier.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/supplier/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
nation
tpch%(scale_factor)s.nation
----
CREATE EXTERNAL TABLE %(table_name)s (
N_NATIONKEY INT,
@@ -122,7 +123,7 @@ N_REGIONKEY INT,
N_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -130,12 +131,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/nation.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/nation/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
region
tpch%(scale_factor)s.region
----
CREATE EXTERNAL TABLE %(table_name)s (
R_REGIONKEY INT,
@@ -143,7 +144,7 @@ R_NAME STRING,
R_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -151,12 +152,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/region.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/region/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
orders
tpch%(scale_factor)s.orders
----
CREATE EXTERNAL TABLE %(table_name)s (
O_ORDERKEY INT,
@@ -170,7 +171,7 @@ O_SHIPPRIORITY INT,
O_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -178,12 +179,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/orders.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/orders/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
customer
tpch%(scale_factor)s.customer
----
CREATE EXTERNAL TABLE %(table_name)s (
C_CUSTKEY INT,
@@ -196,7 +197,7 @@ C_MKTSEGMENT STRING,
C_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -204,12 +205,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/customer.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/customer/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
q2_minimum_cost_supplier_tmp1
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (
s_acctbal double,
@@ -222,27 +223,27 @@ s_address string,
s_phone string,
s_comment string)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q2_minimum_cost_supplier_tmp2
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp2
----
CREATE EXTERNAL TABLE %(table_name)s (
p_partkey int,
ps_min_supplycost double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q7_volume_shipping_tmp
tpch%(scale_factor)s.q7_volume_shipping_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
supp_nation string,
@@ -250,73 +251,73 @@ cust_nation string,
s_nationkey int,
c_nationkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q11_part_tmp
tpch%(scale_factor)s.q11_part_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
ps_partkey int,
part_value double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q11_sum_tmp
tpch%(scale_factor)s.q11_sum_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (total_value double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
revenue
tpch%(scale_factor)s.revenue
----
CREATE EXTERNAL TABLE %(table_name)s (
supplier_no int,
total_revenue double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
max_revenue
tpch%(scale_factor)s.max_revenue
----
CREATE EXTERNAL TABLE %(table_name)s (max_revenue double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
supplier_tmp
tpch%(scale_factor)s.supplier_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (s_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q16_tmp
tpch%(scale_factor)s.q16_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
p_brand string,
@@ -324,94 +325,94 @@ p_type string,
p_size int,
ps_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
lineitem_tmp
tpch%(scale_factor)s.lineitem_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
t_partkey int,
t_avg_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q18_tmp
tpch%(scale_factor)s.q18_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
l_orderkey int,
t_sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp1
tpch%(scale_factor)s.q20_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (p_partkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp2
tpch%(scale_factor)s.q20_tmp2
----
CREATE EXTERNAL TABLE %(table_name)s (
l_partkey int,
l_suppkey int,
sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp3
tpch%(scale_factor)s.q20_tmp3
----
CREATE EXTERNAL TABLE %(table_name)s (
ps_suppkey int,
ps_availqty int,
sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp4
tpch%(scale_factor)s.q20_tmp4
----
CREATE EXTERNAL TABLE %(table_name)s (ps_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q22_customer_tmp1
tpch%(scale_factor)s.q22_customer_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (avg_acctbal double, cust_name_char string)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----


@@ -0,0 +1,2 @@
# Manually Created File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,4 @@
file_format: text
dataset: functional
compression_codec: none
compression_type: none


@@ -0,0 +1,2 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,2 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,3 @@
# Manually created file.
file_format:text, dataset:functional, compression_codec:none, compression_type:none
file_format:seq, dataset:functional, compression_codec:none, compression_type:none

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: functional
dataset: functional
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -0,0 +1,19 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
file_format: seq, dataset: functional, compression_codec: def, compression_type: record
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: record
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record
file_format: seq, dataset: functional, compression_codec: snap, compression_type: block
file_format: seq, dataset: functional, compression_codec: snap, compression_type: record
file_format: rc, dataset: functional, compression_codec: none, compression_type: none
file_format: rc, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
file_format: rc, dataset: functional, compression_codec: snap, compression_type: block
file_format: trevni, dataset: functional, compression_codec: none, compression_type: none
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
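Each line in these vector files is a comma-separated list of key: value pairs describing one (file format, dataset, compression codec, compression type) combination. A short sketch of how such a line could be parsed into a dict; the function name and parsing logic here are assumptions about the consuming test framework, not its actual code:

# Hypothetical parser for a single test vector line like the ones above.
def parse_vector_line(line):
    line = line.strip()
    if not line or line.startswith('#'):
        return None
    # "file_format: seq, dataset: functional, compression_codec: def, compression_type: block"
    return dict(
        (key.strip(), value.strip())
        for key, value in (field.split(':', 1) for field in line.split(','))
    )

print(parse_vector_line(
    "file_format: seq, dataset: functional, compression_codec: def, compression_type: block"))
# -> {'file_format': 'seq', 'dataset': 'functional',
#     'compression_codec': 'def', 'compression_type': 'block'}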

View File

@@ -0,0 +1,8 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record

View File

@@ -1,3 +0,0 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,none,none

View File

@@ -1,19 +0,0 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,none,none
seq,functional,def,block
seq,functional,def,record
seq,functional,gzip,block
seq,functional,gzip,record
seq,functional,bzip,block
seq,functional,bzip,record
seq,functional,snap,block
seq,functional,snap,record
rc,functional,none,none
rc,functional,def,block
rc,functional,gzip,block
rc,functional,bzip,block
rc,functional,snap,block
trevni,functional,none,none
trevni,functional,def,block
trevni,functional,snap,block

View File

@@ -1,8 +0,0 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,def,block
rc,functional,gzip,block
trevni,functional,snap,block
trevni,functional,def,block
rc,functional,bzip,block
seq,functional,bzip,record

View File

@@ -1,9 +1,7 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
text,grep10gb,none,none
text,web,none,none
seq,grep1gb,bzip,none
seq,web,snap,record
seq,web,none,none
rc,grep1gb,def,block
rc,web,none,none
# Manually created file.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: grep1gb,grep10gb,web
dataset: hive-benchmark
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -1,55 +1,19 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
text,grep10gb,none,none
text,web,none,none
seq,grep1gb,none,none
seq,grep1gb,def,block
seq,grep1gb,def,record
seq,grep1gb,gzip,block
seq,grep1gb,gzip,record
seq,grep1gb,bzip,block
seq,grep1gb,bzip,record
seq,grep1gb,snap,block
seq,grep1gb,snap,record
seq,grep10gb,none,none
seq,grep10gb,def,block
seq,grep10gb,def,record
seq,grep10gb,gzip,block
seq,grep10gb,gzip,record
seq,grep10gb,bzip,block
seq,grep10gb,bzip,record
seq,grep10gb,snap,block
seq,grep10gb,snap,record
seq,web,none,none
seq,web,def,block
seq,web,def,record
seq,web,gzip,block
seq,web,gzip,record
seq,web,bzip,block
seq,web,bzip,record
seq,web,snap,block
seq,web,snap,record
rc,grep1gb,none,none
rc,grep1gb,def,block
rc,grep1gb,gzip,block
rc,grep1gb,bzip,block
rc,grep1gb,snap,block
rc,grep10gb,none,none
rc,grep10gb,def,block
rc,grep10gb,gzip,block
rc,grep10gb,bzip,block
rc,grep10gb,snap,block
rc,web,none,none
rc,web,def,block
rc,web,gzip,block
rc,web,bzip,block
rc,web,snap,block
trevni,grep1gb,none,none
trevni,grep1gb,def,block
trevni,grep1gb,snap,block
trevni,grep10gb,none,none
trevni,grep10gb,def,block
trevni,grep10gb,snap,block
trevni,web,none,none
trevni,web,def,block
trevni,web,snap,block
# Generated File.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: record
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block

View File

@@ -1,10 +1,8 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
seq,grep10gb,def,block
rc,web,gzip,block
trevni,web,snap,block
trevni,grep10gb,none,none
rc,grep1gb,bzip,block
seq,web,none,none
text,grep10gb,none,none
text,web,none,none
# Generated File.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record

View File

@@ -0,0 +1,27 @@
====
select count(*) from grep1gb$TABLE
====
select count(field) from grep1gb$TABLE
====
select count(field) from grep1gb$TABLE where field like '%%xyz%%'
====
select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue
from uservisits$TABLE uv join rankings$TABLE r on
(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate
< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1
====
select sourceIP, SUM(adRevenue) FROM uservisits$TABLE GROUP by sourceIP
order by SUM(adRevenue) desc limit 10
====
select pageRank, pageURL from rankings$TABLE where pageRank > 10
order by pageRank limit 100
====
select count(*) from rankings$TABLE where pageRank > 10 && pageRank < 25
====
select avg(adRevenue) from uservisits$TABLE
====
select avg(adRevenue) from uservisits$TABLE
where visitdate > '1999-07-01' and visitdate < '1999-12-31'
====
select count(field) from grep10gb$TABLE where field like '%%xyz%%'
====
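The $TABLE token in these benchmark queries is filled in at run time based on the file format and compression being tested. A hedged sketch of that substitution follows; the empty suffix for plain text and the _format_codec_type naming are assumptions for illustration, not run-benchmark's actual convention:

# Hypothetical illustration of $TABLE substitution in the .test queries.
def expand_query(query, file_format, codec, compression_type):
    if file_format == 'text' and codec == 'none':
        suffix = ''  # assume base text tables keep their plain names
    else:
        suffix = '_%s_%s_%s' % (file_format, codec, compression_type)
    return query.replace('$TABLE', suffix)

print(expand_query("select count(field) from grep1gb$TABLE", 'seq', 'def', 'block'))
# -> select count(field) from grep1gb_seq_def_block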

View File

@@ -11,7 +11,7 @@ select
round(avg(l_extendedprice), 1),
round(avg(l_discount), 1), count(1)
from
lineitem$TABLE
tpch.lineitem$TABLE
where
l_shipdate<='1998-09-02'
group by

View File

@@ -11,12 +11,12 @@ select
c_address,
c_phone,
c_comment
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l.l_orderkey = o.o_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (c.c_nationkey = n.n_nationkey)
where
o.o_orderdate >= '1993-10-01' and

View File

@@ -1,27 +1,27 @@
# Q11 - Important Stock Identification
# NOTE: Alan is re-writing part of this query set
# so it is missing for now
insert overwrite table q11_part_tmp$TABLE
insert overwrite table tpch.q11_part_tmp$TABLE
select ps_partkey, sum(ps_supplycost * ps_availqty) as part_value
from nation$TABLE n
join supplier$TABLE s
from tpch.nation$TABLE n
join tpch.supplier$TABLE s
on s.s_nationkey = n.n_nationkey and n.n_name = 'GERMANY'
join partsupp$TABLE ps
join tpch.partsupp$TABLE ps
on ps.ps_suppkey = s.s_suppkey
group by ps_partkey
---- SETUP
RESET q11_sum_tmp$TABLE
RELOAD q11_sum_tmp$TABLE
RESET tpch.q11_sum_tmp$TABLE
RELOAD tpch.q11_sum_tmp$TABLE
---- RESULTS
---- NUMROWS
29818
====
insert overwrite table q11_sum_tmp$TABLE
insert overwrite table tpch.q11_sum_tmp$TABLE
select sum(part_value) as total_value
from q11_part_tmp$TABLE
from tpch.q11_part_tmp$TABLE
---- SETUP
RESET q11_part_tmp$TABLE
RELOAD q11_part_tmp$TABLE
RESET tpch.q11_part_tmp$TABLE
RELOAD tpch.q11_part_tmp$TABLE
---- RESULTS
---- NUMROWS
1

View File

@@ -15,8 +15,8 @@ sum(case
else 0
end
) as low_line_count
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey and
l.l_commitdate < l.l_receiptdate and
l.l_shipdate < l.l_commitdate)

View File

@@ -6,8 +6,8 @@ from
( select
c_custkey,
count(o_orderkey) as c_count
from orders$TABLE o
right outer join customer$TABLE c
from tpch.orders$TABLE o
right outer join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey and o.o_comment not like '%special%requests%')
group by
c_custkey

View File

@@ -4,8 +4,8 @@ round(100.00 * sum(case when p_type like 'PROMO%' then l_extendedprice*(1-l_disc
else 0.0
end
) / sum(l_extendedprice * (1 - l_discount)), 5) as promo_revenue
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on l.l_partkey = p.p_partkey and
l.l_shipdate >= '1995-09-01' and
l.l_shipdate < '1995-10-01'

View File

@@ -1,24 +1,24 @@
# Q15 - Top Supplier Query
insert overwrite table revenue$TABLE
insert overwrite table tpch.revenue$TABLE
select
l_suppkey as supplier_no,
sum(l_extendedprice * (1 - l_discount)) as total_revenue
from lineitem$TABLE
from tpch.lineitem$TABLE
where l_shipdate >= '1996-01-01' and l_shipdate < '1996-04-01'
group by l_suppkey
---- SETUP
RESET revenue$TABLE
RELOAD revenue$TABLE
RESET tpch.revenue$TABLE
RELOAD tpch.revenue$TABLE
---- RESULTS
---- NUMROWS
10000
====
insert overwrite table max_revenue$TABLE
insert overwrite table tpch.max_revenue$TABLE
select max(total_revenue)
from revenue$TABLE
from tpch.revenue$TABLE
---- SETUP
RESET max_revenue$TABLE
RELOAD max_revenue$TABLE
RESET tpch.max_revenue$TABLE
RELOAD tpch.max_revenue$TABLE
---- RESULTS
---- NUMROWS
1
@@ -30,10 +30,10 @@ select
s_address,
s_phone,
total_revenue
from supplier$TABLE s
join revenue$TABLE r
from tpch.supplier$TABLE s
join tpch.revenue$TABLE r
on (s.s_suppkey = r.supplier_no)
join max_revenue$TABLE m
join tpch.max_revenue$TABLE m
on (r.total_revenue = m.max_revenue)
order by s_suppkey
limit 100

View File

@@ -1,27 +1,27 @@
# Q16 - Parts/Supplier Relation Query
insert overwrite table supplier_tmp$TABLE
insert overwrite table tpch.supplier_tmp$TABLE
select s_suppkey
from supplier$TABLE
from tpch.supplier$TABLE
where not s_comment like '%Customer%Complaints%'
---- SETUP
RESET supplier_tmp$TABLE
RELOAD supplier_tmp$TABLE
RESET tpch.supplier_tmp$TABLE
RELOAD tpch.supplier_tmp$TABLE
---- RESULTS
---- NUMROWS
9996
====
insert overwrite table q16_tmp$TABLE
insert overwrite table tpch.q16_tmp$TABLE
select p_brand, p_type, p_size, ps_suppkey
from partsupp$TABLE ps
join part$TABLE p
from tpch.partsupp$TABLE ps
join tpch.part$TABLE p
on p.p_partkey = ps.ps_partkey and
p.p_brand <> 'Brand#45' and
not p.p_type like 'MEDIUM POLISHED%'
join supplier_tmp$TABLE s
join tpch.supplier_tmp$TABLE s
on ps.ps_suppkey = s.s_suppkey
---- SETUP
RESET q16_tmp$TABLE
RELOAD q16_tmp$TABLE
RESET tpch.q16_tmp$TABLE
RELOAD tpch.q16_tmp$TABLE
---- RESULTS
---- NUMROWS
741971
@@ -29,7 +29,7 @@ RELOAD q16_tmp$TABLE
# Modifications: Added limit, removed 'DISTINCT' from count due to IMP-132
select p_brand, p_type, p_size, count(ps_suppkey) as supplier_cnt
from
( select * from q16_tmp$TABLE
( select * from tpch.q16_tmp$TABLE
where p_size = 49 or p_size = 14 or
p_size = 23 or p_size = 45 or
p_size = 19 or p_size = 3 or

View File

@@ -1,11 +1,11 @@
# Q17 - Small-Quantity-Order Revenue Query
insert overwrite table lineitem_tmp$TABLE
insert overwrite table tpch.lineitem_tmp$TABLE
select l_partkey as t_partkey, 0.2 * avg(l_quantity) as t_avg_quantity
from lineitem$TABLE
from tpch.lineitem$TABLE
group by l_partkey
---- SETUP
RESET lineitem_tmp$TABLE
RELOAD lineitem_tmp$TABLE
RESET tpch.lineitem_tmp$TABLE
RELOAD tpch.lineitem_tmp$TABLE
---- RESULTS
---- NUMROWS
200000
@@ -13,10 +13,10 @@ RELOAD lineitem_tmp$TABLE
# Modifications: Converted selects from multiple tables to joins,
# added round() call, removed subquery
select round(sum(l_extendedprice) / 7.0, 5) as avg_yearly
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on (p.p_partkey = l.l_partkey)
join lineitem_tmp$TABLE lt
join tpch.lineitem_tmp$TABLE lt
on (lt.t_partkey = p.p_partkey)
where
p.p_brand = 'Brand#23' and

View File

@@ -1,11 +1,11 @@
# Q18 - Large Value Customer Query
insert overwrite table q18_tmp$TABLE
insert overwrite table tpch.q18_tmp$TABLE
select l_orderkey, sum(l_quantity) as t_sum_quantity
from lineitem$TABLE
from tpch.lineitem$TABLE
group by l_orderkey
---- SETUP
RESET q18_tmp$TABLE
RELOAD q18_tmp$TABLE
RESET tpch.q18_tmp$TABLE
RELOAD tpch.q18_tmp$TABLE
---- RESULTS
---- NUMROWS
1500000
@@ -19,12 +19,12 @@ select
o_orderdate,
cast(o_totalprice as bigint) as total_price_bigint,
round(sum(l_quantity), 5)
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey)
join q18_tmp$TABLE t
join tpch.q18_tmp$TABLE t
on (o.o_orderkey = t.l_orderkey and t.t_sum_quantity > 300)
group by
c_name,

View File

@@ -1,8 +1,8 @@
# Q19 - Discounted Revenue Query
# Modifications: Added round() calls
select round(sum(l_extendedprice * (1 - l_discount) ), 5) as revenue
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on p.p_partkey = l.l_partkey
where
(

View File

@@ -1,5 +1,5 @@
# Q2 - Minimum Cost Supplier Query
insert overwrite table q2_minimum_cost_supplier_tmp1$TABLE
insert overwrite table tpch.q2_minimum_cost_supplier_tmp1$TABLE
select
s.s_acctbal,
s.s_name,
@@ -10,31 +10,31 @@ select
s.s_address,
s.s_phone,
s.s_comment
from partsupp$TABLE ps
join part$TABLE p
from tpch.partsupp$TABLE ps
join tpch.part$TABLE p
on (p.p_partkey = ps.ps_partkey and p.p_size = 15 and p.p_type like '%BRASS')
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s.s_suppkey = ps.ps_suppkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
join region$TABLE r
join tpch.region$TABLE r
on (n.n_regionkey = r.r_regionkey and r.r_name = 'EUROPE')
---- SETUP
RESET q2_minimum_cost_supplier_tmp1$TABLE
RELOAD q2_minimum_cost_supplier_tmp1$TABLE
RESET tpch.q2_minimum_cost_supplier_tmp1$TABLE
RELOAD tpch.q2_minimum_cost_supplier_tmp1$TABLE
---- RESULTS
---- NUMROWS
642
====
insert overwrite table q2_minimum_cost_supplier_tmp2$TABLE
insert overwrite table tpch.q2_minimum_cost_supplier_tmp2$TABLE
select
p_partkey,
min(ps_supplycost)
from q2_minimum_cost_supplier_tmp1$TABLE
from tpch.q2_minimum_cost_supplier_tmp1$TABLE
group by p_partkey
---- SETUP
RESET q2_minimum_cost_supplier_tmp2$TABLE
RELOAD q2_minimum_cost_supplier_tmp2$TABLE
RESET tpch.q2_minimum_cost_supplier_tmp2$TABLE
RELOAD tpch.q2_minimum_cost_supplier_tmp2$TABLE
---- RESULTS
---- NUMROWS
460
@@ -49,8 +49,8 @@ select
t1.s_address,
t1.s_phone,
t1.s_comment
from q2_minimum_cost_supplier_tmp1$TABLE t1
join q2_minimum_cost_supplier_tmp2$TABLE t2
from tpch.q2_minimum_cost_supplier_tmp1$TABLE t1
join tpch.q2_minimum_cost_supplier_tmp2$TABLE t2
on (t1.p_partkey = t2.p_partkey and t1.ps_supplycost = t2.ps_min_supplycost)
order by
s_acctbal desc,

View File

@@ -1,21 +1,21 @@
# Q20 - Potential Part Promotion Query
insert overwrite table q20_tmp1$TABLE
insert overwrite table tpch.q20_tmp1$TABLE
select distinct p_partkey
from part$TABLE
from tpch.part$TABLE
where p_name like 'forest%'
---- SETUP
RESET q20_tmp1$TABLE
RELOAD q20_tmp1$TABLE
RESET tpch.q20_tmp1$TABLE
RELOAD tpch.q20_tmp1$TABLE
---- RESULTS
---- NUMROWS
2127
====
insert overwrite table q20_tmp2$TABLE
insert overwrite table tpch.q20_tmp2$TABLE
select
l_partkey,
l_suppkey,
0.5 * sum(l_quantity)
from lineitem$TABLE
from tpch.lineitem$TABLE
where
l_shipdate >= '1994-01-01' and
l_shipdate < '1995-01-01'
@@ -23,52 +23,52 @@ group by
l_partkey,
l_suppkey
---- SETUP
RESET q20_tmp2$TABLE
RELOAD q20_tmp2$TABLE
RESET tpch.q20_tmp2$TABLE
RELOAD tpch.q20_tmp2$TABLE
---- RESULTS
---- NUMROWS
543210
====
insert overwrite table q20_tmp3$TABLE
insert overwrite table tpch.q20_tmp3$TABLE
select
ps_suppkey,
ps_availqty,
sum_quantity
from partsupp$TABLE ps
join q20_tmp2$TABLE t2
from tpch.partsupp$TABLE ps
join tpch.q20_tmp2$TABLE t2
on (ps.ps_partkey = t2.l_partkey and ps.ps_suppkey = t2.l_suppkey)
join q20_tmp1$TABLE t1
join tpch.q20_tmp1$TABLE t1
on (ps.ps_partkey = t1.p_partkey)
---- SETUP
RESET q20_tmp3$TABLE
RELOAD q20_tmp3$TABLE
RESET tpch.q20_tmp3$TABLE
RELOAD tpch.q20_tmp3$TABLE
---- RESULTS
---- NUMROWS
5843
====
# Modified to use subquery to work around IMP-127
insert overwrite table q20_tmp4$TABLE
insert overwrite table tpch.q20_tmp4$TABLE
select a.ps_suppkey
from (select
ps_suppkey,
count(1) from q20_tmp3$TABLE
count(1) from tpch.q20_tmp3$TABLE
where ps_availqty > sum_quantity
group by ps_suppkey
) a
---- SETUP
RESET q20_tmp4$TABLE
RELOAD q20_tmp4$TABLE
RESET tpch.q20_tmp4$TABLE
RELOAD tpch.q20_tmp4$TABLE
---- RESULTS
====
# Modifications: Added limit
select
s_name,
s_address
from supplier$TABLE s
join nation$TABLE n
from tpch.supplier$TABLE s
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey and
n.n_name = 'CANADA')
join q20_tmp4$TABLE t4
join tpch.q20_tmp4$TABLE t4
on (s.s_suppkey = t4.ps_suppkey)
order by
s_name

View File

@@ -6,16 +6,16 @@
select
s_name,
count(*) as numwait
from lineitem$TABLE l1
join supplier$TABLE s
from tpch.lineitem$TABLE l1
join tpch.supplier$TABLE s
on (s.s_suppkey = l1.l_suppkey)
join orders$TABLE o
join tpch.orders$TABLE o
on (o.o_orderkey = l1.l_orderkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
left semi join lineitem$TABLE l2
left semi join tpch.lineitem$TABLE l2
on (l2.l_orderkey = l1.l_orderkey)
left outer join lineitem$TABLE l3
left outer join tpch.lineitem$TABLE l3
on (l3.l_orderkey = l1.l_orderkey and
l3.l_receiptdate > l3.l_commitdate
)

View File

@@ -3,11 +3,11 @@
# a constant value ('C') so that we can do a join between this table
# in the main query. This was needed because we only support equi-joins
# and had to have a column to join on.
insert overwrite table q22_customer_tmp1$TABLE
insert overwrite table tpch.q22_customer_tmp1$TABLE
select
avg(c_acctbal) avg_acctbal,
substr(c_name, 1, 1) as cust_name_char
from customer$TABLE c
from tpch.customer$TABLE c
where
c.c_acctbal > 0.00 and
(substr(c.c_phone, 1, 2) = '13' or
@@ -20,8 +20,8 @@ where
group by
substr(c_name, 1, 1)
---- SETUP
RESET q22_customer_tmp1$TABLE
RELOAD q22_customer_tmp1$TABLE
RESET tpch.q22_customer_tmp1$TABLE
RELOAD tpch.q22_customer_tmp1$TABLE
---- RESULTS
---- NUMROWS
1
@@ -36,10 +36,10 @@ select
substring(c_phone, 1, 2) as cntrycode,
count(*) as numcust,
round(sum(c_acctbal), 4) as totacctbal
from customer$TABLE c
join q22_customer_tmp1$TABLE ct
from tpch.customer$TABLE c
join tpch.q22_customer_tmp1$TABLE ct
on (substr(c.c_name, 1, 1) = ct.cust_name_char)
left outer join orders$TABLE o
left outer join tpch.orders$TABLE o
on (o.o_custkey = c.c_custkey)
where
o_custkey is null and

View File

@@ -5,10 +5,10 @@ select
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue,
o_orderdate,
o_shippriority
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l.l_orderkey = o.o_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey)
where
o_orderdate < '1995-03-15' and

View File

@@ -5,8 +5,8 @@
select
o_orderpriority,
count(distinct l_orderkey) as order_count
from lineitem$TABLE l
inner join orders$TABLE o
from tpch.lineitem$TABLE l
inner join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey and
l.l_commitdate < l.l_receiptdate)
where

View File

@@ -4,16 +4,16 @@
select
n_name,
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l_orderkey = o_orderkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (l_suppkey = s_suppkey)
join customer$TABLE
join tpch.customer$TABLE
on (c_nationkey = s_nationkey and c_custkey = o_custkey)
join nation$TABLE
join tpch.nation$TABLE
on (s_nationkey = n_nationkey)
join region$TABLE
join tpch.region$TABLE
on (n_regionkey = r_regionkey)
where
r_name = 'ASIA'

View File

@@ -1,7 +1,7 @@
# Q6 - Forecasting Revenue Change Query
# Modifications: Added round() call
select round(sum(l_extendedprice * l_discount), 5) as revenue
from lineitem$TABLE
from tpch.lineitem$TABLE
where l_shipdate >= '1994-01-01' and
l_shipdate < '1995-01-01' and
l_discount >= 0.05 and

View File

@@ -5,20 +5,20 @@ select
year(o_orderdate) as o_year,
round(sum(case when n2.n_name = 'BRAZIL' then l_extendedprice * (1 - l_discount)
else 0 end) / sum(l_extendedprice * (1 - l_discount)), 5) as mkt_share
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l_orderkey = o_orderkey)
join part$TABLE p
join tpch.part$TABLE p
on (p_partkey = l_partkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s_suppkey = l_suppkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (o_custkey = c_custkey)
join nation$TABLE n1
join tpch.nation$TABLE n1
on (c_nationkey = n1.n_nationkey)
join region$TABLE r
join tpch.region$TABLE r
on (n1.n_regionkey = r_regionkey)
join nation$TABLE n2
join tpch.nation$TABLE n2
on (s_nationkey = n2.n_nationkey)
where
r_name = 'AMERICA' and

View File

@@ -6,16 +6,16 @@ select
year(o.o_orderdate) as o_year,
round(sum(l.l_extendedprice * (1 - l.l_discount) -
ps.ps_supplycost * l.l_quantity), 1) as sum_profit
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on (p.p_partkey = l.l_partkey)
join orders$TABLE o
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey)
join partsupp$TABLE ps
join tpch.partsupp$TABLE ps
on (ps.ps_suppkey = l.l_suppkey and ps.ps_partkey = l.l_partkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s.s_suppkey = l.l_suppkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
where
p.p_name like '%green%'

View File

@@ -1,2 +1,2 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Manually created file.
file_format:text, dataset:tpch, compression_codec:none, compression_type:none

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: tpch
dataset: tpch
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -1,2 +1,5 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block

View File

@@ -1,2 +1,4 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block