Mirror of https://github.com/apache/impala.git (synced 2025-12-19 18:12:08 -05:00)
Update benchmark tests to run against generic workload, data loading with scale factor, +more
This change updates the run-benchmark script so it can target one or more workloads. Benchmarks can now be run like:

  ./run-benchmark --workloads=hive-benchmark,tpch

We look up each workload in the workloads directory, read the associated query .test files, and execute them. To ensure queries are not duplicated between the benchmark and query tests, I moved all existing queries (under fe/src/test/resources/*) to the workloads directory. You do NOT need to look through all the .test files; I've just moved them. The one new file is 'hive-benchmark.test', which contains the Hive benchmark queries.

Also added support for generating schemas for different scale factors, as well as executing against these scale factors. For example, say we have a dataset with a scale factor called "SF3". We would first generate the schema using:

  ./generate-schema-statements.py --workload=<workload> --scale_factor="SF3"

This creates tables whose names are unique from the other scale factors. Run the generated .sql file to load the data. Alternatively, the data can be loaded by running a new Python script:

  ./bin/load-data.py -w <workload1>,<workload2> -e <exploration strategy> -s [scale factor]

For example:

  ./bin/load-data.py -w tpch -e core -s SF3

Then run against this scale factor:

  ./run-benchmark --workloads=<workload> --scale_factor=SF3

This changeset also includes a few other minor tweaks to some of the test scripts.

Change-Id: Ife8a8d91567d75c9612be37bec96c1e7780f50d6
committed by Henry Robinson
parent 81d54e85e5
commit 04edc8f534
5
.gitignore
vendored
@@ -7,9 +7,8 @@ cscope.out
org.eclipse.jdt.core.prefs
benchmark_results.csv
reference_benchmark_results.csv
testdata/data/test-warehouse
testdata/bin/create-*-generated.sql
testdata/bin/load-*-generated.sql
load-trevni-*-generated.sh
load-*-generated.sql

pprof.out
@@ -1,11 +1,8 @@
#!/usr/bin/env bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.

bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
. "$bin"/impala-config.sh

set -e
set -u

echo "Copying data files from the share. If the file already exists locally, the files"\
     "will not be copied. It's not check summing the files or anything like that, if"\

@@ -36,6 +36,8 @@ fi

export IMPALA_FE_DIR=$IMPALA_HOME/fe
export IMPALA_BE_DIR=$IMPALA_HOME/be
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
export PATH=$IMPALA_HOME/bin:$PATH
130
bin/load-data.py
Executable file
@@ -0,0 +1,130 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to load the proper datasets for the specified workloads. It loads
# all data via Hive except for Trevni data which needs to be loaded via Impala.
import collections
import os
import re
import subprocess
import sys
import tempfile
import time
from itertools import product
from optparse import OptionParser

parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
                  help="The exploration strategy for schema gen: 'core', "\
                       "'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
                  help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workloads", dest="workloads",
                  help="Comma-separated list of workloads to load data for. If 'all' is "\
                       "specified then data for all workloads is loaded.")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
                  help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
                  default=False, help='Skips HDFS exists check and reloads all tables')

(options, args) = parser.parse_args()

WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')

GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
                      "--workload=%s --scale_factor=%s --verbose"
HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/hive')
HIVE_ARGS = "-hiveconf hive.root.logger=WARN,console -v"

def available_workloads(workload_dir):
  return [subdir for subdir in os.listdir(workload_dir)
          if os.path.isdir(os.path.join(workload_dir, subdir))]

def validate_workloads(all_workloads, workloads):
  for workload in workloads:
    if workload not in all_workloads:
      print 'Workload \'%s\' not found in workload directory' % workload
      print 'Available workloads: ' + ', '.join(all_workloads)
      sys.exit(1)

def exec_hive_query_from_file(file_name):
  hive_cmd = "%s %s -f %s" % (HIVE_CMD, HIVE_ARGS, file_name)
  print 'Executing Hive Command: ' + hive_cmd
  ret_val = subprocess.call(hive_cmd, shell = True)
  if ret_val != 0:
    print 'Error executing file from Hive: ' + file_name
    sys.exit(ret_val)

def exec_bash_script(file_name):
  bash_cmd = "bash %s" % file_name
  print 'Executing Bash Command: ' + bash_cmd
  ret_val = subprocess.call(bash_cmd, shell = True)
  if ret_val != 0:
    print 'Error bash script: ' + file_name
    sys.exit(ret_val)

def generate_schema_statements(workload):
  generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
                                        options.scale_factor)
  if options.force_reload:
    generate_cmd += " --force_reload"
  if options.hive_warehouse_dir is not None:
    generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir
  print 'Executing Generate Schema Command: ' + generate_cmd
  ret_val = subprocess.call(os.path.join(TESTDATA_BIN_DIR, generate_cmd), shell = True)
  if ret_val != 0:
    print 'Error generating schema statements for workload: ' + workload
    sys.exit(ret_val)

def get_dataset_for_workload(workload):
  dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
                                     '%s_dimensions.csv' % workload)
  if not os.path.isfile(dimension_file_name):
    print 'Dimension file not found: ' + dimension_file_name
    sys.exit(1)
  with open(dimension_file_name, 'rb') as input_file:
    match = re.search('dataset:\s*(\w+)', input_file.read())
    if match:
      return match.group(1)
    else:
      print 'Dimension file does not contain dataset for workload \'%s\'' % (workload)
      sys.exit(1)

all_workloads = available_workloads(WORKLOAD_DIR)
workloads = []
if options.workloads is None:
  print "At least one workload name must be specified."
  parser.print_help()
  sys.exit(1)
elif options.workloads == 'all':
  print 'Loading data for all workloads.'
  workloads = all_workloads
else:
  workloads = options.workloads.split(",")
  validate_workloads(all_workloads, workloads)

print 'Starting data load for the following workloads: ' + ', '.join(workloads)

loading_time_map = collections.defaultdict(float)
for workload in workloads:
  start_time = time.time()
  dataset = get_dataset_for_workload(workload)
  print "Dataset for workload '%s' is '%s'" % (workload, dataset)
  dataset_dir = os.path.join(DATASET_DIR, dataset)
  os.chdir(dataset_dir)
  generate_schema_statements(workload)
  exec_hive_query_from_file(os.path.join(dataset_dir,
      'load-%s-%s-generated.sql' % (workload, options.exploration_strategy)))

  exec_bash_script(os.path.join(dataset_dir,
      'load-trevni-%s-%s-generated.sh' % (workload, options.exploration_strategy)))
  loading_time_map[workload] = time.time() - start_time

total_time = 0.0
for workload, load_time in loading_time_map.iteritems():
  total_time += load_time
  print 'Data loading for workload \'%s\' completed in: %.2fs'\
      % (workload, load_time)
print 'Total load time: %.2fs\n' % total_time
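Editorial note, not part of the diff: a minimal sketch of the workload-to-dataset lookup that get_dataset_for_workload() above performs. The tpch_dimensions.csv contents shown here are hypothetical; the real dimension files live under testdata/workloads/<workload>/ and are not included in this change.

import re

# Hypothetical contents of testdata/workloads/tpch/tpch_dimensions.csv. Each line is
# '<dimension name>: value1, value2, ...'; load-data.py only cares about 'dataset'.
dimension_file_contents = """
file_format: text, seq
dataset: tpch
compression_codec: none, snap
compression_type: none, block
"""

# Same pattern get_dataset_for_workload() uses to find the dataset directory
# under testdata/datasets/.
match = re.search(r'dataset:\s*(\w+)', dimension_file_contents)
print(match.group(1) if match else 'no dataset found')  # -> tpch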
@@ -1,77 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
#
|
||||
# This script creates schema and loads data into hive for running benchmarks and
|
||||
# other tests. Using this script requires passing in two parameters:
|
||||
# The first is the data set type (benchmark, tpch). This will load the appropriate
|
||||
# collection of data sets for the run type.
|
||||
# The second is the exploration strategy. This determines the different combinations
|
||||
# of file format, compression, etc that will be created and loaded. 'Core' defines
|
||||
# a basic set of combinations. If 'pairwise' is specified the pairwise combinations
|
||||
# of workload # + file format + compression will be loaded. If 'exhaustive' is
|
||||
# passed as an argument the exhaustive set of combinations will be loaded.
|
||||
# TODO: Rewrite this script in python and detect and load workloads by enumerating
|
||||
# the workloads directory.
|
||||
|
||||
exploration_strategy=
|
||||
data_set_type=
|
||||
|
||||
if [ $1 = "hive-benchmark" ]; then
|
||||
data_set_type=$1
|
||||
elif [ $1 = "functional" ]; then
|
||||
data_set_type=$1
|
||||
elif [ $1 = "tpch" ]; then
|
||||
data_set_type=$1
|
||||
elif [ $1 = "query-test" ]; then
|
||||
data_set_type="tpch functional"
|
||||
elif [ $1 = "all" ]; then
|
||||
data_set_type="hive-benchmark tpch functional"
|
||||
else
|
||||
echo "Invalid run type: $1. Valid values are 'all, query-test,"\
|
||||
"functional, tpch, hive-benchmark'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $2 = "core" -o $2 = "pairwise" -o $2 = "exhaustive" ]; then
|
||||
exploration_strategy=$2
|
||||
else
|
||||
echo "Invalid exploration strategy: $2. Valid values are 'core, pairwise, exhaustive'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
bin=`dirname "$0"`
|
||||
bin=`cd "$bin"; pwd`
|
||||
. "$bin"/impala-config.sh
|
||||
|
||||
set -e
|
||||
|
||||
WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
|
||||
DATASET_DIR=$IMPALA_HOME/testdata/datasets
|
||||
BIN_DIR=$IMPALA_HOME/testdata/bin
|
||||
|
||||
function execute_hive_query_from_file {
|
||||
hive_args="-hiveconf hive.root.logger=WARN,console -v -f"
|
||||
"$HIVE_HOME/bin/hive" $hive_args $1
|
||||
if [ $? != 0 ]; then
|
||||
echo LOAD OF $1 FAILED
|
||||
exit -1
|
||||
fi
|
||||
}
|
||||
|
||||
for ds in $data_set_type
|
||||
do
|
||||
SCRIPT_DIR=$DATASET_DIR/$ds
|
||||
pushd $SCRIPT_DIR
|
||||
$BIN_DIR/generate_schema_statements.py --exploration_strategy ${exploration_strategy}\
|
||||
--workload=${ds} --verbose
|
||||
execute_hive_query_from_file \
|
||||
"$SCRIPT_DIR/load-${ds}-${exploration_strategy}-generated.sql"
|
||||
bash $SCRIPT_DIR/load-trevni-${ds}-${exploration_strategy}-generated.sh
|
||||
popd
|
||||
done
|
||||
|
||||
# TODO: Temporarily disable block id generation for everything except benchmark runs
|
||||
# due to IMP-134
|
||||
if [ $1 = "hive-benchmark" ]; then
|
||||
$IMPALA_HOME/testdata/bin/generate-block-ids.sh
|
||||
fi
|
||||
@@ -47,13 +47,14 @@ COLUMN_WIDTH = 18
|
||||
TOTAL_WIDTH = 122 if options.verbose else 90
|
||||
|
||||
# These are the indexes in the input row for each column value
|
||||
QUERY_IDX = 0
|
||||
FILE_FORMAT_IDX = 1
|
||||
COMPRESSION_IDX = 2
|
||||
IMPALA_AVG_IDX = 3
|
||||
IMPALA_STDDEV_IDX = 4
|
||||
HIVE_AVG_IDX = 5
|
||||
HIVE_STDDEV_IDX = 6
|
||||
WORKLOAD_IDX = 0
|
||||
QUERY_IDX = 1
|
||||
FILE_FORMAT_IDX = 2
|
||||
COMPRESSION_IDX = 3
|
||||
IMPALA_AVG_IDX = 4
|
||||
IMPALA_STDDEV_IDX = 5
|
||||
HIVE_AVG_IDX = 6
|
||||
HIVE_STDDEV_IDX = 7
|
||||
|
||||
# Formats a string so that is is wrapped across multiple lines with no single line
|
||||
# being longer than the given width
|
||||
@@ -91,7 +92,8 @@ def find_matching_row_in_reference_results(search_row, reference_results):
|
||||
for row in reference_results:
|
||||
if (row[QUERY_IDX] == search_row[QUERY_IDX] and
|
||||
row[FILE_FORMAT_IDX] == search_row[FILE_FORMAT_IDX] and
|
||||
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX]):
|
||||
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX] and
|
||||
row[WORKLOAD_IDX] == search_row[WORKLOAD_IDX]):
|
||||
return row
|
||||
return None
|
||||
|
||||
@@ -117,7 +119,7 @@ def print_table(results, verbose, reference_results = None):
|
||||
print build_padded_row_string(table_header, COLUMN_WIDTH)
|
||||
print "-" * TOTAL_WIDTH
|
||||
for row in group:
|
||||
full_row = row[1:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
|
||||
full_row = row[2:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
|
||||
if not verbose:
|
||||
del full_row[HIVE_AVG_IDX - 1]
|
||||
del full_row[HIVE_STDDEV_IDX - 2]
|
||||
@@ -193,7 +195,6 @@ def read_csv_result_file(file_name):
|
||||
results.append(row)
|
||||
return results
|
||||
|
||||
|
||||
reference_results = []
|
||||
results = []
|
||||
if os.path.isfile(options.result_file):
|
||||
@@ -1,11 +1,10 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
#
|
||||
# This script should be used to benchmark queries. It can either run in batch mode, in
|
||||
# which case it will run the set of hive benchmark queries or to run a single query. In
|
||||
# either case, it will first try to warm the buffer cache before running the query
|
||||
# multiple times. There are command line options to control how many times to prerun the
|
||||
# query for the buffer cache as well as the number of iterations.
|
||||
# This script is used to run benchmark queries. It runs the set of queries specified in the
|
||||
# given workload(s) under <workload name>/queries. This script will first try to warm the
|
||||
# buffer cache before running the query. There is a command line option to control how
|
||||
# many iterations to run each query.
|
||||
#
|
||||
# By default, the script will have minimal output. Verbose output can be turned on with
|
||||
# the -v option which will output the normal query output. In addition, the -p option
|
||||
@@ -15,7 +14,7 @@
|
||||
# The script parses for output in the specific format in the regex below (result_regex).
|
||||
# This is not very robust but probably okay for this script.
|
||||
#
|
||||
# The planservice needs to be running before this script.
|
||||
# The planservice or ImpalaD needs to be running before executing any workload.
|
||||
# Run with the --help option to see the arguments.
|
||||
import collections
|
||||
import csv
|
||||
@@ -36,18 +35,17 @@ parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
|
||||
default = False, help="If set, outputs all benchmark diagnostics.")
|
||||
parser.add_option("--remote", dest="remote", action="store_true",
|
||||
default = False, help="Set to true if running on remote cluster.")
|
||||
parser.add_option("-q", "--query", dest="query", default = "",
|
||||
help="Query to run. If none specified, runs all queries.")
|
||||
parser.add_option("--iterations", dest="iterations", default="3",
|
||||
help="Number of times to run the query. Only to be used with -q")
|
||||
parser.add_option("--prime_cache", dest="prime_cache", default= True,
|
||||
help="Whether or not to prime the buffer cache. Only to be "\
|
||||
"used with -q")
|
||||
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
|
||||
help="The exploration strategy to use for running benchmark: 'core', "\
|
||||
"'pairwise', or 'exhaustive'")
|
||||
parser.add_option("-w", "--workloads", dest="workloads", default="hive-benchmark",
|
||||
help="The workload(s) to execute in a comma-separated list format. "\
|
||||
"Some valid workloads: 'hive-benchmark', 'tpch', ...")
|
||||
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
|
||||
help="The dataset scale factor to run the workload against.")
|
||||
parser.add_option("--query_cmd", dest="query_cmd",
|
||||
default='build/release/service/runquery -profile_output_file=""',
|
||||
default=os.path.join(os.environ['IMPALA_HOME'],
|
||||
'be/build/release/service/runquery') + ' -profile_output_file=""',
|
||||
help="The command to use for executing queries")
|
||||
parser.add_option("--compare_with_hive", dest="compare_with_hive", action="store_true",
|
||||
default= False, help="Run all queries using Hive as well as Impala")
|
||||
@@ -56,12 +54,21 @@ parser.add_option("--results_csv_file", dest="results_csv_file",
|
||||
help="The output file where benchmark results are saved")
|
||||
parser.add_option("--hive_cmd", dest="hive_cmd", default="hive -e",
|
||||
help="The command to use for executing hive queries")
|
||||
parser.add_option("-i", "--iterations", dest="iterations", default="5",
|
||||
help="Number of times to run each query.")
|
||||
parser.add_option("--prime_cache", dest="prime_cache", default= True,
|
||||
help="Whether or not to prime the buffer cache. Only to be "\
|
||||
"used with -q")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
profile_output_file = 'build/release/service/profile.tmp'
|
||||
gprof_cmd = 'google-pprof --text build/release/service/runquery %s | head -n 60'
|
||||
prime_cache_cmd = os.environ['IMPALA_HOME'] + "/testdata/bin/cache_tables.py -q \"%s\""
|
||||
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
|
||||
profile_output_file = os.path.join(os.environ['IMPALA_HOME'],
|
||||
'be/build/release/service/profile.tmp')
|
||||
|
||||
gprof_cmd = 'google-pprof --text ' + options.query_cmd + ' %s | head -n 60'
|
||||
prime_cache_cmd = os.path.join(os.environ['IMPALA_HOME'],
|
||||
"testdata/bin/cache_tables.py") + " -q \"%s\""
|
||||
result_single_regex = 'returned (\d*) rows? in (\d*).(\d*) s'
|
||||
result_multiple_regex = 'returned (\d*) rows? in (\d*).(\d*) s with stddev (\d*).(\d*)'
|
||||
hive_result_regex = 'Time taken: (\d*).(\d*) seconds'
|
||||
@@ -80,12 +87,14 @@ class QueryExecutionResult:
|
||||
self.stddev = stddev
|
||||
|
||||
class QueryExecutionDetail:
|
||||
def __init__(self, file_format, compression, impala_execution_result,
|
||||
hive_execution_result):
|
||||
def __init__(self, workload, file_format, compression_codec, compression_type,
|
||||
impala_execution_result, hive_execution_result):
|
||||
self.workload = workload
|
||||
self.file_format = file_format
|
||||
self.compression_codec = compression_codec
|
||||
self.compression_type = compression_type
|
||||
self.impala_execution_result = impala_execution_result
|
||||
self.hive_execution_result = hive_execution_result
|
||||
self.file_format = file_format
|
||||
self.compression = compression
|
||||
|
||||
# Parse for the tables used in this query
|
||||
def parse_tables(query):
|
||||
@@ -246,8 +255,14 @@ def run_query(query, prime_buffer_cache, iterations):
|
||||
execution_result = QueryExecutionResult(str(avg_time), str(stddev))
|
||||
return [output, execution_result]
|
||||
|
||||
def choose_input_vector_file_name(exploration_strategy):
|
||||
return "hive-benchmark_%s.csv" % exploration_strategy
|
||||
def vector_file_name(workload, exploration_strategy):
|
||||
return "%s_%s.csv" % (workload, exploration_strategy)
|
||||
|
||||
# Gets the name of the database to use for the specified workload and scale factor.
|
||||
def database_name_to_use(workload, scale_factor):
|
||||
if workload == 'tpch':
|
||||
return '%s%s.' % (workload, scale_factor)
|
||||
return ''
|
||||
|
||||
def build_table_suffix(file_format, codec, compression_type):
|
||||
if file_format == 'text' and codec == 'none':
|
||||
@@ -259,54 +274,27 @@ def build_table_suffix(file_format, codec, compression_type):
|
||||
else:
|
||||
return '_%s_%s' % (file_format, codec)
|
||||
|
||||
def build_query(query_format_string, exploration_strategy, data_set,
|
||||
file_format, codec, compression_type):
|
||||
def build_query(query_format_string, file_format, codec, compression_type,
|
||||
workload, scale_factor):
|
||||
database_name = database_name_to_use(workload, scale_factor)
|
||||
table_suffix = build_table_suffix(file_format, codec, compression_type)
|
||||
return query_format_string % {'table_suffix': table_suffix}
|
||||
# $TABLE is used as a token for table suffix in the queries. Here we insert the proper
|
||||
# database name based on the workload and query.
|
||||
return re.sub('(\w+\.){0,1}(?P<table_name>\w+)\$TABLE', '%s%s%s' %\
|
||||
(database_name, r'\g<table_name>', table_suffix), query_format_string)
|
||||
|
||||
def read_vector_file(file_name):
|
||||
if not os.path.isfile(file_name):
|
||||
print 'Cannot find vector file: ' + file_name
|
||||
sys.exit(1)
|
||||
|
||||
vector_values = []
|
||||
with open(file_name, 'rb') as vector_file:
|
||||
return [line.strip().split(',')
|
||||
for line in vector_file.readlines() if not line.startswith('#')]
|
||||
|
||||
os.chdir(os.environ['IMPALA_BE_DIR'])
|
||||
|
||||
# This table contains a hash of dataset -> [query, numbers of times to prime buffer cache,
|
||||
# number of iterations]. Queries should be grouped by the data they touch. This
|
||||
# eliminates the need for the buffer cache priming iterations.
|
||||
# TODO: it would be good if this table also contained the expected numbers and
|
||||
# automatically flag regressions. How do we reconcile the fact we are running on
|
||||
# different machines?
|
||||
queries = {'grep1gb': [
|
||||
["select count(*) from grep1gb%(table_suffix)s", 1, 5],
|
||||
["select count(field) from grep1gb%(table_suffix)s", 0, 5],
|
||||
["select count(field) from grep1gb%(table_suffix)s where field like '%%xyz%%'", 0, 5]
|
||||
],
|
||||
|
||||
'web': [
|
||||
["select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue "\
|
||||
"from uservisits%(table_suffix)s uv join rankings%(table_suffix)s r on "\
|
||||
"(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate "\
|
||||
"< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1", 1, 5],
|
||||
["select sourceIP, SUM(adRevenue) FROM uservisits%(table_suffix)s GROUP by sourceIP "\
|
||||
"order by SUM(adRevenue) desc limit 10", 1, 5],
|
||||
["select pageRank, pageURL from rankings%(table_suffix)s where pageRank > 10 "\
|
||||
"order by pageRank limit 100", 1, 5],
|
||||
["select count(*) from rankings%(table_suffix)s where "\
|
||||
"pageRank > 10 && pageRank < 25", 1, 5],
|
||||
["select avg(adRevenue) from uservisits%(table_suffix)s", 1, 5],
|
||||
["select avg(adRevenue) from uservisits%(table_suffix)s "\
|
||||
"where visitdate > '1999-07-01' and visitdate < '1999-12-31'", 1, 5],
|
||||
],
|
||||
|
||||
'grep10gb': [
|
||||
["select count(field) from grep10gb%(table_suffix)s where field like '%%xyz%%'", 0, 1]
|
||||
]
|
||||
}
|
||||
for line in vector_file.readlines():
|
||||
if line.strip().startswith('#'):
|
||||
continue
|
||||
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
|
||||
return vector_values
|
||||
|
||||
# Writes out results to a CSV file. Columns are delimited by '|' characters
|
||||
def write_to_csv(result_map, output_csv_file):
|
||||
@@ -316,51 +304,99 @@ def write_to_csv(result_map, output_csv_file):
|
||||
|
||||
for query, execution_results in result_map.iteritems():
|
||||
for result in execution_results:
|
||||
csv_writer.writerow([query, result.file_format, result.compression,
|
||||
csv_writer.writerow([result.workload, query, result.file_format,
|
||||
'%s/%s' % (result.compression_codec, result.compression_type),
|
||||
result.impala_execution_result.avg_time,
|
||||
result.impala_execution_result.stddev,
|
||||
result.hive_execution_result.avg_time,
|
||||
result.hive_execution_result.stddev])
|
||||
result.hive_execution_result.stddev,
|
||||
])
|
||||
|
||||
# Run all queries
|
||||
if (len(options.query) == 0):
|
||||
vector_file_path = os.path.join(
|
||||
os.environ['IMPALA_HOME'], 'testdata/workloads/hive-benchmark/',
|
||||
choose_input_vector_file_name(options.exploration_strategy))
|
||||
# Recursively scans the given directory for all test query files
|
||||
def enumerate_query_files(base_directory):
|
||||
query_files = []
|
||||
for item in os.listdir(base_directory):
|
||||
full_path = os.path.join(base_directory, item)
|
||||
if os.path.isfile(full_path) and item.endswith('.test'):
|
||||
query_files.append(full_path)
|
||||
elif os.path.isdir(full_path):
|
||||
query_files += enumerate_query_files(full_path)
|
||||
return query_files
|
||||
|
||||
vector = read_vector_file(vector_file_path)
|
||||
output = ""
|
||||
result_map = collections.defaultdict(list)
|
||||
# Strips out comments and empty lines from the input query string
|
||||
def strip_comments(query_string):
|
||||
query = []
|
||||
for line in query_string.split('\n'):
|
||||
if not line or line.strip().startswith('#') or line.strip().startswith('//'):
|
||||
continue
|
||||
query.append(line)
|
||||
return '\n'.join(query).strip()
|
||||
|
||||
for row in vector:
|
||||
file_format, data_set, codec, compression_type = row[:4]
|
||||
for query in queries[data_set]:
|
||||
query_string = build_query(query[0], options.exploration_strategy, data_set,
|
||||
file_format, codec, compression_type)
|
||||
result = run_query(query_string, query[1], query[2])
|
||||
# Enumerate all the query files for a workload and extract the actual query
|
||||
# strings.
|
||||
def extract_queries_from_test_files(workload):
|
||||
workload_base_dir = os.path.join(WORKLOAD_DIR, workload)
|
||||
if not os.path.isdir(workload_base_dir):
|
||||
print "Workload '%s' not found at path '%s'" % (workload, workload_base_dir)
|
||||
sys.exit(1)
|
||||
|
||||
query_dir = os.path.join(workload_base_dir, 'queries')
|
||||
if not os.path.isdir(query_dir):
|
||||
print "Workload query directory not found at path '%s'" % (query_dir)
|
||||
|
||||
queries = []
|
||||
for query_file_name in enumerate_query_files(query_dir):
|
||||
if options.verbose != 0:
|
||||
print 'Parsing Query Test File: ' + query_file_name
|
||||
with open(query_file_name, 'rb') as query_file:
|
||||
# Query files are split into sections separated by '====', with subsections
|
||||
# separated by '----'. The first item in each subsection is the actual query
|
||||
# to execute.
|
||||
for query_section in query_file.read().split("===="):
|
||||
formatted_query = strip_comments(query_section.split("----")[0])
|
||||
if formatted_query:
|
||||
queries.append(formatted_query.strip())
|
||||
return queries
|
||||
|
||||
result_map = collections.defaultdict(list)
|
||||
output = ""
|
||||
|
||||
# For each workload specified, look up the associated query files. Extract valid
|
||||
# queries in each file and execute them using the specified number of execution
|
||||
# iterations. Finally, write results to an output CSV file for reporting.
|
||||
for workload in options.workloads.split(','):
|
||||
print 'Starting execution of workload: ' + workload
|
||||
queries = extract_queries_from_test_files(workload)
|
||||
|
||||
vector_file_path = os.path.join(WORKLOAD_DIR, workload,
|
||||
vector_file_name(workload,
|
||||
options.exploration_strategy))
|
||||
test_vector = read_vector_file(vector_file_path)
|
||||
|
||||
# Execute the queries for combinations of file format, compression, etc.
|
||||
for row in test_vector:
|
||||
file_format, data_group, codec, compression_type = row[:4]
|
||||
print 'Test Vector Values: ' + ', '.join(row)
|
||||
for query in queries:
|
||||
query_string = build_query(query.strip(), file_format, codec, compression_type,
|
||||
workload, options.scale_factor)
|
||||
result = run_query(query_string, 1, int(options.iterations))
|
||||
output += result[0]
|
||||
print result[0]
|
||||
execution_result = result[1]
|
||||
hive_execution_result = QueryExecutionResult("N/A", "N/A")
|
||||
if options.compare_with_hive:
|
||||
hive_result = run_query_using_hive(query_string, query[1], query[2])
|
||||
hive_result = run_query_using_hive(query_string, 1, int(options.iterations))
|
||||
print "Hive Results:"
|
||||
print hive_result[0]
|
||||
hive_execution_result = hive_result[1]
|
||||
if options.verbose != 0:
|
||||
print "--------------------------------------------------------------------------"
|
||||
print "------------------------------------------------------------------------"
|
||||
|
||||
execution_detail = QueryExecutionDetail(file_format, codec, execution_result,
|
||||
execution_detail = QueryExecutionDetail(workload, file_format, codec,
|
||||
compression_type, execution_result,
|
||||
hive_execution_result)
|
||||
result_map[query[0]].append(execution_detail)
|
||||
result_map[query].append(execution_detail)
|
||||
|
||||
print "\nResults saving to: " + options.results_csv_file
|
||||
write_to_csv(result_map, options.results_csv_file)
|
||||
|
||||
print output
|
||||
|
||||
# Run query from command line
|
||||
else:
|
||||
result = run_query(options.query, int(options.prime_cache),
|
||||
int(options.iterations))
|
||||
print result[1] or result[0]
|
||||
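Editorial note, not part of the diff: a minimal sketch of how the query .test files and the $TABLE token handled in the hunk above fit together. The sample query, database prefix, and table suffix are hypothetical; the splitting on '====' and '----', the comment stripping, and the substitution mirror extract_queries_from_test_files(), strip_comments(), and build_query().

import re

# A hypothetical .test file section: '====' separates sections, '----' separates
# subsections, and the first subsection holds the query to execute.
sample_test_file = """
====
# count all rows in the grep1gb table
select count(*) from grep1gb$TABLE
----
... expected results would go here ...
====
"""

queries = []
for section in sample_test_file.split('===='):
  query = section.split('----')[0]
  # Drop comment lines and blanks (essentially what strip_comments() does).
  lines = [l for l in query.split('\n') if l.strip() and not l.strip().startswith('#')]
  if lines:
    queries.append('\n'.join(lines).strip())

# build_query() replaces the $TABLE token with the database prefix (derived from the
# workload and scale factor) plus the table suffix (derived from file format/compression).
database_name, table_suffix = 'tpchSF3.', '_seq_snap'  # hypothetical values
print(re.sub(r'(\w+\.){0,1}(?P<table_name>\w+)\$TABLE',
             '%s%s%s' % (database_name, r'\g<table_name>', table_suffix),
             queries[0]))
# -> select count(*) from tpchSF3.grep1gb_seq_snap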
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
package com.cloudera.impala.dataerror;
|
||||
|
||||
@@ -23,7 +23,7 @@ public class DataErrorsTest {
|
||||
private static Catalog catalog;
|
||||
private static Executor executor;
|
||||
private static StringBuilder testErrorLog;
|
||||
private final String testDir = "DataErrorsTest";
|
||||
private final String testDir = "functional-query/queries/DataErrorsTest";
|
||||
private static ArrayList<String> tableList;
|
||||
|
||||
@BeforeClass
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
package com.cloudera.impala.planner;
|
||||
|
||||
@@ -33,7 +33,7 @@ public class PlannerTest {
|
||||
|
||||
private static Catalog catalog;
|
||||
private static AnalysisContext analysisCtxt;
|
||||
private final String testDir = "PlannerTest";
|
||||
private final String testDir = "functional-planner/queries/PlannerTest";
|
||||
private final String outDir = "/tmp/PlannerTest/";
|
||||
|
||||
private final StringBuilder explainStringBuilder = new StringBuilder();
|
||||
|
||||
@@ -4,6 +4,7 @@ package com.cloudera.impala.service;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
@@ -41,7 +42,7 @@ import com.google.common.collect.Sets;
|
||||
*/
|
||||
public abstract class BaseQueryTest {
|
||||
private static final Logger LOG = Logger.getLogger(BaseQueryTest.class);
|
||||
private static final String TEST_DIR = "QueryTest";
|
||||
private static final String TEST_DIR = "functional-query/queries/QueryTest";
|
||||
private static final int DEFAULT_FE_PORT = 21000;
|
||||
|
||||
// If set to true, new test results will be generated and saved to the specified
|
||||
@@ -93,6 +94,9 @@ public abstract class BaseQueryTest {
|
||||
protected final static TestExecMode EXECUTION_MODE = TestExecMode.valueOf(
|
||||
System.getProperty("testExecutionMode", "reduced").toUpperCase());
|
||||
|
||||
// A relative path from the 'workloads' directory to the base test directory.
|
||||
private final String testDirName;
|
||||
|
||||
/**
|
||||
* The type of target test environments. Determines whether the front end is running
|
||||
* in-process or out-of-process (ImpalaD).
|
||||
@@ -166,6 +170,14 @@ public abstract class BaseQueryTest {
|
||||
}
|
||||
}
|
||||
|
||||
protected BaseQueryTest() {
|
||||
this(TEST_DIR);
|
||||
}
|
||||
|
||||
protected BaseQueryTest(String testDirName) {
|
||||
this.testDirName = testDirName;
|
||||
}
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
String impaladHostname = System.getProperty("impalad");
|
||||
@@ -418,7 +430,7 @@ public abstract class BaseQueryTest {
|
||||
|
||||
private void runQueryWithTestConfigs(List<TestConfiguration> testConfigs,
|
||||
String testFile, boolean abortOnError, int maxErrors) {
|
||||
String fileName = TEST_DIR + "/" + testFile + ".test";
|
||||
String fileName = new File(testDirName, testFile + ".test").getPath();
|
||||
TestFileParser queryFileParser = new TestFileParser(fileName);
|
||||
|
||||
LOG.debug("Running the following configurations over file " + fileName + " : ");
|
||||
|
||||
@@ -6,6 +6,10 @@ import org.junit.Test;
|
||||
|
||||
public class TpchQueryTest extends BaseQueryTest {
|
||||
|
||||
public TpchQueryTest() {
|
||||
super("tpch/queries");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestTpchQ1() {
|
||||
runTestInExecutionMode(EXECUTION_MODE, "tpch-q1", false, 1000);
|
||||
|
||||
@@ -4,8 +4,10 @@ package com.cloudera.impala.testutil;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
@@ -183,7 +185,7 @@ public class TestFileParser {
|
||||
|
||||
private int lineNum = 0;
|
||||
private final String fileName;
|
||||
private InputStream stream;
|
||||
private BufferedReader reader;
|
||||
private Scanner scanner;
|
||||
|
||||
/**
|
||||
@@ -210,9 +212,9 @@ public class TestFileParser {
|
||||
*/
|
||||
private void open(String table) {
|
||||
try {
|
||||
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
stream = classLoader.getResourceAsStream(fileName);
|
||||
scanner = new Scanner(stream);
|
||||
String fullPath = new File(TestFileUtils.getTestFileBaseDir(), fileName).getPath();
|
||||
reader = new BufferedReader(new FileReader(fullPath));
|
||||
scanner = new Scanner(reader);
|
||||
} catch (Exception e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
@@ -293,16 +295,16 @@ public class TestFileParser {
|
||||
}
|
||||
|
||||
private void close() {
|
||||
if (reader != null) {
|
||||
try {
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
fail(e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
if (scanner != null) {
|
||||
scanner.close();
|
||||
}
|
||||
|
||||
if (stream != null) {
|
||||
try {
|
||||
stream.close();
|
||||
} catch (IOException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -67,4 +67,11 @@ public class TestFileUtils {
|
||||
fw.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the base directory for test files.
|
||||
*/
|
||||
public static String getTestFileBaseDir() {
|
||||
return new File(System.getenv("IMPALA_HOME"), "testdata/workloads").getPath();
|
||||
}
|
||||
}
|
||||
16
testdata/bin/create-load-data.sh
vendored
@@ -6,22 +6,14 @@ if [ x${JAVA_HOME} == x ]; then
|
||||
exit 1
|
||||
fi
|
||||
|
||||
set -u
|
||||
set -e
|
||||
|
||||
# Load the data set
|
||||
pushd ${IMPALA_HOME}/bin
|
||||
|
||||
./load-data.sh functional exhaustive
|
||||
if [ $? != 0 ]; then
|
||||
echo LOAD OF FUNCTIONAL DATA FAILED
|
||||
exit 1
|
||||
fi
|
||||
|
||||
./load-data.sh tpch core
|
||||
if [ $? != 0 ]; then
|
||||
echo LOAD OF TPCH DATA FAILED
|
||||
exit 1
|
||||
fi
|
||||
./load-data.py --workloads functional-query --exploration_strategy exhaustive
|
||||
./load-data.py --workloads functional-planner --exploration_strategy exhaustive
|
||||
./load-data.py --workloads tpch --exploration_strategy core
|
||||
popd
|
||||
|
||||
# TODO: The multi-format table will move these files. So we need to copy them to a
|
||||
|
||||
@@ -34,21 +34,28 @@ from itertools import product
|
||||
from optparse import OptionParser
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
|
||||
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
|
||||
help="The exploration strategy for schema gen: 'core', "\
|
||||
"'pairwise', or 'exhaustive'")
|
||||
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
|
||||
default="/test-warehouse",
|
||||
help="The HDFS path to the base Hive test warehouse directory")
|
||||
parser.add_option("--workload", dest="workload", default="functional",
|
||||
parser.add_option("-w", "--workload", dest="workload",
|
||||
help="The workload to generate schema for: tpch, hive-benchmark, ...")
|
||||
parser.add_option("--force_reload", dest="force_reload", action="store_true",
|
||||
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
|
||||
help="An optional scale factor to generate the schema for")
|
||||
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
|
||||
default= False, help='Skips HDFS exists check and reloads all tables')
|
||||
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
|
||||
default = False, help="If set, outputs additional logging.")
|
||||
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
if options.workload is None:
|
||||
print "A workload name must be specified."
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
|
||||
DATASET_DIR = os.environ['IMPALA_HOME'] + '/testdata/datasets'
|
||||
|
||||
@@ -58,8 +65,8 @@ COMPRESSION_CODEC =\
|
||||
"SET mapred.output.compression.codec=org.apache.hadoop.io.compress.%s;"
|
||||
SET_DYNAMIC_PARTITION_STATEMENT = "SET hive.exec.dynamic.partition=true;"
|
||||
SET_PARTITION_MODE_NONSTRICT_STATEMENT = "SET hive.exec.dynamic.partition.mode=nonstrict;"
|
||||
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;"\
|
||||
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;"
|
||||
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;\n"\
|
||||
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;\n"
|
||||
|
||||
FILE_FORMAT_IDX = 0
|
||||
DATA_SET_IDX = 1
|
||||
@@ -99,10 +106,12 @@ class SqlGenerationStatement:
|
||||
self.trevni = trevni.strip()
|
||||
self.load_local = load_local.strip()
|
||||
|
||||
def build_create_statement(table_template, table_name, file_format, compression):
|
||||
def build_create_statement(table_template, table_name, file_format,
|
||||
compression, scale_factor):
|
||||
create_statement = 'DROP TABLE IF EXISTS %s;\n' % table_name
|
||||
create_statement += table_template % {'table_name': table_name,
|
||||
'file_format': FILE_FORMAT_MAP[file_format] }
|
||||
'file_format': FILE_FORMAT_MAP[file_format],
|
||||
'scale_factor': scale_factor}
|
||||
if file_format != 'trevni':
|
||||
return create_statement
|
||||
|
||||
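Editorial note, not part of the diff: a minimal sketch of the placeholder substitution build_create_statement() above performs, using a trimmed, hypothetical stand-in for a real *_schema_template.sql entry.

# Hypothetical, trimmed-down template in the style of tpch_schema_template.sql. The real
# templates also carry %(scale_factor)s in table/database names and larger column lists.
table_template = (
  "CREATE EXTERNAL TABLE %(table_name)s (L_ORDERKEY INT)\n"
  "STORED AS %(file_format)s\n"
  "LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';")

FILE_FORMAT_MAP = {'text': 'TEXTFILE'}  # subset of the real map

def build_create_statement_sketch(table_template, table_name, file_format, scale_factor):
  # Mirrors build_create_statement(): drop-and-recreate, then fill the template.
  create_statement = 'DROP TABLE IF EXISTS %s;\n' % table_name
  create_statement += table_template % {'table_name': table_name,
                                        'file_format': FILE_FORMAT_MAP[file_format],
                                        'scale_factor': scale_factor}
  return create_statement

print(build_create_statement_sketch(table_template, 'tpchSF3.lineitem', 'text', 'SF3'))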
@@ -151,9 +160,10 @@ def build_insert(insert, table_name, base_table_name, codec, compression_type):
|
||||
table_name) + "\n"
|
||||
return output
|
||||
|
||||
def build_load_statement(load_template, table_name):
|
||||
def build_load_statement(load_template, table_name, scale_factor):
|
||||
tmp_load_template = load_template.replace(' % ', ' *** ')
|
||||
return (tmp_load_template % {'table_name': table_name}).replace(' *** ', ' % ')
|
||||
return (tmp_load_template % {'table_name': table_name,
|
||||
'scale_factor': scale_factor}).replace(' *** ', ' % ')
|
||||
|
||||
def build_trevni(trevni_template, table_name, base_table_name):
|
||||
return trevni_template % {'table_name': table_name, 'base_table_name': base_table_name}
|
||||
@@ -171,10 +181,16 @@ def build_table_suffix(file_format, codec, compression_type):
|
||||
else:
|
||||
return '_%s_%s' % (file_format, codec)
|
||||
|
||||
# Vector files have the format:
|
||||
# dimension_name1:value1, dimension_name2:value2, ...
|
||||
def read_vector_file(file_name):
|
||||
vector_values = []
|
||||
with open(file_name, 'rb') as vector_file:
|
||||
return [line.strip().split(',')
|
||||
for line in vector_file.readlines() if not line.startswith('#')]
|
||||
for line in vector_file.readlines():
|
||||
if line.strip().startswith('#'):
|
||||
continue
|
||||
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
|
||||
return vector_values
|
||||
|
||||
def write_array_to_file(file_name, array):
|
||||
with open(file_name, 'w') as f:
|
||||
@@ -207,32 +223,32 @@ def write_trevni_file(file_name, array):
|
||||
# Kill off the plan service.
|
||||
f.write("\nkill -9 $PID\n")
|
||||
|
||||
def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
|
||||
def write_statements_to_file_based_on_input_vector(output_name, test_vectors,
|
||||
statements):
|
||||
output_create = []
|
||||
output_load = []
|
||||
output_load_base = []
|
||||
output_trevni = []
|
||||
results = read_vector_file(input_file_name)
|
||||
existing_tables = list_hdfs_subdir_names(options.hive_warehouse_dir)
|
||||
for row in results:
|
||||
for row in test_vectors:
|
||||
file_format, data_set, codec, compression_type = row[:4]
|
||||
for s in statements[data_set.strip()]:
|
||||
create = s.create
|
||||
insert = s.insert
|
||||
trevni = s.trevni
|
||||
load_local = s.load_local
|
||||
table_name = s.base_table_name +\
|
||||
build_table_suffix(file_format, codec, compression_type)
|
||||
base_table_name = s.base_table_name % {'scale_factor' : options.scale_factor}
|
||||
table_name = base_table_name + \
|
||||
build_table_suffix(file_format, codec, compression_type)
|
||||
|
||||
# HBase only supports text format and mixed format tables have formats defined.
|
||||
# TODO: Implement a better way to tag a table as only being generated with a fixed
|
||||
# set of file formats.
|
||||
if (("hbase" in table_name or "mixedformat" in table_name) and
|
||||
"text" not in file_format):
|
||||
if ("hbase" in table_name and "text" not in file_format):
|
||||
continue
|
||||
|
||||
output_create.append(build_create_statement(create, table_name, file_format, codec))
|
||||
output_create.append(build_create_statement(create, table_name, file_format, codec,
|
||||
options.scale_factor))
|
||||
|
||||
# If the directory already exists in HDFS, assume that data files already exist
|
||||
# and skip loading the data. Otherwise, the data is generated using either an
|
||||
@@ -242,21 +258,22 @@ def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
|
||||
print 'Path:', data_path, 'already exists in HDFS. Data loading can be skipped.'
|
||||
else:
|
||||
print 'Path:', data_path, 'does not exist in HDFS. Data file will be generated.'
|
||||
if table_name == s.base_table_name:
|
||||
if table_name == base_table_name:
|
||||
if load_local:
|
||||
output_load_base.append(build_load_statement(load_local, table_name))
|
||||
output_load_base.append(build_load_statement(load_local, table_name,
|
||||
options.scale_factor))
|
||||
else:
|
||||
print 'Empty base table load for %s. Skipping load generation' % table_name
|
||||
elif file_format == 'trevni':
|
||||
if trevni:
|
||||
output_trevni.append(build_trevni(trevni, table_name, s.base_table_name))
|
||||
output_trevni.append(build_trevni(trevni, table_name, base_table_name))
|
||||
else:
|
||||
print \
|
||||
'Empty trevni load for table %s. Skipping insert generation' % table_name
|
||||
else:
|
||||
if insert:
|
||||
output_load.append(build_insert(insert, table_name, s.base_table_name,
|
||||
codec, compression_type))
|
||||
output_load.append(build_insert(insert, table_name, base_table_name,
|
||||
codec, compression_type))
|
||||
else:
|
||||
print 'Empty insert for table %s. Skipping insert generation' % table_name
|
||||
|
||||
@@ -285,22 +302,30 @@ if (options.exploration_strategy != 'core' and
|
||||
print 'Invalid exploration strategy:', options.exploration_strategy
|
||||
sys.exit(1)
|
||||
|
||||
schema_template_file = os.path.join(DATASET_DIR, options.workload,
|
||||
'%s_schema_template.sql' % (options.workload))
|
||||
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
|
||||
'%s_%s.csv' % (options.workload,
|
||||
options.exploration_strategy))
|
||||
|
||||
if not os.path.isfile(test_vector_file):
|
||||
print 'Vector file not found: ' + test_vector_file
|
||||
sys.exit(1)
|
||||
|
||||
test_vectors = read_vector_file(test_vector_file)
|
||||
|
||||
if len(test_vectors) == 0:
|
||||
print 'No test vectors found in file: ' + test_vector_file
|
||||
sys.exit(1)
|
||||
|
||||
target_dataset = test_vectors[0][DATA_SET_IDX]
|
||||
print 'Target Dataset: ' + target_dataset
|
||||
schema_template_file = os.path.join(DATASET_DIR, target_dataset,
|
||||
'%s_schema_template.sql' % target_dataset)
|
||||
|
||||
if not os.path.isfile(schema_template_file):
|
||||
print 'Schema file not found: ' + schema_template_file
|
||||
sys.exit(1)
|
||||
|
||||
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
|
||||
'%s_%s.csv' % (options.workload,
|
||||
options.exploration_strategy))
|
||||
|
||||
if not os.path.isfile(schema_template_file):
|
||||
print 'Vector file not found: ' + schema_template_file
|
||||
sys.exit(1)
|
||||
|
||||
statements = parse_benchmark_file(schema_template_file)
|
||||
write_statements_to_file_based_on_input_vector(
|
||||
'%s-%s' % (options.workload, options.exploration_strategy),
|
||||
test_vector_file, statements)
|
||||
test_vectors, statements)
|
||||
@@ -35,23 +35,28 @@ import metacomm.combinatorics.all_pairs2
|
||||
all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2
|
||||
|
||||
parser = OptionParser()
|
||||
parser.add_option("--dimension_file", dest="dimension_file",
|
||||
default = "hive-benchmark_dimensions.csv",
|
||||
help="The file containing the list of dimensions.")
|
||||
parser.add_option("--workload", dest="workload", default = "hive-benchmark",
|
||||
parser.add_option("-w", "--workload", dest="workload",
|
||||
help="The workload to generate test vectors for")
|
||||
(options, args) = parser.parse_args()
|
||||
|
||||
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
|
||||
if options.workload is None:
|
||||
print "A workload name must be specified."
|
||||
parser.print_help()
|
||||
sys.exit(1)
|
||||
|
||||
FILE_FORMAT_IDX = 0
|
||||
DATA_SET_IDX = 1
|
||||
COMPRESSION_IDX = 2
|
||||
COMPRESSION_TYPE_IDX = 3
|
||||
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
|
||||
|
||||
KNOWN_DIMENSION_NAMES = ['file_format', 'data_group', 'compression_codec',
|
||||
# This array also defines the order of the dimension values. This ordering
|
||||
# is important because it is used to apply constraints. Add new items to the
|
||||
# end of the list.
|
||||
KNOWN_DIMENSION_NAMES = ['file_format', 'dataset', 'compression_codec',
|
||||
'compression_type']
|
||||
|
||||
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
|
||||
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
|
||||
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
|
||||
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')
|
||||
|
||||
class VectorGenerator:
|
||||
def __init__(self, input_vectors):
|
||||
self.input_vectors = input_vectors
|
||||
@@ -76,13 +81,15 @@ def is_valid_combination(vector):
|
||||
(vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
|
||||
(vector[FILE_FORMAT_IDX] == 'trevni' and
|
||||
(vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip')) or
|
||||
(vector[DATA_SET_IDX] == 'tpch' and vector[FILE_FORMAT_IDX] != 'text'))
|
||||
(vector[DATASET_IDX] == 'tpch' and
|
||||
(vector[FILE_FORMAT_IDX] != 'text' and vector[FILE_FORMAT_IDX] != 'trevni')))
|
||||
|
||||
# The pairwise generator may call this with different vector lengths. In that case this
|
||||
# should always return true.
|
||||
return True
|
||||
|
||||
# Vector files have the format: <dimension name>: value1, value2, ...
|
||||
# Vector files have the format: <dimension name>: value1, value2, ... this function
|
||||
# adds all specified dimensions to a map of dimension name-to-value
|
||||
def read_dimension_file(file_name):
|
||||
dimension_map = collections.defaultdict(list)
|
||||
with open(file_name, 'rb') as input_file:
|
||||
@@ -98,16 +105,18 @@ def read_dimension_file(file_name):
|
||||
print 'Unknown dimension name: ' + values[0]
|
||||
print 'Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES)
|
||||
sys.exit(1)
|
||||
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
|
||||
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
|
||||
return dimension_map
|
||||
|
||||
def write_vectors_to_csv(output_dir, output_file, matrix):
|
||||
output_text = "# Generated File. The vector value order is: file format, data_group, "\
|
||||
"compression codec, compression type"
|
||||
output_text = "# Generated File."
|
||||
for row in matrix:
|
||||
output_text += '\n' + ','.join(row)
|
||||
row = ['%s: %s' % (KNOWN_DIMENSION_NAMES[i], row[i]) for i in range(0, len(row))]
|
||||
output_text += '\n' + ', '.join(row)
|
||||
|
||||
with open(os.path.join(output_dir, output_file), 'wb') as output_file:
|
||||
output_path = os.path.join(output_dir, output_file)
|
||||
print 'Writing test vectors to: ' + output_path
|
||||
with open(output_path, 'wb') as output_file:
|
||||
output_file.write(output_text)
|
||||
output_file.write('\n')
|
||||
|
||||
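Editorial note, not part of the diff: a small sketch of the generated test-vector format that write_vectors_to_csv() above emits and that the updated read_vector_file() functions parse. The dimension values shown are hypothetical.

# A generated vector file in the 'dimension_name: value' style written by
# write_vectors_to_csv(); one combination per line, '#' lines are comments.
vector_file_contents = """# Generated File.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: block
"""

vector_values = []
for line in vector_file_contents.split('\n'):
  if not line.strip() or line.strip().startswith('#'):
    continue
  # Same idea as the updated read_vector_file(): keep only the value after each ':'.
  vector_values.append([value.split(':')[1].strip() for value in line.split(',')])

print(vector_values[0])  # -> ['text', 'hive-benchmark', 'none', 'none']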
@@ -120,11 +129,10 @@ if not os.path.isfile(dimension_file):
|
||||
print 'Reading dimension file: ' + dimension_file
|
||||
vector_map = read_dimension_file(dimension_file)
|
||||
vectors = []
|
||||
|
||||
# This ordering matters! We need to know the order to apply the proper constraints.
|
||||
vectors.append(vector_map['file_format'])
|
||||
vectors.append(vector_map['data_group'])
|
||||
vectors.append(vector_map['compression_codec'])
|
||||
vectors.append(vector_map['compression_type'])
|
||||
for dimension_name in KNOWN_DIMENSION_NAMES:
|
||||
vectors.append(vector_map[dimension_name])
|
||||
vg = VectorGenerator(vectors)
|
||||
|
||||
output_dir = os.path.join(WORKLOAD_DIR, options.workload)
|
||||
@@ -24,7 +24,7 @@
|
||||
# ---- <- End sub-section
|
||||
# LOAD from LOCAL - How to load data for the base table
|
||||
====
|
||||
grep1gb
|
||||
hive-benchmark
|
||||
----
|
||||
grep1gb
|
||||
----
|
||||
@@ -49,7 +49,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-000
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
|
||||
====
|
||||
grep10gb
|
||||
hive-benchmark
|
||||
----
|
||||
grep10gb
|
||||
----
|
||||
@@ -75,7 +75,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
|
||||
====
|
||||
web
|
||||
hive-benchmark
|
||||
----
|
||||
rankings
|
||||
----
|
||||
@@ -94,7 +94,7 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/html1GB/Rankings.dat' OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
web
|
||||
hive-benchmark
|
||||
----
|
||||
uservisits
|
||||
----
|
||||
|
||||
115
testdata/datasets/tpch/tpch_schema_template.sql
vendored
@@ -1,10 +1,11 @@
|
||||
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
# For details on this file format please see benchmark_schema_template.sql
|
||||
# For details on this file format please see hive-benchmark_schema_template.sql
|
||||
====
|
||||
tpch
|
||||
----
|
||||
lineitem
|
||||
tpch%(scale_factor)s.lineitem
|
||||
----
|
||||
CREATE DATABASE IF NOT EXISTS tpch%(scale_factor)s;
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
L_ORDERKEY INT,
|
||||
L_PARTKEY INT,
|
||||
@@ -24,7 +25,7 @@ L_SHIPMODE STRING,
|
||||
L_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -32,12 +33,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/lineitem.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/lineitem/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
part
|
||||
tpch%(scale_factor)s.part
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
P_PARTKEY INT,
|
||||
@@ -52,7 +53,7 @@ P_RETAILPRICE DOUBLE,
|
||||
P_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -60,12 +61,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/part.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/part/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
partsupp
|
||||
tpch%(scale_factor)s.partsupp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
PS_PARTKEY INT,
|
||||
@@ -75,7 +76,7 @@ PS_SUPPLYCOST DOUBLE,
|
||||
PS_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -83,12 +84,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/partsupp.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/partsupp/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
supplier
|
||||
tpch%(scale_factor)s.supplier
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
S_SUPPKEY INT,
|
||||
@@ -100,7 +101,7 @@ S_ACCTBAL DOUBLE,
|
||||
S_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -108,12 +109,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/supplier.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/supplier/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
nation
|
||||
tpch%(scale_factor)s.nation
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
N_NATIONKEY INT,
|
||||
@@ -122,7 +123,7 @@ N_REGIONKEY INT,
|
||||
N_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -130,12 +131,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/nation.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/nation/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
region
|
||||
tpch%(scale_factor)s.region
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
R_REGIONKEY INT,
|
||||
@@ -143,7 +144,7 @@ R_NAME STRING,
|
||||
R_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -151,12 +152,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/region.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/region/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
orders
|
||||
tpch%(scale_factor)s.orders
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
O_ORDERKEY INT,
|
||||
@@ -170,7 +171,7 @@ O_SHIPPRIORITY INT,
|
||||
O_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -178,12 +179,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/orders.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/orders/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
customer
|
||||
tpch%(scale_factor)s.customer
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
C_CUSTKEY INT,
|
||||
@@ -196,7 +197,7 @@ C_MKTSEGMENT STRING,
|
||||
C_COMMENT STRING)
|
||||
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
|
||||
----
|
||||
@@ -204,12 +205,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
|
||||
INSERT OVERWRITE TABLE %(table_name)s \
|
||||
select * FROM %(base_table_name)s"
|
||||
----
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/customer.tbl'
|
||||
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/customer/'
|
||||
OVERWRITE INTO TABLE %(table_name)s;
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q2_minimum_cost_supplier_tmp1
|
||||
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp1
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
s_acctbal double,
|
||||
@@ -222,27 +223,27 @@ s_address string,
|
||||
s_phone string,
|
||||
s_comment string)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q2_minimum_cost_supplier_tmp2
|
||||
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp2
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
p_partkey int,
|
||||
ps_min_supplycost double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q7_volume_shipping_tmp
|
||||
tpch%(scale_factor)s.q7_volume_shipping_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
supp_nation string,
|
||||
@@ -250,73 +251,73 @@ cust_nation string,
|
||||
s_nationkey int,
|
||||
c_nationkey int)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q11_part_tmp
|
||||
tpch%(scale_factor)s.q11_part_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
ps_partkey int,
|
||||
part_value double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q11_sum_tmp
|
||||
tpch%(scale_factor)s.q11_sum_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (total_value double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
revenue
|
||||
tpch%(scale_factor)s.revenue
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
supplier_no int,
|
||||
total_revenue double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
max_revenue
|
||||
tpch%(scale_factor)s.max_revenue
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (max_revenue double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
supplier_tmp
|
||||
tpch%(scale_factor)s.supplier_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (s_suppkey int)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q16_tmp
|
||||
tpch%(scale_factor)s.q16_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
p_brand string,
|
||||
@@ -324,94 +325,94 @@ p_type string,
|
||||
p_size int,
|
||||
ps_suppkey int)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
lineitem_tmp
|
||||
tpch%(scale_factor)s.lineitem_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
t_partkey int,
|
||||
t_avg_quantity double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q18_tmp
|
||||
tpch%(scale_factor)s.q18_tmp
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
l_orderkey int,
|
||||
t_sum_quantity double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q20_tmp1
|
||||
tpch%(scale_factor)s.q20_tmp1
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (p_partkey int)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q20_tmp2
|
||||
tpch%(scale_factor)s.q20_tmp2
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
l_partkey int,
|
||||
l_suppkey int,
|
||||
sum_quantity double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q20_tmp3
|
||||
tpch%(scale_factor)s.q20_tmp3
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (
|
||||
ps_suppkey int,
|
||||
ps_availqty int,
|
||||
sum_quantity double)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q20_tmp4
|
||||
tpch%(scale_factor)s.q20_tmp4
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (ps_suppkey int)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
====
|
||||
tpch
|
||||
----
|
||||
q22_customer_tmp1
|
||||
tpch%(scale_factor)s.q22_customer_tmp1
|
||||
----
|
||||
CREATE EXTERNAL TABLE %(table_name)s (avg_acctbal double, cust_name_char string)
|
||||
STORED AS %(file_format)s
|
||||
LOCATION '/test-warehouse/%(table_name)s';
|
||||
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
|
||||
----
|
||||
----
|
||||
----
|
||||
|
||||
2
testdata/workloads/functional-planner/functional-planner_core.csv
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Manually Created File.
|
||||
file_format: text, dataset: functional, compression_codec: none, compression_type: none
|
||||
|
4
testdata/workloads/functional-planner/functional-planner_dimensions.csv
vendored
Normal file
@@ -0,0 +1,4 @@
|
||||
file_format: text
|
||||
dataset: functional
|
||||
compression_codec: none
|
||||
compression_type: none
|
||||
|
2
testdata/workloads/functional-planner/functional-planner_exhaustive.csv
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Generated File.
|
||||
file_format: text, dataset: functional, compression_codec: none, compression_type: none
|
||||
|
2
testdata/workloads/functional-planner/functional-planner_pairwise.csv
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
# Generated File.
|
||||
file_format: text, dataset: functional, compression_codec: none, compression_type: none
|
||||
|
File diff suppressed because it is too large
3
testdata/workloads/functional-query/functional-query_core.csv
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
# Manually created file.
|
||||
file_format:text, dataset:functional, compression_codec:none, compression_type:none
|
||||
file_format:seq, dataset:functional, compression_codec:none, compression_type:none
|
||||
|
@@ -1,4 +1,4 @@
|
||||
file_format: text,seq,rc,trevni
|
||||
data_group: functional
|
||||
dataset: functional
|
||||
compression_codec: none,def,gzip,bzip,snap
|
||||
compression_type: none,block,record
|
||||
|
19
testdata/workloads/functional-query/functional-query_exhaustive.csv
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
# Generated File.
|
||||
file_format: text, dataset: functional, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: functional, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
|
||||
file_format: seq, dataset: functional, compression_codec: def, compression_type: record
|
||||
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: block
|
||||
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: record
|
||||
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: block
|
||||
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record
|
||||
file_format: seq, dataset: functional, compression_codec: snap, compression_type: block
|
||||
file_format: seq, dataset: functional, compression_codec: snap, compression_type: record
|
||||
file_format: rc, dataset: functional, compression_codec: none, compression_type: none
|
||||
file_format: rc, dataset: functional, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
|
||||
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
|
||||
file_format: rc, dataset: functional, compression_codec: snap, compression_type: block
|
||||
file_format: trevni, dataset: functional, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
|
||||
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
|
||||
|
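The exhaustive file above looks like the cross product of the values in functional-query_dimensions.csv with nonsensical combinations dropped (plain text is never compressed, RCFile only takes block compression, Trevni only def/snap). A rough sketch that reproduces the listed rows; the constraint rules are inferred from the rows themselves, not taken from the real generator:

from itertools import product

# Dimension values as declared in functional-query_dimensions.csv.
dimensions = {
    'file_format': ['text', 'seq', 'rc', 'trevni'],
    'dataset': ['functional'],
    'compression_codec': ['none', 'def', 'gzip', 'bzip', 'snap'],
    'compression_type': ['none', 'block', 'record'],
}

def is_valid(file_format, dataset, codec, ctype):
    # Constraints inferred from the generated rows; not the real rules.
    if file_format == 'text':
        return codec == 'none' and ctype == 'none'
    if codec == 'none':
        return ctype == 'none'
    if ctype == 'none':
        return False
    if file_format == 'rc':
        return ctype == 'block'
    if file_format == 'trevni':
        return codec in ('def', 'snap') and ctype == 'block'
    return True  # seq: every codec, block or record

keys = ['file_format', 'dataset', 'compression_codec', 'compression_type']
print('# Generated File.')
for combo in product(*(dimensions[k] for k in keys)):
    if is_valid(*combo):
        print(', '.join('%s: %s' % pair for pair in zip(keys, combo)))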
8
testdata/workloads/functional-query/functional-query_pairwise.csv
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
# Generated File.
|
||||
file_format: text, dataset: functional, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
|
||||
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
|
||||
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
|
||||
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record
|
||||
|
@@ -1,3 +0,0 @@
|
||||
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,functional,none,none
|
||||
seq,functional,none,none
|
||||
|
@@ -1,19 +0,0 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,functional,none,none
|
||||
seq,functional,none,none
|
||||
seq,functional,def,block
|
||||
seq,functional,def,record
|
||||
seq,functional,gzip,block
|
||||
seq,functional,gzip,record
|
||||
seq,functional,bzip,block
|
||||
seq,functional,bzip,record
|
||||
seq,functional,snap,block
|
||||
seq,functional,snap,record
|
||||
rc,functional,none,none
|
||||
rc,functional,def,block
|
||||
rc,functional,gzip,block
|
||||
rc,functional,bzip,block
|
||||
rc,functional,snap,block
|
||||
trevni,functional,none,none
|
||||
trevni,functional,def,block
|
||||
trevni,functional,snap,block
|
||||
|
@@ -1,8 +0,0 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,functional,none,none
|
||||
seq,functional,def,block
|
||||
rc,functional,gzip,block
|
||||
trevni,functional,snap,block
|
||||
trevni,functional,def,block
|
||||
rc,functional,bzip,block
|
||||
seq,functional,bzip,record
|
||||
|
@@ -1,9 +1,7 @@
|
||||
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,grep1gb,none,none
|
||||
text,grep10gb,none,none
|
||||
text,web,none,none
|
||||
seq,grep1gb,bzip,none
|
||||
seq,web,snap,record
|
||||
seq,web,none,none
|
||||
rc,grep1gb,def,block
|
||||
rc,web,none,none
|
||||
# Manually created file.
|
||||
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: none
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block
|
||||
|
||||
|
@@ -1,4 +1,4 @@
|
||||
file_format: text,seq,rc,trevni
|
||||
data_group: grep1gb,grep10gb,web
|
||||
dataset: hive-benchmark
|
||||
compression_codec: none,def,gzip,bzip,snap
|
||||
compression_type: none,block,record
|
||||
|
||||
|
@@ -1,55 +1,19 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,grep1gb,none,none
|
||||
text,grep10gb,none,none
|
||||
text,web,none,none
|
||||
seq,grep1gb,none,none
|
||||
seq,grep1gb,def,block
|
||||
seq,grep1gb,def,record
|
||||
seq,grep1gb,gzip,block
|
||||
seq,grep1gb,gzip,record
|
||||
seq,grep1gb,bzip,block
|
||||
seq,grep1gb,bzip,record
|
||||
seq,grep1gb,snap,block
|
||||
seq,grep1gb,snap,record
|
||||
seq,grep10gb,none,none
|
||||
seq,grep10gb,def,block
|
||||
seq,grep10gb,def,record
|
||||
seq,grep10gb,gzip,block
|
||||
seq,grep10gb,gzip,record
|
||||
seq,grep10gb,bzip,block
|
||||
seq,grep10gb,bzip,record
|
||||
seq,grep10gb,snap,block
|
||||
seq,grep10gb,snap,record
|
||||
seq,web,none,none
|
||||
seq,web,def,block
|
||||
seq,web,def,record
|
||||
seq,web,gzip,block
|
||||
seq,web,gzip,record
|
||||
seq,web,bzip,block
|
||||
seq,web,bzip,record
|
||||
seq,web,snap,block
|
||||
seq,web,snap,record
|
||||
rc,grep1gb,none,none
|
||||
rc,grep1gb,def,block
|
||||
rc,grep1gb,gzip,block
|
||||
rc,grep1gb,bzip,block
|
||||
rc,grep1gb,snap,block
|
||||
rc,grep10gb,none,none
|
||||
rc,grep10gb,def,block
|
||||
rc,grep10gb,gzip,block
|
||||
rc,grep10gb,bzip,block
|
||||
rc,grep10gb,snap,block
|
||||
rc,web,none,none
|
||||
rc,web,def,block
|
||||
rc,web,gzip,block
|
||||
rc,web,bzip,block
|
||||
rc,web,snap,block
|
||||
trevni,grep1gb,none,none
|
||||
trevni,grep1gb,def,block
|
||||
trevni,grep1gb,snap,block
|
||||
trevni,grep10gb,none,none
|
||||
trevni,grep10gb,def,block
|
||||
trevni,grep10gb,snap,block
|
||||
trevni,web,none,none
|
||||
trevni,web,def,block
|
||||
trevni,web,snap,block
|
||||
# Generated File.
|
||||
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: record
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: block
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: record
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: snap, compression_type: block
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block
|
||||
|
||||
|
@@ -1,10 +1,8 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,grep1gb,none,none
|
||||
seq,grep10gb,def,block
|
||||
rc,web,gzip,block
|
||||
trevni,web,snap,block
|
||||
trevni,grep10gb,none,none
|
||||
rc,grep1gb,bzip,block
|
||||
seq,web,none,none
|
||||
text,grep10gb,none,none
|
||||
text,web,none,none
|
||||
# Generated File.
|
||||
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block
|
||||
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
|
||||
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
|
||||
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record
|
||||
|
||||
|
27
testdata/workloads/hive-benchmark/queries/hive-benchmark.test
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
====
|
||||
select count(*) from grep1gb$TABLE
|
||||
====
|
||||
select count(field) from grep1gb$TABLE
|
||||
====
|
||||
select count(field) from grep1gb$TABLE where field like '%%xyz%%'
|
||||
====
|
||||
select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue
|
||||
from uservisits$TABLE uv join rankings$TABLE r on
|
||||
(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate
|
||||
< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1
|
||||
====
|
||||
select sourceIP, SUM(adRevenue) FROM uservisits$TABLE GROUP by sourceIP
|
||||
order by SUM(adRevenue) desc limit 10
|
||||
====
|
||||
select pageRank, pageURL from rankings$TABLE where pageRank > 10
|
||||
order by pageRank limit 100
|
||||
====
|
||||
select count(*) from rankings$TABLE where pageRank > 10 && pageRank < 25
|
||||
====
|
||||
select avg(adRevenue) from uservisits$TABLE
|
||||
====
|
||||
select avg(adRevenue) from uservisits$TABLE
|
||||
where visitdate > '1999-07-01' and visitdate < '1999-12-31'
|
||||
====
|
||||
select count(field) from grep10gb$TABLE where field like '%%xyz%%'
|
||||
====
|
||||
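In these .test files, $TABLE appears to be a placeholder the benchmark driver rewrites to the suffix of the table that matches the current file format/compression vector, and %% reads as an escaped literal % (consistent with the queries also passing through Python %-formatting). A rough sketch of that expansion; the suffix convention is an assumption:

# Sketch only: expand one benchmark query for a given test vector.
# The '_<format>_<codec>_<type>' suffix convention is an assumption.
query_template = "select count(field) from grep1gb$TABLE where field like '%%xyz%%'"

def expand(template, file_format, codec, ctype):
    suffix = '' if file_format == 'text' else '_%s_%s_%s' % (file_format, codec, ctype)
    return template.replace('$TABLE', suffix) % ()

print(expand(query_template, 'seq', 'def', 'block'))
# -> select count(field) from grep1gb_seq_def_block where field like '%xyz%'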
@@ -11,7 +11,7 @@ select
|
||||
round(avg(l_extendedprice), 1),
|
||||
round(avg(l_discount), 1), count(1)
|
||||
from
|
||||
lineitem$TABLE
|
||||
tpch.lineitem$TABLE
|
||||
where
|
||||
l_shipdate<='1998-09-02'
|
||||
group by
|
||||
@@ -11,12 +11,12 @@ select
|
||||
c_address,
|
||||
c_phone,
|
||||
c_comment
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (l.l_orderkey = o.o_orderkey)
|
||||
join customer$TABLE c
|
||||
join tpch.customer$TABLE c
|
||||
on (c.c_custkey = o.o_custkey)
|
||||
join nation$TABLE n
|
||||
join tpch.nation$TABLE n
|
||||
on (c.c_nationkey = n.n_nationkey)
|
||||
where
|
||||
o.o_orderdate >= '1993-10-01' and
|
||||
@@ -1,27 +1,27 @@
|
||||
# Q11 - Important Stock Identification
|
||||
# NOTE: Alan is re-writing part of this query set
|
||||
# so it is missing for now
|
||||
insert overwrite table q11_part_tmp$TABLE
|
||||
insert overwrite table tpch.q11_part_tmp$TABLE
|
||||
select ps_partkey, sum(ps_supplycost * ps_availqty) as part_value
|
||||
from nation$TABLE n
|
||||
join supplier$TABLE s
|
||||
from tpch.nation$TABLE n
|
||||
join tpch.supplier$TABLE s
|
||||
on s.s_nationkey = n.n_nationkey and n.n_name = 'GERMANY'
|
||||
join partsupp$TABLE ps
|
||||
join tpch.partsupp$TABLE ps
|
||||
on ps.ps_suppkey = s.s_suppkey
|
||||
group by ps_partkey
|
||||
---- SETUP
|
||||
RESET q11_sum_tmp$TABLE
|
||||
RELOAD q11_sum_tmp$TABLE
|
||||
RESET tpch.q11_sum_tmp$TABLE
|
||||
RELOAD tpch.q11_sum_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
29818
|
||||
====
|
||||
insert overwrite table q11_sum_tmp$TABLE
|
||||
insert overwrite table tpch.q11_sum_tmp$TABLE
|
||||
select sum(part_value) as total_value
|
||||
from q11_part_tmp$TABLE
|
||||
from tpch.q11_part_tmp$TABLE
|
||||
---- SETUP
|
||||
RESET q11_part_tmp$TABLE
|
||||
RELOAD q11_part_tmp$TABLE
|
||||
RESET tpch.q11_part_tmp$TABLE
|
||||
RELOAD tpch.q11_part_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
1
|
||||
@@ -15,8 +15,8 @@ sum(case
|
||||
else 0
|
||||
end
|
||||
) as low_line_count
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (o.o_orderkey = l.l_orderkey and
|
||||
l.l_commitdate < l.l_receiptdate and
|
||||
l.l_shipdate < l.l_commitdate)
|
||||
@@ -6,8 +6,8 @@ from
|
||||
( select
|
||||
c_custkey,
|
||||
count(o_orderkey) as c_count
|
||||
from orders$TABLE o
|
||||
right outer join customer$TABLE c
|
||||
from tpch.orders$TABLE o
|
||||
right outer join tpch.customer$TABLE c
|
||||
on (c.c_custkey = o.o_custkey and o.o_comment not like '%special%requests%')
|
||||
group by
|
||||
c_custkey
|
||||
@@ -4,8 +4,8 @@ round(100.00 * sum(case when p_type like 'PROMO%' then l_extendedprice*(1-l_disc
|
||||
else 0.0
|
||||
end
|
||||
) / sum(l_extendedprice * (1 - l_discount)), 5) as promo_revenue
|
||||
from lineitem$TABLE l
|
||||
join part$TABLE p
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.part$TABLE p
|
||||
on l.l_partkey = p.p_partkey and
|
||||
l.l_shipdate >= '1995-09-01' and
|
||||
l.l_shipdate < '1995-10-01'
|
||||
@@ -1,24 +1,24 @@
|
||||
# Q15 - Top Supplier Query
|
||||
insert overwrite table revenue$TABLE
|
||||
insert overwrite table tpch.revenue$TABLE
|
||||
select
|
||||
l_suppkey as supplier_no,
|
||||
sum(l_extendedprice * (1 - l_discount)) as total_revenue
|
||||
from lineitem$TABLE
|
||||
from tpch.lineitem$TABLE
|
||||
where l_shipdate >= '1996-01-01' and l_shipdate < '1996-04-01'
|
||||
group by l_suppkey
|
||||
---- SETUP
|
||||
RESET revenue$TABLE
|
||||
RELOAD revenue$TABLE
|
||||
RESET tpch.revenue$TABLE
|
||||
RELOAD tpch.revenue$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
10000
|
||||
====
|
||||
insert overwrite table max_revenue$TABLE
|
||||
insert overwrite table tpch.max_revenue$TABLE
|
||||
select max(total_revenue)
|
||||
from revenue$TABLE
|
||||
from tpch.revenue$TABLE
|
||||
---- SETUP
|
||||
RESET max_revenue$TABLE
|
||||
RELOAD max_revenue$TABLE
|
||||
RESET tpch.max_revenue$TABLE
|
||||
RELOAD tpch.max_revenue$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
1
|
||||
@@ -30,10 +30,10 @@ select
|
||||
s_address,
|
||||
s_phone,
|
||||
total_revenue
|
||||
from supplier$TABLE s
|
||||
join revenue$TABLE r
|
||||
from tpch.supplier$TABLE s
|
||||
join tpch.revenue$TABLE r
|
||||
on (s.s_suppkey = r.supplier_no)
|
||||
join max_revenue$TABLE m
|
||||
join tpch.max_revenue$TABLE m
|
||||
on (r.total_revenue = m.max_revenue)
|
||||
order by s_suppkey
|
||||
limit 100
|
||||
@@ -1,27 +1,27 @@
|
||||
# Q16 - Parts/Supplier Relation Query
|
||||
insert overwrite table supplier_tmp$TABLE
|
||||
insert overwrite table tpch.supplier_tmp$TABLE
|
||||
select s_suppkey
|
||||
from supplier$TABLE
|
||||
from tpch.supplier$TABLE
|
||||
where not s_comment like '%Customer%Complaints%'
|
||||
---- SETUP
|
||||
RESET supplier_tmp$TABLE
|
||||
RELOAD supplier_tmp$TABLE
|
||||
RESET tpch.supplier_tmp$TABLE
|
||||
RELOAD tpch.supplier_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
9996
|
||||
====
|
||||
insert overwrite table q16_tmp$TABLE
|
||||
insert overwrite table tpch.q16_tmp$TABLE
|
||||
select p_brand, p_type, p_size, ps_suppkey
|
||||
from partsupp$TABLE ps
|
||||
join part$TABLE p
|
||||
from tpch.partsupp$TABLE ps
|
||||
join tpch.part$TABLE p
|
||||
on p.p_partkey = ps.ps_partkey and
|
||||
p.p_brand <> 'Brand#45' and
|
||||
not p.p_type like 'MEDIUM POLISHED%'
|
||||
join supplier_tmp$TABLE s
|
||||
join tpch.supplier_tmp$TABLE s
|
||||
on ps.ps_suppkey = s.s_suppkey
|
||||
---- SETUP
|
||||
RESET q16_tmp$TABLE
|
||||
RELOAD q16_tmp$TABLE
|
||||
RESET tpch.q16_tmp$TABLE
|
||||
RELOAD tpch.q16_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
741971
|
||||
@@ -29,7 +29,7 @@ RELOAD q16_tmp$TABLE
|
||||
# Modifications: Added limit, removed 'DISTINCT' from count due to IMP-132
|
||||
select p_brand, p_type, p_size, count(ps_suppkey) as supplier_cnt
|
||||
from
|
||||
( select * from q16_tmp$TABLE
|
||||
( select * from tpch.q16_tmp$TABLE
|
||||
where p_size = 49 or p_size = 14 or
|
||||
p_size = 23 or p_size = 45 or
|
||||
p_size = 19 or p_size = 3 or
|
||||
@@ -1,11 +1,11 @@
|
||||
# Q17 - Small-Quantity-Order Revenue Query
|
||||
insert overwrite table lineitem_tmp$TABLE
|
||||
insert overwrite table tpch.lineitem_tmp$TABLE
|
||||
select l_partkey as t_partkey, 0.2 * avg(l_quantity) as t_avg_quantity
|
||||
from lineitem$TABLE
|
||||
from tpch.lineitem$TABLE
|
||||
group by l_partkey
|
||||
---- SETUP
|
||||
RESET lineitem_tmp$TABLE
|
||||
RELOAD lineitem_tmp$TABLE
|
||||
RESET tpch.lineitem_tmp$TABLE
|
||||
RELOAD tpch.lineitem_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
200000
|
||||
@@ -13,10 +13,10 @@ RELOAD lineitem_tmp$TABLE
|
||||
# Modifications: Converted selects from multiple tables to joins,
|
||||
# added round() call, removed subquery
|
||||
select round(sum(l_extendedprice) / 7.0, 5) as avg_yearly
|
||||
from lineitem$TABLE l
|
||||
join part$TABLE p
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.part$TABLE p
|
||||
on (p.p_partkey = l.l_partkey)
|
||||
join lineitem_tmp$TABLE lt
|
||||
join tpch.lineitem_tmp$TABLE lt
|
||||
on (lt.t_partkey = p.p_partkey)
|
||||
where
|
||||
p.p_brand = 'Brand#23' and
|
||||
@@ -1,11 +1,11 @@
|
||||
# Q18 - Large Value Customer Query
|
||||
insert overwrite table q18_tmp$TABLE
|
||||
insert overwrite table tpch.q18_tmp$TABLE
|
||||
select l_orderkey, sum(l_quantity) as t_sum_quantity
|
||||
from lineitem$TABLE
|
||||
from tpch.lineitem$TABLE
|
||||
group by l_orderkey
|
||||
---- SETUP
|
||||
RESET q18_tmp$TABLE
|
||||
RELOAD q18_tmp$TABLE
|
||||
RESET tpch.q18_tmp$TABLE
|
||||
RELOAD tpch.q18_tmp$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
1500000
|
||||
@@ -19,12 +19,12 @@ select
|
||||
o_orderdate,
|
||||
cast(o_totalprice as bigint) as total_price_bigint,
|
||||
round(sum(l_quantity), 5)
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (o.o_orderkey = l.l_orderkey)
|
||||
join customer$TABLE c
|
||||
join tpch.customer$TABLE c
|
||||
on (c.c_custkey = o.o_custkey)
|
||||
join q18_tmp$TABLE t
|
||||
join tpch.q18_tmp$TABLE t
|
||||
on (o.o_orderkey = t.l_orderkey and t.t_sum_quantity > 300)
|
||||
group by
|
||||
c_name,
|
||||
@@ -1,8 +1,8 @@
|
||||
# Q19 - Discounted Revenue Query
|
||||
# Modifications: Added round() calls
|
||||
select round(sum(l_extendedprice * (1 - l_discount) ), 5) as revenue
|
||||
from lineitem$TABLE l
|
||||
join part$TABLE p
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.part$TABLE p
|
||||
on p.p_partkey = l.l_partkey
|
||||
where
|
||||
(
|
||||
@@ -1,5 +1,5 @@
|
||||
# Q2 - Minimum Cost Supplier Query
|
||||
insert overwrite table q2_minimum_cost_supplier_tmp1$TABLE
|
||||
insert overwrite table tpch.q2_minimum_cost_supplier_tmp1$TABLE
|
||||
select
|
||||
s.s_acctbal,
|
||||
s.s_name,
|
||||
@@ -10,31 +10,31 @@ select
|
||||
s.s_address,
|
||||
s.s_phone,
|
||||
s.s_comment
|
||||
from partsupp$TABLE ps
|
||||
join part$TABLE p
|
||||
from tpch.partsupp$TABLE ps
|
||||
join tpch.part$TABLE p
|
||||
on (p.p_partkey = ps.ps_partkey and p.p_size = 15 and p.p_type like '%BRASS')
|
||||
join supplier$TABLE s
|
||||
join tpch.supplier$TABLE s
|
||||
on (s.s_suppkey = ps.ps_suppkey)
|
||||
join nation$TABLE n
|
||||
join tpch.nation$TABLE n
|
||||
on (s.s_nationkey = n.n_nationkey)
|
||||
join region$TABLE r
|
||||
join tpch.region$TABLE r
|
||||
on (n.n_regionkey = r.r_regionkey and r.r_name = 'EUROPE')
|
||||
---- SETUP
|
||||
RESET q2_minimum_cost_supplier_tmp1$TABLE
|
||||
RELOAD q2_minimum_cost_supplier_tmp1$TABLE
|
||||
RESET tpch.q2_minimum_cost_supplier_tmp1$TABLE
|
||||
RELOAD tpch.q2_minimum_cost_supplier_tmp1$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
642
|
||||
====
|
||||
insert overwrite table q2_minimum_cost_supplier_tmp2$TABLE
|
||||
insert overwrite table tpch.q2_minimum_cost_supplier_tmp2$TABLE
|
||||
select
|
||||
p_partkey,
|
||||
min(ps_supplycost)
|
||||
from q2_minimum_cost_supplier_tmp1$TABLE
|
||||
from tpch.q2_minimum_cost_supplier_tmp1$TABLE
|
||||
group by p_partkey
|
||||
---- SETUP
|
||||
RESET q2_minimum_cost_supplier_tmp2$TABLE
|
||||
RELOAD q2_minimum_cost_supplier_tmp2$TABLE
|
||||
RESET tpch.q2_minimum_cost_supplier_tmp2$TABLE
|
||||
RELOAD tpch.q2_minimum_cost_supplier_tmp2$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
460
|
||||
@@ -49,8 +49,8 @@ select
|
||||
t1.s_address,
|
||||
t1.s_phone,
|
||||
t1.s_comment
|
||||
from q2_minimum_cost_supplier_tmp1$TABLE t1
|
||||
join q2_minimum_cost_supplier_tmp2$TABLE t2
|
||||
from tpch.q2_minimum_cost_supplier_tmp1$TABLE t1
|
||||
join tpch.q2_minimum_cost_supplier_tmp2$TABLE t2
|
||||
on (t1.p_partkey = t2.p_partkey and t1.ps_supplycost = t2.ps_min_supplycost)
|
||||
order by
|
||||
s_acctbal desc,
|
||||
@@ -1,21 +1,21 @@
|
||||
# Q20 - Potential Part Promotion Query
|
||||
insert overwrite table q20_tmp1$TABLE
|
||||
insert overwrite table tpch.q20_tmp1$TABLE
|
||||
select distinct p_partkey
|
||||
from part$TABLE
|
||||
from tpch.part$TABLE
|
||||
where p_name like 'forest%'
|
||||
---- SETUP
|
||||
RESET q20_tmp1$TABLE
|
||||
RELOAD q20_tmp1$TABLE
|
||||
RESET tpch.q20_tmp1$TABLE
|
||||
RELOAD tpch.q20_tmp1$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
2127
|
||||
====
|
||||
insert overwrite table q20_tmp2$TABLE
|
||||
insert overwrite table tpch.q20_tmp2$TABLE
|
||||
select
|
||||
l_partkey,
|
||||
l_suppkey,
|
||||
0.5 * sum(l_quantity)
|
||||
from lineitem$TABLE
|
||||
from tpch.lineitem$TABLE
|
||||
where
|
||||
l_shipdate >= '1994-01-01' and
|
||||
l_shipdate < '1995-01-01'
|
||||
@@ -23,52 +23,52 @@ group by
|
||||
l_partkey,
|
||||
l_suppkey
|
||||
---- SETUP
|
||||
RESET q20_tmp2$TABLE
|
||||
RELOAD q20_tmp2$TABLE
|
||||
RESET tpch.q20_tmp2$TABLE
|
||||
RELOAD tpch.q20_tmp2$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
543210
|
||||
====
|
||||
insert overwrite table q20_tmp3$TABLE
|
||||
insert overwrite table tpch.q20_tmp3$TABLE
|
||||
select
|
||||
ps_suppkey,
|
||||
ps_availqty,
|
||||
sum_quantity
|
||||
from partsupp$TABLE ps
|
||||
join q20_tmp2$TABLE t2
|
||||
from tpch.partsupp$TABLE ps
|
||||
join tpch.q20_tmp2$TABLE t2
|
||||
on (ps.ps_partkey = t2.l_partkey and ps.ps_suppkey = t2.l_suppkey)
|
||||
join q20_tmp1$TABLE t1
|
||||
join tpch.q20_tmp1$TABLE t1
|
||||
on (ps.ps_partkey = t1.p_partkey)
|
||||
---- SETUP
|
||||
RESET q20_tmp3$TABLE
|
||||
RELOAD q20_tmp3$TABLE
|
||||
RESET tpch.q20_tmp3$TABLE
|
||||
RELOAD tpch.q20_tmp3$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
5843
|
||||
====
|
||||
# Modified to use subquery to work around IMP-127
|
||||
insert overwrite table q20_tmp4$TABLE
|
||||
insert overwrite table tpch.q20_tmp4$TABLE
|
||||
select a.ps_suppkey
|
||||
from (select
|
||||
ps_suppkey,
|
||||
count(1) from q20_tmp3$TABLE
|
||||
count(1) from tpch.q20_tmp3$TABLE
|
||||
where ps_availqty > sum_quantity
|
||||
group by ps_suppkey
|
||||
) a
|
||||
---- SETUP
|
||||
RESET q20_tmp4$TABLE
|
||||
RELOAD q20_tmp4$TABLE
|
||||
RESET tpch.q20_tmp4$TABLE
|
||||
RELOAD tpch.q20_tmp4$TABLE
|
||||
---- RESULTS
|
||||
====
|
||||
# Modifications: Added limit
|
||||
select
|
||||
s_name,
|
||||
s_address
|
||||
from supplier$TABLE s
|
||||
join nation$TABLE n
|
||||
from tpch.supplier$TABLE s
|
||||
join tpch.nation$TABLE n
|
||||
on (s.s_nationkey = n.n_nationkey and
|
||||
n.n_name = 'CANADA')
|
||||
join q20_tmp4$TABLE t4
|
||||
join tpch.q20_tmp4$TABLE t4
|
||||
on (s.s_suppkey = t4.ps_suppkey)
|
||||
order by
|
||||
s_name
|
||||
@@ -6,16 +6,16 @@
|
||||
select
|
||||
s_name,
|
||||
count(*) as numwait
|
||||
from lineitem$TABLE l1
|
||||
join supplier$TABLE s
|
||||
from tpch.lineitem$TABLE l1
|
||||
join tpch.supplier$TABLE s
|
||||
on (s.s_suppkey = l1.l_suppkey)
|
||||
join orders$TABLE o
|
||||
join tpch.orders$TABLE o
|
||||
on (o.o_orderkey = l1.l_orderkey)
|
||||
join nation$TABLE n
|
||||
join tpch.nation$TABLE n
|
||||
on (s.s_nationkey = n.n_nationkey)
|
||||
left semi join lineitem$TABLE l2
|
||||
left semi join tpch.lineitem$TABLE l2
|
||||
on (l2.l_orderkey = l1.l_orderkey)
|
||||
left outer join lineitem$TABLE l3
|
||||
left outer join tpch.lineitem$TABLE l3
|
||||
on (l3.l_orderkey = l1.l_orderkey and
|
||||
l3.l_receiptdate > l3.l_commitdate
|
||||
)
|
||||
@@ -3,11 +3,11 @@
|
||||
# a constant value ('C') so that we can do a join between this table
|
||||
# in the main query. This was needed because we only support equi-joins
|
||||
# and had to have a column to join on.
|
||||
insert overwrite table q22_customer_tmp1$TABLE
|
||||
insert overwrite table tpch.q22_customer_tmp1$TABLE
|
||||
select
|
||||
avg(c_acctbal) avg_acctbal,
|
||||
substr(c_name, 1, 1) as cust_name_char
|
||||
from customer$TABLE c
|
||||
from tpch.customer$TABLE c
|
||||
where
|
||||
c.c_acctbal > 0.00 and
|
||||
(substr(c.c_phone, 1, 2) = '13' or
|
||||
@@ -20,8 +20,8 @@ where
|
||||
group by
|
||||
substr(c_name, 1, 1)
|
||||
---- SETUP
|
||||
RESET q22_customer_tmp1$TABLE
|
||||
RELOAD q22_customer_tmp1$TABLE
|
||||
RESET tpch.q22_customer_tmp1$TABLE
|
||||
RELOAD tpch.q22_customer_tmp1$TABLE
|
||||
---- RESULTS
|
||||
---- NUMROWS
|
||||
1
|
||||
@@ -36,10 +36,10 @@ select
|
||||
substring(c_phone, 1, 2) as cntrycode,
|
||||
count(*) as numcust,
|
||||
round(sum(c_acctbal), 4) as totacctbal
|
||||
from customer$TABLE c
|
||||
join q22_customer_tmp1$TABLE ct
|
||||
from tpch.customer$TABLE c
|
||||
join tpch.q22_customer_tmp1$TABLE ct
|
||||
on (substr(c.c_name, 1, 1) = ct.cust_name_char)
|
||||
left outer join orders$TABLE o
|
||||
left outer join tpch.orders$TABLE o
|
||||
on (o.o_custkey = c.c_custkey)
|
||||
where
|
||||
o_custkey is null and
|
||||
@@ -5,10 +5,10 @@ select
|
||||
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue,
|
||||
o_orderdate,
|
||||
o_shippriority
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (l.l_orderkey = o.o_orderkey)
|
||||
join customer$TABLE c
|
||||
join tpch.customer$TABLE c
|
||||
on (c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey)
|
||||
where
|
||||
o_orderdate < '1995-03-15' and
|
||||
@@ -5,8 +5,8 @@
|
||||
select
|
||||
o_orderpriority,
|
||||
count(distinct l_orderkey) as order_count
|
||||
from lineitem$TABLE l
|
||||
inner join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
inner join tpch.orders$TABLE o
|
||||
on (o.o_orderkey = l.l_orderkey and
|
||||
l.l_commitdate < l.l_receiptdate)
|
||||
where
|
||||
@@ -4,16 +4,16 @@
|
||||
select
|
||||
n_name,
|
||||
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (l_orderkey = o_orderkey)
|
||||
join supplier$TABLE s
|
||||
join tpch.supplier$TABLE s
|
||||
on (l_suppkey = s_suppkey)
|
||||
join customer$TABLE
|
||||
join tpch.customer$TABLE
|
||||
on (c_nationkey = s_nationkey and c_custkey = o_custkey)
|
||||
join nation$TABLE
|
||||
join tpch.nation$TABLE
|
||||
on (s_nationkey = n_nationkey)
|
||||
join region$TABLE
|
||||
join tpch.region$TABLE
|
||||
on (n_regionkey = r_regionkey)
|
||||
where
|
||||
r_name = 'ASIA'
|
||||
@@ -1,7 +1,7 @@
|
||||
# Q6 - Forecasting Revenue Change Query
|
||||
# Modifications: Added round() call
|
||||
select round(sum(l_extendedprice * l_discount), 5) as revenue
|
||||
from lineitem$TABLE
|
||||
from tpch.lineitem$TABLE
|
||||
where l_shipdate >= '1994-01-01' and
|
||||
l_shipdate < '1995-01-01' and
|
||||
l_discount >= 0.05 and
|
||||
@@ -5,20 +5,20 @@ select
|
||||
year(o_orderdate) as o_year,
|
||||
round(sum(case when n2.n_name = 'BRAZIL' then l_extendedprice * (1 - l_discount)
|
||||
else 0 end) / sum(l_extendedprice * (1 - l_discount)), 5) as mkt_share
|
||||
from lineitem$TABLE l
|
||||
join orders$TABLE o
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.orders$TABLE o
|
||||
on (l_orderkey = o_orderkey)
|
||||
join part$TABLE p
|
||||
join tpch.part$TABLE p
|
||||
on (p_partkey = l_partkey)
|
||||
join supplier$TABLE s
|
||||
join tpch.supplier$TABLE s
|
||||
on (s_suppkey = l_suppkey)
|
||||
join customer$TABLE c
|
||||
join tpch.customer$TABLE c
|
||||
on (o_custkey = c_custkey)
|
||||
join nation$TABLE n1
|
||||
join tpch.nation$TABLE n1
|
||||
on (c_nationkey = n1.n_nationkey)
|
||||
join region$TABLE r
|
||||
join tpch.region$TABLE r
|
||||
on (n1.n_regionkey = r_regionkey)
|
||||
join nation$TABLE n2
|
||||
join tpch.nation$TABLE n2
|
||||
on (s_nationkey = n2.n_nationkey)
|
||||
where
|
||||
r_name = 'AMERICA' and
|
||||
@@ -6,16 +6,16 @@ select
|
||||
year(o.o_orderdate) as o_year,
|
||||
round(sum(l.l_extendedprice * (1 - l.l_discount) -
|
||||
ps.ps_supplycost * l.l_quantity), 1) as sum_profit
|
||||
from lineitem$TABLE l
|
||||
join part$TABLE p
|
||||
from tpch.lineitem$TABLE l
|
||||
join tpch.part$TABLE p
|
||||
on (p.p_partkey = l.l_partkey)
|
||||
join orders$TABLE o
|
||||
join tpch.orders$TABLE o
|
||||
on (o.o_orderkey = l.l_orderkey)
|
||||
join partsupp$TABLE ps
|
||||
join tpch.partsupp$TABLE ps
|
||||
on (ps.ps_suppkey = l.l_suppkey and ps.ps_partkey = l.l_partkey)
|
||||
join supplier$TABLE s
|
||||
join tpch.supplier$TABLE s
|
||||
on (s.s_suppkey = l.l_suppkey)
|
||||
join nation$TABLE n
|
||||
join tpch.nation$TABLE n
|
||||
on (s.s_nationkey = n.n_nationkey)
|
||||
where
|
||||
p.p_name like '%green%'
|
||||
4
testdata/workloads/tpch/tpch_core.csv
vendored
@@ -1,2 +1,2 @@
|
||||
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,tpch,none,none
|
||||
# Manually created file.
|
||||
file_format:text, dataset:tpch, compression_codec:none, compression_type:none
|
||||
|
||||
|
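The new-format vector files are self-describing: each non-comment line is a comma-separated list of key:value pairs instead of a fixed column order. A small sketch of how a driver might parse one (file path and function name are illustrative, not the real harness code):

# Sketch: read a new-style vector file into a list of dicts.
def read_vector_file(path):
    vectors = []
    for line in open(path):
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        pairs = (item.split(':', 1) for item in line.split(','))
        vectors.append(dict((key.strip(), value.strip()) for key, value in pairs))
    return vectors

for vector in read_vector_file('testdata/workloads/tpch/tpch_core.csv'):
    print(vector['file_format'] + ' / ' + vector['dataset'])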
2
testdata/workloads/tpch/tpch_dimensions.csv
vendored
@@ -1,4 +1,4 @@
|
||||
file_format: text,seq,rc,trevni
|
||||
data_group: tpch
|
||||
dataset: tpch
|
||||
compression_codec: none,def,gzip,bzip,snap
|
||||
compression_type: none,block,record
|
||||
|
||||
|
7
testdata/workloads/tpch/tpch_exhaustive.csv
vendored
@@ -1,2 +1,5 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,tpch,none,none
|
||||
# Generated File.
|
||||
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: tpch, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
|
||||
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block
|
||||
|
||||
|
6
testdata/workloads/tpch/tpch_pairwise.csv
vendored
@@ -1,2 +1,4 @@
|
||||
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
|
||||
text,tpch,none,none
|
||||
# Generated File.
|
||||
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
|
||||
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
|
||||
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block
|
||||
|
||||
|