Update benchmark tests to run against generic workloads, add data loading with scale factors, and more

This change updates the run-benchmark script so that it can target one or more
workloads. Benchmarks can now be run like:

./run-benchmark --workloads=hive-benchmark,tpch

We look up each workload in the workloads directory, read the associated query
.test files, and execute the queries they contain.
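
For reference, a minimal sketch of the .test file layout (the query shown is
illustrative): sections are separated by '====' and subsections by '----', and
the text before the first '----' in each section is the query to run, with
$TABLE standing in for the database/table-suffix substitution:

====
select count(*) from grep1gb$TABLE
----
====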

To ensure the queries are not duplicated between benchmark and query tests, I
moved all existing queries (under fe/src/test/resources/*) to the workloads
directory. You do NOT need to look through all the .test files; I've just moved
them. The one new file is 'hive-benchmark.test', which contains the hive
benchmark queries.

Also added support for generating schemas for different scale factors, as well
as executing against them. For example, say we have a dataset with a scale
factor called "SF3". We would first generate the schema using:

./generate_schema_statements --workload=<workload> --scale_factor="SF3"

This creates tables whose names are distinct from those of other scale factors.
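
For the tpch workload, for instance, the generated .sql begins with statements
along the lines of:

CREATE DATABASE IF NOT EXISTS tpchSF3;

and the LOAD statements pull data from
testdata/impala-data/tpch<scale_factor>/<table>/, so multiple scale factors can
coexist side by side.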

Run the generated .sql file to load the data. Alternatively, the data can be
loaded by running a new Python script:

./bin/load-data.py -w <workload1>,<workload2> -e <exploration strategy> -s [scale factor]

For example: ./bin/load-data.py -w tpch -e core -s SF3
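
Under the hood, load-data.py invokes generate-schema-statements.py for each
workload, runs the generated load-<workload>-<exploration strategy>-generated.sql
file through Hive, and then runs the generated
load-trevni-<workload>-<exploration strategy>-generated.sh script to load the
Trevni data via Impala.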

Then run the benchmark against this scale factor:
./run-benchmark --workloads=<workload> --scale_factor=SF3
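
With a scale factor set, the $TABLE substitution also prefixes the database
name, so (illustratively) a test query such as

select count(*) from lineitem$TABLE

runs as 'select count(*) from tpchSF3.lineitem' for the uncompressed text
vector, and picks up the corresponding table suffix for other file format and
compression combinations.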

This changeset also includes a few other minor tweaks to some of the test
scripts.

Change-Id: Ife8a8d91567d75c9612be37bec96c1e7780f50d6
Author: Lenni Kuff
Date: 2012-08-02 23:38:59 -07:00
Committed by: Henry Robinson
Parent: 81d54e85e5
Commit: 04edc8f534
90 changed files with 1707 additions and 1566 deletions

.gitignore

@@ -7,9 +7,8 @@ cscope.out
org.eclipse.jdt.core.prefs
benchmark_results.csv
reference_benchmark_results.csv
testdata/data/test-warehouse
testdata/bin/create-*-generated.sql
testdata/bin/load-*-generated.sql
load-trevni-*-generated.sh
load-*-generated.sql
pprof.out


@@ -1,11 +1,8 @@
#!/usr/bin/env bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
. "$bin"/impala-config.sh
set -e
set -u
echo "Copying data files from the share. If the file already exists locally, the files"\
"will not be copied. It's not check summing the files or anything like that, if"\


@@ -36,6 +36,8 @@ fi
export IMPALA_FE_DIR=$IMPALA_HOME/fe
export IMPALA_BE_DIR=$IMPALA_HOME/be
export IMPALA_WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
export IMPALA_DATASET_DIR=$IMPALA_HOME/testdata/datasets
export IMPALA_COMMON_DIR=$IMPALA_HOME/common
export PATH=$IMPALA_HOME/bin:$PATH

bin/load-data.py (new executable file, 130 lines)

@@ -0,0 +1,130 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to load the proper datasets for the specified workloads. It loads
# all data via Hive except for Trevni data which needs to be loaded via Impala.
import collections
import os
import re
import subprocess
import sys
import tempfile
import time
from itertools import product
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy for schema gen: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workloads", dest="workloads",
help="Comma-separated list of workloads to load data for. If 'all' is "\
"specified then data for all workloads is loaded.")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
default=False, help='Skips HDFS exists check and reloads all tables')
(options, args) = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
"--workload=%s --scale_factor=%s --verbose"
HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/hive')
HIVE_ARGS = "-hiveconf hive.root.logger=WARN,console -v"
def available_workloads(workload_dir):
return [subdir for subdir in os.listdir(workload_dir)
if os.path.isdir(os.path.join(workload_dir, subdir))]
def validate_workloads(all_workloads, workloads):
for workload in workloads:
if workload not in all_workloads:
print 'Workload \'%s\' not found in workload directory' % workload
print 'Available workloads: ' + ', '.join(all_workloads)
sys.exit(1)
def exec_hive_query_from_file(file_name):
hive_cmd = "%s %s -f %s" % (HIVE_CMD, HIVE_ARGS, file_name)
print 'Executing Hive Command: ' + hive_cmd
ret_val = subprocess.call(hive_cmd, shell = True)
if ret_val != 0:
print 'Error executing file from Hive: ' + file_name
sys.exit(ret_val)
def exec_bash_script(file_name):
bash_cmd = "bash %s" % file_name
print 'Executing Bash Command: ' + bash_cmd
ret_val = subprocess.call(bash_cmd, shell = True)
if ret_val != 0:
print 'Error executing bash script: ' + file_name
sys.exit(ret_val)
def generate_schema_statements(workload):
generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
options.scale_factor)
if options.force_reload:
generate_cmd += " --force_reload"
if options.hive_warehouse_dir is not None:
generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir
print 'Executing Generate Schema Command: ' + generate_cmd
ret_val = subprocess.call(os.path.join(TESTDATA_BIN_DIR, generate_cmd), shell = True)
if ret_val != 0:
print 'Error generating schema statements for workload: ' + workload
sys.exit(ret_val)
def get_dataset_for_workload(workload):
dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
'%s_dimensions.csv' % workload)
if not os.path.isfile(dimension_file_name):
print 'Dimension file not found: ' + dimension_file_name
sys.exit(1)
with open(dimension_file_name, 'rb') as input_file:
match = re.search('dataset:\s*(\w+)', input_file.read())
if match:
return match.group(1)
else:
print 'Dimension file does not contain dataset for workload \'%s\'' % (workload)
sys.exit(1)
all_workloads = available_workloads(WORKLOAD_DIR)
workloads = []
if options.workloads is None:
print "At least one workload name must be specified."
parser.print_help()
sys.exit(1)
elif options.workloads == 'all':
print 'Loading data for all workloads.'
workloads = all_workloads
else:
workloads = options.workloads.split(",")
validate_workloads(all_workloads, workloads)
print 'Starting data load for the following workloads: ' + ', '.join(workloads)
loading_time_map = collections.defaultdict(float)
for workload in workloads:
start_time = time.time()
dataset = get_dataset_for_workload(workload)
print "Dataset for workload '%s' is '%s'" % (workload, dataset)
dataset_dir = os.path.join(DATASET_DIR, dataset)
os.chdir(dataset_dir)
generate_schema_statements(workload)
exec_hive_query_from_file(os.path.join(dataset_dir,
'load-%s-%s-generated.sql' % (workload, options.exploration_strategy)))
exec_bash_script(os.path.join(dataset_dir,
'load-trevni-%s-%s-generated.sh' % (workload, options.exploration_strategy)))
loading_time_map[workload] = time.time() - start_time
total_time = 0.0
for workload, load_time in loading_time_map.iteritems():
total_time += load_time
print 'Data loading for workload \'%s\' completed in: %.2fs'\
% (workload, load_time)
print 'Total load time: %.2fs\n' % total_time


@@ -1,77 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script creates schema and loads data into hive for running benchmarks and
# other tests. Using this script requires passing in two parameters:
# The first is the data set type (benchmark, tpch). This will load the appropriate
# collection of data sets for the run type.
# The second is the exploration strategy. This determines the different combinations
# of file format, compression, etc that will be created and loaded. 'Core' defines
# a basic set of combinations. If 'pairwise' is specified the pairwise combinations
# of workload # + file format + compression will be loaded. If 'exhaustive' is
# passed as an argument the exhaustive set of combinations will be loaded.
# TODO: Rewrite this script in python and detect and load workloads by enumerating
# the workloads directory.
exploration_strategy=
data_set_type=
if [ $1 = "hive-benchmark" ]; then
data_set_type=$1
elif [ $1 = "functional" ]; then
data_set_type=$1
elif [ $1 = "tpch" ]; then
data_set_type=$1
elif [ $1 = "query-test" ]; then
data_set_type="tpch functional"
elif [ $1 = "all" ]; then
data_set_type="hive-benchmark tpch functional"
else
echo "Invalid run type: $1. Valid values are 'all, query-test,"\
"functional, tpch, hive-benchmark'"
exit 1
fi
if [ $2 = "core" -o $2 = "pairwise" -o $2 = "exhaustive" ]; then
exploration_strategy=$2
else
echo "Invalid exploration strategy: $2. Valid values are 'core, pairwise, exhaustive'"
exit 1
fi
bin=`dirname "$0"`
bin=`cd "$bin"; pwd`
. "$bin"/impala-config.sh
set -e
WORKLOAD_DIR=$IMPALA_HOME/testdata/workloads
DATASET_DIR=$IMPALA_HOME/testdata/datasets
BIN_DIR=$IMPALA_HOME/testdata/bin
function execute_hive_query_from_file {
hive_args="-hiveconf hive.root.logger=WARN,console -v -f"
"$HIVE_HOME/bin/hive" $hive_args $1
if [ $? != 0 ]; then
echo LOAD OF $1 FAILED
exit -1
fi
}
for ds in $data_set_type
do
SCRIPT_DIR=$DATASET_DIR/$ds
pushd $SCRIPT_DIR
$BIN_DIR/generate_schema_statements.py --exploration_strategy ${exploration_strategy}\
--workload=${ds} --verbose
execute_hive_query_from_file \
"$SCRIPT_DIR/load-${ds}-${exploration_strategy}-generated.sql"
bash $SCRIPT_DIR/load-trevni-${ds}-${exploration_strategy}-generated.sh
popd
done
# TODO: Temporarily disable block id generation for everything except benchmark runs
# due to IMP-134
if [ $1 = "hive-benchmark" ]; then
$IMPALA_HOME/testdata/bin/generate-block-ids.sh
fi


@@ -47,13 +47,14 @@ COLUMN_WIDTH = 18
TOTAL_WIDTH = 122 if options.verbose else 90
# These are the indexes in the input row for each column value
QUERY_IDX = 0
FILE_FORMAT_IDX = 1
COMPRESSION_IDX = 2
IMPALA_AVG_IDX = 3
IMPALA_STDDEV_IDX = 4
HIVE_AVG_IDX = 5
HIVE_STDDEV_IDX = 6
WORKLOAD_IDX = 0
QUERY_IDX = 1
FILE_FORMAT_IDX = 2
COMPRESSION_IDX = 3
IMPALA_AVG_IDX = 4
IMPALA_STDDEV_IDX = 5
HIVE_AVG_IDX = 6
HIVE_STDDEV_IDX = 7
# Formats a string so that is is wrapped across multiple lines with no single line
# being longer than the given width
@@ -91,7 +92,8 @@ def find_matching_row_in_reference_results(search_row, reference_results):
for row in reference_results:
if (row[QUERY_IDX] == search_row[QUERY_IDX] and
row[FILE_FORMAT_IDX] == search_row[FILE_FORMAT_IDX] and
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX]):
row[COMPRESSION_IDX] == search_row[COMPRESSION_IDX] and
row[WORKLOAD_IDX] == search_row[WORKLOAD_IDX]):
return row
return None
@@ -117,7 +119,7 @@ def print_table(results, verbose, reference_results = None):
print build_padded_row_string(table_header, COLUMN_WIDTH)
print "-" * TOTAL_WIDTH
for row in group:
full_row = row[1:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
full_row = row[2:] + [format_if_float(calculate_impala_hive_speedup(row)) + 'X']
if not verbose:
del full_row[HIVE_AVG_IDX - 1]
del full_row[HIVE_STDDEV_IDX - 2]
@@ -193,7 +195,6 @@ def read_csv_result_file(file_name):
results.append(row)
return results
reference_results = []
results = []
if os.path.isfile(options.result_file):


@@ -1,11 +1,10 @@
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script should be used to benchmark queries. It can either run in batch mode, in
# which case it will run the set of hive benchmark queries or to run a single query. In
# either case, it will first try to warm the buffer cache before running the query
# multiple times. There are command line options to control how many times to prerun the
# query for the buffer cache as well as the number of iterations.
# This script is used to run benchmark queries. It runs the set of queries specified in
# the given workload(s) under <workload name>/queries. This script will first try to warm
# the buffer cache before running the query. There is a command line option to control
# how many iterations to run each query.
#
# By default, the script will have minimal output. Verbose output can be turned on with
# the -v option which will output the normal query output. In addition, the -p option
@@ -15,7 +14,7 @@
# The script parses for output in the specific format in the regex below (result_regex).
# This is not very robust but probably okay for this script.
#
# The planservice needs to be running before this script.
# The planservice or ImpalaD needs to be running before executing any workload.
# Run with the --help option to see the arguments.
import collections
import csv
@@ -36,18 +35,17 @@ parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
default = False, help="If set, outputs all benchmark diagnostics.")
parser.add_option("--remote", dest="remote", action="store_true",
default = False, help="Set to true if running on remote cluster.")
parser.add_option("-q", "--query", dest="query", default = "",
help="Query to run. If none specified, runs all queries.")
parser.add_option("--iterations", dest="iterations", default="3",
help="Number of times to run the query. Only to be used with -q")
parser.add_option("--prime_cache", dest="prime_cache", default= True,
help="Whether or not to prime the buffer cache. Only to be "\
"used with -q")
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy to use for running benchmark: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("-w", "--workloads", dest="workloads", default="hive-benchmark",
help="The workload(s) to execute in a comma-separated list format."\
"Some valid workloads: 'hive-benchmark', 'tpch', ...")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="The dataset scale factor to run the workload against.")
parser.add_option("--query_cmd", dest="query_cmd",
default='build/release/service/runquery -profile_output_file=""',
default=os.path.join(os.environ['IMPALA_HOME'],
'be/build/release/service/runquery') + ' -profile_output_file=""',
help="The command to use for executing queries")
parser.add_option("--compare_with_hive", dest="compare_with_hive", action="store_true",
default= False, help="Run all queries using Hive as well as Impala")
@@ -56,12 +54,21 @@ parser.add_option("--results_csv_file", dest="results_csv_file",
help="The output file where benchmark results are saved")
parser.add_option("--hive_cmd", dest="hive_cmd", default="hive -e",
help="The command to use for executing hive queries")
parser.add_option("-i", "--iterations", dest="iterations", default="5",
help="Number of times to run each query.")
parser.add_option("--prime_cache", dest="prime_cache", default= True,
help="Whether or not to prime the buffer cache. Only to be "\
"used with -q")
(options, args) = parser.parse_args()
profile_output_file = 'build/release/service/profile.tmp'
gprof_cmd = 'google-pprof --text build/release/service/runquery %s | head -n 60'
prime_cache_cmd = os.environ['IMPALA_HOME'] + "/testdata/bin/cache_tables.py -q \"%s\""
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
profile_output_file = os.path.join(os.environ['IMPALA_HOME'],
'be/build/release/service/profile.tmp')
gprof_cmd = 'google-pprof --text ' + options.query_cmd + ' %s | head -n 60'
prime_cache_cmd = os.path.join(os.environ['IMPALA_HOME'],
"testdata/bin/cache_tables.py") + " -q \"%s\""
result_single_regex = 'returned (\d*) rows? in (\d*).(\d*) s'
result_multiple_regex = 'returned (\d*) rows? in (\d*).(\d*) s with stddev (\d*).(\d*)'
hive_result_regex = 'Time taken: (\d*).(\d*) seconds'
@@ -80,12 +87,14 @@ class QueryExecutionResult:
self.stddev = stddev
class QueryExecutionDetail:
def __init__(self, file_format, compression, impala_execution_result,
hive_execution_result):
def __init__(self, workload, file_format, compression_codec, compression_type,
impala_execution_result, hive_execution_result):
self.workload = workload
self.file_format = file_format
self.compression_codec = compression_codec
self.compression_type = compression_type
self.impala_execution_result = impala_execution_result
self.hive_execution_result = hive_execution_result
self.file_format = file_format
self.compression = compression
# Parse for the tables used in this query
def parse_tables(query):
@@ -246,8 +255,14 @@ def run_query(query, prime_buffer_cache, iterations):
execution_result = QueryExecutionResult(str(avg_time), str(stddev))
return [output, execution_result]
def choose_input_vector_file_name(exploration_strategy):
return "hive-benchmark_%s.csv" % exploration_strategy
def vector_file_name(workload, exploration_strategy):
return "%s_%s.csv" % (workload, exploration_strategy)
# Gets the name of the database to use for the specified workload and scale factor.
def database_name_to_use(workload, scale_factor):
if workload == 'tpch':
return '%s%s.' % (workload, scale_factor)
return ''
def build_table_suffix(file_format, codec, compression_type):
if file_format == 'text' and codec == 'none':
@@ -259,54 +274,27 @@ def build_table_suffix(file_format, codec, compression_type):
else:
return '_%s_%s' % (file_format, codec)
def build_query(query_format_string, exploration_strategy, data_set,
file_format, codec, compression_type):
def build_query(query_format_string, file_format, codec, compression_type,
workload, scale_factor):
database_name = database_name_to_use(workload, scale_factor)
table_suffix = build_table_suffix(file_format, codec, compression_type)
return query_format_string % {'table_suffix': table_suffix}
# $TABLE is used as a token for table suffix in the queries. Here we insert the proper
# database name based on the workload and query.
return re.sub('(\w+\.){0,1}(?P<table_name>\w+)\$TABLE', '%s%s%s' %\
(database_name, r'\g<table_name>', table_suffix), query_format_string)
def read_vector_file(file_name):
if not os.path.isfile(file_name):
print 'Cannot find vector file: ' + file_name
sys.exit(1)
vector_values = []
with open(file_name, 'rb') as vector_file:
return [line.strip().split(',')
for line in vector_file.readlines() if not line.startswith('#')]
os.chdir(os.environ['IMPALA_BE_DIR'])
# This table contains a hash of dataset -> [query, numbers of times to prime buffer cache,
# number of iterations]. Queries should be grouped by the data they touch. This
# eliminates the need for the buffer cache priming iterations.
# TODO: it would be good if this table also contained the expected numbers and
# automatically flag regressions. How do we reconcile the fact we are running on
# different machines?
queries = {'grep1gb': [
["select count(*) from grep1gb%(table_suffix)s", 1, 5],
["select count(field) from grep1gb%(table_suffix)s", 0, 5],
["select count(field) from grep1gb%(table_suffix)s where field like '%%xyz%%'", 0, 5]
],
'web': [
["select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue "\
"from uservisits%(table_suffix)s uv join rankings%(table_suffix)s r on "\
"(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate "\
"< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1", 1, 5],
["select sourceIP, SUM(adRevenue) FROM uservisits%(table_suffix)s GROUP by sourceIP "\
"order by SUM(adRevenue) desc limit 10", 1, 5],
["select pageRank, pageURL from rankings%(table_suffix)s where pageRank > 10 "\
"order by pageRank limit 100", 1, 5],
["select count(*) from rankings%(table_suffix)s where "\
"pageRank > 10 && pageRank < 25", 1, 5],
["select avg(adRevenue) from uservisits%(table_suffix)s", 1, 5],
["select avg(adRevenue) from uservisits%(table_suffix)s "\
"where visitdate > '1999-07-01' and visitdate < '1999-12-31'", 1, 5],
],
'grep10gb': [
["select count(field) from grep10gb%(table_suffix)s where field like '%%xyz%%'", 0, 1]
]
}
for line in vector_file.readlines():
if line.strip().startswith('#'):
continue
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
return vector_values
# Writes out results to a CSV file. Columns are delimited by '|' characters
def write_to_csv(result_map, output_csv_file):
@@ -316,51 +304,99 @@ def write_to_csv(result_map, output_csv_file):
for query, execution_results in result_map.iteritems():
for result in execution_results:
csv_writer.writerow([query, result.file_format, result.compression,
csv_writer.writerow([result.workload, query, result.file_format,
'%s/%s' % (result.compression_codec, result.compression_type),
result.impala_execution_result.avg_time,
result.impala_execution_result.stddev,
result.hive_execution_result.avg_time,
result.hive_execution_result.stddev])
result.hive_execution_result.stddev,
])
# Run all queries
if (len(options.query) == 0):
vector_file_path = os.path.join(
os.environ['IMPALA_HOME'], 'testdata/workloads/hive-benchmark/',
choose_input_vector_file_name(options.exploration_strategy))
# Recursively scans the given directory for all test query files
def enumerate_query_files(base_directory):
query_files = []
for item in os.listdir(base_directory):
full_path = os.path.join(base_directory, item)
if os.path.isfile(full_path) and item.endswith('.test'):
query_files.append(full_path)
elif os.path.isdir(full_path):
query_files += enumerate_query_files(full_path)
return query_files
vector = read_vector_file(vector_file_path)
output = ""
result_map = collections.defaultdict(list)
# Strips out comments and empty lines from the input query string
def strip_comments(query_string):
query = []
for line in query_string.split('\n'):
if not line or line.strip().startswith('#') or line.strip().startswith('//'):
continue
query.append(line)
return '\n'.join(query).strip()
for row in vector:
file_format, data_set, codec, compression_type = row[:4]
for query in queries[data_set]:
query_string = build_query(query[0], options.exploration_strategy, data_set,
file_format, codec, compression_type)
result = run_query(query_string, query[1], query[2])
# Enumerate all the query files for a workload and extract the actual query
# strings.
def extract_queries_from_test_files(workload):
workload_base_dir = os.path.join(WORKLOAD_DIR, workload)
if not os.path.isdir(workload_base_dir):
print "Workload '%s' not found at path '%s'" % (workload, workload_base_dir)
sys.exit(1)
query_dir = os.path.join(workload_base_dir, 'queries')
if not os.path.isdir(query_dir):
print "Workload query directory not found at path '%s'" % (query_dir)
queries = []
for query_file_name in enumerate_query_files(query_dir):
if options.verbose != 0:
print 'Parsing Query Test File: ' + query_file_name
with open(query_file_name, 'rb') as query_file:
# Query files are split into sections separated by '====', with subsections
# separated by '----'. The first item in each subsection is the actual query
# to execute.
for query_section in query_file.read().split("===="):
formatted_query = strip_comments(query_section.split("----")[0])
if formatted_query:
queries.append(formatted_query.strip())
return queries
result_map = collections.defaultdict(list)
output = ""
# For each workload specified, look up the associated query files. Extract valid
# queries in each file and execute them using the specified number of execution
# iterations. Finally, write results to an output CSV file for reporting.
for workload in options.workloads.split(','):
print 'Starting execution of workload: ' + workload
queries = extract_queries_from_test_files(workload)
vector_file_path = os.path.join(WORKLOAD_DIR, workload,
vector_file_name(workload,
options.exploration_strategy))
test_vector = read_vector_file(vector_file_path)
# Execute the queries for combinations of file format, compression, etc.
for row in test_vector:
file_format, data_group, codec, compression_type = row[:4]
print 'Test Vector Values: ' + ', '.join(row)
for query in queries:
query_string = build_query(query.strip(), file_format, codec, compression_type,
workload, options.scale_factor)
result = run_query(query_string, 1, int(options.iterations))
output += result[0]
print result[0]
execution_result = result[1]
hive_execution_result = QueryExecutionResult("N/A", "N/A")
if options.compare_with_hive:
hive_result = run_query_using_hive(query_string, query[1], query[2])
hive_result = run_query_using_hive(query_string, 1, int(options.iterations))
print "Hive Results:"
print hive_result[0]
hive_execution_result = hive_result[1]
if options.verbose != 0:
print "--------------------------------------------------------------------------"
print "------------------------------------------------------------------------"
execution_detail = QueryExecutionDetail(file_format, codec, execution_result,
execution_detail = QueryExecutionDetail(workload, file_format, codec,
compression_type, execution_result,
hive_execution_result)
result_map[query[0]].append(execution_detail)
result_map[query].append(execution_detail)
print "\nResults saving to: " + options.results_csv_file
write_to_csv(result_map, options.results_csv_file)
print output
# Run query from command line
else:
result = run_query(options.query, int(options.prime_cache),
int(options.iterations))
print result[1] or result[0]


@@ -1,4 +1,4 @@
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
package com.cloudera.impala.dataerror;
@@ -23,7 +23,7 @@ public class DataErrorsTest {
private static Catalog catalog;
private static Executor executor;
private static StringBuilder testErrorLog;
private final String testDir = "DataErrorsTest";
private final String testDir = "functional-query/queries/DataErrorsTest";
private static ArrayList<String> tableList;
@BeforeClass


@@ -1,4 +1,4 @@
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
package com.cloudera.impala.planner;
@@ -33,7 +33,7 @@ public class PlannerTest {
private static Catalog catalog;
private static AnalysisContext analysisCtxt;
private final String testDir = "PlannerTest";
private final String testDir = "functional-planner/queries/PlannerTest";
private final String outDir = "/tmp/PlannerTest/";
private final StringBuilder explainStringBuilder = new StringBuilder();


@@ -4,6 +4,7 @@ package com.cloudera.impala.service;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Set;
@@ -41,7 +42,7 @@ import com.google.common.collect.Sets;
*/
public abstract class BaseQueryTest {
private static final Logger LOG = Logger.getLogger(BaseQueryTest.class);
private static final String TEST_DIR = "QueryTest";
private static final String TEST_DIR = "functional-query/queries/QueryTest";
private static final int DEFAULT_FE_PORT = 21000;
// If set to true, new test results will be generated and saved to the specified
@@ -93,6 +94,9 @@ public abstract class BaseQueryTest {
protected final static TestExecMode EXECUTION_MODE = TestExecMode.valueOf(
System.getProperty("testExecutionMode", "reduced").toUpperCase());
// A relative path from the 'workloads' directory to the base test directory.
private final String testDirName;
/**
* The type of target test environments. Determines whether the front end is running
* in-process or out-of-process (ImpalaD).
@@ -166,6 +170,14 @@ public abstract class BaseQueryTest {
}
}
protected BaseQueryTest() {
this(TEST_DIR);
}
protected BaseQueryTest(String testDirName) {
this.testDirName = testDirName;
}
@BeforeClass
public static void setUp() throws Exception {
String impaladHostname = System.getProperty("impalad");
@@ -418,7 +430,7 @@ public abstract class BaseQueryTest {
private void runQueryWithTestConfigs(List<TestConfiguration> testConfigs,
String testFile, boolean abortOnError, int maxErrors) {
String fileName = TEST_DIR + "/" + testFile + ".test";
String fileName = new File(testDirName, testFile + ".test").getPath();
TestFileParser queryFileParser = new TestFileParser(fileName);
LOG.debug("Running the following configurations over file " + fileName + " : ");


@@ -6,6 +6,10 @@ import org.junit.Test;
public class TpchQueryTest extends BaseQueryTest {
public TpchQueryTest() {
super("tpch/queries");
}
@Test
public void TestTpchQ1() {
runTestInExecutionMode(EXECUTION_MODE, "tpch-q1", false, 1000);


@@ -4,8 +4,10 @@ package com.cloudera.impala.testutil;
import static org.junit.Assert.fail;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
@@ -183,7 +185,7 @@ public class TestFileParser {
private int lineNum = 0;
private final String fileName;
private InputStream stream;
private BufferedReader reader;
private Scanner scanner;
/**
@@ -210,9 +212,9 @@ public class TestFileParser {
*/
private void open(String table) {
try {
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
stream = classLoader.getResourceAsStream(fileName);
scanner = new Scanner(stream);
String fullPath = new File(TestFileUtils.getTestFileBaseDir(), fileName).getPath();
reader = new BufferedReader(new FileReader(fullPath));
scanner = new Scanner(reader);
} catch (Exception e) {
fail(e.getMessage());
}
@@ -293,16 +295,16 @@ public class TestFileParser {
}
private void close() {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
fail(e.getMessage());
}
}
if (scanner != null) {
scanner.close();
}
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
}
}
}
}


@@ -67,4 +67,11 @@ public class TestFileUtils {
fw.close();
}
}
/**
* Returns the base directory for test files.
*/
public static String getTestFileBaseDir() {
return new File(System.getenv("IMPALA_HOME"), "testdata/workloads").getPath();
}
}


@@ -6,22 +6,14 @@ if [ x${JAVA_HOME} == x ]; then
exit 1
fi
set -u
set -e
# Load the data set
pushd ${IMPALA_HOME}/bin
./load-data.sh functional exhaustive
if [ $? != 0 ]; then
echo LOAD OF FUNCTIONAL DATA FAILED
exit 1
fi
./load-data.sh tpch core
if [ $? != 0 ]; then
echo LOAD OF TPCH DATA FAILED
exit 1
fi
./load-data.py --workloads functional-query --exploration_strategy exhaustive
./load-data.py --workloads functional-planner --exploration_strategy exhaustive
./load-data.py --workloads tpch --exploration_strategy core
popd
# TODO: The multi-format table will move these files. So we need to copy them to a


@@ -34,21 +34,28 @@ from itertools import product
from optparse import OptionParser
parser = OptionParser()
parser.add_option("--exploration_strategy", dest="exploration_strategy", default="core",
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy", default="core",
help="The exploration strategy for schema gen: 'core', "\
"'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
default="/test-warehouse",
help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("--workload", dest="workload", default="functional",
parser.add_option("-w", "--workload", dest="workload",
help="The workload to generate schema for: tpch, hive-benchmark, ...")
parser.add_option("--force_reload", dest="force_reload", action="store_true",
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
default= False, help='Skips HDFS exists check and reloads all tables')
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
default = False, help="If set, outputs additional logging.")
(options, args) = parser.parse_args()
if options.workload is None:
print "A workload name must be specified."
parser.print_help()
sys.exit(1)
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
DATASET_DIR = os.environ['IMPALA_HOME'] + '/testdata/datasets'
@@ -58,8 +65,8 @@ COMPRESSION_CODEC =\
"SET mapred.output.compression.codec=org.apache.hadoop.io.compress.%s;"
SET_DYNAMIC_PARTITION_STATEMENT = "SET hive.exec.dynamic.partition=true;"
SET_PARTITION_MODE_NONSTRICT_STATEMENT = "SET hive.exec.dynamic.partition.mode=nonstrict;"
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;"\
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;"
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;\n"\
"SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;\n"
FILE_FORMAT_IDX = 0
DATA_SET_IDX = 1
@@ -99,10 +106,12 @@ class SqlGenerationStatement:
self.trevni = trevni.strip()
self.load_local = load_local.strip()
def build_create_statement(table_template, table_name, file_format, compression):
def build_create_statement(table_template, table_name, file_format,
compression, scale_factor):
create_statement = 'DROP TABLE IF EXISTS %s;\n' % table_name
create_statement += table_template % {'table_name': table_name,
'file_format': FILE_FORMAT_MAP[file_format] }
'file_format': FILE_FORMAT_MAP[file_format],
'scale_factor': scale_factor}
if file_format != 'trevni':
return create_statement
@@ -151,9 +160,10 @@ def build_insert(insert, table_name, base_table_name, codec, compression_type):
table_name) + "\n"
return output
def build_load_statement(load_template, table_name):
def build_load_statement(load_template, table_name, scale_factor):
tmp_load_template = load_template.replace(' % ', ' *** ')
return (tmp_load_template % {'table_name': table_name}).replace(' *** ', ' % ')
return (tmp_load_template % {'table_name': table_name,
'scale_factor': scale_factor}).replace(' *** ', ' % ')
def build_trevni(trevni_template, table_name, base_table_name):
return trevni_template % {'table_name': table_name, 'base_table_name': base_table_name}
@@ -171,10 +181,16 @@ def build_table_suffix(file_format, codec, compression_type):
else:
return '_%s_%s' % (file_format, codec)
# Vector files have the format:
# dimension_name1:value1, dimension_name2:value2, ...
def read_vector_file(file_name):
vector_values = []
with open(file_name, 'rb') as vector_file:
return [line.strip().split(',')
for line in vector_file.readlines() if not line.startswith('#')]
for line in vector_file.readlines():
if line.strip().startswith('#'):
continue
vector_values.append([value.split(':')[1].strip() for value in line.split(',')])
return vector_values
def write_array_to_file(file_name, array):
with open(file_name, 'w') as f:
@@ -207,32 +223,32 @@ def write_trevni_file(file_name, array):
# Kill off the plan service.
f.write("\nkill -9 $PID\n")
def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
def write_statements_to_file_based_on_input_vector(output_name, test_vectors,
statements):
output_create = []
output_load = []
output_load_base = []
output_trevni = []
results = read_vector_file(input_file_name)
existing_tables = list_hdfs_subdir_names(options.hive_warehouse_dir)
for row in results:
for row in test_vectors:
file_format, data_set, codec, compression_type = row[:4]
for s in statements[data_set.strip()]:
create = s.create
insert = s.insert
trevni = s.trevni
load_local = s.load_local
table_name = s.base_table_name +\
build_table_suffix(file_format, codec, compression_type)
base_table_name = s.base_table_name % {'scale_factor' : options.scale_factor}
table_name = base_table_name + \
build_table_suffix(file_format, codec, compression_type)
# HBase only supports text format and mixed format tables have formats defined.
# TODO: Implement a better way to tag a table as only being generated with a fixed
# set of file formats.
if (("hbase" in table_name or "mixedformat" in table_name) and
"text" not in file_format):
if ("hbase" in table_name and "text" not in file_format):
continue
output_create.append(build_create_statement(create, table_name, file_format, codec))
output_create.append(build_create_statement(create, table_name, file_format, codec,
options.scale_factor))
# If the directory already exists in HDFS, assume that data files already exist
# and skip loading the data. Otherwise, the data is generated using either an
@@ -242,21 +258,22 @@ def write_statements_to_file_based_on_input_vector(output_name, input_file_name,
print 'Path:', data_path, 'already exists in HDFS. Data loading can be skipped.'
else:
print 'Path:', data_path, 'does not exists in HDFS. Data file will be generated.'
if table_name == s.base_table_name:
if table_name == base_table_name:
if load_local:
output_load_base.append(build_load_statement(load_local, table_name))
output_load_base.append(build_load_statement(load_local, table_name,
options.scale_factor))
else:
print 'Empty base table load for %s. Skipping load generation' % table_name
elif file_format == 'trevni':
if trevni:
output_trevni.append(build_trevni(trevni, table_name, s.base_table_name))
output_trevni.append(build_trevni(trevni, table_name, base_table_name))
else:
print \
'Empty trevni load for table %s. Skipping insert generation' % table_name
else:
if insert:
output_load.append(build_insert(insert, table_name, s.base_table_name,
codec, compression_type))
output_load.append(build_insert(insert, table_name, base_table_name,
codec, compression_type))
else:
print 'Empty insert for table %s. Skipping insert generation' % table_name
@@ -285,22 +302,30 @@ if (options.exploration_strategy != 'core' and
print 'Invalid exploration strategy:', options.exploration_strategy
sys.exit(1)
schema_template_file = os.path.join(DATASET_DIR, options.workload,
'%s_schema_template.sql' % (options.workload))
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
'%s_%s.csv' % (options.workload,
options.exploration_strategy))
if not os.path.isfile(test_vector_file):
print 'Vector file not found: ' + test_vector_file
sys.exit(1)
test_vectors = read_vector_file(test_vector_file)
if len(test_vectors) == 0:
print 'No test vectors found in file: ' + test_vector_file
sys.exit(1)
target_dataset = test_vectors[0][DATA_SET_IDX]
print 'Target Dataset: ' + target_dataset
schema_template_file = os.path.join(DATASET_DIR, target_dataset,
'%s_schema_template.sql' % target_dataset)
if not os.path.isfile(schema_template_file):
print 'Schema file not found: ' + schema_template_file
sys.exit(1)
test_vector_file = os.path.join(WORKLOAD_DIR, options.workload,
'%s_%s.csv' % (options.workload,
options.exploration_strategy))
if not os.path.isfile(schema_template_file):
print 'Vector file not found: ' + schema_template_file
sys.exit(1)
statements = parse_benchmark_file(schema_template_file)
write_statements_to_file_based_on_input_vector(
'%s-%s' % (options.workload, options.exploration_strategy),
test_vector_file, statements)
test_vectors, statements)


@@ -35,23 +35,28 @@ import metacomm.combinatorics.all_pairs2
all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2
parser = OptionParser()
parser.add_option("--dimension_file", dest="dimension_file",
default = "hive-benchmark_dimensions.csv",
help="The file containing the list of dimensions.")
parser.add_option("--workload", dest="workload", default = "hive-benchmark",
parser.add_option("-w", "--workload", dest="workload",
help="The workload to generate test vectors for")
(options, args) = parser.parse_args()
WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
if options.workload is None:
print "A workload name must be specified."
parser.print_help()
sys.exit(1)
FILE_FORMAT_IDX = 0
DATA_SET_IDX = 1
COMPRESSION_IDX = 2
COMPRESSION_TYPE_IDX = 3
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
KNOWN_DIMENSION_NAMES = ['file_format', 'data_group', 'compression_codec',
# This array also defines the order of the dimension values. This ordering
# is important because it is used to apply constraints. Add new items to the
# end of the list.
KNOWN_DIMENSION_NAMES = ['file_format', 'dataset', 'compression_codec',
'compression_type']
FILE_FORMAT_IDX = KNOWN_DIMENSION_NAMES.index('file_format')
DATASET_IDX = KNOWN_DIMENSION_NAMES.index('dataset')
COMPRESSION_IDX = KNOWN_DIMENSION_NAMES.index('compression_codec')
COMPRESSION_TYPE_IDX = KNOWN_DIMENSION_NAMES.index('compression_type')
class VectorGenerator:
def __init__(self, input_vectors):
self.input_vectors = input_vectors
@@ -76,13 +81,15 @@ def is_valid_combination(vector):
(vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
(vector[FILE_FORMAT_IDX] == 'trevni' and
(vector[COMPRESSION_IDX] == 'gzip' or vector[COMPRESSION_IDX] == 'bzip')) or
(vector[DATA_SET_IDX] == 'tpch' and vector[FILE_FORMAT_IDX] != 'text'))
(vector[DATASET_IDX] == 'tpch' and
(vector[FILE_FORMAT_IDX] != 'text' and vector[FILE_FORMAT_IDX] != 'trevni')))
# The pairwise generator may call this with different vector lengths. In that case this
# should always return true.
return True
# Vector files have the format: <dimension name>: value1, value2, ...
# Vector files have the format: <dimension name>: value1, value2, ... this function
# adds all specified dimensions to a map of dimension name-to-value
def read_dimension_file(file_name):
dimension_map = collections.defaultdict(list)
with open(file_name, 'rb') as input_file:
@@ -98,16 +105,18 @@ def read_dimension_file(file_name):
print 'Unknown dimension name: ' + values[0]
print 'Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES)
sys.exit(1)
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
dimension_map[values[0]] = [val.strip() for val in values[1].split(',')]
return dimension_map
def write_vectors_to_csv(output_dir, output_file, matrix):
output_text = "# Generated File. The vector value order is: file format, data_group, "\
"compression codec, compression type"
output_text = "# Generated File."
for row in matrix:
output_text += '\n' + ','.join(row)
row = ['%s: %s' % (KNOWN_DIMENSION_NAMES[i], row[i]) for i in range(0, len(row))]
output_text += '\n' + ', '.join(row)
with open(os.path.join(output_dir, output_file), 'wb') as output_file:
output_path = os.path.join(output_dir, output_file)
print 'Writing test vectors to: ' + output_path
with open(output_path, 'wb') as output_file:
output_file.write(output_text)
output_file.write('\n')
@@ -120,11 +129,10 @@ if not os.path.isfile(dimension_file):
print 'Reading dimension file: ' + dimension_file
vector_map = read_dimension_file(dimension_file)
vectors = []
# This ordering matters! We need to know the order to apply the proper constraints.
vectors.append(vector_map['file_format'])
vectors.append(vector_map['data_group'])
vectors.append(vector_map['compression_codec'])
vectors.append(vector_map['compression_type'])
for dimension_name in KNOWN_DIMENSION_NAMES:
vectors.append(vector_map[dimension_name])
vg = VectorGenerator(vectors)
output_dir = os.path.join(WORKLOAD_DIR, options.workload)


@@ -24,7 +24,7 @@
# ---- <- End sub-section
# LOAD from LOCAL - How to load data for the the base table
====
grep1gb
hive-benchmark
----
grep1gb
----
@@ -49,7 +49,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-000
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep1GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
====
grep10gb
hive-benchmark
----
grep10gb
----
@@ -75,7 +75,7 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00004' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=4);
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/grep10GB/part-00005' OVERWRITE INTO TABLE %(table_name)s PARTITION(chunk=5);
====
web
hive-benchmark
----
rankings
----
@@ -94,7 +94,7 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/html1GB/Rankings.dat' OVERWRITE INTO TABLE %(table_name)s;
====
web
hive-benchmark
----
uservisits
----


@@ -1,10 +1,11 @@
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
# For details on this file format please see benchmark_schema_template.sql
# For details on this file format please see hive-benchmark_schema_template.sql
====
tpch
----
lineitem
tpch%(scale_factor)s.lineitem
----
CREATE DATABASE IF NOT EXISTS tpch%(scale_factor)s;
CREATE EXTERNAL TABLE %(table_name)s (
L_ORDERKEY INT,
L_PARTKEY INT,
@@ -24,7 +25,7 @@ L_SHIPMODE STRING,
L_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -32,12 +33,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/lineitem.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/lineitem/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
part
tpch%(scale_factor)s.part
----
CREATE EXTERNAL TABLE %(table_name)s (
P_PARTKEY INT,
@@ -52,7 +53,7 @@ P_RETAILPRICE DOUBLE,
P_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -60,12 +61,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/part.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/part/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
partsupp
tpch%(scale_factor)s.partsupp
----
CREATE EXTERNAL TABLE %(table_name)s (
PS_PARTKEY INT,
@@ -75,7 +76,7 @@ PS_SUPPLYCOST DOUBLE,
PS_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -83,12 +84,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/partsupp.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/partsupp/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
supplier
tpch%(scale_factor)s.supplier
----
CREATE EXTERNAL TABLE %(table_name)s (
S_SUPPKEY INT,
@@ -100,7 +101,7 @@ S_ACCTBAL DOUBLE,
S_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -108,12 +109,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/supplier.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/supplier/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
nation
tpch%(scale_factor)s.nation
----
CREATE EXTERNAL TABLE %(table_name)s (
N_NATIONKEY INT,
@@ -122,7 +123,7 @@ N_REGIONKEY INT,
N_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -130,12 +131,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/nation.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/nation/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
region
tpch%(scale_factor)s.region
----
CREATE EXTERNAL TABLE %(table_name)s (
R_REGIONKEY INT,
@@ -143,7 +144,7 @@ R_NAME STRING,
R_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -151,12 +152,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/region.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/region/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
orders
tpch%(scale_factor)s.orders
----
CREATE EXTERNAL TABLE %(table_name)s (
O_ORDERKEY INT,
@@ -170,7 +171,7 @@ O_SHIPPRIORITY INT,
O_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -178,12 +179,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/orders.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/orders/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
customer
tpch%(scale_factor)s.customer
----
CREATE EXTERNAL TABLE %(table_name)s (
C_CUSTKEY INT,
@@ -196,7 +197,7 @@ C_MKTSEGMENT STRING,
C_COMMENT STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
FROM %(base_table_name)s INSERT OVERWRITE TABLE %(table_name)s SELECT *;
----
@@ -204,12 +205,12 @@ ${IMPALA_HOME}/bin/run-query.sh --query=" \
INSERT OVERWRITE TABLE %(table_name)s \
select * FROM %(base_table_name)s"
----
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch/customer.tbl'
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/impala-data/tpch%(scale_factor)s/customer/'
OVERWRITE INTO TABLE %(table_name)s;
====
tpch
----
q2_minimum_cost_supplier_tmp1
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (
s_acctbal double,
@@ -222,27 +223,27 @@ s_address string,
s_phone string,
s_comment string)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q2_minimum_cost_supplier_tmp2
tpch%(scale_factor)s.q2_minimum_cost_supplier_tmp2
----
CREATE EXTERNAL TABLE %(table_name)s (
p_partkey int,
ps_min_supplycost double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q7_volume_shipping_tmp
tpch%(scale_factor)s.q7_volume_shipping_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
supp_nation string,
@@ -250,73 +251,73 @@ cust_nation string,
s_nationkey int,
c_nationkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q11_part_tmp
tpch%(scale_factor)s.q11_part_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
ps_partkey int,
part_value double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q11_sum_tmp
tpch%(scale_factor)s.q11_sum_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (total_value double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
revenue
tpch%(scale_factor)s.revenue
----
CREATE EXTERNAL TABLE %(table_name)s (
supplier_no int,
total_revenue double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
max_revenue
tpch%(scale_factor)s.max_revenue
----
CREATE EXTERNAL TABLE %(table_name)s (max_revenue double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
supplier_tmp
tpch%(scale_factor)s.supplier_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (s_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q16_tmp
tpch%(scale_factor)s.q16_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
p_brand string,
@@ -324,94 +325,94 @@ p_type string,
p_size int,
ps_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
lineitem_tmp
tpch%(scale_factor)s.lineitem_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
t_partkey int,
t_avg_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q18_tmp
tpch%(scale_factor)s.q18_tmp
----
CREATE EXTERNAL TABLE %(table_name)s (
l_orderkey int,
t_sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp1
tpch%(scale_factor)s.q20_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (p_partkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp2
tpch%(scale_factor)s.q20_tmp2
----
CREATE EXTERNAL TABLE %(table_name)s (
l_partkey int,
l_suppkey int,
sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp3
tpch%(scale_factor)s.q20_tmp3
----
CREATE EXTERNAL TABLE %(table_name)s (
ps_suppkey int,
ps_availqty int,
sum_quantity double)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q20_tmp4
tpch%(scale_factor)s.q20_tmp4
----
CREATE EXTERNAL TABLE %(table_name)s (ps_suppkey int)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----
====
tpch
----
q22_customer_tmp1
tpch%(scale_factor)s.q22_customer_tmp1
----
CREATE EXTERNAL TABLE %(table_name)s (avg_acctbal double, cust_name_char string)
STORED AS %(file_format)s
LOCATION '/test-warehouse/%(table_name)s';
LOCATION '${hiveconf:hive.metastore.warehouse.dir}/%(table_name)s';
----
----
----


@@ -0,0 +1,2 @@
# Manually Created File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,4 @@
file_format: text
dataset: functional
compression_codec: none
compression_type: none


@@ -0,0 +1,2 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,2 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none


@@ -0,0 +1,3 @@
# Manually created file.
file_format:text, dataset:functional, compression_codec:none, compression_type:none
file_format:seq, dataset:functional, compression_codec:none, compression_type:none

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: functional
dataset: functional
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -0,0 +1,19 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
file_format: seq, dataset: functional, compression_codec: def, compression_type: record
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: gzip, compression_type: record
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record
file_format: seq, dataset: functional, compression_codec: snap, compression_type: block
file_format: seq, dataset: functional, compression_codec: snap, compression_type: record
file_format: rc, dataset: functional, compression_codec: none, compression_type: none
file_format: rc, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
file_format: rc, dataset: functional, compression_codec: snap, compression_type: block
file_format: trevni, dataset: functional, compression_codec: none, compression_type: none
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
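Each line in these vector files is a comma-separated list of key: value pairs describing one (file format, dataset, compression codec, compression type) combination. A short sketch of how such a line could be parsed into a dict; the function name and parsing logic here are assumptions about the consuming test framework, not its actual code:

# Hypothetical parser for a single test vector line like the ones above.
def parse_vector_line(line):
    line = line.strip()
    if not line or line.startswith('#'):
        return None
    # "file_format: seq, dataset: functional, compression_codec: def, compression_type: block"
    return dict(
        (key.strip(), value.strip())
        for key, value in (field.split(':', 1) for field in line.split(','))
    )

print(parse_vector_line(
    "file_format: seq, dataset: functional, compression_codec: def, compression_type: block"))
# -> {'file_format': 'seq', 'dataset': 'functional',
#     'compression_codec': 'def', 'compression_type': 'block'}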

View File

@@ -0,0 +1,8 @@
# Generated File.
file_format: text, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: gzip, compression_type: block
file_format: trevni, dataset: functional, compression_codec: snap, compression_type: block
file_format: trevni, dataset: functional, compression_codec: def, compression_type: block
file_format: rc, dataset: functional, compression_codec: bzip, compression_type: block
file_format: seq, dataset: functional, compression_codec: bzip, compression_type: record

View File

@@ -1,3 +0,0 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,none,none

View File

@@ -1,19 +0,0 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,none,none
seq,functional,def,block
seq,functional,def,record
seq,functional,gzip,block
seq,functional,gzip,record
seq,functional,bzip,block
seq,functional,bzip,record
seq,functional,snap,block
seq,functional,snap,record
rc,functional,none,none
rc,functional,def,block
rc,functional,gzip,block
rc,functional,bzip,block
rc,functional,snap,block
trevni,functional,none,none
trevni,functional,def,block
trevni,functional,snap,block

View File

@@ -1,8 +0,0 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,functional,none,none
seq,functional,def,block
rc,functional,gzip,block
trevni,functional,snap,block
trevni,functional,def,block
rc,functional,bzip,block
seq,functional,bzip,record

View File

@@ -1,9 +1,7 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
text,grep10gb,none,none
text,web,none,none
seq,grep1gb,bzip,none
seq,web,snap,record
seq,web,none,none
rc,grep1gb,def,block
rc,web,none,none
# Manually created file.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: grep1gb,grep10gb,web
dataset: hive-benchmark
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -1,55 +1,19 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
text,grep10gb,none,none
text,web,none,none
seq,grep1gb,none,none
seq,grep1gb,def,block
seq,grep1gb,def,record
seq,grep1gb,gzip,block
seq,grep1gb,gzip,record
seq,grep1gb,bzip,block
seq,grep1gb,bzip,record
seq,grep1gb,snap,block
seq,grep1gb,snap,record
seq,grep10gb,none,none
seq,grep10gb,def,block
seq,grep10gb,def,record
seq,grep10gb,gzip,block
seq,grep10gb,gzip,record
seq,grep10gb,bzip,block
seq,grep10gb,bzip,record
seq,grep10gb,snap,block
seq,grep10gb,snap,record
seq,web,none,none
seq,web,def,block
seq,web,def,record
seq,web,gzip,block
seq,web,gzip,record
seq,web,bzip,block
seq,web,bzip,record
seq,web,snap,block
seq,web,snap,record
rc,grep1gb,none,none
rc,grep1gb,def,block
rc,grep1gb,gzip,block
rc,grep1gb,bzip,block
rc,grep1gb,snap,block
rc,grep10gb,none,none
rc,grep10gb,def,block
rc,grep10gb,gzip,block
rc,grep10gb,bzip,block
rc,grep10gb,snap,block
rc,web,none,none
rc,web,def,block
rc,web,gzip,block
rc,web,bzip,block
rc,web,snap,block
trevni,grep1gb,none,none
trevni,grep1gb,def,block
trevni,grep1gb,snap,block
trevni,grep10gb,none,none
trevni,grep10gb,def,block
trevni,grep10gb,snap,block
trevni,web,none,none
trevni,web,def,block
trevni,web,snap,block
# Generated File.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: gzip, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: snap, compression_type: record
file_format: rc, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: rc, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block

View File

@@ -1,10 +1,8 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,grep1gb,none,none
seq,grep10gb,def,block
rc,web,gzip,block
trevni,web,snap,block
trevni,grep10gb,none,none
rc,grep1gb,bzip,block
seq,web,none,none
text,grep10gb,none,none
text,web,none,none
# Generated File.
file_format: text, dataset: hive-benchmark, compression_codec: none, compression_type: none
file_format: seq, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: gzip, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: snap, compression_type: block
file_format: trevni, dataset: hive-benchmark, compression_codec: def, compression_type: block
file_format: rc, dataset: hive-benchmark, compression_codec: bzip, compression_type: block
file_format: seq, dataset: hive-benchmark, compression_codec: bzip, compression_type: record

View File

@@ -0,0 +1,27 @@
====
select count(*) from grep1gb$TABLE
====
select count(field) from grep1gb$TABLE
====
select count(field) from grep1gb$TABLE where field like '%%xyz%%'
====
select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue
from uservisits$TABLE uv join rankings$TABLE r on
(r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate
< '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1
====
select sourceIP, SUM(adRevenue) FROM uservisits$TABLE GROUP by sourceIP
order by SUM(adRevenue) desc limit 10
====
select pageRank, pageURL from rankings$TABLE where pageRank > 10
order by pageRank limit 100
====
select count(*) from rankings$TABLE where pageRank > 10 && pageRank < 25
====
select avg(adRevenue) from uservisits$TABLE
====
select avg(adRevenue) from uservisits$TABLE
where visitdate > '1999-07-01' and visitdate < '1999-12-31'
====
select count(field) from grep10gb$TABLE where field like '%%xyz%%'
====
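The $TABLE token in these benchmark queries is filled in at run time based on the file format and compression being tested. A hedged sketch of that substitution follows; the empty suffix for plain text and the _format_codec_type naming are assumptions for illustration, not run-benchmark's actual convention:

# Hypothetical illustration of $TABLE substitution in the .test queries.
def expand_query(query, file_format, codec, compression_type):
    if file_format == 'text' and codec == 'none':
        suffix = ''  # assume base text tables keep their plain names
    else:
        suffix = '_%s_%s_%s' % (file_format, codec, compression_type)
    return query.replace('$TABLE', suffix)

print(expand_query("select count(field) from grep1gb$TABLE", 'seq', 'def', 'block'))
# -> select count(field) from grep1gb_seq_def_block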

View File

@@ -11,7 +11,7 @@ select
round(avg(l_extendedprice), 1),
round(avg(l_discount), 1), count(1)
from
lineitem$TABLE
tpch.lineitem$TABLE
where
l_shipdate<='1998-09-02'
group by

View File

@@ -11,12 +11,12 @@ select
c_address,
c_phone,
c_comment
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l.l_orderkey = o.o_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (c.c_nationkey = n.n_nationkey)
where
o.o_orderdate >= '1993-10-01' and

View File

@@ -1,27 +1,27 @@
# Q11 - Important Stock Identification
# NOTE: Alan is re-writing part of this query set
# so it is missing for now
insert overwrite table q11_part_tmp$TABLE
insert overwrite table tpch.q11_part_tmp$TABLE
select ps_partkey, sum(ps_supplycost * ps_availqty) as part_value
from nation$TABLE n
join supplier$TABLE s
from tpch.nation$TABLE n
join tpch.supplier$TABLE s
on s.s_nationkey = n.n_nationkey and n.n_name = 'GERMANY'
join partsupp$TABLE ps
join tpch.partsupp$TABLE ps
on ps.ps_suppkey = s.s_suppkey
group by ps_partkey
---- SETUP
RESET q11_sum_tmp$TABLE
RELOAD q11_sum_tmp$TABLE
RESET tpch.q11_sum_tmp$TABLE
RELOAD tpch.q11_sum_tmp$TABLE
---- RESULTS
---- NUMROWS
29818
====
insert overwrite table q11_sum_tmp$TABLE
insert overwrite table tpch.q11_sum_tmp$TABLE
select sum(part_value) as total_value
from q11_part_tmp$TABLE
from tpch.q11_part_tmp$TABLE
---- SETUP
RESET q11_part_tmp$TABLE
RELOAD q11_part_tmp$TABLE
RESET tpch.q11_part_tmp$TABLE
RELOAD tpch.q11_part_tmp$TABLE
---- RESULTS
---- NUMROWS
1

View File

@@ -15,8 +15,8 @@ sum(case
else 0
end
) as low_line_count
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey and
l.l_commitdate < l.l_receiptdate and
l.l_shipdate < l.l_commitdate)

View File

@@ -6,8 +6,8 @@ from
( select
c_custkey,
count(o_orderkey) as c_count
from orders$TABLE o
right outer join customer$TABLE c
from tpch.orders$TABLE o
right outer join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey and o.o_comment not like '%special%requests%')
group by
c_custkey

View File

@@ -4,8 +4,8 @@ round(100.00 * sum(case when p_type like 'PROMO%' then l_extendedprice*(1-l_disc
else 0.0
end
) / sum(l_extendedprice * (1 - l_discount)), 5) as promo_revenue
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on l.l_partkey = p.p_partkey and
l.l_shipdate >= '1995-09-01' and
l.l_shipdate < '1995-10-01'

View File

@@ -1,24 +1,24 @@
# Q15 - Top Supplier Query
insert overwrite table revenue$TABLE
insert overwrite table tpch.revenue$TABLE
select
l_suppkey as supplier_no,
sum(l_extendedprice * (1 - l_discount)) as total_revenue
from lineitem$TABLE
from tpch.lineitem$TABLE
where l_shipdate >= '1996-01-01' and l_shipdate < '1996-04-01'
group by l_suppkey
---- SETUP
RESET revenue$TABLE
RELOAD revenue$TABLE
RESET tpch.revenue$TABLE
RELOAD tpch.revenue$TABLE
---- RESULTS
---- NUMROWS
10000
====
insert overwrite table max_revenue$TABLE
insert overwrite table tpch.max_revenue$TABLE
select max(total_revenue)
from revenue$TABLE
from tpch.revenue$TABLE
---- SETUP
RESET max_revenue$TABLE
RELOAD max_revenue$TABLE
RESET tpch.max_revenue$TABLE
RELOAD tpch.max_revenue$TABLE
---- RESULTS
---- NUMROWS
1
@@ -30,10 +30,10 @@ select
s_address,
s_phone,
total_revenue
from supplier$TABLE s
join revenue$TABLE r
from tpch.supplier$TABLE s
join tpch.revenue$TABLE r
on (s.s_suppkey = r.supplier_no)
join max_revenue$TABLE m
join tpch.max_revenue$TABLE m
on (r.total_revenue = m.max_revenue)
order by s_suppkey
limit 100

View File

@@ -1,27 +1,27 @@
# Q16 - Parts/Supplier Relation Query
insert overwrite table supplier_tmp$TABLE
insert overwrite table tpch.supplier_tmp$TABLE
select s_suppkey
from supplier$TABLE
from tpch.supplier$TABLE
where not s_comment like '%Customer%Complaints%'
---- SETUP
RESET supplier_tmp$TABLE
RELOAD supplier_tmp$TABLE
RESET tpch.supplier_tmp$TABLE
RELOAD tpch.supplier_tmp$TABLE
---- RESULTS
---- NUMROWS
9996
====
insert overwrite table q16_tmp$TABLE
insert overwrite table tpch.q16_tmp$TABLE
select p_brand, p_type, p_size, ps_suppkey
from partsupp$TABLE ps
join part$TABLE p
from tpch.partsupp$TABLE ps
join tpch.part$TABLE p
on p.p_partkey = ps.ps_partkey and
p.p_brand <> 'Brand#45' and
not p.p_type like 'MEDIUM POLISHED%'
join supplier_tmp$TABLE s
join tpch.supplier_tmp$TABLE s
on ps.ps_suppkey = s.s_suppkey
---- SETUP
RESET q16_tmp$TABLE
RELOAD q16_tmp$TABLE
RESET tpch.q16_tmp$TABLE
RELOAD tpch.q16_tmp$TABLE
---- RESULTS
---- NUMROWS
741971
@@ -29,7 +29,7 @@ RELOAD q16_tmp$TABLE
# Modifications: Added limit, removed 'DISTINCT' from count due to IMP-132
select p_brand, p_type, p_size, count(ps_suppkey) as supplier_cnt
from
( select * from q16_tmp$TABLE
( select * from tpch.q16_tmp$TABLE
where p_size = 49 or p_size = 14 or
p_size = 23 or p_size = 45 or
p_size = 19 or p_size = 3 or

View File

@@ -1,11 +1,11 @@
# Q17 - Small-Quantity-Order Revenue Query
insert overwrite table lineitem_tmp$TABLE
insert overwrite table tpch.lineitem_tmp$TABLE
select l_partkey as t_partkey, 0.2 * avg(l_quantity) as t_avg_quantity
from lineitem$TABLE
from tpch.lineitem$TABLE
group by l_partkey
---- SETUP
RESET lineitem_tmp$TABLE
RELOAD lineitem_tmp$TABLE
RESET tpch.lineitem_tmp$TABLE
RELOAD tpch.lineitem_tmp$TABLE
---- RESULTS
---- NUMROWS
200000
@@ -13,10 +13,10 @@ RELOAD lineitem_tmp$TABLE
# Modifications: Converted selects from multiple tables to joins,
# added round() call, removed subquery
select round(sum(l_extendedprice) / 7.0, 5) as avg_yearly
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on (p.p_partkey = l.l_partkey)
join lineitem_tmp$TABLE lt
join tpch.lineitem_tmp$TABLE lt
on (lt.t_partkey = p.p_partkey)
where
p.p_brand = 'Brand#23' and

View File

@@ -1,11 +1,11 @@
# Q18 - Large Value Customer Query
insert overwrite table q18_tmp$TABLE
insert overwrite table tpch.q18_tmp$TABLE
select l_orderkey, sum(l_quantity) as t_sum_quantity
from lineitem$TABLE
from tpch.lineitem$TABLE
group by l_orderkey
---- SETUP
RESET q18_tmp$TABLE
RELOAD q18_tmp$TABLE
RESET tpch.q18_tmp$TABLE
RELOAD tpch.q18_tmp$TABLE
---- RESULTS
---- NUMROWS
1500000
@@ -19,12 +19,12 @@ select
o_orderdate,
cast(o_totalprice as bigint) as total_price_bigint,
round(sum(l_quantity), 5)
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_custkey = o.o_custkey)
join q18_tmp$TABLE t
join tpch.q18_tmp$TABLE t
on (o.o_orderkey = t.l_orderkey and t.t_sum_quantity > 300)
group by
c_name,

View File

@@ -1,8 +1,8 @@
# Q19 - Discounted Revenue Query
# Modifications: Added round() calls
select round(sum(l_extendedprice * (1 - l_discount) ), 5) as revenue
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on p.p_partkey = l.l_partkey
where
(

View File

@@ -1,5 +1,5 @@
# Q2 - Minimum Cost Supplier Query
insert overwrite table q2_minimum_cost_supplier_tmp1$TABLE
insert overwrite table tpch.q2_minimum_cost_supplier_tmp1$TABLE
select
s.s_acctbal,
s.s_name,
@@ -10,31 +10,31 @@ select
s.s_address,
s.s_phone,
s.s_comment
from partsupp$TABLE ps
join part$TABLE p
from tpch.partsupp$TABLE ps
join tpch.part$TABLE p
on (p.p_partkey = ps.ps_partkey and p.p_size = 15 and p.p_type like '%BRASS')
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s.s_suppkey = ps.ps_suppkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
join region$TABLE r
join tpch.region$TABLE r
on (n.n_regionkey = r.r_regionkey and r.r_name = 'EUROPE')
---- SETUP
RESET q2_minimum_cost_supplier_tmp1$TABLE
RELOAD q2_minimum_cost_supplier_tmp1$TABLE
RESET tpch.q2_minimum_cost_supplier_tmp1$TABLE
RELOAD tpch.q2_minimum_cost_supplier_tmp1$TABLE
---- RESULTS
---- NUMROWS
642
====
insert overwrite table q2_minimum_cost_supplier_tmp2$TABLE
insert overwrite table tpch.q2_minimum_cost_supplier_tmp2$TABLE
select
p_partkey,
min(ps_supplycost)
from q2_minimum_cost_supplier_tmp1$TABLE
from tpch.q2_minimum_cost_supplier_tmp1$TABLE
group by p_partkey
---- SETUP
RESET q2_minimum_cost_supplier_tmp2$TABLE
RELOAD q2_minimum_cost_supplier_tmp2$TABLE
RESET tpch.q2_minimum_cost_supplier_tmp2$TABLE
RELOAD tpch.q2_minimum_cost_supplier_tmp2$TABLE
---- RESULTS
---- NUMROWS
460
@@ -49,8 +49,8 @@ select
t1.s_address,
t1.s_phone,
t1.s_comment
from q2_minimum_cost_supplier_tmp1$TABLE t1
join q2_minimum_cost_supplier_tmp2$TABLE t2
from tpch.q2_minimum_cost_supplier_tmp1$TABLE t1
join tpch.q2_minimum_cost_supplier_tmp2$TABLE t2
on (t1.p_partkey = t2.p_partkey and t1.ps_supplycost = t2.ps_min_supplycost)
order by
s_acctbal desc,

View File

@@ -1,21 +1,21 @@
# Q20 - Potential Part Promotion Query
insert overwrite table q20_tmp1$TABLE
insert overwrite table tpch.q20_tmp1$TABLE
select distinct p_partkey
from part$TABLE
from tpch.part$TABLE
where p_name like 'forest%'
---- SETUP
RESET q20_tmp1$TABLE
RELOAD q20_tmp1$TABLE
RESET tpch.q20_tmp1$TABLE
RELOAD tpch.q20_tmp1$TABLE
---- RESULTS
---- NUMROWS
2127
====
insert overwrite table q20_tmp2$TABLE
insert overwrite table tpch.q20_tmp2$TABLE
select
l_partkey,
l_suppkey,
0.5 * sum(l_quantity)
from lineitem$TABLE
from tpch.lineitem$TABLE
where
l_shipdate >= '1994-01-01' and
l_shipdate < '1995-01-01'
@@ -23,52 +23,52 @@ group by
l_partkey,
l_suppkey
---- SETUP
RESET q20_tmp2$TABLE
RELOAD q20_tmp2$TABLE
RESET tpch.q20_tmp2$TABLE
RELOAD tpch.q20_tmp2$TABLE
---- RESULTS
---- NUMROWS
543210
====
insert overwrite table q20_tmp3$TABLE
insert overwrite table tpch.q20_tmp3$TABLE
select
ps_suppkey,
ps_availqty,
sum_quantity
from partsupp$TABLE ps
join q20_tmp2$TABLE t2
from tpch.partsupp$TABLE ps
join tpch.q20_tmp2$TABLE t2
on (ps.ps_partkey = t2.l_partkey and ps.ps_suppkey = t2.l_suppkey)
join q20_tmp1$TABLE t1
join tpch.q20_tmp1$TABLE t1
on (ps.ps_partkey = t1.p_partkey)
---- SETUP
RESET q20_tmp3$TABLE
RELOAD q20_tmp3$TABLE
RESET tpch.q20_tmp3$TABLE
RELOAD tpch.q20_tmp3$TABLE
---- RESULTS
---- NUMROWS
5843
====
# Modified to use subquery to work around IMP-127
insert overwrite table q20_tmp4$TABLE
insert overwrite table tpch.q20_tmp4$TABLE
select a.ps_suppkey
from (select
ps_suppkey,
count(1) from q20_tmp3$TABLE
count(1) from tpch.q20_tmp3$TABLE
where ps_availqty > sum_quantity
group by ps_suppkey
) a
---- SETUP
RESET q20_tmp4$TABLE
RELOAD q20_tmp4$TABLE
RESET tpch.q20_tmp4$TABLE
RELOAD tpch.q20_tmp4$TABLE
---- RESULTS
====
# Modifications: Added limit
select
s_name,
s_address
from supplier$TABLE s
join nation$TABLE n
from tpch.supplier$TABLE s
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey and
n.n_name = 'CANADA')
join q20_tmp4$TABLE t4
join tpch.q20_tmp4$TABLE t4
on (s.s_suppkey = t4.ps_suppkey)
order by
s_name

View File

@@ -6,16 +6,16 @@
select
s_name,
count(*) as numwait
from lineitem$TABLE l1
join supplier$TABLE s
from tpch.lineitem$TABLE l1
join tpch.supplier$TABLE s
on (s.s_suppkey = l1.l_suppkey)
join orders$TABLE o
join tpch.orders$TABLE o
on (o.o_orderkey = l1.l_orderkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
left semi join lineitem$TABLE l2
left semi join tpch.lineitem$TABLE l2
on (l2.l_orderkey = l1.l_orderkey)
left outer join lineitem$TABLE l3
left outer join tpch.lineitem$TABLE l3
on (l3.l_orderkey = l1.l_orderkey and
l3.l_receiptdate > l3.l_commitdate
)

View File

@@ -3,11 +3,11 @@
# a constant value ('C') so that we can do a join between this table
# in the main query. This was needed because we only support equi-joins
# and had to have a column to join on.
insert overwrite table q22_customer_tmp1$TABLE
insert overwrite table tpch.q22_customer_tmp1$TABLE
select
avg(c_acctbal) avg_acctbal,
substr(c_name, 1, 1) as cust_name_char
from customer$TABLE c
from tpch.customer$TABLE c
where
c.c_acctbal > 0.00 and
(substr(c.c_phone, 1, 2) = '13' or
@@ -20,8 +20,8 @@ where
group by
substr(c_name, 1, 1)
---- SETUP
RESET q22_customer_tmp1$TABLE
RELOAD q22_customer_tmp1$TABLE
RESET tpch.q22_customer_tmp1$TABLE
RELOAD tpch.q22_customer_tmp1$TABLE
---- RESULTS
---- NUMROWS
1
@@ -36,10 +36,10 @@ select
substring(c_phone, 1, 2) as cntrycode,
count(*) as numcust,
round(sum(c_acctbal), 4) as totacctbal
from customer$TABLE c
join q22_customer_tmp1$TABLE ct
from tpch.customer$TABLE c
join tpch.q22_customer_tmp1$TABLE ct
on (substr(c.c_name, 1, 1) = ct.cust_name_char)
left outer join orders$TABLE o
left outer join tpch.orders$TABLE o
on (o.o_custkey = c.c_custkey)
where
o_custkey is null and

View File

@@ -5,10 +5,10 @@ select
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue,
o_orderdate,
o_shippriority
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l.l_orderkey = o.o_orderkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (c.c_mktsegment = 'BUILDING' and c.c_custkey = o.o_custkey)
where
o_orderdate < '1995-03-15' and

View File

@@ -5,8 +5,8 @@
select
o_orderpriority,
count(distinct l_orderkey) as order_count
from lineitem$TABLE l
inner join orders$TABLE o
from tpch.lineitem$TABLE l
inner join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey and
l.l_commitdate < l.l_receiptdate)
where

View File

@@ -4,16 +4,16 @@
select
n_name,
round(sum(l_extendedprice * (1 - l_discount)), 5) as revenue
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l_orderkey = o_orderkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (l_suppkey = s_suppkey)
join customer$TABLE
join tpch.customer$TABLE
on (c_nationkey = s_nationkey and c_custkey = o_custkey)
join nation$TABLE
join tpch.nation$TABLE
on (s_nationkey = n_nationkey)
join region$TABLE
join tpch.region$TABLE
on (n_regionkey = r_regionkey)
where
r_name = 'ASIA'

View File

@@ -1,7 +1,7 @@
# Q6 - Forecasting Revenue Change Query
# Modifications: Added round() call
select round(sum(l_extendedprice * l_discount), 5) as revenue
from lineitem$TABLE
from tpch.lineitem$TABLE
where l_shipdate >= '1994-01-01' and
l_shipdate < '1995-01-01' and
l_discount >= 0.05 and

View File

@@ -5,20 +5,20 @@ select
year(o_orderdate) as o_year,
round(sum(case when n2.n_name = 'BRAZIL' then l_extendedprice * (1 - l_discount)
else 0 end) / sum(l_extendedprice * (1 - l_discount)), 5) as mkt_share
from lineitem$TABLE l
join orders$TABLE o
from tpch.lineitem$TABLE l
join tpch.orders$TABLE o
on (l_orderkey = o_orderkey)
join part$TABLE p
join tpch.part$TABLE p
on (p_partkey = l_partkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s_suppkey = l_suppkey)
join customer$TABLE c
join tpch.customer$TABLE c
on (o_custkey = c_custkey)
join nation$TABLE n1
join tpch.nation$TABLE n1
on (c_nationkey = n1.n_nationkey)
join region$TABLE r
join tpch.region$TABLE r
on (n1.n_regionkey = r_regionkey)
join nation$TABLE n2
join tpch.nation$TABLE n2
on (s_nationkey = n2.n_nationkey)
where
r_name = 'AMERICA' and

View File

@@ -6,16 +6,16 @@ select
year(o.o_orderdate) as o_year,
round(sum(l.l_extendedprice * (1 - l.l_discount) -
ps.ps_supplycost * l.l_quantity), 1) as sum_profit
from lineitem$TABLE l
join part$TABLE p
from tpch.lineitem$TABLE l
join tpch.part$TABLE p
on (p.p_partkey = l.l_partkey)
join orders$TABLE o
join tpch.orders$TABLE o
on (o.o_orderkey = l.l_orderkey)
join partsupp$TABLE ps
join tpch.partsupp$TABLE ps
on (ps.ps_suppkey = l.l_suppkey and ps.ps_partkey = l.l_partkey)
join supplier$TABLE s
join tpch.supplier$TABLE s
on (s.s_suppkey = l.l_suppkey)
join nation$TABLE n
join tpch.nation$TABLE n
on (s.s_nationkey = n.n_nationkey)
where
p.p_name like '%green%'

View File

@@ -1,2 +1,2 @@
# Manually created file. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Manually created file.
file_format:text, dataset:tpch, compression_codec:none, compression_type:none

View File

@@ -1,4 +1,4 @@
file_format: text,seq,rc,trevni
data_group: tpch
dataset: tpch
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

View File

@@ -1,2 +1,5 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block

View File

@@ -1,2 +1,4 @@
# Generated File. The vector value order is: file format, data_group, compression codec, compression type
text,tpch,none,none
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: trevni, dataset: tpch, compression_codec: def, compression_type: block
file_format: trevni, dataset: tpch, compression_codec: snap, compression_type: block