Mirror of https://github.com/apache/impala.git, synced 2026-01-02 12:00:33 -05:00
This change includes a number of improvements to the test data loading framework:
* Named sections for schema template definitions
* Removal of unneeded sections from schema template definitions (e.g., ANALYZE TABLE)
* More granular data loading via table name filters
* Improved robustness in detecting failed data loads
* Table-level constraints for specific file formats
* Rewritten compute stats script
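For example, granular loading of a couple of tables from a single workload might look roughly like this (the script name load-data.py and the tpch/lineitem/orders names are illustrative assumptions, not taken from this change):

  ./load-data.py --workloads=tpch --exploration_strategy=core --table_names=lineitem,orders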
153 lines
6.4 KiB
Python
Executable File
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script is used to load the proper datasets for the specified workloads. It loads
# all data via Hive except for Trevni data which needs to be loaded via Impala.
import collections
import os
import re
import subprocess
import sys
import tempfile
import time
from itertools import product
from optparse import OptionParser
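
# Command line options controlling which workloads and tables are loaded and how the
# schema statements are generated.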
parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy",
                  default="core",
                  help="The exploration strategy for schema gen: 'core', "\
                  "'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
                  help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workloads", dest="workloads",
                  help="Comma-separated list of workloads to load data for. If 'all' is "\
                  "specified then data for all workloads is loaded.")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
                  help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
                  default=False, help='Skips HDFS exists check and reloads all tables')
parser.add_option("--compute_stats", dest="compute_stats", action="store_true",
                  default=False, help="Execute COMPUTE STATISTICS statements on the "\
                  "tables that are loaded")
parser.add_option("--impalad", dest="impala_shell_args", default="localhost:21000",
                  help="Impala daemon to connect to")
parser.add_option("--table_names", dest="table_names", default=None,
                  help="Only load the specified tables - specified as a comma-separated "\
                  "list of base table names")
(options, args) = parser.parse_args()
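
# Paths and command templates built from environment variables that are expected to be
# set (IMPALA_WORKLOAD_DIR, IMPALA_DATASET_DIR, IMPALA_HOME, HIVE_HOME).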
WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
DATASET_DIR = os.environ['IMPALA_DATASET_DIR']
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')

GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
                      "--workload=%s --scale_factor=%s --verbose"
HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/hive')
HIVE_ARGS = "-hiveconf hive.root.logger=WARN,console -v"

IMPALA_SHELL_CMD = os.path.join(os.environ['IMPALA_HOME'], 'bin/impala-shell.sh')

def available_workloads(workload_dir):
  return [subdir for subdir in os.listdir(workload_dir)
          if os.path.isdir(os.path.join(workload_dir, subdir))]

def validate_workloads(all_workloads, workloads):
  for workload in workloads:
    if workload not in all_workloads:
      print 'Workload \'%s\' not found in workload directory' % workload
      print 'Available workloads: ' + ', '.join(all_workloads)
      sys.exit(1)

def exec_hive_query_from_file(file_name):
  hive_cmd = "%s %s -f %s" % (HIVE_CMD, HIVE_ARGS, file_name)
  print 'Executing Hive Command: ' + hive_cmd
  ret_val = subprocess.call(hive_cmd, shell=True)
  if ret_val != 0:
    print 'Error executing file from Hive: ' + file_name
    sys.exit(ret_val)

def exec_impala_query_from_file(file_name):
  impala_cmd = "%s --impalad=%s -f %s" %\
      (IMPALA_SHELL_CMD, options.impala_shell_args, file_name)
  print 'Executing Impala Command: ' + impala_cmd
  ret_val = subprocess.call(impala_cmd, shell=True)
  if ret_val != 0:
    print 'Error executing file from Impala: ' + file_name
    sys.exit(ret_val)

def exec_bash_script(file_name):
  bash_cmd = "bash %s" % file_name
  print 'Executing Bash Command: ' + bash_cmd
  ret_val = subprocess.call(bash_cmd, shell=True)
  if ret_val != 0:
    print 'Error executing bash script: ' + file_name
    sys.exit(ret_val)
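
# Runs generate-schema-statements.py for the given workload, forwarding the table name
# filter, force reload flag, and Hive warehouse directory when they are set.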
def generate_schema_statements(workload):
  generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
                                        options.scale_factor)
  if options.table_names:
    generate_cmd += " --table_names=%s" % options.table_names
  if options.force_reload:
    generate_cmd += " --force_reload"
  if options.hive_warehouse_dir is not None:
    generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir
  print 'Executing Generate Schema Command: ' + generate_cmd
  ret_val = subprocess.call(os.path.join(TESTDATA_BIN_DIR, generate_cmd), shell=True)
  if ret_val != 0:
    print 'Error generating schema statements for workload: ' + workload
    sys.exit(ret_val)

def get_dataset_for_workload(workload):
  dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
                                     '%s_dimensions.csv' % workload)
  if not os.path.isfile(dimension_file_name):
    print 'Dimension file not found: ' + dimension_file_name
    sys.exit(1)
  with open(dimension_file_name, 'rb') as input_file:
    match = re.search('dataset:\s*([\w\-\.]+)', input_file.read())
    if match:
      return match.group(1)
    else:
      print 'Dimension file does not contain dataset for workload \'%s\'' % (workload)
      sys.exit(1)
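
# Main driver: for each requested workload, generate its schema statements, run the
# generated Hive load script, then run the generated Impala script for Trevni tables.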
if __name__ == "__main__":
  all_workloads = available_workloads(WORKLOAD_DIR)
  workloads = []
  if options.workloads is None:
    print "At least one workload name must be specified."
    parser.print_help()
    sys.exit(1)
  elif options.workloads == 'all':
    print 'Loading data for all workloads.'
    workloads = all_workloads
  else:
    workloads = options.workloads.split(",")
    validate_workloads(all_workloads, workloads)

  print 'Starting data load for the following workloads: ' + ', '.join(workloads)

  loading_time_map = collections.defaultdict(float)
  for workload in workloads:
    start_time = time.time()
    dataset = get_dataset_for_workload(workload)
    print "Dataset for workload '%s' is '%s'" % (workload, dataset)
    dataset_dir = os.path.join(DATASET_DIR, dataset)
    os.chdir(dataset_dir)
    generate_schema_statements(workload)
    exec_hive_query_from_file(os.path.join(dataset_dir,
        'load-%s-%s-generated.sql' % (workload, options.exploration_strategy)))
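
    # Trevni data cannot be loaded through Hive, so the generated Trevni load
    # statements are executed through impala-shell instead.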
    exec_impala_query_from_file(os.path.join(dataset_dir,
        'load-trevni-%s-%s-generated.sql' % (workload, options.exploration_strategy)))
    loading_time_map[workload] = time.time() - start_time

  total_time = 0.0
  for workload, load_time in loading_time_map.iteritems():
    total_time += load_time
    print 'Data loading for workload \'%s\' completed in: %.2fs'\
        % (workload, load_time)
  print 'Total load time: %.2fs\n' % total_time