Mirror of https://github.com/apache/impala.git (synced 2025-12-19 09:58:28 -05:00)
To remove the dependency on Python 2, existing scripts need to use python3
rather than python. These commands find those locations (for impala-python
and regular python):

  git grep impala-python | grep -v impala-python3 | grep -v impala-python-common | grep -v init-impala-python
  git grep bin/python | grep -v python3

This removes or switches most of these locations by various means:
1. If a python file has a #!/bin/env impala-python (or python) but doesn't
   have a main function, it removes the hash-bang and makes sure that the
   file is not executable.
2. Most scripts can simply switch from impala-python to impala-python3
   (or python to python3) with minimal changes.
3. The cm-api pypi package (which doesn't support Python 3) has been
   replaced by the cm-client pypi package and interfaces have changed.
   Rather than migrating the code (which hasn't been used in years), this
   deletes the old code and stops installing cm-api into the virtualenv.
   The code can be restored and revamped if there is any interest in
   interacting with CM clusters.
4. This switches tests/comparison over to impala-python3, but this code has
   bit-rotted. Some pieces can be run manually, but it can't be fully
   verified with Python 3. It shouldn't hold back the migration on its own.
5. This also replaces locations of impala-python in comments /
   documentation / READMEs.
6. kazoo (used for interacting with HBase) needed to be upgraded to a
   version that supports Python 3. The newest version of kazoo requires
   upgrades of other component versions, so this uses kazoo 2.8.0 to avoid
   needing other upgrades.

The two remaining uses of impala-python are:
 - bin/cmake_aux/create_virtualenv.sh
 - bin/impala-env-versioned-python
These will be removed separately when we drop Python 2 support completely.
In particular, these are useful for testing impala-shell with Python 2
until we stop supporting Python 2 for impala-shell.

The docker-based tests still use /usr/bin/python, but this can be switched
over independently (and doesn't impact impala-python).

Testing:
 - Ran core job
 - Ran build + dataload on Centos 7, Redhat 8
 - Manual testing of individual scripts (except some bitrotted areas like
   the random query generator)

Change-Id: If209b761290bc7e7c716c312ea757da3e3bca6dc
Reviewed-on: http://gerrit.cloudera.org:8080/23468
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Tested-by: Michael Smith <michael.smith@cloudera.com>
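The discovery and shebang-cleanup steps described in the commit message are mechanical enough to script. The following is a minimal, hypothetical sketch (not part of this change, and not a tool in the Impala repository): it walks a source tree, flags Python files whose shebang still points at impala-python or plain python, and reports whether each defines a main entry point and is executable. The regexes, output format, and helper name are illustrative only.

#!/usr/bin/env python3
# Hypothetical helper: report Python files whose shebang still references
# impala-python or python, along with whether they have a main entry point
# and the executable bit set. Files without a main entry point would be
# candidates for dropping the shebang and clearing the executable bit.
import os
import re
import stat
import sys

SHEBANG_RE = re.compile(r'^#!.*\b(impala-python|python)(\s|$)')
MAIN_RE = re.compile(r'__name__\s*==\s*["\']__main__["\']|^def main\(', re.MULTILINE)


def scan(root):
  for dirpath, _, filenames in os.walk(root):
    for name in filenames:
      if not name.endswith('.py'):
        continue
      path = os.path.join(dirpath, name)
      with open(path, 'r', errors='replace') as f:
        text = f.read()
      first_line = text.split('\n', 1)[0]
      if not SHEBANG_RE.match(first_line):
        continue
      has_main = bool(MAIN_RE.search(text))
      executable = bool(os.stat(path).st_mode & stat.S_IXUSR)
      print("{0}: main={1} executable={2}".format(path, has_main, executable))


if __name__ == "__main__":
  scan(sys.argv[1] if len(sys.argv) > 1 else '.')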
513 lines
23 KiB
Python
Executable File
#!/usr/bin/env impala-python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# This script is used to load the proper datasets for the specified workloads. It loads
# all data via Hive except for parquet data which needs to be loaded via Impala.
# Most ddl commands are executed by Impala.
from __future__ import absolute_import, division, print_function
import collections
import getpass
import logging
import os
import re
import sqlparse
import subprocess
import sys
import time
import traceback

from optparse import OptionParser
from tests.common.impala_connection import ImpylaHS2Connection
from tests.common.test_vector import HS2
from multiprocessing.pool import ThreadPool

LOG = logging.getLogger('load-data.py')

parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy",
                  default="core",
                  help="The exploration strategy for schema gen: 'core', "\
                  "'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
                  default="/test-warehouse",
                  help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workloads", dest="workloads",
                  help="Comma-separated list of workloads to load data for. If 'all' is "\
                  "specified then data for all workloads is loaded.")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
                  help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
                  default=False, help='Skips HDFS exists check and reloads all tables')
parser.add_option("--impalad", dest="impalad", default="localhost",
                  help="Impala daemon to connect to")
parser.add_option("--hive_hs2_hostport", dest="hive_hs2_hostport",
                  default="localhost:11050",
                  help="HS2 host:port to issue Hive queries against using beeline")
parser.add_option("--table_names", dest="table_names", default=None,
                  help="Only load the specified tables - specified as a comma-separated "\
                  "list of base table names")
parser.add_option("--table_formats", dest="table_formats", default=None,
                  help="Override the test vectors and load using the specified table "\
                  "formats. Ex. --table_formats=seq/snap/block,text/none")
parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500",
                  help="HDFS name node for Avro schema URLs, default localhost:20500")
parser.add_option("--workload_dir", dest="workload_dir",
                  default=os.environ['IMPALA_WORKLOAD_DIR'],
                  help="Directory that contains Impala workloads")
parser.add_option("--dataset_dir", dest="dataset_dir",
                  default=os.environ['IMPALA_DATASET_DIR'],
                  help="Directory that contains Impala datasets")
parser.add_option("--use_kerberos", action="store_true", default=False,
                  help="Load data on a kerberized cluster.")
parser.add_option("--principal", default=None, dest="principal",
                  help="Kerberos service principal, required if --use_kerberos is set")
parser.add_option("--num_processes", type="int", dest="num_processes",
                  default=os.environ['IMPALA_BUILD_THREADS'],
                  help="Number of parallel processes to use.")

options, args = parser.parse_args()

SQL_OUTPUT_DIR = os.environ['IMPALA_DATA_LOADING_SQL_DIR']
WORKLOAD_DIR = options.workload_dir
DATASET_DIR = options.dataset_dir
TESTDATA_BIN_DIR = os.path.join(os.environ['IMPALA_HOME'], 'testdata/bin')
AVRO_SCHEMA_DIR = "avro_schemas"

GENERATE_SCHEMA_CMD = "generate-schema-statements.py --exploration_strategy=%s "\
                      "--workload=%s --scale_factor=%s --verbose"
# Load data using Hive's beeline because the Hive shell has regressed (HIVE-5515).
# The Hive shell is stateful, meaning that certain series of actions lead to problems.
# Examples of problems due to the statefulness of the Hive shell:
# - Creating an HBase table changes the replication factor to 1 for subsequent LOADs.
# - INSERTs into an HBase table fail if they are the first stmt executed in a session.
# However, beeline itself also has bugs. For example, inserting a NULL literal into
# a string-typed column leads to an NPE. We work around these problems by using LOAD from
# a datafile instead of doing INSERTs.
HIVE_CMD = os.path.join(os.environ['HIVE_HOME'], 'bin/beeline')

hive_auth = "auth=none"
if options.use_kerberos:
  if not options.principal:
    print("--principal is required when --use_kerberos is specified")
    exit(1)
  hive_auth = "principal=" + options.principal

HIVE_ARGS = '-n %s -u "jdbc:hive2://%s/default;%s" --verbose=true'\
            % (getpass.getuser(), options.hive_hs2_hostport, hive_auth)

HADOOP_CMD = os.path.join(os.environ['HADOOP_HOME'], 'bin/hadoop')

HS2_HOST_PORT = "{}:{}".format(options.impalad, 21050)

def available_workloads(workload_dir):
  return [subdir for subdir in os.listdir(workload_dir)
          if os.path.isdir(os.path.join(workload_dir, subdir))]


def validate_workloads(all_workloads, workloads):
  for workload in workloads:
    if workload not in all_workloads:
      LOG.error('Workload \'%s\' not found in workload directory' % workload)
      LOG.error('Available workloads: ' + ', '.join(all_workloads))
      sys.exit(1)


def exec_cmd(cmd, error_msg=None, exit_on_error=True, out_file=None):
  """Run the given command in the shell returning whether the command
  succeeded. If 'error_msg' is set, log the error message on failure.
  If 'exit_on_error' is True, exit the program on failure.
  If 'out_file' is specified, log all output to that file."""
  success = True
  if out_file:
    with open(out_file, 'w') as f:
      ret_val = subprocess.call(cmd, shell=True, stderr=f, stdout=f)
  else:
    ret_val = subprocess.call(cmd, shell=True)
  if ret_val != 0:
    if error_msg: LOG.info(error_msg)
    if exit_on_error: sys.exit(ret_val)
    success = False
  return success

def exec_hive_query_from_file_beeline(file_name):
  if not os.path.exists(file_name):
    LOG.info("Error: File {0} not found".format(file_name))
    return False

  LOG.info("Beginning execution of hive SQL: {0}".format(file_name))

  output_file = file_name + ".log"
  hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, HIVE_ARGS, file_name)
  is_success = exec_cmd(hive_cmd, exit_on_error=False, out_file=output_file)

  if is_success:
    LOG.info("Finished execution of hive SQL: {0}".format(file_name))
  else:
    LOG.info("Error executing hive SQL: {0} See: {1}".format(file_name, \
        output_file))

  return is_success


def exec_hbase_query_from_file(file_name, step_name):
  if not os.path.exists(file_name): return
  LOG.info('Begin step "%s".' % step_name)
  start_time = time.time()
  hbase_cmd = "hbase shell %s" % file_name
  LOG.info('Executing HBase Command: %s' % hbase_cmd)
  exec_cmd(hbase_cmd, error_msg='Error executing hbase create commands')
  total_time = time.time() - start_time
  LOG.info('End step "%s". Total time: %.2fs\n' % (step_name, total_time))


# KERBEROS TODO: fails when kerberized and impalad principal isn't "impala"
def exec_impala_query_from_file(file_name):
  """Execute each query in an Impala query file individually"""
  if not os.path.exists(file_name):
    LOG.info("Error: File {0} not found".format(file_name))
    return False

  LOG.info("Beginning execution of impala SQL on {0}: {1}".format(
      options.impalad, file_name))
  is_success = True
  impala_client = ImpylaHS2Connection(HS2_HOST_PORT,
                                      use_kerberos=options.use_kerberos)
  output_file = file_name + ".log"
  query = None
  with open(output_file, 'w') as out_file:
    try:
      impala_client.connect()
      with open(file_name, 'r+') as query_file:
        queries = sqlparse.split(query_file.read())
        for query in queries:
          query = sqlparse.format(query.rstrip(';'), strip_comments=True)
          if query.strip() != "":
            result = impala_client.execute(query)
            out_file.write("{0}\n{1}\n".format(query, result))
    except Exception as e:
      if query:
        out_file.write("ERROR: {0}\n".format(query))
      else:
        out_file.write("Encountered errors before parsing any queries.\n")
      traceback.print_exc(file=out_file)
      is_success = False

  if is_success:
    LOG.info("Finished execution of impala SQL: {0}".format(file_name))
  else:
    LOG.info("Error executing impala SQL: {0} See: {1}".format(file_name, \
        output_file))

  return is_success

def run_dataset_preload(dataset):
  """Execute a preload script if present in dataset directory. E.g. to generate data
  before loading"""
  dataset_preload_script = os.path.join(DATASET_DIR, dataset, "preload")
  if os.path.exists(dataset_preload_script):
    LOG.info("Running preload script for " + dataset)
    if options.scale_factor != "" and int(options.scale_factor) > 1:
      dataset_preload_script += " " + str(options.scale_factor)
    exec_cmd(dataset_preload_script,
             error_msg="Error executing preload script for " + dataset,
             exit_on_error=True)

def generate_schema_statements(workload):
  generate_cmd = GENERATE_SCHEMA_CMD % (options.exploration_strategy, workload,
                                        options.scale_factor)
  if options.table_names:
    generate_cmd += " --table_names=%s" % options.table_names
  if options.force_reload:
    generate_cmd += " --force_reload"
  if options.table_formats:
    generate_cmd += " --table_formats=%s" % options.table_formats
  if options.hive_warehouse_dir is not None:
    generate_cmd += " --hive_warehouse_dir=%s" % options.hive_warehouse_dir
  if options.hdfs_namenode is not None:
    generate_cmd += " --hdfs_namenode=%s" % options.hdfs_namenode
  generate_cmd += " --backend=%s" % HS2_HOST_PORT
  LOG.info('Executing Generate Schema Command: ' + generate_cmd)
  schema_cmd = os.path.join(TESTDATA_BIN_DIR, generate_cmd)
  error_msg = 'Error generating schema statements for workload: ' + workload
  exec_cmd(schema_cmd, error_msg=error_msg)

def get_dataset_for_workload(workload):
  dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
                                     '%s_dimensions.csv' % workload)
  if not os.path.isfile(dimension_file_name):
    LOG.error('Dimension file not found: ' + dimension_file_name)
    sys.exit(1)
  with open(dimension_file_name, 'r') as input_file:
    match = re.search(r'dataset:\s*([\w\-\.]+)', input_file.read())
    if match:
      return match.group(1)
    else:
      LOG.error('Dimension file does not contain dataset for workload \'%s\'' % (workload))
      sys.exit(1)

def copy_avro_schemas_to_hdfs(schemas_dir):
  """Recursively copies all of schemas_dir to the test warehouse."""
  if not os.path.exists(schemas_dir):
    LOG.info('Avro schema dir (%s) does not exist. Skipping copy to HDFS.' % schemas_dir)
    return

  exec_hadoop_fs_cmd("-mkdir -p " + options.hive_warehouse_dir)
  exec_hadoop_fs_cmd("-put -f %s %s/" % (schemas_dir, options.hive_warehouse_dir))


def exec_hadoop_fs_cmd(args, exit_on_error=True):
  cmd = "%s fs %s" % (HADOOP_CMD, args)
  LOG.info("Executing Hadoop command: " + cmd)
  exec_cmd(cmd, error_msg="Error executing Hadoop command, exiting",
           exit_on_error=exit_on_error)


def exec_query_files_parallel(thread_pool, query_files, execution_type, step_name):
  """Executes the query files provided using the execution engine specified
  in parallel using the given thread pool. Aborts immediately if any execution
  encounters an error."""
  assert(execution_type == 'impala' or execution_type == 'hive')
  if len(query_files) == 0: return
  if execution_type == 'impala':
    execution_function = exec_impala_query_from_file
  elif execution_type == 'hive':
    execution_function = exec_hive_query_from_file_beeline

  LOG.info('Begin step "%s".' % step_name)
  start_time = time.time()
  for result in thread_pool.imap_unordered(execution_function, query_files):
    if not result:
      thread_pool.terminate()
      sys.exit(1)
  total_time = time.time() - start_time
  LOG.info('End step "%s". Total time: %.2fs\n' % (step_name, total_time))


def impala_exec_query_files_parallel(thread_pool, query_files, step_name):
  exec_query_files_parallel(thread_pool, query_files, 'impala', step_name)


def hive_exec_query_files_parallel(thread_pool, query_files, step_name):
  exec_query_files_parallel(thread_pool, query_files, 'hive', step_name)


def main():
  logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
  LOG.setLevel(logging.DEBUG)

  # Having the actual command line at the top of each data-load-* log can help
  # when debugging dataload issues.
  #
  LOG.debug(' '.join(sys.argv))

  all_workloads = available_workloads(WORKLOAD_DIR)
  workloads = []
  if options.workloads is None:
    LOG.error("At least one workload name must be specified.")
    parser.print_help()
    sys.exit(1)
  elif options.workloads == 'all':
    LOG.info('Loading data for all workloads.')
    workloads = all_workloads
  else:
    workloads = options.workloads.split(",")
    validate_workloads(all_workloads, workloads)

  LOG.info('Starting data load for the following workloads: ' + ', '.join(workloads))
  LOG.info('Running with {0} threads'.format(options.num_processes))

  # Note: The processes are in whatever the caller's directory is, so all paths
  # passed to the pool need to be absolute paths. This will allow the pool
  # to be used for different workloads (and thus different directories)
  # simultaneously.
  thread_pool = ThreadPool(processes=options.num_processes)
  loading_time_map = collections.defaultdict(float)
  for workload in workloads:
    start_time = time.time()
    dataset = get_dataset_for_workload(workload)
    run_dataset_preload(dataset)
    # This script is tightly coupled with testdata/bin/generate-schema-statements.py
    # Specifically, this script is expecting the following:
    # 1. generate-schema-statements.py generates files and puts them in the
    #    directory ${IMPALA_DATA_LOADING_SQL_DIR}/${workload}
    #    (e.g. ${IMPALA_HOME}/logs/data_loading/sql/tpch)
    # 2. generate-schema-statements.py populates the subdirectory
    #    avro_schemas/${workload} with JSON files specifying the Avro schema for the
    #    tables being loaded.
    # 3. generate-schema-statements.py uses a particular naming scheme to distinguish
    #    between SQL files of different load phases.
    #
    #    Using the following variables:
    #    workload_exploration = ${workload}-${exploration_strategy} and
    #    file_format_suffix = ${file_format}-${codec}-${compression_type}
    #
    #    A. Impala table creation scripts run in Impala to create tables, partitions,
    #       and views. There is one for each file format. They take the form:
    #       create-${workload_exploration}-impala-generated-${file_format_suffix}.sql
    #
    #    B. Hive creation/load scripts run in Hive to load data into tables and create
    #       tables or views that Impala does not support. There is one for each
    #       file format. They take the form:
    #       load-${workload_exploration}-hive-generated-${file_format_suffix}.sql
    #
    #    C. HBase creation script runs through the hbase commandline to create
    #       HBase tables. (Only generated if loading HBase table.) It takes the form:
    #       load-${workload_exploration}-hbase-generated.create
    #
    #    D. HBase postload script runs through the hbase commandline to flush
    #       HBase tables. (Only generated if loading HBase table.) It takes the form:
    #       post-load-${workload_exploration}-hbase-generated.sql
    #
    #    E. Impala load scripts run in Impala to load data. Only Parquet and Kudu
    #       are loaded through Impala. There is one for each of those formats loaded.
    #       They take the form:
    #       load-${workload_exploration}-impala-generated-${file_format_suffix}.sql
    #
    #    F. Invalidation script runs through Impala to invalidate/refresh metadata
    #       for tables. It takes the form:
    #       invalidate-${workload_exploration}-impala-generated.sql
    generate_schema_statements(workload)

    # Determine the directory from #1
    sql_dir = os.path.join(SQL_OUTPUT_DIR, dataset)
    assert os.path.isdir(sql_dir),\
        ("Could not find the generated SQL files for loading dataset '%s'.\
        \nExpected to find the SQL files in: %s" % (dataset, sql_dir))

    # Copy the avro schemas (see #2) into HDFS
    avro_schemas_path = os.path.join(sql_dir, AVRO_SCHEMA_DIR)
    copy_avro_schemas_to_hdfs(avro_schemas_path)

    # List all of the files in the sql directory to sort out the various types of
    # files (see #3).
    dataset_dir_contents = [os.path.join(sql_dir, f) for f in os.listdir(sql_dir)]
    workload_exploration = "%s-%s" % (workload, options.exploration_strategy)

    # Remove the AVRO_SCHEMA_DIR from the list of files
    if os.path.exists(avro_schemas_path):
      dataset_dir_contents.remove(avro_schemas_path)

    # Match for Impala create files (3.A)
    impala_create_match = 'create-%s-impala-generated' % workload_exploration
    # Match for Hive create/load files (3.B)
    hive_load_match = 'load-%s-hive-generated' % workload_exploration
    # Match for HBase creation script (3.C)
    hbase_create_match = 'load-%s-hbase-generated.create' % workload_exploration
    # Match for HBase post-load script (3.D)
    hbase_postload_match = 'post-load-%s-hbase-generated.sql' % workload_exploration
    # Match for Impala load scripts (3.E)
    impala_load_match = 'load-%s-impala-generated' % workload_exploration
    # Match for Impala invalidate script (3.F)
    invalidate_match = 'invalidate-%s-impala-generated' % workload_exploration

    impala_create_files = []
    hive_load_text_files = []
    hive_load_orc_files = []
    hive_load_nontext_files = []
    hbase_create_files = []
    hbase_postload_files = []
    impala_load_files = []
    invalidate_files = []
    for filename in dataset_dir_contents:
      if impala_create_match in filename:
        impala_create_files.append(filename)
      elif hive_load_match in filename:
        if 'text-none-none' in filename:
          hive_load_text_files.append(filename)
        elif 'orc-def-block' in filename:
          hive_load_orc_files.append(filename)
        else:
          hive_load_nontext_files.append(filename)
      elif hbase_create_match in filename:
        hbase_create_files.append(filename)
      elif hbase_postload_match in filename:
        hbase_postload_files.append(filename)
      elif impala_load_match in filename:
        impala_load_files.append(filename)
      elif invalidate_match in filename:
        invalidate_files.append(filename)
      else:
        assert False, "Unexpected input file {0}".format(filename)

    # Simple helper function to dump a header followed by the filenames
    def log_file_list(header, file_list):
      if (len(file_list) == 0): return
      LOG.debug(header)
      list(map(LOG.debug, list(map(os.path.basename, file_list))))
      LOG.debug("\n")

    log_file_list("Impala Create Files:", impala_create_files)
    log_file_list("Hive Load Text Files:", hive_load_text_files)
    log_file_list("Hive Load Orc Files:", hive_load_orc_files)
    log_file_list("Hive Load Non-Text Files:", hive_load_nontext_files)
    log_file_list("HBase Create Files:", hbase_create_files)
    log_file_list("HBase Post-Load Files:", hbase_postload_files)
    log_file_list("Impala Load Files:", impala_load_files)
    log_file_list("Impala Invalidate Files:", invalidate_files)

    # Execute the data loading scripts.
    # Creating tables in Impala has no dependencies, so we execute them first.
    # HBase table inserts are done via Hive, so the HBase tables need to be created
    # before running the Hive scripts. Some of the Impala inserts depend on Hive
    # tables, so they're done at the end. Finally, the HBase tables that have been
    # filled with data need to be flushed.

    impala_exec_query_files_parallel(thread_pool, impala_create_files, "Impala Create")

    # There should be at most one hbase creation script
    assert(len(hbase_create_files) <= 1)
    for hbase_create in hbase_create_files:
      exec_hbase_query_from_file(hbase_create, "HBase Create")

    # If this is loading text tables plus multiple other formats, the text tables
    # need to be loaded first
    assert(len(hive_load_text_files) <= 1)
    hive_exec_query_files_parallel(thread_pool, hive_load_text_files, "Hive Load Text")
    # IMPALA-9923: Run ORC serially separately from other non-text formats. This hacks
    # around flakiness seen when loading this in parallel (see IMPALA-12630 comments for
    # broken tests). This should be removed as soon as possible.
    assert(len(hive_load_orc_files) <= 1)
    hive_exec_query_files_parallel(thread_pool, hive_load_orc_files, "Hive Load ORC")

    # Load all non-text formats (goes parallel)
    hive_exec_query_files_parallel(thread_pool, hive_load_nontext_files,
                                   "Hive Load Non-Text")

    assert(len(hbase_postload_files) <= 1)
    for hbase_postload in hbase_postload_files:
      exec_hbase_query_from_file(hbase_postload, "HBase Post-Load")

    # Invalidate so that Impala sees the loads done by Hive before loading Parquet/Kudu
    # Note: This only invalidates tables for this workload.
    assert(len(invalidate_files) <= 1)
    if impala_load_files:
      impala_exec_query_files_parallel(thread_pool, invalidate_files,
                                       "Impala Invalidate 1")
      impala_exec_query_files_parallel(thread_pool, impala_load_files, "Impala Load")
    # Final invalidate for this workload
    impala_exec_query_files_parallel(thread_pool, invalidate_files, "Impala Invalidate 2")
    loading_time_map[workload] = time.time() - start_time

  total_time = 0.0
  thread_pool.close()
  thread_pool.join()
  for workload, load_time in loading_time_map.items():
    total_time += load_time
    LOG.info('Data loading for workload \'%s\' completed in: %.2fs'\
             % (workload, load_time))
  LOG.info('Total load time: %.2fs\n' % total_time)

if __name__ == "__main__": main()