#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script generates the "CREATE TABLE", "INSERT", and "LOAD" statements for loading
# test data and writes them to create-*-generated.sql and load-*-generated.sql.
#
# The generated statements are based on an input test vector (read from a file) that
# describes the desired coverage. For example, we currently want to run benchmarks with
# different data sets, across different file formats, and with different compression
# algorithms. To improve data loading performance, this script generates an INSERT INTO
# statement to produce the data if the file does not already exist in HDFS. If the file
# already exists in HDFS, we simply issue a LOAD statement, which is much faster.
#
# The input test vectors are generated by generate_test_vectors.py, so ensure that
# script has been run (or that the test vector files already exist) before running
# this script.
#
# Note: This statement generation assumes the following data loading workflow:
# 1) Load all the data into the specified source table
# 2) Create tables for the new file formats and compression types
# 3) Run INSERT OVERWRITE TABLE SELECT * from the source table into the new tables,
#    or LOAD directly if the file already exists in HDFS.
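#
# For illustration only: the exact CREATE/INSERT/LOAD text comes from each dataset's
# schema template file, and the table and database names below (tpch.lineitem) are
# hypothetical, but the generated output for a seq/snap/block table format roughly
# takes this shape:
#
#   DROP TABLE IF EXISTS tpch_seq_snap.lineitem;
#   CREATE DATABASE IF NOT EXISTS tpch_seq_snap;
#   CREATE EXTERNAL TABLE tpch_seq_snap.lineitem ( ... )
#   STORED AS SEQUENCEFILE
#   LOCATION '/test-warehouse/tpch.lineitem_seq_snap';
#
#   SET hive.exec.compress.output=true;
#   SET mapred.output.compression.type=BLOCK;
#   SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec;
#   INSERT OVERWRITE TABLE tpch_seq_snap.lineitem SELECT * FROM tpch.lineitem;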
import collections
import csv
import math
import json
import os
import random
import subprocess
import sys
import tempfile
from itertools import product
from optparse import OptionParser
from tests.util.test_file_parser import *
from tests.common.test_dimensions import *

parser = OptionParser()
parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy",
                  default="core", help="The exploration strategy for schema gen: 'core',"
                  " 'pairwise', or 'exhaustive'")
parser.add_option("--hive_warehouse_dir", dest="hive_warehouse_dir",
                  default="/test-warehouse",
                  help="The HDFS path to the base Hive test warehouse directory")
parser.add_option("-w", "--workload", dest="workload",
                  help="The workload to generate schema for: tpch, hive-benchmark, ...")
parser.add_option("-s", "--scale_factor", dest="scale_factor", default="",
                  help="An optional scale factor to generate the schema for")
parser.add_option("-f", "--force_reload", dest="force_reload", action="store_true",
                  default=False, help='Skips HDFS exists check and reloads all tables')
parser.add_option("-v", "--verbose", dest="verbose", action="store_true",
                  default=False, help="If set, outputs additional logging.")
parser.add_option("-b", "--backend", dest="backend", default="localhost:21000",
                  help="Backend connection to use, default: localhost:21000")
parser.add_option("--table_names", dest="table_names", default=None,
                  help="Only load the specified tables - specified as a comma-separated "
                  "list of base table names")
parser.add_option("--table_formats", dest="table_formats", default=None,
                  help="Override the test vectors and load using the specified table "
                  "formats. Ex. --table_formats=seq/snap/block,text/none")
parser.add_option("--hdfs_namenode", dest="hdfs_namenode", default="localhost:20500",
                  help="HDFS name node for Avro schema URLs, default localhost:20500")
(options, args) = parser.parse_args()

if options.workload is None:
  print "A workload name must be specified."
  parser.print_help()
  sys.exit(1)

WORKLOAD_DIR = os.environ['IMPALA_HOME'] + '/testdata/workloads'
DATASET_DIR = os.environ['IMPALA_HOME'] + '/testdata/datasets'
AVRO_SCHEMA_DIR = "avro_schemas"

COMPRESSION_TYPE = "SET mapred.output.compression.type=%s;"
COMPRESSION_ENABLED = "SET hive.exec.compress.output=%s;"
COMPRESSION_CODEC = "SET mapred.output.compression.codec=%s;"
AVRO_COMPRESSION_CODEC = "SET avro.output.codec=%s;"
SET_DYNAMIC_PARTITION_STATEMENT = "SET hive.exec.dynamic.partition=true;"
SET_PARTITION_MODE_NONSTRICT_STATEMENT = "SET hive.exec.dynamic.partition.mode=nonstrict;"
SET_HIVE_INPUT_FORMAT = "SET mapred.max.split.size=256000000;\n"\
                        "SET hive.input.format=org.apache.hadoop.hive.ql.io.%s;\n"
FILE_FORMAT_IDX = 0
DATASET_IDX = 1
CODEC_IDX = 2
COMPRESSION_TYPE_IDX = 3

COMPRESSION_MAP = {'def': 'org.apache.hadoop.io.compress.DefaultCodec',
                   'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
                   'bzip': 'org.apache.hadoop.io.compress.BZip2Codec',
                   'snap': 'org.apache.hadoop.io.compress.SnappyCodec',
                   'lzo': 'com.hadoop.compression.lzo.LzopCodec',
                   'none': ''
                  }

AVRO_COMPRESSION_MAP = {
  'def': 'deflate',
  'snap': 'snappy',
  'none': '',
  }

FILE_FORMAT_MAP = {
  'text': 'TEXTFILE',
  'seq': 'SEQUENCEFILE',
  'rc': 'RCFILE',
  'parquet': 'PARQUETFILE',
  'text_lzo':
    "\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" +
    "\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'",
  'avro':
    "\nINPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'" +
    "\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'"
  }

HIVE_TO_AVRO_TYPE_MAP = {
  'STRING': 'string',
  'INT': 'int',
  'TINYINT': 'int',
  'SMALLINT': 'int',
  'BIGINT': 'long',
  'BOOLEAN': 'boolean',
  'FLOAT': 'float',
  'DOUBLE': 'double',
  # Avro has no timestamp type, so convert to string.
  # TODO: this allows us to create our Avro test tables, but any tests that use
  # a timestamp column will fail. We probably want to convert back to timestamps
  # in our tests.
  'TIMESTAMP': 'string',
  }

PARQUET_ALTER_STATEMENT = "ALTER TABLE %(table_name)s SET\n\
SERDEPROPERTIES ('blocksize' = '1073741824', 'compression' = '%(compression)s');"

KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive', 'lzo']

def build_create_statement(table_template, table_name, db_name, db_suffix,
                           file_format, compression, hdfs_location):
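  """Returns the SQL for creating one table: a DROP TABLE IF EXISTS statement, a
  CREATE DATABASE IF NOT EXISTS statement, and the CREATE TABLE statement built from
  table_template. For LZO compression the file format key is rewritten to
  '<file_format>_lzo' before the Hive storage clause is looked up in FILE_FORMAT_MAP."""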
  create_statement = 'DROP TABLE IF EXISTS %s%s.%s;\n' % (db_name, db_suffix, table_name)
  create_statement += 'CREATE DATABASE IF NOT EXISTS %s%s;\n' % (db_name, db_suffix)
  if compression == 'lzo':
    file_format = '%s_%s' % (file_format, compression)
  create_statement += table_template.format(db_name=db_name,
                                            db_suffix=db_suffix,
                                            table_name=table_name,
                                            file_format=FILE_FORMAT_MAP[file_format],
                                            hdfs_location=hdfs_location)
  return create_statement

def build_table_template(file_format, columns, partition_columns, row_format,
                         avro_schema_dir):
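  """Builds the CREATE EXTERNAL TABLE statement template for the given file format.

  The returned string still contains {db_name}, {db_suffix}, {table_name},
  {file_format}, and {hdfs_location} placeholders, which are filled in later by
  build_create_statement(). Avro tables get the AvroSerDe ROW FORMAT clause and a
  TBLPROPERTIES clause pointing at the table's Avro schema file in HDFS."""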
  partitioned_by = str()
  if partition_columns:
    partitioned_by = 'PARTITIONED BY (%s)' % ', '.join(partition_columns.split('\n'))

  row_format_stmt = str()
  if row_format:
    row_format_stmt = 'ROW FORMAT ' + row_format

  tblproperties = str()
  if file_format == 'avro':
    tblproperties = "TBLPROPERTIES ('avro.schema.url'=" \
        "'hdfs://%s/%s/%s/{table_name}.json')" \
        % (options.hdfs_namenode, options.hive_warehouse_dir, avro_schema_dir)
    # Override specified row format
    row_format_stmt = "ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'"
  elif file_format == 'parquet':
    row_format_stmt = str()

  # Note: columns are ignored but allowed if a custom serde is specified
  # (e.g. Avro)
  stmt = """
CREATE EXTERNAL TABLE {{db_name}}{{db_suffix}}.{{table_name}} (
{columns})
{partitioned_by}
{row_format}
STORED AS {{file_format}}
LOCATION '{hive_warehouse_dir}/{{hdfs_location}}'
{tblproperties}
""".format(
    row_format=row_format_stmt,
    columns=',\n'.join(columns.split('\n')),
    partitioned_by=partitioned_by,
    hive_warehouse_dir=options.hive_warehouse_dir,
    tblproperties=tblproperties
  ).strip()

  # Remove empty lines from the stmt string. There is an empty line for
  # each of the sections that didn't have anything (e.g. partitioned_by).
  stmt = os.linesep.join([s for s in stmt.splitlines() if s])
  stmt += ';'
  return stmt

def avro_schema(columns):
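  """Constructs an Avro record schema (as a JSON string) for the given COLUMNS
  section. Each line looks like "col_name col_type [COMMENT ...]"; the Hive type is
  mapped through HIVE_TO_AVRO_TYPE_MAP and every field is made nullable."""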
  record = {
    "name": "a",  # doesn't matter
    "type": "record",
    "fields": list()
  }
  for column_spec in columns.strip().split('\n'):
    # column_spec looks something like "col_name col_type COMMENT comment"
    # (the comment may be omitted, and we don't use it)
    name = column_spec.split()[0]
    type = column_spec.split()[1]
    assert type.upper() in HIVE_TO_AVRO_TYPE_MAP, "Cannot convert to Avro type: %s" % type
    record["fields"].append(
      {'name': name,
       'type': [HIVE_TO_AVRO_TYPE_MAP[type.upper()], "null"]})  # all columns nullable
  return json.dumps(record)

def build_compression_codec_statement(codec, compression_type, file_format):
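  """Returns the Hive SET statements that select the output compression codec, or an
  empty string if no codec is required. Avro tables set avro.output.codec; all other
  formats set the mapred output compression type and codec."""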
  codec = AVRO_COMPRESSION_MAP[codec] if file_format == 'avro' else COMPRESSION_MAP[codec]
  if not codec:
    return str()
  return (AVRO_COMPRESSION_CODEC % codec) if file_format == 'avro' else (
    COMPRESSION_TYPE % compression_type.upper() + '\n' + COMPRESSION_CODEC % codec)

def build_codec_enabled_statement(codec):
  compression_enabled = 'false' if codec == 'none' else 'true'
  return COMPRESSION_ENABLED % compression_enabled

def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_format,
                                for_impala=False):
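  """Fills in the INSERT statement template for the given table. When the statement
  will be executed by Hive (for_impala=False), it is prefixed with the dynamic
  partitioning SET statements and an appropriate hive.input.format setting."""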
  insert_statement = insert.format(db_name=db_name,
                                   db_suffix=db_suffix,
                                   table_name=table_name)
  if for_impala:
    return insert_statement

  statement = SET_PARTITION_MODE_NONSTRICT_STATEMENT + "\n"
  statement += SET_DYNAMIC_PARTITION_STATEMENT + "\n"
  # For some reason (Hive bug?) we need to have CombineHiveInputFormat set for cases
  # where we are compressing in bzip or lzo on certain tables that have multiple files.
  if 'multi' in table_name and ('bzip' in db_suffix or 'lzo' in db_suffix):
    statement += SET_HIVE_INPUT_FORMAT % "CombineHiveInputFormat"
  else:
    statement += SET_HIVE_INPUT_FORMAT % "HiveInputFormat"
  return statement + insert_statement

def build_insert(insert, db_name, db_suffix, file_format,
                 codec, compression_type, table_name):
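  """Returns the full Hive load script for one table: the compression SET statements
  followed by the INSERT INTO statement."""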
  output = build_codec_enabled_statement(codec) + "\n"
  output += build_compression_codec_statement(codec, compression_type, file_format) + "\n"
  output += build_insert_into_statement(insert, db_name, db_suffix,
                                        table_name, file_format) + "\n"
  return output

def build_load_statement(load_template, db_name, db_suffix, table_name):
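  """Fills in a LOAD statement template. HBase tables do not need the HDFS path, so
  IMPALA_HOME is only substituted for non-HBase tables."""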
  # HBase does not need the HDFS path.
  if table_name.startswith('hbase'):
    load_template = load_template.format(table_name=table_name,
                                         db_name=db_name,
                                         db_suffix=db_suffix)
  else:
    load_template = load_template.format(table_name=table_name,
                                         db_name=db_name,
                                         db_suffix=db_suffix,
                                         impala_home=os.environ['IMPALA_HOME'])
  return load_template

def build_db_suffix(file_format, codec, compression_type):
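  """Returns the database name suffix for a table format, e.g. '_seq_record_gzip'.
  Uncompressed text tables use no suffix; unsupported text/codec combinations cause
  the script to exit."""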
  if file_format == 'text' and codec != 'none' and codec != 'lzo':
    print 'Unsupported combination of file_format (text) and compression codec.'
    sys.exit(1)
  elif file_format == 'text' and codec == 'none':
    return ''
  elif codec == 'none':
    return '_%s' % (file_format)
  elif compression_type == 'record':
    return '_%s_record_%s' % (file_format, codec)
  else:
    return '_%s_%s' % (file_format, codec)

# Does an HDFS directory listing and returns the names of all subdirectories that
# contain data.
def get_hdfs_subdirs_with_data(path):
  tmp_file = tempfile.TemporaryFile("w+")
  cmd = "hadoop fs -du %s | grep -v '^0' | awk '{print $2}'" % path
  subprocess.call([cmd], shell=True, stderr=open('/dev/null', 'w'), stdout=tmp_file)
  tmp_file.seek(0)

  # Results look like:
  # <acls> - <user> <group> <date> /directory/subdirectory
  # So to get the subdirectory names, just return everything after the last '/'.
  return [line[line.rfind('/') + 1:].strip() for line in tmp_file.readlines()]

class Statements(object):
  """Simple container object for storing SQL statements to be output to a
  file. Useful for ordering the statements correctly."""
  def __init__(self):
    self.create = list()
    self.load = list()
    self.load_base = list()

  def write_to_file(self, filename):
    # Make sure we create the base tables first. It is important that we always write
    # to the output file, even if there are no statements to generate. This makes sure
    # the output file is empty and the user doesn't unexpectedly load some stale data.
    output = self.create + self.load_base + self.load
    with open(filename, 'w') as f:
      f.write('\n\n'.join(output))

def generate_statements(output_name, test_vectors, sections,
                        schema_include_constraints, schema_exclude_constraints):
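  """Generates the CREATE/INSERT/LOAD statements for every combination of test vector
  and schema template section, and writes them to the load-*-generated.sql files.
  Avro and Parquet statements are written to separate files because Avro tables must
  be loaded in a separate Hive invocation and Parquet tables are loaded by Impala."""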
  # The Avro SerDe causes strange problems with other unrelated tables (e.g.,
  # Avro files will be written to LZO-compressed text tables). We generate
  # separate schema statement files for Avro tables so we can invoke Hive
  # completely separately for them.
  # See https://issues.apache.org/jira/browse/HIVE-4195.
  avro_output = Statements()
  # Parquet statements to be executed separately by Impala
  parquet_output = Statements()
  default_output = Statements()

  table_names = None
  if options.table_names:
    table_names = [name.lower() for name in options.table_names.split(',')]
  existing_tables = get_hdfs_subdirs_with_data(options.hive_warehouse_dir)
  for row in test_vectors:
    file_format, data_set, codec, compression_type =\
        [row.file_format, row.dataset, row.compression_codec, row.compression_type]
    table_format = '%s/%s/%s' % (file_format, codec, compression_type)
    output = default_output
    if file_format == 'avro':
      output = avro_output
    elif file_format == 'parquet':
      output = parquet_output

    for section in sections:
      alter = section.get('ALTER')
      create = section['CREATE']
      insert = section['DEPENDENT_LOAD']
      load_local = section['LOAD']
      base_table_name = section['BASE_TABLE_NAME']
      columns = section['COLUMNS']
      partition_columns = section['PARTITION_COLUMNS']
      row_format = section['ROW_FORMAT']
      table_name = base_table_name
      db_suffix = build_db_suffix(file_format, codec, compression_type)
      db_name = '{0}{1}'.format(data_set, options.scale_factor)
      hdfs_location = '{0}.{1}{2}'.format(db_name, table_name, db_suffix)
      # HDFS file names for the hive-benchmark and functional datasets are stored
      # directly under /test-warehouse.
      # TODO: We should not need to specify the HDFS file path in the schema file.
      # This needs to be done programmatically.
      if data_set in ['hive-benchmark', 'functional']:
        hdfs_location = hdfs_location.split('.')[-1]
      # Hive does not allow hyphenated table names.
      if data_set == 'hive-benchmark':
        db_name = '{0}{1}'.format('hivebenchmark', options.scale_factor)
      db = '{0}{1}'.format(db_name, db_suffix)
      data_path = os.path.join(options.hive_warehouse_dir, hdfs_location)

      if table_names and (table_name.lower() not in table_names):
        print 'Skipping table: %s.%s' % (db, table_name)
        continue

      if schema_include_constraints[table_name.lower()] and \
          table_format not in schema_include_constraints[table_name.lower()]:
        print 'Skipping \'%s.%s\' due to include constraint match' % (db, table_name)
        continue

      if schema_exclude_constraints[base_table_name.lower()] and\
          table_format in schema_exclude_constraints[base_table_name.lower()]:
        print 'Skipping \'%s.%s\' due to exclude constraint match' % (db, table_name)
        continue

      # If a CREATE section is provided, use that. Otherwise a COLUMNS section
      # must be provided (and optionally PARTITION_COLUMNS and ROW_FORMAT
      # sections), which is used to generate the create table statement.
      if create:
        table_template = create
        if file_format == 'avro':
          # We don't know how to generalize CREATE sections to Avro.
          print "CREATE section not supported with Avro, skipping: '%s'" % table_name
          continue
      else:
        assert columns, "No CREATE or COLUMNS section defined for table " + table_name
        avro_schema_dir = "%s/%s" % (AVRO_SCHEMA_DIR, data_set)
        table_template = build_table_template(
            file_format, columns, partition_columns, row_format, avro_schema_dir)
        # Write the Avro schema to a local file.
        if not os.path.exists(avro_schema_dir):
          os.makedirs(avro_schema_dir)
        with open("%s/%s.json" % (avro_schema_dir, table_name), "w") as f:
          f.write(avro_schema(columns))

      output.create.append(
          build_create_statement(table_template, table_name, db_name, db_suffix,
                                 file_format, codec, hdfs_location))

      # The ALTER statement in Hive does not accept fully qualified table names,
      # so we need a USE statement.
      if alter:
        use_table = 'USE {db_name};\n'.format(db_name=db)
        output.create.append(use_table + alter.format(table_name=table_name))

      # If the directory already exists in HDFS, assume that data files already exist
      # and skip loading the data. Otherwise, the data is generated using either an
      # INSERT INTO statement or a LOAD statement.
      if not options.force_reload and hdfs_location in existing_tables:
        print 'HDFS path:', data_path, 'contains data. Data loading can be skipped.'
      else:
        print 'HDFS path:', data_path, 'does not exist or is empty. Data will be loaded.'
        if not db_suffix:
          if load_local:
            output.load_base.append(build_load_statement(load_local, db_name,
                                                         db_suffix, table_name))
          else:
            print 'Empty base table load for %s. Skipping load generation' % table_name
        elif file_format == 'parquet':
          if insert:
            # In most cases the same load logic can be used for the parquet and
            # non-parquet case, but sometimes it needs to be special-cased.
            insert = insert if 'LOAD_PARQUET' not in section else section['LOAD_PARQUET']
            parquet_output.load.append(build_insert_into_statement(
                insert, db_name, db_suffix, table_name, 'parquet', for_impala=True))
          else:
            print \
                'Empty parquet load for table %s. Skipping insert generation' % table_name
        else:
          if insert:
            output.load.append(build_insert(insert, db_name, db_suffix, file_format,
                                            codec, compression_type, table_name))
          else:
            print 'Empty insert for table %s. Skipping insert generation' % table_name

  avro_output.write_to_file('load-' + output_name + '-avro-generated.sql')
  parquet_output.write_to_file('load-' + output_name + '-parquet-generated.sql')
  default_output.write_to_file('load-' + output_name + '-generated.sql')

def parse_schema_template_file(file_name):
  VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
                         'ROW_FORMAT', 'CREATE', 'DEPENDENT_LOAD', 'LOAD', 'ALTER',
                         'LOAD_PARQUET']
  return parse_test_file(file_name, VALID_SECTION_NAMES, skip_unknown_sections=False)

if __name__ == "__main__":
|
|
if options.table_formats is None:
|
|
if options.exploration_strategy not in KNOWN_EXPLORATION_STRATEGIES:
|
|
print 'Invalid exploration strategy:', options.exploration_strategy
|
|
print 'Valid values:', ', '.join(KNOWN_EXPLORATION_STRATEGIES)
|
|
sys.exit(1)
|
|
|
|
test_vectors = [vector.value for vector in\
|
|
load_table_info_dimension(options.workload, options.exploration_strategy)]
|
|
else:
|
|
table_formats = options.table_formats.split(',')
|
|
dataset = get_dataset_from_workload(options.workload)
|
|
test_vectors =\
|
|
[TableFormatInfo.create_from_string(dataset, tf) for tf in table_formats]
|
|
|
|
target_dataset = test_vectors[0].dataset
|
|
print 'Target Dataset: ' + target_dataset
|
|
schema_template_file = os.path.join(DATASET_DIR, target_dataset,
|
|
'%s_schema_template.sql' % target_dataset)
|
|
|
|
if not os.path.isfile(schema_template_file):
|
|
print 'Schema file not found: ' + schema_template_file
|
|
sys.exit(1)
|
|
|
|
constraints_file = os.path.join(DATASET_DIR, target_dataset, 'schema_constraints.csv')
|
|
include_constraints, exclude_constraints = parse_table_constraints(constraints_file)
|
|
sections = parse_schema_template_file(schema_template_file)
|
|
generate_statements('%s-%s' % (options.workload, options.exploration_strategy),
|
|
test_vectors, sections, include_constraints, exclude_constraints)
|