impala/testdata/common/widetable.py
David Knupp d1c9510001 Revert "IMPALA-6068: Fix dataload for complextypes_fileformat"
This reverts commit e4f585240a.

Among other things, that commit replaced hdfs command line calls
with "LOAD DATA LOCAL INPATH" using Hive. However, doing so
presumes that the minicluster is the only test environment.
Sometimes, though, the data load script is run against a remote
cluster, and in those cases the data load process is now broken.

Change-Id: I6dc419934d2953eb950b14d090d7895ec57aa9f2
Reviewed-on: http://gerrit.cloudera.org:8080/8653
Reviewed-by: Philip Zeyliger <philip@cloudera.com>
Reviewed-by: Zach Amsden <zamsden@cloudera.com>
Tested-by: Impala Public Jenkins
2017-11-28 02:57:04 +00:00

136 lines · 4.5 KiB · Python · Executable File

#!/usr/bin/env impala-python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Functions for creating wide (i.e. many-column) tables. When run from the command line,
# specify either --get_columns to generate column descriptors, or --create_data to
# generate a CSV data file.
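#
# Example invocations (flag values are illustrative; the flags match the optparse
# options defined below):
#   impala-python widetable.py --get_columns --num_columns=10
#   impala-python widetable.py --create_data --num_columns=10 --num_rows=100 \
#       --output_file=/tmp/widetable_data.csv
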
from datetime import datetime, timedelta
import itertools
import optparse

parser = optparse.OptionParser()
parser.add_option("--get_columns", dest="get_columns", default=False, action="store_true")
parser.add_option("--create_data", dest="create_data", default=False, action="store_true")
parser.add_option("-n", "--num_columns", dest="num_columns", type="int")
parser.add_option("-o", "--output_file", dest="output_file")
# type="int" ensures a --num_rows value given on the command line is parsed as an
# integer rather than left as a string (xrange() would reject a string).
parser.add_option("--num_rows", dest="num_rows", type="int", default=10)


def get_columns(num_cols):
  """Returns 'num_cols' column declarations, cycling through every column type, as a
  list of strings."""
  templates = [
      'bool_col%i BOOLEAN',
      'tinyint_col%i TINYINT',
      'smallint_col%i SMALLINT',
      'int_col%i INT',
      'bigint_col%i BIGINT',
      'float_col%i FLOAT',
      'double_col%i DOUBLE',
      'string_col%i STRING',
  ]
  iter = itertools.cycle(templates)
  # Produces [bool_col1, tinyint_col1, ..., bool_col2, tinyint_col2, ...]
  # The final list has 'num_cols' elements.
  return [iter.next() % (i / len(templates) + 1) for i in xrange(num_cols)]
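
# For illustration, the values follow from the templates above:
#   get_columns(3) -> ['bool_col1 BOOLEAN', 'tinyint_col1 TINYINT', 'smallint_col1 SMALLINT']
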
# Data generators for different types. Each generator yields an infinite number of
# value strings suitable for writing to a CSV file.
def bool_generator():
  """Generates True, False repeating"""
  b = True
  while True:
    yield str(b)
    b = not b


def integer_generator():
  """Generates 0..4 repeating"""
  i = 0
  while True:
    yield str(i % 5)
    i += 1


def floating_point_generator():
  """Generates 0.0, 1.1, ..., 4.4 repeating"""
  i = 0
  while True:
    yield str((i % 5) * 1.1)
    i += 1
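
# For illustration, the first few values of each generator:
#   list(itertools.islice(bool_generator(), 3))           -> ['True', 'False', 'True']
#   list(itertools.islice(floating_point_generator(), 3)) -> ['0.0', '1.1', '2.2']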


def quote(iter_fn):
  """Returns a generator function that yields the values of 'iter_fn' wrapped in
  single quotes."""
  def new_iter_fn():
    iter = iter_fn()
    while True:
      yield "'%s'" % iter.next()
  return new_iter_fn
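
# For illustration: quote(integer_generator)() yields "'0'", "'1'", "'2'", ...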


def get_data(num_cols, num_rows, delimiter=',', quote_strings=False):
  """Returns the data for the given number of rows and columns as a list of strings,
  each of which is a row delimited by 'delimiter'."""
  generators = [
      bool_generator,            # boolean
      integer_generator,         # tinyint
      integer_generator,         # smallint
      integer_generator,         # int
      integer_generator,         # bigint
      floating_point_generator,  # float
      floating_point_generator,  # double
      quote(integer_generator) if quote_strings else integer_generator,  # string
  ]
  # Create a generator instance for each column, cycling through the different types
  iter = itertools.cycle(generators)
  column_generators = [iter.next()() for i in xrange(num_cols)]
  # Populate each row using column_generators
  rows = []
  for i in xrange(num_rows):
    vals = [gen.next() for gen in column_generators]
    rows.append(delimiter.join(vals))
  return rows
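
# For illustration: get_data(2, 2) returns ['True,0', 'False,1']; the two columns cycle
# through (boolean, tinyint), and each column's generator advances once per row.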


if __name__ == "__main__":
  (options, args) = parser.parse_args()
  if options.get_columns == options.create_data:
    parser.error("Must specify either --get_columns or --create_data")
  if not options.num_columns:
    parser.error("--num_columns option must be specified")

  if options.get_columns:
    # Output column descriptors
    print '\n'.join(get_columns(options.num_columns))

  if options.create_data:
    # Generate data locally, and output the command template to load it into HDFS
    if not options.output_file:
      parser.error("--output_file option must be specified")
    with open(options.output_file, "w") as f:
      for row in get_data(options.num_columns, options.num_rows):
        f.write(row)
        f.write('\n')
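    # The braces in the statement below are str.format() placeholders; presumably the
    # data-load framework substitutes the target database and table name later.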
print ("LOAD DATA LOCAL INPATH '%s' "
"OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};"
% options.output_file)