Dataload typically follows a pattern of loading data into a text version of a table, and then using an insert overwrite from the text table to populate the table for other file formats. This insert is always done in Impala for Parquet and Kudu; otherwise it runs in Hive. Since Impala doesn't support writing nested data, the population of complextypes_fileformat tries to hack the insert to run in Hive by including it in the ALTER part of the table definition. ALTER runs immediately after CREATE and always runs in Hive. The problem is that ALTER also runs before the base table (functional.complextypes_fileformat) is populated. The insert succeeds, but it inserts zero rows.

This change introduces a way to force the Parquet load to run in Hive. This lets complextypes_fileformat specify that the insert should happen in Hive and fixes the ordering so that the table is populated correctly. It is also useful for loading custom Parquet files into Parquet tables.

Hive supports the LOAD DATA LOCAL syntax, which can read a file from the local filesystem. This means that several locations that currently use the hdfs command line can be modified to use this SQL instead, which speeds up dataload by a few minutes by avoiding the overhead of the hdfs command line. Every other location that can use LOAD DATA LOCAL is also switched over. This includes the testescape* tables, which now print the appropriate LOAD DATA commands as a result of text_delims_table.py. Any location that already uses LOAD DATA LOCAL is switched to indicate that it must run in Hive, and any location that was doing an HDFS command in the LOAD section is moved to the LOAD_DEPENDENT_HIVE section.

Testing: Ran dataload and core tests. Also verified that functional_parquet.complextypes_fileformat has rows.

Change-Id: I7152306b2907198204a6d8d282a0bad561129b82
Reviewed-on: http://gerrit.cloudera.org:8080/8350
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins
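For illustration, the text-then-convert pattern looks roughly like this (a minimal sketch; the table and path names are hypothetical, not taken from the dataload scripts):

    -- Populate the text version of the table. LOCAL reads the file from the
    -- local filesystem instead of requiring an hdfs command to stage it first.
    LOAD DATA LOCAL INPATH '/tmp/widetable.csv'
    OVERWRITE INTO TABLE functional.widetable;

    -- Convert to the other file format with an insert overwrite (run in Impala
    -- for Parquet and Kudu, otherwise in Hive).
    INSERT OVERWRITE TABLE functional_parquet.widetable
    SELECT * FROM functional.widetable;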
137 lines · 4.6 KiB · Python · Executable File
#!/usr/bin/env impala-python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Functions for creating wide (i.e. many-column) tables. When run from the command
# line, specify either --get_columns to generate column descriptors, or --create_data
# to generate a CSV data file and print a SQL load statement to incorporate into
# dataload SQL script generation.

from datetime import datetime, timedelta
import itertools
import optparse

parser = optparse.OptionParser()
parser.add_option("--get_columns", dest="get_columns", default=False, action="store_true")
parser.add_option("--create_data", dest="create_data", default=False, action="store_true")
parser.add_option("-n", "--num_columns", dest="num_columns", type="int")
parser.add_option("-o", "--output_file", dest="output_file")
# type="int" so a user-supplied --num_rows is parsed as an integer rather than left as
# a string, which would break the xrange() call below.
parser.add_option("--num_rows", dest="num_rows", default=10, type="int")

def get_columns(num_cols):
  """Returns 'num_cols' column declarations, cycling through every column type, as a
  list of strings."""
  templates = [
    'bool_col%i BOOLEAN',
    'tinyint_col%i TINYINT',
    'smallint_col%i SMALLINT',
    'int_col%i INT',
    'bigint_col%i BIGINT',
    'float_col%i FLOAT',
    'double_col%i DOUBLE',
    'string_col%i STRING',
  ]

  iter = itertools.cycle(templates)
  # Produces [bool_col1, tinyint_col1, ..., bool_col2, tinyint_col2, ...]
  # The final list has 'num_cols' elements.
  return [iter.next() % (i / len(templates) + 1) for i in xrange(num_cols)]
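
# For example, get_columns(3) returns:
#   ['bool_col1 BOOLEAN', 'tinyint_col1 TINYINT', 'smallint_col1 SMALLINT']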

# Data generators for different types. Each generator yields an infinite number of
# value strings suitable for writing to a CSV file.

def bool_generator():
  """Generates True, False repeating"""
  b = True
  while True:
    yield str(b)
    b = not b

def integer_generator():
  """Generates 0..4 repeating"""
  i = 0
  while True:
    yield str(i % 5)
    i += 1

def floating_point_generator():
  """Generates 0, 1.1, ..., 4.4 repeating"""
  i = 0
  while True:
    yield str((i % 5) * 1.1)
    i += 1

def quote(iter_fn):
  """Returns a generator function whose values are the values of iter_fn wrapped in
  single quotes."""
  def new_iter_fn():
    iter = iter_fn()
    while True:
      yield "'%s'" % iter.next()
  return new_iter_fn
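
# For example, quote(integer_generator)() yields "'0'", "'1'", "'2'", ..., so string
# columns can be emitted with surrounding single quotes when needed.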

def get_data(num_cols, num_rows, delimiter=',', quote_strings=False):
  """Returns the data for the given number of rows and columns as a list of strings,
  each of which is a row delimited by 'delimiter'."""
  generators = [
    bool_generator,            # boolean
    integer_generator,         # tinyint
    integer_generator,         # smallint
    integer_generator,         # int
    integer_generator,         # bigint
    floating_point_generator,  # float
    floating_point_generator,  # double
    quote(integer_generator) if quote_strings else integer_generator,  # string
  ]
  # Create a generator instance for each column, cycling through the different types
  iter = itertools.cycle(generators)
  column_generators = [iter.next()() for i in xrange(num_cols)]

  # Populate each row using column_generators
  rows = []
  for i in xrange(num_rows):
    vals = [gen.next() for gen in column_generators]
    rows.append(delimiter.join(vals))
  return rows
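
# For example, get_data(3, 2) returns ['True,0,0', 'False,1,1']: each column gets its
# own generator instance, so columns of the same type advance independently.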

if __name__ == "__main__":
  (options, args) = parser.parse_args()

  if options.get_columns == options.create_data:
    parser.error("Must specify either --get_columns or --create_data")

  if not options.num_columns:
    parser.error("--num_columns option must be specified")

  if options.get_columns:
    # Output column descriptors
    print '\n'.join(get_columns(options.num_columns))

  if options.create_data:
    # Generate data locally, and output the SQL load command for use in dataload
    if not options.output_file:
      parser.error("--output_file option must be specified")

    with open(options.output_file, "w") as f:
      for row in get_data(options.num_columns, options.num_rows):
        f.write(row)
        f.write('\n')

    print ("LOAD DATA LOCAL INPATH '%s' "
           "OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};"
           % options.output_file)
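
# Example invocation during dataload SQL generation (script name and paths are
# hypothetical here):
#   impala-python widetable.py --create_data --num_columns 10 -o /tmp/widetable.csv
# This writes 10 rows of CSV to /tmp/widetable.csv and prints the LOAD DATA LOCAL
# statement above, leaving the {db_name}{db_suffix}.{table_name} placeholders for
# the dataload scripts to fill in.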