Files
impala/tests/metadata/test_show_create_table.py
Sailesh Mukil ed7f5ebf53 IMPALA-1878: Support INSERT and LOAD DATA on S3 and between filesystems
Previously Impala disallowed LOAD DATA and INSERT on S3. This patch
functionally enables LOAD DATA and INSERT on S3 without making major
changes for the sake of improving performance over S3. This patch also
enables both INSERT and LOAD DATA between file systems.

S3 does not support the rename operation, so the staged files in S3
are copied instead of renamed, which contributes to the slow
performance on S3.

The FinalizeSuccessfulInsert() function now does not make any
underlying assumptions of the filesystem it is on and works across
all supported filesystems. This is done by adding a full URI field to
the base directory for a partition in the TInsertPartitionStatus.
Also, the HdfsOp class now does not assume a single filesystem and
gets connections to the filesystems based on the URI of the file it
is operating on.

Added a python S3 client called 'boto3' to access S3 from the python
tests. A new class called S3Client is introduced which creates
wrappers around the boto3 functions that have the same function
signatures as PyWebHdfsClient by deriving from a base abstract class
BaseFileSystem, so that the two clients can be used interchangeably
through a 'generic_client'. test_load.py is refactored to use this generic
client. The ImpalaTestSuite setup creates a client according to the
TARGET_FILESYSTEM environment variable and assigns it to the
'generic_client'.

P.S: Currently, the test_load.py runs 4x slower on S3 than on
HDFS. Performance needs to be improved in future patches. INSERT
performance is slower than on HDFS too. This is mainly because of an
extra copy that happens between staging and the final location of a
file. However, larger INSERTs come closer to HDFS performance than
smaller inserts.

ACLs are not taken care of for S3 in this patch. It is something
that still needs to be discussed before implementing.

Change-Id: I94e15ad67752dce21c9b7c1dced6e114905a942d
Reviewed-on: http://gerrit.cloudera.org:8080/2574
Reviewed-by: Sailesh Mukil <sailesh@cloudera.com>
Tested-by: Internal Jenkins
2016-05-12 14:17:49 -07:00

239 lines
10 KiB
Python

# Copyright (c) 2013 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import shlex
from subprocess import call
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.common.skip import SkipIf, SkipIfS3
from tests.util.filesystem_utils import WAREHOUSE
from tests.util.test_file_parser import remove_comments
# The purpose of the show create table tests is to ensure that the "SHOW CREATE TABLE"
# output can actually be used to recreate the table. A test consists of a table
# definition. The table is created, then the output of "SHOW CREATE TABLE" is used to
# test if the table can be recreated. This test class does not support --update-results.
class TestShowCreateTable(ImpalaTestSuite):
  """Checks SHOW CREATE TABLE/VIEW output and replays it to recreate the object.

  Test sections either create a table/view and verify (then re-execute) its
  SHOW CREATE output, or run a SHOW CREATE query against an existing table.
  """
  VALID_SECTION_NAMES = ["CREATE_TABLE", "CREATE_VIEW", "QUERY", "RESULTS"]

  # Properties to filter before comparing results
  FILTER_TBL_PROPERTIES = ["transient_lastDdlTime", "numFiles", "numPartitions",
                           "numRows", "rawDataSize", "totalSize",
                           "COLUMN_STATS_ACCURATE", "STATS_GENERATED_VIA_STATS_TASK"]

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload whose test files are loaded."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Restricts the test matrix to a single uncompressed-text vector."""
    super(TestShowCreateTable, cls).add_test_dimensions()
    # don't use any exec options, running exactly once is fine
    cls.TestMatrix.clear_dimension('exec_option')
    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
    cls.TestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'text' and
        v.get_value('table_format').compression_codec == 'none')

  def test_show_create_table(self, vector, unique_database):
    self.__run_show_create_table_test_case('QueryTest/show-create-table', vector,
                                           unique_database)

  @SkipIf.kudu_not_supported
  def test_kudu_show_create_table(self, vector, unique_database):
    self.__run_show_create_table_test_case('QueryTest/kudu-show-create', vector,
                                           unique_database)

  def __run_show_create_table_test_case(self, test_file_name, vector, unique_db_name):
    """
    Runs a show-create-table test file, containing the following sections:

    ---- CREATE_TABLE
    contains a table creation statement to create table TABLE_NAME
    ---- RESULTS
    contains the expected result of SHOW CREATE TABLE table_name

    OR

    ---- CREATE_VIEW
    contains a view creation statement to create view VIEW_NAME
    ---- RESULTS
    contains the expected result of SHOW CREATE VIEW view_name

    OR

    ---- QUERY
    a show create table query
    ---- RESULTS
    contains the expected output of the SHOW CREATE TABLE query

    unique_db_name is the name of the database to use for all tables and
    views and must be unique so as not to conflict with other tests.
    """
    sections = self.load_query_test_file(self.get_workload(), test_file_name,
                                         self.VALID_SECTION_NAMES)
    for test_section in sections:
      test_case = ShowCreateTableTestCase(test_section, test_file_name, unique_db_name)
      creates_object = not test_case.existing_table

      if creates_object:
        # create table in Impala
        self.__exec(test_case.create_table_sql)
      try:
        # execute "SHOW CREATE TABLE ..."
        result = self.__exec(test_case.show_create_table_sql)
        create_table_result = self.__normalize(result.data[0])
      finally:
        # Always drop what this section created, even when SHOW CREATE fails, so the
        # first phase cleans up the same way the replay phase below does.
        if creates_object:
          self.__exec(test_case.drop_table_sql)

      # check the result matches the expected result
      expected_result = self.__normalize(self.__replace_uri(
          test_case.expected_result,
          self.__get_location_uri(create_table_result)))
      self.__compare_result(expected_result, create_table_result)

      if not creates_object:
        continue

      # recreate the table with the result from above
      self.__exec(create_table_result)
      try:
        # we should get the same result from "show create table ..."
        result = self.__exec(test_case.show_create_table_sql)
        new_create_table_result = self.__normalize(result.data[0])
        assert create_table_result == new_create_table_result
      finally:
        # drop the table
        self.__exec(test_case.drop_table_sql)

  def __exec(self, sql_str):
    """Executes sql_str on the test client and expects it to succeed."""
    return self.execute_query_expect_success(self.client, sql_str)

  def __get_location_uri(self, sql_str):
    """Returns the URI of the first LOCATION '...' clause in sql_str, or None."""
    m = re.search(r"LOCATION '([^']+)'", sql_str)
    if m is None:
      return None
    return m.group(1)

  def __compare_result(self, expected_sql, actual_sql):
    """Asserts that expected_sql and actual_sql are equivalent.

    The TBLPROPERTIES and SERDEPROPERTIES maps are compared as dicts (so the order of
    the properties does not matter); the rest of the statement is compared as text
    with those maps removed.
    """
    expected_tbl_props = self.__get_properties_map(expected_sql, "TBLPROPERTIES")
    actual_tbl_props = self.__get_properties_map(actual_sql, "TBLPROPERTIES")
    assert expected_tbl_props == actual_tbl_props

    expected_serde_props = self.__get_properties_map(expected_sql, "SERDEPROPERTIES")
    actual_serde_props = self.__get_properties_map(actual_sql, "SERDEPROPERTIES")
    assert expected_serde_props == actual_serde_props

    expected_sql_filtered = self.__remove_properties_maps(expected_sql)
    actual_sql_filtered = self.__remove_properties_maps(actual_sql)
    assert expected_sql_filtered == actual_sql_filtered

  def __normalize(self, s):
    """ Normalize the string to remove extra whitespaces and remove keys
    from tblproperties and serdeproperties that we don't want
    """
    s = ' '.join(s.split())
    for k in self.FILTER_TBL_PROPERTIES:
      kv_regex = r"'%s'\s*=\s*'[^']+'\s*,?" % re.escape(k)
      s = re.sub(kv_regex, "", s)
    # If we removed the last property, there will be a dangling comma that is not valid
    # e.g. 'k1'='v1', ) -> 'k1'='v1')
    s = re.sub(r",\s*\)", ")", s)
    # Need to remove any whitespace after left parens and before right parens
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    # If the only properties were removed, the properties sections may be empty, which
    # is not valid
    s = re.sub(r"TBLPROPERTIES\s*\(\s*\)", "", s)
    s = re.sub(r"SERDEPROPERTIES\s*\(\s*\)", "", s)
    return s

  def __properties_map_regex(self, name):
    """Returns a pattern matching 'name (...)' that captures the text in the parens."""
    return r"%s \(([^)]+)\)" % name

  def __remove_properties_maps(self, s):
    """ Removes the tblproperties and serdeproperties from the string """
    without_tbl_props = re.sub(self.__properties_map_regex("TBLPROPERTIES"), "", s)
    without_any_props = re.sub(self.__properties_map_regex("WITH SERDEPROPERTIES"), "",
                               without_tbl_props)
    return without_any_props.strip()

  def __get_properties_map(self, s, properties_map_name):
    """ Extracts a dict of key-value pairs from the sql string s. The properties_map_name
    is the name of the properties map as it appears in s; the match is case-sensitive,
    e.g. 'TBLPROPERTIES' or 'SERDEPROPERTIES'. Keys listed in FILTER_TBL_PROPERTIES are
    left out of the returned dict.
    """
    map_match = re.search(self.__properties_map_regex(properties_map_name), s)
    if map_match is None:
      return dict()
    kv_regex = r"'([^']+)'\s*=\s*'([^']+)'"
    kv_results = dict(re.findall(kv_regex, map_match.group(1)))
    for filtered_key in self.FILTER_TBL_PROPERTIES:
      kv_results.pop(filtered_key, None)
    return kv_results

  def __replace_uri(self, s, uri):
    """Substitutes uri for the $$location_uri$$ placeholder; no-op if uri is None."""
    return s if uri is None else s.replace("$$location_uri$$", uri)
# Represents one show-create-table test case. Performs validation of the test sections
# and provides SQL to execute for each section.
class ShowCreateTableTestCase(object):
  """One show-create-table test case built from a parsed test-file section.

  After construction the following attributes are set:
    existing_table: True for a QUERY section (nothing is created or dropped).
    show_create_table_sql: the SHOW CREATE statement to run.
    expected_result: the RESULTS section with comments removed and the
      RESULTS_DB_NAME_TOKEN placeholder replaced by the test database name.
  and, for CREATE_TABLE / CREATE_VIEW sections only:
    table_name, create_table_sql, drop_table_sql.
  """
  RESULTS_DB_NAME_TOKEN = "show_create_table_test_db"

  def __init__(self, test_section, test_file_name, test_db_name):
    """Validates test_section and derives the SQL statements for it.

    Raises AssertionError if the section has none of QUERY, CREATE_TABLE or
    CREATE_VIEW.
    """
    if 'QUERY' in test_section:
      self.existing_table = True
      self.show_create_table_sql = remove_comments(test_section['QUERY']).strip()
    elif 'CREATE_TABLE' in test_section:
      self.__process_create_section(test_section['CREATE_TABLE'], test_file_name,
                                    test_db_name, 'table')
    elif 'CREATE_VIEW' in test_section:
      self.__process_create_section(test_section['CREATE_VIEW'], test_file_name,
                                    test_db_name, 'view')
    else:
      # Imported here so the error path does not rely on a star import providing it.
      import pprint
      raise AssertionError(
          'Error in test file %s. Test cases require a QUERY, CREATE_TABLE or '
          'CREATE_VIEW section.\n%s' % (test_file_name, pprint.pformat(test_section)))
    expected_result = remove_comments(test_section['RESULTS'])
    self.expected_result = expected_result.replace(
        ShowCreateTableTestCase.RESULTS_DB_NAME_TOKEN, test_db_name)

  def __process_create_section(self, section, test_file_name, test_db_name, table_type):
    """Builds the create/show/drop statements for a CREATE_TABLE/CREATE_VIEW section.

    The object name in the create statement must be unqualified; it is rewritten to
    live in test_db_name. table_type is 'table' or 'view'.
    """
    self.existing_table = False
    self.create_table_sql = QueryTestSectionReader.build_query(remove_comments(section))
    name = self.__get_table_name(self.create_table_sql, table_type)
    if "." in name:
      raise AssertionError(
          'Error in test file %s. Found unexpected %s '
          'name %s that is qualified with a database' %
          (test_file_name, table_type, name))
    self.table_name = test_db_name + '.' + name
    self.create_table_sql = self.__qualify_name(self.create_table_sql, name,
                                                self.table_name)
    self.show_create_table_sql = 'show create %s %s' % (table_type, self.table_name)
    self.drop_table_sql = "drop %s %s" % (table_type, self.table_name)

  @staticmethod
  def __qualify_name(sql, name, qualified_name):
    """Replaces the first standalone occurrence of name in sql with qualified_name.

    Matching a whole identifier (not a substring) keeps short names such as 't' from
    being replaced inside the leading keywords, e.g. the 't' in 'create'.
    """
    pattern = r'(?<![\w.])%s(?![\w.])' % re.escape(name)
    # A callable replacement keeps backslashes in qualified_name from being
    # interpreted as group references.
    qualified_sql, num_replaced = re.subn(pattern, lambda _: qualified_name, sql,
                                          count=1)
    if num_replaced == 0:
      # Nothing matched as a standalone identifier: fall back to replacing the first
      # plain occurrence.
      return sql.replace(name, qualified_name, 1)
    return qualified_sql

  def __get_table_name(self, create_table_sql, table_type):
    """Returns the table/view name token from a CREATE [EXTERNAL] <table_type> statement.

    A database-qualified name is returned as a single 'db.name' token so the caller
    can detect and reject it. Raises AssertionError if the statement does not start
    with 'create <table_type> <name>' or 'create external <table_type> <name>'.
    """
    lexer = shlex.shlex(create_table_sql)
    # Keep 'db.tbl' together as one token; by default shlex splits it at the '.'.
    lexer.wordchars += '.'
    tokens = list(lexer)

    expected_type = table_type.lower()
    leading = [token.lower() for token in tokens[:3]]
    # sanity check the create table statement
    if len(tokens) >= 3 and leading[0] == "create":
      if leading[1] == expected_type:
        # expect a create table table_name ...
        return tokens[2]
      if leading[1] == "external" and leading[2] == expected_type and len(tokens) >= 4:
        # expect "create external table table_name ..."
        return tokens[3]
    raise AssertionError('Error in test. Invalid CREATE %s statement: %s' %
                         (table_type.upper(), create_table_sql))