#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Impala tests for DDL statements
import logging
import pytest
import re
import shlex
import time
from tests.common.test_result_verifier import *
from subprocess import call
from tests.common.test_vector import *
from tests.common.test_dimensions import ALL_NODES_ONLY
from tests.common.impala_test_suite import *

# Validates DDL statements (create, drop)
class TestDdlStatements(ImpalaTestSuite):
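  # Databases created by these tests; cleanup() drops them all before each test
  # method so every run starts from a clean slate.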
  TEST_DBS = ['ddl_test_db', 'alter_table_test_db', 'alter_table_test_db2',
              'function_ddl_test', 'udf_test', 'data_src_test']

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestDdlStatements, cls).add_test_dimensions()
    sync_ddl_opts = [0, 1]
    if cls.exploration_strategy() != 'exhaustive':
      # Only run with sync_ddl on exhaustive since it increases test runtime.
      sync_ddl_opts = [0]

    cls.TestMatrix.add_dimension(create_exec_option_dimension(
        cluster_sizes=ALL_NODES_ONLY,
        disable_codegen_options=[False],
        batch_sizes=[0],
        sync_ddl=sync_ddl_opts))
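
    # Note: each added dimension appears to be crossed with the others, so every test
    # vector pairs one exec-option combination with one table format.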
    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))

  def setup_method(self, method):
    self.cleanup()

    # Get the current number of queries that are in the 'EXCEPTION' state. Used for
    # verification after running each test case.
    self.start_exception_count = self.query_exception_count()
    self.cleanup_hdfs_dirs()

  def teardown_method(self, method):
    end_exception_count = self.query_exception_count()
    # The number of exceptions may be less than what was recorded in setup if queries
    # in the EXCEPTION state were bumped out of the FINISHED list. We should never see
    # an increase in the number of queries in the EXCEPTION state.
    assert end_exception_count <= self.start_exception_count

  def query_exception_count(self):
    """Returns the number of occurrences of 'EXCEPTION' on the debug /queries page."""
    return len(re.findall('EXCEPTION',
        self.impalad_test_service.read_debug_webpage('queries')))

  def cleanup(self):
    map(self.cleanup_db, self.TEST_DBS)
    self.cleanup_hdfs_dirs()

  def cleanup_hdfs_dirs(self):
    # Cleanup the test table HDFS dirs between test runs so there are no errors the
    # next time a table is created with the same location. This also helps remove any
    # stale data from the last test run.
    self.hdfs_client.delete_file_dir("test-warehouse/part_data/", recursive=True)
    self.hdfs_client.delete_file_dir("test-warehouse/t1_tmp1/", recursive=True)
    self.hdfs_client.delete_file_dir("test-warehouse/t_part_tmp/", recursive=True)

  @pytest.mark.execute_serially
  def test_drop_cleans_hdfs_dirs(self):
    self.hdfs_client.delete_file_dir("test-warehouse/ddl_test_db.db/", recursive=True)
    assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")

    self.client.execute('use default')
    self.client.execute('create database ddl_test_db')
    # Verify the db directory exists
    assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")

    self.client.execute("create table ddl_test_db.t1(i int)")
    # Verify the table directory exists
    assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/")

    # Dropping the table removes the table's directory and preserves the db's directory
    self.client.execute("drop table ddl_test_db.t1")
    assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/")
    assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")

    # Dropping the db removes the db's directory
    self.client.execute("drop database ddl_test_db")
    assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")

  @pytest.mark.execute_serially
  def test_create(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = False
    self.__create_db_synced('ddl_test_db', vector)
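    # run_test_case executes the statements in the named .test file (presumably
    # testdata/workloads/functional-query/queries/QueryTest/create.test) and verifies
    # the results returned for each query section.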
    self.run_test_case('QueryTest/create', vector, use_db='ddl_test_db',
        multiple_impalad=self.__use_multiple_impalad(vector))

  @pytest.mark.execute_serially
  def test_alter_table(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = False
    # Create a directory for partition data that does not use the (key=value)
    # format.
    self.hdfs_client.make_dir("test-warehouse/part_data/", permission=777)
    self.hdfs_client.create_file("test-warehouse/part_data/data.txt", file_data='1984')
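    # Assumption: the alter-table .test file points a partition at this directory via
    # ALTER TABLE ... SET LOCATION, which is why it must exist before the test runs.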

    # Create test databases
    self.__create_db_synced('alter_table_test_db', vector)
    self.__create_db_synced('alter_table_test_db2', vector)
    self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db',
        multiple_impalad=self.__use_multiple_impalad(vector))

  @pytest.mark.execute_serially
  def test_views_ddl(self, vector):
    vector.get_value('exec_option')['abort_on_error'] = False
    self.__create_db_synced('ddl_test_db', vector)
    self.run_test_case('QueryTest/views-ddl', vector, use_db='ddl_test_db',
        multiple_impalad=self.__use_multiple_impalad(vector))

  @pytest.mark.execute_serially
  def test_functions_ddl(self, vector):
    self.__create_db_synced('function_ddl_test', vector)
    self.run_test_case('QueryTest/functions-ddl', vector, use_db='function_ddl_test',
        multiple_impalad=self.__use_multiple_impalad(vector))

  @pytest.mark.execute_serially
  def test_create_drop_function(self, vector):
    # This will create, run, and drop the same function repeatedly, exercising the
    # lib cache mechanism.
    create_fn_stmt = """create function f() returns int
        location '/test-warehouse/libTestUdfs.so' symbol='NoArgs'"""
    select_stmt = "select f() from functional.alltypes limit 10"
    drop_fn_stmt = "drop function %s f()"
    self.create_drop_ddl(vector, "udf_test", [create_fn_stmt], [drop_fn_stmt],
        select_stmt)

  @pytest.mark.execute_serially
  def test_create_drop_data_src(self, vector):
    # This will create, run, and drop the same data source repeatedly, exercising
    # the lib cache mechanism.
    create_ds_stmt = """CREATE DATA SOURCE test_data_src
        LOCATION '/test-warehouse/data-sources/test-data-source.jar'
        CLASS 'com.cloudera.impala.extdatasource.AllTypesDataSource'
        API_VERSION 'V1'"""
    create_tbl_stmt = """CREATE TABLE data_src_tbl (x int)
        PRODUCED BY DATA SOURCE test_data_src"""
    drop_ds_stmt = "drop data source %s test_data_src"
    drop_tbl_stmt = "drop table %s data_src_tbl"
    select_stmt = "select * from data_src_tbl limit 1"

    create_stmts = [create_ds_stmt, create_tbl_stmt]
    drop_stmts = [drop_tbl_stmt, drop_ds_stmt]
    self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts,
        select_stmt)

  def create_drop_ddl(self, vector, db_name, create_stmts, drop_stmts, select_stmt):
    # Helper method to run CREATE/DROP DDL commands repeatedly and exercise the lib
    # cache. create_stmts is the list of CREATE statements to be executed in order, and
    # drop_stmts is the list of DROP statements to be executed in order. Each statement
    # should have a '%s' placeholder to insert "IF EXISTS" or "". select_stmt is a
    # single statement to run after executing the CREATE statements.
    # TODO: it's hard to tell that the cache is working (i.e. if it did nothing to drop
    # the cache, these tests would still pass). Testing that is a bit harder and
    # requires us to update the udf binary in the middle.
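    # For example, with drop_stmt = "drop function %s f()", the placeholder expands as:
    #   drop_stmt % ("if exists")  ->  "drop function if exists f()"
    #   drop_stmt % ("")           ->  "drop function  f()"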
    self.__create_db_synced(db_name, vector)
    self.client.set_configuration(vector.get_value('exec_option'))

    self.client.execute("use %s" % (db_name,))
    for drop_stmt in drop_stmts: self.client.execute(drop_stmt % ("if exists"))
    for i in xrange(1, 10):
      for create_stmt in create_stmts: self.client.execute(create_stmt)
      self.client.execute(select_stmt)
      for drop_stmt in drop_stmts: self.client.execute(drop_stmt % (""))

  @pytest.mark.execute_serially
  def test_create_alter_bulk_partition(self, vector):
    # Change the scale depending on the exploration strategy: with 50 partitions this
    # takes a few minutes to run, while with 10 partitions it takes ~50s for two
    # configurations.
    num_parts = 50
    if self.exploration_strategy() != 'exhaustive': num_parts = 10

    self.client.execute("use default")
    self.client.execute("drop table if exists foo_part")
    self.client.execute("create table foo_part(i int) partitioned by(j int, s string)")

    # Add some partitions (first of two batches)
    for i in xrange(num_parts / 5):
      start = time.time()
      self.client.execute("alter table foo_part add partition(j=%d, s='%s')" % (i, i))
      print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start)

    # Modify one of the partitions
    self.client.execute("""alter table foo_part partition(j=1, s='1')
        set fileformat parquetfile""")

    # Alter one partition to a non-existent location twice (IMPALA-741)
    self.hdfs_client.delete_file_dir("tmp/dont_exist1/", recursive=True)
    self.hdfs_client.delete_file_dir("tmp/dont_exist2/", recursive=True)

    self.execute_query_expect_success(self.client,
        "alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist1'")
    self.execute_query_expect_success(self.client,
        "alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist2'")

    # Add the remaining partitions (second batch)
    for i in xrange(num_parts / 5, num_parts):
      start = time.time()
      self.client.execute("alter table foo_part add partition(j=%d,s='%s')" % (i, i))
      print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start)

    # Insert data and verify it shows up.
    self.client.execute("insert into table foo_part partition(j=1, s='1') select 1")
    assert '1' == self.execute_scalar("select count(*) from foo_part")
    self.client.execute("drop table foo_part")

  @pytest.mark.execute_serially
  def test_create_alter_tbl_properties(self, vector):
    self.__create_db_synced('alter_table_test_db', vector)
    self.client.execute("use alter_table_test_db")

    # Specify TBLPROPERTIES and SERDEPROPERTIES at CREATE time
    self.client.execute("""create table test_alter_tbl (i int)
        with serdeproperties ('s1'='s2', 's3'='s4')
        tblproperties ('p1'='v0', 'p1'='v1')""")
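    # 'p1' appears twice above on purpose; only the last value ('v1') is kept, which
    # the assertion below verifies.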
    properties = self.__get_tbl_properties('test_alter_tbl')

    assert len(properties) == 2
    # The transient_lastDdlTime is variable, so don't verify the value.
    assert 'transient_lastDdlTime' in properties
    del properties['transient_lastDdlTime']
    assert {'p1': 'v1'} == properties

    properties = self.__get_serde_properties('test_alter_tbl')
    assert {'s1': 's2', 's3': 's4'} == properties

    # Modify the SERDEPROPERTIES using ALTER TABLE SET.
    self.client.execute("alter table test_alter_tbl set serdeproperties "
        "('s1'='new', 's5'='s6')")
    properties = self.__get_serde_properties('test_alter_tbl')
    assert {'s1': 'new', 's3': 's4', 's5': 's6'} == properties

    # Modify the TBLPROPERTIES using ALTER TABLE SET.
    self.client.execute("alter table test_alter_tbl set tblproperties "
        "('prop1'='val1', 'p2'='val2', 'p2'='val3', ''='')")
    properties = self.__get_tbl_properties('test_alter_tbl')

    assert len(properties) == 5
    assert 'transient_lastDdlTime' in properties
    del properties['transient_lastDdlTime']
    assert {'p1': 'v1', 'prop1': 'val1', 'p2': 'val3', '': ''} == properties

  @classmethod
  def __use_multiple_impalad(cls, vector):
    return vector.get_value('exec_option')['sync_ddl'] == 1

  @classmethod
  def __create_db_synced(cls, db_name, vector):
    """Creates a database using synchronized DDL to ensure all nodes have the test
    database available for use before executing the .test file(s).
    """
    cls.client.execute('use default')
    cls.client.set_configuration({'sync_ddl': 1})
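    # With sync_ddl enabled, the CREATE DATABASE below should not return until the new
    # database is visible to every impalad in the cluster.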
    cls.client.execute('create database %s' % db_name)
    cls.client.set_configuration(vector.get_value('exec_option'))

  def __get_tbl_properties(self, table_name):
    """Extracts the table properties mapping from the output of DESCRIBE FORMATTED"""
    return self.__get_properties('Table Parameters:', table_name)

  def __get_serde_properties(self, table_name):
    """Extracts the serde properties mapping from the output of DESCRIBE FORMATTED"""
    return self.__get_properties('Storage Desc Params:', table_name)

  def __get_properties(self, section_name, table_name):
    """Extracts the requested properties section from the output of DESCRIBE
    FORMATTED as a dict of key/value pairs."""
    result = self.client.execute("describe formatted " + table_name)
    match = False
    properties = dict()
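    # Each row of DESCRIBE FORMATTED output is tab-separated: within a properties
    # section the first field is empty, the second holds the key, and the third holds
    # the value, all padded with trailing whitespace. A second field of 'NULL' appears
    # to mark the end of the section.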
    for row in result.data:
      if section_name in row:
        match = True
      elif match:
        row = row.split('\t')
        if row[1] == 'NULL':
          break
        properties[row[1].rstrip()] = row[2].rstrip()
    return properties