#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Impala tests for DDL statements
import logging
import pytest
import re
import shlex
import time
from tests.common.test_result_verifier import *
from subprocess import call
from tests.common.test_vector import *
from tests.common.test_dimensions import ALL_NODES_ONLY
from tests.common.impala_test_suite import *

# Validates DDL statements (create, drop)
class TestDdlStatements(ImpalaTestSuite):
TEST_DBS = ['ddl_test_db', 'alter_table_test_db', 'alter_table_test_db2',
'function_ddl_test', 'udf_test', 'data_src_test']
  @classmethod
  def get_workload(cls):
    return 'functional-query'
@classmethod
def add_test_dimensions(cls):
super(TestDdlStatements, cls).add_test_dimensions()
    sync_ddl_opts = [0, 1]
    if cls.exploration_strategy() != 'exhaustive':
      # Only run with sync_ddl in exhaustive mode since it increases test runtime.
      sync_ddl_opts = [0]
cls.TestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=ALL_NODES_ONLY,
disable_codegen_options=[False],
batch_sizes=[0],
sync_ddl=sync_ddl_opts))
# There is no reason to run these tests using all dimensions.
cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
  def setup_method(self, method):
    # cleanup() also removes the test HDFS directories via cleanup_hdfs_dirs().
    self.cleanup()
    # Get the current number of queries that are in the 'EXCEPTION' state. Used for
    # verification after running each test case.
    self.start_exception_count = self.query_exception_count()
def teardown_method(self, method):
end_exception_count = self.query_exception_count()
    # The number of exceptions may be less than it was during setup if the queries in
    # the EXCEPTION state were bumped out of the FINISHED list. We should never see an
    # increase in the number of queries in the EXCEPTION state.
assert end_exception_count <= self.start_exception_count
def query_exception_count(self):
"""Returns the number of occurrences of 'EXCEPTION' on the debug /queries page"""
return len(re.findall('EXCEPTION',
self.impalad_test_service.read_debug_webpage('queries')))
  def cleanup(self):
    for db_name in self.TEST_DBS:
      self.cleanup_db(db_name)
    self.cleanup_hdfs_dirs()
def cleanup_hdfs_dirs(self):
# Cleanup the test table HDFS dirs between test runs so there are no errors the next
# time a table is created with the same location. This also helps remove any stale
# data from the last test run.
self.hdfs_client.delete_file_dir("test-warehouse/part_data/", recursive=True)
self.hdfs_client.delete_file_dir("test-warehouse/t1_tmp1/", recursive=True)
self.hdfs_client.delete_file_dir("test-warehouse/t_part_tmp/", recursive=True)
@pytest.mark.execute_serially
def test_drop_cleans_hdfs_dirs(self):
self.hdfs_client.delete_file_dir("test-warehouse/ddl_test_db.db/", recursive=True)
assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")
self.client.execute('use default')
self.client.execute('create database ddl_test_db')
# Verify the db directory exists
assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")
self.client.execute("create table ddl_test_db.t1(i int)")
# Verify the table directory exists
assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/")
# Dropping the table removes the table's directory and preserves the db's directory
self.client.execute("drop table ddl_test_db.t1")
assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/")
assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")
# Dropping the db removes the db's directory
self.client.execute("drop database ddl_test_db")
assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/")
@pytest.mark.execute_serially
def test_create(self, vector):
vector.get_value('exec_option')['abort_on_error'] = False
self.__create_db_synced('ddl_test_db', vector)
self.run_test_case('QueryTest/create', vector, use_db='ddl_test_db',
multiple_impalad=self.__use_multiple_impalad(vector))
@pytest.mark.execute_serially
def test_alter_table(self, vector):
vector.get_value('exec_option')['abort_on_error'] = False
# Create directory for partition data that does not use the (key=value)
# format.
self.hdfs_client.make_dir("test-warehouse/part_data/", permission=777)
self.hdfs_client.create_file("test-warehouse/part_data/data.txt", file_data='1984')
# Create test databases
self.__create_db_synced('alter_table_test_db', vector)
self.__create_db_synced('alter_table_test_db2', vector)
self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db',
multiple_impalad=self.__use_multiple_impalad(vector))
@pytest.mark.execute_serially
def test_views_ddl(self, vector):
vector.get_value('exec_option')['abort_on_error'] = False
self.__create_db_synced('ddl_test_db', vector)
self.run_test_case('QueryTest/views-ddl', vector, use_db='ddl_test_db',
multiple_impalad=self.__use_multiple_impalad(vector))
@pytest.mark.execute_serially
def test_functions_ddl(self, vector):
self.__create_db_synced('function_ddl_test', vector)
self.run_test_case('QueryTest/functions-ddl', vector, use_db='function_ddl_test',
multiple_impalad=self.__use_multiple_impalad(vector))
@pytest.mark.execute_serially
def test_create_drop_function(self, vector):
# This will create, run, and drop the same function repeatedly, exercising the
# lib cache mechanism.
create_fn_stmt = """create function f() returns int
location '/test-warehouse/libTestUdfs.so' symbol='NoArgs'"""
select_stmt = """select f() from functional.alltypes limit 10"""
drop_fn_stmt = "drop function %s f()"
self.create_drop_ddl(vector, "udf_test", [create_fn_stmt], [drop_fn_stmt],
select_stmt)
@pytest.mark.execute_serially
def test_create_drop_data_src(self, vector):
# This will create, run, and drop the same data source repeatedly, exercising
# the lib cache mechanism.
create_ds_stmt = """CREATE DATA SOURCE test_data_src
LOCATION '/test-warehouse/data-sources/test-data-source.jar'
CLASS 'com.cloudera.impala.extdatasource.AllTypesDataSource'
API_VERSION 'V1'"""
create_tbl_stmt = """CREATE TABLE data_src_tbl (x int)
PRODUCED BY DATA SOURCE test_data_src"""
drop_ds_stmt = "drop data source %s test_data_src"
drop_tbl_stmt = "drop table %s data_src_tbl"
select_stmt = "select * from data_src_tbl limit 1"
create_stmts = [create_ds_stmt, create_tbl_stmt]
drop_stmts = [drop_tbl_stmt, drop_ds_stmt]
self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts,
select_stmt)
  def create_drop_ddl(self, vector, db_name, create_stmts, drop_stmts, select_stmt):
    # Helper method to run CREATE/DROP DDL statements repeatedly, exercising the lib
    # cache. create_stmts is the list of CREATE statements to be executed in order;
    # drop_stmts is the list of DROP statements to be executed in order. Each DROP
    # statement should have a '%s' placeholder to insert "if exists" or "".
    # select_stmt is a single statement to run after executing the CREATE statements.
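    # For example, the drop_stmt "drop function %s f()" is executed as
    # "drop function if exists f()" for the initial cleanup and as "drop function  f()"
    # inside the create/select/drop loop.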
# TODO: it's hard to tell that the cache is working (i.e. if it did nothing to drop
# the cache, these tests would still pass). Testing that is a bit harder and requires
# us to update the udf binary in the middle.
self.__create_db_synced(db_name, vector)
    self.client.set_configuration(vector.get_value('exec_option'))
    self.client.execute("use %s" % db_name)
    for drop_stmt in drop_stmts:
      self.client.execute(drop_stmt % "if exists")
    for i in xrange(1, 10):
      for create_stmt in create_stmts:
        self.client.execute(create_stmt)
      self.client.execute(select_stmt)
      for drop_stmt in drop_stmts:
        self.client.execute(drop_stmt % "")
@pytest.mark.execute_serially
def test_create_alter_bulk_partition(self, vector):
    # Scale the number of partitions with the exploration strategy: with 50 partitions
    # this test takes a few minutes to run; with 10 partitions it takes ~50s for the
    # two configurations.
    num_parts = 50
    if self.exploration_strategy() != 'exhaustive': num_parts = 10
self.client.execute("use default")
self.client.execute("drop table if exists foo_part")
self.client.execute("create table foo_part(i int) partitioned by(j int, s string)")
    # Add some partitions (first of two batches)
for i in xrange(num_parts / 5):
start = time.time()
self.client.execute("alter table foo_part add partition(j=%d, s='%s')" % (i, i))
print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start)
# Modify one of the partitions
self.client.execute("""alter table foo_part partition(j=1, s='1')
set fileformat parquetfile""")
# Alter one partition to a non-existent location twice (IMPALA-741)
self.hdfs_client.delete_file_dir("tmp/dont_exist1/", recursive=True)
self.hdfs_client.delete_file_dir("tmp/dont_exist2/", recursive=True)
self.execute_query_expect_success(self.client,
"alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist1'")
self.execute_query_expect_success(self.client,
"alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist2'")
# Add some more partitions
for i in xrange(num_parts / 5, num_parts):
start = time.time()
self.client.execute("alter table foo_part add partition(j=%d,s='%s')" % (i,i))
print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start)
# Insert data and verify it shows up.
self.client.execute("insert into table foo_part partition(j=1, s='1') select 1")
assert '1' == self.execute_scalar("select count(*) from foo_part")
self.client.execute("drop table foo_part")
@pytest.mark.execute_serially
def test_create_alter_tbl_properties(self, vector):
self.__create_db_synced('alter_table_test_db', vector)
self.client.execute("use alter_table_test_db")
# Specify TBLPROPERTIES and SERDEPROPERTIES at CREATE time
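    # Note: 'p1' is specified twice below; the second value ('v1') should win.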
self.client.execute("""create table test_alter_tbl (i int)
with serdeproperties ('s1'='s2', 's3'='s4')
tblproperties ('p1'='v0', 'p1'='v1')""")
properties = self.__get_tbl_properties('test_alter_tbl')
assert len(properties) == 2
# The transient_lastDdlTime is variable, so don't verify the value.
assert 'transient_lastDdlTime' in properties
del properties['transient_lastDdlTime']
assert {'p1': 'v1'} == properties
properties = self.__get_serde_properties('test_alter_tbl')
assert {'s1': 's2', 's3': 's4'} == properties
# Modify the SERDEPROPERTIES using ALTER TABLE SET.
self.client.execute("alter table test_alter_tbl set serdeproperties "\
"('s1'='new', 's5'='s6')")
properties = self.__get_serde_properties('test_alter_tbl')
assert {'s1': 'new', 's3': 's4', 's5': 's6'} == properties
# Modify the TBLPROPERTIES using ALTER TABLE SET.
self.client.execute("alter table test_alter_tbl set tblproperties "\
"('prop1'='val1', 'p2'='val2', 'p2'='val3', ''='')")
properties = self.__get_tbl_properties('test_alter_tbl')
assert len(properties) == 5
assert 'transient_lastDdlTime' in properties
del properties['transient_lastDdlTime']
assert {'p1': 'v1', 'prop1': 'val1', 'p2': 'val3', '': ''} == properties
@classmethod
def __use_multiple_impalad(cls, vector):
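    # With sync_ddl enabled, DDL changes are guaranteed to be visible on all impalads,
    # so the .test file statements can safely be spread across multiple nodes.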
return vector.get_value('exec_option')['sync_ddl'] == 1
@classmethod
def __create_db_synced(cls, db_name, vector):
"""Creates a database using synchronized DDL to ensure all nodes have the test
database available for use before executing the .test file(s).
"""
cls.client.execute('use default')
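    # sync_ddl=1 makes the CREATE DATABASE below wait until the new database is
    # visible on every impalad in the cluster.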
cls.client.set_configuration({'sync_ddl': 1})
cls.client.execute('create database %s' % db_name)
cls.client.set_configuration(vector.get_value('exec_option'))
def __get_tbl_properties(self, table_name):
"""Extracts the table properties mapping from the output of DESCRIBE FORMATTED"""
return self.__get_properties('Table Parameters:', table_name)
def __get_serde_properties(self, table_name):
"""Extracts the serde properties mapping from the output of DESCRIBE FORMATTED"""
return self.__get_properties('Storage Desc Params:', table_name)
  def __get_properties(self, section_name, table_name):
    """Extracts the 'section_name' properties mapping from DESCRIBE FORMATTED output"""
    result = self.client.execute("describe formatted " + table_name)
    match = False
    properties = dict()
    for row in result.data:
      if section_name in row:
        match = True
      elif match:
        # Rows within a section are tab-separated with an empty first column, e.g.
        # '\t<key>\t<value>'. The section ends when the key column reads 'NULL'.
        row = row.split('\t')
        if row[1] == 'NULL':
          break
        properties[row[1].rstrip()] = row[2].rstrip()
    return properties