#!/usr/bin/env python # Copyright (c) 2012 Cloudera, Inc. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Impala tests for DDL statements import logging import pytest import shlex import time from tests.common.test_result_verifier import * from subprocess import call from tests.common.test_vector import * from tests.common.test_dimensions import ALL_NODES_ONLY from tests.common.impala_test_suite import * # Validates DDL statements (create, drop) class TestDdlStatements(ImpalaTestSuite): TEST_DBS = ['ddl_test_db', 'alter_table_test_db', 'alter_table_test_db2', 'function_ddl_test', 'udf_test', 'data_src_test'] @classmethod def get_workload(self): return 'functional-query' @classmethod def add_test_dimensions(cls): super(TestDdlStatements, cls).add_test_dimensions() sync_ddl_opts = [0, 1] if cls.exploration_strategy() != 'exhaustive': # Only run with sync_ddl on exhaustive since it increases test runtime. sync_ddl_opts = [0] cls.TestMatrix.add_dimension(create_exec_option_dimension( cluster_sizes=ALL_NODES_ONLY, disable_codegen_options=[False], batch_sizes=[0], sync_ddl=sync_ddl_opts)) # There is no reason to run these tests using all dimensions. cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload())) def setup_method(self, method): self.cleanup() # Get the current number of queries that are in the 'EXCEPTION' state. Used for # verification after running each test case. self.start_exception_count = self.query_exception_count() self.cleanup_hdfs_dirs() def teardown_method(self, method): end_exception_count = self.query_exception_count() # The number of exceptions may be < than what was in setup if the queries in the # EXCEPTION state were bumped out of the FINISHED list. We should never see an # increase in the number of queries in the exception state. assert end_exception_count <= self.start_exception_count def query_exception_count(self): """Returns the number of occurrences of 'EXCEPTION' on the debug /queries page""" return len(re.findall('EXCEPTION', self.impalad_test_service.read_debug_webpage('queries'))) def cleanup(self): map(self.cleanup_db, self.TEST_DBS) self.cleanup_hdfs_dirs() def cleanup_hdfs_dirs(self): # Cleanup the test table HDFS dirs between test runs so there are no errors the next # time a table is created with the same location. This also helps remove any stale # data from the last test run. self.hdfs_client.delete_file_dir("test-warehouse/part_data/", recursive=True) self.hdfs_client.delete_file_dir("test-warehouse/t1_tmp1/", recursive=True) self.hdfs_client.delete_file_dir("test-warehouse/t_part_tmp/", recursive=True) @pytest.mark.execute_serially def test_drop_cleans_hdfs_dirs(self): self.hdfs_client.delete_file_dir("test-warehouse/ddl_test_db.db/", recursive=True) assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/") self.client.execute('use default') self.client.execute('create database ddl_test_db') # Verify the db directory exists assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/") self.client.execute("create table ddl_test_db.t1(i int)") # Verify the table directory exists assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/") # Dropping the table removes the table's directory and preserves the db's directory self.client.execute("drop table ddl_test_db.t1") assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/t1/") assert self.hdfs_client.exists("test-warehouse/ddl_test_db.db/") # Dropping the db removes the db's directory self.client.execute("drop database ddl_test_db") assert not self.hdfs_client.exists("test-warehouse/ddl_test_db.db/") @pytest.mark.execute_serially def test_create(self, vector): vector.get_value('exec_option')['abort_on_error'] = False self.__create_db_synced('ddl_test_db', vector) self.run_test_case('QueryTest/create', vector, use_db='ddl_test_db', multiple_impalad=self.__use_multiple_impalad(vector)) @pytest.mark.execute_serially def test_alter_table(self, vector): vector.get_value('exec_option')['abort_on_error'] = False # Create directory for partition data that does not use the (key=value) # format. self.hdfs_client.make_dir("test-warehouse/part_data/", permission=777) self.hdfs_client.create_file("test-warehouse/part_data/data.txt", file_data='1984') # Create test databases self.__create_db_synced('alter_table_test_db', vector) self.__create_db_synced('alter_table_test_db2', vector) self.run_test_case('QueryTest/alter-table', vector, use_db='alter_table_test_db', multiple_impalad=self.__use_multiple_impalad(vector)) @pytest.mark.execute_serially def test_views_ddl(self, vector): vector.get_value('exec_option')['abort_on_error'] = False self.__create_db_synced('ddl_test_db', vector) self.run_test_case('QueryTest/views-ddl', vector, use_db='ddl_test_db', multiple_impalad=self.__use_multiple_impalad(vector)) @pytest.mark.execute_serially def test_functions_ddl(self, vector): self.__create_db_synced('function_ddl_test', vector) self.run_test_case('QueryTest/functions-ddl', vector, use_db='function_ddl_test', multiple_impalad=self.__use_multiple_impalad(vector)) @pytest.mark.execute_serially def test_create_drop_function(self, vector): # This will create, run, and drop the same function repeatedly, exercising the # lib cache mechanism. create_fn_stmt = """create function f() returns int location '/test-warehouse/libTestUdfs.so' symbol='NoArgs'""" select_stmt = """select f() from functional.alltypes limit 10""" drop_fn_stmt = "drop function %s f()" self.create_drop_ddl(vector, "udf_test", [create_fn_stmt], [drop_fn_stmt], select_stmt) @pytest.mark.execute_serially def test_create_drop_data_src(self, vector): # This will create, run, and drop the same data source repeatedly, exercising # the lib cache mechanism. create_ds_stmt = """CREATE DATA SOURCE test_data_src LOCATION '/test-warehouse/data-sources/test-data-source.jar' CLASS 'com.cloudera.impala.extdatasource.AllTypesDataSource' API_VERSION 'V1'""" create_tbl_stmt = """CREATE TABLE data_src_tbl (x int) PRODUCED BY DATA SOURCE test_data_src""" drop_ds_stmt = "drop data source %s test_data_src" drop_tbl_stmt = "drop table %s data_src_tbl" select_stmt = "select * from data_src_tbl limit 1" create_stmts = [create_ds_stmt, create_tbl_stmt] drop_stmts = [drop_tbl_stmt, drop_ds_stmt] self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts, select_stmt) def create_drop_ddl(self, vector, db_name, create_stmts, drop_stmts, select_stmt): # Helper method to run CREATE/DROP DDL commands repeatedly and exercise the lib cache # create_stmts is the list of CREATE statements to be executed in order drop_stmts is # the list of DROP statements to be executed in order. Each statement should have a # '%s' placeholder to insert "IF EXISTS" or "". The select_stmt is just a single # statement to test after executing the CREATE statements. # TODO: it's hard to tell that the cache is working (i.e. if it did nothing to drop # the cache, these tests would still pass). Testing that is a bit harder and requires # us to update the udf binary in the middle. self.__create_db_synced(db_name, vector) self.client.set_configuration(vector.get_value('exec_option')) self.client.execute("use %s" % (db_name,)) for drop_stmt in drop_stmts: self.client.execute(drop_stmt % ("if exists")) for i in xrange(1, 10): for create_stmt in create_stmts: self.client.execute(create_stmt) self.client.execute(select_stmt) for drop_stmt in drop_stmts: self.client.execute(drop_stmt % ("")) @pytest.mark.execute_serially def test_create_alter_bulk_partition(self, vector): # Change the scale depending on the exploration strategy, with 50 partitions this # takes a few minutes to run, with 10 partitions it takes ~50s for two configurations. num_parts = 50 if self.exploration_strategy() != 'exhaustive': num_parts = 10 self.client.execute("use default") self.client.execute("drop table if exists foo_part") self.client.execute("create table foo_part(i int) partitioned by(j int, s string)") # Add some partitions (first batch of two) for i in xrange(num_parts / 5): start = time.time() self.client.execute("alter table foo_part add partition(j=%d, s='%s')" % (i, i)) print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start) # Modify one of the partitions self.client.execute("""alter table foo_part partition(j=1, s='1') set fileformat parquetfile""") # Alter one partition to a non-existent location twice (IMPALA-741) self.hdfs_client.delete_file_dir("tmp/dont_exist1/", recursive=True) self.hdfs_client.delete_file_dir("tmp/dont_exist2/", recursive=True) self.execute_query_expect_success(self.client, "alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist1'") self.execute_query_expect_success(self.client, "alter table foo_part partition(j=1,s='1') set location '/tmp/dont_exist2'") # Add some more partitions for i in xrange(num_parts / 5, num_parts): start = time.time() self.client.execute("alter table foo_part add partition(j=%d,s='%s')" % (i,i)) print 'ADD PARTITION #%d exec time: %s' % (i, time.time() - start) # Insert data and verify it shows up. self.client.execute("insert into table foo_part partition(j=1, s='1') select 1") assert '1' == self.execute_scalar("select count(*) from foo_part") self.client.execute("drop table foo_part") @pytest.mark.execute_serially def test_create_alter_tbl_properties(self, vector): self.__create_db_synced('alter_table_test_db', vector) self.client.execute("use alter_table_test_db") # Specify TBLPROPERTIES and SERDEPROPERTIES at CREATE time self.client.execute("""create table test_alter_tbl (i int) with serdeproperties ('s1'='s2', 's3'='s4') tblproperties ('p1'='v0', 'p1'='v1')""") properties = self.__get_tbl_properties('test_alter_tbl') assert len(properties) == 2 # The transient_lastDdlTime is variable, so don't verify the value. assert 'transient_lastDdlTime' in properties del properties['transient_lastDdlTime'] assert {'p1': 'v1'} == properties properties = self.__get_serde_properties('test_alter_tbl') assert {'s1': 's2', 's3': 's4'} == properties # Modify the SERDEPROPERTIES using ALTER TABLE SET. self.client.execute("alter table test_alter_tbl set serdeproperties "\ "('s1'='new', 's5'='s6')") properties = self.__get_serde_properties('test_alter_tbl') assert {'s1': 'new', 's3': 's4', 's5': 's6'} == properties # Modify the TBLPROPERTIES using ALTER TABLE SET. self.client.execute("alter table test_alter_tbl set tblproperties "\ "('prop1'='val1', 'p2'='val2', 'p2'='val3', ''='')") properties = self.__get_tbl_properties('test_alter_tbl') assert len(properties) == 5 assert 'transient_lastDdlTime' in properties del properties['transient_lastDdlTime'] assert {'p1': 'v1', 'prop1': 'val1', 'p2': 'val3', '': ''} == properties @classmethod def __use_multiple_impalad(cls, vector): return vector.get_value('exec_option')['sync_ddl'] == 1 @classmethod def __create_db_synced(cls, db_name, vector): """Creates a database using synchronized DDL to ensure all nodes have the test database available for use before executing the .test file(s). """ cls.client.execute('use default') cls.client.set_configuration({'sync_ddl': 1}) cls.client.execute('create database %s' % db_name) cls.client.set_configuration(vector.get_value('exec_option')) def __get_tbl_properties(self, table_name): """Extracts the table properties mapping from the output of DESCRIBE FORMATTED""" return self.__get_properties('Table Parameters:', table_name) def __get_serde_properties(self, table_name): """Extracts the serde properties mapping from the output of DESCRIBE FORMATTED""" return self.__get_properties('Storage Desc Params:', table_name) def __get_properties(self, section_name, table_name): """Extracts the table properties mapping from the output of DESCRIBE FORMATTED""" result = self.client.execute("describe formatted " + table_name) match = False properties = dict(); for row in result.data: if section_name in row: match = True elif match: row = row.split('\t') if (row[1] == 'NULL'): break properties[row[1].rstrip()] = row[2].rstrip() return properties