Files
impala/tests/query_test/test_udfs.py
Skye Wanderman-Milne bd2fc2d1d4 IMPALA-934: Refresh cached UDF library when creating a new function
This change adds the ability to refresh a local cache entry, causing
the old cache entry to be dropped and the library to be reloaded from
HDFS. This is used in ResolveSymbolLookup(), which is called by the
frontend when creating a new function, and in ImpalaServer when
receiving a "create function" heartbeat. This change also makes sure
the FE calls into the backend for jars, so jars get refreshed as well.

Change-Id: I5fd61c1bc2e04838449335d5a68b61af8b101b01
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2286
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
Tested-by: jenkins
(cherry picked from commit e8587794b3b82438190c91b2ebe9d1e12db73981)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2348
2014-04-24 19:39:16 -07:00

310 lines
13 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.common.impala_cluster import ImpalaCluster
from subprocess import call
class TestUdfs(ImpalaTestSuite):
  """Tests for Impala UDFs/UDAs: native (.so), LLVM IR (.ll) and Hive (jar)
  functions, including refreshing the local library cache when the binary on
  HDFS is replaced (via drop or via create) and dropping a function while a
  query is using it."""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUdfs, cls).add_test_dimensions()
    # Without limiting the test suite to a single exec option, the tests will fail
    # because the same test case may be executed in parallel with different exec option
    # values leading to conflicting DDL ops.
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))

  def test_native_functions(self, vector):
    """Creates native UDFs/UDAs from the test .so libraries and runs them."""
    database = 'native_function_test'
    self.__load_functions(
        self.create_udfs_template, vector, database, '/test-warehouse/libTestUdfs.so')
    self.__load_functions(
        self.create_udas_template, vector, database, '/test-warehouse/libudasample.so')
    self.run_test_case('QueryTest/udf', vector, use_db=database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
    self.run_test_case('QueryTest/uda', vector, use_db=database)

  def test_ir_functions(self, vector):
    """Creates UDFs from the LLVM IR test library and runs them."""
    database = 'ir_function_test'
    self.__load_functions(
        self.create_udfs_template, vector, database, '/test-warehouse/test-udfs.ll')
    self.run_test_case('QueryTest/udf', vector, use_db=database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)

  def test_hive_udfs(self, vector):
    """Loads Hive (jar-based) UDFs and runs them."""
    self.client.execute('create database if not exists udf_test')
    self.client.execute('create database if not exists uda_test')
    self.run_test_case('QueryTest/load-hive-udfs', vector)
    self.run_test_case('QueryTest/hive-udf', vector)

  def test_libs_with_same_filenames(self, vector):
    """Libraries with identical filenames in different dirs must not collide."""
    self.run_test_case('QueryTest/libs_with_same_filenames', vector)

  def test_udf_update_via_drop(self, vector):
    """Test updating the UDF binary without restarting Impala. Dropping
    the function should remove the binary from the local cache."""
    # Run with sync_ddl to guarantee the drop is processed by all impalads.
    exec_options = vector.get_value('exec_option')
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(os.environ['IMPALA_HOME'],
        'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(os.environ['IMPALA_HOME'],
        'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = '/test-warehouse/impala-hive-udfs2.jar'
    drop_fn_stmt = 'drop function if exists default.udf_update_test_drop()'
    create_fn_stmt = "create function default.udf_update_test_drop() returns string "\
        "LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'"
    query_stmt = "select default.udf_update_test_drop()"
    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"])
    # Update the binary, drop and create the function again. The new binary should
    # be running.
    call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])

  def test_udf_update_via_create(self, vector):
    """Test updating the UDF binary without restarting Impala. Creating a new function
    from the library should refresh the cache."""
    # Run with sync_ddl to guarantee the create is processed by all impalads.
    exec_options = vector.get_value('exec_option')
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(os.environ['IMPALA_HOME'],
        'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(os.environ['IMPALA_HOME'],
        'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = '/test-warehouse/impala-hive-udfs3.jar'
    old_function_name = "udf_update_test_create1"
    new_function_name = "udf_update_test_create2"
    drop_fn_template = 'drop function if exists default.%s()'
    self.execute_query_expect_success(
        self.client, drop_fn_template % old_function_name, exec_options)
    self.execute_query_expect_success(
        self.client, drop_fn_template % new_function_name, exec_options)
    create_fn_template = "create function default.%s() returns string "\
        "LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'"
    query_template = "select default.%s()"
    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template % old_function_name, exec_options)
    self.__run_query_all_impalads(
        exec_options, query_template % old_function_name, ["Old UDF"])
    # Update the binary, and create a new function using the binary. The new binary
    # should be running.
    call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template % new_function_name, exec_options)
    self.__run_query_all_impalads(
        exec_options, query_template % new_function_name, ["New UDF"])
    # The old function should use the new library now
    self.__run_query_all_impalads(
        exec_options, query_template % old_function_name, ["New UDF"])

  def test_drop_function_while_running(self, vector):
    """Dropping a UDF while a query uses it must not break the running query."""
    self.client.execute("drop function if exists default.drop_while_running(BIGINT)")
    self.client.execute("create function default.drop_while_running(BIGINT) returns "\
        "BIGINT LOCATION '/test-warehouse/libTestUdfs.so' SYMBOL='Identity'")
    query = \
        "select default.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"
    # Run this query asynchronously.
    handle = self.execute_query_async(query, vector.get_value('exec_option'),
        table_format=vector.get_value('table_format'))
    # Fetch some rows from the async query to make sure the UDF is being used
    results = self.client.fetch(query, handle, 1)
    assert results.success
    assert len(results.data) == 1
    # Drop the function while the original query is running.
    self.client.execute("drop function default.drop_while_running(BIGINT)")
    # Fetch the rest of the rows, this should still be able to run the UDF
    results = self.client.fetch(query, handle, -1)
    assert results.success
    assert len(results.data) == 9999

  # Run serially because this will blow the process limit, potentially causing other
  # queries to fail
  @pytest.mark.execute_serially
  def test_mem_limits(self, vector):
    """Runs UDF/UDA queries under a tight mem_limit and expects them to fail
    with a memory-limit (or resulting cancellation) error."""
    # Set the mem limit high enough that a simple scan can run
    mem_limit = 1024 * 1024
    vector.get_value('exec_option')['mem_limit'] = mem_limit
    try:
      self.run_test_case('QueryTest/udf-mem-limit', vector)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:  # 'as' form: works on both Python 2 and 3
      self.__check_exception(e)
    try:
      self.run_test_case('QueryTest/uda-mem-limit', vector)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:
      self.__check_exception(e)

  def __check_exception(self, e):
    """Swallows 'e' if it is an expected memory-limit/cancellation error;
    re-raises it otherwise."""
    # The interesting exception message may be in 'e' or in its inner_exception
    # depending on the point of query failure.
    if 'Memory limit exceeded' in str(e) or 'Cancelled' in str(e):
      return
    # BUGFIX: the original tested "'Cancelled' not in", which made this guard
    # true for almost any inner exception and silently swallowed real failures.
    # Mirror the str(e) branch above: accept only the two expected messages.
    if e.inner_exception is not None\
        and ('Memory limit exceeded' in e.inner_exception.message
             or 'Cancelled' in e.inner_exception.message):
      return
    raise e

  def __run_query_all_impalads(self, exec_options, query, expected):
    """Runs 'query' against every impalad in the cluster and asserts each
    returns exactly 'expected' (a list of row strings)."""
    impala_cluster = ImpalaCluster()
    for impalad in impala_cluster.impalads:
      client = impalad.service.create_beeswax_client()
      result = self.execute_query_expect_success(client, query, exec_options)
      assert result.data == expected

  def __load_functions(self, template, vector, database, location):
    """Formats 'template' with 'database'/'location' and executes each ';'
    separated statement, asserting success."""
    queries = template.format(database=database, location=location)
    # Split into statements and drop empty/whitespace-only fragments.
    queries = [q for q in queries.split(';') if q.strip()]
    exec_options = vector.get_value('exec_option')
    for query in queries:
      result = self.execute_query_expect_success(self.client, query, exec_options)
      assert result is not None

  # Create test UDA functions in {database} from library {location}
  create_udas_template = """
drop function if exists {database}.test_count(int);
drop function if exists {database}.hll(int);
create database if not exists {database};
create aggregate function {database}.test_count(int) returns bigint
location '{location}' update_fn='CountUpdate';
create aggregate function {database}.hll(int) returns string
location '{location}' update_fn='HllUpdate';
"""

  # Create test UDF functions in {database} from library {location}
  create_udfs_template = """
drop function if exists {database}.identity(boolean);
drop function if exists {database}.identity(tinyint);
drop function if exists {database}.identity(smallint);
drop function if exists {database}.identity(int);
drop function if exists {database}.identity(bigint);
drop function if exists {database}.identity(float);
drop function if exists {database}.identity(double);
drop function if exists {database}.identity(string);
drop function if exists {database}.identity(timestamp);
drop function if exists {database}.all_types_fn(
string, boolean, tinyint, smallint, int, bigint, float, double);
drop function if exists {database}.no_args();
drop function if exists {database}.var_and(boolean...);
drop function if exists {database}.var_sum(int...);
drop function if exists {database}.var_sum(double...);
drop function if exists {database}.var_sum(string...);
drop function if exists {database}.var_sum_multiply(double, int...);
drop function if exists {database}.constant_timestamp();
drop function if exists {database}.validate_arg_type(string);
drop function if exists {database}.count_rows();
drop function if exists {database}.constant_arg(int);
drop function if exists {database}.validate_open(int);
create database if not exists {database};
create function {database}.identity(boolean) returns boolean
location '{location}' symbol='Identity';
create function {database}.identity(tinyint) returns tinyint
location '{location}' symbol='Identity';
create function {database}.identity(smallint) returns smallint
location '{location}' symbol='Identity';
create function {database}.identity(int) returns int
location '{location}' symbol='Identity';
create function {database}.identity(bigint) returns bigint
location '{location}' symbol='Identity';
create function {database}.identity(float) returns float
location '{location}' symbol='Identity';
create function {database}.identity(double) returns double
location '{location}' symbol='Identity';
create function {database}.identity(string) returns string
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE';
create function {database}.identity(timestamp) returns timestamp
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_12TimestampValE';
create function {database}.all_types_fn(
string, boolean, tinyint, smallint, int, bigint, float, double)
returns int
location '{location}' symbol='AllTypes';
create function {database}.no_args() returns string
location '{location}'
symbol='_Z6NoArgsPN10impala_udf15FunctionContextE';
create function {database}.var_and(boolean...) returns boolean
location '{location}' symbol='VarAnd';
create function {database}.var_sum(int...) returns int
location '{location}' symbol='VarSum';
create function {database}.var_sum(double...) returns double
location '{location}' symbol='VarSum';
create function {database}.var_sum(string...) returns int
location '{location}' symbol='VarSum';
create function {database}.var_sum_multiply(double, int...) returns double
location '{location}'
symbol='_Z14VarSumMultiplyPN10impala_udf15FunctionContextERKNS_9DoubleValEiPKNS_6IntValE';
create function {database}.constant_timestamp() returns timestamp
location '{location}' symbol='ConstantTimestamp';
create function {database}.validate_arg_type(string) returns boolean
location '{location}' symbol='ValidateArgType';
create function {database}.count_rows() returns bigint
location '{location}' symbol='Count' prepare_fn='CountPrepare' close_fn='CountClose';
create function {database}.constant_arg(int) returns int
location '{location}' symbol='ConstantArg' prepare_fn='ConstantArgPrepare' close_fn='ConstantArgClose';
create function {database}.validate_open(int) returns boolean
location '{location}' symbol='ValidateOpen'
prepare_fn='ValidateOpenPrepare' close_fn='ValidateOpenClose';
"""