With this change, leaky UDFs built with the SDK will still fail when using the
test harness, but leaky UDFs running in Impala will only trigger a warning.
This change also updates the test infrastructure to always check for non-fatal
errors/warnings.

Change-Id: I5615349b9d691e4eddea3e03e152ef12e73835e7
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2844
Reviewed-by: Skye Wanderman-Milne <skye@cloudera.com>
Tested-by: jenkins
(cherry picked from commit 60ce5190d96add6104aba642d2354d87a26000fa)
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2938
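A minimal sketch of what this means in practice (illustrative only: the query,
the result attributes, and the warning text below are assumptions, not the
real test harness API; mem_test_leaks() is defined by the SQL template at the
bottom of this file):

    # Hypothetical sketch: with this change, a query over the intentionally
    # leaky mem_test_leaks() UDF succeeds, and the leak surfaces as a
    # non-fatal warning instead of failing the query.
    result = client.execute(
        'select count(mem_test_leaks(bigint_col)) from functional.alltypes')
    assert result.success  # the query itself no longer fails
    # The updated test infrastructure then checks for non-fatal
    # errors/warnings, e.g. a message like "UDF WARNING: memory leaked"
    # (wording assumed).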
#!/usr/bin/env python
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.

# os and pytest are used directly below, so import them explicitly rather
# than relying on the wildcard imports to provide them.
import os
import pytest

from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from tests.common.impala_cluster import ImpalaCluster
from subprocess import call

class TestUdfs(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUdfs, cls).add_test_dimensions()
    # Without limiting the test suite to a single exec option, the tests will fail
    # because the same test case may be executed in parallel with different exec
    # option values, leading to conflicting DDL ops.
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())

    # There is no reason to run these tests using all dimensions.
    cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))

  def test_native_functions(self, vector):
    database = 'native_function_test'

    self.__load_functions(
        self.create_udfs_template, vector, database, '/test-warehouse/libTestUdfs.so')
    self.__load_functions(
        self.create_udas_template, vector, database, '/test-warehouse/libudasample.so')

    self.run_test_case('QueryTest/udf', vector, use_db=database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)
    self.run_test_case('QueryTest/uda', vector, use_db=database)

  def test_ir_functions(self, vector):
    database = 'ir_function_test'
    self.__load_functions(
        self.create_udfs_template, vector, database, '/test-warehouse/test-udfs.ll')
    self.run_test_case('QueryTest/udf', vector, use_db=database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=database)

  def test_udf_errors(self, vector):
    self.run_test_case('QueryTest/udf-errors', vector)

  def test_hive_udfs(self, vector):
    self.client.execute('create database if not exists udf_test')
    self.client.execute('create database if not exists uda_test')
    self.run_test_case('QueryTest/load-hive-udfs', vector)
    self.run_test_case('QueryTest/hive-udf', vector)

  def test_libs_with_same_filenames(self, vector):
    self.run_test_case('QueryTest/libs_with_same_filenames', vector)
def test_udf_update_via_drop(self, vector):
|
|
"""Test updating the UDF binary without restarting Impala. Dropping
|
|
the function should remove the binary from the local cache."""
|
|
# Run with sync_ddl to guarantee the drop is processed by all impalads.
|
|
exec_options = vector.get_value('exec_option')
|
|
exec_options['sync_ddl'] = 1
|
|
old_udf = os.path.join(os.environ['IMPALA_HOME'],
|
|
'testdata/udfs/impala-hive-udfs.jar')
|
|
new_udf = os.path.join(os.environ['IMPALA_HOME'],
|
|
'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
|
|
udf_dst = '/test-warehouse/impala-hive-udfs2.jar'
|
|
|
|
drop_fn_stmt = 'drop function if exists default.udf_update_test_drop()'
|
|
create_fn_stmt = "create function default.udf_update_test_drop() returns string "\
|
|
"LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'"
|
|
query_stmt = "select default.udf_update_test_drop()"
|
|
|
|
# Put the old UDF binary on HDFS, make the UDF in Impala and run it.
|
|
call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
|
|
self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
|
|
self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
|
|
self.__run_query_all_impalads(exec_options, query_stmt, ["Old UDF"])
|
|
|
|
# Update the binary, drop and create the function again. The new binary should
|
|
# be running.
|
|
call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
|
|
self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
|
|
self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
|
|
self.__run_query_all_impalads(exec_options, query_stmt, ["New UDF"])
|
|
|
|

  def test_udf_update_via_create(self, vector):
    """Test updating the UDF binary without restarting Impala. Creating a new function
    from the library should refresh the cache."""
    # Run with sync_ddl to guarantee the create is processed by all impalads.
    exec_options = vector.get_value('exec_option')
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(os.environ['IMPALA_HOME'],
        'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(os.environ['IMPALA_HOME'],
        'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = '/test-warehouse/impala-hive-udfs3.jar'
    old_function_name = "udf_update_test_create1"
    new_function_name = "udf_update_test_create2"

    drop_fn_template = 'drop function if exists default.%s()'
    self.execute_query_expect_success(
        self.client, drop_fn_template % old_function_name, exec_options)
    self.execute_query_expect_success(
        self.client, drop_fn_template % new_function_name, exec_options)

    create_fn_template = "create function default.%s() returns string "\
        "LOCATION '" + udf_dst + "' SYMBOL='com.cloudera.impala.TestUpdateUdf'"
    query_template = "select default.%s()"

    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template % old_function_name, exec_options)
    self.__run_query_all_impalads(
        exec_options, query_template % old_function_name, ["Old UDF"])

    # Update the binary, and create a new function using the binary. The new binary
    # should be running.
    call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template % new_function_name, exec_options)
    self.__run_query_all_impalads(
        exec_options, query_template % new_function_name, ["New UDF"])

    # The old function should use the new library now.
    self.__run_query_all_impalads(
        exec_options, query_template % old_function_name, ["New UDF"])

  def test_drop_function_while_running(self, vector):
    self.client.execute("drop function if exists default.drop_while_running(BIGINT)")
    self.client.execute("create function default.drop_while_running(BIGINT) returns "\
        "BIGINT LOCATION '/test-warehouse/libTestUdfs.so' SYMBOL='Identity'")
    query = \
        "select default.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"

    # Run this query asynchronously.
    handle = self.execute_query_async(query, vector.get_value('exec_option'),
        table_format=vector.get_value('table_format'))

    # Fetch some rows from the async query to make sure the UDF is being used.
    results = self.client.fetch(query, handle, 1)
    assert results.success
    assert len(results.data) == 1

    # Drop the function while the original query is running.
    self.client.execute("drop function default.drop_while_running(BIGINT)")

    # Fetch the rest of the rows; the query should still be able to run the UDF.
    results = self.client.fetch(query, handle, -1)
    assert results.success
    assert len(results.data) == 9999

  # Run serially because this will blow the process limit, potentially causing
  # other queries to fail.
  @pytest.mark.execute_serially
  def test_mem_limits(self, vector):
    # Set the mem limit just high enough (1 MB) that a simple scan can run.
    mem_limit = 1024 * 1024
    vector.get_value('exec_option')['mem_limit'] = mem_limit

    try:
      self.run_test_case('QueryTest/udf-mem-limit', vector)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:
      self.__check_exception(e)

    try:
      self.run_test_case('QueryTest/uda-mem-limit', vector)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:
      self.__check_exception(e)

  def __check_exception(self, e):
    # The interesting exception message may be in 'e' or in its inner_exception
    # depending on the point of query failure. Only a mem limit error or a
    # cancellation counts as the expected failure; anything else is re-raised.
    if 'Memory limit exceeded' in str(e) or 'Cancelled' in str(e):
      return
    if (e.inner_exception is not None
        and ('Memory limit exceeded' in e.inner_exception.message
             or 'Cancelled' in e.inner_exception.message)):
      return
    raise e

  def __run_query_all_impalads(self, exec_options, query, expected):
    impala_cluster = ImpalaCluster()
    for impalad in impala_cluster.impalads:
      client = impalad.service.create_beeswax_client()
      result = self.execute_query_expect_success(client, query, exec_options)
      assert result.data == expected

  def __load_functions(self, template, vector, database, location):
    queries = template.format(database=database, location=location)
    # Split into individual statements and drop empty ones, so no per-query
    # empty check is needed in the loop below.
    queries = [q for q in queries.split(';') if q.strip()]
    exec_options = vector.get_value('exec_option')
    for query in queries:
      result = self.execute_query_expect_success(self.client, query, exec_options)
      assert result is not None

  # Create test UDA functions in {database} from library {location}
  create_udas_template = """
drop function if exists {database}.test_count(int);
drop function if exists {database}.hll(int);
drop function if exists {database}.sum_small_decimal(decimal(9,2));

create database if not exists {database};

create aggregate function {database}.test_count(int) returns bigint
location '{location}' update_fn='CountUpdate';

create aggregate function {database}.hll(int) returns string
location '{location}' update_fn='HllUpdate';

create aggregate function {database}.sum_small_decimal(decimal(9,2))
returns decimal(9,2) location '{location}' update_fn='SumSmallDecimalUpdate';
"""

  # Create test UDF functions in {database} from library {location}
  create_udfs_template = """
drop function if exists {database}.identity(boolean);
drop function if exists {database}.identity(tinyint);
drop function if exists {database}.identity(smallint);
drop function if exists {database}.identity(int);
drop function if exists {database}.identity(bigint);
drop function if exists {database}.identity(float);
drop function if exists {database}.identity(double);
drop function if exists {database}.identity(string);
drop function if exists {database}.identity(timestamp);
drop function if exists {database}.identity(decimal(9,0));
drop function if exists {database}.identity(decimal(18,1));
drop function if exists {database}.identity(decimal(38,10));
drop function if exists {database}.all_types_fn(
    string, boolean, tinyint, smallint, int, bigint, float, double, decimal(2,0));
drop function if exists {database}.no_args();
drop function if exists {database}.var_and(boolean...);
drop function if exists {database}.var_sum(int...);
drop function if exists {database}.var_sum(double...);
drop function if exists {database}.var_sum(string...);
drop function if exists {database}.var_sum(decimal(4,2)...);
drop function if exists {database}.var_sum_multiply(double, int...);
drop function if exists {database}.constant_timestamp();
drop function if exists {database}.validate_arg_type(string);
drop function if exists {database}.count_rows();
drop function if exists {database}.constant_arg(int);
drop function if exists {database}.validate_open(int);
drop function if exists {database}.mem_test(bigint);
drop function if exists {database}.mem_test_leaks(bigint);

create database if not exists {database};

create function {database}.identity(boolean) returns boolean
location '{location}' symbol='Identity';

create function {database}.identity(tinyint) returns tinyint
location '{location}' symbol='Identity';

create function {database}.identity(smallint) returns smallint
location '{location}' symbol='Identity';

create function {database}.identity(int) returns int
location '{location}' symbol='Identity';

create function {database}.identity(bigint) returns bigint
location '{location}' symbol='Identity';

create function {database}.identity(float) returns float
location '{location}' symbol='Identity';

create function {database}.identity(double) returns double
location '{location}' symbol='Identity';

create function {database}.identity(string) returns string
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE';

create function {database}.identity(timestamp) returns timestamp
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_12TimestampValE';

create function {database}.identity(decimal(9,0)) returns decimal(9,0)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.identity(decimal(18,1)) returns decimal(18,1)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.identity(decimal(38,10)) returns decimal(38,10)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.all_types_fn(
    string, boolean, tinyint, smallint, int, bigint, float, double, decimal(2,0))
returns int
location '{location}' symbol='AllTypes';

create function {database}.no_args() returns string
location '{location}'
symbol='_Z6NoArgsPN10impala_udf15FunctionContextE';

create function {database}.var_and(boolean...) returns boolean
location '{location}' symbol='VarAnd';

create function {database}.var_sum(int...) returns int
location '{location}' symbol='VarSum';

create function {database}.var_sum(double...) returns double
location '{location}' symbol='VarSum';

create function {database}.var_sum(string...) returns int
location '{location}' symbol='VarSum';

create function {database}.var_sum(decimal(4,2)...) returns decimal(18,2)
location '{location}' symbol='VarSum';

create function {database}.var_sum_multiply(double, int...) returns double
location '{location}'
symbol='_Z14VarSumMultiplyPN10impala_udf15FunctionContextERKNS_9DoubleValEiPKNS_6IntValE';

create function {database}.constant_timestamp() returns timestamp
location '{location}' symbol='ConstantTimestamp';

create function {database}.validate_arg_type(string) returns boolean
location '{location}' symbol='ValidateArgType';

create function {database}.count_rows() returns bigint
location '{location}' symbol='Count' prepare_fn='CountPrepare' close_fn='CountClose';

create function {database}.constant_arg(int) returns int
location '{location}' symbol='ConstantArg' prepare_fn='ConstantArgPrepare' close_fn='ConstantArgClose';

create function {database}.validate_open(int) returns boolean
location '{location}' symbol='ValidateOpen'
prepare_fn='ValidateOpenPrepare' close_fn='ValidateOpenClose';

create function {database}.mem_test(bigint) returns bigint
location '{location}' symbol='MemTest'
prepare_fn='MemTestPrepare' close_fn='MemTestClose';

create function {database}.mem_test_leaks(bigint) returns bigint
location '{location}' symbol='MemTest'
prepare_fn='MemTestPrepare';
"""
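  # mem_test_leaks deliberately reuses the allocating MemTest symbol without a
  # close_fn, so memory allocated by the UDF is never freed. As described in
  # the change summary at the top, such a leak now triggers only a warning
  # when the UDF runs in Impala (the SDK test harness still treats it as a
  # failure).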