This is similar to the single-node execution optimisation, but applies to slightly larger queries that should run in a distributed manner but won't benefit from codegen. This adds a new query option disable_codegen_rows_threshold that defaults to 50,000. If fewer than this number of rows are processed by a plan node per impalad, the cost of codegen almost certainly outweighs the benefit.

Using rows processed as a threshold is justified by a simple model that assumes the cost of codegen and the per-row cost of execution for the same operation are proportional. E.g. if x is the complexity of the operation, n is the number of rows processed, C is a constant factor giving the cost of codegen, and Ec/Ei are constant factors giving the per-row cost of codegen'd and interpreted execution, then the cost of the codegen'd operator is C * x + Ec * x * n and the cost of the interpreted operator is Ei * x * n. Rearranging shows that interpretation is cheaper if n < C / (Ei - Ec), i.e. (at least with this simplified model) it makes sense to choose between interpretation and codegen based on a constant row-count threshold. The model also implies that it is somewhat safer to choose codegen, because the additional cost of codegen is O(1) while the additional cost of interpretation is O(n).

I ran some experiments with TPC-H Q1, varying the input table size, to determine the cutover point at which codegen becomes beneficial. The cutover was around 150k rows per node for both text and parquet. At 50k rows per node disabling codegen was very beneficial - around 0.12s versus 0.24s. To be somewhat conservative I set the default threshold to 50k rows. On more complex queries, e.g. TPC-H Q10, the cutover tends to be higher because some plan nodes process many fewer than the maximum number of rows.

Also fix a couple of minor issues in the frontend: the numNodes_ calculation could return 0 for Kudu, and the single-node optimization didn't correctly handle a scan node with conjuncts, a limit and missing stats (it still considered the estimate valid).

Testing: Updated e2e tests that set disable_codegen to also set disable_codegen_rows_threshold to 0, so that those tests still run both with and without codegen. Added an e2e test to make sure that the optimisation is applied in the backend. Added planner tests for various cases where codegen should and shouldn't be disabled.

Perf: Added a targeted perf test for a join+agg over a small input, which benefits from this change.

Change-Id: I273bcee58641f5b97de52c0b2caab043c914b32e
Reviewed-on: http://gerrit.cloudera.org:8080/7153
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins
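To make the arithmetic above concrete, here is a minimal sketch of the cost model in Python; the function name, variable names and constants are illustrative only and do not appear in the Impala code:

def interpretation_is_cheaper(n, x, C, Ec, Ei):
  # Total cost with codegen: pay C * x once up front, then Ec * x per row.
  codegen_cost = C * x + Ec * x * n
  # Total cost with interpretation: Ei * x per row, no up-front cost.
  interpreted_cost = Ei * x * n
  # interpreted_cost < codegen_cost  <=>  Ei * x * n < C * x + Ec * x * n.
  # The complexity x cancels, leaving n < C / (Ei - Ec): a constant row threshold.
  return interpreted_cost < codegen_cost

# Example: with C = 150000, Ec = 1 and Ei = 2 (arbitrary units), interpretation
# wins below 150k rows per node, independent of the operation's complexity x.
assert interpretation_is_cheaper(n=50000, x=10, C=150000.0, Ec=1.0, Ei=2.0)
assert not interpretation_is_cheaper(n=500000, x=10, C=150000.0, Ec=1.0, Ei=2.0)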
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from copy import copy
import os
import pytest
from subprocess import check_call

from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_cluster import ImpalaCluster
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfLocal
from tests.common.test_dimensions import (
    create_exec_option_dimension,
    create_exec_option_dimension_from_dict,
    create_uncompressed_text_dimension)
from tests.util.calculation_util import get_random_id
from tests.util.filesystem_utils import get_fs_path, IS_S3
from tests.verifiers.metric_verifier import MetricVerifier

class TestUdfBase(ImpalaTestSuite):
  """
  Base class with utility functions for testing UDFs.
  """
  def _check_exception(self, e):
    # The interesting exception message may be in 'e' or in its inner_exception
    # depending on the point of query failure.
    if 'Memory limit exceeded' in str(e) or 'Cancelled' in str(e):
      return
    if e.inner_exception is not None\
        and ('Memory limit exceeded' in e.inner_exception.message
          or 'Cancelled' in e.inner_exception.message):
      return
    raise e

  def _run_query_all_impalads(self, exec_options, query, expected):
    impala_cluster = ImpalaCluster()
    for impalad in impala_cluster.impalads:
      client = impalad.service.create_beeswax_client()
      result = self.execute_query_expect_success(client, query, exec_options)
      assert result.data == expected

  def _load_functions(self, template, vector, database, location):
    queries = template.format(database=database, location=location)
    # Split queries and remove empty lines
    queries = [q for q in queries.split(';') if q.strip()]
    exec_options = vector.get_value('exec_option')
    for query in queries:
      if query.strip() == '': continue
      result = self.execute_query_expect_success(self.client, query, exec_options)
      assert result is not None

  # Create sample UDA functions in {database} from library {location}
  create_sample_udas_template = """
create aggregate function {database}.test_count(int) returns bigint
location '{location}' update_fn='CountUpdate';

create aggregate function {database}.hll(int) returns string
location '{location}' update_fn='HllUpdate';

create aggregate function {database}.sum_small_decimal(decimal(9,2))
returns decimal(9,2) location '{location}' update_fn='SumSmallDecimalUpdate';
"""

  # Create test UDA functions in {database} from library {location}
  create_test_udas_template = """
create aggregate function {database}.trunc_sum(double)
returns bigint intermediate double location '{location}'
update_fn='TruncSumUpdate' merge_fn='TruncSumMerge'
serialize_fn='TruncSumSerialize' finalize_fn='TruncSumFinalize';

create aggregate function {database}.arg_is_const(int, int)
returns boolean location '{location}'
init_fn='ArgIsConstInit' update_fn='ArgIsConstUpdate' merge_fn='ArgIsConstMerge';

create aggregate function {database}.toggle_null(int)
returns int location '{location}'
update_fn='ToggleNullUpdate' merge_fn='ToggleNullMerge';

create aggregate function {database}.count_nulls(bigint)
returns bigint location '{location}'
update_fn='CountNullsUpdate' merge_fn='CountNullsMerge';

create aggregate function {database}.agg_intermediate(int)
returns bigint intermediate string location '{location}'
init_fn='AggIntermediateInit' update_fn='AggIntermediateUpdate'
merge_fn='AggIntermediateMerge' finalize_fn='AggIntermediateFinalize';

create aggregate function {database}.agg_decimal_intermediate(decimal(2,1), int)
returns decimal(6,5) intermediate decimal(4,3) location '{location}'
init_fn='AggDecimalIntermediateInit' update_fn='AggDecimalIntermediateUpdate'
merge_fn='AggDecimalIntermediateMerge' finalize_fn='AggDecimalIntermediateFinalize';

create aggregate function {database}.agg_string_intermediate(decimal(20,10), bigint, string)
returns decimal(20,0) intermediate string location '{location}'
init_fn='AggStringIntermediateInit' update_fn='AggStringIntermediateUpdate'
merge_fn='AggStringIntermediateMerge' finalize_fn='AggStringIntermediateFinalize';
"""

  # Create test UDF functions in {database} from library {location}
  create_udfs_template = """
create function {database}.identity(boolean) returns boolean
location '{location}' symbol='Identity';

create function {database}.identity(tinyint) returns tinyint
location '{location}' symbol='Identity';

create function {database}.identity(smallint) returns smallint
location '{location}' symbol='Identity';

create function {database}.identity(int) returns int
location '{location}' symbol='Identity';

create function {database}.identity(bigint) returns bigint
location '{location}' symbol='Identity';

create function {database}.identity(float) returns float
location '{location}' symbol='Identity';

create function {database}.identity(double) returns double
location '{location}' symbol='Identity';

create function {database}.identity(string) returns string
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE';

create function {database}.identity(timestamp) returns timestamp
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_12TimestampValE';

create function {database}.identity(decimal(9,0)) returns decimal(9,0)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.identity(decimal(18,1)) returns decimal(18,1)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.identity(decimal(38,10)) returns decimal(38,10)
location '{location}'
symbol='_Z8IdentityPN10impala_udf15FunctionContextERKNS_10DecimalValE';

create function {database}.all_types_fn(
string, boolean, tinyint, smallint, int, bigint, float, double, decimal(2,0))
returns int
location '{location}' symbol='AllTypes';

create function {database}.no_args() returns string
location '{location}'
symbol='_Z6NoArgsPN10impala_udf15FunctionContextE';

create function {database}.var_and(boolean...) returns boolean
location '{location}' symbol='VarAnd';

create function {database}.var_sum(int...) returns int
location '{location}' symbol='VarSum';

create function {database}.var_sum(double...) returns double
location '{location}' symbol='VarSum';

create function {database}.var_sum(string...) returns int
location '{location}' symbol='VarSum';

create function {database}.var_sum(decimal(4,2)...) returns decimal(18,2)
location '{location}' symbol='VarSum';

create function {database}.var_sum_multiply(double, int...) returns double
location '{location}'
symbol='_Z14VarSumMultiplyPN10impala_udf15FunctionContextERKNS_9DoubleValEiPKNS_6IntValE';

create function {database}.var_sum_multiply2(double, int...) returns double
location '{location}'
symbol='_Z15VarSumMultiply2PN10impala_udf15FunctionContextERKNS_9DoubleValEiPKNS_6IntValE';

create function {database}.xpow(double, double) returns double
location '{location}'
symbol='_ZN6impala13MathFunctions3PowEPN10impala_udf15FunctionContextERKNS1_9DoubleValES6_';

create function {database}.to_lower(string) returns string
location '{location}'
symbol='_Z7ToLowerPN10impala_udf15FunctionContextERKNS_9StringValE';

create function {database}.to_upper(string) returns string
location '{location}'
symbol='_Z7ToUpperPN10impala_udf15FunctionContextERKNS_9StringValE';

create function {database}.constant_timestamp() returns timestamp
location '{location}' symbol='ConstantTimestamp';

create function {database}.validate_arg_type(string) returns boolean
location '{location}' symbol='ValidateArgType';

create function {database}.count_rows() returns bigint
location '{location}' symbol='Count' prepare_fn='CountPrepare' close_fn='CountClose';

create function {database}.constant_arg(int) returns int
location '{location}' symbol='ConstantArg' prepare_fn='ConstantArgPrepare' close_fn='ConstantArgClose';

create function {database}.validate_open(int) returns boolean
location '{location}' symbol='ValidateOpen'
prepare_fn='ValidateOpenPrepare' close_fn='ValidateOpenClose';

create function {database}.mem_test(bigint) returns bigint
location '{location}' symbol='MemTest'
prepare_fn='MemTestPrepare' close_fn='MemTestClose';

create function {database}.mem_test_leaks(bigint) returns bigint
location '{location}' symbol='MemTest'
prepare_fn='MemTestPrepare';

-- Regression test for IMPALA-1475
create function {database}.unmangled_symbol() returns bigint
location '{location}' symbol='UnmangledSymbol';

create function {database}.four_args(int, int, int, int) returns int
location '{location}' symbol='FourArgs';

create function {database}.five_args(int, int, int, int, int) returns int
location '{location}' symbol='FiveArgs';

create function {database}.six_args(int, int, int, int, int, int) returns int
location '{location}' symbol='SixArgs';

create function {database}.seven_args(int, int, int, int, int, int, int) returns int
location '{location}' symbol='SevenArgs';

create function {database}.eight_args(int, int, int, int, int, int, int, int) returns int
location '{location}' symbol='EightArgs';

create function {database}.twenty_args(int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int, int, int) returns int
location '{location}' symbol='TwentyArgs';

create function {database}.twenty_one_args(int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int, int, int, int) returns int
location '{location}' symbol='TwentyOneArgs';
"""

class TestUdfExecution(TestUdfBase):
  """Test execution of UDFs with a combination of different query options."""
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUdfExecution, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_dimension(
        create_exec_option_dimension_from_dict({"disable_codegen" : [False, True],
          "disable_codegen_rows_threshold" : [0],
          "exec_single_node_rows_threshold" : [0,100],
          "enable_expr_rewrites" : [False, True]}))
    # There is no reason to run these tests using all dimensions.
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_native_functions(self, vector, unique_database):
    enable_expr_rewrites = vector.get_value('exec_option')['enable_expr_rewrites']
    self._load_functions(
      self.create_udfs_template, vector, unique_database,
      get_fs_path('/test-warehouse/libTestUdfs.so'))
    self._load_functions(
      self.create_sample_udas_template, vector, unique_database,
      get_fs_path('/test-warehouse/libudasample.so'))
    self._load_functions(
      self.create_test_udas_template, vector, unique_database,
      get_fs_path('/test-warehouse/libTestUdas.so'))

    self.run_test_case('QueryTest/udf', vector, use_db=unique_database)
    if not vector.get_value('exec_option')['disable_codegen']:
      self.run_test_case('QueryTest/udf-codegen-required', vector, use_db=unique_database)
    self.run_test_case('QueryTest/uda', vector, use_db=unique_database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=unique_database)
    # Some tests assume determinism or non-determinism, which depends on expr rewrites.
    if enable_expr_rewrites:
      self.run_test_case('QueryTest/udf-init-close-deterministic', vector,
          use_db=unique_database)
    else:
      self.run_test_case('QueryTest/udf-non-deterministic', vector,
          use_db=unique_database)

  def test_ir_functions(self, vector, unique_database):
    if vector.get_value('exec_option')['disable_codegen']:
      # IR functions require codegen to be enabled.
      return
    enable_expr_rewrites = vector.get_value('exec_option')['enable_expr_rewrites']
    self._load_functions(
      self.create_udfs_template, vector, unique_database,
      get_fs_path('/test-warehouse/test-udfs.ll'))
    self.run_test_case('QueryTest/udf', vector, use_db=unique_database)
    self.run_test_case('QueryTest/udf-init-close', vector, use_db=unique_database)
    # Some tests assume determinism or non-determinism, which depends on expr rewrites.
    if enable_expr_rewrites:
      self.run_test_case('QueryTest/udf-init-close-deterministic', vector,
          use_db=unique_database)
    else:
      self.run_test_case('QueryTest/udf-non-deterministic', vector,
          use_db=unique_database)

  def test_java_udfs(self, vector, unique_database):
    self.run_test_case('QueryTest/load-java-udfs', vector, use_db=unique_database)
    self.run_test_case('QueryTest/java-udf', vector, use_db=unique_database)

  def test_udf_errors(self, vector, unique_database):
    # Only run with codegen disabled to force interpretation path to be taken.
    # Aim to exercise two failure cases:
    # 1. too many arguments
    # 2. IR UDF
    if vector.get_value('exec_option')['disable_codegen']:
      self.run_test_case('QueryTest/udf-errors', vector, use_db=unique_database)

  # Run serially because this will blow the process limit, potentially causing other
  # queries to fail
  @pytest.mark.execute_serially
  def test_mem_limits(self, vector, unique_database):
    # Set the mem limit high enough that a simple scan can run
    mem_limit = 1024 * 1024
    vector = copy(vector)
    vector.get_value('exec_option')['mem_limit'] = mem_limit
    try:
      self.run_test_case('QueryTest/udf-mem-limit', vector, use_db=unique_database)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException, e:
      self._check_exception(e)

    try:
      self.run_test_case('QueryTest/uda-mem-limit', vector, use_db=unique_database)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException, e:
      self._check_exception(e)

    # It takes a long time for Impala to free up memory after this test, especially if
    # ASAN is enabled. Verify that all fragments finish executing before moving on to the
    # next test to make sure that the next test is not affected.
    for impalad in ImpalaCluster().impalads:
      verifier = MetricVerifier(impalad.service)
      verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0)
      verifier.verify_num_unused_buffers()

  def test_udf_constant_folding(self, vector, unique_database):
    """Test that constant folding of UDFs is handled correctly. Uses count_rows(),
    which returns a unique value every time it is evaluated in the same thread."""
    exec_options = copy(vector.get_value('exec_option'))
    # Execute on a single node so that all counter values will be unique.
    exec_options["num_nodes"] = 1
    create_fn_query = """create function {database}.count_rows() returns bigint
                         location '{location}' symbol='Count' prepare_fn='CountPrepare'
                         close_fn='CountClose'"""
    self._load_functions(create_fn_query, vector, unique_database,
        get_fs_path('/test-warehouse/libTestUdfs.so'))

    # Only one distinct value if the expression is constant folded, otherwise one
    # value per row in alltypes
    expected_ndv = 1 if exec_options['enable_expr_rewrites'] else 7300

    # Test fully constant expression, evaluated in FE.
    query = "select `{0}`.count_rows() from functional.alltypes".format(unique_database)
    result = self.execute_query_expect_success(self.client, query, exec_options)
    actual_ndv = len(set(result.data))
    assert actual_ndv == expected_ndv

    # Test constant argument to a non-constant expr. The argument value can be
    # cached in the backend.
    query = """select concat(cast(`{0}`.count_rows() as string), '-', string_col)
               from functional.alltypes""".format(unique_database)
    result = self.execute_query_expect_success(self.client, query, exec_options)
    actual_ndv = len(set(value.split("-")[0] for value in result.data))
    assert actual_ndv == expected_ndv


class TestUdfTargeted(TestUdfBase):
  """Targeted UDF tests that don't need to be run under the full combination of
  exec options."""
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestUdfTargeted, cls).add_test_dimensions()
    # There is no reason to run these tests using all dimensions.
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))

  def test_udf_invalid_symbol(self, vector, unique_database):
    """ IMPALA-1642: Impala crashes if the symbol for a Hive UDF doesn't exist
    Crashing is non-deterministic so we run the UDF several times."""
    drop_fn_stmt = (
        "drop function if exists `{0}`.fn_invalid_symbol(STRING)".format(unique_database))
    create_fn_stmt = (
        "create function `{0}`.fn_invalid_symbol(STRING) returns "
        "STRING LOCATION '{1}' SYMBOL='not.a.Symbol'".format(
            unique_database,
            get_fs_path('/test-warehouse/impala-hive-udfs.jar')))
    query = "select `{0}`.fn_invalid_symbol('test')".format(unique_database)

    self.client.execute(drop_fn_stmt)
    self.client.execute(create_fn_stmt)
    for _ in xrange(5):
      ex = self.execute_query_expect_failure(self.client, query)
      assert "Unable to find class" in str(ex)
    self.client.execute(drop_fn_stmt)

  @SkipIfLocal.multiple_impalad
  def test_hive_udfs_missing_jar(self, vector, unique_database):
    """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present
    on HDFS"""
    # Copy hive-exec.jar to a temporary file
    jar_path = get_fs_path("/test-warehouse/" + get_random_id(5) + ".jar")
    hive_jar = get_fs_path("/test-warehouse/hive-exec.jar")
    check_call(["hadoop", "fs", "-cp", hive_jar, jar_path])
    drop_fn_stmt = (
        "drop function if exists "
        "`{0}`.`pi_missing_jar`()".format(unique_database))
    create_fn_stmt = (
        "create function `{0}`.`pi_missing_jar`() returns double location '{1}' "
        "symbol='org.apache.hadoop.hive.ql.udf.UDFPI'".format(unique_database, jar_path))

    cluster = ImpalaCluster()
    impalad = cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    # Create and drop functions with sync_ddl to make sure they are reflected
    # in every impalad.
    exec_option = copy(vector.get_value('exec_option'))
    exec_option['sync_ddl'] = 1

    self.execute_query_expect_success(client, drop_fn_stmt, exec_option)
    self.execute_query_expect_success(client, create_fn_stmt, exec_option)
    # Delete the udf jar
    check_call(["hadoop", "fs", "-rm", jar_path])

    different_impalad = cluster.get_different_impalad(impalad)
    client = different_impalad.service.create_beeswax_client()
    # Run a query using the udf from an impalad other than the one
    # we used to create the function. This is to bypass loading from
    # the cache
    try:
      self.execute_query_using_client(
          client, "select `{0}`.`pi_missing_jar`()".format(unique_database), vector)
      assert False, "Query expected to fail"
    except ImpalaBeeswaxException, e:
      assert "Failed to get file info" in str(e)

  def test_libs_with_same_filenames(self, vector, unique_database):
    self.run_test_case('QueryTest/libs_with_same_filenames', vector, use_db=unique_database)

  def test_udf_update_via_drop(self, vector, unique_database):
    """Test updating the UDF binary without restarting Impala. Dropping
    the function should remove the binary from the local cache."""
    # Run with sync_ddl to guarantee the drop is processed by all impalads.
    exec_options = copy(vector.get_value('exec_option'))
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = get_fs_path(
        '/test-warehouse/impala-hive-udfs2-{0}.jar'.format(unique_database))

    drop_fn_stmt = (
        'drop function if exists `{0}`.`udf_update_test_drop`()'.format(unique_database))
    create_fn_stmt = (
        "create function `{0}`.`udf_update_test_drop`() returns string LOCATION '{1}' "
        "SYMBOL='org.apache.impala.TestUpdateUdf'".format(unique_database, udf_dst))
    query_stmt = "select `{0}`.`udf_update_test_drop`()".format(unique_database)

    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    check_call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self._run_query_all_impalads(exec_options, query_stmt, ["Old UDF"])

    # Update the binary, drop and create the function again. The new binary should
    # be running.
    check_call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(self.client, drop_fn_stmt, exec_options)
    self.execute_query_expect_success(self.client, create_fn_stmt, exec_options)
    self._run_query_all_impalads(exec_options, query_stmt, ["New UDF"])

  def test_udf_update_via_create(self, vector, unique_database):
    """Test updating the UDF binary without restarting Impala. Creating a new function
    from the library should refresh the cache."""
    # Run with sync_ddl to guarantee the create is processed by all impalads.
    exec_options = copy(vector.get_value('exec_option'))
    exec_options['sync_ddl'] = 1
    old_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'testdata/udfs/impala-hive-udfs.jar')
    new_udf = os.path.join(
        os.environ['IMPALA_HOME'], 'tests/test-hive-udfs/target/test-hive-udfs-1.0.jar')
    udf_dst = get_fs_path(
        '/test-warehouse/impala-hive-udfs3-{0}.jar'.format(unique_database))
    old_function_name = "udf_update_test_create1"
    new_function_name = "udf_update_test_create2"

    drop_fn_template = 'drop function if exists `{0}`.`{{0}}`()'.format(unique_database)
    self.execute_query_expect_success(
        self.client, drop_fn_template.format(old_function_name), exec_options)
    self.execute_query_expect_success(
        self.client, drop_fn_template.format(new_function_name), exec_options)

    create_fn_template = (
        "create function `{0}`.`{{0}}`() returns string LOCATION '{1}' "
        "SYMBOL='org.apache.impala.TestUpdateUdf'".format(unique_database, udf_dst))

    query_template = "select `{0}`.`{{0}}`()".format(unique_database)

    # Put the old UDF binary on HDFS, make the UDF in Impala and run it.
    check_call(["hadoop", "fs", "-put", "-f", old_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template.format(old_function_name), exec_options)
    self._run_query_all_impalads(
        exec_options, query_template.format(old_function_name), ["Old UDF"])

    # Update the binary, and create a new function using the binary. The new binary
    # should be running.
    check_call(["hadoop", "fs", "-put", "-f", new_udf, udf_dst])
    self.execute_query_expect_success(
        self.client, create_fn_template.format(new_function_name), exec_options)
    self._run_query_all_impalads(
        exec_options, query_template.format(new_function_name), ["New UDF"])

    # The old function should use the new library now
    self._run_query_all_impalads(
        exec_options, query_template.format(old_function_name), ["New UDF"])

  def test_drop_function_while_running(self, vector, unique_database):
    self.client.execute("drop function if exists `{0}`.drop_while_running(BIGINT)"
        .format(unique_database))
    self.client.execute(
        "create function `{0}`.drop_while_running(BIGINT) returns "
        "BIGINT LOCATION '{1}' SYMBOL='Identity'".format(
            unique_database,
            get_fs_path('/test-warehouse/libTestUdfs.so')))
    query = ("select `{0}`.drop_while_running(l_orderkey) from tpch.lineitem limit 10000"
        .format(unique_database))

    # Run this query asynchronously.
    handle = self.execute_query_async(query, vector.get_value('exec_option'),
        table_format=vector.get_value('table_format'))

    # Fetch some rows from the async query to make sure the UDF is being used
    results = self.client.fetch(query, handle, 1)
    assert results.success
    assert len(results.data) == 1

    # Drop the function while the original query is running.
    self.client.execute(
        "drop function `{0}`.drop_while_running(BIGINT)".format(unique_database))

    # Fetch the rest of the rows, this should still be able to run the UDF
    results = self.client.fetch(query, handle, -1)
    assert results.success
    assert len(results.data) == 9999