mirror of
https://github.com/apache/impala.git
synced 2025-12-20 02:20:11 -05:00
Add the -d option and -f option to the following commands: `hdfs dfs -copyFromLocal <localsrc> URI` `hdfs dfs -put [ - | <localsrc1> .. ]. <dst>` `hdfs dfs -cp URI [URI ...] <dest>` The -d option "Skip[s] creation of temporary file with the suffix ._COPYING_.", which improves performance of these commands on S3 since S3 does not support metadata-only renames. The -f option "Overwrites the destination if it already exists"; combined with HADOOP-13884, this mitigates S3 consistency issues by avoiding a HEAD request to check whether the destination file exists. Added the method 'copy_from_local' to the BaseFilesystem class. Re-factored most usages of the aforementioned HDFS commands to use the filesystem_client. Some usages were not appropriate / worth refactoring, so occasionally this patch just adds the '-d' and '-f' options explicitly. All calls to '-put' were replaced with 'copyFromLocal' because they both copy files from the local fs to a HDFS compatible target fs. Since WebHDFS does not have good support for copying files, this patch removes the copy functionality from the PyWebHdfsClientWithChmod. Re-factored the hdfs_client so that it uses a DelegatingHdfsClient that delegates to either the HadoopFsCommandLineClient or PyWebHdfsClientWithChmod. Testing: * Ran core tests on HDFS and S3 Change-Id: I0d45db1c00554e6fb6bcc0b552596d86d4e30144 Reviewed-on: http://gerrit.cloudera.org:8080/14311 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
213 lines
7.8 KiB
Python
213 lines
7.8 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
import os
|
|
import pytest
|
|
import random
|
|
import threading
|
|
import time
|
|
from subprocess import check_call
|
|
|
|
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
|
|
from tests.common.impala_cluster import ImpalaCluster
|
|
from tests.util.filesystem_utils import get_fs_path
|
|
|
|
# This custom cluster test splits out concurrency tests to allow running with
|
|
# a higher fe_service_threads (and thus higher concurrency). This also avoids
|
|
# side-effects for other tests (see IMPALA-7639).
|
|
|
|
|
|
class TestUdfConcurrency(CustomClusterTestSuite):
  """Stress tests that create, use, and drop UDFs from many threads at once.

  Each test starts its cluster with --fe_service_threads=1000 and is marked to
  execute serially. The whole class is skipped unless the exploration strategy
  is 'exhaustive'.
  """

  # Query shared by both tests: evaluates {0}.use_it() in three unioned scans of
  # functional.alltypesagg. '{0}' is the database that holds the function.
  _USE_FN_QUERY = """select * from (select max(int_col) from functional.alltypesagg
      where {0}.use_it(string_col) = 'blah' union all
      (select max(int_col) from functional.alltypesagg
      where {0}.use_it(String_col) > '1' union all
      (select max(int_col) from functional.alltypesagg
      where {0}.use_it(string_col) > '1'))) v"""

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload these tests belong to."""
    return 'functional-query'

  @classmethod
  def setup_class(cls):
    """Skips the class unless running 'exhaustive', then defers to the parent setup."""
    if cls.exploration_strategy() != 'exhaustive':
      pytest.skip('runs only in exhaustive')
    super(TestUdfConcurrency, cls).setup_class()

  @classmethod
  def add_test_dimensions(cls):
    """Uses the parent's test dimensions unchanged."""
    super(TestUdfConcurrency, cls).add_test_dimensions()

  @staticmethod
  def _run_threads(threads):
    """Starts every thread in 'threads', then blocks until all of them finish."""
    for thread in threads:
      thread.start()
    for thread in threads:
      thread.join()

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(impalad_args="--fe_service_threads=1000")
  def test_native_functions_race(self, vector, unique_database):
    """ IMPALA-6488: stress concurrent adds, uses, and deletes of native functions.
    Exposes a crash caused by use-after-free in lib-cache.

    Errors raised by the worker queries are printed for diagnosis only; the test
    passes as long as the number of responsive coordinators is unchanged."""

    # Native function used by a query. Stresses lib-cache during analysis and
    # backend expressions.
    create_fn_to_use = \
      """create function {0}.use_it(string) returns string
         LOCATION '{1}'
         SYMBOL='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE'"""
    # Reference to another native function from the same 'so' file. Creating/dropping
    # stresses lib-cache lookup, add, and refresh.
    create_another_fn = """create function if not exists {0}.other(float)
        returns float location '{1}' symbol='Identity'"""
    drop_another_fn = """drop function if exists {0}.other(float)"""
    udf_path = get_fs_path('/test-warehouse/libTestUdfs.so')

    # Tracks number of impalads prior to tests to check that none have crashed.
    # All impalads are assumed to be coordinators.
    cluster = ImpalaCluster.get_e2e_test_cluster()
    exp_num_coordinators = cluster.num_responsive_coordinators()

    setup_client = self.create_impala_client()
    setup_query = create_fn_to_use.format(unique_database, udf_path)
    try:
      setup_client.execute(setup_query)
    except Exception:
      # Single-argument print(...) behaves identically on Python 2 and Python 3.
      print("Unable to create initial function: {0}".format(setup_query))
      raise

    # The statements are the same for every worker, so format them once up front.
    use_query = self._USE_FN_QUERY.format(unique_database)
    drop_query = drop_another_fn.format(unique_database)
    create_query = create_another_fn.format(unique_database, udf_path)

    # Shared by all worker threads; each worker only ever appends to it.
    errors = []

    def use_fn_method():
      # Random start delay so the workers do not all begin at the same instant.
      time.sleep(1 + random.random())
      client = self.create_impala_client()
      try:
        client.execute(use_query)
      except Exception as e:
        errors.append(e)

    def load_fn_method():
      time.sleep(1 + random.random())
      client = self.create_impala_client()
      try:
        client.execute(drop_query)
        client.execute(create_query)
      except Exception as e:
        errors.append(e)

    # number of uses/loads needed to reliably reproduce the bug.
    num_uses = 200
    num_loads = 200

    # create threads to use native function.
    runner_threads = [threading.Thread(target=use_fn_method) for _ in range(num_uses)]
    # create threads to drop/create native functions.
    runner_threads.extend(
        threading.Thread(target=load_fn_method) for _ in range(num_loads))

    # launch all runner threads and wait for them to finish.
    self._run_threads(runner_threads)

    for e in errors:
      print(e)

    # Checks that no impalad has crashed.
    assert cluster.num_responsive_coordinators() == exp_num_coordinators

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(impalad_args="--fe_service_threads=1000")
  def test_concurrent_jar_drop_use(self, vector, unique_database):
    """IMPALA-6215: race between dropping/using java udf's defined in the same jar.
    This test runs concurrent drop/use threads that result in class not found
    exceptions when the race is present.

    Fails if any worker thread reports an error.
    """
    udf_src_path = os.path.join(
        os.environ['IMPALA_HOME'], "testdata/udfs/impala-hive-udfs.jar")
    udf_tgt_path = get_fs_path(
        '/test-warehouse/{0}.db/impala-hive-udfs.jar'.format(unique_database))

    create_fn_to_drop = """create function {0}.foo_{1}() returns string
        LOCATION '{2}' SYMBOL='org.apache.impala.TestUpdateUdf'"""
    create_fn_to_use = """create function {0}.use_it(string) returns string
        LOCATION '{1}' SYMBOL='org.apache.impala.TestUdf'"""
    drop_fn = "drop function if exists {0}.foo_{1}()"
    num_drops = 100
    num_uses = 100

    # use a unique jar for this test to avoid interactions with other tests
    # that use the same jar
    self.filesystem_client.copy_from_local(udf_src_path, udf_tgt_path)

    # create all the functions: the one that is queried, then one per drop thread.
    setup_client = self.create_impala_client()
    setup_queries = [create_fn_to_use.format(unique_database, udf_tgt_path)]
    setup_queries.extend(
        create_fn_to_drop.format(unique_database, i, udf_tgt_path)
        for i in range(num_drops))
    for setup_query in setup_queries:
      try:
        setup_client.execute(setup_query)
      except Exception:
        # Re-raise rather than 'assert False' so the original traceback is kept.
        print("Unable to create function: {0}".format(setup_query))
        raise

    use_query = self._USE_FN_QUERY.format(unique_database)

    # Shared by all worker threads; each worker only ever appends to it.
    errors = []

    def use_fn_method():
      # Users sleep longer (5-6s) than droppers (1-2s) before issuing their query.
      time.sleep(5 + random.random())
      client = self.create_impala_client()
      try:
        client.execute(use_query)
      except Exception as e:
        errors.append(e)

    def drop_fn_method(i):
      time.sleep(1 + random.random())
      client = self.create_impala_client()
      try:
        client.execute(drop_fn.format(unique_database, i))
      except Exception as e:
        errors.append(e)

    # create threads to use functions.
    runner_threads = [threading.Thread(target=use_fn_method) for _ in range(num_uses)]
    # create threads to drop functions.
    runner_threads.extend(
        threading.Thread(target=drop_fn_method, args=(i, )) for i in range(num_drops))

    # launch all runner threads and wait for them to finish.
    self._run_threads(runner_threads)

    # Check for any errors.
    for e in errors:
      print(e)
    assert len(errors) == 0
|