Files
impala/tests/query_test/test_observability.py
Sailesh Mukil 50bd015f2d IMPALA-5333: Add support for Impala to work with ADLS
This patch leverages the AdlFileSystem in Hadoop to allow
Impala to talk to the Azure Data Lake Store. This patch has
functional changes as well as adds test infrastructure for
testing Impala over ADLS.

We do not support ACLs on ADLS since the Hadoop ADLS
connector does not integrate ADLS ACLs with Hadoop users/groups.

For testing, we use the azure-data-lake-store-python client
from Microsoft. This client seems to have some consistency
issues. For example, a drop table through Impala will delete
the files in ADLS, however, listing that directory through
the python client immediately after the drop, will still show
the files. This behavior is unexpected since ADLS claims to be
strongly consistent. Some tests have been skipped due to this
limitation with the tag SkipIfADLS.slow_client. Tracked by
IMPALA-5335.

The azure-data-lake-store-python client also only works on CentOS 6.6
and over, so the python dependencies for Azure will not be downloaded
when the TARGET_FILESYSTEM is not "adls". While running ADLS tests,
the expectation will be that it runs on a machine that is at least
running CentOS 6.6.
Note: This is only a test limitation, not a functional one. Clusters
with older OSes like CentOS 6.4 will still work with ADLS.

Added another dependency to bootstrap_build.sh for the ADLS Python
client.

Testing: Ran core tests with and without TARGET_FILESYSTEM as
'adls' to make sure that all tests pass and that nothing breaks.

Change-Id: Ic56b9988b32a330443f24c44f9cb2c80842f7542
Reviewed-on: http://gerrit.cloudera.org:8080/6910
Tested-by: Impala Public Jenkins
Reviewed-by: Sailesh Mukil <sailesh@cloudera.com>
2017-05-25 19:35:24 +00:00

96 lines
4.3 KiB
Python

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfS3, SkipIfADLS, SkipIfIsilon, SkipIfLocal
class TestObservability(ImpalaTestSuite):
@classmethod
def get_workload(self):
return 'functional-query'
def test_merge_exchange_num_rows(self):
"""Regression test for IMPALA-1473 - checks that the exec summary for a merging
exchange with a limit reports the number of rows returned as equal to the limit,
and that the coordinator fragment portion of the runtime profile reports the number
of rows returned correctly."""
query = """select tinyint_col, count(*) from functional.alltypes
group by tinyint_col order by tinyint_col limit 5"""
result = self.execute_query(query)
assert result.exec_summary[0]['operator'] == '05:MERGING-EXCHANGE'
assert result.exec_summary[0]['num_rows'] == 5
assert result.exec_summary[0]['est_num_rows'] == 5
for line in result.runtime_profile.split('\n'):
# The first 'RowsProduced' we find is for the coordinator fragment.
if 'RowsProduced' in line:
assert '(5)' in line
break
def test_broadcast_num_rows(self):
"""Regression test for IMPALA-3002 - checks that the num_rows for a broadcast node
in the exec summaty is correctly set as the max over all instances, not the sum."""
query = """select distinct a.int_col, a.string_col from functional.alltypes a
inner join functional.alltypessmall b on (a.id = b.id)
where a.year = 2009 and b.month = 2"""
result = self.execute_query(query)
assert result.exec_summary[5]['operator'] == '04:EXCHANGE'
assert result.exec_summary[5]['num_rows'] == 25
assert result.exec_summary[5]['est_num_rows'] == 25
@SkipIfS3.hbase
@SkipIfLocal.hbase
@SkipIfIsilon.hbase
@SkipIfADLS.hbase
def test_scan_summary(self):
"""IMPALA-4499: Checks that the exec summary for scans show the table name."""
# HDFS table
query = "select count(*) from functional.alltypestiny"
result = self.execute_query(query)
scan_idx = len(result.exec_summary) - 1
assert result.exec_summary[scan_idx]['operator'] == '00:SCAN HDFS'
assert result.exec_summary[scan_idx]['detail'] == 'functional.alltypestiny'
# KUDU table
query = "select count(*) from functional_kudu.alltypestiny"
result = self.execute_query(query)
scan_idx = len(result.exec_summary) - 1
assert result.exec_summary[scan_idx]['operator'] == '00:SCAN KUDU'
assert result.exec_summary[scan_idx]['detail'] == 'functional_kudu.alltypestiny'
# HBASE table
query = "select count(*) from functional_hbase.alltypestiny"
result = self.execute_query(query)
scan_idx = len(result.exec_summary) - 1
assert result.exec_summary[scan_idx]['operator'] == '00:SCAN HBASE'
assert result.exec_summary[scan_idx]['detail'] == 'functional_hbase.alltypestiny'
def test_get_profile(self):
"""Tests that the query profile shows expected query states."""
query = "select count(*) from functional.alltypes"
handle = self.execute_query_async(query, dict())
profile = self.client.get_runtime_profile(handle)
# If ExecuteStatement() has completed but the results haven't been fetched yet, the
# query must have at least reached RUNNING.
assert "Query State: RUNNING" in profile or \
"Query State: FINISHED" in profile, profile
results = self.client.fetch(query, handle)
profile = self.client.get_runtime_profile(handle)
# After fetching the results, the query must be in state FINISHED.
assert "Query State: FINISHED" in profile, profile