# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import absolute_import, division, print_function
from collections import defaultdict
from datetime import datetime
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import SkipIfFS, SkipIfLocal, SkipIfNotHdfsMinicluster
from tests.util.filesystem_utils import IS_EC, WAREHOUSE
from tests.util.parse_util import get_duration_us_from_str
from time import sleep, time
from RuntimeProfile.ttypes import TRuntimeProfileFormat
import pytest
import re


class TestObservability(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def test_merge_exchange_num_rows(self):
    """Regression test for IMPALA-1473 - checks that the exec summary for a merging
    exchange with a limit reports the number of rows returned as equal to the limit,
    and that the coordinator fragment portion of the runtime profile reports the
    number of rows returned correctly."""
    query = """select tinyint_col, count(*) from functional.alltypes
        group by tinyint_col order by tinyint_col limit 5"""
    result = self.execute_query(query)
    exchange = result.exec_summary[1]
    assert exchange['operator'] == '05:MERGING-EXCHANGE'
    assert exchange['num_rows'] == 5
    assert exchange['est_num_rows'] == 5
    assert exchange['peak_mem'] > 0

    for line in result.runtime_profile.split('\n'):
      # The first 'RowsProduced' we find is for the coordinator fragment.
      if 'RowsProduced' in line:
        assert '(5)' in line
        break

  def test_broadcast_num_rows(self):
    """Regression test for IMPALA-3002 - checks that the num_rows for a broadcast
    node in the exec summary is correctly set as the max over all instances, not
    the sum."""
    query = """select distinct a.int_col, a.string_col from functional.alltypes a
        inner join functional.alltypessmall b on (a.id = b.id)
        where a.year = 2009 and b.month = 2"""
    result = self.execute_query(query)
    exchange = result.exec_summary[8]
    assert exchange['operator'] == '04:EXCHANGE'
    assert exchange['num_rows'] == 25
    assert exchange['est_num_rows'] == 25
    assert exchange['peak_mem'] > 0

  def test_report_time(self):
    """Regression test for IMPALA-6741 - checks that the last reporting time exists
    in the profiles of fragment instances."""
    query = """select count(distinct a.int_col) from functional.alltypes a
        inner join functional.alltypessmall b on (a.id = b.id + cast(sleep(15) as INT))"""
    handle = self.hs2_client.execute_async(query)

    num_validated = 0
    tree = self.hs2_client.get_runtime_profile(handle, TRuntimeProfileFormat.THRIFT)
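    # Poll the Thrift profile while the query is still running; every fragment
    # instance node should already carry a parseable 'Last report received time'.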
    while not self.hs2_client.state_is_finished(handle):
      assert tree, num_validated
      for node in tree.nodes:
        if node.name.startswith('Instance '):
          info_strings_key = 'Last report received time'
          assert info_strings_key in node.info_strings
          report_time_str = node.info_strings[info_strings_key].split(".")[0]
          # Try converting the string to make sure it's in the expected format.
          assert datetime.strptime(report_time_str, '%Y-%m-%d %H:%M:%S')
          num_validated += 1
      tree = self.hs2_client.get_runtime_profile(handle, TRuntimeProfileFormat.THRIFT)
      # Let's not hit the backend too hard.
      sleep(0.1)
    assert num_validated > 0
    self.hs2_client.close_query(handle)

  @SkipIfFS.hbase
  def test_scan_summary(self):
    """IMPALA-4499: Checks that the exec summary for scans shows the table name."""
    # HDFS table
    query = "select count(*) from functional.alltypestiny"
    result = self.execute_query(query)
    scan_idx = len(result.exec_summary) - 1
    assert result.exec_summary[scan_idx]['operator'] == '00:SCAN HDFS'
    assert result.exec_summary[scan_idx]['detail'] == 'functional.alltypestiny'

    # KUDU table
    query = "select count(*) from functional_kudu.alltypestiny"
    result = self.execute_query(query)
    scan_idx = len(result.exec_summary) - 1
    assert result.exec_summary[scan_idx]['operator'] == '00:SCAN KUDU'
    assert result.exec_summary[scan_idx]['detail'] == 'functional_kudu.alltypestiny'

    # HBASE table
    query = "select count(*) from functional_hbase.alltypestiny"
    result = self.execute_query(query)
    scan_idx = len(result.exec_summary) - 1
    assert result.exec_summary[scan_idx]['operator'] == '00:SCAN HBASE'
    assert result.exec_summary[scan_idx]['detail'] == 'functional_hbase.alltypestiny'

  def test_sink_summary(self, unique_database):
    """IMPALA-1048: Checks that the exec summary contains sinks."""
    # SELECT query.
    query = "select count(*) from functional.alltypes"
    result = self.execute_query(query)
    # Sanity-check the root sink.
    root_sink = result.exec_summary[0]
    assert root_sink['operator'] == 'F01:ROOT'
    assert root_sink['max_time'] >= 0
    assert root_sink['num_rows'] == -1
    assert root_sink['est_num_rows'] == -1
    assert root_sink['peak_mem'] >= 0
    assert root_sink['est_peak_mem'] >= 0
    # Sanity-check the exchange sender sinks.
    found_exchange_sender = False
    for row in result.exec_summary[1:]:
      if 'EXCHANGE SENDER' not in row['operator']:
        continue
      found_exchange_sender = True
      assert re.match("F[0-9]+:EXCHANGE SENDER", row['operator'])
      assert row['max_time'] >= 0
      assert row['num_rows'] == -1
      assert row['est_num_rows'] == -1
      assert row['peak_mem'] >= 0
      assert row['est_peak_mem'] >= 0
    assert found_exchange_sender, result

    # INSERT query.
    query = "create table {0}.tmp as select count(*) from functional.alltypes".format(
        unique_database)
    result = self.execute_query(query)
    # Sanity-check the HDFS writer sink.
    assert result.exec_summary[0]['operator'] == 'F01:HDFS WRITER'
    assert result.exec_summary[0]['max_time'] >= 0
    assert result.exec_summary[0]['num_rows'] == -1
    assert result.exec_summary[0]['est_num_rows'] == -1
    assert result.exec_summary[0]['peak_mem'] >= 0
    assert result.exec_summary[0]['est_peak_mem'] >= 0

  def test_query_options(self):
    """Test that the query profile shows the expected non-default query options, both
    those set explicitly through the client and those set by the planner."""
    # Set mem_limit to a non-default value and runtime_filter_wait_time_ms to its
    # default value.
    query_opts = {'mem_limit': 8589934592, 'runtime_filter_wait_time_ms': 0}
    profile = self.execute_query("select 1", query_opts).runtime_profile
    assert "Query Options (set by configuration): MEM_LIMIT=8589934592" in profile,\
        profile
    assert "CLIENT_IDENTIFIER=" + \
        "query_test/test_observability.py::TestObservability::()::test_query_options" \
        in profile
    # Get the TIMEZONE value.
    server_timezone = None
    for row in self.execute_query("set", query_opts).data:
      name, val, _ = row.split("\t")
      if name == "TIMEZONE":
        server_timezone = val
        break
    assert server_timezone is not None

    # For this query, the planner sets NUM_NODES=1, NUM_SCANNER_THREADS=1,
    # RUNTIME_FILTER_MODE=0 and MT_DOP=0.
    expected_str = ("Query Options (set by configuration and planner): "
        "MEM_LIMIT=8589934592,"
        "NUM_NODES=1,NUM_SCANNER_THREADS=1,"
        "RUNTIME_FILTER_MODE=OFF,MT_DOP=0,TIMEZONE={timezone},"
        "CLIENT_IDENTIFIER="
        "query_test/test_observability.py::TestObservability::()::test_query_options,"
        "SPOOL_QUERY_RESULTS=0"
        "\n")
    expected_str = expected_str.format(timezone=server_timezone)
    assert expected_str in profile, profile

  def test_exec_summary(self):
    """Test that the exec summary is populated correctly in every query state."""
    query = "select count(*) from functional.alltypes"
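    # The debug_action "AC_BEFORE_ADMISSION:SLEEP@1000" injects a 1000ms sleep before
    # admission control, giving us a window to observe the pre-admission state.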
    handle = self.execute_query_async(query,
        {"debug_action": "AC_BEFORE_ADMISSION:SLEEP@1000"})
    # If ExecuteStatement() has completed and the query is paused in the admission
    # control phase, then the coordinator has not started yet and exec_summary
    # should be empty.
    exec_summary = self.client.get_exec_summary(handle)
    assert exec_summary is not None and exec_summary.nodes is None
    # After completion of the admission control phase, the coordinator will have
    # started and we should get a populated exec_summary.
    self.client.wait_for_admission_control(handle)
    exec_summary = self.client.get_exec_summary(handle)
    assert exec_summary is not None and exec_summary.nodes is not None

    self.client.fetch(query, handle)
    exec_summary = self.client.get_exec_summary(handle)
    # After fetching the results and reaching the finished state, we should still be
    # able to fetch an exec_summary.
    assert exec_summary is not None and exec_summary.nodes is not None
    # Verify the query is complete.
    assert exec_summary.progress.num_completed_scan_ranges == \
        exec_summary.progress.total_scan_ranges
    assert exec_summary.progress.num_completed_fragment_instances == \
        exec_summary.progress.total_fragment_instances

  def test_exec_summary_in_runtime_profile(self):
    """Test that the exec summary is populated in the runtime profile correctly in
    every query state."""
    query = "select count(*) from functional.alltypes"
    handle = self.execute_query_async(query,
        {"debug_action": "AC_BEFORE_ADMISSION:SLEEP@1000"})

    # If ExecuteStatement() has completed and the query is paused in the admission
    # control phase, then the coordinator has not started yet and the profile should
    # not contain an exec summary.
    profile = self.client.get_runtime_profile(handle)
    assert "ExecSummary:" not in profile, profile
    # After completion of the admission control phase, the coordinator will have
    # started and the profile should contain a populated exec summary.
    self.client.wait_for_admission_control(handle)
    profile = self.client.get_runtime_profile(handle)
    assert "ExecSummary:" in profile, profile

    self.client.fetch(query, handle)
    # After fetching the results and reaching the finished state, the profile should
    # still contain the exec summary.
    profile = self.client.get_runtime_profile(handle)
    assert "ExecSummary:" in profile, profile

  @SkipIfLocal.multiple_impalad
  def test_profile_fragment_instances(self):
    """IMPALA-6081: Test that the expected number of fragment instances and their
    exec nodes appear in the runtime profile, even when fragments may be quickly
    cancelled because all results have already been returned."""
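    # A negative report_skew_limit disables the skew summary, keeping the profile
    # contents stable for the node counts below.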
    query_opts = {'report_skew_limit': -1}
    results = self.execute_query("""
        with l as (select * from tpch.lineitem UNION ALL select * from tpch.lineitem)
        select STRAIGHT_JOIN count(*) from (select * from tpch.lineitem a LIMIT 1) a
        join (select * from l LIMIT 2000000) b on a.l_orderkey = -b.l_orderkey;""",
        query_opts)
    # There are 3 scan nodes and each appears in the profile n+1 times (n fragment
    # instances plus the averaged fragment). n depends on how the data is loaded and
    # on the scheduler's decisions.
    n = results.runtime_profile.count("HDFS_SCAN_NODE")
    assert n > 0 and n % 3 == 0
    # There are 3 exchange nodes and each appears in the profile 2 times (1 fragment
    # instance plus the averaged fragment).
    assert results.runtime_profile.count("EXCHANGE_NODE") == 6
    # The following appear only in the root fragment, which has 1 instance.
    assert results.runtime_profile.count("HASH_JOIN_NODE") == 2
    assert results.runtime_profile.count("AGGREGATION_NODE") == 2
    assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2

  def test_query_profile_contains_query_compilation_static_events(self):
    """Test that the expected events show up in a query profile. These lines are
    static and should appear in this exact order."""
    event_regexes = [
        r'Analysis finished:',
        r'Authorization finished (.*):',
        r'Value transfer graph computed:',
        r'Single node plan created:',
        r'Runtime filters computed:',
        r'Distributed plan created:']
    query = "select * from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_query_compilation_metadata_load_events(self,
      cluster_properties):
    """Test that the metadata load started and finished events appear in the query
    profile when the catalog cache has been evicted."""
    invalidate_query = "invalidate metadata functional.alltypes"
    select_query = "select * from functional.alltypes"
    self.execute_query(invalidate_query)
    runtime_profile = self.execute_query(select_query).runtime_profile
    # Depending on whether this is a catalog-v2 cluster or not, some of the metadata
    # loading events are different.
    if not cluster_properties.is_catalog_v2_cluster():
      load_event_regexes = [r'Query Compilation:', r'Metadata load started:',
          r'Metadata load finished. loaded-tables=.*/.* load-requests=.* '
          r'catalog-updates=.*:',
          r'Analysis finished:']
    else:
      load_event_regexes = [
          r'Frontend:',
          r'Referenced Tables:',
          r'CatalogFetch.ColumnStats.Hits',
          r'CatalogFetch.ColumnStats.Misses',
          r'CatalogFetch.ColumnStats.Requests',
          r'CatalogFetch.ColumnStats.Time',
          r'CatalogFetch.Config.Hits',
          r'CatalogFetch.Config.Misses',
          r'CatalogFetch.Config.Requests',
          r'CatalogFetch.Config.Time',
          r'CatalogFetch.DatabaseList.Hits',
          r'CatalogFetch.DatabaseList.Misses',
          r'CatalogFetch.DatabaseList.Requests',
          r'CatalogFetch.DatabaseList.Time',
          r'CatalogFetch.PartitionLists.Hits',
          r'CatalogFetch.PartitionLists.Misses',
          r'CatalogFetch.PartitionLists.Requests',
          r'CatalogFetch.PartitionLists.Time',
          r'CatalogFetch.Partitions.Hits',
          r'CatalogFetch.Partitions.Misses',
          r'CatalogFetch.Partitions.Requests',
          r'CatalogFetch.Partitions.Time',
          r'CatalogFetch.RPCs.Bytes',
          r'CatalogFetch.RPCs.Requests',
          r'CatalogFetch.RPCs.Time',
          r'CatalogFetch.StorageLoad.Time',
          r'CatalogFetch.TableList.Hits',
          r'CatalogFetch.TableList.Misses',
          r'CatalogFetch.TableList.Requests',
          r'CatalogFetch.TableList.Time',
          r'CatalogFetch.Tables.Hits',
          r'CatalogFetch.Tables.Misses',
          r'CatalogFetch.Tables.Requests',
          r'CatalogFetch.Tables.Time']
    self.__verify_profile_event_sequence(load_event_regexes, runtime_profile)

  def test_query_profile_contains_query_compilation_metadata_cached_event(self):
    """Test that the metadata-cached event appears in the query profile when the
    table is already cached."""
    refresh_query = "refresh functional.alltypes"
    select_query = "select * from functional.alltypes"
    self.execute_query(refresh_query)
    runtime_profile = self.execute_query(select_query).runtime_profile
    event_regexes = [r'Query Compilation:',
        r'Metadata of all .* tables cached:',
        r'Analysis finished:']
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_query_compilation_lineage_event(self):
    """Test that the lineage information appears in the profile in the right place.
    This event depends on whether lineage_event_log_dir is configured."""
    impalad = self.impalad_test_service
    lineage_event_log_dir_value = impalad.get_flag_current_value("lineage_event_log_dir")
    assert lineage_event_log_dir_value is not None
    if lineage_event_log_dir_value == "":
      event_regexes = [
          r'Distributed plan created:',
          r'Planning finished:']
    else:
      event_regexes = [
          r'Distributed plan created:',
          r'Lineage info computed:',
          r'Planning finished:']
    # Disable auto-scaling as it can produce multiple occurrences of the event
    # sequence in the profile.
    self.execute_query_expect_success(self.client, "set enable_replan=0")
    query = "select * from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_query_timeline_events(self):
    """Test that the expected events show up in a query profile."""
    event_regexes = [r'Query Timeline:',
        r'Query submitted:',
        r'Planning finished:',
        r'Submit for admission:',
        r'Completed admission:',
        r'Ready to start on .* backends:',
        r'All .* execution backends \(.* fragment instances\) started:',
        r'Rows available:',
        r'First row fetched:',
        r'Last row fetched:',
        r'Released admission control resources:']
    query = "select * from functional.alltypes"
    # Use the no-limits pool so that the query cannot get queued in admission control
    # (which would add an extra event to the above timeline).
    query_opts = {'request_pool': 'root.no-limits'}
    runtime_profile = self.execute_query(query, query_opts).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_instance_events(self):
    """Test that /query_profile_encoded contains an event timeline for fragment
    instances, even when there are errors."""
    event_regexes = [r'Fragment Instance Lifecycle Event Timeline',
        r'Prepare Finished',
        r'Open Finished',
        r'First Batch Produced',
        r'First Batch Sent',
        r'ExecInternal Finished']
    query = "select count(*) from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_node_events(self):
    """Test that ExecNode events show up in a profile."""
    event_regexes = [r'Node Lifecycle Event Timeline',
        r'Open Started',
        r'Open Finished',
        r'First Batch Requested',
        r'First Batch Returned',
        r'Last Batch Returned',
        r'Closed']
    query = "select count(*) from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def __verify_profile_event_sequence(self, event_regexes, runtime_profile):
    """Check that 'event_regexes' appear in a consecutive series of lines in
    'runtime_profile'."""
    event_regex_index = 0

    # Check that the strings appear in the above order with no gaps in the profile.
    for line in runtime_profile.splitlines():
      match = re.search(event_regexes[event_regex_index], line)
      if match is not None:
        event_regex_index += 1
        if event_regex_index == len(event_regexes):
          # Found all the lines - we're done.
          return
      else:
        # Haven't found the first regex yet.
        assert event_regex_index == 0, \
            event_regexes[event_regex_index] + " not in " + line + "\n" + runtime_profile
    assert event_regex_index == len(event_regexes), \
        "Didn't find all events in profile: \n" + runtime_profile

  def test_query_profile_contains_all_events(self, unique_database):
    """Test that the expected events show up in a query profile for various queries."""
    # Make a data file to load data from.
    path = "{1}/{0}.db/data_file".format(unique_database, WAREHOUSE)
    self.filesystem_client.create_file(path, "1")
    use_query = "use {0}".format(unique_database)
    self.execute_query(use_query)
    # All the events we will see for every query.
    event_regexes = [
        r'Query Compilation:',
        r'Query Timeline:',
        r'Planning finished'
    ]
    # Queries that explore different code paths in frontend compilation.
    queries = [
        'create table if not exists impala_6568 (i int)',
        'select * from impala_6568',
        'explain select * from impala_6568',
        'describe impala_6568',
        'alter table impala_6568 set tblproperties(\'numRows\'=\'10\')',
        "load data inpath '{0}' into table impala_6568".format(path)
    ]
    # Run each query and check that all the expected events appear in the resulting
    # profile.
    for query in queries:
      runtime_profile = self.execute_query(query).runtime_profile
      self.__verify_profile_contains_every_event(event_regexes, runtime_profile, query)

  def __verify_profile_contains_every_event(self, event_regexes, runtime_profile,
      query):
    """Test that all the expected events show up in a given query profile."""
    for regex in event_regexes:
      assert any(re.search(regex, line) for line in runtime_profile.splitlines()), \
          "Didn't find event '" + regex + "' for query '" + query + \
          "' in profile: \n" + runtime_profile

  def test_compute_stats_profile(self, unique_database):
    """Test that the profile for a 'compute stats' query contains three unique query
    ids: one for the parent 'compute stats' query and one each for the two child
    queries."""
    table_name = "%s.test_compute_stats_profile" % unique_database
    self.execute_query(
        "create table %s as select * from functional.alltypestiny" % table_name)
    results = self.execute_query("compute stats %s" % table_name)
    # Search for all query ids (max length 33) in the profile.
    matches = re.findall(r"Query \(id=.{,33}\)", results.runtime_profile)
    query_ids = []
    for query_id in matches:
      if query_id not in query_ids:
        query_ids.append(query_id)
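    # COMPUTE STATS launches two child queries (one computing table stats, one
    # computing column stats), so together with the parent we expect exactly three
    # distinct ids.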
    assert len(query_ids) == 3, results.runtime_profile

  def test_global_resource_counters_in_profile(self):
    """Test that a set of global resource usage counters shows up in the profile."""
    query = "select count(*) from functional.alltypes"
    profile = self.execute_query(query).runtime_profile
    expected_counters = ["TotalBytesRead", "TotalBytesSent", "TotalScanBytesSent",
                         "TotalInnerBytesSent", "ExchangeScanRatio",
                         "InnerNodeSelectivityRatio"]
    assert all(counter in profile for counter in expected_counters)

  @SkipIfNotHdfsMinicluster.tuned_for_minicluster
  def test_global_exchange_counters(self):
    """Test that global exchange counters are set correctly."""
    query = """select count(*) from tpch_parquet.orders o inner join tpch_parquet.lineitem
        l on o.o_orderkey = l.l_orderkey group by o.o_clerk limit 10"""
    profile = self.execute_query(query).runtime_profile

    # TimeSeriesCounters should be prefixed with a hyphen.
    assert " MemoryUsage" not in profile
    assert "- MemoryUsage" in profile

    assert "ExchangeScanRatio: 3.19" in profile

    keys = ["TotalBytesSent", "TotalScanBytesSent", "TotalInnerBytesSent"]
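    # Counters are pretty-printed in the text profile, e.g.
    # "TotalBytesSent: 1.2 MB (1258291)"; the exact byte count is the integer in
    # parentheses, which the regex below extracts.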
    counters = defaultdict(int)
    for line in profile.splitlines():
      for key in keys:
        if key in line:
          # Match the byte count within parentheses.
          m = re.search(r"\(([0-9]+)\)", line)

          # If a match was not found, then the value of the key should be 0.
          if not m:
            assert key + ": 0" in line, "Invalid format for key %s" % key
            assert counters[key] != 0, "Query level counter for key %s cannot be 0" % key
          elif counters[key] == 0:
            # Only keep the first (query-level) counter.
            counters[key] = int(m.group(1))

    # All counters must have values.
    assert all(counters[key] > 0 for key in keys)

    assert counters["TotalBytesSent"] == (counters["TotalScanBytesSent"] +
                                          counters["TotalInnerBytesSent"])

  def test_query_profile_contains_host_resource_usage(self):
    """Tests that the profile contains a sub-profile with per-node resource usage."""
    result = self.execute_query("select count(*), sleep(1000) from functional.alltypes")
    profile = result.runtime_profile
    expected_str = "Per Node Profiles:"
    assert any(expected_str in line for line in profile.splitlines())

  def test_query_profile_host_resource_metrics_off(self):
    """Tests that the query profile does not contain resource usage metrics by
    default or when they are disabled explicitly."""
    query = "select count(*), sleep(1000) from functional.alltypes"
    for query_opts in [None, {'resource_trace_ratio': 0.0}]:
      profile = self.execute_query(query, query_opts).runtime_profile
      # Assert that no host resource counters exist in the profile.
      for line in profile.splitlines():
        assert not re.search("HostCpu.*Percentage", line)
        assert not re.search("HostNetworkRx", line)
        assert not re.search("HostDiskReadThroughput", line)

  def test_query_profile_contains_host_resource_metrics(self):
    """Tests that the query profile contains various CPU and network metrics."""
    query_opts = {'resource_trace_ratio': 1.0}
    query = "select count(*), sleep(1000) from functional.alltypes"
    profile = self.execute_query(query, query_opts).runtime_profile
    # We check for 500ms because a query with 1s duration won't hit the 64-value
    # limit that would trigger resampling.
    expected_strs = ["HostCpuIoWaitPercentage (500.000ms):",
                     "HostCpuSysPercentage (500.000ms):",
                     "HostCpuUserPercentage (500.000ms):",
                     "HostNetworkRx (500.000ms):",
                     "HostNetworkTx (500.000ms):",
                     "HostDiskReadThroughput (500.000ms):",
                     "HostDiskWriteThroughput (500.000ms):"]

    # Assert that all expected counters exist in the profile.
    for expected_str in expected_strs:
      assert any(expected_str in line for line in profile.splitlines()), expected_str

    # Check that there are some values for each counter.
    for line in profile.splitlines():
      if not any(key in line for key in expected_strs):
        continue
      values = line.split(':')[1].strip().split(',')
      assert len(values) > 0

  def _find_ts_counters_in_thrift_profile(self, profile, name):
    """Finds all time series counters in 'profile' with a matching name."""
    counters = []
    for node in profile.nodes:
      for counter in node.time_series_counters or []:
        if counter.name == name:
          counters.append(counter)
    return counters

  @pytest.mark.execute_serially
  def test_thrift_profile_contains_host_resource_metrics(self):
    """Tests that the thrift profile contains time series counters for CPU and
    network resource usage."""
    query_opts = {'resource_trace_ratio': 1.0}
    self.hs2_client.set_configuration(query_opts)
    result = self.hs2_client.execute("select sleep(2000)",
        profile_format=TRuntimeProfileFormat.THRIFT)
    thrift_profile = result.profile

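    # Spot-check one counter from each family: CPU, network, and disk.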
expected_keys = ["HostCpuUserPercentage", "HostNetworkRx", "HostDiskReadThroughput"]
|
|
for key in expected_keys:
|
|
counters = self._find_ts_counters_in_thrift_profile(thrift_profile, key)
|
|
# The query will run on a single node, we will only find the counter once.
|
|
assert len(counters) == 1
|
|
counter = counters[0]
|
|
assert len(counter.values) > 0
|
|
|
|
  @pytest.mark.execute_serially
  def test_query_profile_thrift_timestamps(self):
    """Test that the query profile start and end time date-time strings have
    nanosecond precision. Nanosecond precision is expected by management API clients
    that consume Impala debug webpages."""
    query = "select sleep(5)"
    result = self.hs2_client.execute(query, profile_format=TRuntimeProfileFormat.THRIFT)
    tree = result.profile

    # tree.nodes[1] corresponds to ClientRequestState::summary_profile_.
    # See be/src/service/client-request-state.[h|cc].
    start_time = tree.nodes[1].info_strings["Start Time"]
    end_time = tree.nodes[1].info_strings["End Time"]
    # Start and End Times are of the form "2017-12-07 22:26:52.167711000".
    start_time_sub_sec_str = start_time.split('.')[-1]
    end_time_sub_sec_str = end_time.split('.')[-1]

    assert len(end_time_sub_sec_str) == 9, end_time
    assert len(start_time_sub_sec_str) == 9, start_time

  @pytest.mark.execute_serially
  def test_query_end_time(self):
    """Test that verifies that the end time of a query with a coordinator is set once
    the coordinator releases its admission control resources. This ensures that the
    duration of the query is determined by the time taken to do real work rather than
    by how long the query remains open. For queries without coordinators, on the
    other hand, the End Time is set only when UnregisterQuery() is called."""
    # Test the end time of a query with a coordinator.
    query = "select 1"
    handle = self.hs2_client.execute_async(query)
    result = self.hs2_client.fetch(query, handle)
    # Ensure that the query returns a non-empty result set.
    assert result is not None
    # Once the results have been fetched, the query End Time must be set.
    tree = self.hs2_client.get_runtime_profile(handle, TRuntimeProfileFormat.THRIFT)
    end_time = tree.nodes[1].info_strings["End Time"]
    assert end_time
    self.hs2_client.close_query(handle)

    # Test the end time of a query without a coordinator.
    query = "describe functional.alltypes"
    handle = self.hs2_client.execute_async(query)
    result = self.hs2_client.fetch(query, handle)
    # Ensure that the query returns a non-empty result set.
    assert result
    # The query End Time must not be set until the query is unregistered.
    tree = self.hs2_client.get_runtime_profile(handle, TRuntimeProfileFormat.THRIFT)
    end_time = tree.nodes[1].info_strings["End Time"]
    assert len(end_time) == 0, end_time
    # Save the last operation to be able to retrieve the profile after closing the
    # query.
    last_op = handle.get_handle()._last_operation
    self.hs2_client.close_query(handle)
    tree = last_op.get_profile(TRuntimeProfileFormat.THRIFT)
    end_time = tree.nodes[1].info_strings["End Time"]
    assert end_time is not None

  def test_dml_end_time(self, unique_database):
    """Same as the above test but verifies that the end time of a DML statement is
    set only after the work in catalogd is done."""
    stmt = "create table %s.alltypestiny like functional.alltypestiny" % unique_database
    self.execute_query(stmt)
    # Warm up the table.
    self.execute_query("describe %s.alltypestiny" % unique_database)
    stmt = "insert overwrite %s.alltypestiny partition(year, month)" \
        " select * from functional.alltypestiny" % unique_database
    # Use debug_action to inject a delay in catalogd. The INSERT usually finishes in
    # 300ms without the delay.
    delay_s = 5
    self.hs2_client.set_configuration_option(
        "debug_action", "catalogd_insert_finish_delay:SLEEP@%d" % (delay_s * 1000))
    start_ts = time()
    handle = self.hs2_client.execute_async(stmt)
    self.hs2_client.clear_configuration()
    end_time_str = ""
    duration_str = ""
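    # Poll the profile once per second until the 'End Time' info string is populated.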
    while len(end_time_str) == 0:
      sleep(1)
      tree = self.hs2_client.get_runtime_profile(handle, TRuntimeProfileFormat.THRIFT)
      end_time_str = tree.nodes[1].info_strings["End Time"]
      duration_str = tree.nodes[1].info_strings["Duration"]
      # The end time should not show up earlier than the injected delay allows.
      if time() - start_ts < delay_s:
        assert len(end_time_str) == 0, "End time showed up too early: {end_str}. " \
            "{delay_s} second delay expected since " \
            "{start_str} ({start_ts:.6f})".format(
                end_str=end_time_str, delay_s=delay_s, start_ts=start_ts,
                start_str=datetime.utcfromtimestamp(start_ts).strftime(
                    '%Y-%m-%d %H:%M:%S.%f'))
    self.hs2_client.close_query(handle)
    duration_us = get_duration_us_from_str(duration_str)
    assert duration_us > delay_s * 1000000

  def test_query_profile_contains_number_of_fragment_instances(self):
    """Test that the 'Per Host Number of Fragment Instances' section appears in a
    query profile."""
    event_regexes = [r'Per Host Number of Fragment Instances']
    query = "select count(*) from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    self.__verify_profile_event_sequence(event_regexes, runtime_profile)

  def test_query_profile_contains_executor_group(self):
    """Test that the profile contains an info string with the executor group that
    was picked by admission control."""
    query = "select count(*) from functional.alltypes"
    runtime_profile = self.execute_query(query).runtime_profile
    assert "Executor Group:" in runtime_profile

  def test_query_profile_storage_load_time_filesystem(self, unique_database,
      cluster_properties):
    """Test that when a query needs to load metadata for table(s), the storage load
    time appears in the profile. Tests file system tables."""
    table_name = 'ld_prof'
    self.execute_query(
        "create table {0}.{1}(col1 int)".format(unique_database, table_name))
    self.__check_query_profile_storage_load_time(unique_database, table_name,
        cluster_properties)

  @SkipIfFS.hbase
  @pytest.mark.execute_serially
  def test_query_profile_storage_load_time(self, cluster_properties):
    """Test that when a query needs to load metadata for table(s), the storage load
    time appears in the profile. Tests Kudu and HBase tables."""
    # KUDU table
    self.__check_query_profile_storage_load_time("functional_kudu", "alltypes",
        cluster_properties)

    # HBASE table
    self.__check_query_profile_storage_load_time("functional_hbase", "alltypes",
        cluster_properties)

  def __check_query_profile_storage_load_time(self, db_name, table_name,
      cluster_properties):
    """Check the query profile for the storage load time with a given database."""
    self.execute_query("invalidate metadata {0}.{1}".format(db_name, table_name))
    query = "select count(*) from {0}.{1}".format(db_name, table_name)
    runtime_profile = self.execute_query(query).runtime_profile
    if cluster_properties.is_catalog_v2_cluster():
      storage_load_time = "StorageLoad.Time"
    else:
      storage_load_time = "storage-load-time"
    assert storage_load_time in runtime_profile
    # Run the query a second time; no metastore loading should be needed then.
    # Only check this part on Catalog V1 because of V2's random behavior.
    if not cluster_properties.is_catalog_v2_cluster():
      runtime_profile = self.execute_query(query).runtime_profile
      assert storage_load_time not in runtime_profile

  def __verify_hashtable_stats_profile(self, runtime_profile):
    assert "Hash Table" in runtime_profile
    assert "Probes:" in runtime_profile
    assert "Travel:" in runtime_profile
    assert "HashCollisions:" in runtime_profile
    assert "Resizes:" in runtime_profile
    # Probes and travel can be 0. The pretty-printed number can be an integer or a
    # float with a 'K' suffix; the exact count is the integer in parentheses.
    nprobes = re.search(r'Probes:.*\((\d+)\)', runtime_profile)
    assert nprobes and len(nprobes.groups()) == 1 and int(nprobes.group(1)) >= 0
    ntravel = re.search(r'Travel:.*\((\d+)\)', runtime_profile)
    assert ntravel and len(ntravel.groups()) == 1 and int(ntravel.group(1)) >= 0

  def test_query_profile_hashtable(self):
    """Test that the profile for a join/aggregation query contains hash table
    related information."""
    # Join
    query = """select a.int_col, a.string_col from functional.alltypes a
        inner join functional.alltypessmall b on a.id = b.id"""
    result = self.execute_query(query)
    assert result.success
    self.__verify_hashtable_stats_profile(result.runtime_profile)
    # Group by
    query = """select year, count(*) from
        functional.alltypesagg where int_col < 7 and year = 2010 group by year"""
    result = self.execute_query(query)
    assert result.success
    self.__verify_hashtable_stats_profile(result.runtime_profile)

  @SkipIfNotHdfsMinicluster.tuned_for_minicluster
  def test_skew_reporting_in_runtime_profile(self):
    """Test that the skew summary and skew details are reported correctly in the
    runtime profile."""
    query = """select ca_state, count(*) from tpcds_parquet.store_sales,
        tpcds_parquet.customer, tpcds_parquet.customer_address
        where ss_customer_sk = c_customer_sk and
        c_current_addr_sk = ca_address_sk
        group by ca_state
        order by ca_state
        """
    # Set the skew threshold to 0.0 so that any imbalance is reported.
    query_opts = {'report_skew_limit': 0.0}
    results = self.execute_query(query, query_opts)
    assert results.success

"When the skew summary is seen, look for the details"
|
|
skews_found = 'skew\(s\) found at:.*HASH_JOIN.*HASH_JOIN.*HDFS_SCAN_NODE'
|
|
if len(re.findall(skews_found, results.runtime_profile, re.M)) == 1:
|
|
|
|
"Expect to see skew details twice at the hash join nodes."
|
|
probe_rows_at_hj = 'HASH_JOIN_NODE.*\n.*Skew details: ProbeRows'
|
|
assert len(re.findall(probe_rows_at_hj, results.runtime_profile, re.M)) == 2
|
|
|
|
"Expect to see skew details once at the scan node."
|
|
probe_rows_at_hdfs_scan = 'HDFS_SCAN_NODE.*\n.*Skew details: RowsRead'
|
|
assert len(re.findall(probe_rows_at_hdfs_scan, results.runtime_profile, re.M)) == 1
|
|
|
|
  def test_query_profile_contains_auto_scaling_events(self):
    """Test that the info about the two compilation events appears in the profile."""
    query = "select * from functional.alltypes"
    # Specifically turn on test_replan to impose a 2-executor-group environment in
    # the frontend.
    query_opts = {'enable_replan': 1, 'test_replan': 1}
    results = self.execute_query(query, query_opts)
    assert results.success
    runtime_profile = results.runtime_profile
    assert len(re.findall('Analysis finished:', runtime_profile, re.M)) == 2
    assert len(re.findall('Single node plan created:', runtime_profile, re.M)) == 2
    assert len(re.findall('Distributed plan created:', runtime_profile, re.M)) == 2


class TestQueryStates(ImpalaTestSuite):
  """Test that the 'Query State' and 'Impala Query State' are set correctly in the
  runtime profile."""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  def test_query_states(self):
    """Tests that the query profile shows the expected query states."""
    query = "select count(*) from functional.alltypes where bool_col = sleep(10)"
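    # Hold the query before admission for 1s (the same SLEEP debug_action used in
    # TestObservability) so that the COMPILED/PENDING states can be observed.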
    handle = self.execute_query_async(query,
        {"debug_action": "AC_BEFORE_ADMISSION:SLEEP@1000"})
    # If ExecuteStatement() has completed and the query is paused in the admission
    # control phase, then the query must be in the COMPILED state.
    profile = self.client.get_runtime_profile(handle)
    assert self.__is_line_in_profile("Query State: COMPILED", profile)
    assert self.__is_line_in_profile("Impala Query State: PENDING", profile)
    # After completion of the admission control phase, the query must have at least
    # reached the RUNNING state.
    self.client.wait_for_admission_control(handle)
    profile = self.client.get_runtime_profile(handle)
    assert self.__is_line_in_profile("Query State: RUNNING", profile), profile
    assert self.__is_line_in_profile("Impala Query State: RUNNING", profile), profile

    self.client.fetch(query, handle)
    profile = self.client.get_runtime_profile(handle)
    # After fetching the results, the query must be in the FINISHED state.
    assert self.__is_line_in_profile("Query State: FINISHED", profile), profile
    assert self.__is_line_in_profile("Impala Query State: FINISHED", profile), profile

  def test_error_query_state(self):
    """Tests that the query profile shows the proper error state."""
    query = "select * from functional.alltypes limit 10"
    handle = self.execute_query_async(query, {"abort_on_error": "1",
        "debug_action": "0:GETNEXT:FAIL"})

    def assert_finished():
      profile = self.client.get_runtime_profile(handle)
      return self.__is_line_in_profile("Query State: FINISHED", profile) and \
          self.__is_line_in_profile("Impala Query State: FINISHED", profile)

    self.assert_eventually(300, 1, assert_finished,
        lambda: self.client.get_runtime_profile(handle))

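    # The injected GETNEXT failure only surfaces when the results are fetched, so the
    # fetch below must raise.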
    try:
      self.client.fetch(query, handle)
      assert False
    except ImpalaBeeswaxException:
      pass

    profile = self.client.get_runtime_profile(handle)
    assert self.__is_line_in_profile("Query State: EXCEPTION", profile), profile
    assert self.__is_line_in_profile("Impala Query State: ERROR", profile), profile

  def __is_line_in_profile(self, line, profile):
    """Returns true if the given 'line' is in the given 'profile'. A single line of
    the profile must exactly match the given 'line' (excluding whitespace)."""
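    # re.M makes '^' and '$' match at each line boundary, and the \s* wrappers
    # tolerate the profile's indentation.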
    return re.search(r"^\s*{0}\s*$".format(line), profile, re.M)